From 881e40be880fdebb5d3ebfbbce18de442c01d9e5 Mon Sep 17 00:00:00 2001 From: Fletterio Date: Wed, 22 Jan 2025 12:43:15 -0300 Subject: [PATCH 001/296] Update Bloom example, removed memory barriers on FFT Signed-off-by: kevyuu --- 28_FFTBloom/app_resources/fft_common.hlsl | 5 ----- 1 file changed, 5 deletions(-) diff --git a/28_FFTBloom/app_resources/fft_common.hlsl b/28_FFTBloom/app_resources/fft_common.hlsl index 295c05223..41f8821cc 100644 --- a/28_FFTBloom/app_resources/fft_common.hlsl +++ b/28_FFTBloom/app_resources/fft_common.hlsl @@ -32,11 +32,6 @@ struct PreloadedAccessorCommonBase NBL_CONSTEXPR_STATIC_INLINE uint16_t ElementsPerInvocation = FFTParameters::ElementsPerInvocation; NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = FFTParameters::WorkgroupSize; NBL_CONSTEXPR_STATIC_INLINE uint16_t TotalSize = FFTParameters::TotalSize; - - void memoryBarrier() - { - // Preloaded Accessors don't access any memory in this stage, so we don't need to do anything here - } }; struct PreloadedAccessorBase : PreloadedAccessorCommonBase From 444c91729670f1c804b656899ef7d483ca9f30b8 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 22 Jan 2025 11:05:21 +0700 Subject: [PATCH 002/296] Implement Ray Tracing Demo - Multiple HitGroup. Each with closesthit and anyhit shader - Multiple Miss Shader Group. 
Signed-off-by: kevyuu
---
 71_RayTracingPipeline/CMakeLists.txt          |   28 +
 .../app_resources/common.hlsl                 |  101 ++
 .../app_resources/random.hlsl                 |   34 +
 .../app_resources/raytrace.rahit.hlsl         |   27 +
 .../app_resources/raytrace.rchit.hlsl         |  152 ++
 .../app_resources/raytrace.rgen.hlsl          |   72 +
 .../app_resources/raytrace.rmiss.hlsl         |    8 +
 .../app_resources/raytraceShadow.rmiss.hlsl   |    7 +
 71_RayTracingPipeline/include/common.hpp      |   93 ++
 71_RayTracingPipeline/main.cpp                | 1289 +++++++++++++++++
 CMakeLists.txt                                |    1 +
 common/include/CCamera.hpp                    |   15 +-
 12 files changed, 1825 insertions(+), 2 deletions(-)
 create mode 100644 71_RayTracingPipeline/CMakeLists.txt
 create mode 100644 71_RayTracingPipeline/app_resources/common.hlsl
 create mode 100644 71_RayTracingPipeline/app_resources/random.hlsl
 create mode 100644 71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
 create mode 100644 71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
 create mode 100644 71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
 create mode 100644 71_RayTracingPipeline/app_resources/raytrace.rmiss.hlsl
 create mode 100644 71_RayTracingPipeline/app_resources/raytraceShadow.rmiss.hlsl
 create mode 100644 71_RayTracingPipeline/include/common.hpp
 create mode 100644 71_RayTracingPipeline/main.cpp

diff --git a/71_RayTracingPipeline/CMakeLists.txt b/71_RayTracingPipeline/CMakeLists.txt
new file mode 100644
index 000000000..4a555f4ce
--- /dev/null
+++ b/71_RayTracingPipeline/CMakeLists.txt
@@ -0,0 +1,28 @@
+set(NBL_INCLUDE_SEARCH_DIRECTORIES
+    "${CMAKE_CURRENT_SOURCE_DIR}/include"
+)
+
+include(common RESULT_VARIABLE RES)
+if(NOT RES)
+    message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
+endif()
+
+nbl_create_executable_project("" "" "${NBL_INCLUDE_SEARCH_DIRECTORIES}" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
+
+if(NBL_EMBED_BUILTIN_RESOURCES)
+    set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
+    set(RESOURCE_DIR "app_resources")
+
+    get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
+    get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
+    get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)
+
+    file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*")
+    foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
+        LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
+    endforeach()
+
+    ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")
+
+    LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
+endif()
diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl
new file mode 100644
index 000000000..3b6c36abc
--- /dev/null
+++ b/71_RayTracingPipeline/app_resources/common.hlsl
@@ -0,0 +1,101 @@
+#ifndef RQG_COMMON_HLSL
+#define RQG_COMMON_HLSL
+
+#include "nbl/builtin/hlsl/cpp_compat.hlsl"
+
+NBL_CONSTEXPR uint32_t WorkgroupSize = 16;
+// Material parameters read by computeDiffuse() / computeSpecular() further down.
+struct Material
+{
+    float32_t3 ambient;
+    float32_t3 diffuse;
+    float32_t3 specular;
+    float32_t shininess;
+    float32_t dissolve; // 1 == opaque; 0 == fully transparent
+    uint32_t illum; // illumination model (see http://www.fileformat.info/format/material/)
+};
+// Per-geometry record: buffer addresses, vertex layout hints and the material.
+struct SGeomInfo
+{
+    uint64_t vertexBufferAddress;
+    uint64_t indexBufferAddress;
+
+    uint32_t vertexStride : 29;
+    uint32_t indexType : 2; // 16 bit, 32 bit or none
+    uint32_t smoothNormals : 1; // flat for cube, rectangle, disk
+
+    uint32_t objType;
+
+    Material material;
+};
+// Push-constant block; geometryInfoBuffer is the device address the shaders load SGeomInfo records from.
+struct SPushConstants
+{
+    uint64_t geometryInfoBuffer;
+    uint32_t frameCounter;
+
+    float32_t3 camPos;
+    float32_t4x4 invMVP;
+
+};
+
+#ifdef __HLSL_VERSION
+// Payload for rays that return a colour; `seed` carries the RNG state along the ray.
+struct [raypayload] ColorPayload
+{
+    float32_t3 hitValue;
+    uint32_t seed;
+};
+// Payload for shadow rays; `seed` carries the RNG state along the ray.
+struct [raypayload] ShadowPayload
+{
+    bool isShadowed;
+    uint32_t seed;
+};
+
+enum ObjectType : uint32_t // matches c++
+{
+    OT_CUBE = 0,
+    OT_SPHERE,
+    OT_CYLINDER,
+    OT_RECTANGLE,
+    OT_DISK,
+    OT_ARROW,
+    OT_CONE,
+    OT_ICOSPHERE,
+
+    OT_COUNT
+};
+// Read-only lookup table with one entry per ObjectType, hence `const`.
+static const uint32_t s_offsetsToNormalBytes[OT_COUNT] = { 18, 24, 24, 20, 20, 24, 16, 12 }; // based on normals data position
+float32_t3 computeDiffuse(Material mat, float32_t3 light_dir, float32_t3 normal)
+{
+    // Lambertian term; the ambient colour is added on top when illum >= 1
+    float32_t dotNL = max(dot(normal, light_dir), 0.0);
+    float32_t3 c = mat.diffuse * dotNL;
+    if (mat.illum >= 1)
+        c += mat.ambient;
+    return c;
+}
+// Specular term; returns zero for illumination models below 2.
+float32_t3 computeSpecular(Material mat, float32_t3 view_dir,
+    float32_t3 light_dir, float32_t3 normal)
+{
+    if (mat.illum < 2)
+        return float32_t3(0, 0, 0);
+
+    // No shadow test happens here (that is left to the caller); the exponent is clamped to at least 4
+    const float32_t kPi = 3.14159265;
+    const float32_t kShininess = max(mat.shininess, 4.0);
+
+    // Specular
+    const float32_t kEnergyConservation = (2.0 + kShininess) / (2.0 * kPi);
+    float32_t3 V = normalize(-view_dir);
+    float32_t3 R = reflect(-light_dir, normal);
+    float32_t specular = kEnergyConservation * pow(max(dot(V, R), 0.0), kShininess);
+
+    return float32_t3(mat.specular * specular);
+}
+#endif
+
+#endif // RQG_COMMON_HLSL
diff --git a/71_RayTracingPipeline/app_resources/random.hlsl b/71_RayTracingPipeline/app_resources/random.hlsl
new file mode 100644
index 000000000..e01d7ff6c
--- /dev/null
+++ b/71_RayTracingPipeline/app_resources/random.hlsl
@@ -0,0 +1,34 @@
+// Generate a random unsigned int from two unsigned int values, using 16 pairs
+// of rounds of the Tiny Encryption Algorithm. 
See Zafar, Olano, and Curtis,
+// "GPU Random Numbers via the Tiny Encryption Algorithm"
+uint32_t tea(uint32_t val0, uint32_t val1)
+{
+    const uint32_t kDelta = 0x9e3779b9;
+    uint32_t sum = 0;
+    uint32_t lo = val0;
+    uint32_t hi = val1;
+
+    for (uint32_t round = 0; round != 16; ++round)
+    {
+        sum += kDelta;
+        lo += ((hi << 4) + 0xa341316c) ^ (hi + sum) ^ ((hi >> 5) + 0xc8013ea4);
+        hi += ((lo << 4) + 0xad90777d) ^ (lo + sum) ^ ((lo >> 5) + 0x7e95761e);
+    }
+    return lo;
+}
+
+// Linear congruential step (Numerical Recipes constants): advances `prev` in
+// place and returns its low 24 bits, i.e. a value in [0, 2^24).
+uint32_t lcg(inout uint32_t prev)
+{
+    const uint32_t kMultiplier = 1664525u;
+    const uint32_t kIncrement = 1013904223u;
+    prev = kMultiplier * prev + kIncrement;
+    return prev & 0x00FFFFFF;
+}
+
+// Advances `prev` through lcg() and maps the result to a float32_t in [0, 1).
+float32_t rnd(inout uint32_t prev)
+{
+    return float32_t(lcg(prev)) / float32_t(0x01000000);
+}
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
new file mode 100644
index 000000000..f68d607aa
--- /dev/null
+++ b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
@@ -0,0 +1,27 @@
+#include "common.hlsl"
+#include "random.hlsl"
+
+[[vk::push_constant]] SPushConstants pc;
+
+[[vk::binding(0, 0)]] RaytracingAccelerationStructure topLevelAS;
+// Payload type is picked by a compile-time define; USE_COLOR_PAYLOAD wins if both are set.
+#if defined(USE_COLOR_PAYLOAD)
+using AnyHitPayload = ColorPayload;
+#elif defined(USE_SHADOW_PAYLOAD)
+using AnyHitPayload = ShadowPayload;
+#endif
+// Only materials with illum == 4 may reject a hit; the decision is random, weighted by `dissolve`.
+[shader("anyhit")]
+void main(inout AnyHitPayload p, in BuiltInTriangleIntersectionAttributes attribs)
+{
+    const int instanceIx = InstanceID();
+    const SGeomInfo geom = vk::RawBufferLoad<SGeomInfo>(pc.geometryInfoBuffer + instanceIx * sizeof(SGeomInfo));
+
+    if (geom.material.illum != 4)
+        return;
+    const float32_t opacity = geom.material.dissolve;
+    if (opacity != 0.0) // a dissolve of exactly 0 is ignored without drawing a random number
+        if (!(rnd(p.seed) > opacity))
+            return; // hit accepted
+    IgnoreHit();
+}
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl 
b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
new file mode 100644
index 000000000..b77412ff7
--- /dev/null
+++ b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
@@ -0,0 +1,152 @@
+#include "common.hlsl"
+
+[[vk::push_constant]] SPushConstants pc;
+
+[[vk::binding(0, 0)]] RaytracingAccelerationStructure topLevelAS;
+
+float3 unpackNormals3x10(uint32_t v)
+{
+    // host side changes float32_t3 to EF_A2B10G10R10_SNORM_PACK32
+    // follows unpacking scheme from https://github.com/KhronosGroup/SPIRV-Cross/blob/main/reference/shaders-hlsl/frag/unorm-snorm-packing.frag
+    int signedValue = int(v);
+    int3 pn = int3(signedValue << 22, signedValue << 12, signedValue << 2) >> 22; // sign-extend each 10-bit field
+    return clamp(float3(pn) / 511.0, -1.0, 1.0);
+}
+// Object-space position and unit normal interpolated at the hit point.
+struct VertexData {
+    float32_t3 position;
+    float32_t3 normal;
+};
+// Loads the three vertices of triangle `primID` through buffer device addresses and blends them with `bary`.
+VertexData fetchVertexData(int instID, int primID, SGeomInfo geom, float2 bary)
+{
+    uint idxOffset = primID * 3;
+
+    const uint indexType = geom.indexType;
+    const uint vertexStride = geom.vertexStride;
+
+    const uint32_t objType = geom.objType;
+    const uint64_t indexBufferAddress = geom.indexBufferAddress;
+
+    uint i0, i1, i2;
+    switch (indexType)
+    {
+        case 0: // EIT_16BIT
+        {
+            i0 = uint32_t(vk::RawBufferLoad<uint16_t>(indexBufferAddress + (idxOffset + 0) * sizeof(uint16_t), 2u));
+            i1 = uint32_t(vk::RawBufferLoad<uint16_t>(indexBufferAddress + (idxOffset + 1) * sizeof(uint16_t), 2u));
+            i2 = uint32_t(vk::RawBufferLoad<uint16_t>(indexBufferAddress + (idxOffset + 2) * sizeof(uint16_t), 2u));
+        }
+        break;
+        case 1: // EIT_32BIT
+        {
+            i0 = vk::RawBufferLoad<uint32_t>(indexBufferAddress + (idxOffset + 0) * sizeof(uint32_t));
+            i1 = vk::RawBufferLoad<uint32_t>(indexBufferAddress + (idxOffset + 1) * sizeof(uint32_t));
+            i2 = vk::RawBufferLoad<uint32_t>(indexBufferAddress + (idxOffset + 2) * sizeof(uint32_t));
+        }
+        break;
+        default: // EIT_NONE
+        {
+            i0 = idxOffset;
+            i1 = idxOffset + 1;
+            i2 = idxOffset + 2;
+        }
+    }
+
+    const uint64_t vertexBufferAddress = geom.vertexBufferAddress;
+    float32_t3 p0 = vk::RawBufferLoad<float32_t3>(vertexBufferAddress + i0 * vertexStride);
+    float32_t3 p1 = vk::RawBufferLoad<float32_t3>(vertexBufferAddress + i1 * vertexStride);
+    float32_t3 p2 = vk::RawBufferLoad<float32_t3>(vertexBufferAddress + i2 * vertexStride);
+
+    const uint64_t normalVertexBufferAddress = vertexBufferAddress + s_offsetsToNormalBytes[objType];
+    float3 n0, n1, n2;
+    switch (objType)
+    {
+        case OT_CUBE: // packed normal sits at byte offset 18, so the 32-bit load is only 2-byte aligned
+        {
+            uint32_t v0 = vk::RawBufferLoad<uint32_t>(normalVertexBufferAddress + i0 * vertexStride, 2u);
+            uint32_t v1 = vk::RawBufferLoad<uint32_t>(normalVertexBufferAddress + i1 * vertexStride, 2u);
+            uint32_t v2 = vk::RawBufferLoad<uint32_t>(normalVertexBufferAddress + i2 * vertexStride, 2u);
+
+            n0 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v0).xyz);
+            n1 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v1).xyz);
+            n2 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v2).xyz);
+        }
+        break;
+        case OT_SPHERE:
+        case OT_CYLINDER:
+        case OT_ARROW:
+        case OT_CONE:
+        {
+            uint32_t v0 = vk::RawBufferLoad<uint32_t>(normalVertexBufferAddress + i0 * vertexStride);
+            uint32_t v1 = vk::RawBufferLoad<uint32_t>(normalVertexBufferAddress + i1 * vertexStride);
+            uint32_t v2 = vk::RawBufferLoad<uint32_t>(normalVertexBufferAddress + i2 * vertexStride);
+
+            n0 = normalize(unpackNormals3x10(v0));
+            n1 = normalize(unpackNormals3x10(v1));
+            n2 = normalize(unpackNormals3x10(v2));
+        }
+        break;
+        case OT_RECTANGLE:
+        case OT_DISK:
+        case OT_ICOSPHERE:
+        default: // normals stored as plain float32_t3
+        {
+            n0 = normalize(vk::RawBufferLoad<float32_t3>(normalVertexBufferAddress + i0 * vertexStride));
+            n1 = normalize(vk::RawBufferLoad<float32_t3>(normalVertexBufferAddress + i1 * vertexStride));
+            n2 = normalize(vk::RawBufferLoad<float32_t3>(normalVertexBufferAddress + i2 * vertexStride));
+        }
+    }
+
+    float3 barycentrics = float3(0.0, bary);
+    barycentrics.x = 1.0 - barycentrics.y - barycentrics.z;
+
+    VertexData data;
+    data.position = barycentrics.x * p0 + barycentrics.y * p1 + barycentrics.z * p2;
+    data.normal = normalize(barycentrics.x * n0 + barycentrics.y * n1 + barycentrics.z * n2);
+    return data;
+}
+// Shades the hit with a fixed directional light; light-facing surfaces also trace a shadow ray.
+[shader("closesthit")]
+void main(inout ColorPayload p, in BuiltInTriangleIntersectionAttributes attribs)
+{
+    const int instID = InstanceID();
+    const int primID = PrimitiveIndex();
+    const SGeomInfo geom = vk::RawBufferLoad<SGeomInfo>(pc.geometryInfoBuffer + instID * sizeof(SGeomInfo));
+    const VertexData vertexData = fetchVertexData(instID, primID, geom, attribs.barycentrics);
+    const float32_t3 worldPosition = mul(ObjectToWorld3x4(), float32_t4(vertexData.position, 1)); // not referenced below
+    const float32_t3 worldNormal = normalize(mul(vertexData.normal, WorldToObject3x4()).xyz); // renormalize: the instance transform may scale
+
+    const float32_t lightIntensity = 1;
+    const float32_t3 lightDirection = normalize(float32_t3(1, 1, -1));
+
+    float32_t3 diffuse = computeDiffuse(geom.material, lightDirection, worldNormal);
+    float32_t3 specular = float32_t3(0, 0, 0);
+    float32_t attenuation = 1;
+
+    if (dot(worldNormal, lightDirection) > 0)
+    {
+        RayDesc rayDesc;
+        rayDesc.Origin = WorldRayOrigin() + WorldRayDirection() * RayTCurrent() + worldNormal * 0.02f; // nudged off the surface along the normal
+        rayDesc.Direction = lightDirection;
+        rayDesc.TMin = 0.001;
+        rayDesc.TMax = 1000;
+
+        uint flags = RAY_FLAG_SKIP_CLOSEST_HIT_SHADER;
+        ShadowPayload shadowPayload;
+        shadowPayload.isShadowed = true; // only the shadow miss shader clears this
+        shadowPayload.seed = p.seed;
+        TraceRay(topLevelAS, flags, 0xFF, 1, 0, 1, rayDesc, shadowPayload); // hit group offset 1, miss shader index 1
+        p.seed = shadowPayload.seed;
+
+        if (shadowPayload.isShadowed)
+        {
+            attenuation = 0.3;
+        }
+        else
+        {
+            specular = computeSpecular(geom.material, WorldRayDirection(), lightDirection, worldNormal);
+        }
+    }
+    p.hitValue = (lightIntensity * attenuation * (specular + diffuse));
+}
\ No newline at end of file
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
new file mode 100644
index 000000000..90b950f76
--- /dev/null
+++ b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
@@ -0,0 +1,72 @@
+#include "common.hlsl"
+#include "random.hlsl"
+
+#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl"
+
+#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
+#include "nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl"
+// Primary rays traced per pixel in one dispatch.
+static const int32_t s_sampleCount = 10;
+
+[[vk::push_constant]] SPushConstants pc;
+
+[[vk::binding(0, 0)]] RaytracingAccelerationStructure topLevelAS;
+// Accumulation image: overwritten when frameCounter == 0, blended with its previous contents afterwards.
+[[vk::binding(1, 0)]] RWTexture2D<float32_t4> colorImage;
+
+float32_t3 reinhardTonemap(float32_t3 v)
+{
+    return v / (1.0f + v);
+}
+// Traces s_sampleCount camera rays for this pixel and accumulates their average into colorImage.
+[shader("raygeneration")]
+void main()
+{
+    uint32_t3 launchID = DispatchRaysIndex();
+    uint32_t3 launchSize = DispatchRaysDimensions();
+    uint32_t2 coords = launchID.xy;
+    uint32_t seed = tea(launchID.y * launchSize.x + launchID.x, pc.frameCounter);
+
+    float32_t3 hitValues = float32_t3(0, 0, 0);
+    for (uint32_t sample_i = 0; sample_i < s_sampleCount; sample_i++)
+    {
+        const float32_t r1 = rnd(seed);
+        const float32_t r2 = rnd(seed);
+        const float32_t2 subpixelJitter = pc.frameCounter == 0 ? float32_t2(0.5f, 0.5f) : float32_t2(r1, r2);
+
+        const float32_t2 pixelCenter = float32_t2(coords) + subpixelJitter;
+        const float32_t2 inUV = pixelCenter / float32_t2(launchSize.xy);
+
+        const float32_t2 d = inUV * 2.0 - 1.0;
+        const float32_t4 tmp = mul(pc.invMVP, float32_t4(d.x, d.y, 1, 1));
+        const float32_t3 targetPos = tmp.xyz / tmp.w;
+
+        float32_t3 direction = normalize(targetPos - pc.camPos);
+
+        RayDesc rayDesc;
+        rayDesc.Origin = pc.camPos;
+        rayDesc.Direction = direction;
+        rayDesc.TMin = 0.01;
+        rayDesc.TMax = 1000.0;
+
+        ColorPayload payload;
+        payload.seed = seed;
+        payload.hitValue = float32_t3(0, 0, 0);
+        TraceRay(topLevelAS, RAY_FLAG_NONE, 0xff, 0, 0, 0, rayDesc, payload);
+
+        hitValues += payload.hitValue;
+    }
+
+    float32_t3 hitValue = hitValues / s_sampleCount;
+
+    if (pc.frameCounter > 0)
+    {
+        float32_t a = 1.0f / float32_t(pc.frameCounter + 1); // running-average weight of the newest frame
+        float32_t3 oldColor = colorImage[coords].xyz;
+        colorImage[coords] = float32_t4(lerp(oldColor, hitValue, a), 1.0f);
+    }
+    else
+    {
+        colorImage[coords] = float32_t4(hitValue, 1.0f);
+    }
+}
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rmiss.hlsl 
b/71_RayTracingPipeline/app_resources/raytrace.rmiss.hlsl
new file mode 100644
index 000000000..70db3b0e4
--- /dev/null
+++ b/71_RayTracingPipeline/app_resources/raytrace.rmiss.hlsl
@@ -0,0 +1,8 @@
+#include "common.hlsl"
+// Miss shader for colour rays: writes a constant colour into the payload.
+[shader("miss")]
+void main(inout ColorPayload p)
+{
+    p.hitValue = float32_t3(0.3, 0.3, 0.6);
+
+}
diff --git a/71_RayTracingPipeline/app_resources/raytraceShadow.rmiss.hlsl b/71_RayTracingPipeline/app_resources/raytraceShadow.rmiss.hlsl
new file mode 100644
index 000000000..295e721f2
--- /dev/null
+++ b/71_RayTracingPipeline/app_resources/raytraceShadow.rmiss.hlsl
@@ -0,0 +1,7 @@
+#include "common.hlsl"
+// Miss shader for shadow rays: nothing was hit, so clear the shadow flag.
+[shader("miss")]
+void main(inout ShadowPayload p)
+{
+    p.isShadowed = false;
+}
diff --git a/71_RayTracingPipeline/include/common.hpp b/71_RayTracingPipeline/include/common.hpp
new file mode 100644
index 000000000..e50cb4473
--- /dev/null
+++ b/71_RayTracingPipeline/include/common.hpp
@@ -0,0 +1,93 @@
+#ifndef __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__
+#define __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__
+// NOTE(review): the four bare `#include` lines below have no header name (it looks lost in transit) - restore them from upstream before building.
+#include 
+#include "nbl/asset/utils/CGeometryCreator.h"
+#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
+
+#include "SimpleWindowedApplication.hpp"
+
+#include "InputSystem.hpp"
+#include "CEventCallback.hpp"
+
+#include "CCamera.hpp"
+
+#include 
+#include 
+#include 
+
+using namespace nbl;
+using namespace core;
+using namespace hlsl;
+using namespace system;
+using namespace asset;
+using namespace ui;
+using namespace video;
+using namespace scene;
+
+#include "app_resources/common.hlsl"
+
+namespace nbl::scene
+{
+// Geometry kinds; the first OT_COUNT values mirror the HLSL-side ObjectType in app_resources/common.hlsl.
+enum ObjectType : uint8_t
+{
+    OT_CUBE,
+    OT_SPHERE,
+    OT_CYLINDER,
+    OT_RECTANGLE,
+    OT_DISK,
+    OT_ARROW,
+    OT_CONE,
+    OT_ICOSPHERE,
+
+    OT_COUNT,
+    OT_UNKNOWN = std::numeric_limits<uint8_t>::max() // largest value of the underlying type
+};
+// Per-ObjectType flag: 1 for smooth normals, 0 for flat ones (cube, rectangle, disk).
+static constexpr uint32_t s_smoothNormals[OT_COUNT] = { 0, 1, 1, 0, 0, 1, 1, 1 };
+
+struct ObjectMeta
+{
+    ObjectType type = OT_UNKNOWN;
+    std::string_view name = "Unknown";
+};
+
+struct ObjectDrawHookCpu
+{
+    nbl::core::matrix3x4SIMD model;
+    nbl::asset::SBasicViewParameters viewParameters;
+    ObjectMeta meta;
+};
+
+struct ReferenceObjectCpu
+{
+    ObjectMeta meta;
+    nbl::asset::CGeometryCreator::return_type data;
+    Material material;
+    core::matrix3x4SIMD transform;
+};
+
+struct ReferenceObjectGpu
+{
+    struct Bindings
+    {
+        nbl::asset::SBufferBinding vertex, index; // NOTE(review): a template argument may be missing here - confirm against upstream
+    };
+
+    ObjectMeta meta;
+    Bindings bindings;
+    uint32_t vertexStride = {};
+    nbl::asset::E_INDEX_TYPE indexType = nbl::asset::E_INDEX_TYPE::EIT_UNKNOWN;
+    uint32_t indexCount = {};
+    Material material;
+    core::matrix3x4SIMD transform;
+    // True when an index buffer is bound and the index type is known.
+    bool useIndex() const
+    {
+        return bindings.index.buffer && (indexType != E_INDEX_TYPE::EIT_UNKNOWN);
+    }
+};
+}
+
+#endif // __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__
diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
new file mode 100644
index 000000000..54a692317
--- /dev/null
+++ b/71_RayTracingPipeline/main.cpp
@@ -0,0 +1,1289 @@
+// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h + +#include "common.hpp" + +class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication +{ + using device_base_t = examples::SimpleWindowedApplication; + using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; + using clock_t = std::chrono::steady_clock; + + constexpr static inline uint32_t WIN_W = 1280, WIN_H = 720; + constexpr static inline uint32_t MaxFramesInFlight = 3u; + + constexpr static inline clock_t::duration DisplayImageDuration = std::chrono::milliseconds(900); + + struct ShaderBindingTable + { + SStridedBufferRegion raygenGroupRegion; + SStridedBufferRegion hitGroupsRegion; + SStridedBufferRegion missGroupsRegion; + SStridedBufferRegion callableGroupsRegion; + }; + + struct CameraView + { + float32_t3 position; + float32_t3 target; + float32_t3 upVector; + }; + +public: + inline RaytracingPipelineApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) + : IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) { + } + + inline SPhysicalDeviceFeatures getRequiredDeviceFeatures() const override + { + auto retval = device_base_t::getRequiredDeviceFeatures(); + retval.rayTracingPipeline = true; + retval.accelerationStructure = true; + retval.rayQuery = true; + return retval; + } + + inline SPhysicalDeviceFeatures getPreferredDeviceFeatures() const override + { + auto retval = device_base_t::getPreferredDeviceFeatures(); + retval.accelerationStructureHostCommands = true; + return retval; + } + + inline core::vector getSurfaces() const override + { + if (!m_surface) + { + { + auto windowCallback = core::make_smart_refctd_ptr(smart_refctd_ptr(m_inputSystem), smart_refctd_ptr(m_logger)); + IWindow::SCreationParams params = {}; + params.callback = 
core::make_smart_refctd_ptr(); + params.width = WIN_W; + params.height = WIN_H; + params.x = 32; + params.y = 32; + params.flags = ui::IWindow::ECF_HIDDEN | IWindow::ECF_BORDERLESS | IWindow::ECF_RESIZABLE; + params.windowCaption = "RaytracingPipelineApp"; + params.callback = windowCallback; + const_cast&>(m_window) = m_winMgr->createWindow(std::move(params)); + } + + auto surface = CSurfaceVulkanWin32::create(smart_refctd_ptr(m_api), smart_refctd_ptr_static_cast(m_window)); + const_cast&>(m_surface) = CSimpleResizeSurface::create(std::move(surface)); + } + + if (m_surface) + return { {m_surface->getSurface()/*,EQF_NONE*/} }; + + return {}; + } + + // so that we can use the same queue for asset converter and rendering + inline core::vector getQueueRequirements() const override + { + auto reqs = device_base_t::getQueueRequirements(); + reqs.front().requiredFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; + return reqs; + } + + inline bool onAppInitialized(smart_refctd_ptr&& system) override + { + m_inputSystem = make_smart_refctd_ptr(logger_opt_smart_ptr(smart_refctd_ptr(m_logger))); + + if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + + if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + + const auto compileShader = [&](const std::string& filePath, const std::string& header = "") -> smart_refctd_ptr + { + IAssetLoader::SAssetLoadParams lparams = {}; + lparams.logger = m_logger.get(); + lparams.workingDirectory = ""; + auto bundle = m_assetMgr->getAsset(filePath, lparams); + if (bundle.getContents().empty() || bundle.getAssetType() != IAsset::ET_SHADER) + { + m_logger->log("Shader %s not found!", ILogger::ELL_ERROR, filePath); + exit(-1); + } + + const auto assets = bundle.getContents(); + assert(assets.size() == 1); + smart_refctd_ptr sourceRaw = IAsset::castDown(assets[0]); + if (!sourceRaw) + m_logger->log("Fail to load shader source", ILogger::ELL_ERROR, filePath); + smart_refctd_ptr source = 
CHLSLCompiler::createOverridenCopy( + sourceRaw.get(), + "%s\n", + header.c_str() + ); + + return m_device->createShader(source.get()); + }; + + // shader + const auto raygenShader = compileShader("app_resources/raytrace.rgen.hlsl"); + const auto closestHitShader = compileShader("app_resources/raytrace.rchit.hlsl"); + const auto anyHitShaderColorPayload = compileShader("app_resources/raytrace.rahit.hlsl", "#define USE_COLOR_PAYLOAD\n"); + const auto anyHitShaderShadowPayload = compileShader("app_resources/raytrace.rahit.hlsl", "#define USE_SHADOW_PAYLOAD\n"); + const auto missShader = compileShader("app_resources/raytrace.rmiss.hlsl"); + const auto shadowMissShader = compileShader("app_resources/raytraceShadow.rmiss.hlsl"); + + m_semaphore = m_device->createSemaphore(m_realFrameIx); + if (!m_semaphore) + return logFail("Failed to Create a Semaphore!"); + + ISwapchain::SCreationParams swapchainParams = { .surface = core::smart_refctd_ptr(m_surface->getSurface()) }; + if (!swapchainParams.deduceFormat(m_physicalDevice)) + return logFail("Could not choose a Surface Format for the Swapchain!"); + + auto gQueue = getGraphicsQueue(); + if (!m_surface || !m_surface->init(gQueue, std::make_unique(), swapchainParams.sharedParams)) + return logFail("Could not create Window & Surface or initialize the Surface!"); + + auto pool = m_device->createCommandPool(gQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + + m_converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} }); + + for (auto i = 0u; i < MaxFramesInFlight; i++) + { + if (!pool) + return logFail("Couldn't create Command Pool!"); + if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { m_cmdBufs.data() + i, 1 })) + return logFail("Couldn't create Command Buffer!"); + } + + m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H); + m_surface->recreateSwapchain(); + + // create output images + m_hdrImage = m_device->createImage({ + { + .type = 
IGPUImage::ET_2D, + .samples = asset::ICPUImage::ESCF_1_BIT, + .format = asset::EF_R16G16B16A16_SFLOAT, + .extent = {WIN_W, WIN_H, 1}, + .mipLevels = 1, + .arrayLayers = 1, + .flags = IImage::ECF_NONE, + .usage = core::bitflag(asset::IImage::EUF_STORAGE_BIT) | asset::IImage::EUF_TRANSFER_SRC_BIT + } + }); + + if (!m_hdrImage || !m_device->allocate(m_hdrImage->getMemoryReqs(), m_hdrImage.get()).isValid()) + return logFail("Could not create HDR Image"); + + auto assetManager = make_smart_refctd_ptr(smart_refctd_ptr(system)); + auto* geometryCreator = assetManager->getGeometryCreator(); + + auto cQueue = getComputeQueue(); + + // create geometry objects + if (!createGeometries(gQueue, geometryCreator)) + return logFail("Could not create geometries from geometry creator"); + + if (!createAccelerationStructures(cQueue)) + return logFail("Could not create acceleration structures"); + + + // create pipelines + { + // descriptors + const IGPUDescriptorSetLayout::SBinding bindings[] = { + { + .binding = 0, + .type = asset::IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE, + .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING, + .count = 1, + }, + { + .binding = 1, + .type = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE, + .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING, + .count = 1, + } + }; + const auto descriptorSetLayout = m_device->createDescriptorSetLayout(bindings); + + const std::array dsLayoutPtrs = { descriptorSetLayout.get() }; + m_renderPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, std::span(dsLayoutPtrs.begin(), dsLayoutPtrs.end())); + if (!m_renderPool) + return logFail("Could not create descriptor pool"); + m_renderDs = m_renderPool->createDescriptorSet(descriptorSetLayout); + if (!m_renderDs) + return logFail("Could not create 
descriptor set"); + + const SPushConstantRange pcRange = { + .stageFlags = IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING, + .offset = 0u, + .size = sizeof(SPushConstants), + }; + const auto pipelineLayout = m_device->createPipelineLayout({ &pcRange, 1 }, smart_refctd_ptr(descriptorSetLayout), nullptr, nullptr, nullptr); + + IGPURayTracingPipeline::SCreationParams params = {}; + + + const IGPUShader::SSpecInfo shaders[] = { + {.shader = raygenShader.get()}, + {.shader = closestHitShader.get()}, + {.shader = anyHitShaderColorPayload.get()}, + {.shader = anyHitShaderShadowPayload.get()}, + {.shader = missShader.get()}, + {.shader = shadowMissShader.get()}, + }; + + params.layout = pipelineLayout.get(); + params.shaders = std::span(shaders, std::size(shaders)); + params.cached.shaderGroups.raygenGroup = { + .shaderIndex = 0, + }; + params.cached.shaderGroups.hitGroups.push_back({ .closestHitShaderIndex = 1, .anyHitShaderIndex = 2 }); + params.cached.shaderGroups.hitGroups.push_back({ .closestHitShaderIndex = 1, .anyHitShaderIndex = 3 }); + params.cached.shaderGroups.missGroups.push_back({ .shaderIndex = 4 }); + params.cached.shaderGroups.missGroups.push_back({ .shaderIndex = 5 }); + params.cached.maxRecursionDepth = 2; + if (!m_device->createRayTracingPipelines(nullptr, { ¶ms, 1 }, &m_rayTracingPipeline)) + return logFail("Failed to create ray tracing pipeline"); + m_logger->log("Ray Tracing Pipeline Created!",system::ILogger::ELL_INFO); + + //create shader binding table + if (!createShaderBindingTable(gQueue, m_rayTracingPipeline)) + return logFail("Could not create shader binding table"); + } + + + // write descriptors + IGPUDescriptorSet::SDescriptorInfo infos[2]; + infos[0].desc = m_gpuTlas; + infos[1].desc = m_device->createImageView({ + .flags = IGPUImageView::ECF_NONE, + .subUsages = IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT, + .image = m_hdrImage, + .viewType = IGPUImageView::E_TYPE::ET_2D, + .format = asset::EF_R16G16B16A16_SFLOAT + }); + if (!infos[1].desc) + 
return logFail("Failed to create image view"); + infos[1].info.image.imageLayout = IImage::LAYOUT::GENERAL; + IGPUDescriptorSet::SWriteDescriptorSet writes[3] = { + {.dstSet = m_renderDs.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &infos[0]}, + {.dstSet = m_renderDs.get(), .binding = 1, .arrayElement = 0, .count = 1, .info = &infos[1]} + }; + m_device->updateDescriptorSets(std::span(writes, 2), {}); + + // camera + { + core::vectorSIMDf cameraPosition(-5.81655884, 2.58630896, -4.23974705); + core::vectorSIMDf cameraTarget(-0.349590302, -0.213266611, 0.317821503); + matrix4SIMD projectionMatrix = matrix4SIMD::buildProjectionMatrixPerspectiveFovLH(core::radians(60.0f), float(WIN_W) / WIN_H, 0.1, 1000); + m_camera = Camera(cameraPosition, cameraTarget, projectionMatrix, 1.069f, 0.4f); + } + + m_winMgr->show(m_window.get()); + m_oracle.reportBeginFrameRecord(); + + return true; + } + + inline void workLoopBody() override + { + const auto resourceIx = m_realFrameIx % MaxFramesInFlight; + + const uint32_t framesInFlight = core::min(MaxFramesInFlight, m_surface->getMaxAcquiresInFlight()); + + if (m_realFrameIx >= framesInFlight) + { + const ISemaphore::SWaitInfo cbDonePending[] = + { + { + .semaphore = m_semaphore.get(), + .value = m_realFrameIx + 1 - framesInFlight + } + }; + if (m_device->blockForSemaphores(cbDonePending) != ISemaphore::WAIT_RESULT::SUCCESS) + return; + } + + m_inputSystem->getDefaultMouse(&m_mouse); + m_inputSystem->getDefaultKeyboard(&m_keyboard); + + auto updatePresentationTimestamp = [&]() + { + m_currentImageAcquire = m_surface->acquireNextImage(); + + m_oracle.reportEndFrameRecord(); + const auto timestamp = m_oracle.getNextPresentationTimeStamp(); + m_oracle.reportBeginFrameRecord(); + + return timestamp; + }; + + const auto nextPresentationTimestamp = updatePresentationTimestamp(); + + if (!m_currentImageAcquire) + return; + + static bool first = true; + if (first) + { + m_api->startCapture(); + first = false; + } + + auto* const 
cmdbuf = m_cmdBufs.data()[resourceIx].get(); + cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); + cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + cmdbuf->beginDebugMarker("RaytracingPipelineApp Frame"); + { + m_camera.beginInputProcessing(nextPresentationTimestamp); + m_mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void + { + if (m_camera.mouseProcess(events)) + { + m_frameAccumulationCounter = 0; + } + }, m_logger.get()); + m_keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void + { + if (m_camera.keyboardProcess(events)) + { + m_frameAccumulationCounter = 0; + } + }, m_logger.get()); + m_camera.endInputProcessing(nextPresentationTimestamp); + + } + + const auto viewMatrix = m_camera.getViewMatrix(); + const auto projectionMatrix = m_camera.getProjectionMatrix(); + const auto viewProjectionMatrix = m_camera.getConcatenatedMatrix(); + + core::matrix3x4SIMD modelMatrix; + modelMatrix.setTranslation(nbl::core::vectorSIMDf(0, 0, 0, 0)); + modelMatrix.setRotation(quaternion(0, 0, 0)); + + core::matrix4SIMD modelViewProjectionMatrix = core::concatenateBFollowedByA(viewProjectionMatrix, modelMatrix); + core::matrix4SIMD invModelViewProjectionMatrix; + modelViewProjectionMatrix.getInverseTransform(invModelViewProjectionMatrix); + + auto* queue = getGraphicsQueue(); + + { + IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1]; + imageBarriers[0].barrier = { + .dep = { + .srcStageMask = PIPELINE_STAGE_FLAGS::NONE, + .srcAccessMask = ACCESS_FLAGS::NONE, + .dstStageMask = PIPELINE_STAGE_FLAGS::RAY_TRACING_SHADER_BIT, + .dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS + } + }; + imageBarriers[0].image = m_hdrImage.get(); + imageBarriers[0].subresourceRange = { + .aspectMask = IImage::EAF_COLOR_BIT, + .baseMipLevel = 0u, + .levelCount = 1u, + .baseArrayLayer = 0u, + .layerCount = 1u + }; + imageBarriers[0].oldLayout = IImage::LAYOUT::UNDEFINED; + 
imageBarriers[0].newLayout = IImage::LAYOUT::GENERAL; + cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers }); + } + + // do ray query + SPushConstants pc; + pc.geometryInfoBuffer = m_geometryInfoBuffer->getDeviceAddress(); + pc.frameCounter = m_frameAccumulationCounter; + const core::vector3df camPos = m_camera.getPosition().getAsVector3df(); + pc.camPos = { camPos.X, camPos.Y, camPos.Z }; + memcpy(&pc.invMVP, invModelViewProjectionMatrix.pointer(), sizeof(pc.invMVP)); + + cmdbuf->bindRayTracingPipeline(m_rayTracingPipeline.get()); + cmdbuf->pushConstants(m_rayTracingPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING, 0, sizeof(SPushConstants), &pc); + cmdbuf->bindDescriptorSets(EPBP_RAY_TRACING, m_rayTracingPipeline->getLayout(), 0, 1, &m_renderDs.get()); + cmdbuf->traceRays(m_shaderBindingTable.raygenGroupRegion, + m_shaderBindingTable.missGroupsRegion, + m_shaderBindingTable.hitGroupsRegion, + m_shaderBindingTable.callableGroupsRegion, + WIN_W, WIN_H, 1); + + // blit + { + IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[2]; + imageBarriers[0].barrier = { + .dep = { + .srcStageMask = PIPELINE_STAGE_FLAGS::RAY_TRACING_SHADER_BIT, + .srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS, + .dstStageMask = PIPELINE_STAGE_FLAGS::BLIT_BIT, + .dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT + } + }; + imageBarriers[0].image = m_hdrImage.get(); + imageBarriers[0].subresourceRange = { + .aspectMask = IImage::EAF_COLOR_BIT, + .baseMipLevel = 0u, + .levelCount = 1u, + .baseArrayLayer = 0u, + .layerCount = 1u + }; + imageBarriers[0].oldLayout = IImage::LAYOUT::UNDEFINED; + imageBarriers[0].newLayout = IImage::LAYOUT::TRANSFER_SRC_OPTIMAL; + + imageBarriers[1].barrier = { + .dep = { + .srcStageMask = PIPELINE_STAGE_FLAGS::NONE, + .srcAccessMask = ACCESS_FLAGS::NONE, + .dstStageMask = PIPELINE_STAGE_FLAGS::BLIT_BIT, + .dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT + } + }; + 
imageBarriers[1].image = m_surface->getSwapchainResources()->getImage(m_currentImageAcquire.imageIndex); + imageBarriers[1].subresourceRange = { + .aspectMask = IImage::EAF_COLOR_BIT, + .baseMipLevel = 0u, + .levelCount = 1u, + .baseArrayLayer = 0u, + .layerCount = 1u + }; + imageBarriers[1].oldLayout = IImage::LAYOUT::UNDEFINED; + imageBarriers[1].newLayout = IImage::LAYOUT::TRANSFER_DST_OPTIMAL; + + cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers }); + } + + { + IGPUCommandBuffer::SImageBlit regions[] = { { + .srcMinCoord = {0,0,0}, + .srcMaxCoord = {WIN_W,WIN_H,1}, + .dstMinCoord = {0,0,0}, + .dstMaxCoord = {WIN_W,WIN_H,1}, + .layerCount = 1, + .srcBaseLayer = 0, + .dstBaseLayer = 0, + .srcMipLevel = 0, + .dstMipLevel = 0, + .aspectMask = IGPUImage::E_ASPECT_FLAGS::EAF_COLOR_BIT + } }; + + auto srcImg = m_hdrImage.get(); + auto scRes = static_cast(m_surface->getSwapchainResources()); + auto dstImg = scRes->getImage(m_currentImageAcquire.imageIndex); + + cmdbuf->blitImage(srcImg, IImage::LAYOUT::TRANSFER_SRC_OPTIMAL, dstImg, IImage::LAYOUT::TRANSFER_DST_OPTIMAL, regions, ISampler::ETF_NEAREST); + } + + // TODO: transition to present + { + IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1]; + imageBarriers[0].barrier = { + .dep = { + .srcStageMask = PIPELINE_STAGE_FLAGS::BLIT_BIT, + .srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT, + .dstStageMask = PIPELINE_STAGE_FLAGS::NONE, + .dstAccessMask = ACCESS_FLAGS::NONE + } + }; + imageBarriers[0].image = m_surface->getSwapchainResources()->getImage(m_currentImageAcquire.imageIndex); + imageBarriers[0].subresourceRange = { + .aspectMask = IImage::EAF_COLOR_BIT, + .baseMipLevel = 0u, + .levelCount = 1u, + .baseArrayLayer = 0u, + .layerCount = 1u + }; + imageBarriers[0].oldLayout = IImage::LAYOUT::TRANSFER_DST_OPTIMAL; + imageBarriers[0].newLayout = IImage::LAYOUT::PRESENT_SRC; + + cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = 
imageBarriers }); + } + + cmdbuf->endDebugMarker(); + cmdbuf->end(); + + { + const IQueue::SSubmitInfo::SSemaphoreInfo rendered[] = + { + { + .semaphore = m_semaphore.get(), + .value = ++m_realFrameIx, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS + } + }; + { + { + const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] = + { + {.cmdbuf = cmdbuf } + }; + + const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = + { + { + .semaphore = m_currentImageAcquire.semaphore, + .value = m_currentImageAcquire.acquireCount, + .stageMask = PIPELINE_STAGE_FLAGS::NONE + } + }; + const IQueue::SSubmitInfo infos[] = + { + { + .waitSemaphores = acquired, + .commandBuffers = commandBuffers, + .signalSemaphores = rendered + } + }; + + if (queue->submit(infos) == IQueue::RESULT::SUCCESS) + { + const nbl::video::ISemaphore::SWaitInfo waitInfos[] = + { { + .semaphore = m_semaphore.get(), + .value = m_realFrameIx + } }; + + m_device->blockForSemaphores(waitInfos); // this is not solution, quick wa to not throw validation errors + } + else + --m_realFrameIx; + } + } + + std::string caption = "[Nabla Engine] Ray Tracing Pipeline"; + { + caption += ", displaying [all objects]"; + m_window->setCaption(caption); + } + m_surface->present(m_currentImageAcquire.imageIndex, rendered); + } + + m_frameAccumulationCounter++; + } + + inline bool keepRunning() override + { + if (m_surface->irrecoverable()) + return false; + + return true; + } + + inline bool onAppTerminated() override + { + return device_base_t::onAppTerminated(); + } + +private: + uint32_t getWorkgroupCount(uint32_t dim, uint32_t size) + { + return (dim + size - 1) / size; + } + + smart_refctd_ptr createBuffer(IGPUBuffer::SCreationParams& params) + { + smart_refctd_ptr buffer; + buffer = m_device->createBuffer(std::move(params)); + auto bufReqs = buffer->getMemoryReqs(); + bufReqs.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits(); + m_device->allocate(bufReqs, buffer.get(), 
IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); + + return buffer; + } + + smart_refctd_ptr getSingleUseCommandBufferAndBegin(smart_refctd_ptr pool) + { + smart_refctd_ptr cmdbuf; + if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdbuf)) + return nullptr; + + cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); + cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + + return cmdbuf; + } + + void cmdbufSubmitAndWait(smart_refctd_ptr cmdbuf, CThreadSafeQueueAdapter* queue, uint64_t startValue) + { + cmdbuf->end(); + + uint64_t finishedValue = startValue + 1; + + // submit builds + { + auto completed = m_device->createSemaphore(startValue); + + std::array signals; + { + auto& signal = signals.front(); + signal.value = finishedValue; + signal.stageMask = bitflag(PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS); + signal.semaphore = completed.get(); + } + + const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[1] = { { + .cmdbuf = cmdbuf.get() + } }; + + const IQueue::SSubmitInfo infos[] = + { + { + .waitSemaphores = {}, + .commandBuffers = commandBuffers, + .signalSemaphores = signals + } + }; + + if (queue->submit(infos) != IQueue::RESULT::SUCCESS) + { + m_logger->log("Failed to submit geometry transfer upload operations!", ILogger::ELL_ERROR); + return; + } + + const ISemaphore::SWaitInfo info[] = + { { + .semaphore = completed.get(), + .value = finishedValue + } }; + + m_device->blockForSemaphores(info); + } + } + + bool createGeometries(video::CThreadSafeQueueAdapter* queue, const IGeometryCreator* gc) + { + auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + if (!pool) + return logFail("Couldn't create Command Pool for geometry creation!"); + + const auto defaultMaterial = Material{ + .ambient = {0.1, 0.1, 0.1}, + .diffuse = {0.8, 0.3, 0.3}, + .specular = {0.8, 0.8, 0.8}, + .shininess = 1.0f, + .illum = 2 + }; + + auto 
getTranslationMatrix = [](float32_t x, float32_t y, float32_t z) + { + core::matrix3x4SIMD transform; + transform.setTranslation(nbl::core::vectorSIMDf(x, y, z, 0)); + return transform; + }; + + core::matrix3x4SIMD planeTransform; + planeTransform.setRotation(quaternion::fromAngleAxis(core::radians(-90.0f), vector3df_SIMD{1, 0, 0})); + + const auto cpuObjects = std::array{ + ReferenceObjectCpu { + .meta = {.type = OT_RECTANGLE, .name = "Plane Mesh"}, + .data = gc->createRectangleMesh(nbl::core::vector2df_SIMD(10, 10)), + .material = defaultMaterial, + .transform = planeTransform, + }, + ReferenceObjectCpu { + .meta = {.type = OT_CUBE, .name = "Cube Mesh"}, + .data = gc->createCubeMesh(nbl::core::vector3df(1, 1, 1)), + .material = defaultMaterial, + .transform = getTranslationMatrix(0, 0.5f, 0), + }, + ReferenceObjectCpu { + .meta = {.type = OT_SPHERE, .name = "Sphere Mesh"}, + .data = gc->createSphereMesh(2, 16, 16), + .material = { + .ambient = {0.1, 0.1, 0.1}, + .diffuse = {0.2, 0.2, 0.8}, + .specular = {0.8, 0.8, 0.8}, + .shininess = 1.0f, + .illum = 2 + }, + .transform = getTranslationMatrix(-5.0f, 1.0f, 0), + }, + ReferenceObjectCpu { + .meta = {.type = OT_SPHERE, .name = "Transparent Sphere Mesh"}, + .data = gc->createSphereMesh(2, 16, 16), + .material = { + .ambient = {0.1, 0.1, 0.1}, + .diffuse = {0.2, 0.8, 0.2}, + .specular = {0.8, 0.8, 0.8}, + .shininess = 1.0f, + .dissolve = 0.2, + .illum = 4 + }, + .transform = getTranslationMatrix(5.0f, 1.0f, 0), + }, + }; + + struct ScratchVIBindings + { + nbl::asset::SBufferBinding vertex, index; + }; + std::array scratchBuffers; + + for (uint32_t i = 0; i < cpuObjects.size(); i++) + { + const auto& cpuObject = cpuObjects[i]; + + auto vBuffer = smart_refctd_ptr(cpuObject.data.bindings[0].buffer); // no offset + auto vUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | + IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | 
IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + vBuffer->addUsageFlags(vUsage); + vBuffer->setContentHash(vBuffer->computeContentHash()); + + auto iBuffer = smart_refctd_ptr(cpuObject.data.indexBuffer.buffer); // no offset + auto iUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | + IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + + if (cpuObject.data.indexType != EIT_UNKNOWN) + if (iBuffer) + { + iBuffer->addUsageFlags(iUsage); + iBuffer->setContentHash(iBuffer->computeContentHash()); + } + + scratchBuffers[i] = { + .vertex = {.offset = 0, .buffer = vBuffer}, + .index = {.offset = 0, .buffer = iBuffer}, + }; + + } + + auto cmdbuf = getSingleUseCommandBufferAndBegin(pool); + cmdbuf->beginDebugMarker("Build geometry vertex and index buffers"); + + CAssetConverter::SInputs inputs = {}; + inputs.logger = m_logger.get(); + std::array tmpBuffers; + { + for (uint32_t i = 0; i < cpuObjects.size(); i++) + { + tmpBuffers[2 * i + 0] = scratchBuffers[i].vertex.buffer.get(); + tmpBuffers[2 * i + 1] = scratchBuffers[i].index.buffer.get(); + } + + std::get>(inputs.assets) = tmpBuffers; + } + + auto reservation = m_converter->reserve(inputs); + { + auto prepass = [&](const auto & references) -> bool + { + auto objects = reservation.getGPUObjects(); + uint32_t counter = {}; + for (auto& object : objects) + { + auto gpu = object.value; + auto* reference = references[counter]; + + if (reference) + { + if (!gpu) + { + m_logger->log("Failed to convert a CPU object to GPU!", ILogger::ELL_ERROR); + return false; + } + } + counter++; + } + return true; + }; + + prepass.template operator() < ICPUBuffer > (tmpBuffers); + } + + auto geomInfoBuffer = ICPUBuffer::create({ std::size(cpuObjects) * sizeof(SGeomInfo) }); + SGeomInfo* geomInfos = reinterpret_cast(geomInfoBuffer->getPointer()); + + m_gpuObjects.reserve(std::size(cpuObjects)); + // convert + { 
+ // not sure if need this (probably not, originally for transition img view) + auto semaphore = m_device->createSemaphore(0u); + + std::array cmdbufs = {}; + cmdbufs.front().cmdbuf = cmdbuf.get(); + + SIntendedSubmitInfo transfer = {}; + transfer.queue = queue; + transfer.scratchCommandBuffers = cmdbufs; + transfer.scratchSemaphore = { + .semaphore = semaphore.get(), + .value = 0u, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS + }; + + CAssetConverter::SConvertParams params = {}; + params.utilities = m_utils.get(); + params.transfer = &transfer; + + auto future = reservation.convert(params); + if (future.copy() != IQueue::RESULT::SUCCESS) + { + m_logger->log("Failed to await submission feature!", ILogger::ELL_ERROR); + return false; + } + + auto&& buffers = reservation.getGPUObjects(); + for (uint32_t i = 0; i < cpuObjects.size(); i++) + { + auto& cpuObject = cpuObjects[i]; + + m_gpuObjects.push_back(ReferenceObjectGpu{ + .meta = cpuObject.meta, + .bindings = { + .vertex = {.offset = 0, .buffer = buffers[2 * i + 0].value }, + .index = {.offset = 0, .buffer = buffers[2 * i + 1].value }, + }, + .vertexStride = cpuObject.data.inputParams.bindings[0].stride, + .indexType = cpuObject.data.indexType, + .indexCount = cpuObject.data.indexCount, + .material = cpuObject.material, + .transform = cpuObject.transform, + }); + } + + for (uint32_t i = 0; i < m_gpuObjects.size(); i++) + { + const auto& gpuObject = m_gpuObjects[i]; + const uint64_t vertexBufferAddress = gpuObject.bindings.vertex.buffer->getDeviceAddress(); + geomInfos[i] = { + .vertexBufferAddress = vertexBufferAddress, + .indexBufferAddress = gpuObject.useIndex() ? 
gpuObject.bindings.index.buffer->getDeviceAddress() : vertexBufferAddress, + .vertexStride = gpuObject.vertexStride, + .indexType = gpuObject.indexType, + .smoothNormals = s_smoothNormals[gpuObject.meta.type], + .objType = gpuObject.meta.type, + .material = gpuObject.material, + }; + } + } + + { + IGPUBuffer::SCreationParams params; + params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + params.size = geomInfoBuffer->getSize(); + m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), geomInfos).move_into(m_geometryInfoBuffer); + } + + return true; + } + + bool createShaderBindingTable(video::CThreadSafeQueueAdapter* queue, const smart_refctd_ptr& pipeline) + { + const auto& limits = m_device->getPhysicalDevice()->getLimits(); + const auto handleSize = limits.shaderGroupHandleSize; + const auto handleSizeAligned = nbl::core::alignUp(handleSize, limits.shaderGroupHandleAlignment); + + auto& raygenRegion = m_shaderBindingTable.raygenGroupRegion; + auto& hitRegion = m_shaderBindingTable.hitGroupsRegion; + auto& missRegion = m_shaderBindingTable.missGroupsRegion; + auto& callableRegion = m_shaderBindingTable.callableGroupsRegion; + + raygenRegion = { + .offset = 0, + .stride = core::alignUp(handleSizeAligned, limits.shaderGroupBaseAlignment), + .size = core::alignUp(handleSizeAligned, limits.shaderGroupBaseAlignment) + }; + + missRegion = { + .offset = raygenRegion.size, + .stride = handleSizeAligned, + .size = core::alignUp(pipeline->getMissGroupCount(), limits.shaderGroupBaseAlignment), + }; + + hitRegion = { + .offset = missRegion.offset + missRegion.size, + .stride = handleSizeAligned, + .size = core::alignUp(pipeline->getHitGroupCount(), limits.shaderGroupBaseAlignment), + }; + + callableRegion = { + .offset = hitRegion.offset + hitRegion.size, + .stride = handleSizeAligned, + .size = 
core::alignUp(pipeline->getCallableGroupCount(), limits.shaderGroupBaseAlignment), + }; + + const auto bufferSize = raygenRegion.size + missRegion.size + hitRegion.size + callableRegion.size; + + ICPUBuffer::SCreationParams cpuBufferParams; + cpuBufferParams.size = bufferSize; + auto cpuBuffer = ICPUBuffer::create(std::move(cpuBufferParams)); + uint8_t* pData = reinterpret_cast(cpuBuffer->getPointer()); + + // copy raygen region + memcpy(pData, pipeline->getRaygenGroupShaderHandle().data(), handleSize); + + // copy miss region + uint8_t* pMissData = pData + missRegion.offset; + for (int32_t missIx = 0; missIx < pipeline->getMissGroupCount(); missIx++) + { + memcpy(pMissData, pipeline->getMissGroupShaderHandle(missIx).data(), handleSize); + pMissData += missRegion.stride; + } + + // copy hit region + uint8_t* pHitData = pData + hitRegion.offset; + for (int32_t hitIx = 0; hitIx < pipeline->getHitGroupCount(); hitIx++) + { + memcpy(pHitData, pipeline->getHitGroupShaderHandle(hitIx).data(), handleSize); + pHitData += hitRegion.stride; + } + + // copy callable region + uint8_t* pCallableData = pData + callableRegion.offset; + for (int32_t callableIx = 0; callableIx < pipeline->getCallableGroupCount(); callableIx++) + { + memcpy(pCallableData, pipeline->getCallableGroupShaderHandle(callableIx).data(), handleSize); + pCallableData += callableRegion.stride; + } + + { + IGPUBuffer::SCreationParams params; + params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_SHADER_BINDING_TABLE_BIT; + params.size = bufferSize; + m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), pData).move_into(raygenRegion.buffer); + m_logger->log("Device address : %d", ILogger::ELL_INFO, raygenRegion.buffer->getDeviceAddress()); + missRegion.buffer = core::smart_refctd_ptr(raygenRegion.buffer); + hitRegion.buffer = 
core::smart_refctd_ptr(raygenRegion.buffer); + callableRegion.buffer = core::smart_refctd_ptr(raygenRegion.buffer); + } + + return true; + } + + bool createAccelerationStructures(video::CThreadSafeQueueAdapter* queue) + { + IQueryPool::SCreationParams qParams{ .queryCount = static_cast(m_gpuObjects.size()), .queryType = IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE}; + smart_refctd_ptr queryPool = m_device->createQueryPool(std::move(qParams)); + + auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT); + if (!pool) + return logFail("Couldn't create Command Pool for blas/tlas creation!"); + + m_api->startCapture(); +#ifdef TRY_BUILD_FOR_NGFX // NSight is "debugger-challenged" it can't capture anything not happenning "during a frame", so we need to trick it + m_currentImageAcquire = m_surface->acquireNextImage(); + { + const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { { + .semaphore = m_currentImageAcquire.semaphore, + .value = m_currentImageAcquire.acquireCount, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS + } }; + m_surface->present(m_currentImageAcquire.imageIndex, acquired); + } + m_currentImageAcquire = m_surface->acquireNextImage(); +#endif + size_t totalScratchSize = 0; + + // build bottom level ASes + { + core::vector blasBuildInfos(m_gpuObjects.size()); + core::vector primitiveCounts(m_gpuObjects.size()); + core::vector> triangles(m_gpuObjects.size()); + core::vector scratchSizes(m_gpuObjects.size()); + m_gpuBlasList.resize(m_gpuObjects.size()); + + for (uint32_t i = 0; i < m_gpuObjects.size(); i++) + { + const auto& gpuObject = m_gpuObjects[i]; + + const uint32_t vertexStride = gpuObject.vertexStride; + const uint32_t numVertices = gpuObject.bindings.vertex.buffer->getSize() / vertexStride; + if (gpuObject.useIndex()) + primitiveCounts[i] = gpuObject.indexCount / 3; + else + primitiveCounts[i] = numVertices / 3; + + 
triangles[i].vertexData[0] = gpuObject.bindings.vertex; + triangles[i].indexData = gpuObject.useIndex() ? gpuObject.bindings.index : gpuObject.bindings.vertex; + triangles[i].maxVertex = numVertices - 1; + triangles[i].vertexStride = vertexStride; + triangles[i].vertexFormat = EF_R32G32B32_SFLOAT; + triangles[i].indexType = gpuObject.indexType; + triangles[i].geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::NO_DUPLICATE_ANY_HIT_INVOCATION_BIT; + + auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT; + if (m_physicalDevice->getProperties().limits.rayTracingPositionFetch) + blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_DATA_ACCESS_KHR; + + blasBuildInfos[i].buildFlags = blasFlags; + blasBuildInfos[i].geometryCount = 1; // only 1 geometry object per blas + blasBuildInfos[i].srcAS = nullptr; + blasBuildInfos[i].dstAS = nullptr; + blasBuildInfos[i].triangles = &triangles[i]; + blasBuildInfos[i].scratch = {}; + + ILogicalDevice::AccelerationStructureBuildSizes buildSizes; + { + const uint32_t maxPrimCount[1] = { primitiveCounts[i] }; + buildSizes = m_device->getAccelerationStructureBuildSizes(blasFlags, false, std::span{ &triangles[i], 1 }, maxPrimCount); + if (!buildSizes) + return logFail("Failed to get BLAS build sizes"); + } + + scratchSizes[i] = buildSizes.buildScratchSize; + totalScratchSize += buildSizes.buildScratchSize; + + { + IGPUBuffer::SCreationParams params; + params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT; + params.size = buildSizes.accelerationStructureSize; + smart_refctd_ptr asBuffer = createBuffer(params); + + IGPUBottomLevelAccelerationStructure::SCreationParams blasParams; + blasParams.bufferRange.buffer = asBuffer; + blasParams.bufferRange.offset = 0u; + blasParams.bufferRange.size = 
buildSizes.accelerationStructureSize; + blasParams.flags = IGPUBottomLevelAccelerationStructure::SCreationParams::FLAGS::NONE; + m_gpuBlasList[i] = m_device->createBottomLevelAccelerationStructure(std::move(blasParams)); + if (!m_gpuBlasList[i]) + return logFail("Could not create BLAS"); + } + } + + auto cmdbufBlas = getSingleUseCommandBufferAndBegin(pool); + cmdbufBlas->beginDebugMarker("Build BLAS"); + + cmdbufBlas->resetQueryPool(queryPool.get(), 0, m_gpuObjects.size()); + + smart_refctd_ptr scratchBuffer; + { + IGPUBuffer::SCreationParams params; + params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT; + params.size = totalScratchSize; + scratchBuffer = createBuffer(params); + } + + uint32_t queryCount = 0; + core::vector buildRangeInfos(m_gpuObjects.size()); + core::vector pRangeInfos(m_gpuObjects.size()); + for (uint32_t i = 0; i < m_gpuObjects.size(); i++) + { + blasBuildInfos[i].dstAS = m_gpuBlasList[i].get(); + blasBuildInfos[i].scratch.buffer = scratchBuffer; + blasBuildInfos[i].scratch.offset = (i == 0) ? 
0u : blasBuildInfos[i - 1].scratch.offset + scratchSizes[i - 1]; + + buildRangeInfos[i].primitiveCount = primitiveCounts[i]; + buildRangeInfos[i].primitiveByteOffset = 0u; + buildRangeInfos[i].firstVertex = 0u; + buildRangeInfos[i].transformByteOffset = 0u; + + pRangeInfos[i] = &buildRangeInfos[i]; + } + + if (!cmdbufBlas->buildAccelerationStructures(std::span(blasBuildInfos), pRangeInfos.data())) + return logFail("Failed to build BLAS"); + + { + SMemoryBarrier memBarrier; + memBarrier.srcStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT; + memBarrier.srcAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT; + memBarrier.dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT; + memBarrier.dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_READ_BIT; + cmdbufBlas->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} }); + } + + + core::vector ases(m_gpuObjects.size()); + for (uint32_t i = 0; i < m_gpuObjects.size(); i++) + ases[i] = m_gpuBlasList[i].get(); + if (!cmdbufBlas->writeAccelerationStructureProperties(std::span(ases), IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE, + queryPool.get(), queryCount++)) + return logFail("Failed to write acceleration structure properties!"); + + cmdbufBlas->endDebugMarker(); + cmdbufSubmitAndWait(cmdbufBlas, queue, 39); + } + + auto cmdbufCompact = getSingleUseCommandBufferAndBegin(pool); + cmdbufCompact->beginDebugMarker("Compact BLAS"); + + // compact blas + { + core::vector asSizes(m_gpuObjects.size(), 0); + if (!m_device->getQueryPoolResults(queryPool.get(), 0, m_gpuObjects.size(), asSizes.data(), sizeof(size_t), IQueryPool::WAIT_BIT)) + return logFail("Could not get query pool results for AS sizes"); + + core::vector> cleanupBlas(m_gpuObjects.size()); + for (uint32_t i = 0; i < m_gpuObjects.size(); i++) + { + cleanupBlas[i] = m_gpuBlasList[i]; + { + IGPUBuffer::SCreationParams params; + params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | 
IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT; + params.size = asSizes[i]; + smart_refctd_ptr asBuffer = createBuffer(params); + + IGPUBottomLevelAccelerationStructure::SCreationParams blasParams; + blasParams.bufferRange.buffer = asBuffer; + blasParams.bufferRange.offset = 0u; + blasParams.bufferRange.size = asSizes[i]; + blasParams.flags = IGPUBottomLevelAccelerationStructure::SCreationParams::FLAGS::NONE; + m_gpuBlasList[i] = m_device->createBottomLevelAccelerationStructure(std::move(blasParams)); + if (!m_gpuBlasList[i]) + return logFail("Could not create compacted BLAS"); + } + + IGPUBottomLevelAccelerationStructure::CopyInfo copyInfo; + copyInfo.src = cleanupBlas[i].get(); + copyInfo.dst = m_gpuBlasList[i].get(); + copyInfo.mode = IGPUBottomLevelAccelerationStructure::COPY_MODE::COMPACT; + if (!cmdbufCompact->copyAccelerationStructure(copyInfo)) + return logFail("Failed to copy AS to compact"); + } + } + + cmdbufCompact->endDebugMarker(); + cmdbufSubmitAndWait(cmdbufCompact, queue, 40); + + auto cmdbufTlas = getSingleUseCommandBufferAndBegin(pool); + cmdbufTlas->beginDebugMarker("Build TLAS"); + + // build top level AS + { + const uint32_t instancesCount = m_gpuObjects.size(); + core::vector instances(m_gpuObjects.size()); + for (uint32_t i = 0; i < instancesCount; i++) + { + instances[i].base.blas.deviceAddress = m_gpuBlasList[i]->getReferenceForDeviceOperations().deviceAddress; + instances[i].base.mask = 0xFF; + instances[i].base.instanceCustomIndex = i; + instances[i].base.instanceShaderBindingTableRecordOffset = 0; + instances[i].base.flags = static_cast(IGPUTopLevelAccelerationStructure::INSTANCE_FLAGS::TRIANGLE_FACING_CULL_DISABLE_BIT); + instances[i].transform = m_gpuObjects[i].transform; + } + + { + size_t bufSize = instancesCount * sizeof(IGPUTopLevelAccelerationStructure::DeviceStaticInstance); + IGPUBuffer::SCreationParams params; + params.usage = bitflag(IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT) | 
IGPUBuffer::EUF_STORAGE_BUFFER_BIT | + IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + params.size = bufSize; + m_instanceBuffer = createBuffer(params); + + SBufferRange range = { .offset = 0u, .size = bufSize, .buffer = m_instanceBuffer }; + cmdbufTlas->updateBuffer(range, instances.data()); + } + + // make sure instances upload complete first + { + SMemoryBarrier memBarrier; + memBarrier.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS; + memBarrier.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT; + memBarrier.dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT; + memBarrier.dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT; + cmdbufTlas->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} }); + } + + auto tlasFlags = bitflag(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT); + + IGPUTopLevelAccelerationStructure::DeviceBuildInfo tlasBuildInfo; + tlasBuildInfo.buildFlags = tlasFlags; + tlasBuildInfo.srcAS = nullptr; + tlasBuildInfo.dstAS = nullptr; + tlasBuildInfo.instanceData.buffer = m_instanceBuffer; + tlasBuildInfo.instanceData.offset = 0u; + tlasBuildInfo.scratch = {}; + + auto buildSizes = m_device->getAccelerationStructureBuildSizes(tlasFlags, false, instancesCount); + if (!buildSizes) + return logFail("Failed to get TLAS build sizes"); + + { + IGPUBuffer::SCreationParams params; + params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT; + params.size = buildSizes.accelerationStructureSize; + smart_refctd_ptr asBuffer = createBuffer(params); + + IGPUTopLevelAccelerationStructure::SCreationParams tlasParams; + tlasParams.bufferRange.buffer = asBuffer; + tlasParams.bufferRange.offset = 0u; + tlasParams.bufferRange.size = buildSizes.accelerationStructureSize; + tlasParams.flags = 
IGPUTopLevelAccelerationStructure::SCreationParams::FLAGS::NONE; + m_gpuTlas = m_device->createTopLevelAccelerationStructure(std::move(tlasParams)); + if (!m_gpuTlas) + return logFail("Could not create TLAS"); + } + + smart_refctd_ptr scratchBuffer; + { + IGPUBuffer::SCreationParams params; + params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT; + params.size = buildSizes.buildScratchSize; + scratchBuffer = createBuffer(params); + } + + tlasBuildInfo.dstAS = m_gpuTlas.get(); + tlasBuildInfo.scratch.buffer = scratchBuffer; + tlasBuildInfo.scratch.offset = 0u; + + IGPUTopLevelAccelerationStructure::BuildRangeInfo buildRangeInfo[1u]; + buildRangeInfo[0].instanceCount = instancesCount; + buildRangeInfo[0].instanceByteOffset = 0u; + IGPUTopLevelAccelerationStructure::BuildRangeInfo* pRangeInfos; + pRangeInfos = &buildRangeInfo[0]; + + if (!cmdbufTlas->buildAccelerationStructures({ &tlasBuildInfo, 1 }, pRangeInfos)) + return logFail("Failed to build TLAS"); + } + + cmdbufTlas->endDebugMarker(); + cmdbufSubmitAndWait(cmdbufTlas, queue, 45); + +#ifdef TRY_BUILD_FOR_NGFX + { + const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { { + .semaphore = m_currentImageAcquire.semaphore, + .value = m_currentImageAcquire.acquireCount, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS + } }; + m_surface->present(m_currentImageAcquire.imageIndex, acquired); + } +#endif + m_api->endCapture(); + + return true; + } + + + smart_refctd_ptr m_window; + smart_refctd_ptr> m_surface; + smart_refctd_ptr m_semaphore; + uint64_t m_realFrameIx = 0; + uint32_t m_frameAccumulationCounter = -1; + std::array, MaxFramesInFlight> m_cmdBufs; + ISimpleManagedSurface::SAcquireResult m_currentImageAcquire = {}; + + core::smart_refctd_ptr m_inputSystem; + InputSystem::ChannelReader m_mouse; + InputSystem::ChannelReader m_keyboard; + + Camera m_camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD()); + CameraView 
m_oldCameraView; + video::CDumbPresentationOracle m_oracle; + + std::vector m_gpuObjects; + + std::vector> m_gpuBlasList; + smart_refctd_ptr m_gpuTlas; + smart_refctd_ptr m_instanceBuffer; + + smart_refctd_ptr m_geometryInfoBuffer; + ShaderBindingTable m_shaderBindingTable; + smart_refctd_ptr m_hdrImage; + + smart_refctd_ptr m_rayTracingPipeline; + smart_refctd_ptr m_renderDs; + smart_refctd_ptr m_renderPool; + + smart_refctd_ptr m_converter; + smart_refctd_ptr m_sbtBuffer; + + uint16_t gcIndex = {}; + +}; + +NBL_MAIN_FUNC(RaytracingPipelineApp) diff --git a/CMakeLists.txt b/CMakeLists.txt index d840850a6..bd200d8a1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -93,6 +93,7 @@ if(NBL_BUILD_EXAMPLES) add_subdirectory(67_RayQueryGeometry EXCLUDE_FROM_ALL) add_subdirectory(70_FLIPFluids EXCLUDE_FROM_ALL) + add_subdirectory(71_RayTracingPipeline EXCLUDE_FROM_ALL) NBL_HOOK_COMMON_API("${NBL_COMMON_API_TARGETS}") endif() diff --git a/common/include/CCamera.hpp b/common/include/CCamera.hpp index 1b0fe9c0f..d9f31a260 100644 --- a/common/include/CCamera.hpp +++ b/common/include/CCamera.hpp @@ -132,8 +132,10 @@ class Camera public: - void mouseProcess(const nbl::ui::IMouseEventChannel::range_t& events) + // return whether camera is moved by mouse + bool mouseProcess(const nbl::ui::IMouseEventChannel::range_t& events) { + bool cameraMoved = false; for (auto eventIt=events.begin(); eventIt!=events.end(); eventIt++) { auto ev = *eventIt; @@ -179,11 +181,15 @@ class Camera mat.transformVect(localTarget); setTarget(localTarget + pos); + + cameraMoved = true; } } + return cameraMoved; } - void keyboardProcess(const nbl::ui::IKeyboardEventChannel::range_t& events) + // return whether camera is moved by keyboard + bool keyboardProcess(const nbl::ui::IKeyboardEventChannel::range_t& events) { for(uint32_t k = 0; k < E_CAMERA_MOVE_KEYS::ECMK_COUNT; ++k) perActionDt[k] = 0.0; @@ -194,12 +200,14 @@ class Camera * And If an UP event was sent It will get subtracted it from this value. 
(Currently Disabled Because we Need better Oracle) */ + bool cameraMoved = false; for(uint32_t k = 0; k < E_CAMERA_MOVE_KEYS::ECMK_COUNT; ++k) if(keysDown[k]) { auto timeDiff = std::chrono::duration_cast(nextPresentationTimeStamp - lastVirtualUpTimeStamp).count(); assert(timeDiff >= 0); perActionDt[k] += timeDiff; + cameraMoved = true; } for (auto eventIt=events.begin(); eventIt!=events.end(); eventIt++) @@ -237,8 +245,11 @@ class Camera position = initialPosition; target = initialTarget; recomputeViewMatrix(); + cameraMoved = true; } } + + return cameraMoved; } void beginInputProcessing(std::chrono::microseconds _nextPresentationTimeStamp) From c991e20d986b6d427b64925b3730f6723b62079e Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 22 Jan 2025 21:34:34 +0700 Subject: [PATCH 003/296] Add ImGui Overlay Signed-off-by: kevyuu --- 71_RayTracingPipeline/CMakeLists.txt | 45 +- .../app_resources/present.frag.hlsl | 19 + 71_RayTracingPipeline/include/common.hpp | 4 + 71_RayTracingPipeline/main.cpp | 710 ++++++++++++------ 4 files changed, 528 insertions(+), 250 deletions(-) create mode 100644 71_RayTracingPipeline/app_resources/present.frag.hlsl diff --git a/71_RayTracingPipeline/CMakeLists.txt b/71_RayTracingPipeline/CMakeLists.txt index 4a555f4ce..07b0fd396 100644 --- a/71_RayTracingPipeline/CMakeLists.txt +++ b/71_RayTracingPipeline/CMakeLists.txt @@ -1,28 +1,37 @@ -set(NBL_INCLUDE_SEARCH_DIRECTORIES - "${CMAKE_CURRENT_SOURCE_DIR}/include" -) - include(common RESULT_VARIABLE RES) if(NOT RES) - message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory") + message(FATAL_ERROR "common.cmake not found. 
Should be in {repo_root}/cmake directory") endif() -nbl_create_executable_project("" "" "${NBL_INCLUDE_SEARCH_DIRECTORIES}" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") +if(NBL_BUILD_IMGUI) + set(NBL_INCLUDE_SERACH_DIRECTORIES + "${CMAKE_CURRENT_SOURCE_DIR}/include" + ) + + list(APPEND NBL_LIBRARIES + imtestengine + "${NBL_EXT_IMGUI_UI_LIB}" + ) + + nbl_create_executable_project("" "" "${NBL_INCLUDE_SERACH_DIRECTORIES}" "${NBL_LIBRARIES}" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") -if(NBL_EMBED_BUILTIN_RESOURCES) - set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData) - set(RESOURCE_DIR "app_resources") + if(NBL_EMBED_BUILTIN_RESOURCES) + set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData) + set(RESOURCE_DIR "app_resources") - get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) - get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE) - get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE) + get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE) - file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*") - foreach(RES_FILE ${BUILTIN_RESOURCE_FILES}) - LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}") - endforeach() + file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*") + foreach(RES_FILE ${BUILTIN_RESOURCE_FILES}) + LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}") + endforeach() - ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" 
"nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}") + ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}") - LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_}) + LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_}) + endif() endif() + + diff --git a/71_RayTracingPipeline/app_resources/present.frag.hlsl b/71_RayTracingPipeline/app_resources/present.frag.hlsl new file mode 100644 index 000000000..00ab6e31d --- /dev/null +++ b/71_RayTracingPipeline/app_resources/present.frag.hlsl @@ -0,0 +1,19 @@ +// Copyright (C) 2024-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#pragma wave shader_stage(fragment) + +// vertex shader is provided by the fullScreenTriangle extension +#include +using namespace nbl::hlsl; +using namespace ext::FullScreenTriangle; + +// binding 0 set 0 +[[vk::combinedImageSampler]] [[vk::binding(0, 0)]] Texture2D texture; +[[vk::combinedImageSampler]] [[vk::binding(0, 0)]] SamplerState samplerState; + +[[vk::location(0)]] float32_t4 main(SVertexAttributes vxAttr) : SV_Target0 +{ + return float32_t4(texture.Sample(samplerState, vxAttr.uv).rgb, 1.0f); +} diff --git a/71_RayTracingPipeline/include/common.hpp b/71_RayTracingPipeline/include/common.hpp index e50cb4473..3a8411fd2 100644 --- a/71_RayTracingPipeline/include/common.hpp +++ b/71_RayTracingPipeline/include/common.hpp @@ -16,6 +16,10 @@ #include #include +#include "nbl/ui/ICursorControl.h" +#include "nbl/ext/ImGui/ImGui.h" +#include "imgui/imgui_internal.h" + using namespace nbl; using namespace core; using namespace hlsl; diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index 54a692317..4fc992c90 100644 --- 
a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -3,6 +3,7 @@ // For conditions of distribution and use, see copyright notice in nabla.h #include "common.hpp" +#include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h" class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication { @@ -12,6 +13,15 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, constexpr static inline uint32_t WIN_W = 1280, WIN_H = 720; constexpr static inline uint32_t MaxFramesInFlight = 3u; + constexpr static inline uint8_t MaxUITextureCount = 1u; + + enum E_LIGHT_TYPE : uint8_t + { + ELT_SPHERE, + ELT_TRIANGLE, + ELT_RECTANGLE, + ELT_COUNT + }; constexpr static inline clock_t::duration DisplayImageDuration = std::chrono::milliseconds(900); @@ -23,12 +33,6 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, SStridedBufferRegion callableGroupsRegion; }; - struct CameraView - { - float32_t3 position; - float32_t3 target; - float32_t3 upVector; - }; public: inline RaytracingPipelineApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) @@ -97,31 +101,32 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system))) return false; - const auto compileShader = [&](const std::string& filePath, const std::string& header = "") -> smart_refctd_ptr + + const auto compileShader = [&](const std::string & filePath, const std::string & header = "") -> smart_refctd_ptr + { + IAssetLoader::SAssetLoadParams lparams = {}; + lparams.logger = m_logger.get(); + lparams.workingDirectory = ""; + auto bundle = m_assetMgr->getAsset(filePath, lparams); + if (bundle.getContents().empty() || bundle.getAssetType() != IAsset::ET_SHADER) { - IAssetLoader::SAssetLoadParams lparams = {}; - lparams.logger 
= m_logger.get(); - lparams.workingDirectory = ""; - auto bundle = m_assetMgr->getAsset(filePath, lparams); - if (bundle.getContents().empty() || bundle.getAssetType() != IAsset::ET_SHADER) - { - m_logger->log("Shader %s not found!", ILogger::ELL_ERROR, filePath); - exit(-1); - } + m_logger->log("Shader %s not found!", ILogger::ELL_ERROR, filePath); + exit(-1); + } - const auto assets = bundle.getContents(); - assert(assets.size() == 1); - smart_refctd_ptr sourceRaw = IAsset::castDown(assets[0]); - if (!sourceRaw) - m_logger->log("Fail to load shader source", ILogger::ELL_ERROR, filePath); - smart_refctd_ptr source = CHLSLCompiler::createOverridenCopy( - sourceRaw.get(), - "%s\n", - header.c_str() - ); - - return m_device->createShader(source.get()); - }; + const auto assets = bundle.getContents(); + assert(assets.size() == 1); + smart_refctd_ptr sourceRaw = IAsset::castDown(assets[0]); + if (!sourceRaw) + m_logger->log("Fail to load shader source", ILogger::ELL_ERROR, filePath); + smart_refctd_ptr source = CHLSLCompiler::createOverridenCopy( + sourceRaw.get(), + "%s\n", + header.c_str() + ); + + return m_device->createShader(source.get()); + }; // shader const auto raygenShader = compileShader("app_resources/raytrace.rgen.hlsl"); @@ -135,13 +140,49 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, if (!m_semaphore) return logFail("Failed to Create a Semaphore!"); - ISwapchain::SCreationParams swapchainParams = { .surface = core::smart_refctd_ptr(m_surface->getSurface()) }; - if (!swapchainParams.deduceFormat(m_physicalDevice)) - return logFail("Could not choose a Surface Format for the Swapchain!"); - auto gQueue = getGraphicsQueue(); - if (!m_surface || !m_surface->init(gQueue, std::make_unique(), swapchainParams.sharedParams)) - return logFail("Could not create Window & Surface or initialize the Surface!"); + + // Create renderpass and init surface + nbl::video::IGPURenderpass* renderpass; + { + ISwapchain::SCreationParams 
swapchainParams = { .surface = smart_refctd_ptr(m_surface->getSurface()) }; + if (!swapchainParams.deduceFormat(m_physicalDevice)) + return logFail("Could not choose a Surface Format for the Swapchain!"); + + const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] = + { + { + .srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, + .dstSubpass = 0, + .memoryBarrier = + { + .srcStageMask = asset::PIPELINE_STAGE_FLAGS::COPY_BIT, + .srcAccessMask = asset::ACCESS_FLAGS::TRANSFER_WRITE_BIT, + .dstStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, + .dstAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT + } + }, + { + .srcSubpass = 0, + .dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, + .memoryBarrier = + { + .srcStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, + .srcAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT + } + }, + IGPURenderpass::SCreationParams::DependenciesEnd + }; + + auto scResources = std::make_unique(m_device.get(), swapchainParams.surfaceFormat.format, dependencies); + renderpass = scResources->getRenderpass(); + + if (!renderpass) + return logFail("Failed to create Renderpass!"); + + if (!m_surface || !m_surface->init(gQueue, std::move(scResources), swapchainParams.sharedParams)) + return logFail("Could not create Window & Surface or initialize the Surface!"); + } auto pool = m_device->createCommandPool(gQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); @@ -158,23 +199,33 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H); m_surface->recreateSwapchain(); + // create output images m_hdrImage = m_device->createImage({ { .type = IGPUImage::ET_2D, - .samples = asset::ICPUImage::ESCF_1_BIT, - .format = asset::EF_R16G16B16A16_SFLOAT, + .samples = ICPUImage::ESCF_1_BIT, + .format = EF_R16G16B16A16_SFLOAT, 
.extent = {WIN_W, WIN_H, 1}, .mipLevels = 1, .arrayLayers = 1, .flags = IImage::ECF_NONE, - .usage = core::bitflag(asset::IImage::EUF_STORAGE_BIT) | asset::IImage::EUF_TRANSFER_SRC_BIT + .usage = bitflag(IImage::EUF_STORAGE_BIT) | IImage::EUF_TRANSFER_SRC_BIT | IImage::EUF_SAMPLED_BIT } }); if (!m_hdrImage || !m_device->allocate(m_hdrImage->getMemoryReqs(), m_hdrImage.get()).isValid()) return logFail("Could not create HDR Image"); + m_hdrImageView = m_device->createImageView({ + .flags = IGPUImageView::ECF_NONE, + .subUsages = IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT | IGPUImage::E_USAGE_FLAGS::EUF_SAMPLED_BIT, + .image = m_hdrImage, + .viewType = IGPUImageView::E_TYPE::ET_2D, + .format = asset::EF_R16G16B16A16_SFLOAT + }); + + auto assetManager = make_smart_refctd_ptr(smart_refctd_ptr(system)); auto* geometryCreator = assetManager->getGeometryCreator(); @@ -187,10 +238,13 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, if (!createAccelerationStructures(cQueue)) return logFail("Could not create acceleration structures"); + ISampler::SParams samplerParams = { + .AnisotropicFilter = 0 + }; + auto defaultSampler = m_device->createSampler(samplerParams); - // create pipelines + // ray trace pipeline and descriptor set layout setup { - // descriptors const IGPUDescriptorSetLayout::SBinding bindings[] = { { .binding = 0, @@ -210,12 +264,8 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, const auto descriptorSetLayout = m_device->createDescriptorSetLayout(bindings); const std::array dsLayoutPtrs = { descriptorSetLayout.get() }; - m_renderPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, std::span(dsLayoutPtrs.begin(), dsLayoutPtrs.end())); - if (!m_renderPool) - return logFail("Could not create descriptor pool"); - m_renderDs = m_renderPool->createDescriptorSet(descriptorSetLayout); - if (!m_renderDs) - return logFail("Could not create descriptor set"); + 
m_rayTracingDsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, std::span(dsLayoutPtrs.begin(), dsLayoutPtrs.end())); + m_rayTracingDs = m_rayTracingDsPool->createDescriptorSet(descriptorSetLayout); const SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING, @@ -226,7 +276,6 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, IGPURayTracingPipeline::SCreationParams params = {}; - const IGPUShader::SSpecInfo shaders[] = { {.shader = raygenShader.get()}, {.shader = closestHitShader.get()}, @@ -248,115 +297,241 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, params.cached.maxRecursionDepth = 2; if (!m_device->createRayTracingPipelines(nullptr, { &params, 1 }, &m_rayTracingPipeline)) return logFail("Failed to create ray tracing pipeline"); - m_logger->log("Ray Tracing Pipeline Created!",system::ILogger::ELL_INFO); + m_logger->log("Ray Tracing Pipeline Created!", system::ILogger::ELL_INFO); - //create shader binding table if (!createShaderBindingTable(gQueue, m_rayTracingPipeline)) return logFail("Could not create shader binding table"); } + { + const IGPUDescriptorSetLayout::SBinding bindings[] = { + { + .binding = 0u, + .type = nbl::asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER, + .createFlags = ICPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = IShader::E_SHADER_STAGE::ESS_FRAGMENT, + .count = 1u, + .immutableSamplers = &defaultSampler + } + }; + auto gpuPresentDescriptorSetLayout = m_device->createDescriptorSetLayout(bindings); + const video::IGPUDescriptorSetLayout* const layouts[] = { gpuPresentDescriptorSetLayout.get() }; + const uint32_t setCounts[] = { 1u }; + m_presentDsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_NONE, layouts, setCounts); + m_presentDs = m_presentDsPool->createDescriptorSet(gpuPresentDescriptorSetLayout); + + auto scRes = 
static_cast(m_surface->getSwapchainResources()); + ext::FullScreenTriangle::ProtoPipeline fsTriProtoPPln(m_assetMgr.get(), m_device.get(), m_logger.get()); + if (!fsTriProtoPPln) + return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!"); + + // Load Fragment Shader + auto fragmentShader = compileShader("app_resources/present.frag.hlsl"); + if (!fragmentShader) + return logFail("Failed to Load and Compile Fragment Shader: lumaMeterShader!"); + + const IGPUShader::SSpecInfo fragSpec = { + .entryPoint = "main", + .shader = fragmentShader.get() + }; + + auto presentLayout = m_device->createPipelineLayout( + {}, + core::smart_refctd_ptr(gpuPresentDescriptorSetLayout), + nullptr, + nullptr, + nullptr + ); + m_presentPipeline = fsTriProtoPPln.createPipeline(fragSpec, presentLayout.get(), scRes->getRenderpass()); + if (!m_presentPipeline) + return logFail("Could not create Graphics Pipeline!"); + } // write descriptors - IGPUDescriptorSet::SDescriptorInfo infos[2]; + IGPUDescriptorSet::SDescriptorInfo infos[3]; infos[0].desc = m_gpuTlas; - infos[1].desc = m_device->createImageView({ - .flags = IGPUImageView::ECF_NONE, - .subUsages = IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT, - .image = m_hdrImage, - .viewType = IGPUImageView::E_TYPE::ET_2D, - .format = asset::EF_R16G16B16A16_SFLOAT - }); + + infos[1].desc = m_hdrImageView; if (!infos[1].desc) return logFail("Failed to create image view"); infos[1].info.image.imageLayout = IImage::LAYOUT::GENERAL; - IGPUDescriptorSet::SWriteDescriptorSet writes[3] = { - {.dstSet = m_renderDs.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &infos[0]}, - {.dstSet = m_renderDs.get(), .binding = 1, .arrayElement = 0, .count = 1, .info = &infos[1]} + + infos[2].desc = m_hdrImageView; + infos[2].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL; + + IGPUDescriptorSet::SWriteDescriptorSet writes[] = { + {.dstSet = m_rayTracingDs.get(), .binding = 0, .arrayElement = 0, .count = 1, .info 
= &infos[0]}, + {.dstSet = m_rayTracingDs.get(), .binding = 1, .arrayElement = 0, .count = 1, .info = &infos[1]}, + {.dstSet = m_presentDs.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &infos[2] }, }; - m_device->updateDescriptorSets(std::span(writes, 2), {}); + m_device->updateDescriptorSets(std::span(writes), {}); + + // gui descriptor setup + { + using binding_flags_t = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS; + { + IGPUSampler::SParams params; + params.AnisotropicFilter = 1u; + params.TextureWrapU = ISampler::ETC_REPEAT; + params.TextureWrapV = ISampler::ETC_REPEAT; + params.TextureWrapW = ISampler::ETC_REPEAT; + + m_ui.samplers.gui = m_device->createSampler(params); + m_ui.samplers.gui->setObjectDebugName("Nabla IMGUI UI Sampler"); + } + + std::array, 69u> immutableSamplers; + for (auto& it : immutableSamplers) + it = smart_refctd_ptr(m_ui.samplers.scene); + + immutableSamplers[nbl::ext::imgui::UI::FontAtlasTexId] = smart_refctd_ptr(m_ui.samplers.gui); + + nbl::ext::imgui::UI::SCreationParameters params; - // camera + params.resources.texturesInfo = { .setIx = 0u, .bindingIx = 0u }; + params.resources.samplersInfo = { .setIx = 0u, .bindingIx = 1u }; + params.assetManager = m_assetMgr; + params.pipelineCache = nullptr; + params.pipelineLayout = nbl::ext::imgui::UI::createDefaultPipelineLayout(m_utils->getLogicalDevice(), params.resources.texturesInfo, params.resources.samplersInfo, MaxUITextureCount); + params.renderpass = smart_refctd_ptr(renderpass); + params.streamingBuffer = nullptr; + params.subpassIx = 0u; + params.transfer = getTransferUpQueue(); + params.utilities = m_utils; + { + m_ui.manager = ext::imgui::UI::create(std::move(params)); + + // note that we use default layout provided by our extension, but you are free to create your own by filling nbl::ext::imgui::UI::S_CREATION_PARAMETERS::resources + const auto* descriptorSetLayout = m_ui.manager->getPipeline()->getLayout()->getDescriptorSetLayout(0u); + const auto& params = 
m_ui.manager->getCreationParameters(); + + IDescriptorPool::SCreateInfo descriptorPoolInfo = {}; + descriptorPoolInfo.maxDescriptorCount[static_cast(asset::IDescriptor::E_TYPE::ET_SAMPLER)] = (uint32_t)nbl::ext::imgui::UI::DefaultSamplerIx::COUNT; + descriptorPoolInfo.maxDescriptorCount[static_cast(asset::IDescriptor::E_TYPE::ET_SAMPLED_IMAGE)] = MaxUITextureCount; + descriptorPoolInfo.maxSets = 1u; + descriptorPoolInfo.flags = IDescriptorPool::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT; + + m_guiDescriptorSetPool = m_device->createDescriptorPool(std::move(descriptorPoolInfo)); + assert(m_guiDescriptorSetPool); + + m_guiDescriptorSetPool->createDescriptorSets(1u, &descriptorSetLayout, &m_ui.descriptorSet); + assert(m_ui.descriptorSet); + } + } + + m_ui.manager->registerListener( + [this]() -> void { + ImGuiIO& io = ImGui::GetIO(); + + m_camera.setProjectionMatrix([&]() + { + static matrix4SIMD projection; + + projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(core::radians(fov), io.DisplaySize.x / io.DisplaySize.y, zNear, zFar); + + return projection; + }()); + + ImGui::SetNextWindowPos(ImVec2(1024, 100), ImGuiCond_Appearing); + ImGui::SetNextWindowSize(ImVec2(256, 256), ImGuiCond_Appearing); + + // create a window and insert the inspector + ImGui::SetNextWindowPos(ImVec2(10, 10), ImGuiCond_Appearing); + ImGui::SetNextWindowSize(ImVec2(320, 340), ImGuiCond_Appearing); + ImGui::Begin("Controls"); + + ImGui::SameLine(); + + ImGui::Text("Camera"); + + ImGui::SliderFloat("Move speed", &moveSpeed, 0.1f, 10.f); + ImGui::SliderFloat("Rotate speed", &rotateSpeed, 0.1f, 10.f); + ImGui::SliderFloat("Fov", &fov, 20.f, 150.f); + ImGui::SliderFloat("zNear", &zNear, 0.1f, 100.f); + ImGui::SliderFloat("zFar", &zFar, 110.f, 10000.f); + + ImGui::Text("X: %f Y: %f", io.MousePos.x, io.MousePos.y); + + ImGui::End(); + } + ); + + // Set Camera { - core::vectorSIMDf cameraPosition(-5.81655884, 2.58630896, -4.23974705); - core::vectorSIMDf cameraTarget(-0.349590302, 
-0.213266611, 0.317821503); - matrix4SIMD projectionMatrix = matrix4SIMD::buildProjectionMatrixPerspectiveFovLH(core::radians(60.0f), float(WIN_W) / WIN_H, 0.1, 1000); - m_camera = Camera(cameraPosition, cameraTarget, projectionMatrix, 1.069f, 0.4f); + core::vectorSIMDf cameraPosition(0, 5, -10); + matrix4SIMD proj = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH( + core::radians(60.0f), + WIN_W / WIN_H, + 0.01f, + 500.0f + ); + m_camera = Camera(cameraPosition, core::vectorSIMDf(0, 0, 0), proj); } + m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H); + m_surface->recreateSwapchain(); m_winMgr->show(m_window.get()); m_oracle.reportBeginFrameRecord(); + m_camera.mapKeysToWASD(); return true; } - inline void workLoopBody() override + bool updateGUIDescriptorSet() { - const auto resourceIx = m_realFrameIx % MaxFramesInFlight; + // texture atlas, note we don't create info & write pair for the font sampler because UI extension's is immutable and baked into DS layout + static std::array descriptorInfo; + static IGPUDescriptorSet::SWriteDescriptorSet writes[MaxUITextureCount]; - const uint32_t framesInFlight = core::min(MaxFramesInFlight, m_surface->getMaxAcquiresInFlight()); + descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL; + descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].desc = smart_refctd_ptr(m_ui.manager->getFontAtlasView()); + + for (uint32_t i = 0; i < descriptorInfo.size(); ++i) + { + writes[i].dstSet = m_ui.descriptorSet.get(); + writes[i].binding = 0u; + writes[i].arrayElement = i; + writes[i].count = 1u; + } + writes[nbl::ext::imgui::UI::FontAtlasTexId].info = descriptorInfo.data() + nbl::ext::imgui::UI::FontAtlasTexId; + + return m_device->updateDescriptorSets(writes, {}); + } + inline void workLoopBody() override + { + // framesInFlight: ensuring safe execution of command buffers and acquires, `framesInFlight` only affect semaphore waits, don't use this to index your resources because 
it can change with swapchain recreation. + const uint32_t framesInFlight = core::min(MaxFramesInFlight, m_surface->getMaxAcquiresInFlight()); + // We block for semaphores for 2 reasons here: + // A) Resource: Can't use resource like a command buffer BEFORE previous use is finished! [MaxFramesInFlight] + // B) Acquire: Can't have more acquires in flight than a certain threshold returned by swapchain or your surface helper class. [MaxAcquiresInFlight] if (m_realFrameIx >= framesInFlight) { - const ISemaphore::SWaitInfo cbDonePending[] = + const ISemaphore::SWaitInfo cbDonePending[] = { - { - .semaphore = m_semaphore.get(), - .value = m_realFrameIx + 1 - framesInFlight - } + { + .semaphore = m_semaphore.get(), + .value = m_realFrameIx + 1 - framesInFlight + } }; if (m_device->blockForSemaphores(cbDonePending) != ISemaphore::WAIT_RESULT::SUCCESS) return; } + const auto resourceIx = m_realFrameIx % MaxFramesInFlight; - m_inputSystem->getDefaultMouse(&m_mouse); - m_inputSystem->getDefaultKeyboard(&m_keyboard); - - auto updatePresentationTimestamp = [&]() - { - m_currentImageAcquire = m_surface->acquireNextImage(); - - m_oracle.reportEndFrameRecord(); - const auto timestamp = m_oracle.getNextPresentationTimeStamp(); - m_oracle.reportBeginFrameRecord(); + m_api->startCapture(); - return timestamp; - }; + update(); - const auto nextPresentationTimestamp = updatePresentationTimestamp(); + auto queue = getGraphicsQueue(); + auto cmdbuf = m_cmdBufs[resourceIx].get(); - if (!m_currentImageAcquire) + if (!keepRunning()) return; - static bool first = true; - if (first) - { - m_api->startCapture(); - first = false; - } - - auto* const cmdbuf = m_cmdBufs.data()[resourceIx].get(); cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); cmdbuf->beginDebugMarker("RaytracingPipelineApp Frame"); - { - m_camera.beginInputProcessing(nextPresentationTimestamp); - m_mouse.consumeEvents([&](const 
IMouseEventChannel::range_t& events) -> void - { - if (m_camera.mouseProcess(events)) - { - m_frameAccumulationCounter = 0; - } - }, m_logger.get()); - m_keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void - { - if (m_camera.keyboardProcess(events)) - { - m_frameAccumulationCounter = 0; - } - }, m_logger.get()); - m_camera.endInputProcessing(nextPresentationTimestamp); - - } const auto viewMatrix = m_camera.getViewMatrix(); const auto projectionMatrix = m_camera.getProjectionMatrix(); @@ -370,14 +545,12 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, core::matrix4SIMD invModelViewProjectionMatrix; modelViewProjectionMatrix.getInverseTransform(invModelViewProjectionMatrix); - auto* queue = getGraphicsQueue(); - { IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1]; imageBarriers[0].barrier = { .dep = { - .srcStageMask = PIPELINE_STAGE_FLAGS::NONE, - .srcAccessMask = ACCESS_FLAGS::NONE, + .srcStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT, // previous frame read from fragment shader + .srcAccessMask = ACCESS_FLAGS::SHADER_READ_BITS, .dstStageMask = PIPELINE_STAGE_FLAGS::RAY_TRACING_SHADER_BIT, .dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS } @@ -390,37 +563,39 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, .baseArrayLayer = 0u, .layerCount = 1u }; - imageBarriers[0].oldLayout = IImage::LAYOUT::UNDEFINED; + imageBarriers[0].oldLayout = m_frameAccumulationCounter == 0 ? 
IImage::LAYOUT::UNDEFINED : IImage::LAYOUT::READ_ONLY_OPTIMAL; imageBarriers[0].newLayout = IImage::LAYOUT::GENERAL; cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers }); } - // do ray query - SPushConstants pc; - pc.geometryInfoBuffer = m_geometryInfoBuffer->getDeviceAddress(); - pc.frameCounter = m_frameAccumulationCounter; - const core::vector3df camPos = m_camera.getPosition().getAsVector3df(); - pc.camPos = { camPos.X, camPos.Y, camPos.Z }; - memcpy(&pc.invMVP, invModelViewProjectionMatrix.pointer(), sizeof(pc.invMVP)); - - cmdbuf->bindRayTracingPipeline(m_rayTracingPipeline.get()); - cmdbuf->pushConstants(m_rayTracingPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING, 0, sizeof(SPushConstants), &pc); - cmdbuf->bindDescriptorSets(EPBP_RAY_TRACING, m_rayTracingPipeline->getLayout(), 0, 1, &m_renderDs.get()); - cmdbuf->traceRays(m_shaderBindingTable.raygenGroupRegion, - m_shaderBindingTable.missGroupsRegion, - m_shaderBindingTable.hitGroupsRegion, - m_shaderBindingTable.callableGroupsRegion, - WIN_W, WIN_H, 1); - - // blit + // Trace Rays Pass + { + SPushConstants pc; + pc.geometryInfoBuffer = m_geometryInfoBuffer->getDeviceAddress(); + pc.frameCounter = m_frameAccumulationCounter; + const core::vector3df camPos = m_camera.getPosition().getAsVector3df(); + pc.camPos = { camPos.X, camPos.Y, camPos.Z }; + memcpy(&pc.invMVP, invModelViewProjectionMatrix.pointer(), sizeof(pc.invMVP)); + + cmdbuf->bindRayTracingPipeline(m_rayTracingPipeline.get()); + cmdbuf->pushConstants(m_rayTracingPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING, 0, sizeof(SPushConstants), &pc); + cmdbuf->bindDescriptorSets(EPBP_RAY_TRACING, m_rayTracingPipeline->getLayout(), 0, 1, &m_rayTracingDs.get()); + cmdbuf->traceRays(m_shaderBindingTable.raygenGroupRegion, + m_shaderBindingTable.missGroupsRegion, + m_shaderBindingTable.hitGroupsRegion, + m_shaderBindingTable.callableGroupsRegion, + WIN_W, WIN_H, 1); + } + + // pipeline 
barrier { - IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[2]; + IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1]; imageBarriers[0].barrier = { - .dep = { - .srcStageMask = PIPELINE_STAGE_FLAGS::RAY_TRACING_SHADER_BIT, - .srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS, - .dstStageMask = PIPELINE_STAGE_FLAGS::BLIT_BIT, - .dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT + .dep = { + .srcStageMask = PIPELINE_STAGE_FLAGS::RAY_TRACING_SHADER_BIT, + .srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS, + .dstStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, + .dstAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT } }; imageBarriers[0].image = m_hdrImage.get(); @@ -431,75 +606,58 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, .baseArrayLayer = 0u, .layerCount = 1u }; - imageBarriers[0].oldLayout = IImage::LAYOUT::UNDEFINED; - imageBarriers[0].newLayout = IImage::LAYOUT::TRANSFER_SRC_OPTIMAL; - - imageBarriers[1].barrier = { - .dep = { - .srcStageMask = PIPELINE_STAGE_FLAGS::NONE, - .srcAccessMask = ACCESS_FLAGS::NONE, - .dstStageMask = PIPELINE_STAGE_FLAGS::BLIT_BIT, - .dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT - } - }; - imageBarriers[1].image = m_surface->getSwapchainResources()->getImage(m_currentImageAcquire.imageIndex); - imageBarriers[1].subresourceRange = { - .aspectMask = IImage::EAF_COLOR_BIT, - .baseMipLevel = 0u, - .levelCount = 1u, - .baseArrayLayer = 0u, - .layerCount = 1u - }; - imageBarriers[1].oldLayout = IImage::LAYOUT::UNDEFINED; - imageBarriers[1].newLayout = IImage::LAYOUT::TRANSFER_DST_OPTIMAL; + imageBarriers[0].oldLayout = IImage::LAYOUT::GENERAL; + imageBarriers[0].newLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL; cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers }); } { - IGPUCommandBuffer::SImageBlit regions[] = { { - .srcMinCoord = {0,0,0}, - .srcMaxCoord = {WIN_W,WIN_H,1}, - .dstMinCoord = 
{0,0,0}, - .dstMaxCoord = {WIN_W,WIN_H,1}, - .layerCount = 1, - .srcBaseLayer = 0, - .dstBaseLayer = 0, - .srcMipLevel = 0, - .dstMipLevel = 0, - .aspectMask = IGPUImage::E_ASPECT_FLAGS::EAF_COLOR_BIT - } }; + asset::SViewport viewport; + { + viewport.minDepth = 1.f; + viewport.maxDepth = 0.f; + viewport.x = 0u; + viewport.y = 0u; + viewport.width = WIN_W; + viewport.height = WIN_H; + } + cmdbuf->setViewport(0u, 1u, &viewport); - auto srcImg = m_hdrImage.get(); - auto scRes = static_cast(m_surface->getSwapchainResources()); - auto dstImg = scRes->getImage(m_currentImageAcquire.imageIndex); - cmdbuf->blitImage(srcImg, IImage::LAYOUT::TRANSFER_SRC_OPTIMAL, dstImg, IImage::LAYOUT::TRANSFER_DST_OPTIMAL, regions, ISampler::ETF_NEAREST); - } + VkRect2D defaultScisors[] = { {.offset = {(int32_t)viewport.x, (int32_t)viewport.y}, .extent = {(uint32_t)viewport.width, (uint32_t)viewport.height}} }; + cmdbuf->setScissor(defaultScisors); - // TODO: transition to present - { - IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1]; - imageBarriers[0].barrier = { - .dep = { - .srcStageMask = PIPELINE_STAGE_FLAGS::BLIT_BIT, - .srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT, - .dstStageMask = PIPELINE_STAGE_FLAGS::NONE, - .dstAccessMask = ACCESS_FLAGS::NONE - } + auto scRes = static_cast(m_surface->getSwapchainResources()); + const VkRect2D currentRenderArea = + { + .offset = {0,0}, + .extent = {m_window->getWidth(),m_window->getHeight()} }; - imageBarriers[0].image = m_surface->getSwapchainResources()->getImage(m_currentImageAcquire.imageIndex); - imageBarriers[0].subresourceRange = { - .aspectMask = IImage::EAF_COLOR_BIT, - .baseMipLevel = 0u, - .levelCount = 1u, - .baseArrayLayer = 0u, - .layerCount = 1u + const IGPUCommandBuffer::SClearColorValue clearColor = { .float32 = {0.f,0.f,0.f,1.f} }; + const IGPUCommandBuffer::SRenderpassBeginInfo info = + { + .framebuffer = scRes->getFramebuffer(m_currentImageAcquire.imageIndex), + .colorClearValues = 
&clearColor, + .depthStencilClearValues = nullptr, + .renderArea = currentRenderArea }; - imageBarriers[0].oldLayout = IImage::LAYOUT::TRANSFER_DST_OPTIMAL; - imageBarriers[0].newLayout = IImage::LAYOUT::PRESENT_SRC; + nbl::video::ISemaphore::SWaitInfo waitInfo = { .semaphore = m_semaphore.get(), .value = m_realFrameIx + 1u }; + + cmdbuf->beginRenderPass(info, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE); + + cmdbuf->bindGraphicsPipeline(m_presentPipeline.get()); + cmdbuf->bindDescriptorSets(EPBP_GRAPHICS, m_presentPipeline->getLayout(), 0, 1u, &m_presentDs.get()); + ext::FullScreenTriangle::recordDrawCall(cmdbuf); + + const auto uiParams = m_ui.manager->getCreationParameters(); + auto* uiPipeline = m_ui.manager->getPipeline(); + cmdbuf->bindGraphicsPipeline(uiPipeline); + cmdbuf->bindDescriptorSets(EPBP_GRAPHICS, uiPipeline->getLayout(), uiParams.resources.texturesInfo.setIx, 1u, &m_ui.descriptorSet.get()); + m_ui.manager->render(cmdbuf, waitInfo); + + cmdbuf->endRenderPass(); - cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers }); } cmdbuf->endDebugMarker(); @@ -538,32 +696,102 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, } }; - if (queue->submit(infos) == IQueue::RESULT::SUCCESS) - { - const nbl::video::ISemaphore::SWaitInfo waitInfos[] = - { { - .semaphore = m_semaphore.get(), - .value = m_realFrameIx - } }; + updateGUIDescriptorSet(); - m_device->blockForSemaphores(waitInfos); // this is not solution, quick wa to not throw validation errors - } - else - --m_realFrameIx; + if (queue->submit(infos) != IQueue::RESULT::SUCCESS) + m_realFrameIx--; } } - std::string caption = "[Nabla Engine] Ray Tracing Pipeline"; - { - caption += ", displaying [all objects]"; - m_window->setCaption(caption); - } + m_window->setCaption("[Nabla Engine] Ray Tracing Pipeline"); m_surface->present(m_currentImageAcquire.imageIndex, rendered); } - + m_api->endCapture(); m_frameAccumulationCounter++; } + inline void 
update() + { + m_camera.setMoveSpeed(moveSpeed); + m_camera.setRotateSpeed(rotateSpeed); + + static std::chrono::microseconds previousEventTimestamp{}; + + m_inputSystem->getDefaultMouse(&m_mouse); + m_inputSystem->getDefaultKeyboard(&m_keyboard); + + auto updatePresentationTimestamp = [&]() + { + m_currentImageAcquire = m_surface->acquireNextImage(); + + m_oracle.reportEndFrameRecord(); + const auto timestamp = m_oracle.getNextPresentationTimeStamp(); + m_oracle.reportBeginFrameRecord(); + + return timestamp; + }; + + const auto nextPresentationTimestamp = updatePresentationTimestamp(); + + struct + { + std::vector mouse{}; + std::vector keyboard{}; + } capturedEvents; + + m_camera.beginInputProcessing(nextPresentationTimestamp); + { + bool camera_moved = false; + m_mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void + { + camera_moved |= m_camera.mouseProcess(events); // don't capture the events, only let camera handle them with its impl + + for (const auto& e : events) // here capture + { + if (e.timeStamp < previousEventTimestamp) + continue; + + previousEventTimestamp = e.timeStamp; + capturedEvents.mouse.emplace_back(e); + + } + }, m_logger.get()); + + m_keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void + { + camera_moved |= m_camera.keyboardProcess(events); // don't capture the events, only let camera handle them with its impl + + for (const auto& e : events) // here capture + { + if (e.timeStamp < previousEventTimestamp) + continue; + + previousEventTimestamp = e.timeStamp; + capturedEvents.keyboard.emplace_back(e); + } + }, m_logger.get()); + + if (camera_moved) + m_frameAccumulationCounter = 0; + } + m_camera.endInputProcessing(nextPresentationTimestamp); + + const core::SRange mouseEvents(capturedEvents.mouse.data(), capturedEvents.mouse.data() + capturedEvents.mouse.size()); + const core::SRange keyboardEvents(capturedEvents.keyboard.data(), capturedEvents.keyboard.data() + 
capturedEvents.keyboard.size()); + const auto cursorPosition = m_window->getCursorControl()->getPosition(); + const auto mousePosition = float32_t2(cursorPosition.x, cursorPosition.y) - float32_t2(m_window->getX(), m_window->getY()); + + const ext::imgui::UI::SUpdateParameters params = + { + .mousePosition = mousePosition, + .displaySize = { m_window->getWidth(), m_window->getHeight() }, + .mouseEvents = mouseEvents, + .keyboardEvents = keyboardEvents + }; + + m_ui.manager->update(params); + } + inline bool keepRunning() override { if (m_surface->irrecoverable()) @@ -673,9 +901,9 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, transform.setTranslation(nbl::core::vectorSIMDf(x, y, z, 0)); return transform; }; - + core::matrix3x4SIMD planeTransform; - planeTransform.setRotation(quaternion::fromAngleAxis(core::radians(-90.0f), vector3df_SIMD{1, 0, 0})); + planeTransform.setRotation(quaternion::fromAngleAxis(core::radians(-90.0f), vector3df_SIMD{ 1, 0, 0 })); const auto cpuObjects = std::array{ ReferenceObjectCpu { @@ -842,7 +1070,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, .indexCount = cpuObject.data.indexCount, .material = cpuObject.material, .transform = cpuObject.transform, - }); + }); } for (uint32_t i = 0; i < m_gpuObjects.size(); i++) @@ -912,7 +1140,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, cpuBufferParams.size = bufferSize; auto cpuBuffer = ICPUBuffer::create(std::move(cpuBufferParams)); uint8_t* pData = reinterpret_cast(cpuBuffer->getPointer()); - + // copy raygen region memcpy(pData, pipeline->getRaygenGroupShaderHandle().data(), handleSize); @@ -956,7 +1184,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, bool createAccelerationStructures(video::CThreadSafeQueueAdapter* queue) { - IQueryPool::SCreationParams qParams{ .queryCount = static_cast(m_gpuObjects.size()), .queryType = 
IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE}; + IQueryPool::SCreationParams qParams{ .queryCount = static_cast(m_gpuObjects.size()), .queryType = IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE }; smart_refctd_ptr queryPool = m_device->createQueryPool(std::move(qParams)); auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT); @@ -1253,7 +1481,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, smart_refctd_ptr> m_surface; smart_refctd_ptr m_semaphore; uint64_t m_realFrameIx = 0; - uint32_t m_frameAccumulationCounter = -1; + uint32_t m_frameAccumulationCounter = 0; std::array, MaxFramesInFlight> m_cmdBufs; ISimpleManagedSurface::SAcquireResult m_currentImageAcquire = {}; @@ -1261,10 +1489,26 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, InputSystem::ChannelReader m_mouse; InputSystem::ChannelReader m_keyboard; + float fov = 60.f, zNear = 0.1f, zFar = 10000.f, moveSpeed = 1.f, rotateSpeed = 1.f; + float viewWidth = 10.f; + float camYAngle = 165.f / 180.f * 3.14159f; + float camXAngle = 32.f / 180.f * 3.14159f; Camera m_camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD()); - CameraView m_oldCameraView; video::CDumbPresentationOracle m_oracle; + struct C_UI + { + nbl::core::smart_refctd_ptr manager; + + struct + { + core::smart_refctd_ptr gui, scene; + } samplers; + + core::smart_refctd_ptr descriptorSet; + } m_ui; + core::smart_refctd_ptr m_guiDescriptorSetPool; + std::vector m_gpuObjects; std::vector> m_gpuBlasList; @@ -1272,17 +1516,19 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, smart_refctd_ptr m_instanceBuffer; smart_refctd_ptr m_geometryInfoBuffer; - ShaderBindingTable m_shaderBindingTable; smart_refctd_ptr m_hdrImage; + smart_refctd_ptr m_hdrImageView; + smart_refctd_ptr m_rayTracingDsPool; + 
smart_refctd_ptr m_rayTracingDs; smart_refctd_ptr m_rayTracingPipeline; - smart_refctd_ptr m_renderDs; - smart_refctd_ptr m_renderPool; + ShaderBindingTable m_shaderBindingTable; - smart_refctd_ptr m_converter; - smart_refctd_ptr m_sbtBuffer; + smart_refctd_ptr m_presentDs; + smart_refctd_ptr m_presentDsPool; + smart_refctd_ptr m_presentPipeline; - uint16_t gcIndex = {}; + smart_refctd_ptr m_converter; }; From 6ac8f88906b02ad2464961c9106d79474bfa191d Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 22 Jan 2025 22:49:22 +0700 Subject: [PATCH 004/296] Implement multiple light type Signed-off-by: kevyuu --- .../app_resources/common.hlsl | 37 ++++- .../app_resources/raytrace.rchit.hlsl | 137 +++++++++++------- 71_RayTracingPipeline/main.cpp | 60 ++++++-- 3 files changed, 167 insertions(+), 67 deletions(-) diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl index 3b6c36abc..d28e646fe 100644 --- a/71_RayTracingPipeline/app_resources/common.hlsl +++ b/71_RayTracingPipeline/app_resources/common.hlsl @@ -29,14 +29,47 @@ struct SGeomInfo Material material; }; +enum E_LIGHT_TYPE : int32_t +{ + ELT_DIRECTIONAL, + ELT_POINT, + ELT_SPOT, + ELT_COUNT +}; + +struct Light +{ + float32_t3 direction; + float32_t3 position; + float32_t intensity; + float32_t innerCutoff; + float32_t outerCutoff; + int32_t type; + +#ifndef __HLSL_VERSION + bool operator==(const Light&) const = default; +#endif + +}; + struct SPushConstants { - uint64_t geometryInfoBuffer; - uint32_t frameCounter; + Light light; float32_t3 camPos; float32_t4x4 invMVP; + uint64_t geometryInfoBuffer; + uint32_t frameCounter; +}; + + +struct RayLight +{ + float32_t3 inHitPosition; + float32_t outLightDistance; + float32_t3 outLightDir; + float32_t outIntensity; }; #ifdef __HLSL_VERSION diff --git a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl index b77412ff7..d8c527389 100644 --- 
a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl @@ -13,7 +13,8 @@ float3 unpackNormals3x10(uint32_t v) return clamp(float3(pn) / 511.0, -1.0, 1.0); } -struct VertexData { +struct VertexData +{ float32_t3 position; float32_t3 normal; }; @@ -33,30 +34,30 @@ VertexData fetchVertexData(int instID, int primID, SGeomInfo geom, float2 bary) { case 0: // EIT_16BIT { - i0 = uint32_t(vk::RawBufferLoad(indexBufferAddress + (idxOffset + 0) * sizeof(uint16_t), 2u)); - i1 = uint32_t(vk::RawBufferLoad(indexBufferAddress + (idxOffset + 1) * sizeof(uint16_t), 2u)); - i2 = uint32_t(vk::RawBufferLoad(indexBufferAddress + (idxOffset + 2) * sizeof(uint16_t), 2u)); - } - break; + i0 = uint32_t(vk::RawBufferLoad < uint16_t > (indexBufferAddress + (idxOffset + 0) * sizeof(uint16_t), 2u)); + i1 = uint32_t(vk::RawBufferLoad < uint16_t > (indexBufferAddress + (idxOffset + 1) * sizeof(uint16_t), 2u)); + i2 = uint32_t(vk::RawBufferLoad < uint16_t > (indexBufferAddress + (idxOffset + 2) * sizeof(uint16_t), 2u)); + } + break; case 1: // EIT_32BIT { - i0 = vk::RawBufferLoad(indexBufferAddress + (idxOffset + 0) * sizeof(uint32_t)); - i1 = vk::RawBufferLoad(indexBufferAddress + (idxOffset + 1) * sizeof(uint32_t)); - i2 = vk::RawBufferLoad(indexBufferAddress + (idxOffset + 2) * sizeof(uint32_t)); - } - break; - default: // EIT_NONE + i0 = vk::RawBufferLoad < uint32_t > (indexBufferAddress + (idxOffset + 0) * sizeof(uint32_t)); + i1 = vk::RawBufferLoad < uint32_t > (indexBufferAddress + (idxOffset + 1) * sizeof(uint32_t)); + i2 = vk::RawBufferLoad < uint32_t > (indexBufferAddress + (idxOffset + 2) * sizeof(uint32_t)); + } + break; + default: // EIT_NONE { - i0 = idxOffset; - i1 = idxOffset + 1; - i2 = idxOffset + 2; - } + i0 = idxOffset; + i1 = idxOffset + 1; + i2 = idxOffset + 2; + } } const uint64_t vertexBufferAddress = geom.vertexBufferAddress; - float32_t3 p0 = vk::RawBufferLoad(vertexBufferAddress + i0 * vertexStride); - 
float32_t3 p1 = vk::RawBufferLoad(vertexBufferAddress + i1 * vertexStride); - float32_t3 p2 = vk::RawBufferLoad(vertexBufferAddress + i2 * vertexStride); + float32_t3 p0 = vk::RawBufferLoad < float32_t3 > (vertexBufferAddress + i0 * vertexStride); + float32_t3 p1 = vk::RawBufferLoad < float32_t3 > (vertexBufferAddress + i1 * vertexStride); + float32_t3 p2 = vk::RawBufferLoad < float32_t3 > (vertexBufferAddress + i2 * vertexStride); const uint64_t normalVertexBufferAddress = vertexBufferAddress + s_offsetsToNormalBytes[objType]; float3 n0, n1, n2; @@ -64,42 +65,45 @@ VertexData fetchVertexData(int instID, int primID, SGeomInfo geom, float2 bary) { case OT_CUBE: { - uint32_t v0 = vk::RawBufferLoad(normalVertexBufferAddress + i0 * vertexStride, 2u); - uint32_t v1 = vk::RawBufferLoad(normalVertexBufferAddress + i1 * vertexStride, 2u); - uint32_t v2 = vk::RawBufferLoad(normalVertexBufferAddress + i2 * vertexStride, 2u); - - n0 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v0).xyz); - n1 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v1).xyz); - n2 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v2).xyz); - } - break; + uint32_t v0 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i0 * vertexStride, 2u); + uint32_t v1 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i1 * vertexStride, 2u); + uint32_t v2 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i2 * vertexStride, 2u); + + n0 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v0).xyz); + n1 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v1).xyz); + n2 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v2).xyz); + } + break; case OT_SPHERE: case OT_CYLINDER: case OT_ARROW: case OT_CONE: { - uint32_t v0 = vk::RawBufferLoad(normalVertexBufferAddress + i0 * vertexStride); - uint32_t v1 = vk::RawBufferLoad(normalVertexBufferAddress + i1 * vertexStride); - uint32_t v2 = vk::RawBufferLoad(normalVertexBufferAddress + i2 * vertexStride); - - n0 = normalize(unpackNormals3x10(v0)); - n1 = 
normalize(unpackNormals3x10(v1)); - n2 = normalize(unpackNormals3x10(v2)); - } - break; + uint32_t v0 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i0 * vertexStride); + uint32_t v1 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i1 * vertexStride); + uint32_t v2 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i2 * vertexStride); + + n0 = normalize(unpackNormals3x10(v0)); + n1 = normalize(unpackNormals3x10(v1)); + n2 = normalize(unpackNormals3x10(v2)); + } + break; case OT_RECTANGLE: case OT_DISK: case OT_ICOSPHERE: default: { - n0 = normalize(vk::RawBufferLoad(normalVertexBufferAddress + i0 * vertexStride)); - n1 = normalize(vk::RawBufferLoad(normalVertexBufferAddress + i1 * vertexStride)); - n2 = normalize(vk::RawBufferLoad(normalVertexBufferAddress + i2 * vertexStride)); - } + n0 = normalize(vk::RawBufferLoad < + float3 > (normalVertexBufferAddress + i0 * vertexStride)); + n1 = normalize(vk::RawBufferLoad < + float3 > (normalVertexBufferAddress + i1 * vertexStride)); + n2 = normalize(vk::RawBufferLoad < + float3 > (normalVertexBufferAddress + i2 * vertexStride)); + } } float3 barycentrics = float3(0.0, bary); - barycentrics.x = 1.0 - barycentrics.y - barycentrics.z; + barycentrics.x = 1.0 - barycentrics.y - barycentrics.z; VertexData data; data.position = barycentrics.x * p0 + barycentrics.y * p1 + barycentrics.z * p2; @@ -110,27 +114,52 @@ VertexData fetchVertexData(int instID, int primID, SGeomInfo geom, float2 bary) [shader("closesthit")] void main(inout ColorPayload p, in BuiltInTriangleIntersectionAttributes attribs) { - const int instID = InstanceID(); - const int primID = PrimitiveIndex(); - const SGeomInfo geom = vk::RawBufferLoad(pc.geometryInfoBuffer + instID * sizeof(SGeomInfo)); + const int instID = InstanceID(); + const int primID = PrimitiveIndex(); + const SGeomInfo geom = vk::RawBufferLoad < SGeomInfo > (pc.geometryInfoBuffer + instID * sizeof(SGeomInfo)); const VertexData vertexData = 
fetchVertexData(instID, primID, geom, attribs.barycentrics); const float32_t3 worldPosition = mul(ObjectToWorld3x4(), float32_t4(vertexData.position, 1)); const float32_t3 worldNormal = mul(vertexData.normal, WorldToObject3x4()).xyz; - const float32_t lightIntensity = 1; - const float32_t3 lightDirection = normalize(float32_t3(1, 1, -1)); + RayLight cLight; + cLight.inHitPosition = worldPosition; + if (pc.light.type == 0) + { + cLight.outLightDir = normalize(-pc.light.direction); + cLight.outIntensity = 1.0; + cLight.outLightDistance = 10000000; + } + if (pc.light.type == 1) + { + float32_t3 lDir = pc.light.position - cLight.inHitPosition; + float lightDistance = length(lDir); + cLight.outIntensity = pc.light.intensity / (lightDistance * lightDistance); + cLight.outLightDir = normalize(lDir); + cLight.outLightDistance = lightDistance; + } + else if (pc.light.type == 2) + { + float32_t3 lDir = pc.light.position - cLight.inHitPosition; + cLight.outLightDistance = length(lDir); + cLight.outIntensity = pc.light.intensity / (cLight.outLightDistance * cLight.outLightDistance); + cLight.outLightDir = normalize(lDir); + float theta = dot(cLight.outLightDir, normalize(-pc.light.direction)); + float epsilon = pc.light.innerCutoff - pc.light.outerCutoff; + float spotIntensity = clamp((theta - pc.light.outerCutoff) / epsilon, 0.0, 1.0); + cLight.outIntensity *= spotIntensity; + } - float32_t3 diffuse = computeDiffuse(geom.material, lightDirection, worldNormal); + float32_t3 diffuse = computeDiffuse(geom.material, cLight.outLightDir, worldNormal); float32_t3 specular = float32_t3(0, 0, 0); float32_t attenuation = 1; - if (dot(worldNormal, lightDirection) > 0) + if (dot(worldNormal, cLight.outLightDir) > 0) { RayDesc rayDesc; - rayDesc.Origin = WorldRayOrigin() + WorldRayDirection() * RayTCurrent() + worldNormal * 0.02f; - rayDesc.Direction = lightDirection; + rayDesc.Origin = WorldRayOrigin() + WorldRayDirection() * RayTCurrent() + worldNormal * 0.02f; + rayDesc.Direction = 
cLight.outLightDir; rayDesc.TMin = 0.001; - rayDesc.TMax = 1000; + rayDesc.TMax = cLight.outLightDistance; uint flags = RAY_FLAG_SKIP_CLOSEST_HIT_SHADER; ShadowPayload shadowPayload; @@ -145,8 +174,8 @@ void main(inout ColorPayload p, in BuiltInTriangleIntersectionAttributes attribs } else { - specular = computeSpecular(geom.material, WorldRayDirection(), lightDirection, worldNormal); + specular = computeSpecular(geom.material, WorldRayDirection(), cLight.outLightDir, worldNormal); } } - p.hitValue = (lightIntensity * attenuation * (specular + diffuse)); + p.hitValue = (cLight.outIntensity * attenuation * (diffuse + specular)); } \ No newline at end of file diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index 4fc992c90..c83498896 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -15,12 +15,10 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, constexpr static inline uint32_t MaxFramesInFlight = 3u; constexpr static inline uint8_t MaxUITextureCount = 1u; - enum E_LIGHT_TYPE : uint8_t - { - ELT_SPHERE, - ELT_TRIANGLE, - ELT_RECTANGLE, - ELT_COUNT + static constexpr const char* s_lightTypeNames[E_LIGHT_TYPE::ELT_COUNT] = { + "Directional", + "Point", + "Spot" }; constexpr static inline clock_t::duration DisplayImageDuration = std::chrono::milliseconds(900); @@ -36,7 +34,8 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, public: inline RaytracingPipelineApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) - : IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) { + : IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) + { } inline SPhysicalDeviceFeatures getRequiredDeviceFeatures() const override @@ -229,13 +228,11 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, auto 
assetManager = make_smart_refctd_ptr(smart_refctd_ptr(system)); auto* geometryCreator = assetManager->getGeometryCreator(); - auto cQueue = getComputeQueue(); - // create geometry objects if (!createGeometries(gQueue, geometryCreator)) return logFail("Could not create geometries from geometry creator"); - if (!createAccelerationStructures(cQueue)) + if (!createAccelerationStructures(getComputeQueue())) return logFail("Could not create acceleration structures"); ISampler::SParams samplerParams = { @@ -449,6 +446,37 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, ImGui::SliderFloat("Fov", &fov, 20.f, 150.f); ImGui::SliderFloat("zNear", &zNear, 0.1f, 100.f); ImGui::SliderFloat("zFar", &zFar, 110.f, 10000.f); + Light m_oldLight = m_light; + ImGui::ListBox("LightType", &m_light.type, s_lightTypeNames, ELT_COUNT); + if (m_light.type == ELT_DIRECTIONAL) + { + ImGui::SliderFloat3("Light Direction", &m_light.direction.x, -1.f, 1.f); + } else if (m_light.type == ELT_POINT) + { + ImGui::SliderFloat3("Light Position", &m_light.position.x, -20.f, 20.f); + ImGui::SliderFloat("Light Intensity", &m_light.intensity, 0.0f, 500.f); + } else if (m_light.type == ELT_SPOT) + { + ImGui::SliderFloat3("Light Direction", &m_light.direction.x, -1.f, 1.f); + ImGui::SliderFloat3("Light Position", &m_light.position.x, -20.f, 20.f); + ImGui::SliderFloat("Light Intensity", &m_light.intensity, 0.0f, 500.f); + + float32_t dInnerCutoff = degrees(acos(m_light.innerCutoff)); + float32_t dOuterCutoff = degrees(acos(m_light.outerCutoff)); + if (ImGui::SliderFloat("Light Inner Cutoff", &dInnerCutoff, 0.0f, 45.0f)) + { + dInnerCutoff = dInnerCutoff > dOuterCutoff ? 
dOuterCutoff : dInnerCutoff; + m_light.innerCutoff = cos(radians(dInnerCutoff)); + } + if (ImGui::SliderFloat("Light Outer Cutoff", &dOuterCutoff, 0.0f, 45.0f)) + { + m_light.outerCutoff = cos(radians(dOuterCutoff)); + } + } + if (m_light != m_oldLight) + { + m_frameAccumulationCounter = 0; + } ImGui::Text("X: %f Y: %f", io.MousePos.x, io.MousePos.y); @@ -571,6 +599,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, // Trace Rays Pass { SPushConstants pc; + pc.light = m_light; pc.geometryInfoBuffer = m_geometryInfoBuffer->getDeviceAddress(); pc.frameCounter = m_frameAccumulationCounter; const core::vector3df camPos = m_camera.getPosition().getAsVector3df(); @@ -1494,6 +1523,16 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, float camYAngle = 165.f / 180.f * 3.14159f; float camXAngle = 32.f / 180.f * 3.14159f; Camera m_camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD()); + + Light m_light = { + .direction = {-1.0f, -1.0f, -0.4f}, + .position = {10.0f, 15.0f, 8.0f}, + .intensity = 100.0f, + .innerCutoff = 0.939692621f, // {cos(radians(20.0f))}, + .outerCutoff = 0.866025404f, // {cos(radians(30.0f))}, + .type = ELT_DIRECTIONAL + }; + video::CDumbPresentationOracle m_oracle; struct C_UI @@ -1531,5 +1570,4 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, smart_refctd_ptr m_converter; }; - NBL_MAIN_FUNC(RaytracingPipelineApp) From 9091da112b9b5763d2b340045e261fe0032d6bdc Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 23 Jan 2025 00:02:44 +0700 Subject: [PATCH 005/296] Implement callable shader Signed-off-by: kevyuu --- .../app_resources/common.hlsl | 12 ++++---- .../app_resources/lgiht_spot.rcall.hlsl | 16 ++++++++++ .../light_directional.rcall.hlsl | 11 +++++++ .../app_resources/light_point.rcall.hlsl | 13 ++++++++ .../app_resources/raytrace.rahit.hlsl | 3 +- .../app_resources/raytrace.rchit.hlsl | 30 ++----------------- 
.../app_resources/raytrace.rgen.hlsl | 1 - 71_RayTracingPipeline/main.cpp | 21 +++++++++---- 8 files changed, 66 insertions(+), 41 deletions(-) create mode 100644 71_RayTracingPipeline/app_resources/lgiht_spot.rcall.hlsl create mode 100644 71_RayTracingPipeline/app_resources/light_directional.rcall.hlsl create mode 100644 71_RayTracingPipeline/app_resources/light_point.rcall.hlsl diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl index d28e646fe..ce82181c3 100644 --- a/71_RayTracingPipeline/app_resources/common.hlsl +++ b/71_RayTracingPipeline/app_resources/common.hlsl @@ -66,9 +66,9 @@ struct SPushConstants struct RayLight { - float32_t3 inHitPosition; + float32_t3 inHitPosition; float32_t outLightDistance; - float32_t3 outLightDir; + float32_t3 outLightDir; float32_t outIntensity; }; @@ -76,14 +76,14 @@ struct RayLight struct [raypayload] ColorPayload { - float32_t3 hitValue; - uint32_t seed; + float32_t3 hitValue : read(caller) : write(closesthit,miss); + uint32_t seed : read(closesthit,anyhit) : write(caller); }; struct [raypayload] ShadowPayload { - bool isShadowed; - uint32_t seed; + bool isShadowed : read(caller) : write(caller,miss); + uint32_t seed : read(anyhit) : write(caller); }; enum ObjectType : uint32_t // matches c++ diff --git a/71_RayTracingPipeline/app_resources/lgiht_spot.rcall.hlsl b/71_RayTracingPipeline/app_resources/lgiht_spot.rcall.hlsl new file mode 100644 index 000000000..5dbc5a830 --- /dev/null +++ b/71_RayTracingPipeline/app_resources/lgiht_spot.rcall.hlsl @@ -0,0 +1,16 @@ +#include "common.hlsl" + +[[vk::push_constant]] SPushConstants pc; + +[shader("callable")] +void main(inout RayLight cLight) +{ + float32_t3 lDir = pc.light.position - cLight.inHitPosition; + cLight.outLightDistance = length(lDir); + cLight.outIntensity = pc.light.intensity / (cLight.outLightDistance * cLight.outLightDistance); + cLight.outLightDir = normalize(lDir); + float theta = dot(cLight.outLightDir, 
normalize(-pc.light.direction)); + float epsilon = pc.light.innerCutoff - pc.light.outerCutoff; + float spotIntensity = clamp((theta - pc.light.outerCutoff) / epsilon, 0.0, 1.0); + cLight.outIntensity *= spotIntensity; +} diff --git a/71_RayTracingPipeline/app_resources/light_directional.rcall.hlsl b/71_RayTracingPipeline/app_resources/light_directional.rcall.hlsl new file mode 100644 index 000000000..d4aeca85e --- /dev/null +++ b/71_RayTracingPipeline/app_resources/light_directional.rcall.hlsl @@ -0,0 +1,11 @@ +#include "common.hlsl" + +[[vk::push_constant]] SPushConstants pc; + +[shader("callable")] +void main(inout RayLight cLight) +{ + cLight.outLightDir = normalize(-pc.light.direction); + cLight.outIntensity = 1.0; + cLight.outLightDistance = 10000000; +} diff --git a/71_RayTracingPipeline/app_resources/light_point.rcall.hlsl b/71_RayTracingPipeline/app_resources/light_point.rcall.hlsl new file mode 100644 index 000000000..e82d17ec8 --- /dev/null +++ b/71_RayTracingPipeline/app_resources/light_point.rcall.hlsl @@ -0,0 +1,13 @@ +#include "common.hlsl" + +[[vk::push_constant]] SPushConstants pc; + +[shader("callable")] +void main(inout RayLight cLight) +{ + float32_t3 lDir = pc.light.position - cLight.inHitPosition; + float lightDistance = length(lDir); + cLight.outIntensity = pc.light.intensity / (lightDistance * lightDistance); + cLight.outLightDir = normalize(lDir); + cLight.outLightDistance = lightDistance; +} \ No newline at end of file diff --git a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl index f68d607aa..660e506c4 100644 --- a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl @@ -20,8 +20,9 @@ void main(inout AnyHitPayload p, in BuiltInTriangleIntersectionAttributes attrib if (geom.material.illum != 4) return; + uint32_t seed = p.seed; if (geom.material.dissolve == 0.0) IgnoreHit(); - else if (rnd(p.seed) > 
geom.material.dissolve) + else if (rnd(seed) > geom.material.dissolve) IgnoreHit(); } diff --git a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl index d8c527389..c89b69142 100644 --- a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl @@ -123,31 +123,7 @@ void main(inout ColorPayload p, in BuiltInTriangleIntersectionAttributes attribs RayLight cLight; cLight.inHitPosition = worldPosition; - if (pc.light.type == 0) - { - cLight.outLightDir = normalize(-pc.light.direction); - cLight.outIntensity = 1.0; - cLight.outLightDistance = 10000000; - } - if (pc.light.type == 1) - { - float32_t3 lDir = pc.light.position - cLight.inHitPosition; - float lightDistance = length(lDir); - cLight.outIntensity = pc.light.intensity / (lightDistance * lightDistance); - cLight.outLightDir = normalize(lDir); - cLight.outLightDistance = lightDistance; - } - else if (pc.light.type == 2) - { - float32_t3 lDir = pc.light.position - cLight.inHitPosition; - cLight.outLightDistance = length(lDir); - cLight.outIntensity = pc.light.intensity / (cLight.outLightDistance * cLight.outLightDistance); - cLight.outLightDir = normalize(lDir); - float theta = dot(cLight.outLightDir, normalize(-pc.light.direction)); - float epsilon = pc.light.innerCutoff - pc.light.outerCutoff; - float spotIntensity = clamp((theta - pc.light.outerCutoff) / epsilon, 0.0, 1.0); - cLight.outIntensity *= spotIntensity; - } + CallShader(pc.light.type, cLight); float32_t3 diffuse = computeDiffuse(geom.material, cLight.outLightDir, worldNormal); float32_t3 specular = float32_t3(0, 0, 0); @@ -166,9 +142,9 @@ void main(inout ColorPayload p, in BuiltInTriangleIntersectionAttributes attribs shadowPayload.isShadowed = true; shadowPayload.seed = p.seed; TraceRay(topLevelAS, flags, 0xFF, 1, 0, 1, rayDesc, shadowPayload); - p.seed = shadowPayload.seed; - if (shadowPayload.isShadowed) + bool isShadowed = 
shadowPayload.isShadowed; + if (isShadowed) { attenuation = 0.3; } diff --git a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl index 90b950f76..efbbcd56e 100644 --- a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl @@ -51,7 +51,6 @@ void main() ColorPayload payload; payload.seed = seed; - payload.hitValue = float32_t3(0, 0, 0); TraceRay(topLevelAS, RAY_FLAG_NONE, 0xff, 0, 0, 0, rayDesc, payload); hitValues += payload.hitValue; diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index c83498896..51001f4f8 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -134,6 +134,9 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, const auto anyHitShaderShadowPayload = compileShader("app_resources/raytrace.rahit.hlsl", "#define USE_SHADOW_PAYLOAD\n"); const auto missShader = compileShader("app_resources/raytrace.rmiss.hlsl"); const auto shadowMissShader = compileShader("app_resources/raytraceShadow.rmiss.hlsl"); + const auto directionalLightCallShader = compileShader("app_resources/light_directional.rcall.hlsl"); + const auto pointLightCallShader = compileShader("app_resources/light_point.rcall.hlsl"); + const auto spotLightCallShader = compileShader("app_resources/light_spot.rcall.hlsl"); m_semaphore = m_device->createSemaphore(m_realFrameIx); if (!m_semaphore) @@ -275,11 +278,14 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, const IGPUShader::SSpecInfo shaders[] = { {.shader = raygenShader.get()}, + {.shader = missShader.get()}, + {.shader = shadowMissShader.get()}, {.shader = closestHitShader.get()}, {.shader = anyHitShaderColorPayload.get()}, {.shader = anyHitShaderShadowPayload.get()}, - {.shader = missShader.get()}, - {.shader = shadowMissShader.get()}, + {.shader = directionalLightCallShader.get()}, + {.shader = 
pointLightCallShader.get()}, + {.shader = spotLightCallShader.get()}, }; params.layout = pipelineLayout.get(); @@ -287,10 +293,13 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, params.cached.shaderGroups.raygenGroup = { .shaderIndex = 0, }; - params.cached.shaderGroups.hitGroups.push_back({ .closestHitShaderIndex = 1, .anyHitShaderIndex = 2 }); - params.cached.shaderGroups.hitGroups.push_back({ .closestHitShaderIndex = 1, .anyHitShaderIndex = 3 }); - params.cached.shaderGroups.missGroups.push_back({ .shaderIndex = 4 }); - params.cached.shaderGroups.missGroups.push_back({ .shaderIndex = 5 }); + params.cached.shaderGroups.missGroups.push_back({ .shaderIndex = 1 }); + params.cached.shaderGroups.missGroups.push_back({ .shaderIndex = 2 }); + params.cached.shaderGroups.hitGroups.push_back({ .closestHitShaderIndex = 3, .anyHitShaderIndex = 4 }); + params.cached.shaderGroups.hitGroups.push_back({ .closestHitShaderIndex = 3, .anyHitShaderIndex = 5 }); + params.cached.shaderGroups.callableGroups.push_back({.shaderIndex = 6}); + params.cached.shaderGroups.callableGroups.push_back({.shaderIndex = 7}); + params.cached.shaderGroups.callableGroups.push_back({.shaderIndex = 8}); params.cached.maxRecursionDepth = 2; if (!m_device->createRayTracingPipelines(nullptr, { ¶ms, 1 }, &m_rayTracingPipeline)) return logFail("Failed to create ray tracing pipeline"); From d303356516823fedea27e4a3da03d56d753b66a8 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 24 Jan 2025 15:35:51 +0700 Subject: [PATCH 006/296] Implement procedural geometry intersection shader demo Signed-off-by: kevyuu --- .../app_resources/common.hlsl | 39 ++- ..._spot.rcall.hlsl => light_spot.rcall.hlsl} | 0 .../app_resources/raytrace.rahit.hlsl | 2 +- .../app_resources/raytrace.rchit.hlsl | 10 +- .../app_resources/raytrace.rgen.hlsl | 6 +- .../app_resources/raytrace.rint.hlsl | 54 ++++ .../raytrace_procedural.rchit.hlsl | 61 ++++ 71_RayTracingPipeline/main.cpp | 304 +++++++++++++----- 8 
files changed, 378 insertions(+), 98 deletions(-) rename 71_RayTracingPipeline/app_resources/{lgiht_spot.rcall.hlsl => light_spot.rcall.hlsl} (100%) create mode 100644 71_RayTracingPipeline/app_resources/raytrace.rint.hlsl create mode 100644 71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl index ce82181c3..50306b516 100644 --- a/71_RayTracingPipeline/app_resources/common.hlsl +++ b/71_RayTracingPipeline/app_resources/common.hlsl @@ -15,7 +15,20 @@ struct Material uint32_t illum; // illumination model (see http://www.fileformat.info/format/material/) }; -struct SGeomInfo +struct SProceduralGeomInfo +{ + float32_t3 center; + float32_t radius; + Material material; +}; + +struct Aabb +{ + float32_t3 minimum; + float32_t3 maximum; +}; + +struct STriangleGeomInfo { uint64_t vertexBufferAddress; uint64_t indexBufferAddress; @@ -29,6 +42,13 @@ struct SGeomInfo Material material; }; +enum E_GEOM_TYPE : int32_t +{ + EGT_TRIANGLES, + EGT_PROCEDURAL, + EGT_COUNT +}; + enum E_LIGHT_TYPE : int32_t { ELT_DIRECTIONAL, @@ -37,6 +57,20 @@ enum E_LIGHT_TYPE : int32_t ELT_COUNT }; +enum E_RAY_TYPE : int32_t +{ + ERT_PRIMARY, // Ray shoot from camera + ERT_OCCLUSION, + ERT_COUNT +}; + +enum E_MISS_TYPE : int32_t +{ + EMT_PRIMARY, + EMT_OCCLUSION, + EMT_COUNT +}; + struct Light { float32_t3 direction; @@ -59,7 +93,8 @@ struct SPushConstants float32_t3 camPos; float32_t4x4 invMVP; - uint64_t geometryInfoBuffer; + uint64_t proceduralGeomInfoBuffer; + uint64_t triangleGeomInfoBuffer; uint32_t frameCounter; }; diff --git a/71_RayTracingPipeline/app_resources/lgiht_spot.rcall.hlsl b/71_RayTracingPipeline/app_resources/light_spot.rcall.hlsl similarity index 100% rename from 71_RayTracingPipeline/app_resources/lgiht_spot.rcall.hlsl rename to 71_RayTracingPipeline/app_resources/light_spot.rcall.hlsl diff --git a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl 
b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl index 660e506c4..5db6d70fa 100644 --- a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl @@ -15,7 +15,7 @@ using AnyHitPayload = ShadowPayload; void main(inout AnyHitPayload p, in BuiltInTriangleIntersectionAttributes attribs) { const int instID = InstanceID(); - const SGeomInfo geom = vk::RawBufferLoad < SGeomInfo > (pc.geometryInfoBuffer + instID * sizeof(SGeomInfo)); + const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo)); if (geom.material.illum != 4) return; diff --git a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl index c89b69142..734491e7d 100644 --- a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl @@ -19,7 +19,7 @@ struct VertexData float32_t3 normal; }; -VertexData fetchVertexData(int instID, int primID, SGeomInfo geom, float2 bary) +VertexData fetchVertexData(int instID, int primID, STriangleGeomInfo geom, float2 bary) { uint idxOffset = primID * 3; @@ -116,7 +116,7 @@ void main(inout ColorPayload p, in BuiltInTriangleIntersectionAttributes attribs { const int instID = InstanceID(); const int primID = PrimitiveIndex(); - const SGeomInfo geom = vk::RawBufferLoad < SGeomInfo > (pc.geometryInfoBuffer + instID * sizeof(SGeomInfo)); + const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo)); const VertexData vertexData = fetchVertexData(instID, primID, geom, attribs.barycentrics); const float32_t3 worldPosition = mul(ObjectToWorld3x4(), float32_t4(vertexData.position, 1)); const float32_t3 worldNormal = mul(vertexData.normal, WorldToObject3x4()).xyz; @@ -132,16 +132,16 @@ void main(inout ColorPayload p, in 
BuiltInTriangleIntersectionAttributes attribs if (dot(worldNormal, cLight.outLightDir) > 0) { RayDesc rayDesc; - rayDesc.Origin = WorldRayOrigin() + WorldRayDirection() * RayTCurrent() + worldNormal * 0.02f; + rayDesc.Origin = WorldRayOrigin() + WorldRayDirection() * RayTCurrent(); rayDesc.Direction = cLight.outLightDir; - rayDesc.TMin = 0.001; + rayDesc.TMin = 0.01; rayDesc.TMax = cLight.outLightDistance; uint flags = RAY_FLAG_SKIP_CLOSEST_HIT_SHADER; ShadowPayload shadowPayload; shadowPayload.isShadowed = true; shadowPayload.seed = p.seed; - TraceRay(topLevelAS, flags, 0xFF, 1, 0, 1, rayDesc, shadowPayload); + TraceRay(topLevelAS, flags, 0xFF, ERT_OCCLUSION, 0, EMT_OCCLUSION, rayDesc, shadowPayload); bool isShadowed = shadowPayload.isShadowed; if (isShadowed) diff --git a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl index efbbcd56e..43b052630 100644 --- a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl @@ -46,12 +46,12 @@ void main() RayDesc rayDesc; rayDesc.Origin = pc.camPos; rayDesc.Direction = direction; - rayDesc.TMin = 0.01; - rayDesc.TMax = 1000.0; + rayDesc.TMin = 0.001; + rayDesc.TMax = 10000.0; ColorPayload payload; payload.seed = seed; - TraceRay(topLevelAS, RAY_FLAG_NONE, 0xff, 0, 0, 0, rayDesc, payload); + TraceRay(topLevelAS, RAY_FLAG_NONE, 0xff, ERT_PRIMARY, 0, EMT_PRIMARY, rayDesc, payload); hitValues += payload.hitValue; } diff --git a/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl new file mode 100644 index 000000000..f302543b6 --- /dev/null +++ b/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl @@ -0,0 +1,54 @@ +#include "common.hlsl" + +[[vk::push_constant]] SPushConstants pc; + +struct Ray +{ + float32_t3 origin; + float32_t3 direction; +}; + +struct Attrib +{ + float3 HitAttribute; +}; + +// Ray-Sphere intersection +// 
http://viclw17.github.io/2018/07/16/raytracing-ray-sphere-intersection/
+float32_t hitSphere(SProceduralGeomInfo s, Ray r)
+{
+    float32_t3 oc = r.origin - s.center;
+    float32_t a = dot(r.direction, r.direction);
+    float32_t b = 2.0 * dot(oc, r.direction);
+    float32_t c = dot(oc, oc) - s.radius * s.radius;
+    float32_t discriminant = b * b - 4 * a * c;
+
+    // Negative discriminant: no real roots, the ray's line misses the sphere.
+    if (discriminant < 0)
+        return -1.0;
+
+    // Only the near root is returned. A ray whose origin is inside the sphere
+    // gets a negative t here and is treated as a miss by the caller.
+    // NOTE(review): return the far root as well if inside hits are ever needed.
+    return (-b - sqrt(discriminant)) / (2.0 * a);
+}
+
+[shader("intersection")]
+void main()
+{
+    Ray ray;
+    ray.origin = WorldRayOrigin();
+    ray.direction = WorldRayDirection();
+
+    const int primID = PrimitiveIndex();
+
+    // Per-primitive sphere data, fetched by device address from the push constants
+    SProceduralGeomInfo sphere = vk::RawBufferLoad < SProceduralGeomInfo > (pc.proceduralGeomInfoBuffer + primID * sizeof(SProceduralGeomInfo));
+
+    float32_t tHit = hitSphere(sphere, ray);
+
+    Attrib attrib = (Attrib)0; // zero-initialised so no undefined attribute data is reported
+    // Report hit point
+    if (tHit > 0)
+        ReportHit(tHit, 0, attrib);
+}
\ No newline at end of file
diff --git a/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl
new file mode 100644
index 000000000..ef3503346
--- /dev/null
+++ b/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl
@@ -0,0 +1,61 @@
+#include "common.hlsl"
+
+[[vk::push_constant]] SPushConstants pc;
+
+[[vk::binding(0, 0)]] RaytracingAccelerationStructure topLevelAS;
+
+// Closest-hit shader for the procedural spheres: evaluates the selected light
+// through its callable shader and traces one occlusion ray towards it.
+// NOTE(review): raytrace.rint.hlsl reports an Attrib (float3); attribs is unused here.
+[shader("closesthit")]
+void main(inout ColorPayload p, in BuiltInTriangleIntersectionAttributes attribs)
+{
+    const int primID = PrimitiveIndex();
+    float32_t3 worldPosition = WorldRayOrigin() + WorldRayDirection() * RayTCurrent();
+
+    SProceduralGeomInfo sphere = vk::RawBufferLoad < SProceduralGeomInfo > (pc.proceduralGeomInfoBuffer + primID * sizeof(SProceduralGeomInfo));
+
+    // Computing the normal at hit position
+    float32_t3 worldNormal = normalize(worldPosition - sphere.center);
+
+    RayLight cLight;
+    cLight.inHitPosition = worldPosition;
+    CallShader(pc.light.type, cLight);
+
+    // Material of the object
+    Material mat = sphere.material;
+
+    // Diffuse
+    float3 diffuse = computeDiffuse(mat, cLight.outLightDir, worldNormal);
+    float3 specular = float3(0, 0, 0);
+    float attenuation = 1;
+
+    // Tracing shadow ray only if the light is visible from the surface
+    if (dot(worldNormal, cLight.outLightDir) > 0)
+    {
+        RayDesc rayDesc;
+        rayDesc.Origin = worldPosition;
+        rayDesc.Direction = cLight.outLightDir;
+        rayDesc.TMin = 0.01;
+        rayDesc.TMax = cLight.outLightDistance;
+
+        uint flags =
+            RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH | RAY_FLAG_FORCE_OPAQUE |
+            RAY_FLAG_SKIP_CLOSEST_HIT_SHADER;
+
+        ShadowPayload shadowPayload;
+        shadowPayload.isShadowed = true;
+        shadowPayload.seed = p.seed;
+        // The miss index must be EMT_OCCLUSION: isShadowed starts as true and is
+        // expected to be cleared by the shadow miss shader. EMT_PRIMARY selects the
+        // miss shader that the ray generation shader invokes with a ColorPayload.
+        TraceRay(topLevelAS, flags, 0xFF, ERT_OCCLUSION, 0, EMT_OCCLUSION, rayDesc, shadowPayload);
+
+        if (shadowPayload.isShadowed)
+            attenuation = 0.3;
+        else
+            specular = computeSpecular(mat, WorldRayDirection(), cLight.outLightDir, worldNormal);
+    }
+
+    p.hitValue = (cLight.outIntensity * attenuation * (diffuse + specular));
+}
\ No newline at end of file
diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index 51001f4f8..ac3befb5e 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -14,6 +14,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, constexpr static inline uint32_t WIN_W = 1280, WIN_H = 720; constexpr static inline uint32_t MaxFramesInFlight = 3u; constexpr static inline uint8_t MaxUITextureCount = 1u; + constexpr static inline uint32_t NumberOfProceduralGeometries = 5; static constexpr const char* s_lightTypeNames[E_LIGHT_TYPE::ELT_COUNT] = { "Directional", @@ -130,6 +131,8 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, // shader const auto raygenShader = 
compileShader("app_resources/raytrace.rgen.hlsl"); const auto closestHitShader = compileShader("app_resources/raytrace.rchit.hlsl"); + const auto proceduralClosestHitShader = compileShader("app_resources/raytrace_procedural.rchit.hlsl"); + const auto intersectionHitShader = compileShader("app_resources/raytrace.rint.hlsl"); const auto anyHitShaderColorPayload = compileShader("app_resources/raytrace.rahit.hlsl", "#define USE_COLOR_PAYLOAD\n"); const auto anyHitShaderShadowPayload = compileShader("app_resources/raytrace.rahit.hlsl", "#define USE_SHADOW_PAYLOAD\n"); const auto missShader = compileShader("app_resources/raytrace.rmiss.hlsl"); @@ -276,37 +279,85 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, IGPURayTracingPipeline::SCreationParams params = {}; - const IGPUShader::SSpecInfo shaders[] = { - {.shader = raygenShader.get()}, - {.shader = missShader.get()}, - {.shader = shadowMissShader.get()}, - {.shader = closestHitShader.get()}, - {.shader = anyHitShaderColorPayload.get()}, - {.shader = anyHitShaderShadowPayload.get()}, - {.shader = directionalLightCallShader.get()}, - {.shader = pointLightCallShader.get()}, - {.shader = spotLightCallShader.get()}, + enum RtDemoShader + { + RTDS_RAYGEN, + RTDS_MISS, + RTDS_SHADOW_MISS, + RTDS_CLOSEST_HIT, + RTDS_SPHERE_CLOSEST_HIT, + RTDS_ANYHIT_COLOR, + RTDS_ANYHIT_SHADOW, + RTDS_INTERSECTION, + RTDS_DIRECTIONAL_CALL, + RTDS_POINT_CALL, + RTDS_SPOT_CALL, + RTDS_COUNT }; + IGPUShader::SSpecInfo shaders[RTDS_COUNT]; + shaders[RTDS_RAYGEN] = {.shader = raygenShader.get()}; + shaders[RTDS_MISS] = {.shader = missShader.get()}; + shaders[RTDS_SHADOW_MISS] = {.shader = shadowMissShader.get()}; + shaders[RTDS_CLOSEST_HIT] = {.shader = closestHitShader.get()}; + shaders[RTDS_SPHERE_CLOSEST_HIT] = {.shader = proceduralClosestHitShader.get()}; + shaders[RTDS_ANYHIT_COLOR] = {.shader = anyHitShaderColorPayload.get()}; + shaders[RTDS_ANYHIT_SHADOW] = {.shader = anyHitShaderShadowPayload.get()}; + 
shaders[RTDS_INTERSECTION] = {.shader = intersectionHitShader.get() }; + shaders[RTDS_DIRECTIONAL_CALL] = {.shader = directionalLightCallShader.get()}; + shaders[RTDS_POINT_CALL] = {.shader = pointLightCallShader.get()}; + shaders[RTDS_SPOT_CALL] = {.shader = spotLightCallShader.get()}; + params.layout = pipelineLayout.get(); params.shaders = std::span(shaders, std::size(shaders)); - params.cached.shaderGroups.raygenGroup = { - .shaderIndex = 0, + + auto& shaderGroups = params.cached.shaderGroups; + + shaderGroups.raygenGroup = { .shaderIndex = RTDS_RAYGEN }; + + shaderGroups.missGroups.resize(E_MISS_TYPE::EMT_COUNT, {}); + shaderGroups.missGroups[EMT_PRIMARY] = { .shaderIndex = RTDS_MISS }; + shaderGroups.missGroups[EMT_OCCLUSION] = { .shaderIndex = RTDS_SHADOW_MISS }; + + auto getHitGroupIndex = [](E_GEOM_TYPE geomType, E_RAY_TYPE rayType) + { + return geomType * ERT_COUNT + rayType; + }; + shaderGroups.hitGroups.resize(E_RAY_TYPE::ERT_COUNT * E_GEOM_TYPE::EGT_COUNT); + shaderGroups.hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_PRIMARY)] = { + .closestHitShaderIndex = RTDS_CLOSEST_HIT, + .anyHitShaderIndex = RTDS_ANYHIT_COLOR, + }; + shaderGroups.hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_OCCLUSION)] = { + .closestHitShaderIndex = RTDS_CLOSEST_HIT, + .anyHitShaderIndex = RTDS_ANYHIT_SHADOW, + }; + shaderGroups.hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_PRIMARY)] = { + .closestHitShaderIndex = RTDS_SPHERE_CLOSEST_HIT, + .anyHitShaderIndex = RTDS_ANYHIT_COLOR, + .intersectionShaderIndex = RTDS_INTERSECTION, + }; + shaderGroups.hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_OCCLUSION)] = { + .closestHitShaderIndex = RTDS_CLOSEST_HIT, + .anyHitShaderIndex = RTDS_ANYHIT_SHADOW, + .intersectionShaderIndex = RTDS_INTERSECTION, }; - params.cached.shaderGroups.missGroups.push_back({ .shaderIndex = 1 }); - params.cached.shaderGroups.missGroups.push_back({ .shaderIndex = 2 }); - params.cached.shaderGroups.hitGroups.push_back({ .closestHitShaderIndex = 3, 
.anyHitShaderIndex = 4 }); - params.cached.shaderGroups.hitGroups.push_back({ .closestHitShaderIndex = 3, .anyHitShaderIndex = 5 }); - params.cached.shaderGroups.callableGroups.push_back({.shaderIndex = 6}); - params.cached.shaderGroups.callableGroups.push_back({.shaderIndex = 7}); - params.cached.shaderGroups.callableGroups.push_back({.shaderIndex = 8}); + + shaderGroups.callableGroups.resize(ELT_COUNT); + shaderGroups.callableGroups[ELT_DIRECTIONAL] = { .shaderIndex = RTDS_DIRECTIONAL_CALL }; + shaderGroups.callableGroups[ELT_POINT] = { .shaderIndex = RTDS_POINT_CALL }; + shaderGroups.callableGroups[ELT_SPOT] = { .shaderIndex = RTDS_SPOT_CALL }; + params.cached.maxRecursionDepth = 2; + if (!m_device->createRayTracingPipelines(nullptr, { ¶ms, 1 }, &m_rayTracingPipeline)) return logFail("Failed to create ray tracing pipeline"); m_logger->log("Ray Tracing Pipeline Created!", system::ILogger::ELL_INFO); if (!createShaderBindingTable(gQueue, m_rayTracingPipeline)) return logFail("Could not create shader binding table"); + + m_logger->log("Shader binding table created", system::ILogger::ELL_INFO); } { @@ -609,7 +660,8 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, { SPushConstants pc; pc.light = m_light; - pc.geometryInfoBuffer = m_geometryInfoBuffer->getDeviceAddress(); + pc.proceduralGeomInfoBuffer = m_proceduralGeomInfoBuffer->getDeviceAddress(); + pc.triangleGeomInfoBuffer = m_triangleGeomInfoBuffer->getDeviceAddress(); pc.frameCounter = m_frameAccumulationCounter; const core::vector3df camPos = m_camera.getPosition().getAsVector3df(); pc.camPos = { camPos.X, camPos.Y, camPos.Z }; @@ -957,8 +1009,8 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, .transform = getTranslationMatrix(0, 0.5f, 0), }, ReferenceObjectCpu { - .meta = {.type = OT_SPHERE, .name = "Sphere Mesh"}, - .data = gc->createSphereMesh(2, 16, 16), + .meta = {.type = OT_CUBE, .name = "Cube Mesh 2"}, + .data = 
gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)), .material = { .ambient = {0.1, 0.1, 0.1}, .diffuse = {0.2, 0.2, 0.8}, @@ -969,8 +1021,8 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, .transform = getTranslationMatrix(-5.0f, 1.0f, 0), }, ReferenceObjectCpu { - .meta = {.type = OT_SPHERE, .name = "Transparent Sphere Mesh"}, - .data = gc->createSphereMesh(2, 16, 16), + .meta = {.type = OT_CUBE, .name = "Transparent Cube Mesh"}, + .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)), .material = { .ambient = {0.1, 0.1, 0.1}, .diffuse = {0.2, 0.8, 0.2}, @@ -1060,10 +1112,10 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, prepass.template operator() < ICPUBuffer > (tmpBuffers); } - auto geomInfoBuffer = ICPUBuffer::create({ std::size(cpuObjects) * sizeof(SGeomInfo) }); - SGeomInfo* geomInfos = reinterpret_cast(geomInfoBuffer->getPointer()); + auto geomInfoBuffer = ICPUBuffer::create({ std::size(cpuObjects) * sizeof(STriangleGeomInfo) }); + STriangleGeomInfo* geomInfos = reinterpret_cast(geomInfoBuffer->getPointer()); - m_gpuObjects.reserve(std::size(cpuObjects)); + m_gpuTriangleGeometries.reserve(std::size(cpuObjects)); // convert { // not sure if need this (probably not, originally for transition img view) @@ -1097,7 +1149,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, { auto& cpuObject = cpuObjects[i]; - m_gpuObjects.push_back(ReferenceObjectGpu{ + m_gpuTriangleGeometries.push_back(ReferenceObjectGpu{ .meta = cpuObject.meta, .bindings = { .vertex = {.offset = 0, .buffer = buffers[2 * i + 0].value }, @@ -1111,9 +1163,9 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, }); } - for (uint32_t i = 0; i < m_gpuObjects.size(); i++) + for (uint32_t i = 0; i < m_gpuTriangleGeometries.size(); i++) { - const auto& gpuObject = m_gpuObjects[i]; + const auto& gpuObject = m_gpuTriangleGeometries[i]; const uint64_t 
vertexBufferAddress = gpuObject.bindings.vertex.buffer->getDeviceAddress(); geomInfos[i] = { .vertexBufferAddress = vertexBufferAddress, @@ -1131,7 +1183,50 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, IGPUBuffer::SCreationParams params; params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; params.size = geomInfoBuffer->getSize(); - m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), geomInfos).move_into(m_geometryInfoBuffer); + m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), geomInfos).move_into(m_triangleGeomInfoBuffer); + } + + // intersection geometries setup + { + auto spheresInfoBuffer = ICPUBuffer::create({ NumberOfProceduralGeometries * sizeof(SProceduralGeomInfo) }); + SProceduralGeomInfo* sphereInfos = reinterpret_cast(spheresInfoBuffer->getPointer()); + core::vector aabbs; + for (int32_t i = 0; i < NumberOfProceduralGeometries; i++) + { + const auto middle_i = NumberOfProceduralGeometries / 2.0; + SProceduralGeomInfo sphere = { + .center = float32_t3((i - middle_i) * 4.0, 2, 5.0), + .radius = 1, + .material = { + .ambient = {0.1, 0.1, 0.1}, + .diffuse = {0.3, 0.2 * i, 0.3}, + .specular = {0.8, 0.8, 0.8}, + .shininess = 1.0f, + .illum = 2 + }, + }; + + sphereInfos[i] = sphere; + aabbs.push_back({ + .minimum = sphere.center - sphere.radius, + .maximum = sphere.center + sphere.radius, + }); + } + + { + IGPUBuffer::SCreationParams params; + params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + params.size = spheresInfoBuffer->getSize(); + m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), 
sphereInfos).move_into(m_proceduralGeomInfoBuffer); + m_logger->log("Device address : %d", ILogger::ELL_INFO, m_proceduralGeomInfoBuffer->getDeviceAddress()); + } + + { + IGPUBuffer::SCreationParams params; + params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT; + params.size = aabbs.size() * sizeof(Aabb); + m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), aabbs.data()).move_into(m_proceduralAabbBuffer); + } } return true; @@ -1157,19 +1252,19 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, missRegion = { .offset = raygenRegion.size, .stride = handleSizeAligned, - .size = core::alignUp(pipeline->getMissGroupCount(), limits.shaderGroupBaseAlignment), + .size = core::alignUp(pipeline->getMissGroupCount() * handleSizeAligned, limits.shaderGroupBaseAlignment), }; hitRegion = { .offset = missRegion.offset + missRegion.size, .stride = handleSizeAligned, - .size = core::alignUp(pipeline->getHitGroupCount(), limits.shaderGroupBaseAlignment), + .size = core::alignUp(pipeline->getHitGroupCount() * handleSizeAligned, limits.shaderGroupBaseAlignment), }; callableRegion = { .offset = hitRegion.offset + hitRegion.size, .stride = handleSizeAligned, - .size = core::alignUp(pipeline->getCallableGroupCount(), limits.shaderGroupBaseAlignment), + .size = core::alignUp(pipeline->getCallableGroupCount() * handleSizeAligned, limits.shaderGroupBaseAlignment), }; const auto bufferSize = raygenRegion.size + missRegion.size + hitRegion.size + callableRegion.size; @@ -1222,7 +1317,12 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, bool createAccelerationStructures(video::CThreadSafeQueueAdapter* queue) { - IQueryPool::SCreationParams qParams{ .queryCount = static_cast(m_gpuObjects.size()), .queryType = IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE }; + // plus 
1 blas for procedural geometry contains {{var::NumberOfProcedural}} + // spheres. Each sphere is a primitive instead one instance or geometry + const auto blasCount = m_gpuTriangleGeometries.size() + 1; + const auto proceduralBlasIdx = blasCount - 1; + + IQueryPool::SCreationParams qParams{ .queryCount = static_cast(blasCount), .queryType = IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE }; smart_refctd_ptr queryPool = m_device->createQueryPool(std::move(qParams)); auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT); @@ -1244,48 +1344,72 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, #endif size_t totalScratchSize = 0; + // build bottom level ASes { - core::vector blasBuildInfos(m_gpuObjects.size()); - core::vector primitiveCounts(m_gpuObjects.size()); - core::vector> triangles(m_gpuObjects.size()); - core::vector scratchSizes(m_gpuObjects.size()); - m_gpuBlasList.resize(m_gpuObjects.size()); - - for (uint32_t i = 0; i < m_gpuObjects.size(); i++) + core::vector primitiveCounts(blasCount); + core::vector> triangles(m_gpuTriangleGeometries.size()); + core::vector scratchSizes(blasCount); + IGPUBottomLevelAccelerationStructure::AABBs aabbs; + + auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT; + if (m_physicalDevice->getProperties().limits.rayTracingPositionFetch) + blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_DATA_ACCESS_KHR; + + IGPUBottomLevelAccelerationStructure::DeviceBuildInfo initBuildInfo; + initBuildInfo.buildFlags = blasFlags; + initBuildInfo.geometryCount = 1; // only 1 geometry object per blas + initBuildInfo.srcAS = nullptr; + initBuildInfo.dstAS = nullptr; + initBuildInfo.scratch = {}; + + auto blasBuildInfos = core::vector(blasCount, initBuildInfo); + + 
m_gpuBlasList.resize(blasCount); + // setup blas info for triangle geometries + for (uint32_t i = 0; i < blasCount; i++) { - const auto& gpuObject = m_gpuObjects[i]; - - const uint32_t vertexStride = gpuObject.vertexStride; - const uint32_t numVertices = gpuObject.bindings.vertex.buffer->getSize() / vertexStride; - if (gpuObject.useIndex()) - primitiveCounts[i] = gpuObject.indexCount / 3; - else - primitiveCounts[i] = numVertices / 3; - - triangles[i].vertexData[0] = gpuObject.bindings.vertex; - triangles[i].indexData = gpuObject.useIndex() ? gpuObject.bindings.index : gpuObject.bindings.vertex; - triangles[i].maxVertex = numVertices - 1; - triangles[i].vertexStride = vertexStride; - triangles[i].vertexFormat = EF_R32G32B32_SFLOAT; - triangles[i].indexType = gpuObject.indexType; - triangles[i].geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::NO_DUPLICATE_ANY_HIT_INVOCATION_BIT; - - auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT; - if (m_physicalDevice->getProperties().limits.rayTracingPositionFetch) - blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_DATA_ACCESS_KHR; - - blasBuildInfos[i].buildFlags = blasFlags; - blasBuildInfos[i].geometryCount = 1; // only 1 geometry object per blas - blasBuildInfos[i].srcAS = nullptr; - blasBuildInfos[i].dstAS = nullptr; - blasBuildInfos[i].triangles = &triangles[i]; - blasBuildInfos[i].scratch = {}; - + bool isProcedural = i == proceduralBlasIdx; + if (isProcedural) + { + aabbs.data.buffer = smart_refctd_ptr(m_proceduralAabbBuffer); + aabbs.stride = sizeof(Aabb); + aabbs.geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; // only allow opaque for now + + primitiveCounts[proceduralBlasIdx] = NumberOfProceduralGeometries; + blasBuildInfos[proceduralBlasIdx].aabbs = &aabbs; + blasBuildInfos[proceduralBlasIdx].buildFlags |= 
IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT; + } else + { + const auto& gpuObject = m_gpuTriangleGeometries[i]; + + const uint32_t vertexStride = gpuObject.vertexStride; + const uint32_t numVertices = gpuObject.bindings.vertex.buffer->getSize() / vertexStride; + if (gpuObject.useIndex()) + primitiveCounts[i] = gpuObject.indexCount / 3; + else + primitiveCounts[i] = numVertices / 3; + + triangles[i].vertexData[0] = gpuObject.bindings.vertex; + triangles[i].indexData = gpuObject.useIndex() ? gpuObject.bindings.index : gpuObject.bindings.vertex; + triangles[i].maxVertex = numVertices - 1; + triangles[i].vertexStride = vertexStride; + triangles[i].vertexFormat = EF_R32G32B32_SFLOAT; + triangles[i].indexType = gpuObject.indexType; + triangles[i].geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::NO_DUPLICATE_ANY_HIT_INVOCATION_BIT; + + blasBuildInfos[i].triangles = &triangles[i]; + } ILogicalDevice::AccelerationStructureBuildSizes buildSizes; { const uint32_t maxPrimCount[1] = { primitiveCounts[i] }; - buildSizes = m_device->getAccelerationStructureBuildSizes(blasFlags, false, std::span{ &triangles[i], 1 }, maxPrimCount); + if (isProcedural) + { + buildSizes = m_device->getAccelerationStructureBuildSizes(blasBuildInfos[i].buildFlags, false, std::span{&aabbs, 1}, maxPrimCount); + } else + { + buildSizes = m_device->getAccelerationStructureBuildSizes(blasBuildInfos[i].buildFlags, false, std::span{&triangles[i], 1}, maxPrimCount); + } if (!buildSizes) return logFail("Failed to get BLAS build sizes"); } @@ -1310,10 +1434,11 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, } } + auto cmdbufBlas = getSingleUseCommandBufferAndBegin(pool); cmdbufBlas->beginDebugMarker("Build BLAS"); - cmdbufBlas->resetQueryPool(queryPool.get(), 0, m_gpuObjects.size()); + cmdbufBlas->resetQueryPool(queryPool.get(), 0, blasCount); smart_refctd_ptr scratchBuffer; { @@ -1324,9 +1449,9 @@ class 
RaytracingPipelineApp final : public examples::SimpleWindowedApplication, } uint32_t queryCount = 0; - core::vector buildRangeInfos(m_gpuObjects.size()); - core::vector pRangeInfos(m_gpuObjects.size()); - for (uint32_t i = 0; i < m_gpuObjects.size(); i++) + core::vector buildRangeInfos(blasCount); + core::vector pRangeInfos(blasCount); + for (uint32_t i = 0; i < blasCount; i++) { blasBuildInfos[i].dstAS = m_gpuBlasList[i].get(); blasBuildInfos[i].scratch.buffer = scratchBuffer; @@ -1353,8 +1478,8 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, } - core::vector ases(m_gpuObjects.size()); - for (uint32_t i = 0; i < m_gpuObjects.size(); i++) + core::vector ases(blasCount); + for (uint32_t i = 0; i < blasCount; i++) ases[i] = m_gpuBlasList[i].get(); if (!cmdbufBlas->writeAccelerationStructureProperties(std::span(ases), IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE, queryPool.get(), queryCount++)) @@ -1369,12 +1494,12 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, // compact blas { - core::vector asSizes(m_gpuObjects.size(), 0); - if (!m_device->getQueryPoolResults(queryPool.get(), 0, m_gpuObjects.size(), asSizes.data(), sizeof(size_t), IQueryPool::WAIT_BIT)) + core::vector asSizes(blasCount); + if (!m_device->getQueryPoolResults(queryPool.get(), 0, blasCount, asSizes.data(), sizeof(size_t), IQueryPool::WAIT_BIT)) return logFail("Could not get query pool results for AS sizes"); - core::vector> cleanupBlas(m_gpuObjects.size()); - for (uint32_t i = 0; i < m_gpuObjects.size(); i++) + core::vector> cleanupBlas(blasCount); + for (uint32_t i = 0; i < blasCount; i++) { cleanupBlas[i] = m_gpuBlasList[i]; { @@ -1410,16 +1535,17 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, // build top level AS { - const uint32_t instancesCount = m_gpuObjects.size(); - core::vector instances(m_gpuObjects.size()); + const uint32_t instancesCount = m_gpuBlasList.size(); + core::vector 
instances(instancesCount); for (uint32_t i = 0; i < instancesCount; i++) { + const auto isProceduralInstance = i == proceduralBlasIdx; instances[i].base.blas.deviceAddress = m_gpuBlasList[i]->getReferenceForDeviceOperations().deviceAddress; instances[i].base.mask = 0xFF; instances[i].base.instanceCustomIndex = i; - instances[i].base.instanceShaderBindingTableRecordOffset = 0; + instances[i].base.instanceShaderBindingTableRecordOffset = isProceduralInstance ? 2 : 0; instances[i].base.flags = static_cast(IGPUTopLevelAccelerationStructure::INSTANCE_FLAGS::TRIANGLE_FACING_CULL_DISABLE_BIT); - instances[i].transform = m_gpuObjects[i].transform; + instances[i].transform = isProceduralInstance ? matrix3x4SIMD() : m_gpuTriangleGeometries[i].transform; } { @@ -1557,13 +1683,17 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, } m_ui; core::smart_refctd_ptr m_guiDescriptorSetPool; - std::vector m_gpuObjects; + core::vector m_gpuTriangleGeometries; + core::vector m_gpuIntersectionSpheres; + uint32_t m_intersectionHitGroupIdx; std::vector> m_gpuBlasList; smart_refctd_ptr m_gpuTlas; smart_refctd_ptr m_instanceBuffer; - smart_refctd_ptr m_geometryInfoBuffer; + smart_refctd_ptr m_triangleGeomInfoBuffer; + smart_refctd_ptr m_proceduralGeomInfoBuffer; + smart_refctd_ptr m_proceduralAabbBuffer; smart_refctd_ptr m_hdrImage; smart_refctd_ptr m_hdrImageView; From f261f7c42e1b1465e225b8671ecbdb97a8f2385b Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 24 Jan 2025 16:23:42 +0700 Subject: [PATCH 007/296] Add Readme Signed-off-by: kevyuu --- 71_RayTracingPipeline/Readme.md | 11 ++++++++ .../app_resources/common.hlsl | 24 +++++++++--------- .../docs/Images/final_result.png | Bin 0 -> 103835 bytes .../docs/Images/shader_binding_table.png | Bin 0 -> 8569 bytes 71_RayTracingPipeline/main.cpp | 11 ++++---- 5 files changed, 29 insertions(+), 17 deletions(-) create mode 100644 71_RayTracingPipeline/Readme.md create mode 100644 
71_RayTracingPipeline/docs/Images/final_result.png create mode 100644 71_RayTracingPipeline/docs/Images/shader_binding_table.png diff --git a/71_RayTracingPipeline/Readme.md b/71_RayTracingPipeline/Readme.md new file mode 100644 index 000000000..4317be9c3 --- /dev/null +++ b/71_RayTracingPipeline/Readme.md @@ -0,0 +1,11 @@ +# Vulkan Ray Tracing Pipeline Demo +![finalResult](docs/Images/final_result.png) + +The scene is rendered using two kinds of rays. The first ray (the primary ray) is shot from the camera by the ray generation shader, and the second ray (the occlusion ray) is shot from the closest hit shader. +To test the intersection shader, the acceleration structures consist of two types of geometries. The cubes are stored as triangle geometries while the spheres are stored as procedural geometries. +To test callable shaders, the lighting information for each light type is calculated in its own callable shader. + +## Shader Table Layout +![shaderBindingTable](docs/Images/shader_binding_table.png) + + diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl index 50306b516..a35bd3fcd 100644 --- a/71_RayTracingPipeline/app_resources/common.hlsl +++ b/71_RayTracingPipeline/app_resources/common.hlsl @@ -42,35 +42,35 @@ struct STriangleGeomInfo Material material; }; -enum E_GEOM_TYPE : int32_t +enum E_GEOM_TYPE : uint16_t { EGT_TRIANGLES, EGT_PROCEDURAL, EGT_COUNT }; -enum E_LIGHT_TYPE : int32_t -{ - ELT_DIRECTIONAL, - ELT_POINT, - ELT_SPOT, - ELT_COUNT -}; - -enum E_RAY_TYPE : int32_t +enum E_RAY_TYPE : uint16_t { ERT_PRIMARY, // Ray shoot from camera ERT_OCCLUSION, ERT_COUNT }; -enum E_MISS_TYPE : int32_t +enum E_MISS_TYPE : uint16_t { EMT_PRIMARY, EMT_OCCLUSION, EMT_COUNT }; +enum E_LIGHT_TYPE : uint16_t +{ + ELT_DIRECTIONAL, + ELT_POINT, + ELT_SPOT, + ELT_COUNT +}; + struct Light { float32_t3 direction; @@ -78,7 +78,7 @@ struct Light float32_t intensity; float32_t innerCutoff; float32_t outerCutoff; - int32_t type; + int type; #ifndef __HLSL_VERSION 
bool operator==(const Light&) const = default; diff --git a/71_RayTracingPipeline/docs/Images/final_result.png b/71_RayTracingPipeline/docs/Images/final_result.png new file mode 100644 index 0000000000000000000000000000000000000000..af1f2b9b88c16271ba23a15333ee4ea0e9549d9a GIT binary patch literal 103835 zcmb@u1z1#F+cu1WqDb8aN;oPdBB0dJqkte%5{fh^DGUuGF^n`yN=hr;-QC?aFbv%> zbTbU!M#1NP?&p2K#2Rm)Fu_L{}-U;^E;DiN6$)!^69X z!^1n@LvRUrN2&r;2>d!{Atxq;m(fAB0KB-U_gv~Z9$vODAyx|?czwn6rIH069&zo- z&$&jEByBvr<9KnA=L(enC2wGeL)>TgBP}{SC)$TXXc5jAuO&S5 z=ukAC1xBry^*kDSG>w=g_bf@frEl%^ZmvQ0so+uux0Tz?ADAxrKfWwX$l>;qEbgT^ z9*h((drF{ecSU)3z&v%<9$_P0k(Sn$mR4fef8=m=pKUWDfFXCWzdUo%Gaxi{d&WD1 z6!Y0DGC<2A`Zjf1abtSz-75p!$#G^jg= zJI{^U)~z0Ms6Bg}j?phEiqMYP9h@fvY%73@%S zFXpIWtW!ipgd+nDlX94obTWwyO^iCA(76BL^oY4qONIbG@%y;$Q4JzHJ-RQxUlTh! z!Qprtdv=0M>fWXQf0sPm?3346Le0FBkmy9wJ&38*FR z*j)y`n0pQ9@$1kxT368c*U0O}eSg32;h}&2=^oA5nXqTWB@JV-zJiZr_}5+3Ja^mJ zJDV&z1@w^gjfxc;HzO-o;j6toWKy@(r%u12VW>b9d%rC_GBPN7KQ#JFyWX|qKkExw)?eF=T(uX&t&tqRo*O3Pe{<{lKtM|`fk}8n(jlQSdz6Z_`por19m;%QljE6@=BM2e%F<%Tr z@!J=>Kk+<4?g{VBCYU~)YpL0;&e1Hsp6?xyM}6k2-9>y)fct{ML^K9`F1FI9cl-{csE)!%5DeDDE?L{=QH!mfeIc36j%8gm;W2tz zblA1E_v@00pX}}*6mhWZEN4`Rh)lV`w&{LNKE}*i(|G0izo6m%KeF+^F^s<0%A(!Y z#&zp29&T3K_@hq`e@v>uzx;xbeuP=w-zbaMlm^EF$68o3Mv_-|kR`jMzk7fFs=J!3 z&e@E_Z)3e3WzTaPZSHFvIzjLcZ|D2`MmMmCmz^swbl{s=7V+;->tV?pzJCyC^2WR7 zdtNg4ZvM!G1c@I+N`|Oa{Jh|v-{EE^C(`WtN$!Vrq zFklBPd1hR1w`x%Lxta6XS~aYNL?Td6HgNpR0hKN#g8@R^>U4a0@M}1F)gF&(VCeSu z>jD7uhB^_xk7s31`C|}DJ5gjb?o0B0G`XTi43R&b--tGrQ-8R+G zTBvAxY$R;kbu>D?sEg&@-D@qE@`)(&)B?x3NYO0G#Qr-W1;jmfm!$2m>kM&k?fNoK zk>*Vfh|J+*Dy;YQiCN2*ODZC-lpzqGwMs1T)ZIQhk{~v_)#B!$szL z;KftW5>o=7L|Y42#*x{@4s_#$k~sA_G_wRBzKgOhrVA_-^RM#yO~tjTBNG=9GZp70mF=A$fT_aRX%)t!RfH(Mxer(CxD;sQCjfR!8Y!rd0)m3i}D0x_}$|$#63$wzo%r`r5$Cv zXsA|7WeCOa*z64HEh|^XhD2vc48s=RS#Ts?Nqb*)oMcCzKNILa9C1#0n$yP_9LzD(RbC4(1QYp`cpb6xw1qpO8iFjl-HR-PN=8L#|v1yFp)oc_3x+uotgd~2(JSIeCP$+7AX4U 
z{a>q|wPoE2WlPT)l%Kc=T}&+i1!9)}XW*SE-w7196SeB00eyOM)R6^EyxhR@KO`UT{$w6|;3VG_Amfmb+4g1l(&QrA zR^gA(F#rtre()R2K&UcKkf+B4aeojHvio_2#^l87{)T-iYsW^OtW5W5|xsj~u2J&082L_@7^rGPn8QXle`&S}04)*3OqXs-R4r0_|XaVT) z|3zXY8@m~EzFM>NIh>bjzwj`lqU$*K=FP!k(kD~|B?Z;C3{+HZhu%v8{Js#JOW#Ld zAV&Um!~mo8)sKFGkFb35y$d^%4 z4`DUirLrN~nY&=doka?S4CF|~dLiK?epZW#tTQ|f@VmJIPjpv(a_i=I{8p!>lPmfJ z0~U!r%dg>XZ()f39y6@oO|GyB^JjKoSNvfy=&28j1f1}c zKgT0e@S^|nHtwOa!hA?wKJ5DG)zs>MLoi19+pQ5%9-O{Mh7T;T9ql81LLVQ(0e7)> z%v&JzAM!q;zQkvvI2PAyc^Thn0@`aVcJyzQ<6+$-ed&LQkvaevJ9lJGNZ~AY`iDmR zuSP|fmus+v0K^yzS$CE?w*0u8rTcm98A&w&<2Z}3AA~(SW6S@fFKt6<6i}nf1-U6T5u(}|X21jaIZ18$bROB*A%?t30^ivacSLf>`9=O$37Jd7i z@i^Hbkk=!1tbs3b+)>_N;==1vI(NKV>woiwbx5*|Wu$`Zw=96Er6TyGD%x;}N7B7Q? zms-kmfAYL~6jM|6{G3+SSQ0d{=!nN)m$Fc*2;OJ=8gGeZG3SEuM^;5o(`}wF?YwzDus$aIO41`AuBKQbCI+nCy=ISHN zb_MLhSy)Y>_W^b$Qo!seynu|xrTQfw7*p?2T0VM~*u~eLz<0|tO49G?Ac|my+dz}<2Ps3C%}%isQht~YBf`efs+Ua$o$+~ z&(usLN{xEmeXT?4+RX0>0{`xGR75U@b5NMRwow(hmU1d14<7Jd+o9VNei*_*L8@cs zU*N1MaOUsbTKTEf(UdlGPJ*WW$7oZWgp ztA&5N4_RW(Gk`)_Y%J$?4(2tz~p#3Q~hH01H!ruRJh|Prm)O)Xg z<~AaE;GHc64Hv|?C|(KU<7DDX{b!6xMr{pTQcUJxHld1s`Rs9HKToyopJiJt1RU9E zLQNI#+RXQ|9T1eFT9|;Rc0_=ePXRsEr2xxePzIXN%OTp zH+S)Y5VrFcCi6cEFGKy~ti(=ym#jtZ0{Ig8zW{A#{&zE~{C$DCDlQ`U+yAxtuspVQ#%K+D{oH3slp79USW1K#F{N{9i!Q<)b>aIzPj(+%sVf zr1(gW%vLFT=%lL#B`^2?F!+pbCIzQpvMO1!C~0u?GgdSI4~dC)@*gpPwK!nFEzfyx z{YFO2i1iQWUujCd#DW$j*wtGZ*5zgHCEjV$SK>T-yjJH0@qUEm4~Rm@jabKDNs6-H zV7E5V^ZM(nR`~c0okU;u<{xueA&sZWX5JXF(Hi*GcPFUev7yFVq9!PIfSMzgYB;G= zAbI7-^TAz&t!{m^=#}G|P7^ux@DaBs?NZK8;8UEU_yESX_<41Eaw*5I0h2F9p*6P{ zgQ^SSb6&iB#T{V=FV2ZOkX^i&;SdNtUG|0{$7{F($|_p#3X1%dd*e%NgGn?;{_4Zd zd#Jj|oUh}LxX-gben)=O9DfNWBnPSfKy+7Pk0&nO!fE^YgcXBSG*I@Z-MqpM9u7uJ zs$3NdljBAOVuj>$*qOh1L4{8YU1~h$1i8pTAmwy@-Aeo~2Q|?v`!!u8VzMi~nxy2! 
zGo%&8jN+ewsx}G~YA%)%5@tRbn`*v3_jYEbGrf5PGU9LNkiZPHk6|uEq|p`H=dq9r zmOR#TSd`+~$p&j5OcchMS}VmB>DZkRKulzTNi`^=)~$+;8xt5;A(}qSWUn?^jI5RV zlwK3la_NfO7h{2k3;PWH-Sg;qK0vLzs{@*0I*ae+JM3dvc8|Ra`1jVs(C|a?V$7Zr zk0KvKVrt@y_I4aqv>=OEirDmgpR7{aV_SJEQ6H+-)@(kc_C&T~+(wmzKTl3u_b*1^ zTYeU`$oG=_`+X9PkWXf@7o4BR5U$^z_3nt|Dh==#JtCqY1GuC#)>=$>1?L*fzZdYP z*uM2n@#~^{iEUFwHjl#^BW&8fi}mv-u*u6cE5=1I*~u3?jyDpYn~_vX$hny;r4whp zF4a7C#Qd$8Zef=DsvlreeTM(KUeUGxw^S@%UH07DCtlCts@&JnTxi|Ck~|=7IHXv& zDrQ{CAM#mSqPVtOLv#E(WOlDMbHbsFi}a?x>xKLxKjyM@!|TSNe7NawQbq0L&dnQz z(Q&u{pATJ6iBZs43qr$2`}DcWY--z7z^%&Z!5pZBYf#DuBdK$Ag(XSwlR8JYKC$*1IK!H6 zk5bQ$3`=DGS_7FhZjxAsuNFvrLWm^NO9DI9B@vamkHpx-EIBml`M=J7<0Nr$U|FlJ zCBd%g4`pSNksXAxwUHexltByX^7{RE-m36!zhE3G5@s98<`;K8S-P=x`5PzVS>GaU zHH~EePy3YKno*byjA~hOs!aOkKyXO$$&xasxR{Z0n`r3WnBRf28rMLZeSf$~T3B-% z7zwuemw1u6ST8!JE>Y<~S{neaYJQ#N7f1e4-ZZ1~Cl_X#q}|Qe`866c@OkByU>5 z;Y0qzQD{GTb_VTge$!ZD)y_^=2Lh}qH3Y^Cp`Jl*i0*T8Val{xve1KpZ+r8ewyxx9 zM2vbSkFW{kl^dj%)2XPa^ekA$4P#o~>Pf6_dBSzfU-0-JJ!RBQusS}uJ;&|eZ4p@# zudULx>sCBm79zUs7ZSHisyZV(Jio!MHs67L`*%{(($bqYdUDbpXEybC)2XRfWG^#+ za2R1yQBqz{?59l4DJY{ilrYSx(6&|H5k2l#l&sW+TUg0+1%tIM@HcPT*d58ljZ7F+ zp4`;{(*zXXW5nx#4Ak=BOo{nMEt5H}ri_Y_YJqaW-EdzESW*bur4}?%n4R3)FzeP) z8;jc#y(^U~RZBTD=ev_Duh+U|lxUoo8jE8K_42{iCRIvu(G1zYO-NywS*~%X3oo`Q zd3pkn7axFV^M*6fVxShyC1Gu?w|itBXDgD>e&Hv`c=FtetwVcy0tYxeCp+7WearBB zvT=eGE1sp}z4MUAQ$YOTzze=!Q;|L!46`-S>{%m8Jg{yXB|T6oR+V34tG9z;^riWMnp@g`A8JXt~P4Gkp6L|I`}B`3e?oXw?VWPbo2JLi=59? 
z#{&BeQ#THnI>pN!)O>%5^5kg%D-dbDvXRpks7o8&yrj=ZbW|cWqz`RE9fYY@k_43M zf`kNJ94NN5N(yM1z-6}OJ^*x=QhCLi=A0^m4Au7=4%~cjwFH$~99&$}c4LP!v$vt= zey!qZJwT4PkUqEfMR~AB;Yg5mZhGwMqM?8|4~4wZ^=|pGTKUWD*_@WMnQh33~0QXec4&Va~|q;VTgA1ZbG9CAcsytLT1E zZmOJM?t}R2hI0HR^!xRk&Fw}W(}kb{fS9`XCfWr&Jl?MXF`wnTY2SgFx>^Y~#i#)D zjK{^s(j_Wt$0*y|`*TjuByZ=tQ`*h9^P67Q=nGFSo*3D>fSjzgI~Ml559@gZgf>K2yo6jl)uJ`qi?@?ZDM!*wL$dRO%4-R}XsU|Xo};;^KSH-8>N ze!#gnx}kp2q2tSSlGb<$h%S;|#UX*tqR^;mD?7L^AtCNN>#O^Ue)e3MKSAm+G|`=E zkWl5@QaTLR`O1VmW#&k*%Is>g4z8lDteCM!J^WU&nKGLSeknO1*ehZ7t?F%RXmSPr=5d%&QNo?lhpnv;aeudey*`}$-${| zjBn!<9vaLi#$v1NW5!S$E1#D|_YDT!Ka)6WXbP>lho=;glAfCwSTt+Si1b^2JG6A- z81ywha3P`BPE~bZuF5!V8_>G4Uei5rJtqE)AOUOn@t-Y&SVYExj`OozIdV4ym9J5p(;O% zo=?=bfK7&(ciZ0|-yqkf2gk~DDh9o(crYWR^lIP#QqgVv@2%6cdrhdz(lZ^_?f#)z zUX1d#5@ER-A?=HW@!=EOct8eAU+;sr)Cgq-1Yg7c z8%_`xhawAB>En@B!+km>C7WIr?@HBlW(JT-Fd^*CblbOHbNkt%$J4hs)>6YR*Hrl1 zj4OK)90GkkYJO7mgy>!F_QIi_-u$OxNyF?zY*0hg`0AKcWWqvXe_+sxqRyKN2DJ!b zNxplN~t_4guQRD#)u9RUTml8{W!&E zUcSw_U}>`m^f0@NP}$?GRM)h4sBlD-m*`!k#NkA!&aB3#20M$Z=ZFstr9?z5{j4mY zLm%z5drAIIy|SOoI)sn+^{0R{2IYRn~j1l(C@cjlin3!(v3hF@X2N9myJ&&1zgoP2w)NRG- zS4XWq3k8@_0HOL3%L{7ofb($=Wq-;RK^Q)cb^MsRijt1*Q12shf=(5Pr+EIwPs~ts z^kdG?l1jSIy2An4Jy8#|NxS2gZh_-14#NGRI@=tbZ$}96D7W2=GN87wB{jqOg>uYJ zHL=6!F%CgHWYm%29n$m7EdpUF?9&Mf*`^ZB}jHq?9tz zj&?&>uh4r<4jUQS(wxIk!?V4k77b_j`08EAB5cJp*%$Kh_(om)6rI&A2!eX<@NgcU z+oYJod?N+cntL!CsAK5&05$a(aU`SB%cVi&3yKGr_5zsVSiTfw&oGV4vJQ0qr}Ij7 zBaW$g&eIxde`7Yx9G7jy!wVC+l|UZ%c1;KZ`^Y-ZYj=BQ&fS4})pArbD-j5x#z_+Q zQRWw&S@t}Mj*Qmk#xB;mIeafY(qp?58eSnR4YH{W2HdCn(_nA79WWXowQ=OWlDJ>8 zRDOT0*IYffdg52fc|jCD#o!;8|%Y)1{YRUFk?e8r3aS76r$KK8cZ^gz;ekpWJZ3>gEq9UyJG6btM(${M@ z;*n|g#3%6w%P8W-Y+|BL3I`oPbgO-Y#+IP-kWDZRDuXzWiQ5O?GCUq$;(oAYaX*vLeq_2vFk;bL9I41PGZS+BjEgeWHUe-}%eCJz z4R2b>9U`1%A82Jo8Qg2zOL$b!z_8Ui^U8mr{KlxGB2Vdq+}vLh(GfxyvM#;!V%}ZW zfnv`J95=dEBM1(YsTBZgWd9t?DO|3325uo4F79@$&gvxj^mK>}_*`glX)CMr6=Lbm`I(6wuAxtc8&v@}H5H5VsffY6ULBja{PM98$Tq2qg({ 
zrf|`S(bU8pQ5Of?yx=Cxch|ABVC-IND{HGaW#+H84{y`&iVwBiolV)S-iI#@bK_xK zQ0VvMp`84w4If=0o@)exfGu2cy>~J{fA{d1fATxl zv-FrLoGtH#gH}uEvy&DTctVUc}wZ)BVb5%q9bL;E|U?$BaSuUCU<$5B2wGN zMoCx@OfsM)v^QrwJ|VhW{LKMJQ0eWqV%V3(n$-P{?XdD!@HhYC>>feWNl}ZQ<0cWt zvt%5coF*Z}6&nk%`0x3!6^x}-pIYuvV7s?eG7^{$gc7S|t`!a0AOWvd5kOQMf!`iQ zDkhx-5WW%Zb1oUjBDBKzi+NgQM*s1YtW0F!X0xo7QFX7Le{))a* zt6*I1&1p6XI2G|vIq7Mi`V4H%g_}HYBB@hL_byQj*xDGa%!g~d>*=*qn|Ifg7Op&S zL#RQ>m(w$B5;xprmaLGQ7RdG9leDaK7TFr9<>a5k%HI!5Xvrsht# z1Y3||JLN}l+TH>Ipvd~76}}NrxzV|7o~FA=WV79`xJ*ITJz=_n#%H`GGYpgmn@!6e zZQv}_Y{vF8B2-8jXI}^>&S~s_gFaPXDVk!4fBkSc-#OiMnVK;q$G>x$?!(fJIJwU4ZXm_ex<7k6eh965MDx`n-Rx{pEE%ir zeEX9M>UQ1`NoDdPpXgGROvEdb*mxu%nuL3#aO7@hZyijfqR)Rn7dedg?te6Y3>GMFzpGC#~sWqHE5?c>CP5Qs4|0#V}EkWB`5hAMn>{hImR~}qDqL|qpi-dCkB#)Te`hSfOBbK^vM~Hxd-?@_RI*&iUD9!TgJEFD6A>Ak|3?ug{=nZoX_Za8m^Np5`Xktuk#oHrhEGIWO(T{|!6Vry)v@pIgJS=b|_b=Fb=9@7gSGsb)-|-nL7x4_x63*eJwfcbD&~jpRaSUV!MVNE zJRe(9Z1P%~bPqpY1X-9!WE3%mN{fme70cU?G(o@zGPQ!>;;zoV;w}=$=%@j^&ns1` zsgYd0WTn=hbO5y#$fs@WMCzDnDc4r)1M>=vT0dWzoTBjPCS%a6*-cmQN7j&58!Zhk z!-B%qHLx?SzR7ZeoOGHk+D5`fB_@CQYAD;K!2S_q%YDC3L@b23Q##^n31?ra-fO5YZ`#_t}bM6+S-Ix&T<)C@AH zeCH){!L@Ut;ZX&KWam2xk>IpjrRK&N%p?&;EetHs7QGeqkm%A_77)*vpPg^sv_d{j z5PHSOa{1**X+g9=J`&&l_Dmvjg|N+P8YFr+l&rKXdT_sw$3r=tsMM1C%`9@JFP+JJ zvMvpQ5iB$MR*^&x2a-g5Y}#L7PQGc)at*y`G|#yY>=-R_UTtWXx3fDBK^N8HJLrDo zRH3*^t%;6Dp_kbFZfqKN^_r=@PMlD0#YTqj;*WxOA6Q!Sl6kj^%-e}IkrbtFGHCPrZv_@ zt@F$9y1a3{S(lLq!$34!ZqukM(c18WMKSm~1IuuJb4atb&cyN?9`n5E$R0jSWKSw= zk@-{N=6eN=+oJpV>w`I3AR&IfVolcDYfvsOlZ@fruLV%9GQ6S3hmO(I3yBuJl|Ea9 zs>hw)kBth$b8>c4Bo#Zw1(9~%l0dCT5~zTCyg$i~jue=_mhGyF3R)VzWqGU{_i^BG zBi*B_l~tUew#Nlg7$wM@JClQ*VLCj%v(|$S*8HA?yPexnxSmaoJV?IB;-PgF~ zjwI%kclD7B;k7MU3k@wm!{Zp|652Pmf**h1kOtGJ=za`O4mW5Ooz+<=C>`~9%)Zdh zOS*$k?*e})_L*HeA+fWmJGO8rzd5+sS{t{_m-8u(mN1vlwox5jb>V09Xc-Ot7W*mz z(UIyn`L+EgB_shV1M*d}ZsR7Kx}@{76nk1*Nkm6meVQj72gBiT#GpK!b1N5lT-#?1 z*0z}u%Xxxu*Us&Z{p*W|zWJ~I%mvolIcIw#yglYkKLOj4+uZk0w%m7*7I8;^AN3DxV|K7TZP6YugRMSi~sytECFCrC`npLcNdFUwR4D^So=5%e4|^DrH#I 
z?W5JgjdJxygK5OB+A3!vKAPCPT1BZe-Pp2qcwQ~*Krqd*SDReY98iUu#nw<& z*Y?keDC+*ibI~?qd$EV6&ZK-3EY-Z=Rk29ZIn6`j%r~7))P(|fLd*a%a~c1=k$Fd{ z!+zaiP2sJ>x&Y7qeB=mb;?+cmq8;c zv5GQ+$v-usJ*K2L?eFEhP=$ZR#T220GME|@b9$h#vMLgqe9{fAilLsRz2bdtH-{$z za5Rfl!Q@t0Tbp$`B~0NYrUPny;uU^ilgq0nR5XXAY0yyjeac|b#Gm)0DVr5DkvioS z88B0I4ULd(&RnCGC&fzw2dxR^;idvFVe+@B@%0&~nZNedk0e)q(r~5sy0om+c-U2T z*+dngq`u!By?8Ry>E9_xo{Z0wE;Vd=I%G6FM2tl{Ub#j8GbtsbYfVChwfOea=Rp8{ z7jFtSXg|2@N2eqUb~6N7hGz-&y77Cc#G7A zwiE6>fjNSl1i>g7VMuBDB?lmfz5qEcK@c31?oAwNUl1hPCtbO}Rm)h-HEof2=w`(W zwOOx+aBZ$rL0^f8IIo}V?s)pig5&{8++xOy-<*ZPcg#PaRC-oBMZdQE_27*gl(^?V zGBV%jmqz6m!52W?RvA=?V9yseVy(wQBelXpik88U733hT(8WsjLld3fY=OQOPcOkgD+hKO;1(5TsRCw=|oCO|#xB(K~bX`=d zB99=Q#h00?-#+3%=OXuo=jTP&io-w66Z8Z^g1sC3+d9+(&23P@q-5Q+|7vpNUGM1B znXc&N;Smh)Hr#))4)LnLsQ;x}IVnRA>QVAz@Cd(UL@u}AKV-~S!eQ-TX=9sQBM_OP zQOMamYOx-JHlK9iH(l_j3J=eXKc}nlO^nR!T1A)Hp`kE8Xv`08V1Rb;Vlr&qaLvSJ zb1X?kDpQW?rNu9yBFUR z!N1it7O0xBJ4*``YH=qie*H--68mUYWJMq};ILCW-j5^>enq40D6EJpf!}(miVF^X zaIh)i@53;`ZJB^@`(9!jdmkAbq67UP%&SY?S&^R~AG_z%(Qa^OR>> zReqE*ng2z_ZD zQd!r82LWi8Wgd(b#TGARL_-7U2s(A}J7;9HdF> zio0!$>(otYHDvbOWf6|w95Tv{edRwrg>FYRECCMMV*)n5#14jes#aioe~tS^9lin( z4}0j}E@aW2$%Lo|M%yJlOq|PApTEpSS{*IW{Zi<445!&PV4>D7u64V$z3o93T|~B{ z^pCR47d>Vor>;b<#Mkn)4x?9vbH214)D?iXTTD}ThW;f4;kKA1+hn5^m|71`RYjJ* zn#&n&5~S`&?if#YTM$9eUa>19-)l#T6;05N;nr#C&5Nj%$~4#-rJlsWf;XQL zchb)vrDpC#e`Oz!5FBY5EYNL63;=}=GO4R4nVE2_1IeRoN@FayK7Q?ZXOS6Y==RFXrt zzU9YK38Nj1Y_y?hhxZ3x4-faZ?C7V6i+snd6VcgzL9UZ;m!%9Fl#S(chJyD?SBQ?D zXemVbY~{kHtbRs=aQ)DxLgRt#vf2LP0+y{_fcUU&alp?F{-eSXbZJ$m&0ORh<}Xa4 zQ6cmGirbCSRJnyRnxOPTosUaW$psqU7}?(_XTB`n&IzQ11is(oYz70azDeC|l>WsC z`gQyO*e!ACWWV_h=ac7062G@LIqR95Bb&<1m&p3ju~%v(i0av2uWsvGlFJzJdzJiPCI-gtYtJ%fp#~`*jVdpQCd9W2R38qf zsUlRb?ere6jRRx!FB^|)1*;*x@31WswrHDg41TD5RIFMZ+@;hc$rPiz5>IL20zeB_wvoBFa#@+ZQ--UL^u^n7L4Swk z)Xw$pkYJY<4TnSM{v7atwKWurHG$yFia%kC)I2q#O-jSRL(D$E!I zdtC-$>5|`)FH>M{p^;_x!r6k9{NESQ6|0JxyGFpKzN@~?8A{V$hVXxnt_ljd)VBinf1yw>179yZlf^ 
zS3`Htcix*xo<=wSwip3EbbT6~%54Cj|5AcW*NOq`4`2RX(u4w3V%w~dv=F3nSBbJP z+M^pKE3wBOKLwbbGFefKm|#*m3-+gd%C0>9WeAyrq| zL@!oj>d1e{UlKzp_b7OaxB_qjS;g6BYHO1#IP=kUpc`IjN6ZYW~Wa;*MwZ=lg+YAZN>J|3>N{#?DXEMeF!01 zBb}W%`Y^Cvt}QIMJ9@RE(RPK{JNd(yiSDrKe%|yW23p0Wg$f{ZIcc7JtseKCH?5wp zXieNO$9e2Oa9i5Jr@KfM>jJ7MI*2=306jhBU;#tnJ|5*TQ!3tG$OSjFZ*U#vf~1-@ zR5Lv4pa}k~&tdYw%xy&GN0~vz_v6hm9MtB6L31#b{T%tNX+^;U+2r8Xf{QZA$HfOm zYTBMv){(}d11dI32D3BDu}L%+LfQ}B3L#6A!_`9C`U=}t9i48NMA{| z4|apkBn<&J18zwcd6rcb2Or)4eUY?w$&e&21?$N72Um~FRaKO}9q)h2+faURG7K*gSj1t+ZTeGusuY@aw)Me)S{q6wUCd&HBHYVVcLJ&A;h=;zc6K%(g z9o#(-U)66w=_NVUYsDSQNq_hIPIKoxcuE~tCMETZInoRYvPBkP6)GWp1p2yHf8~8nK zo+e%0j;5L#8=%$G@={Ef4XFe0LT`RdpYBIC0WdRrE_;b;`Wu}TF1n!PuhInPC1znl zKP@dT-lIRZMa=5a-@f|59G1-|x}qgpYjDw(n|V;u_|B>pGy7_}CZ$?~9h$!YMVXU* z@HLMjRW5>8cL#U~PEjNeaTeEd*s!onPk(>~O=nBc@sxTjS-)^0xp4kVku9?nYM>7~ z(>9kPr^sN`)d8a`)Val`mb(b`gxJ}Nqd;~2B;22$eDqKl86+JnNP`#+!7!aF5X!8ThQlgio3#O)LJ?+ z7^|NeGHc`nM_Adn);?E;?{5i1rhdHTVhIK;5tO_VdRH?u8p6Vfuv-jCK2B`f$YnUQwy z{d(g4{Tc;AWD29t%;c;3uw|d7$l9OTD_|?$zK+*kNw^EyVMRCA@#$x@%{|yd4tC{? 
zJ1(IHO?nBpMDeU8e=d;6UdUFnCbYoFv0c$4i193%62C@${#PO5B>U=y#4M@WXh6|| z_hMLoi**I+{?0^ZJ*GBTY~WJCLbC6?goMxxYvoY^c5`f>y0oBjh2KF{aMO{bX>u!% z`3irc3z9w|K`*?5>*>*c{4hGY_vs6wP3L^{qn1~>nzKczrm|rK!xt7+5OgXdsiR+q zL~BXC;&SeP<-;IdZ5^ug7F9zF>dP#$WR|>v#zC$5LXYndClCxtg|=5P<$rv8(%I(@ zLgK^Y&BdJsUF>;NSVMY0cXq~L?Al!|s&|#1m^Ki|KQ5I`STXLwQO$7K4}7L*EdU)0 ze1#7(Qk^f6{mgw!vcBah@W8sWfBOu^@vrWpiIe@Yw}HMMhr@=_w^He8pzIVl^Hb#& zo1%qAnr-Fau6PPafjnhZx1@)~!zj3&oWX)$O@5?3RCiW_xiR8VSd**^Rmta*`@w+cZ=G$kEDyZsHA3%}9) z4xTT(T&v+Z?9S|)Mr7105Be1=&KYvrZ1vFDY>vQYPzNQLgMRWsC1|elM}+{kOL7=<6Uq!7GJ;hy#Vt}- z+~MCzgNr_AnrLhMkd|41+R48kclg}i77?_+Z(;4-^UxZsEr@vWG2bZ)m72KwJ`nhe zJOgVcEx_I%7mWhHplclx;L4@-Qpld@KvlaB*S6T!{jA(rX)B*ys!?ahlj?UdrEQUX z-;tnXsEResu@enmwh1|2Npinx&)>}=&f7Qnz3AQ2cmC03 zCa+kim|hlpn8@9Cu5pdC?ITKpB`!UP6E>ZT>7ycAg zi#a#^J}+9ZWUrS`joUD|`&J?oSBq{GdqA>?2En;}oi?*YgAdllGMWH1f8wzDMMv`| zC8)AYt_%T@K7(eKm0Ar-Yta4VMd8Dw;MR?%cnP`YnQ5`r3MS@Fe0V0>X3_=Q<1R?+ zJL*t6ZF=4DTqi`XrKI95+3-G{IGiFo|0iY=HK8zJZY(kDDaE6b-b!hxs8ig%CPYRa zT-2@T_(R5Iz|rdQa2CmRry4Ws<-u|`0eD|V_r3a+lD3)57KQvYG9Qen*epcJE^^%Y|8GghUu~}Kx|B2j3Vq__;gMMs&6ts z(pe+DvR!Lr@|JHX!zdWDeJDv*!?PLfGg!OJ%dv4+w^>2|nZC^Cizgm_Bv3Na#h1@w z4R@S-5oWwkP~#|szB6V+*QdhtRo1@DVXdH9t)EYF{^1#My{>)+>XK@I6DaLt>OML^i(2Vvdl=%~5JMjL%C{KAKwqIw}Qtcvbbr&l){`s4d z`rwRMtdn%K!@HIlgHloPt0N`k2mOzLJuRD?@{t*~$c^wYf4O?Ta-{dh^$2FqvCSP= zIUzRg11@(Jyz6k)&Bk_4-;Vu`T48Gj)vB1HPx0{Rwrev9b~)~+hGrg+fDbWJ;qnW) zbaWQ1LPPmF*L?NcP97Bhl>%snvFJ?W*kA0ku7vxScHY!^Wip_}6#0Lcdh38D+cs`k z0b#eq;6??h5e6vTJwiZ~P*4$x(IMSEnxP;lBAp_o(p{rP8Uz{Lj2I(FE#SL&p7(jb z_aFWSj`KRtaiy`)En#C4^$R!o66QM{GKl`&NrjPX_Ld-dnpMpR}vj_F@2h_5`=cI&TCM-*LuaA_KL8e-YnIA>}^^9!3UJp{%>|XlsDQ={!&x^AEfzf__aI&hqOVWhXyseUw^S5pTi)j zd8FzbZv3<@H&+X}WJ{@8ice4oN zaI+#4_AjbXAifwoQ(9({bDnwh^2`T%n&dexr9nJ9Y>QC31{|hOn$LN)`T-At&o)p2 zFILfXKmCHYcWLpi3Q(UOK&)P$1<2vL!{&eUeC`x7tI8#j%3PF)aXX9a)6^eLUNdD7 zm4xVz8#HGYJz?W@&GZ0`b?Du{Yp~ENxz|aMV8b^6Y7}q@hTKCcwFftFtzl(ImoB}5 zaa0j-b6?mDn++=96_cPzer$&P*!ARlc$a?}UEU`BS1!*nR+kjfSbV36 
zkH~kyg^ot1gHXy4JF9yU-XL6gOBR{qOb$!`8y9>>am!pAlWK(TYo8hOtWB$C&e7^_ zHbVP7&<%!Tb2*BLjXF!&U&oD*Ln<dwk}W#6&ws;Q(mwGj%(*PHK2CI1}3td(^) z$Gu@WVVOBVPS=xquso%~mxGV$w(8U6V{-r}^mz+WKb0x{S=K)z<^220nsq~E^&O|f z$RICE)xOBEI2!qua?Uejow&iFz7AZqPIn-D(c_&$*ofr$G=0x3&odP2frVV_ol;#R z4*i)rli@cxr4@gZE@q@!S#BRU{9}GyrL}8aDk_dWK*_n4)aea$;f+S3V?i$;=*zTZ zLDbJS*#iM%jLt65yN}NWm@J~z7dML=Lor7UoZ$Ws>YeFQgKRuRvfS*b5bwn7)*#Nk zL%`rM0xo1sZ!mp2A`kYVT0rgrhJ?`G>VxE^8f{Swo}g3X zj|PrbIL}3u{+U|qaD};fV(4xtXh)a~jGZ%9m5;@XdEjVeWrq*?nxPVHVqP1JtiBo> z?AZo3C!veasGFa;T6(E!fJI`us zgr^a;Wi*_N~CmeVdLHSWZOh;vGIu}8Jc*1-kx1@{`Dpz(|0;Qfr| zCgeQR@fpVG^lHtA`@e%uNM8pA*yLBc5S= zQ)H|}4gEH9VUoiE>Zo@ci^+gA zA?zNp>~i1_be^Ao_$+0QNnnIpaQ79gA9)I=#%Es=2860vL{WULT>E#X+j~=*s2#e!R6MnB}ktmFiGVE>6*+cTJ(J>e!2xo z04Axs*v`kL?6bPd$IpkkzLhCjK;MY=9Y$jwYW{&=4q^$%5`7h_c1`u(b%GrGyOmpr zV1dJ`@DUdkb{Tgmm&uQuV@^*S4r}Aks*A}&=%W{=v-RXre4CD7 zoN1(c{n&Z-G}O51S9-|5?4HCyoy&jna+j8lB1!2b{H(`zE&SwFen`_H88QWYss}CS zNSKZ|9+iZ}z1m;v56s=FmDPK@NXcS4iINx;MgjljXlZs-V01^cY*VZfU2b}lI)qGy zcBR%iD!5)fXCl3p`9rdrneB8^nxPO*OaU{f>R6A^hcis)3QXIV+a-D3$@aT3J+Hn2 z$%SlElH=e1)ULQFR}OXj879AkfORhd*+r$CX06$OicKXkRZ;(gK6tpm;!X0 z`X@r;v5xSI;EXU@ljfb*!>^?xd7GLiY`a^?u!I{L+d4)q^l2(?j> zOO$xe4j2MMA!zA*pTjvX`%;N&cC)A}l#FE10TH|qb6*#wc)O(c4z<%tSvM{}|G8H! 
zzx$7Rk_ImCt_UFx02VLkw|>iaXCHs}Xj<*5eb+OC7w z;DX!KlyH`<%?qo>+vnyg8--UF@q<@JBH{3W0k^m1oBg}2xv)-F?IIv#h6s2YnNFuP zRW(@;BG(1?SSKbX3XGeM697at{5Rb5GNW}`LF2p3+2ox5!9z&u=abpzU;exp`q9R; zrwkkUyk-7Ofo775>|N4X*tf#gTi5-kC(!}VF6Tg{?c%&CPaka&~&vuFmt#!N%hXy7s7<&0w?S5C1@I+lS`C3h`IOEmy^ z5IR4Ax@sr0rN|g&0;G07zS59%9?y5n7c4m{fcJLRrVkd(&)pnm45chc@XMmrH|0B8 z7)@S+xwCIfhn(|tV|grC1vEDv4ld}&ryMAU8e2JaNy0T%f-59Ko$%9lunr;OgzzG7 zaCYlP{9>9_y=G@5HKZJPCrA9BR>~X(%*Gv1pAT^IcLqEnuQS^~=m)T-E1)*Vijhl* zfOB4`+!lakmCTHQ^R8+K_|Pgr_o{*jJfv-2XvX`U4n^AYu+VJUW7iA?#ty|p4qdrM zp`jmT@NOV~Ncl3hk)eAL2n?;=MUJWY4GY{XegR<0?Kwgm0K*YGsWB7i*XT}a)cVq2 z`4;KQMeMh(Vo7QcdUV=>5`iVs)7e}%Ozlg)W6=Aw@O?$_e6kYlY9({S`k>gKBi&6e zDU_?vzE|P3JOOmbJr=e!?6atI&u=-Ymh9%-JIerVHsiJJ4I;idDgd~6J>i}Pb0qWf zIMC|L2I;(!m~K z(9}DB`rZ#E8yFl>3n8r*UYky=v{Y1E!KF~&j2~!@%HMbNK(A_MLQISkr^gd(I~SNJ z2)mYjJPu^ofGr;UDA37x0n&&^Se}CHE}^vo!CFb@3kH(C<}5(_gqjmXDHR}vWiMxu zod372Oe-d=xCA{tU!fM74D0n=uh2i3BxNV!UzwKHJ*nJ0k0;}cJ$@+BdOzR&IiOBU z{rbFqi zO$qHSCkU@|x=?6IzBVnz{u($Gb%1Y-N=nJGj=PD%kzX{VuE)Dago4Q82L|m8Zv-mG z-r(JJU;yY1>g4pJx-!!*Oh|%B1AeFJAkP3M0Sim7_4cgyUc%w3XWf^=qC#U15A_6- zkNlQn{J|bHK|Nr%VIdrw(o^ofF90a_VOm2AS`iF3+tNGP{jX@EV(nASWq(zVj8z`i zu*KUd4M+m)RW{I~7PzFl%0;YQET06!O8;VNttwv(0x1T>ig?Ytdmf|PtJe7@NwlRsw9~ewQ(GM7Izk4v!N>E&N5+UoFhj!I4b_{3mcCO}3 zDKo0{^)warJ5*V1VGUWt2ZK(oNKuZm`kLXgTOJCB$!qyvFC)GGFb1#C9_r`18P+;r zqp|hFYtz|TQ*eg$@p4KC@b+wRpN`DlfY*IRB-IDijF|{cCYPPXs<$E@d z;;#PCYSSkrDYe!QBcwI&W$pqg2ni*VK_ChS%AtXQ-z1*}ft!aul2ExHJ&E{JuB8UG z$~6Eu1>PJt5^MN*9R984bk8w)fncSi1#9b_+}2EASSU<1r!{37iS585kG6gsv<9B_ z1Eyl)awoW7!FC8pY}(l^;<;_;l5AZSE?myNTiaJz`vrCI&L%4udER{E`fZ^-0GB%? 
zR6l#Laq(!c?9BEa`=;%Ar$YhQSZ*S&?@$9XwKoJAwIx@K2ohYS&Z@-3?}*LR!;L>O z)8L6RWQtV_W+UrKK1TOGf9~NRP@!w~&Ij*&4Zn;@_>aCI{O=RP2Fy39#0*N^PK@Y#}-?dY(YSAi9*++O2 zQQJNZHia9Wx3-=H=)qto*G~+s-#+1Jynr{jqGRLe9x-pkNW)>qCgYgI1BtETm<^4m zU-5O6B=)1faphoOy7+Z;&5hK3bnWtCimeyQ+{|XUltQL91|JTH9c22YqCPB*b{*M~GvAgPM${`O+qAq-5f!v?`1474WJkZ2a9#zcw z3&Xz6!%s|Hz8OpYxO%&ki^};nDkD8Tb#(MCX!4+dolMF{`+neaRr;d-g4v+`KavGB zF#7$wv`438??!{p&MLF!W1fflHtL2{kVCUmh1FQ$xk=N ztikX+J{YHQa8wpNm&7K|%-~C$6!E*@iT~X&R@g!>ywjk(akPh0S?fzu8^=}e0`$)R zUn;t$_TckM z>ipfS%ces2H@&DUNsivv^%frVv5>V-B>r^1X+`dHmqmlMEOlYfpc%009#g79<)*={ zVp7PkZ2^hf8c@%JGBxER)AR?yVh>G6Gxl~?VV%X2i0wsCY zZyg-8MZV9FcHX5(Kkipc^A4ZB^LJP2LaG#oDyM-J%`(aG^9UQgF*zy&X8vG?#wC1k zh_x|<+MK1YoE_;|;^erI{5B%AeuMS>QE^dM?280(K*e0Bqc!?auwD%XrZ!xNdtiLB+(;2~f{U==)gv8J-JWY(l5P z_9dL)C&W2$(BoTgCCWHAk1GFN@K6>hGjIC*dCgl*1B9IV*&CIL8ceK2k*W@o(I@q1 zEr!CaJx@&+5e|(Oe?GxO<%r9fk*MBl9)i_ld^ff?d7-gr>MxdmtbszzSF0vFWy`X+ zwGF4HHgKNIom6>$pu2XcIFNGkwsbTx)(>jQ&gJWDZ_;=Q=|tOhHM`}1mdZ957}iAv z&TlSPJsl;czuhW#R7{y1va99Yz%`|-6Umt!n zEkWBY+yO>)bu#fs*qr`rgPhuA58Af|x3l+&WT&S6p$Gf?Qfot)(VIialv)&XQdeTw zw&*FiIlH3erSNoI+!OoOgIK1X5lI-X5*_YpnU{+dFBfw0077*?EudU)J#og+WH9DEMDFt~JOVyAsmGt)m) zuMgRSd=EB8=htuTzp?8LK^2urj?V)Uiu;??=Q*v}0N5C`0oe~yn|M@x@-zxdO$I2# zZo9z%u@6rKAO#fjTJR%zok<1b3v11YiGV&Q822guw{gwer$6ga{US%?Z)Ov5EV2dK zP3dE!p--`VE#8aigMdY`_@rv4(ddsMKJ7gIHZ5^GV&Q_fb5MQ{3ajSh@cl3lu`n!H zuUfReFe)W|AXLslM=Xz3B*M23l<9~Wj3qmf-Z2>0xcZubIh36%^vUbFXQhFEyK>r`*}LGwQc#QQdNuVWrsSn>kwhKe-cK z6JaQE^`PPXz~D_H*quBT>`IP(l>QCRP}kzN5>@`C6tB}#vl zm8`8{)1Gz#cf;&3pT3aXQt0)sV-Tf?1$`d*D73bxGLHE0Mzv6)IlIraS~iG4KRK9= zbi=lPKjSp6o8dI>lH8d2p!En8y+_{@qQv0DaJ5rZRU+NpXvDC}>>i;x+_~{fp^z689%_lo}{#Gn_w*3T< z@zrMtLwBDKJACYF&C7}Qwm(6%t9+o-Vf-~0@emLlv0@Wui65GM8nIrRKZ#$g4h>1! 
z5B%Q>-8+$q|3)*iEepm zJoWKUlS2YbEZaoV6?n|EhEt*LHgQd_G0}=;GD#8}Vta>+XkRcnddK$BLjQ3~i8Z<< z-fu7H14pC&-A`wRpDcSGJyEN2p$yIYA1}zRo*$^bBWIL!GAnv_>PZ(B|Diem7jbOu zjLo?0{c#)M!XcMR{ojtkAHZ_Tdrt5=RGXt0D8|(-8i~v49Un>tA>`C8$|(EFB>v`;yw;NepF@Rk6vZ z{E|vjDOX06+N4_MFNvu)gwm)$e;LXFrwznHVVatSC?fi#G|C+$HgMx6(flABIxsr) zcM8ZpAQ{6z%D?XHd2sk|BwN3IfDgVm+MYDE^~CKjHQ~C{R@$cZz`9Drf`SK;hVxVv z6iqXGF25MG34dxa~>?j0t)OaICjdo=3eJ(qhJkvxjICeZ;0#-d*7Cst0NqLn`MQz+t@ueUJ z$$-DP&8vh9SJKrWOpU!`VCH7)s<4FZyb0Tp4c{Q2$pKPCF-3JfC0D=ljfMQd?WzlM zWJ*E4w&R%8o~~?J^o3(9Z~?)4R8;t19C@-nI~m0B9>sfNpcg7-k)^c0LEH|iWtl>N z21RpH!)lP86rSX435kP=I>P8D3(5wg34c=0Z*x_f45yU@tZltK3zk2OttNvmP>r?# zwjo`G)0}}94|<2GfnKC%9Bk({mJ`$tRMJT+UA-_Tbn(dh<(APk!}$ljX@+`{U7coPcvBzlDH#d zl7MhdO(>okv6rz&8h^7Vmm5WSZ|OS|cA9q6x=W*+@k@WSm^16%lsS}3nv`r6SEtZ7 zb47O|Ud3*8Qt3xHgVyN3FR2!>+EcOoO2rO@WktmZ#h1=Ux^7S_vmZ0k+#Hp177rbT zZ$QlNbjU}<7m?#4dn5fOA!LgDJp5fAS8?$q^HyHf0G!`W*FPl&wN8NV^gkbE8KCxW zJvF%KE7eZ2D82(dKV3O!oAbdqKoW6{4F{KM-*Y`+Y%XgR+1M>8^Nt8IOaDC`?L7y{z_f+3G|O_BWZbKXX~1kt_OKZ++gT_-Bf zispC;WKN&lvjNeoq^#Q1{ac)01{rsa$)1vRVZGKN>kZJ%JY<2a9L#XWf1U zDn0NkO#AiZ(0V1O`gRSS76}-8DJ?_vZA9LH+gWS79zz31)Xbtpl|l+2ii4}p384;z z(56kwY3^^GzvU_5ygiDp(}F5$v|9rRn`+f72z?g@`sMKIIFR_vvbbsBNcVwO%~lZS zuR{?=3+Of^$Se_pw#(znTk23|dc1tPZ^-VExXG&uiJ(e>3?2LnhK#k zVD!`4qE-PqpsJ%&zw?r^K(+Jb4^o~aPH7i-0rYSqPil9THSxf&TlBK;5x0X0n+3zM zIV)jG>1LQgy3(7kzci-891N_ye(=t_bo#RH!c}XhpTd7LFyW6fe%alFE_Uv|ZQ?NY z23Gmh=L`$vxky2zfJQY&RLoNO2b;GC z8E{KF6N0Ch+xu_tkz|BF6K=PW@51;AQ1V=1T_J=z4$p=#U0Gi5 zI3GBJhNq-k@?|yg=C9b^G2-{rO$uz#g&!E4r<%n5bwqNNA@T2lz-L*uq0A=E?qxuP zB7Qz3bZb=~JlGeola<61gI4`?CM@+4-rHW-L=nElSfs`r zH8LESFb-~z*1i?Wp(4lMH0pkO<-O$Vp5l6G31%ny$y`Xa8Wgl5tnv)noc7NV%at^6 zE$Ax4l`U;mEjbc&y+Aj@J;Z5*LpeLxn6eKSUYhDDryu3;&sVmdC7qiSmf}Zvt{MK7 z`n?SC6!A4_Nj9iD!*R20f&BZVdwbv$#-zCt>@M_WYrV#iqz70GiR*JJo&R#Ak%y9!X)M&b5-HVv5yuQbf*6t zt(^Gpwq_5p4OcG~3d;Ml3X`}WtyLm*WA(Cb%W%FgDn1_yNQk!GzV8F@6ZYcqbiIrR zkNs+Y>RXpCYggSY9(t88@6=cDP_H!AUrXPbPy0)Av4x^T;-GD&p7KF=GqeGi*%h=h 
zC}W3Nk3G%U`Itd8M+j$<_Oe=KPbJ;{2wXNi%cvq9>iNV`R-(xGGc2`FjS?cei>?wbr>q-Gw+>_+*xcD z6e%kPI%m5DeWp=e_u-i`M3?p;HW3^NV${^=5p{p{O|16?S7Ti@fu=fUhD`e)HSR?vt>N#o z-gh09#v{V6hH+~c(R7Et4>i_Pj->%hWN=uMt4^>BRTz+l71|3v?ZC7~JnLcPL1FJ1{llN#3GyBFnYYml@LHdXFvoV=A1>ze&#vjOj`Cees zO-*{g_Lb@Q%Yh#6El(BTJ(z7?sR@2iPL1CSkAkQS%ns+LF4^{xTpJg4`pY>2Q?i@7 zG|Y^G%;tzRHLpth(M>Lo3*_&fJvl-n!6N@h>o_Vcr?CEEC*X9)UdHchn2U7nJ@sj@ zTKIJSCY6)-_iSmNaVKX-J0Q1_c8LWYvPXm2p}Y<1cfBs4A|A-wE!u~>aOQ~>`%LLp zz`=tnj|FFLfR`Xh@#U2gYFxR()|_O@3ME8Z;`L0*IXy3@7SNrs`mljcU1{vXks|)| zS97yCVlBniLL@;C+y+1*6>&~;9vxoSN2UCC{Jgf2+-iSwH3`mRLq$w`KM@rCLV_RCF@-z=>Uhr$CWX)zoSi*tOe z(uniDVB{J+WLxr!%%6<09YJyqitI*>z&%pq605XRd;hNCrxT24>^63v${P!>cR+#s zyc~p|@06D8GT7RrY-mAp96SNQ?5;~4D`;zNH}$L7M#m}vv4vobw=RND`Le(Kan$S+ zvrc`JazNJFkRA$%fVj5D7~p>^Z#-0EpF_x~^?Yk$2b1~V16L1XX9L!&&lZz3-mbk|?jOt%dSyS|mgC)t`+P?ms(GY2 zqcQGO^X7cBE@XsGnX^O6rxVOBVT&B#O6fd4oqRVM3?e*>&!G)Xi*oJ$dPehiyba8i zN$F-2r^t}S*`~S|wMdOQJy?(XXNfIo6uL z!s|CCUz&F7JA)MxOvMa1^f8ARDjefo$?BPr zieWA7-b_I4wo*5D&nH{HI4^Wx&ZhG`?)Ei!4*-~ch6QemoOP}B%XZMC_B)4to_#kk zA948dRq-#J8`x&%EVhx7q0|&=)P-wko=U8Hg^>yeF-=T0Q2HHHQn27sWz-yt)@=+ZuAx@_D&%jrX74xpkph2}FwkFQq1`?Uej zd?nygod9<7O)bCH)4VM$!_`}%wXZwef)68^R#PvC+16gx-9J(|p%-INfx0X?TU7J< zPYYAsOS_BzTp$DBc{IMDG(A*Q#ev_j)PeOm3>in~g>KghfgHzBHcThig0OnW@3I?M z9ULkx;2OTB* z(2ZqE$ch)+I=mho2vxgXEyO&MJW((QnXDzRi{3Z{g0T+<;^#$Y(qu`)@>)%Sg6t1_2|GoaPRNa$(&$c|x8c(TScm)gsds>!m3&KbkjW*&$}Z9soXZv_5#h_s zfXJa+`%#c-3L9=lgZ|8d0CRf;Fy^tpw4L6Fme-H>iv4LM2|HzYupBO-@(7nVv(#BK zMK^Mb&`}He$90PXuSOq@oh?!gEb>Bq9J@GlM%%?(wj<2jkVmTJh8jl34)i20&?C86 z?ylN;MpsBx<`7P|KPl4Q{zWEV%q;iFgn+r{szK{M?b#U6&$BQ;m*LuJ3N$>A0S8Z( z`Vnm*hiNiFHEv#)=i$YL8Nhu3lOn(Ekh~enuFTrVNc1pMDu2ZBY3`ek zdL`c&!=M=*F>Six(138d*zGaQ@y&ztXvK`mhxOM%iZ3Xr{#|i$xE};-;vCZQXYVf> z8P)TX*XsIR8F&ws)!UsE#gfji zK3dWYqg?AM(68Zd3lct`EWG;CdQxF|P;nyg#3^^26@PW2)7p))MtcUOgZu0a)ud!s zWz$D6_=66vgx*eKC^wHYqf<_mdwI)=VUmDs?wLhRq*t_9kbcE)X%x88UBMmO^;jt)48-ro z&kI(8?%d;^gv5cZPtDlQ`cnJ>8ZJp7=Eh4Vl;X3UB;l|7;qeOhE;e>a3Vz!yyAm*^ 
z=1=h}&R$L>yIy;!D8E6@WiM9ESw-*fv+sYh=L!+GX+`VR{40VY<9M)&V{8}YL=Gxk z=G);J1*C~DT+`FV{^Yiu)b{hC#2EPRMP0N{fsy)TZYbiR598Ks4M~L&zMxZ?dK(@! zFLYL;ok~K-e;axGG}mT#GKST6AixLHZQru<2EHQ;XTDlgc${|PkIYEc>Ln&F^3W4!~k!vo}0Z%K`THrP~Ya$ z3dn`o7ko3%ZUWL9pO#(zOoEgyB@3huQ=b-IhX_~#85by5aC{rz&JJ2)2dJJ64oIYE zuFsjhXaFS=W}@z)py+mt<(4Ri_Z*B)!FbWuzv`|O#3krLcNGgEMV@@4yE;D`9{oG| zIw?+j&CdfoZSY0q)sCD&k}kB8(`R0h=jEZfPE>;-;&ZK2nz!8Ou+Fw^6CwJ$7(~hK zjgX=h_s{lBLkT-Eh&w~VI4dpf)w$nt6rLd=gCG8MicVYn2`O94)QefGwFiLAxZ~Hq zsO;F)QciYgVz1C^L^=31JdX;oR4Fga0}Y-tuvoxQfKysF{P7jyBJe6{};6p@N)Z{@Fcttnzk`8D9GwfjH=5EPHl7=V=H{NlZUH`%7CKw65Ko>^{HQA~s*# z0wVT}d0W9aw>=6^x%WCHNTSyc^%-dJ#*<7$^Y$6$&u8lf!IT4pxk!H&bm}mh;CQF| z4{YW{(ZIOsi(7_X)DHjME0R$)i7aT^26mrI4sjjB-$ z!39tXXAr48R<>H19uZ{mjRWr$6iSVG7S9mE+=5_AHv5Il1FYMJ3Pi{aek`iXz%52aqc@_h7g=Sj2U zJ&ICRbPGbXSqg(Dec!!*INu?EygLDK$TRnH%ajSB?E-q`1kD)(-KK9Vb%(F{vS2B# z-62khT|cILzbt!LjEOUX^+)sG@k{9MF|(^&W@areQ(8MGC+6<^%Zz%0!+`gnI&#gc zIEX;od$S5C@|bnYJ4Dt{%!tbtdb2sdQX#+2_s}0c6?lvWL7R^l@Yj|us+LL|(Xt)G z^A&H%Wh1=XIl(B5>LQ}ytgdU&1hlc`dHZPc7cPHogE5Fpx=XY}u?l>F<^r7$48Iii zJPD@54zqIDXs_eTt#?NCZ6UlyM({b@Ls2oGC)o$zmVu2D8_%A~wj@iPkx(rX@yBb!qndt|8CssJvgxq&5%4_fe>@#-7iS+ z$uzW#T8k-QNwJAmtiJOs?u2fhRVWojTeeh_f7fqjU_Xm}VqhTFbj%&@taEFy+N@yn z?aS}_`mzZ(c2&2VE{bC7EhD34Z=#qeYBwFKVhCYK?>yW;-c7$HMS_kKe_>J4q$%p_ zHJdXQ&maYg-KN}xceBoI=hfaLA~K=gGqs{@;)Rwa46Jn~? 
zHW5Y-rw~r9tEO~mPxnh1rb{v2TSxdy5~(SJ+Z%rP^p|Fv!AgUnI57;q3SH1PoJC0O zbYVB~Udl4`2DkDrFKr38dJcaUl?qnR#^1SQED;kq4tSt>ZgF*#74pdv>|djn_@s=& zib1UOp3Zm|0yv6oQ_%K-VxzXpH%3LPWmO&(Y6|NRha+<Sy=h5oDhj5Fn5A+8d(;kcQMhnzD_gEM6=Of z#_pa?ZbXGAOvjskz99BFNU0)}_7;=&GxVDq88O5&aPL&rZiPejj3`#1n6&mV0h2O( z>u-845d8`eUg=iP7biT*jH4=?zV@e49Q#)7eZI9P{TQ!F@ zAIID1zruL2N3N@HEs@^~lS}0KruqzB+JgAxf7pcp`5PknS`)21dyIu9Y zT$c3nT?eYc8d8sUf05&D-z=`r6I8EhAq~kL z41Ia%j-B&qp_D7KJBJTt%J3xbHa;E7lIN>=`*-5CTMzB?VX5UaUZbpW4PBEQwq3+- zD=vX}4k3Ly`}4OlpFqC5W+Z6*aqPa%1}0L@WTl0572-GRVBQt=)UAx_5n$q>*$}Ff zEQUVb5G(j4wyKz0DDK=UY_~xF-{OC?feihZ8^z{Wx|3lkoy}+Fg)an&LNG#scwO_t zN&8;=HF|_05P*t+TuLE%0<&_Lyc+e+&N2RJF6Q5J+F`|L6Vxzd6CvX~|Ga{S^!{3= zt2Pj#xm0x}3`%>!Ph5TLYI83rKD`&bNPo`t4bT{u$6Vhx9zk3g`OOs?crA0Q?wTs@z$dDO>%!m^_KJK?qL<2FM0$q)XA=H5Y0+PZ~m^r7G^P#=AY zc_M^NiozxZ<8?Z~W^AOy8x5`$lIN%?#j$0bFL}eRl zWDD+*^XIV@ZZ0!lS>iYjUk6g&CT)+gVQSSLQZ()G5vDLe+Yq!QvnVq-{Kz=DcN_!6 zt-Slchu~|SupTb^1=X8>AM`jxD!I-De)5r`HWUk9Kb89>s!qF%_hMz1c=yR3*aYJlKs!Y_;#KDi)|lFsS=F18P8 zuFf`qQ-L*Y$4~Za9_sm%(pCr7A2Z?Y_Y9hD`-ZSO--tEsYQu09Z#M6AKzr#WIFX5v-I$w@Rl|` zakUnSFKRTqos|1V;VxKy@Al!YznK-D;!+HVcrrZ#W+sm6H278nw)|J0yDe{injO5T zV(Aa`)pU}Y-g(hPDn)#g%x%=k<8nA^6tqm7lLIc=5X7^d%!wHPjE;Bu_ZTR-H-KM| zev)gv6QTRl(ul$ZwCd@ZV>qFY&~Dk88W^RT&Z8wudp||#B+LR{1MTl|F_*`ZhF|<> zgpoeRfctDZ_H%V(|9xZ^=Gx~(1>&dx)3Oe-9qq7$N$VI(2`5Y;F6r$eS)*G8`3zpZ zbf3u96a-YTBp`!;Zlkr#JB4%)$8IR5wLt&3RRHrtb7KWjhJ0|Dl!m&+7N~KHY3Y<)7Oy@sνH~R;ctLQ{2V6a=LFBXI!4y5s=54 zBqQ?dY%zdR%@HbLHxIsi+>@mwSBq_1`2}L}Rqf=d!4Ds5tmY!l3(ihPZ;h1pi4WR- zD7#Q{zjnCmTeCRV&19ZUOY-cvOvP0LHfUTTqakg&7lmKWbc1DPRf_!&9CtKCu-2>e8*;1 zCxm_S9qk~!bnHYY{fy;55Os!(C0RevH2tc5h^?aM*Jf+Q#J6%{8jDe2Alq-4pJ&?x z1Df6w`8W64)?O|%L~oR-kAdQ=(7>q6P<3Z52nSR(Z;yT?SC=A;*{dKZ&APht{P_-< z#(-P%2AF4Twn$|gf_0_w(~q;ph5A=_o48f5hqXN}00W2#*QM=A>nYEcYwZ71)?Y`E z`_=(YH&GLb;2P93=)0Ne+90QY&KJ^!UeDJ&=((zHj66oC9}s<7A1)9l>Toao zGGyaVidlQNzMT=Y58s^K{iiGsI5}K**(}q}xo5ul&cFNP2zfhJr}kb^U7_zcmU3Iy zVGJqw->waq#P8^Q4nF(kMPh%FmEC;N39K@({^4*QboxIsm8Zg-={=HOtdzv^J=gyR$S`u7CdtUobA 
z^!BYOZNgZeJYYBvpWc1}(e(a4A6Vp=1}+En(9d%PlE=2GK8_l>s_%N+3j`4cIFZ>o z6V%)QIV6msZEetdlJOqUbqEh7=7IV1Xn|sLPkQ81#*y=oM(*p({B@vPe(^#-PLxdu z4LZDnnY?ur;s9E{#}(pE!Mg$JiCT}1Z_zK%)h~9!PXYp#qDDXm{dGuTSTXeA+c~Kw z1~ew_Q5;Q1}V zNz=&PChh0g-4lIpYmD@XM;Vv&w3`^+)aYvB1hBEb84EjIO;^L-|6JVE61OluTensQbi z*sKIZdI_$Ch741}8=d=cGS@gt1z$%Y9)c(CV)}mlN@J7rZZd%c7hf)Dl#K;e-sgwz zf|KKtSIK<_`^HsdNOhpYtB$;#)w(m@_h)E=#hn7gcV;(0ooDu*rZ5i_)HPzjeF#fo%qR6yoeuI%r`ez<(ZI2>@qgjA4jLwB; zSGBQ@7U(E;ROOUL5J6vuB7-NH;%HG>Mk(iw|AKim#Upp`B?1dr{VnX6^U=?4p9s3?I^~Qy#B!->rk;t$2I3?Ndr*V#PI|psV<%?9 z_#uZ9ikfGJb8^bqOtl`nT(L@Uqe#R$xOqRTIF0b?E{{LJ+0avxLS48tGIh00EGXB zM~wWIi1hCfTK7Wr1G=^Il<4o9=SuY2Bx1&MFH)yi&Xh&MmA)J(qE z=bl^y~xqum3ZEnn{W~g#^C?}=W4g?We^w@qpy}*d{ zO?1DjbIb7jKGTRb;z?WVq(lY&kfR1Xbqs=YurNbbe8~f4{7MdJd(p{!5u%3B$4Pj` zzo@Bg4FXL3G0m`#WeSSy_Cx#| z?x=$45%y7v6f2%N?ln|Dd)IZwXkhBH{UFkO&lP2@(tzscFf~H0nYKr~;-F$mEIdKw zjZ#)&nNwlS;b8!Yy|c{nKu3T?Ks2m(ZoWRfhQ}HSsD;Bf zh`~*0`!4mAr~8M$9IJy8r|~KEdpK+NvS+1NZdk+2<+ zQq$kdM(UpUSR;)b1v)ff|7tVIC0YYb6PcFTVCLGXsye!7cD?^BbauI7MQLNd_Q_4i z(ZXWoL$4U1e6ujDfOihs=;JkffVy)H;@J-wjlfgozod5PhF^hIYDR;ujei~8_zZTS zMwF&G3FQQN*=S(o0`Irp&=I#VF@uc&6N&wt)s8@uD!ntAJ+_?uIO;O%Pg^7ovdn&cHdI!b-TlULSsx1Aknvw4136xLR0) zhJ0S?Q)5Q${&>7`jb3?R=ZST=2DK2C0m|vvhYeot?DuRe~RsILnsdv3~FuTQ> z<}G3Mdcf=!x?84A4$v8dy-2ad^I1BSvjM1>um&hwaPIo}64(Lu>0V74Rkfz(QThii zyr0&yo{e41pM{M&!(LR=ZdsAZ`5pc4vM8F>$NW8nP zHt%FDe|dax(r>vA^kMl_zKDTj#>q1Zb-l2@U$n=lLEp87f1sqM`fWGSl3>6#YufkGYiqP zqmVEI&21pqI$7}5)EaE<|LE?CIg-OG%mO(%*Bh-@km>R82CXpJg|ZU=3D<1}3&dD6(vI zOE)Rdhg5>`@PeFRz>fhGq5wy|WjiU*IU#q{|4DI@n|)^nqv*UC{Fe2=8qYg1im(b8 z;R0gmi05s6WCMO5Lc^60F-$t`jMK z6Hp*D>$R($W$qyfY`lo^o&1(vyDMm9!|>j(hkq;gvv?*c=N<^L_8vG%REAy~5_iu# ziP-<;K;p5NDfap-7G!v|B7rmp5ZLi|9i*z`4||-z_Ok(+0L#1i4N=unlYWEOD``JX zm%2NiSJ)5JeKJXsOaj&39?cu6OYsK+nJY6HAMJT%(^l_8fuE}|<1F3eeJBf^8r`)` z&bSOM8np;ebtSU13coGQV4?N1a%%2oiu>Lyy5-Xq{7V0mS-maDXk#=t39tu-%vHnH z6e|P+$zGXS#etAN+Xh?hBFOQ;OMDx9|AvXbwm8uc%JpN*fK8(FU{CD6Q{julmUDqG 
zRq1rHxorVf2Ozn|f(FZZhMXU#tidyKHdJ;`0-zS&2&3a)VS+JhV-w<4?iW6x&NI(_ zfMOeYDlU_NU1lM#m#qP#FyLni=m$BtTo zKAsvB2Ck;Gv2lF~yT89oT$`5Dm41njtQ#wT;6&U*u6OhJ(TRL`X6Ur^0yFvQXiE_v z|9amq>HBY5M(};Q!>QS3=Z`ykpNflAHZqJH%C%owFNaTh{12Lv?>{I$>Bce+ytRA>m^`upbEkTA>>U67gv>qnq4E^*{NT?CF+cunoubU~Pn|$Ok zFz=W~4%;--&b~Y}o9q%9KRu6!(<901S=p4WdHi->j^LMQJYEhHzO2AIC+Yq6QJX2(?n&Iu4(Hq)$xk*)VY186EhMzsR zy%x0!CFs+Qi6S{Ghs@P{A9Mgr>kIx7$KS0(@VhmGI5^^;iocj^jC$%vj5Ut$WIwX& z3w+E2RP(y#cpAK75?oVI3mlBkuI#OKv#XwoY498!z0Y~l|&%#3EMiwt?GO1 z0r!aI4uDPZbT6O_=Lh*pS-D_#BKh1uBLmx8zE@45b_&yHR{*B^Ex?BaNsgeq`CTWK z^9f_YxNGG=F79W$VYirjJ9P4whV(|ymJrnLoMH$I<7Roi=AT$@iBo=ed@KJWYI(d@ zYgA&V?7sf`u?nO-zev;uUw!GfMB)}Y`NKg7AOtD+`*jxFn8ELdLf?Y^2CwnQf)que zn~iLp6M&=WgP=72`YSVXao?B5bBZRfBSI#{f9mRF1UXD77vM4~K0i)Di!tn4E*q@c z5Q8il&;XCINq>4w)cN9a=@h*4XeAH!5h5!2q$mmL7_^y*`lCM0@E9$N)CxVfy~mkp zw}K{g+|M(4JJ%bX?{(|v*o`jtio50`Ecq@Gf!ls8dlM1RG}i*CVyE7Sn=Ul<6kLnq zaNYhejV(OHx1^e9d)vkg}8T{K5esEfm`VTFUHcuL6VbCCdx z@S`2`Xoj-v69KBwo!2No#KfNvh1=8_*y0ly%O0E?hyy7uz_&U@b&$TRq>rOa$}=+p z?ERY5#@p_ao`08>V1v>sLg<3pj(yFA?|fhfa^_-_Cju;=O9P~7YJ4~9L;Pxsx8fw& z390zo5CE;QMAR$!z7ou3dO-ntjj&q`Ih9vXz@5*^AuCvxyOPMV*tyzP!^^Hq;p z^5+-}_VH7yN*`uCUqo|zsxWHawfvCf(muit8WS1)3ur_H(cZbH8E?2a(U0v8q00A0 zgh|ZteI5Tc;%4+Si)GCL9a{x1d@rff|O{eo!SS^CtvHux|Qev)Q2MXqB10enI@>Y>X?`Ees}#tev(6#!SDFs z6tfLv@mg+y3&P}u!|2(=2^J;EEw*f`q)%oBU_p}!QWmiY%LvW5Lknb7Q;4PkEPp3) zLq&3NkiAbw-zLb+r)!1LW89@rc_PT{tz80!t=_r88mhF_j{o@2_m?BB$LBf1cb+`z zz4+(Ar?+!+a318ct>SQf9@E*cIm!?|D(*CXU=R7$<2)Sq*brt;dQ{Z2aBz=x1d#GP z{XkF?@01K-g{jQ`3JqCIIkLs+!VRO#{0}n6LdpVd=s;d$d12psHvDpt<=_jy>NNz+naL-Cmi5P8zbKc0|r$+o2r@+4ZcBpHt z?w~iGPI(ME9D7S00(i!^?ip`)o?qQP`v9@>Ton&U?C4IdO%5dc#nCoD)!)V+9PR`pMhJ;G_mrvQkdY?gV ztxWzHssX%klM;SAKLA&s9_^|*Q;{C<=!=swf9&;H@6>!>F*p#!4EKX?#61+Fn7R#lHO_v zkk*|znEB?KEJ0~1$4QJ?@k~^w5acB@&5IZ#l${RXt{*Zc(S7W5&!s;j03*1%`U``o z1kE*{27rS*rhox3=@1KB2d3ZfQM)Jg7M)Z8i{Q$ll-qE~DXmSOHy;7j(th6Eo|EX> z+WIM$7Fe0tW2e0TASorOJuMPRD zj_>`_TLulR0`AiPo8OOE;{1ASkiO3boNlDd`)sK|oDP-zS67YwHd>-6E**~t4h4Xd 
zQJ?n>8!mB}plnkqMLa{2!DEu_B(Jt#yez=Q3m z%siUj6DUKfC#!9EhVC~49x&7goK9TOuK>Z}@qnBB1&BH(1@WdYX5 zt@_z}=|@hoRJD6NH`cu(u8+Jsxki1N-DyqfANY2_io3Q`7I8Cf-0sP~;9*&+ z_hIk%m(_6?Fa`l#^~z4n9^UY1R^ksIa{m*rKZv1JMrLa?gL315XPIL&nQ~RX2+~|P z&pwt@q^p`CC6_KlS|WdSqn24AheJ*|QJ&}Glz%H<{&!n(a2h)Zc2abWI*FK_f7Dk8 zaKczNB! zy|4X^e7fc{yS!mDjLQu!oW7OSq06`u>KthtYW1J?nJf>CGjj(%eGk^`HJT4H`c%3Z zKREH&P$wsBp<{qQz~m47qx#0`XWGj^7Whp`=Pxt!OyiPw^t6ac(i2sNMOAjn{B+7! zMA|(>$bcv93Q=fCtsf=5LuVhw5n7od*#w2~vLA?at5LuHeySi1EgBaJ^hEgRW?USP z6Lid+jyt1nRB1zL6$rXz`%g|c*WMlV4~`F1sqMVu=v?tamxv$QIAFWfyEQZn*fxm# z-1rtGbPN(@#bw6w{6i{mRGfemYV99WGf2_`5(ry-;6wTIV1FqNp@R(aSFbG$deG!3 zXfRlz3t&S?&DDUml3l>Pky5t(1^i9>+iqC-hGWJM^vfvfn0>QAK3DwD@xqryk3pIs za((X8Twl;mHl5o+{pddr1SK~P+QiLwjwYAlNnY;4V{D@sA~kw?ub2*@@o*7hI=Bva zJ&3Eqjn3cRs4}^$S(*Pz z`*Pl+VnDW=G0BO?o2tL}$-Cz7XV06;{p)B=(6BPdQM-DrJlv7H1m(&i#20cmFkZE| z!dAIbfVLqqoMTcf~K%qHFWBDgB`3{#gnBV>mTI54=DI8uE6^6B>^`4p4MRn^i$%V4=@Jfj1FVmH zS?8v+KeE%O{e$jjIZ_Vv7D*-hpCyeZEQgxz~a}S}u>y>&z{xDIe16?gwM14&EjL~Y5scO z7BtEG!1)XR(RMG5E0t(47xdG!gI`9J3%5(}7tDbu%+FOWHtYL|2ph>3w5J;b6S=QN zMfr4?`5b&P~(icxNSGyKyIt$OM}$QZHg-J06q`g7P#b6c!xy4 z1I<&Q9JYP6(70&jlLEchXCYH;U?R*I+wW5S{P*DXl4u1q-15q0<$Pesq~_%2$V#e9 zq6|EwUb2I}U>S}U1XP3yK@o#qQoN5VWL6R#+wCYV-mj$jua>JJnND%6R`JDuRtmY7 z+WS5C;&?pDNb_xkBipR)WkfE0TP;|=_kKCxKl?-cU%!SA8Xri&Lcz8>Q^Znr49}g! 
ztjhd=NzNhUFgs_DH;dyPgKnzym@NI10(`vk^76_(0avxYMTsL?-}tXr5hEA?pRa5D z>rpgVdTAtsgDNwRqqIMh8mD}HfJmthd#?8{F|L-3kPjuDrX%J43XSbxF5|1z=u{3Z zSO&g_jw3}~+b-xE_<+4hGlh%XBGlf#o{l-dj%t-w3+26gduwqJ{pgW8RVaWvD1*b_ z9~XQm+_IzCIaNIwE&KJl?Al0|keK+5Y3ofBW6y58$S zE{i|?D%sf#Jt0LmKsDS9~(NU8>Qvt}a|9%O*-|R=*(@oK88DUhpS)6#@rWSQf{Y%2rPWjC{XJFgqq5LE;Wf2MW8^WXj%3hCG7*NBZ_#=~E1AfhPq+SSr)`?i`gt$*SU!=s=Kl~lCwRQV?1Udt{*Js~%Eny=;{G2b z%l%OpEU&@aao&eC-qp3$`O9%Dn-B7$BflsSk9E*Z!jE5jDVJ3A1c>R(mHVR@o-z{c z4}S}^Th%qx#F)&gvItpe^&Tw#JZ18`61sFK9!hPMTe8)O1LL-}_C}|H7(>1I6u!ij zjrE2Jw)Jj*y!1TNI^gY-Kdz-m{zPO#y&^a}ufGF=8UIfu!n<|n{$%e-w3ob}?=p7a z(1+PCi+SHdpiLyELFy-S>{}HfY&o%zpeSPZMOiqGZIs+ zNBlHP;(4m9Q_sGvHey9W0c(9_kkAkDA1;8m%KK_IG3E~h$~!!A>8jr$0LSqK1Xj;F z@2|es>vY>T-v}Tbi_V5_9LKsH&oMd~B^zFL5r{kgksSX2;I0m9Ow_-n7x0_~#{lvC zF%E>ZUIr|Y+$Ut|k35);G@oc+O}crh?(y|UZ>*!>|MTnEKZs%7;kuE7Pa7Mz_0SGF z8uigd=4jD=x9aeodv4?nEvbGHi$x~Rr58)EF9WwzyXzbm4^M->jW}V&XQgh$3fAeZ zR}!*;5WeHHtSq{XRaJduQa{HH5c6a{CXju!T%U^TMWaT&KOB6S6xFQ>J#Lzi<;zBR z`}WS|AK^hvEySx`G08Hw{GF7;97mid-tLbYvtT99uputXW6Xj%zF5+OG}oB)|Q4jOxLj!D!_2Y_p<&Y;B;64XE30`dp=o(hgY)PHNN z$vhr5;Cs0O9*PXLsbDGQgB zid8-u?R&Y%`k?4N;Zt+1`1QZ2o1Iu8RAhbQ-e`6EC8_b<53Gxd|3^Ivpfv2*zJ987 znZ_IMoBiT$Q6CO=e?1J~fAkIUqd-dujEf?i<@3(qpb2_^F1pWru-+_~QXu9>fz}fW zaB=8HEb>evh-AC4p4%?Ag5l3iq0X3oJ{N~`m8jHxgF1V%uAzpn-CH2XNG3tGJu(tY}~JOq-W< zCy1%nbxqNfjxC$ao-nfwQI!s}x-;9fgQ_~aS)8A}d`Ey~i2eL5rYtsJ=0J#F{ToU; z%pGvWIJ!#_z*n6fKb6CX9u1NvTW2(`-I{JjeBsWty) zwQ(xIc0^zIu|de`&jFvhoMSl&FpLhN3a{=Q^OqLaC}5(w|6VK0eq)zJS5#50uwkKT zi5w%o#be=*0cbzsh(<}bY2GGj&zirlL|^`^_cRI};jc#(n%5V4gptT*!5JHKzvK=w zPQAi2lP6zgIO>o=oaFts?>$V4`r#Q)6PK^ElH?wL)_t*Z67gk>F>EZROya6TKsLE;#-!dZv@@2vu8nUPA$2O0PeJXIG}XDLBx?16Nm3oaww~YJlUD9XK>fu&uiJ5HZ#CDQl}1?o@bSb;o>z|eD!oh! 
zD48s#-XB-it*LuinnuyDh`HqgwiyIGdbgtc*?f96$gkPiIZsK2*)a8;Sd$Ix^3C1# z?k;Ddnx2S|n2^`yhLw4heIA9J0^5vD4~b0>>0o2nxb$am(K0O%`Ee{qP-&(2`xz4I zAX^YN%OSsv@{tkZR#)hBJp-sHg-&)(?4CF&azY>FjjLbbR5=x*{$WNz$hTYMilmJt z4lw-yy4+JPV2kI_SGLIL9~9Q8;vB62^n0Vuz&3BNcQ!M~6Ck1gno>={-T*jh7KOn; zFyB~u1AA+vaOp;)L^AWnLFnR8BqtCkx^Q{R%XQYA?f!o94wKX0;U&21RNzQ$x{G<0 za8dnkf&(w!+o+PX5dW71d12}OY`$lMyL)pxoot%=Wr034R__1Nln85NAjAd)QJgO` zUaowgM!E?!fthkSTSTGf{pl}w{H$<`|~sG z;g29M{3+l5J7MB4y7N>WsXi5~b39q`Sti5Odod$tjU=;7%7#N$!8KTAr;KBdq7|XS zo^Ob2A4XgzBufh!yqK7Zqd8=`nNgnsXM&X#g{^xS?S5{!Gh~A?$6aSLau7k1+QK zFG>0UXJy)wn&++f=tG$(M1D7aDb;)Y*+^Yg1H-)EWlr3r?IXJly;nPxg^DoZ5ZVRj z^XdfCTY5wNE59~=Ug>)G$=k%cq6SuPzRJ_L5)ru=v~6X+qwZO2EIbu}m6?>WEH_CI zGbz7q{LV%!!QN>Y{n%7Jgq}GBxA*nLg;zgR*W~=1CuwhC^L3DoRwoa99B+wcIHeOfMZN~Jp6~klLvHrx3EUB-T z-o0@cdl)kVWuX1xs(*#iigVX?n||8pYcoyiNwFl1{aP($C*U9OocB7rAoe_${s$qC z`S)1MpSfB5HjUS{-qc06F8Y(RfD|)w?S2fuD%;@L+b7YlL(Hb=b+NI}|G7|I$OBV$ z6JSlRY)mh`Wb$*rt=vAV#8XRkB&BiD6?e(urh9FTP^U;V+BhGoa zEN`|y;f}uc5Pif#YVf#Bws4JlsPSp*rc`E=LHo{%lB)wYu%9Gc0#8RtTd`QY`1#SN zo4VHF)vG2u4YmW#f3Covm*#ZTZ|Zz*ye+-PG>Le{vgQbJZb;f*m>KwZ%hfNi_O0z@ zK|I@(4+sd=oUHQM?CDl77mR@GR+`Btzv&-g0D+K*Q5O;O)16_Wo3^*8gsAXt?D2 zCmugn_V#~LiS%>eS>(f4)TbGGGNJ}aB3 z0R)WI^l{Jvt4-d&cO?vqc@MJFg5)Sc+w;1K$;q?kw&&Pb8VlFPjYD5{wZ;?&_3)BB z!u_s(1pof>1CnFO5iKR_A*5@95R7@r>utJ%Ln?4)h_^4 zA*7EsP42u%HFcOXo>e%ChSK6jD~r~Uo?fto0uwyF&*bhU^i#cWn_BGjDi@j_yD-+e zZLm*7xh$;BXV}e$6upO-2&7c@O9=&f(E`a~{Ci5}W%Ni2dQ{Zf^ z{TXq%{FE#KJGeO!5nGQu!p1eFi*at^a5F$)(hKu0%Z8chqwH!{IMKjN-nTxNxEW z%e0DKg!+iwIF$eH+1%D#;WS0vN^+I`_yV%|3_u$8S-QFk;D22Ew?&5hw8`gFCWd?U zSGaxNw-rF^)tc^_&gxn_xhFM~Nxy%0ZF>l4Jy9;Vj)0q9gh1J%N=x$}gg5H~hf9`f ztb81+Zz`GKKQaXn#Sb{@ZMb0Ptg}xQ%{MZfjocJ;?}d~Uc^OzV~T25xw`g*DqQfiL#*w~LT%;#!@N2kn_fg5p+-+YY8WG3Z4b zW3NC;IPOIwukj=YIVp7WHxmAoJSN-d6vDBYcrVmg-CJMzilxh4?fuSGrqYNqk@36h z82iI~>;;id0KI$T8Q-4HT*BIi> zu_?W!e zJ9j?U<(*AfSm4uDWN20ID|rA>Q0U-w*fd&2#Xxr&b9e(*oObw&&NPb(g8Peu!@%&x z79bGkOD6K?Nfa9Ai^)49MF(E)k7ZFSBbWqZAwd>F{Ay;=Eem9x*SK@dz~>Ch1t}~R 
zihOS0HsL12xlQ$MgK0f6`k`q=Bfo@0uovF z{qoOTcnt7iqY4kpyHjLoE->;b6FSpRg^R)4BPyS9>b9m&=!CzE&71ToF(NlHpq~d9 zx8vIc9c3MOn2%azf(tJX3b-#Dw7Mt8scOl;dv6%#sm>w7Yv2p%-&A=Qbgehdq^h%I zetIKRn?P_~T-5iSYi1_JF_RTfyYLFr{GJvd-+joCd}!DON%qtJ-s`P!i-^LXiIRA^d8WE@Kb%E5urbOdszp=K*_!ZLu3b`;w6tiU%pJ-i zUt7(}=i+-6^eha_kEvLh{pi0KXGoGuMqzz+^ybw;SDgD>48K{vKkszgP&Rg#uuZ;+ zt_*IG&93E+@?mS;kY^lf<%cW2iA;&hAHJ^KnI)b?QzS8Pwq6*&>Zc5F*fxs^EC-mi#9u#9Ysb^2cgw|k`vcl!X0BBJd>$; z_3s3N7``&(p5J$djaPZ3<-(^4vJ7^r{M@yosczX7Vm;1w8`R3_e&>$A#nB;~CO!pP zUm5$D5ObH`(3fhetb-1Ax4am=wom35!idbbre{cSsMY%o1~sF9_FfU81eM8axedy-XHB zjM|-gq*r=tU6O(C)VI(caG-@6D`GZ>ZyqoXXvuTR3f2^P`@4BNTX{^1QF#`aS2$4& zjC@l0<%^2Wr4+OI!GYMpDc-gJ~Q^`3jZAs2K3%m7jdYxuj!-Xhy|^cg`1Dt-n5Dblg{kP z6R#vAoCol}UBi9AwM%xnv1VQZx92JAnp%WMS!t0joMK)@YF0f_>URxfG%-#NNfWa8utprz!CH~h}HzcqoY_V z$TwBHLrZ)OScfpSf|NGvZ@_6}9H9hi9vnX2h4sj1YaAm6%Y6n7MwE}51c>xYIJ5pt zwC4v^_DofCA=;dxF$yzpEY0Z`K*;jU;*-};?QZf>J7>zTbPNAH3qO~nq=Hd1AmpN0 z8yy$=whL}o=GYaQ+r%`u)Dch7c0L+TFURn8Pk{vJlq#WBMKZPE+B2K=K)Q(Q&X!Gi znys#9EqbltB>HQxh1dtVh5R(>g!c>9Uxf0sUiWc_`n63<-33dZDiVP9inR<5%Q1}$ z9vNacLY$o=BbHxotcI@#OL0#WgEW2avWnD9F(0geUXjpVST})#^`x{2{@s*wWVX|} zSs9t1V)-p&JZbrlNzW{lokP5$*AE|&NB?wcRIoBDJDz`;G<=N+Da5+zviqD4Fm*5W zIq!LjzvfQ7{mi5T`KnXL(X1OSPI93k7#7p(a&D^biRHh2l-q4@UYXp^$u|=dmyeP& zVK{kb-0)HB-PE@Z91HFp3t5(&cD$UOcy+@z!x1fw^zb-04skbWvJoSAxwxv8&yC!q z6x198fm3FD0*mwJvlv^gv)TC^nat}@hq!xDc+DuDV*N01bmOffD}mZh))Y4~zZ~(6 za+n-zw$V;pn?Buvi>_|TT_e7l726ka(3Ci{Uu}pI{fN&3e15A19s1{LjPF0;8;(UM zav_K^fBG4%38{&63&FtOaeT?{RO+(oc;>g!DR3|1y@mKi-E3)Cy!erz2N5SF@4qRG z@3gz~e1Y&uym3$~wgHlk6iinLtlaco$+R~cm5RUe)9%Opp^t)!-iLQ-8!*aw(-=}o zG_~9*6h&IK?9(liVH&R+JpkB2Hrz--QI|ny|dy}su%b<2|}=WUerVDeE$9>`^)uU&o&0iG@1Og zQVu#|zudy2Jc=A5mfDRJ`e1N4R`gTkVj=o?uMfPWsXsy-tKiA~#2b{AXZH zg??~?)2xoQ>OOU6dxq0}VlVNL&JM!TpEswtYIvk#D`9H!^)1JK4T%@V9?q7*as$Q2 z;RM`uOy4-h4SuR5Z^!pNTZ*lG@A@pC!}L>MV1cL|U`=t54?p}CrF#xO-BR)ml_68X zy$}2^1Uj5=`wXH?bDMjG8s2E_QKpBvLX@^v z7Oqhb&_spTt-FKG&e41Lu+aH9IwSP<%(f(W`{({l0T0T$Hu_-gv9}ZW5XJxAI 
zKTL@)Zjp>?=#5xy;xIPvqe#Vw83=^6y=XoCoFz(_?RZ|g-_mU2YqO3&TGVn?B_^!B zjb1&xigWFg*euZTOI6ey{lGypPO3-H*}9m8eD5l1THPgJH`>nzK4B3&+FPAsVOtw- zD?If<*tlNwiH2q|mOq1mxqgC)r6(E%fG)tGYTv`JD$zz^S3A9(dW!dqVA+@ZWoFXm zRdUQ3f%C8LThBqS1PLB#RSu5VFbE=cGPD*Ren-B_lk#ji?!Q7d^LuUf;6Ciq&O z@^$|507VxQlWma;g$l<{45#V#@9r%vV6OeC+>lC3Tnq^Cf$XSD7?nh#Buq<~!>2vq z-7`h%R zvB&!%T7IOrJ!7tov9tK!SAF-(l^z`(W_D@3c)^}VBU;L@ZNACBRpH#&W!Kx_1M_D_ z@6)zj_s8T&-WG5{f-bxI^v49);bz>-cP{hkOL8t#6>Q#cTz_IHoJE>xSe56NiDK*{ z^R=-He_s9>ihObRiKEbCIGVw(SoW1HE|1MUhW^{!yulZC45R^dLGtI+&^slbBnS#ZKiTetBZt z=UltDF25N5T?LaD?Oj44tAZgg#Gq*9!pyGZG{IKj0rASMeWu_%v{SxAY9#WyQ$|p= z@=euPs^DO>1ICsG++k}c;$cE~sYDZmJw@r30r17DIJzW8?vF4Mm|XEC`r#HbEMCQt z;Sz!RsC|t?GLL`0TV=7k49AXTO#O8*`X_*#-Qa>1psdMBuW${RRNUJa68(NM9QD@e zU)v$*A8bVfn1xjjCO>o9?S@KbZV00re~NyE^Q4vGfhC5`$>>%u!}jz<$!MbXpDuJs z5x=u`({16JhF11bgYwpd+py+6^FR)9d~HVsEyVEU>;m24>aP*N=VW&L%_v37u{${! zOPMiIfgb)UYSc4FrfJP>cf~26$quEpuwb+%V$Ct)epAfT;tJd_c<8ah)D*U;Wu zp>~u5xo5H9HPFL=pt(c+(~r%TRF-N~7V+YSn#hSheN~*ADm?ma=1|jJ z^iU`eL7CgHd|A0j6_klEMk7+p4n74c zih;k(-rBSxiq-ml6#&l{TcTp*hMKtHi<7alqrXG+O=-rit6~*ckKy8|z!Iru-k0&p zQ&%}FMxTZ)^2bkZQ9`iY3H2ED*SXtkA)5c2x@U~e2e|;{#`1}Vvh&F|3 z`CdANQK-aLqo{DNV*);r-ie$PvUxFv{0wF}DDlP;m~AgCNukr!6-0}#9wckZtx6rO z*B?MCbI5n=9-s5{!PNO)H7Wz?!Rk5-{Ao3DK^N9zt@wz3HnyLVU97c>S(|p}?zgG? 
zT}5m1Ml@s?zqrHkLxFoi<%NH|ne@9zhp{)U=1>HHPhvpsp-2iu`ONqfSug9!?3vPe zMTQ6WLlp`8V_n)&k`M%ot&91qjwp#HTFm=fhNnf1WbT}CL69xIFJ5*e=AL1KNWRN=qaO-CZd+;@s544PRYQOX0}WcmJFW8RXH zhpR)R9F^E22JS5`xzuKP37CmK`sE981sm>WLSwU85K>&|#J0SXB`PRG_Eue$3ABJx z+pRtJ9qX88rn1kMzzkAHFLs*fQeGQzQQBBK$!k71`R4h{^RaU5aR1q!GsM_5CKCZH z#SLH+UT`(bM5!BEIBhxca*dv@J~{-RZ7xRkNBa>V9(FBbY6vYJt3?E*( zW*4~#+$A=+XfEg;Obk5lH$gB1&m&Z4wnoHFVu;v1II4yeqhf;Os=3+Dw4(+EwNa&3Ku&)4|jD)*<8 zSIjL(u#L&?xDzj8YVB_a1&9Z+f@*HY#kIIpX@Kaxq}aeGN9{sqjQ)g5aYq^MWwMP) zK^@sO37ZS_t(%)BQCK#<*Y;}A1QGvOmX^wn2;Htq^Jq0b^hX}Kw&NK@QnT9Kh%?9< z#u>3Xkq~A~@14mKKgub4mmZw~L;;gaDa1n*$kyfKh4>^(tKO@G{@ii}SZp}Yi~ zEGmTsPm$?h3A0ZFa91#zW)l`BZpIFkGjv6*YKq`Fq6~?6v!+ri!ti7(8W*SrvyBt{ z7KN_tkrl?16~TC-7-lgv=5FnI3Fk}FS=_ZDY};1i(U?}jkSa+*FJn%rtwzl&QwrP& z;xe>$wg~#;BI2?23*W@LIqvPRmN8l!3f?VmWN?-D)RN-{Wq;`Nnb+6Dd-{asO^#tLctB|&{lK+|V+-}{z-%5bq_1uk7-B^6aJw`imJ64(g+BKBLcWd|@ zNM-EO=WM6Bn~S3EZ;HjFAiZ8&m+bAz3VQy2W-ersv>OiUyj|JvWY?Em3zR!F6vFKU zEM_8k(~6>9wH;MspdptdF6HKBv#d{oPk?~A0e95Su*kC%8ARps^nZO#aS6p1xFS^U z4|W5PNp$rhnbK`GFCRM>6LtMEg9{BiIKnuII~Vn6swA>4**{)@PrAa+)<}jG=G$gC zYdOf$2-&m9X5;h5B?5ak_+;_sC2F?l%S`du(i|(G$7{iXh=zEx)FH;MZrzKkD-piK`=TyphrvN|RD^%JexN;B6Ns zlGDSFF)j2{HveqGEGAB=wc;P*zU_Q+DXT8i@Lf%L;cYHllE%!&50Hp}els^A>tlof z4}aJ}jSt3jXB!%{##@n$EuA;V(HgXbjj_-(%fk+LKqdA5#zP9lhvmINl_jz#78XXD zI;whHy`4EHp7RIW(^rpWm!Es&~#;Ib~sm%nj;+6HqFBabdY){nL9V$%glm!_u{6`Gv0f@5m;6*E#$ z76xKLCSqg8bP~LnaG{qo;2K``6c?;KrS$&PG*A?$N_bFxn~l7_IARpK%}@}?MJ*Nw+y@2iBA2St)rYcOHyssbX>U;3y(4S@iloo$eHSb4J+3onJ|!&weTWI6;VrPIMC`B+;WqPof8rM4~f#XNb-qf@p(?h!#W_ zy?3Mc-bOb>@53mg|4**#zMtoPe($IGFrW6`>)dOt<2cr_?{1Yqe~Hh=S_?y(kM%u* zA!B|F*5l2KXUBy$K`ZUH8UPiF!x&U$sub zg6rss1C&%~KaIM#bN(57OGCQp%p-S`L1|Lbzr=NQvmSlUJB$>r6FuE}%6TAXAp6%`P8EfW@SB&JJ zYHH;t!o*L{6O-zEP$oNQlG8q5hSwHCmd)stl=2h<`-GvS;VPmwC5C>)hJ8)p z(H#5s_q~c70xvPI9(teg!Y>{i7R7b!%MjcAWttFZU12y~IL7eL|D|&&s$`ar=M=u` zmjqbikaN@u>jbLh7a?!pU0;d9lNjIWM`?XQ`+K`8tmpCVvPYdRC#TMsg%s(Tx5P&_ 
zGk1x1C)4zWC#Sj*?04vtFhY^W8td&#C)Ad)h`5IUw(8IwMQc6vzI9>XuW!BQR6r_EaLGPBX?KvBoHQt@4pF*J0yW8`IH|LETe(BSVUIDnv;o z-f=lH9PVM4xkf>qvYPRNUzJ>lFUVFGi{lQlI~`{9b?V?@w8P`mIAnthmu@46_amv@ z+{7?Cknh>mt7={8^7cMJP`yH55qJ8xGY z1Zs7L;2XNPUbtzU6Bo#n|80@Us3&}V%RecFiCU+D5qA%}KF!ucl(})42m!ab=_C+y}dMSz#z$nQg#vMPBTz_{8cy8l=2dIS8g@l``zi zB8VGK3T%tc8b#L_w)5K+$r0bYG0icVJPfTL*wcOPHi&~6Kr-_30X=R(6FIFi?D^Yz zdV?d#fw>hruzSZ23Gi7N6IYVj%rBT98%B`;Ij>K=&Ju6Vo$7Fqs*tU<3WLU3G5!>S-VE+c_oRYJhugb zTsY@Ee+y8Te3j7X*DtG3Fk<+z7i8oW!1Bz;SAwT9+}pV4A;j;y5k-h~napN)FITRZIy0v>A7Z(yt-B5VrY95+k?4>f4hP*30h7JLzgeen!4C8ri= zEmF{Sv|!J5eBMP6yEdIM>JuAJ2cp#LXP!Rvc&V@TYIG%MJ(o?5f&7XDE_ng9{L`*y zM5<#u89P8t$0Z~1wc^D6DCH8vj78_M+(R?NrVzgZhy+hyP zL#V3H5F}(nVRV1xg-YE$l0)2&8Yf*=6ih`OeLeSY7~_O?nVK!7?xeI}fiU(hDWAiS zZu^_yRh`3mpVLEd;B100I-ha^hOTEp`r4Qp=EwUFtE9^I*4?ZcW(G$QwuV7b~S@ zWw&Ny8gO>!Didw_g`N3a6);a&1P-6+|EtZhwU+)r(wxI2zHVnRQFcswR|0B((r>29 zV{L6M!0XJL29Y<})o-rOf1nNqD!Y3lLvy&0IpDs%N=ZaT_;~Tawi!PPUyz))WP1Ei z4ARyuhmINN82*K9EreOM$2)}=n_I`w6AEHRK!*x<=H)Hb)l)!CH2t1Cx8|(Bv)gT+$0oq4X_J> zc@2p)!E2K~3)d}xo7dJD?QFe-Al~!<*4YqPzLhB}C>DDT&+p>kILkylL&YHje@1943o4&GdwHEPxqHj5XSYkYb+<#?CoH;xF7`Z~ znvqiC_vwUejlx9F?@Qt!3qoDz+sqz%>w~;^G@vacHb)EQR|^1ago`^th=sCxQTOSV z7s|J<$u}isMVBq;{Em>>JB*o}j0^T`9TlE?jWgCU#DhqbzZ=6UKs$hA-NIQS*i)<| zm+>g15348ISxusU)a(K{f1pUM3CGA{81R zAa=VrENpswRUce*cUA^pYH`@fUiWRXAbz>>COPIT?_EshuW)P5M?dUnlgxkH{+w+( z0_d9nJkWX85E^lD-f*}H@(13UmzXGBwawkZWqf)=>AYjwnFL(Ci^V*xnu(7jGPJ1j4+->E+Jxil%N}T;dhaZP{{A!*f zYy}?Y5IOr>$WWTakbGS~Db@hFDFdaT_hP2<$g+X$>G2fnORW2{4Z?nDBGs^%_0$Q@R23MP{sQl+zRwYM7lXSb`R-O+7QLkQ)3_i0 zx9?%9Lt%M3-(Gae0wg@nLItqrvy3l(m zO!@pT+oefHiY(Tahthu!wnUy}xVW9DZ-n=&2z2Cj6S$5g7ub;{P|Gm7o*t+g4%9ev zXmqd{=+ro>Vl3q-LI&dfwWm6Qemz6vfBVIVkPCgR>Z5~3K0cphpq;vPJa@+Ab}-~a zm>?|GE=gQy=LD!*3iNgj2^+BSyzh8;q>jJ!C#lKh0WNyQt=UmLkT;Y?e6Q7PqR5N- zcdpsh13z}oF8Rm6=8U1D!GR?38cF!MlQRIi1cpS1#R2Snj3HUnLE9<=!`3~xex0|*A@4xhDZt~G@)(8xA4bCum$Z`veV-4W~y^F=1Y-h_jRj3 zVVRj{rGl=K*398C^#MpK?jCz~bRip^V<^NuCB_TWDUdG&KYN 
zq!_F|ag~YIiNF6r?p@oLmtCG$vf5!>oezr=z&5j|xYRV&1V4ZNOhXb1r^QC+;^fMi zn*$oT16<&Xt1q%1*GY>PmT%~IzsfFy`_JC_&i9EDZBc)&lqk7vkS=^>@}*(6;3vhq zT5F;h470KGh1V%%e)MV@Eit7vzZtEXS-AMw@NhRa9TtXnsJJgq#vp1oQQ)W#7P7^x zcl5|KqM|g36u$v8>4O=O3Md^f*vsEQH&)o9AU=m0(VN2)WO*7kJ+Qk7If?;J=GnW` zUcXvHanG2Cf;)SBO~g&yNg^35@#WMN+5~Ka51768r>s$xJavtgJ4FtQX}m%e1c*Xp zk0il;q-cx`a|cO&z{FMfyZnx11+#*XFk=(R{RuJbHAf5z*_5Fq<#)Ayu^Y(LMF`XM zuZb1=k2v)^bseqy{*i5LT!LTMx@Bx!V!-;EudBWJRgoX)1b9W2#m}#|-EL-dq($;H zMc5;}595o;?~iw+^jI%YgEb=GHi>6_{`v)v>j{}{GL3nR=X@6W6tdZtyuLX;AQPcx zzu0aWn*?u_55e=HIy{E_ZAd;sBURgkjW>yIfZTfqHowW|pu5ytHJ*eoAioE;3EbcR z^&32_-k5H$T@nmF+|JEi?HQ0)$=Q2bEMyGdQdpyx+`40ZmyUae?L%;T{1&syk*opL z@+-jidO%eYpL{5EqXv~(@3{c>p2yyDAGxoFHq}WLROo8}I_Jf>7t%Eqq&Z$Nfv}LW zNh)7lvj5_$&Bo6bvGTwEi>sy^#aFO;fZdO@Fbl=TP^J6&cd8r~=?vW52w*UqFvf&b zswoDjK&TDatk3oS_iwlyi=Wut&$~ar28+9N1ZljIyg2arG8b-&Y4Aq*4d)hW%KQtj z#>cDZi<=obkoN1t?{^F&`Z*4{)_t1Blg7s8rI$d2!(M<@iO<4?y=h4VWCO-F3)H^< zqw}U9yhE&g-GK9ZLv{==dw<2nDFVBTOHOV+-yX)FM@qdB^5Xo0U%W2rVuyx)hFSvg z(mZS4F#w;%2+Y4iDE?m3rj4 z$K0|t6N(32S_6L;*0D_m`Qes6hVrv8*ZSJ&li!1Q>?5zTu39T|i1jaC+aE2*$8x!Ogu}Nt*>UG`?Zh zh$1}wC(lQb1My#IlsCyRbOXID+5%7fwRM6E25eLZ7(P$W?{XEFmPQp7@#3keE!v+Q z;q>;-!18){rKIjeMNy?8>1MZ1hKI$`0QQe^#F?HJODQdl5X98f;-wZ4#KYV-DgU;_H}w5MUcLco6C0Zx8DEL~E6%_x$@l#3pXz zVQ?%P$XC53wX+4F0iV3*qe*VX&Zz1suluenJIPW2a&g!?6Z+Yp?bmw zM`a4DPuwy613pAC1^Rk?>bL4%+0U*$hRHSkw&EJ(zpEw;ye7U|w znC|I;X^8uG{j7T~prM+!H)v8bxz6}OBacMQ?NcE_EGO+>Gan+~vOjnC@7zqaB)3i& zV0|+_o|R0TJV*;@a8>o_xYWdq$QC&u*}rh%Iql!xKBBcQ2^9dob-l<<#DTy$v8rTO zk_Js(zp+~CdW^~0(KZV1T8AxEpi^n2fDyQfUFHyZ9NHYV0(%)2^ZTVi2P%Rjk#A)5 z$!Y%$-OhV|Vl(qwC*t-TFjwBIV$dFt+j4)6^Kr{m`Hiq+ZIjNjhM89^LkxGw$#@4A zocN*G0oE&T1_DXl<7qfoJUaJ*?s^)U7(C<|dYySKHjpy)&t&wFIP2!vdfo8UPXo7r zfx#3)I#=hyWdl_AVxhuJnt1imioa_i6gOlHfKf#59R}#x+gbjAkmeHJWHNsiVx%BW z!(P(1{eU#xZ|f?&-}M2K_79LroF4vcGTc-W3D0A6Yl>^&Y~Jn|)X*QGM1u4_Linea znFJY`h{Cs+l?8O2t5q_GZSog0mRipY(i?Y_Com;=Jm@Z{r{JsW0eQj=Kd8>ej7{z5&{_?t6xl4e!@6sfnJb|+d?J=7uaBLw0 
zv2%aKT8)DGx4JUwa3zS^I5M#n13mITkQpTh9e+7XDqnK7DQ2H)Ql(thV&gTx0|4E2 za_4FYgO|oB>ox0L(Pq!x_gQ;;2U=2Hda1floFZ`Qy@xwe%qWIZ01VeK7ijUUu$w1&%r=6>t*};v0i`gCmt9N zG5UI&Acv3ce&L;O((^-A1PWup4sJR9z_`riT=m$I9-j0X=9<^9PZ3GIKhX337zxL)`Ry zW5KAspU8Ig@SpY{GAYT!u2!>>zBmf+VPkqSDqR_1u&pnG)<=y~<+W+sDV#D9?I25;}+&&pE1Gwn;oM#UO6j^b-|f66V=5%(Q7D-Y|dFg37ORs|{#|*kAE~ zb@E@z%j0<}q4abriYo6MI}7`mRkS}fADR>v5ugyazz3#4%vw1vleDT5Zob36fO3{O z@G^6j-os zNv)@Cq%pG)&7p)bNNW46B2_m$aFm?=Gj&S!m*^cHdQqGNVZg1qsmUUznUYr;h%|SR z1Ub{GXJMV(^zSfq#v~)CvaV!WCC|Sq7Vv{+0%)%8g9n+0RvY552+)_2ET7hbyT1|> z!^5lB+-!IK3%HvY4D&U$?j-8*in!y4Gn|RR@1QO{cY^z`JTU4q;X!gCVytDbzZAb} zp>a~yghi3wj#l^Hfa~rS{(62piyP2i#nG&-@2Kk&YEq}{b2#bo8=c2x5OUD9=J*v{ z52k%9GB*Z^m$*_0Z82~hZ@}OBpN_v~c(r_#VV!Gy9c$s>XsE8B?t!Us#fkDEbdofR z8tm>|Zn_ub_jSXH1K?#Luwa3cY07t@$hNlkX z&&;h4&Mc09a&4J0$+;3BW`*xvtIqr%1N(JT0K)#wkjyXTJ?ZKW=iXI;q$(jQpvr<* zK%LQfH*z2_GykjQFBQgFYOuGcs3fpu8@%d28A!v{r$|6YFKC-H2`WtikR+1rjJb&8 z6VqPlI#3JfAg?6r9##!oF(&v79sgOA<$ynOb7i_!&%g^Rff17XL2hAV!Rkhz zptP*#KS*gkVhX|LeGOx{bJQl&UkUAZ`x7}-8pD67@{a-g6Hg(&cKtP)oKMbN@M6=t zsTyc@1oggXmGOUywZmOgTTtejQpw658wxluRg?c@69v$=OfxIsU zdg+g$0W2lIuqNOWJbY(uD7qB3F(&H1FK>@tQ3O6kOn+jLsSOPB@fOes{w|EDm$-Vba$L-X zA;xmcV3}jWjHzWAm=Rrx+%c$X0N2_5l!7SEPv%v6DC-fl^S3ErCg4au!$AiPU zQ_o~*ryW>{!q(`!Q zH4Lf3V>e_Jt?4 zAc03mL~C2hP$4kZPze`jZ!w?SFx+O0Zzc}*nCUsVuUU(d7 zg=^faa&)9@YGX=P#j($pgC!~Rnc(kAz;5#pF?=}+Cz>eouAf}B4JOIwkOw&X zn7rtaItSKq`FB3b?JcS5G97`W40K?(BP~tNb!vPMGzCpER#QjcNXVNk`Bcd5bhbe z9#T%k$}S|(kWQ-Yh2@v&{lM!QYBgURB0xjDcRlS74#zcMwDW?BZhj=^x-aS!@7h7M zbp7qL81C>>*H?KDOe2Y0ccc07luA>x#XuT|J z)?(Yg9?3fyh3q?w-KE*}>@`=8-uj3jnfO2`@^5~;?umYI7+TY+O!6JIu8*MF3cV>a zv(zO^m18vjg!qgh3iJp-9)JqI=PO~KkEpi&N~alVW4=G7HD~e*JN?&MrIwlK-s| z{y!yGxIT`@!bcyZ_o-!ULNb_-j1c$i%y1W&%{AXtU)wGA*|au>SnN=7uucilu*cvo zOa{~geWJ>BpBVJhWO|CxZGeT&z#dKV!nL0Pu1?ORs%(SOis0d*Vi?gR_erR^-i{}) zqy_bt0GcbZp7Y}ERKZ?QV}VuYziIG$iTt$I0HFGIvzGKQ40pZqI;vi+$`EuzU7juY z?Rrol|FR-}?{U^=lJ+teu~hUSsLdiR{-?hdXXED{w9=&ChaNA-ELacub*>W6UAjy> 
ztpdE*!c5QnTjU(CQ`z-&xN!s>pq*9)bkp475sn0rxP`Z78jSI3If~Y$(gn8hfkGS* zL<%hSegd7_cuOIf)yZ$Zkh1$2cg|y5kiXCQj&0FZq{^Vgl)Uh!f}e(d5Toe=e?#F( zl|^iOJSM@4Mf6g%29P7~w?}>x3860?BWKzd9($o|RX#M{#+qzx8-nQ#>Mf9dqoR^V z28Q-b&epn?%1w?1<5$gLf%+pfE-p6kBnO>Fh@Xg9$I7j7jgU;BOV&R`ZKm;M!2)IT z6_N%b^gRi=XKUIj@6sd=4J-tLpd?dzQd!S}cZ%~jvlR0X(<3c&b5o=j^zYHCy@zY& z*iv|?gA64bB=OPV`o&xbs&<^!OX&MMUgUh_0G`$s)y1LB})>PqS3qpyaq1v1_{a5 z^2+=!Wn4{@ZG=Tj7mrzTZZOeoF5qZCgMFsetFDTobIL+{mK5DSQi|z^Z}S zV`nd0tKq%le^yOur2LgL)5?V$x+L(YWlY1c2Ntwda{kPdR@R$lLWZ7>#uty6{_|wO zohqm0dz?HAw&EMNg)T1JIwadArF-tq3NBN-6O^IhTfKzt??pVHmL84Ee{eMK~@n+S_NeqT?bm8Sw~IL2B}9PdNDX~S_}!a+%j3#Nko zE-j8Pn~C23ts0CPgfVVW&!faM57#hfCcEE^&p#%KIXtY=@s>6FA=k zy!4mMA%DoC)>AkD+|^$kH%c5g<^wHJVH3gO&RW}E!+9doccH0?5<6K9-ODV4rfO_A z*?d~+IGRSUW*e%V&LVP1P~7R`ZiJO2>HP~r`gE?=M+)|s7Sa5%e=@@6gDaRFke%gC z@CxC&+D>ATM@8|{T-Y(S8erT1MQ|Z*)$s|oUGlLU%RQ_)n_|hN0ahD8!&~(|(gJ^N zg%OD2FM4zy3%19Feui~g@;nzDAD8hDZ+9r_I{u)9kWqD04ebq}pS)sOs_N5x=fUQp zM%elhDPZ1JQ(iq3$UM-y#i>gR3N-iLBY5s|{=D}iDnH@s+!*B7=f;j?O|51Lt`-Ul zM!YaE22xXyTt9WR5HJkGMq*jSLk4sz^a|}Plp_<}L$4z)mr#ZnBd1=Hlk)-4`NR9^ zbYdkJ-()IOpW6s@WW8+W5q_ms+QmWXxHTiE=`je@8Gg|53X@#8PlfRvm=|Gz+-VZjX_?74neBFPw@!b629!+SHLU7 zNZA%4#5G7)_JH9^JegVGRd`RkjS&gh1AZ<&C2d*N&hkWT`hBap9J6PX3P}$zn7~Xt zEbz3%+Jc)4_&NZrXt^bDp>rtBese`d@;#C5_BN^*d?q+iZJSVZ2{?ZYB}$!rFK4OP zWN6_evn?Z#e}5Uq6&xPX4jL|cd@#)7a{Iejo=>&DGp^&z|uu#oo!+U!deKX0kZ!tTC0k`ISVTqb%2Q`~z(#cX^v@J#3H zX#TSgoHQw4rYO^PjZo`HDu=P_jGWpV)X6<;0)BsFA+|{qnKtQ`nSd8;6RM)BI=^`X=<|gb-4bwiadI%3D9ICm=zppMUx=ml6@5L0e?a;EF8g!&H`MsiQT$NZClzgKYV zKLqNn+J{M6!!Gc|vbidE@;9%VuLLccJVyw9$1J@5(_&CjZ|E;E|=D z8~ocg4h}Rn@3$w2#G4a5QNbe;K#nThlwo6CZq}C6NjejKKpBR0U4}i5hVQtz*^iTm zpMF)jg*`iy^O@Fj0pB?_RXS*fJHDbgWf6MTwsf+&R&907x|Y)b;twO@KVkme>p8iW zYJx%jES9wjQC_4Ivei}jmaNO;6Z^kK04oV9Ztz!gFrdVf;aQ@hdJsEH1v(2uey#-^ zAn-NKWUA6>DQY1z_?Uyz^V+yuOwx6&FSM~D;+xcgjMa9VV0xwj|XQat5Y-K3TyvlA2 zrQVq>Ct#c?#j6n&D?bjUB(qI~@%PlDN>4m74~n&KZuE>IFFl^Ri%VzT8f%K*(!;%J zY+&cR?)%Iwu5YL$rZp3uM^Q6?M`)oIKScnV!>m^W_9r}8$jGShm88tLu&J@my@s3k 
zml<|>$H*#Qx~EJszzJ*#v7#k6Cud_D&_JB{;Vq2D@!O-@!=hg*c`vene7$N$3L5CJl0EO-9b9l+D~ysmV602t#(-|>OH&|BZx_);m3x@Du+AUQJ|M|s8# zDdd*_=;gD+^xrGc@Y{rmmLfuCDS21 zkktzinI*-{M#|2wrkrgRB5UIRk*WZfU*8M0TAJwns7cajvyq_3#AK827H2Y)>^HWw z+%6`zPfKiW-HP^6lzI7q@3}c9P9NlkJe!Zi#sfgN9JY$+622-GUS`ITYcm#mnV)LE zS!{dbEUq)b-9>tZ8^r2x**wV?L@&{uME+^6+MM`&_~IDNb?5&l6DU;@^@nvS0(L;> z_g}0#C<{zb_*^0zZ&kXw%aQMkLLY52A9-~_g3SZMu5Lk}>=MBgW!l#K@jVvM%t z5-gtt=mzfZ7Xj13SSo$f{QUMS&3D|60;uL^PSCu2>BBc>JD?5@|Ax;`E(jKw0tc6_^4{;n=CF@{8@a8b>sF&h0;dXB1B205~2QTm>LX6-=nYEtX)BLMyUIQ#w#GK>ldSMyr95vEf*sdMi|){6wbYYX4{>m%wv=c$frePq43r zI?Wt@^$e(=o*f3eUYoa1pL*b*SXECAu9$FBFj?nJdswg@wP5d7aLU@xo=G7-n&BMddLF|xXMxeLC0hGaIe zVWj=yb2fKeBG_AydTY!(h!m4<_QOOxSGzxd1BV zZMc5uVSo$`z1R!b9Sb7;RC+i~J z2V9LwA?7;FDy$_vMy4+Qw!s2|e@&05KcbZZHy*T8WL>+i*@u6zX?no8juVfWTFh(t z-gykfD-=TdY_8Omr!K+=p6ryIu6Tl<_NiRs<&&I|8; zjkrAZrk)T~w;3T$@GQYgkt;L9JPvs);$<;~P?v!PH-sl!+EIRFbRB!Jz~BC%M}|G! zg~Oo4%(AvI0x&0v6373+NV{hl&-{K*&MH_lnoWBAYp0W}g+-7Bt>75SwDRTev7)=} z`VyW$rC2`9)LWAK{L~!4GN^WnAWrUxysLKc!{<|OVI>vgWCKX@2oNLi)UT@)*})2) zDmfnQn_^Z5eEKCh_*ir5r-+o!Y5vVz?OT(hcvllQmebJxyg;`OpDP+AtZ@{&BRLM(sV|9uH=g z7DAW^|51EdkZHF)qxwA4&qz`zUftQE9~>~K!&fO_Hrcvlsn9{V`gBoXc2qi4(9iEY z?)D!q)*0iZBjb48k9~E9IknjEeCO+;YoQWODR~<1+Sqb-+|o(nm8si%s8c!5A$3Z6 zwgYP`Mi4swcPUeKCrR%xe#i*|5GJ}d#r#a+K*JB2GA@k4<2-rTozZzg${dlKWLswm zvt-os0ZZ~p9(-8jE)EKPyYN>!&yBQ|-h?UKh~U!O-@IGoi4pYj;vG$> zDzz;8dg9r}Pup=@dVJska7YbLJ#5^l8Eek$FwvFugp%|U6_6>MZTPOejx;K~6MWXC zIzQEyF!L8Q>Ah|VfYDZd-hc0Yd0S$bPA7TVDhDjEo|w^nI(&`uPjZACmMZ-)aq1LB z`2f_qc4M-2x;^&yV&P_50$_zf1FY?-WNYkwEf^0M=rog-ZHg_($=Nn27;;ad&@A3a z+3Nkn5gT#xwLj0Y+kZS>F;278lc$Y2guUzK3ahN4RB)v37bJuSpoF~O+T5;%;DKbV zZ&UKk5|vX0N3zOwXlSSj7ZxPJ>NLlyl;*^3_D^B&EQ5K6_CAIdR1`ujoi0dbYbYuJ5GUz{ z(8{@%PtBf4Mdr|t!iVp}Gufm8ETGd#ExX zQnsg0N^xkM?)jk6D(cQG& zX*2Rf)`d=`FYhGi)02HbP_z-Yr(TI=N#rL=hl)jyVn&QwP`o|}*im=ZQ6w+7xs=uF zD>7|4hdU{HYE~*WqM`=l0Qj0B+(09Fba~v82C%1_9qr7OPy3(Uj~W?4&8{W~wmk~W zMcp6IC0@&!{|8_-5Z(kvdF=v_SLVk}vk`W;sa2qJ!A~&q?A~Q$UGC8**(@n5^GX9B 
zI|L9nr!^jSj@pFHwFW@#SG9h_VA?E zk4{!}CX?p+R!E(^*HiWz79vZ(*xP_ufbxzWiFjWB_K5WV`NP6endp6s{BU zOU9!xFO_dmc!oM(#9^J>q0y716_19OfH2Jk{5bqEpD79}x2Z2ja@EO;!W2^XfDS z9Uk+-$cj7OSIKh(5RYT-&yo6QTj`B*DwY2VE(|FQ2(jAV7x4%36I_2nGUe>8FsPH{eVX2qN z`{#H~YMy*BZiYTHf!`x!IMfA&(eg8Xj|DVRR;QLlrcOVkqJDeMtYOxzPUQF@?LB4s z2`4lN#Oy3#0-_**orwN3(rvtU9#nreiyDM`74emnlc`lgsE)LJ7+wGdVQkk(Os@Zh z4&FT2$DaNC6td+cj+4r`pBZCv;Q)PG@nwkyWO1Y20i_&j0OxtPghRX zR-|>yx#GsYiYhxRefhZfzM)Yy^U1qj=AL})h`z^bR^8{!wJINl z_@*AOu+m57hvA1Dk{W5Vqy{PaC5>mVX9aAbFTSr8LivRpIOVSKGaKvs!7eJe9WLbW z==Eui2j}WhzQ9e1yvzU_K?vcVXD-w<^h<9J+p71nstXcFF*+Zw5e4no0wV9yaGy`a zb;r`4zdfgeCP7*Uz^O24sf}6Xihb190iSrSO#sAg+zKi`an*0JM#Q3^ds4455w z&)>$30U9#zO|MThrC#r=ARwm325MF1UU~A4OOBmSw{7K+xDNF8~ECd*}+e z2FyxGMTKO2o-6Nk>mU?^VL4;cc?PGGJaANGB`4tsV8V-p!=c~Ar2skR1c`{sb`TF& zq{I)Nzd3s81%7PP-!Z0nGlgB;L8d>;Zpvk$B%1318I5qDcWx`;dYRNLKpI&9Kq&w+ zk~6SweErvdd!#=^*OJ#K`8?+bS*WPRj*|yWe-k~A8u1vR_NLMmJVqbRSc6}%L%_k; z4buJp<=(Jd$dDf?zm}qfY;MNV?VvMTI-dC~?sPnhYTTKH_K!|2{(_W6?AZtVB#K0&L4G4c^|X$|O-<5*Fc za4>&g%4Xr>>a{>pEi8~)I;7XqN*-zvurI4y4m8fed(CpB*Xo6%58~6$di?~Ff)7ag z#4UO>1@|8c2}`_;Zx?WE1Fjsix-!Jw1h06Ra-wUn>_vJLT)~N1*Ae zA9S^MvD@JIG1>^+njQX!?Y(z%Ha(Y+RpdXY=yfZ^CfH^HxQJ>ZDyb#xR8&iA!5|SY zTC%jh&0A z)~(4R{0GO1in)T58{^9fi~{1B>-|+nVmrCJS&G`c<47YVC1j)Uhi8TYi|2=j& z6~X@kV!?0pfkMqMf{T;8`D(c|*>nJ(&owNpslk1`7V|*9Vuf$WExhC269z-&HwEn$ z`YL=gJmWfQJU3l&H<)9pLa4P^lC`2$7*XYdX7e;~w1*loTSym`mXcT4PTV5ioh9(| z@|xMWIdwnu+5FTefVa}70qPzZLQ9CXAGR3o{5CNJN0aKJ1Y!FudeSCWEsfZ^P3nhp z&g4OB*5isgKAu($aH!va>>VgE4LkA}LextVDqEVepN&?tGp z@l}|_QizbK`3eO;ekt$7_*h$ta@gxT+cG<>yQi&T^+a&qyLt{l$p7gWwTCMHbq3*J zbwXic!Mgr!YsE!nI6y_b z-iE6CQX&FG_FnNitn;8S%sUFxCFSqE$xxEhALZUD=xU4F_V`pfhx0As7ps#(8s5&G4@8r){YSCPk zWN~)=^q-tb11jD?TRKYQSQkm#=o_GM5E5p_$?{Ms^1@`YFt5}w)%;u3g(~qs8U`ls zm5b+PMZao0HpZ4soV7)t(+q7~Cs?cINc0G5Nh7=kEhSFKTiKaUe;%N}1WJAmjueSB znR~Wh$QGi%`mI{m$<`P!p)*o0^RnC+U1f>cjwPt5DS4hC;$IfHPZ$v4Az>7i@`?7wOZ$&;q4jmU#sY!3C;);-IJ+lWc)kFs72d zw+^!8IgC*!n^U^@@BU|>8_pHN<%ndKGctb1R^qFW{&A}?O^ri+lzl?Ug%(U19DFx$ 
zIF!w|y|*iMZiu0RcyVm$Rff;r4X3kSCyt7eLhcwqOdW^XZpt3q;> z%MrL1E^|p15CDdZ>3hl=5kR4R;?k}ITzWMp{>r6Y z*XYoLc!iPJT<+$JgmxU-UZXz@BNN}(vJoO|i%lJ7R1fB7Fwg15o$u*a+uv@_Qr|Uq z4?8+&Y&e_8n_SIPJbnqY=`&8l8BC^X^Y|(`iTNYbABI2(|tizrbH zFRK8&%986xsHDV8;f1U=CD@v(AQURp>9DK&yp{*19pf95=~ktVPHU6n+FzElurjN2 zvJU3^^Q0X}a4B%=8;idiF=6DIq=e6{%ord&4&~fbrT(*YXY+oLDYC@b`=3H@%L>kC z4^@idJ7B3%s!WS213-7^kt0Kd!%#6z?rZq7trTF9VNX|v%O(Xv;+yFz&`S zYz%(y63O^Iid}B(+xNB{fyti=Q2^=!6r$p#K~5~Nc&o2d7Jzj6oKA-FSu_ezzKC=d4;vlLPAGirg^$$MB%Hy;cJ`k2*T9rqjv>sqd=dE? z)#3DW|99@ym}BvVe+CWy(?YqF$uXt`T6zW9hAG{CcRYD#aq5iNH`pPQa zK*vvoks;)IABGOe4aqli?et7Mu4B4;aO^Nri7F=k8nfsWzUEz&hL;k9YylhwpTBcs zMPO1^a09`WLpn5e-7hL=kR+GSq^JOOd5VzHZ*JKf!}p#cNd?>!C1!*(fSIoyxCX~- z;(pM<3WGpyw}C+A?U$v--rkJddiD1T0ew3Uz2yBxa&lg($(pqCbwG3s5!mTE7T<&=Jx#Pp*WqV5N_o#t|rTvF9x-PYQgzvo8LV#NJC9bE~d~!X6z09^< zSSbDOMMwI)l={nTH2tefjuZ%Vd_!$OE;oo;&xc%(0Vxg|-=mHO&vqzp2PwiL>QbB{ z%OpM&VA)J*EHVLD4ETES5{PcxZJ zUj?Uhjo3$9G{jnrb`2sT_&z7gJT1dU=)ki9xz!gVL&iDv3@g?7{WcLR<4<&?@Grc) zPzZTOU#`vZQn}xE*CiFB_Q~azEd=EV9alW80dVIZOMKaB#Mf9v;KXQbI~yeAW`ys_ zHaaw6zI^Rvnb5UmW!XbVf*#aXFXKvB404u{dMXDqMSKE`wH- z%<(9gtQk{IwsqNHdc0$)qd@IwN@8B$P!&jLLo@nRSN)iA7 z(g#B(<3a&)ITr^Nl`W9#MZ7Rx}u53_m*Nz=pCX;LDWDsk)?haYt|WZn>MwYr z1fZm32Hb`Xx1I9$5(Sr^UH(PN(*6pGe`dkhHI{Sl48-Xy*K^_{)Z&iif>xYWLqnk%1V7mZP|K@-c)=UO?<|+x>`b=le?7oa|pSo~2 zn5*)7e~qHY`V!?g9SWJ`f#CswGIf6{a;VhlYCoJ;RH%!~*OXEsxU=MDI5WXztm?BG zoddgPr4+P*Qo6D36?(p#r!KWIURc8ybk8WRfrE6ifLbB#AON(+XXsFvoB;I7ttpXc zu51uS>!y^{QH84=9bPWev&Lz^Ac#8(ZEb^MUFMKU7eA}YV4s|{(=v>sjoTz(wcdM}SMwOZn!A{R;DJK=QWNcF5zq`ti=ovuO) z2nS9^Nqxb7V0<&e>1M0zZ!!CYisfO7x9sk*JP8(~Y%m`IUF5EHizX;97Z4iNylP~ z1f-TUQ38$ou~vO73w!3ZLEd#Bl9_jl6!f2!$GuEZ5XoRcJ5DS3aBgeIyzF5f8cbQf zY_@=9UVs()YUE}?x#m_I8_P}^zEMWU8)TOSZ<@by2PpFj-&~!G7WNPiIyPNZM`zZ@ ze4+K5tM|BTEar*@S6EaE3;W!relLAkzKna+Vd|LQV&482#>z;53%5>t0O?2vLTx4O#6NXX)`uq>f4U7+ij}^ zvK5oMT4cnFQSW`CO;=E=Pa&624Vyg)!f>!nE7~&Sa-R zQKV9eOgV>l^(Zn$e)8HR&DD%r33!@aDHs-}adv~}Pnn1yxzG0F)DAVgzM=X2C!ow5 
z%Pi{DH2G&L2#-q-A%CF2eVz4fU((8;e8kXl+OoAVW$LpWt|~P%u=bsVUwYmasQiQiLeVIi@F8`~mDp?&7tpIf!9i24^$l9Ti(@k_6 zLIO)9xx)nU@)Hg~Y%xy^_Il#Ccf7;v;zD|2{{v@}YstiKpGfzzP^?eig8?2Ph-Lug z7M?@a4m5Hi!)#lGuY5MQ&qiOzToiVM2aqoMvcK`oKY0~#cvdK*gM6rit4i?q+8&4H zMA^!^1<01CodY?MguSwt16csOY8a;Y*}`9`P7=rMkr#SC`m06>#!@i+y+IU%H_QR_ z)^`Nbmvb#wn|y=i{T5z$u?($nY`XV?2T^TR?bthG;|0sp->WOkhG;f-wjwaV@g2xN5@%^wk%b1AX+dpOf zM)u|_+-8OAeOIwns23)YBDyTXzxUBM4!FLYwsYLx$T?e{d|UgTiwm?u5It?lM>v<%t3N_4_)_is( zazy>hjuQN_1^m>I568tA%H%bCJN$TYKEM zNtOkYd73zwjTjr;JycyoM*)|aM7`P&WBzo859UU>y*`0_qi)-89ui?E|K>!nF#u?? zg^bR=l8e|$GkU((KJvNA>!s9b-83yPni*?jLsTo9_soHOt@%->lqQ-^}v+ z%@9!gU|}NXEybi&fZ01EKnwkc9w5A9s@i_ic~w!A*tN+Z659~;*}yNf(4-4n{=&ky zA&w@H>zX&)%PSBk@~56~K*XtIDkk`b5iIjlP+;$>lHNn`?WhbBhSU%prAXE6J=_FKLj`y z5gnbdK02&cpE0rRDH<4J)j(Yd8)x~;?+df}8HDrIw?`?NnHxFl!POBzN$J`i-B0zs zt1(~9`oMS>8BNfEAEa|5qt*E)psEY=9~^G1tjxzYC4*gzi8uBD8FqW6ab`7z9S9-` z5`2P{hO;pC3(!;j&&|p9pMk6QF9&o?txd%5EzJiiIQ{98mOaRM{?R9a=xYrBl;9f% zDhFhv%Id(8(3l$&PaL>R7+o=$05EOv9K60LamSa#RTVKW&}(7neeVyv=@he=zp2=} z3|=`ke|~Xh`e;D`i!ijC?56?2z~M$grg1_C^$!Qcol7p_z%~S&y;fQ=F>$lYa}QlZ z!?m2r8ztR=0o`E?#(4LE<4The)4qGLpU(ga6yOrp9WiOWhe^KcHc+~^D>(H1G=UAe zvA2PN2LWb@Mg-f~`vZQZ)N?x(9L}#rT``bo8wE@R3?aP?AJ3vnP8w3I9vAY7NH0~XV_A6ebAzN2L@Y7-@k>ph zty!H}{KY0lxw*g>#_TTCz$Nf;pF&=!f`0sqkTH%eVf1lN-Dk)7dXl~G)i{(nWNibh ztB_pZPA9x>aV!fKVrdrQ^0W>PES#KOv3D_3fSy2T-)x=P&_3@4T{kx6#n@d;EVrC2 zi4H$6CGYPCKlhvEPt%)ghhMVjlVSS2SzX6{iK&72xH#Jzhl0mUB5qx&QvA{FCdw8g zuQZX}YT9g^u=sn7Viz5Mon1kNEvOxIdwK#E9%smGVwC!x7$)}k2x!DNK*P;q_2x(ZSSu-H7_ftB#BhY>3*-IrsAbT^<$=yVsEjH(Q#z>>nQ>Z z$%QT1S1gBn;dR&DMkbb!kvVTp^8pI&kh`9mw|R-?ErgZ}s5KtX2yG1`aTHMB9+|3& zB!`*75Ww8s^(feVmz8gV%_rL+Z)Z*Ec2lA{-&=oD9x=)veJQ=L`Eg;pxfGn#j9HNbHxh}N!ZAHJ{DALcIjOm(^(>=(|`9&bS{N`Lefx*GH z^n7Yy?icBhJ{*le*}aiso5sPBhf@#3o#vVS--LYA={lV%p;4H!`>4ED0!|>26^Jp5 zol~jOzX$&0Bl4rsBbv8VAtWZqMI~DeRQekE27YU?*#_w75fEL--4V_>`7wP{yw1Tw zCr?#X9ZkagP=t+bc|Ve@lqBN9)Z6rJoEBZrs)o4OzRRL~e1kil)_7T-Ont*#50JG7 
zW9A-o-L7WtH9OnT?J;bJWC)Eh;-^c?Z<0wzOtE&h5*FD;DHAkS<|Ck+=g*Xs))T0H zzAjKUCK2|PvVeS7h=3TaTBe+CzXnE@9%sl=u+XCYCo@Mm3R+qH*yA|6^Avebdq;jF zvfs?K{-Alod`I7B8vyK`O#$y*5g=*U>PcW_WqpQALTZ15LDHXw+qk8l<}IL10`>2z z^@xkRCW_`;S&0(%crs*tDsz_{6qtKi#(P(iw^gtcI6}@Y(Vu%6g zU_h*P@L2+&icN+=y}u&XY{~`9v+4g|g_lQcL<`5i2D_s7=QDrb%iqFpm!+nr2@;v} z{8!3i5Lo_43V>iywPge@r9^yMx?cp#-?H8z@_o@wg=|;8 zJ9|bsW_PVz=)K&1bgS8WlihtWBa~oPQQPtAT}*|T$nTYNG()A+<8NnLam8fW+L^w! z`n6}OE3&7kv+u(kG0F^oPk-7e7+Bb-atNJRdR_iPmDv!C#Dv=M-XRIBfSGGhtkGBW zxJWV2BHqf1lTF>36OGB`6?PtcUd|cqV_9A1U98=_baV$;Hq&j4!2FaR*{jK?k1pSd z{CVoLM@J{>f#2BJIBoFuAa?NHygzKeS{$D}Xr0eK^g%MnlSl|yq9@`0B{Yu@2t72{ zd_LOfvB<@ufx6|Ru^s0TU6C4s66sI%?)yX)+Zsyp=m|A(yb7o>oT#-uCA$}e~m~dK!E*LB8>FSP;d(1ny9F$c2FW(FRK%LfHUV!MI z-F|3jZJZqq4eV@ocGe6~)NuYY?3yb2q88KO@qZK2JVbzrC|&iMt}bqvH7q3hVbK*Dxq2GAzHCoF5y@!)Ty2BkUtLwH;K-t%=Ei5)8RL z8ixB2o|sIw(%n&pYtURb2T0Pr^l&=#<)4>k}CDVi-;|Z%0xt-1MFW2q;Te zzke{kSmUw}LBYq*V?)>0QatLRh;*ej=aG=5@< zd_eYO^qWTVnR{(rN9<7Q@}5zXsktku<3eK>@P~nA9@ad@A#y#jmA}RKpB6gafYB+z zTCt0);!Li%r_pB$tka~NJ4>FM8gbKXF+L{41`;b2+1*isvdx!mT?XePYyial)u z((0Fw7(>KCx|${p0~3`)CA8Mr<*U(BJ^_tl+=VMf2+Lk~|Bbt#Kot#i1j!a`&9!Ji zC=R?^+v}RU+ZZ6KJM>&6+^G#ql_c9c^q<)NDi%5@vjiq~h(Xd-ofS{Li#t9(YQJjQ z3&JQSbe0)7gIzy zvkJ4U<(&CHB!oTk_{Mn0D3)`$=?!D zUDKU5Ih7^~HVRK_JXYNN-<;$5b+ar%aL&(lZ#$!bN%yU7P)d6GI*UAsVM74Dv8d-g zVC!6k|2q+LTUigoPoe7vg~9(p73_GM=5WZyf^8%*W-o8Hq$IUEBt1;mM)=DyLhTuL zqYk##5@XL=@Z)6uU*U0(RzVeamERWb7X62A3W+@kY+8rZG==p8nZjm_$u|N!=~Fs) zKBvlwlt*Vemfm~OoK&nViPx47)Z&2}4%Xf|e@t zBlAe?E)kEfOBQEVYszIPuz&aKvF&s#obj`W%L~z~gO^efcA7`6hw&piE%`LUq8=7e zC*xx-$CdPMGWoRZCN$%hQrrAvC-Lf3$%ahkTtXlXC|?ZSU`GWd)T;wt!gg3~n2i0m zs(zxPBg~f2fPW41lnELB(j>jhWiLoTc&}Zu$xALUP&%#69S_ixJAJHyCE_?A)WN3K z-UUX|=-i+Yxls~@l8_ymdWr0q2~QchbQLkwqd^(Sb6{E`4_>aW9P6ESWx062(PXS4 zIG$p?|O%yYZ+HnVgkCcwi@%)V>qXGi$0=%O1ef8BSn}{p1qNf z-_k-a4A;9&^kb38VGMmYnFmdh{g}0&V0jamqBvhsxKGazcD1muNjz%P4yH4R7;+ht zXOF8kW`LpXzOu3kX4vV!RjdDtCSK1ted_-61y9<~j6L8N|KYLeOxc7(vVO9LIq8%#9gDs%wPuJpP-7 zmlTVL`%`H$s`JA7 
z3SFBk#moAsF9PD73eP=8%X%J!4{GL_l8uL04av5oHP|9QJp5MkHxtibMc0kKDk=Z4 zlc1SHoSK5$R49G~G}-nXW|uI2)DGAJu~`2_DIz;1JvA5@E@}q@jO)fRjm@|dJY(pS z&cmmCtfmZEt{Q0km~fw(I#e;#b#&iK$7_X=20 zQ#kZ+fi)R3VWMM*4k^~m2PddgtO~^8GM+Qp9mSjABhcxlMcKeu#Z6Au%fC@w0W{0@?;GU7TpRqVm%OUaJPT}+jhwFn7%4Gl2qNF z6;s87E`mA`!U>nmi^M*+a*7`cbngtu@TFw}vDPUbF{J>ibQNZGqc}w=>=YV+Pv_Lx zxdtR?r%KcH4vHbMsX`JQF`wzqesg;} z_aeziqTj?q=e^~@V1{|`d0nd1+o5x7!9D%t7+9<7{?WjD+liE)%WqLMd1C-_mx^*9 z?ZfEI7LOhDeb8hKxLS1jbI8+yvv{UvcO$9YD!k;kIe-rJk|u=R2q-pMyL<0>VSM-Vpw#mYH8eA z{eyv@Iovui%&SJOp%I7)Ed|T&Z=w>D=BS~{v~u#hE%~?_%>d~3;#4>T?_aRn+I^cI zjJQ3dbCbO@!29+G%(j@t^U?677+^7*q@*My1b1k({YId+oP8`PPS^6+tLhQGU$tX9 zWq)hNi6DX1y5-ft15&!nD?Hx|Cyrn?>4OfDS{OGXWw=W)S=qtX9bxEpUMd!XJii4@Nsu3Qwn-1Bb=Wuq+zlO{cM$$(E-Ub)@aU<6%ghdwi!+| zTNwc4GLI{J!734SPln*bYTiG9DgDfVHJ1Hu9EP{UHmSG+x(lEr*v0(WCa(9ryNM2m zcS^@-dSHsP`KJQeiGK%@&CG161Kc;+FNYSJ@tv<&>qxnjBNCoW)JRsk74SwJRoXzJ((tg&Fwi8}Qm zUUr%~I(S)zi`j<)B4=~*59Op z3Y$}a3y2RXe^C5xR~K8xLGd8zxm(0kKPATdij?%115*_XC%r8Dl|+H}_hwqt4e(6f z-y@V&^7CWrDWH^!ni?7Emxd4 zL#v(01Bx8_-ISihs;Z0LSyiuNhZooGwCqdWpMaH}p5-PJe*#nUFoVdXJL+qY%)#%Q z4HKrMV3<c@gb5rPyx_$3M8r%(C-NW(8XYmZ4iA*hTYuA>|%R+iouV<{QPUHCvFRvISJC@iOiU&XhZBJU5xRT#)u zb#qu}xgee6ouFcplAzPgS_^8~Xjmd{qFmp<&!+Raccp3xX`0|E@nHO9X*8Mp}9BfT85HVz8a zL^dSa*zU^@Ap4Tak{kl&pUyTyJ|~Eb^dLMz&Aa!_HIo5iZtQet%95OtPKOWx#wv3s zpVbdz-Vgq;T>i`(_4Op}*YflEy@wkUc_75<9L-pg5Hn_uvo$IsPT0o-eUPhT@MgBv zN8mC6u*rbmfVM4w70LK1;V?dOzyBoLw%A3bcrD)2Ap;}lJQkZx0^U4m z^Ir?YxxP#cGE4jd@XUJQJF|6hih(Vgi%<0#h}d2d*w@&jFG|G$@8sy1sn3770$Sy6 zu&jim@j~Moqb7*Yple=zn&e3hKxSF_Y{wMT@Kc|2s{D!Ubw=a9N`c!#AQaLo2(f80 zO^^l_66K9nr`rP5xO*qZUoc4l0S;k%t!k<%P^x#An}m0DDeNia>WPct|61_S8JDV@ z$60uCUjNHE4GD#=1N(8VA)>b!h^U5>s(j=tk8v~Bg;Had-)?#xH!&A}hVz-&7W-j{ zy2!AWNsqMFo*?|7wtg+K>%59Zm=7@B6qJglCxiysurninI<;EZY-u6ur^W@y?Mr7F zKd=WVX$4WBZP*W`XrW1sx+&fwNh0{V#+b5yA<#4;N=23h zLnPm9r}5_73^gg)8qQX=!&g081HyHVh7rE@V6}t-V9lxLy*}Ds??&hVbtFMUbmYpw zg!|AgiP@v7^KU2YL&+N-EbXcaw&S(gikl)D3VDYztGQKsftDs2X0GLoJ;`Q*O`d}7 
zeh0_R3t$Ci@J<$h(*H5?0@i2y`&nTF7*_*w`%d{uN6FRw+KYz|Z-PmIu8>Y7s3xU{ z*}Wq%4;byOxrHe}%t6jq+xIZ#pKf(Cu7!aj`(bhlE_TM+shEu5B&Kks%FbZ+*Q)Qn zP}GhqG71HTjl|%s`9eEO+0=G>w(b@LuDucd$WUK!Z1h)V_x8U4&fgPEz7@HU^bNr0PHV=#Q1t!mkiu=| zb=hz-=H9yK1rQ_Py|3mE|hV_3aAg6|Ou zn|TllDYLMs(t<8t((yvu_F$|ScAU$!Qoi@lC!M;1X41-eBclq>H&`dTGk1vQ-H7aU znfk)ie1#DlPSKvhDwcb6`Yza5Xl!-jJR@7epTcqf6h1 zo}K)h!;8b9PxiMNc)BSQ-k7J4j zc`gdkZh8P3G!0$H6r@+H@hhqjR0U1jI}Q!GPCft!9_F*n`E80(X|dtQ7h3Ug3f zWP?P+=JPO8Fs&LP6ncS^9OhYOuUZ&4wgSbf1hb)_k?Tb)mnnmq(=Un#M=O>5cM_}I zIlpOj^;JZUDb^~&zyn`+lS-okkHO2WkauJ5y zFxEv17GIJy6_U4-FjyQAkvrpkZ@aK4X3J=~vW(KoB2 zxVk;FQZN|+jva{U$mF@M`_5R_w$x5ne0V*v?>l}Qgx$cd@)Bmz$0cIZRHTtQ?AtFZ z%h?#{p-l9QXy#WPHu+;N100_=~eNAV(V}87B_xGB=bNlyi&RT9S+<+hLH*L!I zY5Zga;zmnK5eG3}B=VDRmb+aU7n!&J>bAe0%Pyb3g@92E0tHe! zP#~>0+WrH5+1n9`E(#_!tUOKF40BRb8;{!WZ-@aSt48j0_QY}-Ok3Yt8Imc+UFcoI zOqeJ?Gxt|PGT*Q5c@eKwsF?cFk>}T!$vl}>g$a7hgOqvxG|&N?u+8-j!^|`h??gwL z{J`>R=b(HCYo?>hrkMR>c6AuBk0q@2hw$kr|JdlUN7=vz4HFaU@mGFribQJVjSuVY zfJuC$1B3=DDTPE7LQ<-UhCTpL*R@3mpJ`f%oe>+g-6I5l8I4RnJpxkv{iO##W-D)F z{C4dU0k9GgEaK=Kci13a>`*cgBl*z=?$JHq!e|S0>s`9on|{KCj*~v$=H`w$sD87OUq7a(cIImO4_3PpWD@je8gS50 zh!YLb4KKC@_ACu&fi*uLCdG($of-#czbf^>Xy_K}idymOyepTf`6NNGRsUkwlDJB0 zWQH2?ysacBc%JDLUS{x}K1c*^fCH%P@VcN`IeB>8dRBaF{sd_2?BaS$-~u&NsD#yR zfx0;YD$L-!hS&AW%ewx%$=n@Nd&2i;o<0<1d0-c}g9m#7F;aWr;TmQvV(^=I=h66) zW!)F~0qcc_%trHuOxr|4BM(9S%PR$7vVVCBj8XizYJP3w2Td572~BgN88CE7!?+%s`4_vBKD|6XMtm&Ja`>8oRa<4lsejsWHc0$z(mdu5;ZU zjCoump#|OU9FxcB*kOIt+4JrM)%J^%AMYe*>#9?ORpuJ+pzq$D*W=Mv!ALNgU7bs~HHrlff$ZvY4yrFU3N zOS|_Ct8;`A0yGR;6Xhu4Vu~5dNlS}Qcy%;G(aJlfH?=i1>qD=7TF9dUZuJhHjJ6w1 z=i*s`zi$9_b?^>e5L7>ReQ^?Pab6A}&9VZZ3TU`EN zAN9ass@z)jS+i&_fUN-nT|z=#l5^8_;N-~|Nmrz-+aJqj`(NTp;xl~e%AYH-k;#h7 zC;TlPMHJUyK*(+7fg9m!3%U(wJ**IZbb$x+UxPuZbb_3pdvlIN@Pjk#QZ0+I^z%H%v5EaqJ1ns z*rv=#j=ieuv(QUDes74BWs@3FvM@xAO@7A%D=JPKc>|0eu%c2Hy%VA94&^W2jc2(6 z!V-gZ5rSX+&D`j|cVd0b{RrUW^@zESx(Kmr-4R@(r6Fl!2?@fEyLT(Bb|Su;fU;-m zGfG1xzK@Ry_hoMEh-W(4yUq@O7t|ryG5^S-C5d@_mQz2o0i!^6Ygi`76q&=GuQ^Dd 
zB^U8O6urd6jqfRgprur*vK+^mVk;qr`KXD;CFZyRW$&)%f_WZ2_fhgR+KHZm`S)AoWaT!? z`$^)(+fqJds&O}S@ah9ZQSEyqgB{G)7y&I{EIsKVt4G)LRQS`W2;Ili>lCA};y1Zu z#36wvH-U?QdS0bJzzxFfjdx4fKWOtN2{@Q1o0FZONe*O!543S_Ocn_cQ4uhnDNJ`K zM%oUsURhqihE;JEVa$@rXSSWi68sk2h{wEW)P0oASIyGkCr28^Eto zFnFT7hA7e!X9dOaa<2;r@{Ud%3FMa|L_VKWz)e8~KaZ0)5gnvQ?aV+fcMk(DKEO9O zbD<76-v*sFjWFJ-=kF?$cW@Ei`W7QJl(p)V4c!3io5W8~Scx-^KQ-B*EKPqMTA{1IRg?_*;1);Sa^FZBK2D%Bsj0Dl zoly2lecJw3Hw>^=oEo$a)@lt>G#4Vk8c|za!J|W)*A+nEAUpRi4G8hG;pLI&X^%w9ddXlMbanTAVU9j=L;(+Ta{14Kq*%o zJJKp@%TRV`US0<&B45@jq~Obq9*+P4n5&`P6Wp>v?&HL#B4NyJ&LJ1#Ag%#hJ05({ zRTY?QxnV&iBrMF6GTj$a?jk!YpC9y2<&t^EB67xW9ahq>u^i2SZHy97@gXs7Y&$Hph@ZiBA}X3vOH(>Ikf^>qh^C z;vU=E-K*?Q5(F_9mW0Dk!Fbmxlg7>1jLabEb9Aut7Dg;?f^atB>HJeo|4g<-AAKpe zv;2Mp2rt2d-Az@Y!Dzea{9xIy^;Yq|r-^~RR)^Nd;J7mmLH7wja@6j6^PXKLbLXYe zuV^($MqGEjL*mz`NKxcF*zj2K_AQjywA4`Mf1(D3nXM1?6)N0%>278K-qQKKI#R^f z6D$>x;a_;h@oYUl4O*)TQ<2kC+XLJzHG#P9U|fw7CsiuR0bP)m6=yn2CrpgFq_8ie z*g)*=*kx2&b=>9LVv(Jmnr4%e{b4~jesRsKpy9ra*2&42r#)HAj!4)8q_>yto)pW;shA zC&i}FQp!0%VVfktMf4;mGo42~ROn$vm(q$qB?Qg!#QU0?rs=d}qFO8?^N!Iar z%I5a@`N`dP%8r;@foBiE#0Ek4`;OlftDa~%>b?Kj*kTHkwl;!&u)U}Pyj!51O5L~5 z0*B4Uh$4^<2S%5@2zqeXOk_x+4ZsNHK4-?6pB7%tasrQr;yT_ytE6%!DB46_i=WEhFoUTSM><$XFZxDhHpoH zugknu)|`8(MU$GKHh+JSt)}&;Ut+9wh$|XhwsBX}pX|@8R)tr2z7QnyH_f$IK>GDv zYc9eC>IA(Qdg%34hayj_W{@fY$i4pF#I-UzTX442P8#clLWmlqpN%XlE6d2zn7h+s z*cP&*aH>5qHJ%pzhdNn(JdJc`6e&CcJQvJx6R=Fz;u1wCaVAp!b$n3_pSNwWOy06m zl$y}_7QOs3W#C8rxggd>Q29Hae7Wmd4jGNyZa?L`;4RuL?Fszi2r_^HUSC)Z8|kp$h(h)I>( zaCIJ`D?6g=P-EBcR<7R8CUtfh?PDZ9uhb2wlKJvKTDXA9+3E3hIX^=;^{ZJ-BF0lf&8qpG4Y7h?j*R;P1| zW8+Hs$#t2ZeaHLvCHdtRupPxZLwbZue*H08p7#Z=DhU(1-iAsjQ1gukbjo1zjHwZE zb~2vV4`%B?>m$ylRu@x=yS7n@Bwe$QpIUJdb@@^cy~xk0l;&p$N(obtyCGI6%> z<1-I1X0`|u-L~Agii;@{3i^6f`FGB-Q&f|P!TF!3?cY5wgPx?}8}G2kXZWc9>#^3{ z-+&LdCuI8F3Aoy|a7V{YhV>ZSSeO&TANcf3cwy*;93e{clH> z#+sE}QVN}Q3tb?YvaW^)-7jgtnJ(C^z>;)upEOCoE@S1R4}l@bedq`sR=r>sm;9rB zcmDfn{I2H@+asNv--xQXQMLS(U2_}bIV}{rHW1xA&Auo~$jUg*@?|ttLuetrW!gRMW2_0B)(zHi 
z0YvKSb^lz0R3~X_h0??ULqT&81C%dWA3wWhkO^g9_~b}<^s)JFR4<7|;(&`ZeopkM zPwzmMMzCk182UhRBg7b9<>0s|q;tOnimjXJ=v(4ccKz8q+0B4=+j_KGy+1)^a4`5m5j zYf%JsV==dP{^rNp)NI^p9SMOh#Ict@?zP_nAOWq-hZE_<;A#AeE_Xt>n{XuZ_;|M< z+DURTVjFE(U8~Ow7~p5hw#g?EuCr_S`k5R4;~^@u$@3z@XAavT4}H=lU9qA zgW-u;h-ZxzLsqhCOf1@NNjZFcT#bt0pUkiHb?fiTN2f;xM7Ipn)+n24g`EpvMTYXN|T%)qt{TobkUP6%cur!Y+G`93N zrx5Q&@4CH2WLAVn;`JS{eR85PaVxp?2rdWc=a)wu zm$yqf7C+O9`!~g;u$%wB-43wnMn8d^I_?>wThR8&HX14*=uwDJ+bN1ZNfQhpBz6Ml zLd&1~&RPLI+t_PQ8T2+;ZSHN{pDncWVFM+%GPRZC5M^@d=hA1{cx3FDv^@v>aI1CbXV+5$?P` zK7aFuyu{l+_dqKrF{#c$fPUG{>@V~!o%E<{BTW0HxYbVLNmGnFGbvPN485cpw zZnP+B2~4^1^A>d>9OqBj%i`Yi+mWvy8HGC%4mHd2EDEv2TipPOaBge~62*ZzK=_q^ zARl!Y8L50-rTwfIao{ zz|Odwh<5(5Vcih=7;BtNgFcXKNWz4XtshQP?9F(*^;({LoK9hQk$ipza?w}#O=nXXWN74EFVlG$5u!fKYY??d6gZBo6>Y`q+u{-#4qD=J5uqUz7wWrI zb(cM(eCH@>c^0|f64rWbSlB)XW0pU_$Gkp{2)oZT0qR@ltEPIqjHx7n;GEHKAdDhk^=l2o(6d3NqHG1Q zH-MOy0ts_q2J7E@&@R%;#NYK5tf_$jNIn*da7Ut9>*65~wvlIGEC1E zer!gdr+6KLGg3p19Yd~FEbvSc;dX;Igcv2mca3^n+`5AuFig}jE;)LY=_x6Ntoz~N z*0rI(`5M?-K-6$C>x8zJpTEXn0mw()xB)-&q9oWE-TSnTc!5_^bo_wl{7v;>!Yn(B zK{9)pPCQ~0b+}ugoOmQ(8RjeeJ>48Kg8Yz;Dv4Q7OSAj9UQ?(*vutMK^QDr|tY&8p z0(T27l#9u?8F}HB>1)6u(^@sv7VNmEVQ?L8Wpsme{`_9AQ~yfT8h~Z$`_3S8Cc*3V z&bjR4hfH<}wv?5G`Jzz^BCKaoa0kSSU%G;Cqwq~x9tI9go2r7tUUNgcqaZWxth#rK zOaj$<(gQ-sn8RIbg<-m2RBWZ8b?)Z5E82GW(dv2yb_hedL-gpP@Cbn4{-yZ_$1#=0_V-uP2?%31Y9h)5D3o%0ox=;NHzKx?i@%m#v3m+6lU{V zY|&MvB_V3)vOSgn1PcZ8G0=eXN?7F=Wo zj}5KQmU9|zlXdx~l~=#cPU@~M{z-IGhJ)Ke@{K5$UA!VzN^)X$HsK!%)?R4f=o}y6 z#k2ccewnOCu8Bf!*3A&)T+~yzh8L~TfsWGOJH}ex>Od})|Ji=fo@ndH#4a<&)>Y<8 z%AKl2f)*H(xtQi*MVIvJd&A`lQ6D-K)vv*$-IAlXalcJJFI&JHXlRcAib)UA2B4C3R#D~@y2oZueZNtr!hQVE zJXK2`26$5WYSzT$j9W?qCLc%7jq&28)f3|Gwmhty0qGC+NPNAazC&B1s9OB0!!DaX z>0kd~_-+sD^pG?+U2`v&Jg>VNN-}-lk4cLQ1kEWW)byh-nlZe-*r3PZGR8aj1F%%w z#eh4nwi^coRdy-96pMAG6-ycroi1eFJa$cVaGUKs%dnnwNpbR;r2`TLyuY3!^sUIB z(^Kd||fp z8BAvI&li6DDH>_L+^61#g+U89=Wo5M5QTnGR%St?P9UD^q#7QIbYm~B)KNGe;s((wx1u272_p1m^ 
zW)$Fwj@&>|3mIwYMNB-XgGEh7+kn=J_E;&5RQHk(rrzuLLI(%T3Enev9<{7|T&z5K zAVAd8wk>N;@48{##OSa>7e~#}Upg*Rki7T9K>V*sf9gIR5&FHE7{*n|!PUk%7AF;TE+?-Ue=@{*ZYwK-mt&bN zxCu(-u6LK;kC5$CibK6*Ybnbb7fL)9cZO#z{r^eBE%^xipy-$t|KYy7`EG|5Bg#F$ z=wAC;!apGAe0FjxV`q-NXV{E4oL=N(FZLL4m^Uc;V>HS%sQw;ND=2T>@^nH(y?rs@ z#PTg(<5@WIB3mT@sYo@{4wXl7_^C0gfZ2Ts;b-ov{uf)Xk9=XTpZU*F3xY{}MF&|f z@pa9gvLSuex^^}<&TR(!9>=MSEPH?W2o$HG#_(g^PI0SJR@R-RMieVKl%t!q;6Z+G zf782#JWY{*{z24Tq}5Vin=c;W?@jVLVy@urHQ|XqPinRUX4xr?L{ZpM6G{v&L$V2v zjyCy>NG=loZsZEwrI5fvq<7*+{^%D3`Dos>%*wzArNyr%jAQifbegr{vD{|B0WDC? zAZ+34i5qrbpI?vzZ|Tn8x0JYcfm04ZC07#dp$2oZyeev~{uzZh5?jdZAD3B=EsU$u zSP|sF-e7e7re1e35DFB^z|^{fTq3f$dMh;8eLvjh^&K2k`k-yvF+SkJ%I+EPvTYz= zBI&P|55<%Jr>N_2g!=#gBqfraB7~0-Au_TCTh7Z-^j$6)E>!mDnSj8Zbs{-}gAr(R0f*rX*8!ni})jMz{|Iu%W^&KmQId@Z@nI zH|f%QoUG7h^;C93q&sqco^y4)@vcak@@T*}iNn+^`V6g3TICg+w5P!=3E*+d6j}Dw ztG%{|VLdx@Ta24qvM0oS>D2>#L86%&oW^(2$9nE-Jw*;EW(CE*364-afCG z=4fDWyUpQ9Ve}s?9d+M@!!dMqL3%VCk|F@?DOjng!;iil;7f}k+^>%j*vIf_>|>aT z2z%~WZHc-@Mp%`R^CoJ8*W=|K{avshSSSO+plZXpic&C;iwo1 zbcH?C>OjE+XyXo>= zoeOJ7t_%v@)on4RBcAs?bo&ca&Y`z~)A&XW(I( zF@~je7vtH%Fo{3?*PQ+OH*cuVPY2BdjFIMxcU$jG(a5lH4e~@uK_*W5#ltbi>@z!$ zK#&(cQRfpher5Ei-`{-T{8j-8xKjFQ4^CYv;&TQR5~{bKqk8~ z?)`;~8xxvIP+T~-l{!)XoPpkiv(6t3g6JJiHhliRWjuqiBhokuCLgJ!fq_2$!%_ zX4+SX6&ZGa6_g|233TB|L}7jva_Pb>2FN3wq*W#hTl?rFI^yF2cHOwRZnLB_8--h* zxPrQ4%oY7S{@m^u3e#ZC_uKRY^e*nIWOFAkkaQXmIEkBvQ=B`38Feuz9>)gnoodXy zpA5fmVkxw!&mh|y1(nj`YvksF{ex^_I4qLX&&`8a1?6;PqnF|8+PsAsog(`ixOhv@{Zk22U+5Wxjm zZ2|C#rLEDSN)ppuOzQYw3t9A8-@A4A4PRw0mW$QcB!%Knh5};z=o~b;YkMjk`bknT5Fh_`uwa5>y0$LIF zH7VBji7ITmANwbrunJF0NO7;pM9x=H$N`BfFoE-Ov`eB+#>C!5&3+P82O zG}?3WwXf!AV9iiqr6(EVIGeRxoEwkRU2@Njywul;L7G&Q1l*rFTzp}m;M-AUv&wRA zO(Vq1ZsxD_tN`0<<|ZM7kfn#s?XO%|@6~dRe9^2m-Bkt4%?8EPLi!uJMBupd;dYz% zc>CbSIo<}jCwt2#NyDS48P5nre(~HeBj;fCGB;lP4q#=#iyGnu(s3efI2DpJQYkLgZ-1m=R28J0J7=>tp*SNtus>o|?9y^l{==!A#_GM9i0(<<${A~4q2KMhV0h*2Obp-X z>H%u#$iwuHhspi(3_s~DvsI%%yE0-&kXYl6lH=OmYxX+BRW^ZHLK~OOtM-IjY}8?u 
z{F%5%pf^fB#GV6@Z_xro4s^xG^#E9E6)u$)RlxDPZtkgV_ozkro#K1W#Qi3B>4Lcm z_xWpKuFijdDlZvdFSyFQz16s>FMwWNV&O>d(@%vWI~{+Rd#M2vRW|NV&3CkI0ahjS zel05iBNd~DNfZz|K;`ld9mk!pH)l3Xxtu<|(9%`f3VuB5KK0!70P#28lH5vi?VH9= zVVzP}RT2Osw1Ma{2F(>n-x#(Q;C>bf?S7j{jTz?7b%BKJbnWq5nlglWghIK#zh(ultHu=G@vZM@iqL`h^ICRJUg^~iU;@C(MHA_EV&-~8jWl;Zd) zs`EJkiq_py*rdGuP@q5`$*mIHlHU~WD~mUX_J}w&O9d@%ont#MCBEa#pne2pUtH*T zv>)lLK_AiMfZ_)zXj_v*U0b1Mp^$$*jiPyvessOAD(-x9hhuOmJbhdrS2#ENw}uY7 zaS85ZLFTW6PkKWMOEf%rwRse?UiNOMj60KvW1(X(p?x&T@uxFd7$>*JHR+Y(i9 zofTrGn)uRl#ihJ?m{gC6-_#_tqlve2vQt0l^o+z9gA6g&XULvOK-zoh5t=$*COp>gb|b7f7D&wyT@_b_aDGCk>j z_53-Gk3|DoC+&J-dQ8+2{fRLy($lLzL=~=)+s)k3j_2^u@pG544skwfx~t&{YiP;k zp6UOTL7~?FxV5m+=i2#CSSX>($OQ8zt&Be4?u)AJL!&C5k|r1R5){4Q+oT5{R&|Se zIx2XF;D#&h)+kyjL=;PcW2_iguJ{DpGkYEw=FH#!441O7;%E4U4O?UT8rWfrxW-~^ zww0ipGeQ{ooklv$@z0E4;AYp)r*WGLT;>5u{_m(Q-iI6O3)-YFBz|zp6x~7MQfn{` zM$rkM^&H4ym9o-UUCi%K+dWH8wRi7=>`*d+Qsj}ynhMflANmH;R&eb}4)KBElRxRAkCz5xEYnjzi&mpmflhqSzX&(CR;M8>2s3tK& zXZ0~L2oC33kl89md|CA51R6y!KrnY{ ziX#!<(*98G2DzJgn%1`2R~17z$-UARqu&;eq_EHE$uGgp@N`CP_yifyRNK%$KhlpN1zWTn2H?j^LXonfEsv%sT?KY(3zp=X$rE{l_ZxzZHa3eak%lVel-vQP>aRs zq9L)mNkFuZCz;gSFSRZM&^3CBJv}F-ev+l-@@cu+a@WN_V4ECm^IDDK;qF_FmL!=I zq2@h&MM)`=?#(FTj7q9hff>!j#(sh8P2HQw|jT7AM^vu{#B@{JBcL_>kTtO60#grav3JkwK7L zSfxbC`~*U!wHtW@3$|(MG7K&Pnur!OBnclBs*~5elS!;EzK>btKt8r>k#$YM_r(Mt zjY6bw*h=arK!g!lnp7oUmmv2Q!=*b)VWr6;toWo+x*z5^$F7EhlsXus`JI9tP zr(O{~$Ex`XW~)2^j!ac`#ym!ujeCd6yoanz4D{1f;<4NE0TL|FZfS}#kH6i-H7^1@ zDB1G|q^~pWBQ7?DBTx4;|!arleBY#P6fnQC&_iiVK zg{hX*e=kXZ(4@8<5%gA%tA9x5v(8fYn=!8ApyY@R`Wdu1&Y>jfdt|2(cGkz-NB1cL z#@1=#LPv~!)Sn1dsDpmrJNaIfVcdNWo04p|xMW|)A~Me-W=`1PylaU5FVSdga~3T9 z>~8WQH#?oa6>#)CD`Re6@#dUrAy36PILvmn)}IVM!Kpjt?+67psY;`M(1&z-PQ@Tq ztjh$9es5UYiZUZMaxNsHqi}3G}BL3BIn}mIZFe5J!Q_sjbX$Oho zYRqif^Syh;J&xQUgq{zq?BW_$JJwKCHvu)6%kRDP9G|a5|HfJd_v^0bTUcZ(4qyL& zeaU5by>M0F!Rl#iK{P7T5rYaZ+BsLXl#DBQmW0|3(^fKs6ue<1HDjD|8V5LFT%~&- 
zK=`KOLjZil8(iO>S?4Hap=~RiFgZbc{+f;F8NA&RpV}?9mC9uwc15q3gMT)@tzjIkfFP0#{03ii``7)T&P%TuHunn&kR2>8V6HTPY9tJ;bcGR{x zcc6JqZKz+?v2$C9W&u4;w_yil~Z@ELdQ@=l39VN%KONNJ6A(aV}i$b-1ZC83QIEVioM|>i51<} zYRap3+qwcRrCC|T$;Jzq2R-k49ZX|0Do&(QkiBI0X5z&+$~=%5$W+ejecSsdSmDCH zrh!h|c1LVUhyK)uWmlNXgQR5TANLJE8_l<_@}B&N-p2E70Q>=e_JZ(Ve$?TaG`o}K zPrRgQitkT<&Hhb)KS)f-Pse&jV&ja5Syo$J1D{hCtTRc;1z-l>8oAk&EmmcT*kbx* zN><#B(blCgE}Pm{1PS;e3l`5Swmcit^!`Eg9NK>;0>9!n+n!1%x?|?7uZ*#2v|(HdY8bVKehCjE?8sGqVa>9UpjE>gcMa*^59SjPB== z$_Qrs@Y@x?0X`G}#b?0hLZ{t@cNS?fuLx}k`4=G{;wo*F%b{qy&(bIo=+X?EUR@{T6tmq{F)x1_Agc3G=S_r`Vqy2{dk3f93A@Tlu`C=K2QI#S(q7q z7iKD?7G9GxP-d5%APZ;$^mCp-iT6R2t7ZJmzjE+CN&P2WU1z<_6ASb)&8o68_g2rBr$z|J0UL(t=fIm~f_feGurzx+s{uM#ZhWB^uR5nXWnrxo^R zxfw5CI^FWf&4r}s)`KixCW{b_?Jx%FIpQK$DXhWjC(~ssD)W!hlP@gydN5_bvu@2C z^`^#vrppdKayi)zF_a%XSC9GlbKe1f872MeEv?}Sgij$+>d)P#mEI`B)a2D?IP-04KjFLa zC$k5-*?QPg$?UC-Y-S*vn4RH)6y61l4wRg5JXNM*|Cos8trJ`H1M&zI+K?y<_YWGn zHCO;#lv%RN`ROMOtM-=~rL_W@i4c1V53$(SU7`$sx>MKX-p!*tbvGBx8?GUxA$Kh- z!;Np{SU?Z0V!!8*W&_psCAt7l;$wn#(!p%`|=22#H78X^yB z(sZ^r$Qj4SS5A?Re|?!`1H9bsez=`vMpWG+yPO}b-!%k)^auHFX}8gf1WAheWtasl z&wi2NB~GjNKB{?JJmMYgZJ7JO$oj5GLC}0roinpUK&n6BiY1K_zi0Jx5@q;q+cmnG zqS2k!!uE2O#A=v?d`~`C(Tv3J3mj#@chdYilbCrUzvR8*0mKEsfi>+1sA>(JF{I}# zXY|ms)#-9sQq=S_#{pB)-3<^CzBHInJiunN#*eV4wHhu`J}f-T<#(b zfYpVJIuhHQOpLk�By+ttAtV^JH%4Kay;sj5BG(@pxoX3(@Ii7eJr>s3euys8RYa zoXuXAW9HyxVW~Ydme6Iu4MsRh`#6iI{(JFwBtqgrQY~nywvvB~bstt>8_*{w42qm@ z+^b1MRGU{U#gE^t#LYs^grmvl({HJ7>?MZJ4{G1{8lv-0_)jLZ9pQKa(U_)9i3fh` zQq63Tb23X3{A9)}d%C`Ec*?6bdG|@QgSlP-Ff_}$P<&;c%!l4^E8vuGtuW}F z(;~LdCfd8Bt;o=xQzj;cbN!W+4TGDTwwIWEP z+W$4rqhceewYIVt<>?WN@oFRtB0+6WH7gXIb<#4 z??!)t-iIpIKKL8)n|Ie_j#Bh3=T<~q_SEPYfCx7cvwkzLc%>A9XzB0UeksxOiFxRK z=)sZRUNyF;x7y?QKE3W{trt-Py6S%^XShbX>bn14L-}S#A^s*@Ior+FK>3Kpm)C2H z)_XFvhvt{Wqmv@lY|H#rP?S8vn*qQ)QA#qkN|u^W@8pQe$m*>fbx{BKuY{5UJQ8nS zgYnVt@l!c*x&Qlxp3U)^NAI@FLw_!i(Gw;b6>00f7;z90t~6On9-*{v{e~68Bkj0S zZC(#WhML>UrZ1zyQ&YDSeEO;%!Y10l45ZOU0CU5_V=qiWNhi?M&2T2?06)|k99^ln 
zas!Wi0R95UlnL{=L9>vi(>@phO;t_srUnyZFl>+bi7DbofYC#QqRqbD^u}F^CO;nk z-^b@?MpSqi>%C`L`stUo@O?xnstNj^hdSPgjJW@1}36wFnU8Y3cH2 zL4WfORxH7RUsH{(WS2X5qr-t_Se1VdcJ3c9S9Q%$uzEPyGuZKYs5^0I%OLMx-&qH?zQGzQ+4b&+~j zB3KOU6%+BFK)V?#CjH2c2RrG#F@4|=%5naQoAGZN*7n(4ZIV5~I2>|LpIS$((<)Qu zQ_m@S^rs4GY}VXJb2kJTCM<;)O7vuqYXgKj6^A+@-LiI^pHP}xb}z>M^VM;;-B#h# zszIg?yvQh`$J_p|%$!~6_C(BmBl$`rvN;Esyf8V`xO_*qXTq|1v8=-0-@A?V$~)ik zHR38qdF}UBKdi}6eOr8ls=|Bd|Dx+?L_7YHdK$yuynMby((E0Y=0A$sT09BXBPlAb zmUy)KqoQlX>jmbg9d|g=*G8h?_{Mo7a6BoRhX3S4=Z&;bRcDcwcOd}tI6Vj}l+TaY zcYKA3;s1~ScguJOucw$+&8D{u7A|yC;w5*~QGiXh_#*lm<<~{lqMuz@Wi4eu`u21} zNvU%XXQ?$^dOAy|qAXb&sFp(H(9F_iOnH$0e?E&iYTruRw9AscRy8=eka+h)N$SW) zGx!mEvk`D#v5_#v@mDhYu-CaTtY z&(n^0vtNK1lj9H!)&6ue(TNKNv@>xur1z>gavc1OYPPh7 zkBfgTT@?I(J`&NA%LsoT1}YfMM~bMLJRf^_H1tp z=)Vt@7QP@m27d^L)OS)?$Zy#c6ukTE_!JS)?WKgLrQYHFkf|0RG$!` zk$%@dTnZ&#>B0aY^Bcmi8)F{fBypT~Kg+(YGE6kf-8oJ&%xPri-!*sId`)AC=zm5_ zZHsyH#+^t1{h5rB%zj$(d5J5sey>0ZiH>~(5dGJ9GyLkrp8a;&X!ErA3-DRI<^^G< zVx{^3SLKuDn~c({ZfcC>bC3ciq)0}&&qe~FU{s4f0z4;h`?@~?L3X_DU69AhH_sYR zh?69_+B#oy4rKhZrlv0-D9SgbIvF|YnS3W}oM}+OqKHGha`nC*bJnj?b!$BY7YWZ! z#eAt;G>T+rNM4blEJpLpM|txm0fEX|$GnfYyW{h_p-XXo1hhkpX$}eBEh3Ro?_#e7 zv}&11zQjNH=5^xeW@g0XkmP1#K69nxUr~c4y>>G(-HVZJY`tJU;!SxQB1iTYPu$-# zdZTjP8y^oko!krdOk0g8o;R5HJ%^MFPT%gVn6}AqPp~29{bwc1tYWa9sUkc2} z+b#3+vJ|hib@t5HuuIWegev8-HdGZXzT0>jJ63R^>MY@;C2eNkVq(9SEoBnSGVwSo zvcLM~-R02}a4p!`Dct?6L(B6M#Ch>KR>gy?krHnz1bNA+$nG!Ggf;JdUcAmvoWzlQ zV8|YDpS6tG(XVsNVq3!rNQ|!AyndxVT{k|;xUskJrA*Y?tTSUJnr~xAH#6x?9+9bP zH{YTBrF_44Onu-u!WPRJGMm0!e??`irTjJQTBNOCmQ$+ts)z(8JTo>`l~CZvq%>fx)+-e)5GvhV_J}ifPjFh-e~-@HN<+mEkQe$BRJBZ z^6Qr(BWEX~>CHFP;6-yNe?*=GhNYnB^!WkbyqHTD4Nd>#C0<`<=Uub{0#``iNEws6gof*6@tej8Qx$ z_Mp#f8E6BSWX7SE*t&mQoY8gq^WJxTc!>W$U6#9xsShN?|B*Fip*KWw9k7wq%5lgQ zYKw~g7;+w04-!XIh`;fIcyuqNzg}p##v=XXodyogJ9>;^Co(Wc92UU~%VH-3DJ~s^ zfWbxD6z>nWySDG?BgjJ5LwIM&jV&olZpiuNFZ&=We4-QdGefbSollqGuNDp0m7NU? 
z{VC{QPxj~j=;S)DXS@8Q?-domBy0N41*@5`mm;6VsswNHHiD|$&#AAQs!RY9`Vs3& z&pM)Zqf^=8G9-p4m~HnAjQS@W{)L74PZjindI-JTb*Tl&`;g@|1h>gO@{Y?0L~w6d z)+~Vn|7~$6BW!6*fz0TKMb}$m1zGobhNDVJM@(ZXty%j~#YQGkk2ZYd{UE`>7$OOF z%zC0qlj(qZGp^a&#_te{)W76=?zaFZ##`*180~T2r;17>h_8m2<3eS0^G@BjyK~nib`FtQt>`J0{5Ux%=Rk8zJ#hz_! zp5pxHAD&UgSH7_OEkq?*A+3~Y(WZUR>g;i&LdL(LDy%}H?CPF4uY#Qt*quS#&z{EV ztJv}G$+HH|P|c{GHRCw8y>$8udpgdP+bQyQ;3Ok+7QE>Xcb^3E8|Bo=sKYnlo%X)z zaU!whmBJdUQQ_ezYr>rciYrlaP0(LdAHCL&elPNnH?DV8&Mez?9r1917d{C~7TJ=d zZz3}}Yjk*VwGwJm^ZdQpO19o=(WT$IlbNSZJ%hrPVDNc|_bS zNh}Fl^6cI=SwhXq;g5gddKu=qBCiuT4A8gDQ6PYLlOsaHdgQmNo=(T{gmhgH^7!~F zhEaxf((rmL)Ul>8n{-|5nP)$?nov7|Vn=<5a8;#_R>sO7a?|!P1;g&%SX3~!{ha^2 zq*UNas)U9!_FYNX!i6R~Scw%YK|ux%G!$dmrIqM*1Xdp3OAGLV#lBtG47%sIoyO)xZL zAQlx;&?#Od_10ONr}dt&x4vjzB(B;|?lS&Tew8Fr-ERLyWtd^~YJKQAx6R$N>+v?> zvT92IAYXE0;CerA2l_~*vy(etYVs>c?|#mB|?=>?)9 zAEW>ElV;lto0`b7lY;sLdO%M>RJC_*|VoymgA0aE_U9$+%T^8VR{0fzI*DG3jeTA!&yi(H!7hX$>-r&TB5t~_g8!e?P!PTJ03&8H)DjEk-;g5OEPTThN4b&fG zA;;wPEz)M}i80C}=n5&bLiehZ8ka#UlR81n?RjLOuxYCpE%+)saEE>PB_i%&t9B!+ zFWgDEp(dJjR@V{-4*FzIV|2Yi;u?vu4ll{0>ujHg>EBVL{0zv0Cp6IY`lsD1zS05d zYe+6@;d)=s*3P_6)*`uRHeCJCI92P51)<^NTy2Ez;#P?tC62Vr?CmrbII@$zkKNq( z$c&E(o>$gOER52MI~a`WZCH!+7xq8ayqOiIVA%d3gc#_T=qSHYIdpu@%vgh`SK_@f zL`S-$bv|Pk8P|-#x-_oi9$dQe^ADze72KVTjIl@EjvksyGS_61mT$8*k0L%$VxpTx zyMaf{u&AAz&-~@|XvbyIX0YZ?5FQL_wJN3`%EoqJ>jG`rD^x@<6T`;B(3 zN@vI~tJ{H$BKWz%ad*n9c<)V@VBVA3E5U0~AB4iTTe`LH`%1X(my4iz5uoEo-> zRelMgaW4~eWkm*mcO*OYra6U4bj(QJ08O{g9C}7y8cg;JQG90cu><-xWl`AisraO-gtx*&<|afGa>Ke4#85ZvPIqkU zd)mLRhp&0vg0Bt@QnCYi9QFmBZnVG~kQX78^4bbK6O5Ad+tDq!9{{ZUb<1IpAbLC` zLEGTBx?pW%`@ylcw7(ptjz+z~w@`J<^^OALl`vk3Fp$^my2;6Gq*|+7h>2($=~r2dQt~?GMDgD6A0_e`^@M=#E&)95Y{J65i7*)(AfO zDt*o$C|h{-DVyW_SpTL5X zRdkCB-WXM{jPYIXd^g}|_B3y@@kHpi=0(bjLy|$EP$Ka_3jG!Z%#HyBHAWf&782&f zW@t;8Q!OLxZ}$%|mwIXAsqZffo9C)xS^T|CgX&%Ifp*z9Y$ilsIl96+Z8u+U{rG0% zOv!oU7eUsc=T+PwaBeUe|140_-?fJJpK9|OX1cv@s>}>AfuL+@-F($oF*l$O=$PiQy)0gH*!F33>4DiPJ4NgB z!%^DwXJfln#yiA!Jk(@J~HJq+SdN7XA7DJx@)J%>xQzh07>J 
zU~@(T^H`W8tMT*)=Gp$bn4}l4-jNoS*FH)0ovfTMX#FVF$y)8!(QRm*H7HclNj+}b zeQ2j(asATV2t39HL|mjy6Ez3bd&bi{W^mfKzC=(PBsh4290sh6Zz-m1RnC)uO(%-@ zJA9%Lwrjo$W=TFEkZMfBBuJTUjPhc1c>Yc&`-yWNvIU6%4jiQM!@zuU{(d;)pL# zgv65200-q0`dEukrMWKs1)V0zE12gbXQBKhzi}$qxj8Qn8vOTvFNbW!xNh`y>q7+c zN*@drURaTpJ;pGqa)mMXvestjnLebR;M!M18}GVziZ(vq)pP9T`%&@r37_fPjeX7w zQv4aoTUM3|=kDjhQtzI*Zp^VX<^{Weuid#rDZt&u*b&eAO;B7Z>x|9K@1u?wt*>2u zZ+FdsyNx-Q%ha(>?ZFzXzN-4!qni}x0AJsIF4=4N6L-n`;i?yIfb;WZ=`xV(TMwi% zxD5H^$F|=ntMJP+2W@$ViTRh&Nbf`fKZIUv)Ukq@zYmDaNA3RT>nb(OjQTlw0pDY_ z?gZY>yrzo~FMXbbgUXHM`X4hX~_;EZY*F1Y{|IHr~aY{wJz2 zP7Orr?}tNFW)+{=Duv1n+r+<`j8)ls!a_KFhnZKXY^!iVRILcO{Mu@+^!Dx3f0Fo_ i5|m@syB8MnPFZ4Mo#o6YXCVB!1g{h|UX;jN1pXi3?2hsP literal 0 HcmV?d00001 diff --git a/71_RayTracingPipeline/docs/Images/shader_binding_table.png b/71_RayTracingPipeline/docs/Images/shader_binding_table.png new file mode 100644 index 0000000000000000000000000000000000000000..b146adeec959656a937e5f8b05b10980f5a08700 GIT binary patch literal 8569 zcmbVy2UJs8*LIX)7^%*nB1lnCIz);fU6f*hAWClmlqS8`1eKvl6C=Gzks>9u&`Ctl z&=HUl0)!5sh0sYz_`~>q@Bjbb%vxVL>#pqFyUxA$+57Cf&p!J((T_peER0-?004mH z!F`P<0KhNpwEu5@`<3=vM2w52i8J0$wC@5c`nmD6lV2Rvb=3iYssyHE8+zLLIgk6M z-T(kA?Jm~#GidkX=K#Pp)(0Bu1_4$ZQ>|{;?n^7gMJAQI)rt1*nP0SKzOc>67S&ZT z8A}dZ+;J-B|5*B7g3mqZ;)N^xSfe%;7Ymwe0B9%|&ME9=3K#xX_2;1rML545>;=P zi>4i|ZSf}AoXW~yLn(X4yNrNJ-C|}s+J%?54#@;mzA@mX>s`Ccr(FO5v#{=)r_v-0 z7kZ)+L=PAjG>AAuyYQLon26>Z6a##g%avw6?E(OlS9ynqs>N9e@$&I4&d#a}!lGl* z&jP8(fwBxgdyIo{NJxxTqf?ggMt3D%w*InmRZY#k5EZQu%0C`PLP)c*ug~87yxoyS z^mH(3?K{^`7y!VBTR+hQ0C$`pbDoYeO_R|N|Hh>LpH%?>0&Jwy3_yQ>z)KFrEYPCH z-*2e>^{2@Ifd2&ot5hAqB=)~QU?vyPJ})otbLq6s0OjQ6ft((nbaUqHaSY}fl2-r;{l{bY5K7*cAob|3 zwK#G{j5L$4!e8^v^k=NC#C_k(m}$+Dx1GM#qX9R@lj4quLQe%*GCtsxbiQSU1nfOj zolL8D7)1bI8j6y#CI}p@e%Ih`DRStskQ7t1`l}_SV@rpHLuT7q>1BOH-4$~75*YA# z(yApuUy-tpbvKBxv1APk1&8H5ST~c5Bw0N_&_U{`b?fggKXjbkFj;t^cSh7K4=4#*^+TxH-E<51^MnPeqOm<=>+39UE9)(Im(-`@9w2y>vrs zn3}88j7e+CYKPvGhw>DNnYh(umpY9hHeE^4F6__{)26=O?^ 
z(&4(Hy}4K+_D@u{I1;1$dks#{ieo`6tVNMc7Y*y&M5j`q+Y|YRt4B9rCUO3f6z4pA zz#kaO<Z?vPXt3PDWAB-@x| zz+iNZJmGMHDz2N_1wGX6N3@8vc$>=&cpaEZx;1&dYh5hyzl+Lz;6+EYzv@-7#z=^_ zVi4oo*S4P;7E7Er<|~+P9eA}X3A!O1MW3VecG_y+Gt5$foVW9O&gMY$A=&1!A5-1M zkIO6hh4V3)X-Ziy_2L)ii-B#LdT;$@B!2HK<=qlSYV+BV20`lx3)ZfjEJNHHsGgnc*`0X_msMr_{_RoKXe zbXjD+@2lb7e6+V0W7#6z32_+P-rBzLCn z^KoPShpY3697V__Q^03Mea?tI4CN_E7iA;Rs3P3VUg|HRw`dEL{EVR>-uLx1G_urP zh-O57%xNTxwy)SZ>g~foeV@i982lC&+jNH*B`^3bG>tVUW=v;4KohSDf8u4pmq5@L z#^om7-7%*#fjQaS6xv=*e&HPgU+n9*^3Yd#2b{JttiO$%sO}0`_HLXK6VaA6W17EG zK-l|Q5|Ss#45$Q%wpVJYw`oZ(>_k4|?0=}__CWnsYHi7}v+=c2DPZA6_eeS^mcd>e z(V>DiV6Uc9yw0~WI~A1;_B4+3G=HmgI=N4Zj2x2Zk#Sbcd;MvCkmbXz&QhqeZAc$I zDv%d9|EBVyQwUY%9n7un3w?CNS7Sbc1amlhG)fD6JxJLW&p_2C)t>k=ELal zdl6nFhc5bvcngCcXY*aG9zM2KB>Wowp*O2%?GdL2aHsaM@>_o*gAb$noqebJk6Z!#2pz=ywm^ zPJZk!vSDY84Az^{cW5}^-2Q~)x&-bk3RY2QNPjrUepq=+9N8%Sr|M8jxksB$&75;n z|IH*{MW>Hx_x&@(5H)tGW{cRDmWAp0A-BYGo{K-#a(<#9Ue|k&#Xz)-DHm~I$03Ac z-9^ippg68#=JueCjUJoF7eqIpG+(#i>G0ow{lNV@+TW*V$<9f;vqW1oe-Sai_IiU4 z((*2>7!L`W(%!BE4#kY_DGEn@B)oYsRzt`eVEK5`l~_1fKFQr8OK?8(7|RkH>=+h3bldPCjBVQkg^?Lbm^Rtfo}0m*v(3ffZB=W( zXj!?c>djpNXJ())=4yW(zVx0=z9;4QN`bFWkd)ZH^%fXK-&4^%Y<>sUOx_ZmNbuye z=Sp^8vwG|vrj^vrpT49?>d#;&TBd9W)AaQfiGmv)nmS2v-+_O5E?ttT5&6CsG*1j5 zt&PF)1379%YS3fr75#zVae;V`nTl(l>xaANudu{TZ6dB^6AKT7k7w=VqY`Sbbpv0r-NR9bl%y6B z8Vqj851)|48sC)GDECxVa1W+KUI)j%49;-+ij%k!J5c_wY2I7uUBZn7rrK}QNM>DN zv;Rb7d5gHpnLZM;6eOeoWNo)2jv#qXWY11J>~aREq}|*l4-3FgWA|zJwj;*z?ujR3 z_zF=7(UF`7-O_EEgupQ_?rXmr@pnMTo7q{9{5|k3D{gKI>*vnnSiT*c>&A5W%Mjy8 z;n=n4UEC!LKFMfW?ie$us;%=gWR;e)vF?{pwbva;G>?__ynQmGv2E|5CkfJyeZl~^ zbH(YT@ty?GaJavqy}Hk{>jS;h*PIgUlO;j?n1n?n`3oX#u~O(xh(LC7O|<)SI^syO z^F)(NkYs)Ejj^alw3P4j>(^l!v>?ouB3-6vCd5!v+_>&M_NreBc*%Dk`+j!14?mvi z*emnF;LkIF??z_4o^{2)0e@>RO5e?cYD?DkYXWcKG~(M*nte74+KjQ z56hw#D~s;_$E#sS#@Z77M#bf)dEq5LAFr#e(>I;6*9!7eE=+wC%SicyTQ);B6P5kp zAjs={=7*SU(k}pj>-mIFX`AxPq&}<1$d#Bt61w(F=RW@F_b&oQ^D<$FtM$bLA1q8u ziY-Pfy~bhu$cXwympNx;yAW@jKnQp3r;V@t>0r*}7fTL$o1mr2yQF)FgTV5KaS~hC 
zQJk&jGcWo%r|CW1YgS~;buw5PBHM=~x#CqL+S$-JQ}?+J1l4Y*noD*vIONIAKFECS zudq_Ro<2sx3yhPFpfOY~&>;>zYB!pr!t09bp9aif1L&7a`&h!=aOc2@pK##=RYJ(} zE^MKm!N$u8QU$sl89V3IwV4(&)DNFOg1axyROa2#h#T-(l+JWeTxC`I+W#;miLqPx zlRC|KS|Ry%DiXAB+!0gMjZ!z$F5;+JYtQnpRucs+Ke1@niAT@l;b2suS|~M>dPr<% z)@?*EMoe(yJ+dou6%uRihxAeV*8O?T*8NUC7Ja@`G7Di=pGF`Kt|#T2SXj)NeQ+|@^wBI@GmA#|?saPJ(~?aPKYmC4fGP!5&^ z6(UQm0@CEW-~09`LeqIL>K1L$L0H^)_O3ZJyW7S_+p{M9*+#8oiRP%?yXHkBrfg&P z1>H|yb;@gr%5l0<=4!n*NR!wxK{HM1n`ga=8O_G$;zp}*S#d&Y`)^S z`P$5%Vc1`@XTSd2w;+^9#z^u>pGn!o=AZcrWt@BY3kp82t;Ca67dDef7hah_ei#E! z;B{)&B|fH}czE6GmYhEplKkoLRv2_XE&ut!(Rk>xm{je1u`grCB2BD8tPZV8R;q}p3da0SoxR9yjVU1%w1 zmgy6MIp~GKJ~~-l_7#_>rLVV@F+5p5c9+$*6dB(D^x0e2+y;}Wssr0UBpP-iua15^cz3^!%$}zw$g?82hJb2 zQ_J~LN!))KVW@>qY*pKamaAXyYwQ_IdYB*=rnflQg#Kh5n!2>qMvixNk(k4_9Hv8T z_p%bBZHNaWjAwUlQ##3l0SVlHX$GNWw zEqW{1m6YPxv2w1%pFE$T08Rsim5&$LnruUAlGr9fqZF3=YEItF;j34UFhg&~*>Se2 z9UOqla++>65f@sAHnFeSv-9)iAT1K&F5KnoCRQ#d%83}I!G;uSG6{~hwZC@>*kzI5z-yUq(S+xfv! zvu?QMWZu@~C2?8H%ltqpx9mB#?YT7&|! 
z>I8!qK_H-&kiSq*UA>4upOvP+e!GbIfINq{c0*ohqG+i}q=awEM7B{woS^w_R1y!^ z)pv(lk(uRh=aJc%0f$`C0@A*rEaM+Sw)D7u`VRSWDWb#?64M>Ikg701t(kKY6|hG| z2NVYvg+ui^R1*$^*m19=wLXa>lD%|mk%n-kQb&p%wmhTIus%UntCGlnqk#V2NGb^TuXN5s%To0na%^k@QHzjjaz+?MOt#+*5>wLnGXP3s# zcD!n-1%1%wgD>1s!v0tU9z2Bz7cB9xWhIOq_3xH^$V&P~mQ9V1q?qQI8AJ{g+G!5;%9=HKv=)ZQq zeMH@Tly4?)4ReWRCUv3&71q?%IO*Z;7jn3+i z`8dmGwF(udb(M8l-h?6)l#)(8KJq5bqZ6DeP0sTr#F2*E-8>pZ@^(HdNtoshwj>Y~ z4Jh_TuF)Iisz~GE3Nf{X?Py6Mdoq9IFjY7RyN9G;)`O};!M^zes|M8;TQBOedc7hk zPN*vbuoCaFrBM#gLLjQ5)SA*2lV~0o0{?AAysk=~ zzHCP@@jGG9VvP4}t54@%Mz+N|dZ$S`z*voCNk4Ic%z&3dN3M`#?c2Nf{+iwdq?<10 zqUwJ`8>4;)158p?vsc$swFrG`YKuu)Ndt2& zV~K254D*u#JnQq+M=;ii$HzQZ24XKO1(OP7?d2SfjcVXggMZ|$75Yr>itv-8b6@Sw z33Y8Ta(1fI8MsZOij8MNQT^^%nL_Fs6ioIl?Ji@j?| zQ)u=(VJCI(-V^0tf=1r1jVkl-AS#Dvd*2?kJ?CZ99m-U2!tjuO#l~vWw<0GisMB&W z^QE{bAV9pr#XzSSTvs`OX>h3u_LH1yvkS;o_BVANlz#ev)6oed+uEn>1+M2Fd!8n} zVx`AgF-NhyD1&X_xp<2qvgFuH)I$M z2hS72@1e$W{dw1^n%hDw?LJG+=02Uit?j>C{s8-iq1c{Mk~${ayBM$+RHW#Il4j3l zktV)xzg;pRz2S!VMuwzXVl zrab6zi}~>}IX|@@6DwC&V?48n?xq$c%yc*TTFK{ekXk!}#-%9{lYQ3L_ys9Nrwc0f zY0XNF9dFmFW}e^bj?3eUKReG0f*MnmFE&C#Megw?(lT%D{`2EjU(PJ9S7uUpnb-uu38oyQM%hII(#Z5X#ZYM{!D}3u*J%L`2>J558UF zY<9>gfDas?g@E1;-=Hq)rU{FeH z#ut^U3bOAgZ^Pv~QAoz_C+INDGyRX%i`=Cl21R8_)z)bxdCaw$F6q;c2qj{{brV6| zjXT%09!bS9#mk`GAsT~jEPk{q$Kctf&>QhY#d%ys!xPSkZ_?rFr82%*y<6kDA~W)- z_|l$O058Ym_INYY_4?~E5dpv;oRM@US)X$4yRq~_5a=zH7RQ-ER`A{HRpwHeEh2Au zJA0BcGv!K;jwDVR+P+0pPT!3Ch$l;8IBc?IQFvX`p33PCWi9F+!_in#sss53OV5-3 zipM-di>?FOuUC_R^AGmbl2Bq1)K5 ze42l;SabJrR;qy7CAVSQWX}cMI+=d0Y1JP2OnIHc{@tf6c@xV;{vysnU{czTKiC*h zJSf?#zqs=?q3cHbG1_Y_=Js$3cTIOwYOgX*wx@ZDcTe{xNhNthAL`tuTe zZKXi0{jf1%u>3xZ{8FaM$0wF+YLJPq)RUzLj_$}iSR2xQxjR3*>|KR;&j*W%ID(H( zr@}&6wt+_+EV(G-(!Dp)w-&cH?#3Sf_{(#a6a!548%mVcT}Hex)blT2Ea^XKmcbU5 z_PQJ1E8}~pR&iOD!)+hur{-xBdai$VjzvkWU#2V=>C_PTLpZSP!*k0vso9yinnvqp z=%BETU2DmTEbOAQ z%Q@RxtF&j6FehVLG`J_@bmqn3$ORw9w0IR)`u6@&OIF8H#UqL8^lPJ!B4YfD>vc;{ z(n{U7{5_=H_gc?;!^Lh3Mp^aJa|E^<`aW)Q^`X7}qZdL|Uz0HQ$A$IY%2}dnT_jt5 
z*?Ou@@2MT+wW#!ft29z6{Q&`>XdW1@tEAgQnY3JbZR4;e$xv&Ubu2h0#%-&D_4FFF zETOLt+|_@r8OG#CPAseT-RRCK;;$zF?Z$V}uC+;8pFV=FV_y^$BfeF)yJp1KocUY~QEV8B`Sg*9S06raf0N_fJjidN5IGi(4&W#@> zKwJ2aK-Fz^~KJr$H}9SOMxV8%%FmasV6U>pWCX& zC>+(7bfPw56adyLem?iR{()!85bWH7UE+^|qoUxrC<6PS(FFpyo5LY;aP3}*{K`37 zP)N>iNA-|Sl*u2&y(`2$^Q^-cmK+i^8d2EuVAgQ_4L`36+qA$E^=k6Qz1}kb94Zu; zMLk;PGmChY?uq*yOv?)X;TVYqo<8HWeHZ;ZllZ@2)wcescmAEr1OR;$v${`9X@9kU zrs4j_2wHBY>4*ybolX7Blm#j}o!WlBt^6O@)BiaF08mE|_D?Eo-7Oj}$^5&{{{?^h zUm@_@BoK-Q*E;@xCOt;CdU%vCnkaK?>Q{xDdvNZBl!(mCvO!q^AtCQ0BN@_k1f_Ws7BMvrOTK9V(0^|m z$O)w;;-F+2-TEK!<8~Vpk+ucE1e&C6BKWzPpkB9-fI6YTkJrE_Z>Bx5&q+Vz2x^+N heFXn91FTDfs#Z0vo4agJEIw83fhI_!;;!}U{{!}yoZJ8a literal 0 HcmV?d00001 diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index ac3befb5e..b5cde7410 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -1188,9 +1188,10 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, // intersection geometries setup { - auto spheresInfoBuffer = ICPUBuffer::create({ NumberOfProceduralGeometries * sizeof(SProceduralGeomInfo) }); - SProceduralGeomInfo* sphereInfos = reinterpret_cast(spheresInfoBuffer->getPointer()); + core::vector proceduralGeoms; + proceduralGeoms.reserve(NumberOfProceduralGeometries); core::vector aabbs; + aabbs.reserve(NumberOfProceduralGeometries); for (int32_t i = 0; i < NumberOfProceduralGeometries; i++) { const auto middle_i = NumberOfProceduralGeometries / 2.0; @@ -1206,7 +1207,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, }, }; - sphereInfos[i] = sphere; + proceduralGeoms.push_back(sphere); aabbs.push_back({ .minimum = sphere.center - sphere.radius, .maximum = sphere.center + sphere.radius, @@ -1216,8 +1217,8 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, { IGPUBuffer::SCreationParams params; params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | 
IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - params.size = spheresInfoBuffer->getSize(); - m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), sphereInfos).move_into(m_proceduralGeomInfoBuffer); + params.size = proceduralGeoms.size() * sizeof(SProceduralGeomInfo); + m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), proceduralGeoms.data()).move_into(m_proceduralGeomInfoBuffer); m_logger->log("Device address : %d", ILogger::ELL_INFO, m_proceduralGeomInfoBuffer->getDeviceAddress()); } From 7b5059cd913f89424c48180fcaaa82d6f0301cd6 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 24 Jan 2025 19:20:28 +0700 Subject: [PATCH 008/296] Fix alignment issue of scratch buffer Signed-off-by: kevyuu --- 71_RayTracingPipeline/main.cpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index b5cde7410..2905e0427 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -1344,7 +1344,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, m_currentImageAcquire = m_surface->acquireNextImage(); #endif size_t totalScratchSize = 0; - + const auto scratchOffsetAlignment = getRequiredDeviceLimits().minAccelerationStructureScratchOffsetAlignment; // build bottom level ASes { @@ -1416,7 +1416,8 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, } scratchSizes[i] = buildSizes.buildScratchSize; - totalScratchSize += buildSizes.buildScratchSize; + totalScratchSize = core::alignUp(totalScratchSize, scratchOffsetAlignment); + totalScratchSize += buildSizes.buildScratchSize, scratchOffsetAlignment; { IGPUBuffer::SCreationParams params; @@ -1456,7 +1457,14 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, { blasBuildInfos[i].dstAS = 
m_gpuBlasList[i].get(); blasBuildInfos[i].scratch.buffer = scratchBuffer; - blasBuildInfos[i].scratch.offset = (i == 0) ? 0u : blasBuildInfos[i - 1].scratch.offset + scratchSizes[i - 1]; + if (i == 0) + { + blasBuildInfos[i].scratch.offset = 0u; + } else + { + const auto unalignedOffset = blasBuildInfos[i - 1].scratch.offset + scratchSizes[i - 1]; + blasBuildInfos[i].scratch.offset = core::alignUp(unalignedOffset, scratchOffsetAlignment); + } buildRangeInfos[i].primitiveCount = primitiveCounts[i]; buildRangeInfos[i].primitiveByteOffset = 0u; From 8cbb0dc19f3cf5c5f562ab2e4be3d0df08c954ec Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 24 Jan 2025 20:19:45 +0700 Subject: [PATCH 009/296] Fix alignment issue of scratch buffer Signed-off-by: kevyuu --- 71_RayTracingPipeline/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index 2905e0427..7a35facc4 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -1344,7 +1344,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, m_currentImageAcquire = m_surface->acquireNextImage(); #endif size_t totalScratchSize = 0; - const auto scratchOffsetAlignment = getRequiredDeviceLimits().minAccelerationStructureScratchOffsetAlignment; + const auto scratchOffsetAlignment = m_device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment; // build bottom level ASes { From a9d5f8bcec54e5c5ea6d0eff4e2bb8a2469a981b Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 24 Jan 2025 20:49:06 +0700 Subject: [PATCH 010/296] Fix alignment issue of scratch buffer Signed-off-by: kevyuu --- 71_RayTracingPipeline/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index 7a35facc4..ce0388ce9 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -1417,7 +1417,7 @@ class 
RaytracingPipelineApp final : public examples::SimpleWindowedApplication, scratchSizes[i] = buildSizes.buildScratchSize; totalScratchSize = core::alignUp(totalScratchSize, scratchOffsetAlignment); - totalScratchSize += buildSizes.buildScratchSize, scratchOffsetAlignment; + totalScratchSize += buildSizes.buildScratchSize; { IGPUBuffer::SCreationParams params; From feedf653f12e3e04ada616b48fc7012219cabb0f Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 24 Jan 2025 22:43:07 +0700 Subject: [PATCH 011/296] Reduce Push Constant Size and skip compacting procedural geometries Signed-off-by: kevyuu --- 71_RayTracingPipeline/app_resources/common.hlsl | 16 +++++++++------- .../app_resources/light_directional.rcall.hlsl | 2 +- .../app_resources/light_point.rcall.hlsl | 2 +- .../app_resources/light_spot.rcall.hlsl | 4 ++-- 71_RayTracingPipeline/main.cpp | 17 ++++------------- 5 files changed, 17 insertions(+), 24 deletions(-) diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl index a35bd3fcd..0b8bb277d 100644 --- a/71_RayTracingPipeline/app_resources/common.hlsl +++ b/71_RayTracingPipeline/app_resources/common.hlsl @@ -75,10 +75,9 @@ struct Light { float32_t3 direction; float32_t3 position; - float32_t intensity; - float32_t innerCutoff; float32_t outerCutoff; - int type; + uint16_t type; + #ifndef __HLSL_VERSION bool operator==(const Light&) const = default; @@ -86,16 +85,19 @@ struct Light }; +static const float LightIntensity = 100.0f; + struct SPushConstants { - Light light; + uint64_t proceduralGeomInfoBuffer; + uint64_t triangleGeomInfoBuffer; float32_t3 camPos; + uint32_t frameCounter; float32_t4x4 invMVP; - uint64_t proceduralGeomInfoBuffer; - uint64_t triangleGeomInfoBuffer; - uint32_t frameCounter; + + Light light; }; diff --git a/71_RayTracingPipeline/app_resources/light_directional.rcall.hlsl b/71_RayTracingPipeline/app_resources/light_directional.rcall.hlsl index d4aeca85e..1eb18be34 100644 --- 
a/71_RayTracingPipeline/app_resources/light_directional.rcall.hlsl +++ b/71_RayTracingPipeline/app_resources/light_directional.rcall.hlsl @@ -6,6 +6,6 @@ void main(inout RayLight cLight) { cLight.outLightDir = normalize(-pc.light.direction); - cLight.outIntensity = 1.0; + cLight.outIntensity = 1; cLight.outLightDistance = 10000000; } diff --git a/71_RayTracingPipeline/app_resources/light_point.rcall.hlsl b/71_RayTracingPipeline/app_resources/light_point.rcall.hlsl index e82d17ec8..2265a98e7 100644 --- a/71_RayTracingPipeline/app_resources/light_point.rcall.hlsl +++ b/71_RayTracingPipeline/app_resources/light_point.rcall.hlsl @@ -7,7 +7,7 @@ void main(inout RayLight cLight) { float32_t3 lDir = pc.light.position - cLight.inHitPosition; float lightDistance = length(lDir); - cLight.outIntensity = pc.light.intensity / (lightDistance * lightDistance); + cLight.outIntensity = LightIntensity / (lightDistance * lightDistance); cLight.outLightDir = normalize(lDir); cLight.outLightDistance = lightDistance; } \ No newline at end of file diff --git a/71_RayTracingPipeline/app_resources/light_spot.rcall.hlsl b/71_RayTracingPipeline/app_resources/light_spot.rcall.hlsl index 5dbc5a830..f1357d30b 100644 --- a/71_RayTracingPipeline/app_resources/light_spot.rcall.hlsl +++ b/71_RayTracingPipeline/app_resources/light_spot.rcall.hlsl @@ -7,10 +7,10 @@ void main(inout RayLight cLight) { float32_t3 lDir = pc.light.position - cLight.inHitPosition; cLight.outLightDistance = length(lDir); - cLight.outIntensity = pc.light.intensity / (cLight.outLightDistance * cLight.outLightDistance); + cLight.outIntensity = LightIntensity / (cLight.outLightDistance * cLight.outLightDistance); cLight.outLightDir = normalize(lDir); float theta = dot(cLight.outLightDir, normalize(-pc.light.direction)); - float epsilon = pc.light.innerCutoff - pc.light.outerCutoff; + float epsilon = - pc.light.outerCutoff; float spotIntensity = clamp((theta - pc.light.outerCutoff) / epsilon, 0.0, 1.0); cLight.outIntensity *= 
spotIntensity; } diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index ce0388ce9..d9cad5947 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -507,27 +507,21 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, ImGui::SliderFloat("zNear", &zNear, 0.1f, 100.f); ImGui::SliderFloat("zFar", &zFar, 110.f, 10000.f); Light m_oldLight = m_light; - ImGui::ListBox("LightType", &m_light.type, s_lightTypeNames, ELT_COUNT); + int light_type = m_light.type; + ImGui::ListBox("LightType", &light_type, s_lightTypeNames, ELT_COUNT); + m_light.type = static_cast(light_type); if (m_light.type == ELT_DIRECTIONAL) { ImGui::SliderFloat3("Light Direction", &m_light.direction.x, -1.f, 1.f); } else if (m_light.type == ELT_POINT) { ImGui::SliderFloat3("Light Position", &m_light.position.x, -20.f, 20.f); - ImGui::SliderFloat("Light Intensity", &m_light.intensity, 0.0f, 500.f); } else if (m_light.type == ELT_SPOT) { ImGui::SliderFloat3("Light Direction", &m_light.direction.x, -1.f, 1.f); ImGui::SliderFloat3("Light Position", &m_light.position.x, -20.f, 20.f); - ImGui::SliderFloat("Light Intensity", &m_light.intensity, 0.0f, 500.f); - float32_t dInnerCutoff = degrees(acos(m_light.innerCutoff)); float32_t dOuterCutoff = degrees(acos(m_light.outerCutoff)); - if (ImGui::SliderFloat("Light Inner Cutoff", &dInnerCutoff, 0.0f, 45.0f)) - { - dInnerCutoff = dInnerCutoff > dOuterCutoff ? 
dOuterCutoff : dInnerCutoff; - m_light.innerCutoff = cos(radians(dInnerCutoff)); - } if (ImGui::SliderFloat("Light Outer Cutoff", &dOuterCutoff, 0.0f, 45.0f)) { m_light.outerCutoff = cos(radians(dOuterCutoff)); @@ -1219,7 +1213,6 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; params.size = proceduralGeoms.size() * sizeof(SProceduralGeomInfo); m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), proceduralGeoms.data()).move_into(m_proceduralGeomInfoBuffer); - m_logger->log("Device address : %d", ILogger::ELL_INFO, m_proceduralGeomInfoBuffer->getDeviceAddress()); } { @@ -1307,7 +1300,6 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_SHADER_BINDING_TABLE_BIT; params.size = bufferSize; m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), pData).move_into(raygenRegion.buffer); - m_logger->log("Device address : %d", ILogger::ELL_INFO, raygenRegion.buffer->getDeviceAddress()); missRegion.buffer = core::smart_refctd_ptr(raygenRegion.buffer); hitRegion.buffer = core::smart_refctd_ptr(raygenRegion.buffer); callableRegion.buffer = core::smart_refctd_ptr(raygenRegion.buffer); @@ -1510,6 +1502,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, core::vector> cleanupBlas(blasCount); for (uint32_t i = 0; i < blasCount; i++) { + if (asSizes[i] == 0) continue; cleanupBlas[i] = m_gpuBlasList[i]; { IGPUBuffer::SCreationParams params; @@ -1671,8 +1664,6 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, Light m_light = { 
.direction = {-1.0f, -1.0f, -0.4f}, .position = {10.0f, 15.0f, 8.0f}, - .intensity = 100.0f, - .innerCutoff = 0.939692621f, // {cos(radians(20.0f))}, .outerCutoff = 0.866025404f, // {cos(radians(30.0f))}, .type = ELT_DIRECTIONAL }; From e369368bfe9fcf1b037aa869e7a1741909ca1ead Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 24 Jan 2025 22:55:22 +0700 Subject: [PATCH 012/296] Fix occlusion of procedural geometries Signed-off-by: kevyuu --- 71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl | 2 +- .../app_resources/raytrace_procedural.rchit.hlsl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl index 734491e7d..462287689 100644 --- a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl @@ -119,7 +119,7 @@ void main(inout ColorPayload p, in BuiltInTriangleIntersectionAttributes attribs const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo)); const VertexData vertexData = fetchVertexData(instID, primID, geom, attribs.barycentrics); const float32_t3 worldPosition = mul(ObjectToWorld3x4(), float32_t4(vertexData.position, 1)); - const float32_t3 worldNormal = mul(vertexData.normal, WorldToObject3x4()).xyz; + const float32_t3 worldNormal = normalize(mul(vertexData.normal, WorldToObject3x4()).xyz); RayLight cLight; cLight.inHitPosition = worldPosition; diff --git a/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl index ef3503346..dd5598105 100644 --- a/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl @@ -44,7 +44,7 @@ void main(inout ColorPayload p, in BuiltInTriangleIntersectionAttributes attribs ShadowPayload 
shadowPayload; shadowPayload.isShadowed = true; shadowPayload.seed = p.seed; - TraceRay(topLevelAS, flags, 0xFF, ERT_OCCLUSION, 0, EMT_PRIMARY, rayDesc, shadowPayload); + TraceRay(topLevelAS, flags, 0xFF, ERT_OCCLUSION, 0, EMT_OCCLUSION, rayDesc, shadowPayload); bool isShadowed = shadowPayload.isShadowed; if (isShadowed) From c2c82d49f918de9f90577156bbeb33edf2e050c1 Mon Sep 17 00:00:00 2001 From: Ali Cheraghi Date: Fri, 24 Jan 2025 18:43:16 +0330 Subject: [PATCH 013/296] 71: cache shaders Signed-off-by: Ali Cheraghi Signed-off-by: kevyuu --- 71_RayTracingPipeline/main.cpp | 129 +++++++++++++++++++++++---------- 1 file changed, 90 insertions(+), 39 deletions(-) diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index d9cad5947..95540c0b9 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -101,45 +101,101 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system))) return false; + smart_refctd_ptr shaderReadCache = nullptr; + smart_refctd_ptr shaderWriteCache = core::make_smart_refctd_ptr(); + auto shaderCachePath = localOutputCWD / "main_pipeline_shader_cache.bin"; - const auto compileShader = [&](const std::string & filePath, const std::string & header = "") -> smart_refctd_ptr { - IAssetLoader::SAssetLoadParams lparams = {}; - lparams.logger = m_logger.get(); - lparams.workingDirectory = ""; - auto bundle = m_assetMgr->getAsset(filePath, lparams); - if (bundle.getContents().empty() || bundle.getAssetType() != IAsset::ET_SHADER) - { - m_logger->log("Shader %s not found!", ILogger::ELL_ERROR, filePath); - exit(-1); - } + core::smart_refctd_ptr shaderReadCacheFile; + { + system::ISystem::future_t> future; + m_system->createFile(future, shaderCachePath.c_str(), system::IFile::ECF_READ); + if (future.wait()) + { + future.acquire().move_into(shaderReadCacheFile); + if (shaderReadCacheFile) + { + const size_t size = 
shaderReadCacheFile->getSize(); + if (size > 0ull) + { + std::vector contents(size); + system::IFile::success_t succ; + shaderReadCacheFile->read(succ, contents.data(), 0, size); + if (succ) + shaderReadCache = IShaderCompiler::CCache::deserialize(contents); + } + } + } + else + m_logger->log("Failed Openning Shader Cache File.", ILogger::ELL_ERROR); + } - const auto assets = bundle.getContents(); - assert(assets.size() == 1); - smart_refctd_ptr sourceRaw = IAsset::castDown(assets[0]); - if (!sourceRaw) - m_logger->log("Fail to load shader source", ILogger::ELL_ERROR, filePath); - smart_refctd_ptr source = CHLSLCompiler::createOverridenCopy( - sourceRaw.get(), - "%s\n", - header.c_str() - ); + } - return m_device->createShader(source.get()); - }; + // Load Custom Shader + auto loadCompileAndCreateShader = [&](const std::string& relPath, const std::string& header = "") -> smart_refctd_ptr + { + IAssetLoader::SAssetLoadParams lp = {}; + lp.logger = m_logger.get(); + lp.workingDirectory = ""; // virtual root + auto assetBundle = m_assetMgr->getAsset(relPath, lp); + const auto assets = assetBundle.getContents(); + if (assets.empty()) + return nullptr; + + // lets go straight from ICPUSpecializedShader to IGPUSpecializedShader + auto sourceRaw = IAsset::castDown(assets[0]); + if (!sourceRaw) + return nullptr; + + smart_refctd_ptr source = CHLSLCompiler::createOverridenCopy( + sourceRaw.get(), + "%s\n", + header.c_str() + ); + + return m_device->createShader({ source.get(), nullptr, shaderReadCache.get(), shaderWriteCache.get() }); + }; - // shader - const auto raygenShader = compileShader("app_resources/raytrace.rgen.hlsl"); - const auto closestHitShader = compileShader("app_resources/raytrace.rchit.hlsl"); - const auto proceduralClosestHitShader = compileShader("app_resources/raytrace_procedural.rchit.hlsl"); - const auto intersectionHitShader = compileShader("app_resources/raytrace.rint.hlsl"); - const auto anyHitShaderColorPayload = 
compileShader("app_resources/raytrace.rahit.hlsl", "#define USE_COLOR_PAYLOAD\n"); - const auto anyHitShaderShadowPayload = compileShader("app_resources/raytrace.rahit.hlsl", "#define USE_SHADOW_PAYLOAD\n"); - const auto missShader = compileShader("app_resources/raytrace.rmiss.hlsl"); - const auto shadowMissShader = compileShader("app_resources/raytraceShadow.rmiss.hlsl"); - const auto directionalLightCallShader = compileShader("app_resources/light_directional.rcall.hlsl"); - const auto pointLightCallShader = compileShader("app_resources/light_point.rcall.hlsl"); - const auto spotLightCallShader = compileShader("app_resources/light_spot.rcall.hlsl"); + // load shaders + const auto raygenShader = loadCompileAndCreateShader("app_resources/raytrace.rgen.hlsl"); + const auto closestHitShader = loadCompileAndCreateShader("app_resources/raytrace.rchit.hlsl"); + const auto proceduralClosestHitShader = loadCompileAndCreateShader("app_resources/raytrace_procedural.rchit.hlsl"); + const auto intersectionHitShader = loadCompileAndCreateShader("app_resources/raytrace.rint.hlsl"); + const auto anyHitShaderColorPayload = loadCompileAndCreateShader("app_resources/raytrace.rahit.hlsl", "#define USE_COLOR_PAYLOAD\n"); + const auto anyHitShaderShadowPayload = loadCompileAndCreateShader("app_resources/raytrace.rahit.hlsl", "#define USE_SHADOW_PAYLOAD\n"); + const auto missShader = loadCompileAndCreateShader("app_resources/raytrace.rmiss.hlsl"); + const auto shadowMissShader = loadCompileAndCreateShader("app_resources/raytraceShadow.rmiss.hlsl"); + const auto directionalLightCallShader = loadCompileAndCreateShader("app_resources/light_directional.rcall.hlsl"); + const auto pointLightCallShader = loadCompileAndCreateShader("app_resources/light_point.rcall.hlsl"); + const auto spotLightCallShader = loadCompileAndCreateShader("app_resources/light_spot.rcall.hlsl"); + const auto fragmentShader = loadCompileAndCreateShader("app_resources/present.frag.hlsl"); + + core::smart_refctd_ptr 
shaderWriteCacheFile; + { + system::ISystem::future_t> future; + m_system->deleteFile(shaderCachePath); // temp solution instead of trimming, to make sure we won't have corrupted json + m_system->createFile(future, shaderCachePath.c_str(), system::IFile::ECF_WRITE); + if (future.wait()) + { + future.acquire().move_into(shaderWriteCacheFile); + if (shaderWriteCacheFile) + { + auto serializedCache = shaderWriteCache->serialize(); + if (shaderWriteCacheFile) + { + system::IFile::success_t succ; + shaderWriteCacheFile->write(succ, serializedCache->getPointer(), 0, serializedCache->getSize()); + if (!succ) + m_logger->log("Failed Writing To Shader Cache File.", ILogger::ELL_ERROR); + } + } + else + m_logger->log("Failed Creating Shader Cache File.", ILogger::ELL_ERROR); + } + else + m_logger->log("Failed Creating Shader Cache File.", ILogger::ELL_ERROR); + } m_semaphore = m_device->createSemaphore(m_realFrameIx); if (!m_semaphore) @@ -382,11 +438,6 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, if (!fsTriProtoPPln) return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!"); - // Load Fragment Shader - auto fragmentShader = compileShader("app_resources/present.frag.hlsl"); - if (!fragmentShader) - return logFail("Failed to Load and Compile Fragment Shader: lumaMeterShader!"); - const IGPUShader::SSpecInfo fragSpec = { .entryPoint = "main", .shader = fragmentShader.get() From 34a3fa9c925af2c7d8abeaaf500a7e79603fa9eb Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 24 Jan 2025 23:43:13 +0700 Subject: [PATCH 014/296] Remove unnecesary log Signed-off-by: kevyuu --- 71_RayTracingPipeline/main.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index 95540c0b9..09a20340f 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -413,7 +413,6 @@ class RaytracingPipelineApp final : public 
examples::SimpleWindowedApplication, if (!createShaderBindingTable(gQueue, m_rayTracingPipeline)) return logFail("Could not create shader binding table"); - m_logger->log("Shader binding table created", system::ILogger::ELL_INFO); } { @@ -1023,7 +1022,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, return logFail("Couldn't create Command Pool for geometry creation!"); const auto defaultMaterial = Material{ - .ambient = {0.1, 0.1, 0.1}, + .ambient = {}, .diffuse = {0.8, 0.3, 0.3}, .specular = {0.8, 0.8, 0.8}, .shininess = 1.0f, @@ -1057,7 +1056,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, .meta = {.type = OT_CUBE, .name = "Cube Mesh 2"}, .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)), .material = { - .ambient = {0.1, 0.1, 0.1}, + .ambient = {}, .diffuse = {0.2, 0.2, 0.8}, .specular = {0.8, 0.8, 0.8}, .shininess = 1.0f, @@ -1069,7 +1068,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, .meta = {.type = OT_CUBE, .name = "Transparent Cube Mesh"}, .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)), .material = { - .ambient = {0.1, 0.1, 0.1}, + .ambient = {}, .diffuse = {0.2, 0.8, 0.2}, .specular = {0.8, 0.8, 0.8}, .shininess = 1.0f, @@ -1244,7 +1243,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, .center = float32_t3((i - middle_i) * 4.0, 2, 5.0), .radius = 1, .material = { - .ambient = {0.1, 0.1, 0.1}, + .ambient = {}, .diffuse = {0.3, 0.2 * i, 0.3}, .specular = {0.8, 0.8, 0.8}, .shininess = 1.0f, From ec8826816de92eb0ccb78e63157af2a721f5f236 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 25 Jan 2025 16:22:04 +0700 Subject: [PATCH 015/296] Change recursion depth to 1 so the demo can be run on more devices. 
Signed-off-by: kevyuu --- 71_RayTracingPipeline/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index 09a20340f..4245a186d 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -404,7 +404,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, shaderGroups.callableGroups[ELT_POINT] = { .shaderIndex = RTDS_POINT_CALL }; shaderGroups.callableGroups[ELT_SPOT] = { .shaderIndex = RTDS_SPOT_CALL }; - params.cached.maxRecursionDepth = 2; + params.cached.maxRecursionDepth = 1; if (!m_device->createRayTracingPipelines(nullptr, { ¶ms, 1 }, &m_rayTracingPipeline)) return logFail("Failed to create ray tracing pipeline"); From d21c22c4b9a5785eae115b0e8e6f83df2bbfaf3b Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 25 Jan 2025 17:05:17 +0700 Subject: [PATCH 016/296] Temporarily remove procedural geometries to debug crash on amd cards. Signed-off-by: kevyuu --- 71_RayTracingPipeline/main.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index 4245a186d..5961ed225 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -1362,8 +1362,8 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, { // plus 1 blas for procedural geometry contains {{var::NumberOfProcedural}} // spheres. 
Each sphere is a primitive instead one instance or geometry - const auto blasCount = m_gpuTriangleGeometries.size() + 1; - const auto proceduralBlasIdx = blasCount - 1; + const auto blasCount = m_gpuTriangleGeometries.size(); + const auto proceduralBlasIdx = m_gpuTriangleGeometries.size(); IQueryPool::SCreationParams qParams{ .queryCount = static_cast(blasCount), .queryType = IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE }; smart_refctd_ptr queryPool = m_device->createQueryPool(std::move(qParams)); @@ -1587,7 +1587,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, // build top level AS { - const uint32_t instancesCount = m_gpuBlasList.size(); + const uint32_t instancesCount = blasCount; core::vector instances(instancesCount); for (uint32_t i = 0; i < instancesCount; i++) { From bc093c73bca79308880e586bb604692025e03f51 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 25 Jan 2025 23:15:05 +0700 Subject: [PATCH 017/296] Use Nabla AABB type instead creating another aabb type Signed-off-by: kevyuu --- .../app_resources/common.hlsl | 6 ------ 71_RayTracingPipeline/main.cpp | 19 +++++++++++-------- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl index 0b8bb277d..8c73fada3 100644 --- a/71_RayTracingPipeline/app_resources/common.hlsl +++ b/71_RayTracingPipeline/app_resources/common.hlsl @@ -22,12 +22,6 @@ struct SProceduralGeomInfo Material material; }; -struct Aabb -{ - float32_t3 minimum; - float32_t3 maximum; -}; - struct STriangleGeomInfo { uint64_t vertexBufferAddress; diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index 5961ed225..0e76e4c72 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -365,7 +365,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, shaders[RTDS_SPOT_CALL] = {.shader = spotLightCallShader.get()}; 
params.layout = pipelineLayout.get(); - params.shaders = std::span(shaders, std::size(shaders)); + params.shaders = std::span(shaders); auto& shaderGroups = params.cached.shaderGroups; @@ -1234,6 +1234,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, { core::vector proceduralGeoms; proceduralGeoms.reserve(NumberOfProceduralGeometries); + using Aabb = IGPUBottomLevelAccelerationStructure::AABB_t; core::vector aabbs; aabbs.reserve(NumberOfProceduralGeometries); for (int32_t i = 0; i < NumberOfProceduralGeometries; i++) @@ -1252,10 +1253,11 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, }; proceduralGeoms.push_back(sphere); - aabbs.push_back({ - .minimum = sphere.center - sphere.radius, - .maximum = sphere.center + sphere.radius, - }); + const auto sphereMin = sphere.center - sphere.radius; + const auto sphereMax = sphere.center + sphere.radius; + aabbs.emplace_back( + vector3d(sphereMin.x, sphereMin.y, sphereMin.z), + vector3d(sphereMax.x, sphereMax.y, sphereMax.z)); } { @@ -1362,7 +1364,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, { // plus 1 blas for procedural geometry contains {{var::NumberOfProcedural}} // spheres. 
Each sphere is a primitive instead one instance or geometry - const auto blasCount = m_gpuTriangleGeometries.size(); + const auto blasCount = m_gpuTriangleGeometries.size() + 1; const auto proceduralBlasIdx = m_gpuTriangleGeometries.size(); IQueryPool::SCreationParams qParams{ .queryCount = static_cast(blasCount), .queryType = IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE }; @@ -1415,8 +1417,9 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, bool isProcedural = i == proceduralBlasIdx; if (isProcedural) { - aabbs.data.buffer = smart_refctd_ptr(m_proceduralAabbBuffer); - aabbs.stride = sizeof(Aabb); + aabbs.data.buffer = smart_refctd_ptr(m_proceduralAabbBuffer); + aabbs.data.offset = 0; + aabbs.stride = sizeof(IGPUBottomLevelAccelerationStructure::AABB_t); aabbs.geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; // only allow opaque for now primitiveCounts[proceduralBlasIdx] = NumberOfProceduralGeometries; From 1dc682adea052395fc0be30044f5b8aafd2ceb1e Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sun, 26 Jan 2025 11:45:10 +0700 Subject: [PATCH 018/296] Fix query compact blas size issue Signed-off-by: kevyuu --- 71_RayTracingPipeline/main.cpp | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index 0e76e4c72..22c745635 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -287,20 +287,6 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, }); - auto assetManager = make_smart_refctd_ptr(smart_refctd_ptr(system)); - auto* geometryCreator = assetManager->getGeometryCreator(); - - // create geometry objects - if (!createGeometries(gQueue, geometryCreator)) - return logFail("Could not create geometries from geometry creator"); - - if (!createAccelerationStructures(getComputeQueue())) - return logFail("Could not create acceleration 
structures"); - - ISampler::SParams samplerParams = { - .AnisotropicFilter = 0 - }; - auto defaultSampler = m_device->createSampler(samplerParams); // ray trace pipeline and descriptor set layout setup { @@ -415,6 +401,21 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, } + auto assetManager = make_smart_refctd_ptr(smart_refctd_ptr(system)); + auto* geometryCreator = assetManager->getGeometryCreator(); + + // create geometry objects + if (!createGeometries(gQueue, geometryCreator)) + return logFail("Could not create geometries from geometry creator"); + + if (!createAccelerationStructures(getComputeQueue())) + return logFail("Could not create acceleration structures"); + + ISampler::SParams samplerParams = { + .AnisotropicFilter = 0 + }; + auto defaultSampler = m_device->createSampler(samplerParams); + { const IGPUDescriptorSetLayout::SBinding bindings[] = { { @@ -1495,7 +1496,6 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, scratchBuffer = createBuffer(params); } - uint32_t queryCount = 0; core::vector buildRangeInfos(blasCount); core::vector pRangeInfos(blasCount); for (uint32_t i = 0; i < blasCount; i++) @@ -1536,7 +1536,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, for (uint32_t i = 0; i < blasCount; i++) ases[i] = m_gpuBlasList[i].get(); if (!cmdbufBlas->writeAccelerationStructureProperties(std::span(ases), IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE, - queryPool.get(), queryCount++)) + queryPool.get(), 0)) return logFail("Failed to write acceleration structure properties!"); cmdbufBlas->endDebugMarker(); @@ -1549,7 +1549,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, // compact blas { core::vector asSizes(blasCount); - if (!m_device->getQueryPoolResults(queryPool.get(), 0, blasCount, asSizes.data(), sizeof(size_t), IQueryPool::WAIT_BIT)) + if (!m_device->getQueryPoolResults(queryPool.get(), 0, blasCount, 
asSizes.data(), sizeof(size_t), bitflag(IQueryPool::WAIT_BIT) | IQueryPool::_64_BIT)) return logFail("Could not get query pool results for AS sizes"); core::vector> cleanupBlas(blasCount); From eab0f70c674f93b86ac4649805ac7baaed8d0af0 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 3 Feb 2025 14:52:11 +0700 Subject: [PATCH 019/296] copied ex 30 to new ex 31 --- 31_HLSLPathTracer/CMakeLists.txt | 37 + .../app_resources/glsl/common.glsl | 811 +++++++++++ .../app_resources/glsl/litByRectangle.comp | 182 +++ .../app_resources/glsl/litBySphere.comp | 60 + .../app_resources/glsl/litByTriangle.comp | 105 ++ .../app_resources/hlsl/present.frag.hlsl | 19 + 31_HLSLPathTracer/config.json.template | 28 + .../include/nbl/this_example/common.hpp | 17 + 31_HLSLPathTracer/main.cpp | 1276 +++++++++++++++++ 31_HLSLPathTracer/pipeline.groovy | 50 + CMakeLists.txt | 2 + 11 files changed, 2587 insertions(+) create mode 100644 31_HLSLPathTracer/CMakeLists.txt create mode 100644 31_HLSLPathTracer/app_resources/glsl/common.glsl create mode 100644 31_HLSLPathTracer/app_resources/glsl/litByRectangle.comp create mode 100644 31_HLSLPathTracer/app_resources/glsl/litBySphere.comp create mode 100644 31_HLSLPathTracer/app_resources/glsl/litByTriangle.comp create mode 100644 31_HLSLPathTracer/app_resources/hlsl/present.frag.hlsl create mode 100644 31_HLSLPathTracer/config.json.template create mode 100644 31_HLSLPathTracer/include/nbl/this_example/common.hpp create mode 100644 31_HLSLPathTracer/main.cpp create mode 100644 31_HLSLPathTracer/pipeline.groovy diff --git a/31_HLSLPathTracer/CMakeLists.txt b/31_HLSLPathTracer/CMakeLists.txt new file mode 100644 index 000000000..07b0fd396 --- /dev/null +++ b/31_HLSLPathTracer/CMakeLists.txt @@ -0,0 +1,37 @@ +include(common RESULT_VARIABLE RES) +if(NOT RES) + message(FATAL_ERROR "common.cmake not found. 
Should be in {repo_root}/cmake directory") +endif() + +if(NBL_BUILD_IMGUI) + set(NBL_INCLUDE_SERACH_DIRECTORIES + "${CMAKE_CURRENT_SOURCE_DIR}/include" + ) + + list(APPEND NBL_LIBRARIES + imtestengine + "${NBL_EXT_IMGUI_UI_LIB}" + ) + + nbl_create_executable_project("" "" "${NBL_INCLUDE_SERACH_DIRECTORIES}" "${NBL_LIBRARIES}" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") + + if(NBL_EMBED_BUILTIN_RESOURCES) + set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData) + set(RESOURCE_DIR "app_resources") + + get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE) + + file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*") + foreach(RES_FILE ${BUILTIN_RESOURCE_FILES}) + LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}") + endforeach() + + ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}") + + LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_}) + endif() +endif() + + diff --git a/31_HLSLPathTracer/app_resources/glsl/common.glsl b/31_HLSLPathTracer/app_resources/glsl/common.glsl new file mode 100644 index 000000000..2463f82cf --- /dev/null +++ b/31_HLSLPathTracer/app_resources/glsl/common.glsl @@ -0,0 +1,811 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h + +// firefly and variance reduction techniques +//#define KILL_DIFFUSE_SPECULAR_PATHS +//#define VISUALIZE_HIGH_VARIANCE + +// debug +//#define NEE_ONLY + +layout(set = 2, binding = 0) uniform sampler2D envMap; +layout(set = 2, binding = 1) uniform usamplerBuffer sampleSequence; +layout(set = 2, binding = 2) uniform usampler2D scramblebuf; + +layout(set=0, binding=0, rgba16f) uniform image2D outImage; + +#ifndef _NBL_GLSL_WORKGROUP_SIZE_ +#define _NBL_GLSL_WORKGROUP_SIZE_ 32 +layout(local_size_x=_NBL_GLSL_WORKGROUP_SIZE_, local_size_y=_NBL_GLSL_WORKGROUP_SIZE_, local_size_z=1) in; +#endif + +ivec2 getCoordinates() { + return ivec2(gl_GlobalInvocationID.xy); +} + +vec2 getTexCoords() { + ivec2 imageSize = imageSize(outImage); + ivec2 iCoords = getCoordinates(); + return vec2(float(iCoords.x) / imageSize.x, 1.0 - float(iCoords.y) / imageSize.y); +} + + +#include +#include +#include + +#include + +layout(push_constant, row_major) uniform constants +{ + mat4 invMVP; + int sampleCount; + int depth; +} PTPushConstant; + +#define INVALID_ID_16BIT 0xffffu +struct Sphere +{ + vec3 position; + float radius2; + uint bsdfLightIDs; +}; + +Sphere Sphere_Sphere(in vec3 position, in float radius, in uint bsdfID, in uint lightID) +{ + Sphere sphere; + sphere.position = position; + sphere.radius2 = radius*radius; + sphere.bsdfLightIDs = bitfieldInsert(bsdfID,lightID,16,16); + return sphere; +} + +// return intersection distance if found, nbl_glsl_FLT_NAN otherwise +float Sphere_intersect(in Sphere sphere, in vec3 origin, in vec3 direction) +{ + vec3 relOrigin = origin-sphere.position; + float relOriginLen2 = dot(relOrigin,relOrigin); + const float radius2 = sphere.radius2; + + float dirDotRelOrigin = dot(direction,relOrigin); + float det = radius2-relOriginLen2+dirDotRelOrigin*dirDotRelOrigin; + + // do some speculative math here + float detsqrt = sqrt(det); + return -dirDotRelOrigin+(relOriginLen2>radius2 
? (-detsqrt):detsqrt); +} + +vec3 Sphere_getNormal(in Sphere sphere, in vec3 position) +{ + const float radiusRcp = inversesqrt(sphere.radius2); + return (position-sphere.position)*radiusRcp; +} + +float Sphere_getSolidAngle_impl(in float cosThetaMax) +{ + return 2.0*nbl_glsl_PI*(1.0-cosThetaMax); +} +float Sphere_getSolidAngle(in Sphere sphere, in vec3 origin) +{ + float cosThetaMax = sqrt(1.0-sphere.radius2/nbl_glsl_lengthSq(sphere.position-origin)); + return Sphere_getSolidAngle_impl(cosThetaMax); +} + + +Sphere spheres[SPHERE_COUNT] = { + Sphere_Sphere(vec3(0.0,-100.5,-1.0),100.0,0u,INVALID_ID_16BIT), + Sphere_Sphere(vec3(2.0,0.0,-1.0),0.5,1u,INVALID_ID_16BIT), + Sphere_Sphere(vec3(0.0,0.0,-1.0),0.5,2u,INVALID_ID_16BIT), + Sphere_Sphere(vec3(-2.0,0.0,-1.0),0.5,3u,INVALID_ID_16BIT), + Sphere_Sphere(vec3(2.0,0.0,1.0),0.5,4u,INVALID_ID_16BIT), + Sphere_Sphere(vec3(0.0,0.0,1.0),0.5,4u,INVALID_ID_16BIT), + Sphere_Sphere(vec3(-2.0,0.0,1.0),0.5,5u,INVALID_ID_16BIT), + Sphere_Sphere(vec3(0.5,1.0,0.5),0.5,6u,INVALID_ID_16BIT) +#if SPHERE_COUNT>8 + ,Sphere_Sphere(vec3(-1.5,1.5,0.0),0.3,INVALID_ID_16BIT,0u) +#endif +}; + + +struct Triangle +{ + vec3 vertex0; + uint bsdfLightIDs; + vec3 vertex1; + uint padding0; + vec3 vertex2; + uint padding1; +}; + +Triangle Triangle_Triangle(in mat3 vertices, in uint bsdfID, in uint lightID) +{ + Triangle tri; + tri.vertex0 = vertices[0]; + tri.vertex1 = vertices[1]; + tri.vertex2 = vertices[2]; + // + tri.bsdfLightIDs = bitfieldInsert(bsdfID, lightID, 16, 16); + return tri; +} + +// return intersection distance if found, nbl_glsl_FLT_NAN otherwise +float Triangle_intersect(in Triangle tri, in vec3 origin, in vec3 direction) +{ + const vec3 edges[2] = vec3[2](tri.vertex1-tri.vertex0,tri.vertex2-tri.vertex0); + + const vec3 h = cross(direction,edges[1]); + const float a = dot(edges[0],h); + + const vec3 relOrigin = origin-tri.vertex0; + + const float u = dot(relOrigin,h)/a; + + const vec3 q = cross(relOrigin,edges[0]); + const float v = 
dot(direction,q)/a; + + const float t = dot(edges[1],q)/a; + + return t>0.f&&u>=0.f&&v>=0.f&&(u+v)<=1.f ? t:nbl_glsl_FLT_NAN; +} + +vec3 Triangle_getNormalTimesArea_impl(in mat2x3 edges) +{ + return cross(edges[0],edges[1])*0.5; +} +vec3 Triangle_getNormalTimesArea(in Triangle tri) +{ + return Triangle_getNormalTimesArea_impl(mat2x3(tri.vertex1-tri.vertex0,tri.vertex2-tri.vertex0)); +} + + + +struct Rectangle +{ + vec3 offset; + uint bsdfLightIDs; + vec3 edge0; + uint padding0; + vec3 edge1; + uint padding1; +}; + +Rectangle Rectangle_Rectangle(in vec3 offset, in vec3 edge0, in vec3 edge1, in uint bsdfID, in uint lightID) +{ + Rectangle rect; + rect.offset = offset; + rect.edge0 = edge0; + rect.edge1 = edge1; + // + rect.bsdfLightIDs = bitfieldInsert(bsdfID, lightID, 16, 16); + return rect; +} + +void Rectangle_getNormalBasis(in Rectangle rect, out mat3 basis, out vec2 extents) +{ + extents = vec2(length(rect.edge0), length(rect.edge1)); + basis[0] = rect.edge0/extents[0]; + basis[1] = rect.edge1/extents[1]; + basis[2] = normalize(cross(basis[0],basis[1])); +} + +// return intersection distance if found, nbl_glsl_FLT_NAN otherwise +float Rectangle_intersect(in Rectangle rect, in vec3 origin, in vec3 direction) +{ + const vec3 h = cross(direction,rect.edge1); + const float a = dot(rect.edge0,h); + + const vec3 relOrigin = origin-rect.offset; + + const float u = dot(relOrigin,h)/a; + + const vec3 q = cross(relOrigin,rect.edge0); + const float v = dot(direction,q)/a; + + const float t = dot(rect.edge1,q)/a; + + const bool intersection = t>0.f&&u>=0.f&&v>=0.f&&u<=1.f&&v<=1.f; + return intersection ? 
t:nbl_glsl_FLT_NAN; +} + +vec3 Rectangle_getNormalTimesArea(in Rectangle rect) +{ + return cross(rect.edge0,rect.edge1); +} + + + +#define DIFFUSE_OP 0u +#define CONDUCTOR_OP 1u +#define DIELECTRIC_OP 2u +#define OP_BITS_OFFSET 0 +#define OP_BITS_SIZE 2 +struct BSDFNode +{ + uvec4 data[2]; +}; + +uint BSDFNode_getType(in BSDFNode node) +{ + return bitfieldExtract(node.data[0].w,OP_BITS_OFFSET,OP_BITS_SIZE); +} +bool BSDFNode_isBSDF(in BSDFNode node) +{ + return BSDFNode_getType(node)==DIELECTRIC_OP; +} +bool BSDFNode_isNotDiffuse(in BSDFNode node) +{ + return BSDFNode_getType(node)!=DIFFUSE_OP; +} +float BSDFNode_getRoughness(in BSDFNode node) +{ + return uintBitsToFloat(node.data[1].w); +} +vec3 BSDFNode_getRealEta(in BSDFNode node) +{ + return uintBitsToFloat(node.data[0].rgb); +} +vec3 BSDFNode_getImaginaryEta(in BSDFNode node) +{ + return uintBitsToFloat(node.data[1].rgb); +} +mat2x3 BSDFNode_getEta(in BSDFNode node) +{ + return mat2x3(BSDFNode_getRealEta(node),BSDFNode_getImaginaryEta(node)); +} +#include +vec3 BSDFNode_getReflectance(in BSDFNode node, in float VdotH) +{ + const vec3 albedoOrRealIoR = uintBitsToFloat(node.data[0].rgb); + if (BSDFNode_isNotDiffuse(node)) + return nbl_glsl_fresnel_conductor(albedoOrRealIoR, BSDFNode_getImaginaryEta(node), VdotH); + else + return albedoOrRealIoR; +} + +float BSDFNode_getNEEProb(in BSDFNode bsdf) +{ + const float alpha = BSDFNode_isNotDiffuse(bsdf) ? 
BSDFNode_getRoughness(bsdf):1.0; + return min(8.0*alpha,1.0); +} + +#include +#include +float getLuma(in vec3 col) +{ + return dot(transpose(nbl_glsl_scRGBtoXYZ)[1],col); +} + +#define BSDF_COUNT 7 +BSDFNode bsdfs[BSDF_COUNT] = { + {{uvec4(floatBitsToUint(vec3(0.8,0.8,0.8)),DIFFUSE_OP),floatBitsToUint(vec4(0.0,0.0,0.0,0.0))}}, + {{uvec4(floatBitsToUint(vec3(0.8,0.4,0.4)),DIFFUSE_OP),floatBitsToUint(vec4(0.0,0.0,0.0,0.0))}}, + {{uvec4(floatBitsToUint(vec3(0.4,0.8,0.4)),DIFFUSE_OP),floatBitsToUint(vec4(0.0,0.0,0.0,0.0))}}, + {{uvec4(floatBitsToUint(vec3(1.02,1.02,1.3)),CONDUCTOR_OP),floatBitsToUint(vec4(1.0,1.0,2.0,0.0))}}, + {{uvec4(floatBitsToUint(vec3(1.02,1.3,1.02)),CONDUCTOR_OP),floatBitsToUint(vec4(1.0,2.0,1.0,0.0))}}, + {{uvec4(floatBitsToUint(vec3(1.02,1.3,1.02)),CONDUCTOR_OP),floatBitsToUint(vec4(1.0,2.0,1.0,0.15))}}, + {{uvec4(floatBitsToUint(vec3(1.4,1.45,1.5)),DIELECTRIC_OP),floatBitsToUint(vec4(0.0,0.0,0.0,0.0625))}} +}; + + +struct Light +{ + vec3 radiance; + uint objectID; +}; + +vec3 Light_getRadiance(in Light light) +{ + return light.radiance; +} +uint Light_getObjectID(in Light light) +{ + return light.objectID; +} + + +#define LIGHT_COUNT 1 +float scene_getLightChoicePdf(in Light light) +{ + return 1.0/float(LIGHT_COUNT); +} + + +#define LIGHT_COUNT 1 +Light lights[LIGHT_COUNT] = +{ + { + vec3(30.0,25.0,15.0), +#ifdef POLYGON_METHOD + 0u +#else + 8u +#endif + } +}; + + + +#define ANY_HIT_FLAG (-2147483648) +#define DEPTH_BITS_COUNT 8 +#define DEPTH_BITS_OFFSET (31-DEPTH_BITS_COUNT) +struct ImmutableRay_t +{ + vec3 origin; + vec3 direction; +#if POLYGON_METHOD==2 + vec3 normalAtOrigin; + bool wasBSDFAtOrigin; +#endif +}; +struct MutableRay_t +{ + float intersectionT; + uint objectID; + /* irrelevant here + uint triangleID; + vec2 barycentrics; + */ +}; +struct Payload_t +{ + vec3 accumulation; + float otherTechniqueHeuristic; + vec3 throughput; + #ifdef KILL_DIFFUSE_SPECULAR_PATHS + bool hasDiffuse; + #endif +}; + +struct Ray_t +{ + ImmutableRay_t 
_immutable; + MutableRay_t _mutable; + Payload_t _payload; +}; + + +#define INTERSECTION_ERROR_BOUND_LOG2 (-8.0) +float getTolerance_common(in uint depth) +{ + float depthRcp = 1.0/float(depth); + return INTERSECTION_ERROR_BOUND_LOG2;// *depthRcp*depthRcp; +} +float getStartTolerance(in uint depth) +{ + return exp2(getTolerance_common(depth)); +} +float getEndTolerance(in uint depth) +{ + return 1.0-exp2(getTolerance_common(depth)+1.0); +} + + +vec2 SampleSphericalMap(vec3 v) +{ + vec2 uv = vec2(atan(v.z, v.x), asin(v.y)); + uv *= nbl_glsl_RECIPROCAL_PI*0.5; + uv += 0.5; + return uv; +} + +void missProgram(in ImmutableRay_t _immutable, inout Payload_t _payload) +{ + vec3 finalContribution = _payload.throughput; + // #define USE_ENVMAP +#ifdef USE_ENVMAP + vec2 uv = SampleSphericalMap(_immutable.direction); + finalContribution *= textureLod(envMap, uv, 0.0).rgb; +#else + const vec3 kConstantEnvLightRadiance = vec3(0.15, 0.21, 0.3); + finalContribution *= kConstantEnvLightRadiance; + _payload.accumulation += finalContribution; +#endif +} + +#include +#include +#include +#include +#include +#include +#include +nbl_glsl_LightSample nbl_glsl_bsdf_cos_generate(in nbl_glsl_AnisotropicViewSurfaceInteraction interaction, in vec3 u, in BSDFNode bsdf, in float monochromeEta, out nbl_glsl_AnisotropicMicrofacetCache _cache) +{ + const float a = BSDFNode_getRoughness(bsdf); + const mat2x3 ior = BSDFNode_getEta(bsdf); + + // fresnel stuff for dielectrics + float orientedEta, rcpOrientedEta; + const bool viewerInsideMedium = nbl_glsl_getOrientedEtas(orientedEta,rcpOrientedEta,interaction.isotropic.NdotV,monochromeEta); + + nbl_glsl_LightSample smpl; + nbl_glsl_AnisotropicMicrofacetCache dummy; + switch (BSDFNode_getType(bsdf)) + { + case DIFFUSE_OP: + smpl = nbl_glsl_oren_nayar_cos_generate(interaction,u.xy,a*a); + break; + case CONDUCTOR_OP: + smpl = nbl_glsl_ggx_cos_generate(interaction,u.xy,a,a,_cache); + break; + default: + smpl = 
nbl_glsl_ggx_dielectric_cos_generate(interaction,u,a,a,monochromeEta,_cache); + break; + } + return smpl; +} + +vec3 nbl_glsl_bsdf_cos_remainder_and_pdf(out float pdf, in nbl_glsl_LightSample _sample, in nbl_glsl_AnisotropicViewSurfaceInteraction interaction, in BSDFNode bsdf, in float monochromeEta, in nbl_glsl_AnisotropicMicrofacetCache _cache) +{ + // are V and L on opposite sides of the surface? + const bool transmitted = nbl_glsl_isTransmissionPath(interaction.isotropic.NdotV,_sample.NdotL); + + // is the BSDF or BRDF, if it is then we make the dot products `abs` before `max(,0.0)` + const bool transmissive = BSDFNode_isBSDF(bsdf); + const float clampedNdotL = nbl_glsl_conditionalAbsOrMax(transmissive,_sample.NdotL,0.0); + const float clampedNdotV = nbl_glsl_conditionalAbsOrMax(transmissive,interaction.isotropic.NdotV,0.0); + + vec3 remainder; + + const float minimumProjVectorLen = 0.00000001; + if (clampedNdotV>minimumProjVectorLen && clampedNdotL>minimumProjVectorLen) + { + // fresnel stuff for conductors (but reflectance also doubles as albedo) + const mat2x3 ior = BSDFNode_getEta(bsdf); + const vec3 reflectance = BSDFNode_getReflectance(bsdf,_cache.isotropic.VdotH); + + // fresnel stuff for dielectrics + float orientedEta, rcpOrientedEta; + const bool viewerInsideMedium = nbl_glsl_getOrientedEtas(orientedEta,rcpOrientedEta,interaction.isotropic.NdotV,monochromeEta); + + // + const float VdotL = dot(interaction.isotropic.V.dir,_sample.L); + + // + const float a = max(BSDFNode_getRoughness(bsdf),0.0001); // TODO: @Crisspl 0-roughness still doesn't work! Also Beckmann has a weird dark rim instead as fresnel!? 
+ const float a2 = a*a; + + // TODO: refactor into Material Compiler-esque thing + switch (BSDFNode_getType(bsdf)) + { + case DIFFUSE_OP: + remainder = reflectance*nbl_glsl_oren_nayar_cos_remainder_and_pdf_wo_clamps(pdf,a*a,VdotL,clampedNdotL,clampedNdotV); + break; + case CONDUCTOR_OP: + remainder = nbl_glsl_ggx_cos_remainder_and_pdf_wo_clamps(pdf,nbl_glsl_ggx_trowbridge_reitz(a2,_cache.isotropic.NdotH2),clampedNdotL,_sample.NdotL2,clampedNdotV,interaction.isotropic.NdotV_squared,reflectance,a2); + break; + default: + remainder = vec3(nbl_glsl_ggx_dielectric_cos_remainder_and_pdf(pdf, _sample, interaction.isotropic, _cache.isotropic, monochromeEta, a*a)); + break; + } + } + else + remainder = vec3(0.0); + return remainder; +} + +layout (constant_id = 0) const int MAX_DEPTH_LOG2 = 4; +layout (constant_id = 1) const int MAX_SAMPLES_LOG2 = 10; + + +#include + +mat2x3 rand3d(in uint protoDimension, in uint _sample, inout nbl_glsl_xoroshiro64star_state_t scramble_state) +{ + mat2x3 retval; + uint address = bitfieldInsert(protoDimension,_sample,MAX_DEPTH_LOG2,MAX_SAMPLES_LOG2); + for (int i=0; i<2u; i++) + { + uvec3 seqVal = texelFetch(sampleSequence,int(address)+i).xyz; + seqVal ^= uvec3(nbl_glsl_xoroshiro64star(scramble_state),nbl_glsl_xoroshiro64star(scramble_state),nbl_glsl_xoroshiro64star(scramble_state)); + retval[i] = vec3(seqVal)*uintBitsToFloat(0x2f800004u); + } + return retval; +} + + +void traceRay_extraShape(inout int objectID, inout float intersectionT, in vec3 origin, in vec3 direction); +int traceRay(inout float intersectionT, in vec3 origin, in vec3 direction) +{ + const bool anyHit = intersectionT!=nbl_glsl_FLT_MAX; + + int objectID = -1; + for (int i=0; i0.0 && tnbl_glsl_FLT_MIN; + // but if we allowed non-watertight transmitters (single water surface), it would make sense just to apply this line by itself + nbl_glsl_AnisotropicMicrofacetCache _cache; + validPath = validPath && nbl_glsl_calcAnisotropicMicrofacetCache(_cache, interaction, nee_sample, 
monochromeEta); + if (lightPdflumaContributionThreshold && traceRay(t,intersection+nee_sample.L*t*getStartTolerance(depth),nee_sample.L)==-1) + ray._payload.accumulation += neeContrib; + }} + } +#if NEE_ONLY + return false; +#endif + // sample BSDF + float bsdfPdf; vec3 bsdfSampleL; + { + nbl_glsl_AnisotropicMicrofacetCache _cache; + nbl_glsl_LightSample bsdf_sample = nbl_glsl_bsdf_cos_generate(interaction,epsilon[1],bsdf,monochromeEta,_cache); + // the value of the bsdf divided by the probability of the sample being generated + throughput *= nbl_glsl_bsdf_cos_remainder_and_pdf(bsdfPdf,bsdf_sample,interaction,bsdf,monochromeEta,_cache); + // + bsdfSampleL = bsdf_sample.L; + } + + // additional threshold + const float lumaThroughputThreshold = lumaContributionThreshold; + if (bsdfPdf>bsdfPdfThreshold && getLuma(throughput)>lumaThroughputThreshold) + { + ray._payload.throughput = throughput; + ray._payload.otherTechniqueHeuristic = neeProbability/bsdfPdf; // numerically stable, don't touch + ray._payload.otherTechniqueHeuristic *= ray._payload.otherTechniqueHeuristic; + + // trace new ray + ray._immutable.origin = intersection+bsdfSampleL*(1.0/*kSceneSize*/)*getStartTolerance(depth); + ray._immutable.direction = bsdfSampleL; + #if POLYGON_METHOD==2 + ray._immutable.normalAtOrigin = interaction.isotropic.N; + ray._immutable.wasBSDFAtOrigin = isBSDF; + #endif + return true; + } + } + return false; +} + +void main() +{ + const ivec2 imageExtents = imageSize(outImage); + const ivec2 coords = getCoordinates(); + vec2 texCoord = vec2(coords) / vec2(imageExtents); + texCoord.y = 1.0 - texCoord.y; + + if (false == (all(lessThanEqual(ivec2(0),coords)) && all(greaterThan(imageExtents,coords)))) { + return; + } + + if (((PTPushConstant.depth-1)>>MAX_DEPTH_LOG2)>0 || ((PTPushConstant.sampleCount-1)>>MAX_SAMPLES_LOG2)>0) + { + vec4 pixelCol = vec4(1.0,0.0,0.0,1.0); + imageStore(outImage, coords, pixelCol); + return; + } + + nbl_glsl_xoroshiro64star_state_t scramble_start_state = 
texelFetch(scramblebuf,coords,0).rg; + const vec2 pixOffsetParam = vec2(1.0)/vec2(textureSize(scramblebuf,0)); + + + const mat4 invMVP = PTPushConstant.invMVP; + + vec4 NDC = vec4(texCoord*vec2(2.0,-2.0)+vec2(-1.0,1.0),0.0,1.0); + vec3 camPos; + { + vec4 tmp = invMVP*NDC; + camPos = tmp.xyz/tmp.w; + NDC.z = 1.0; + } + + vec3 color = vec3(0.0); + float meanLumaSquared = 0.0; + // TODO: if we collapse the nested for loop, then all GPUs will get `PTPushConstant.depth` factor speedup, not just NV with separate PC + for (int i=0; i5.0) + color = vec3(1.0,0.0,0.0); + #endif + + vec4 pixelCol = vec4(color, 1.0); + imageStore(outImage, coords, pixelCol); +} +/** TODO: Improving Rendering + +Now: +- Always MIS (path correlated reuse) +- Test MIS alpha (roughness) scheme + +Many Lights: +- Path Guiding +- Light Importance Lists/Classification +- Spatio-Temporal Reservoir Sampling + +Indirect Light: +- Bidirectional Path Tracing +- Uniform Path Sampling / Vertex Connection and Merging / Path Space Regularization + +Animations: +- A-SVGF / BMFR +**/ \ No newline at end of file diff --git a/31_HLSLPathTracer/app_resources/glsl/litByRectangle.comp b/31_HLSLPathTracer/app_resources/glsl/litByRectangle.comp new file mode 100644 index 000000000..300cef559 --- /dev/null +++ b/31_HLSLPathTracer/app_resources/glsl/litByRectangle.comp @@ -0,0 +1,182 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h + +#version 430 core +#extension GL_GOOGLE_include_directive : require + +#define SPHERE_COUNT 8 +#define POLYGON_METHOD 1 // 0 area sampling, 1 solid angle sampling, 2 approximate projected solid angle sampling +#include "common.glsl" + +#define RECTANGLE_COUNT 1 +const vec3 edge0 = normalize(vec3(2,0,-1)); +const vec3 edge1 = normalize(vec3(2,-5,4)); +Rectangle rectangles[RECTANGLE_COUNT] = { + Rectangle_Rectangle(vec3(-3.8,0.35,1.3),edge0*7.0,edge1*0.1,INVALID_ID_16BIT,0u) +}; + + +void traceRay_extraShape(inout int objectID, inout float intersectionT, in vec3 origin, in vec3 direction) +{ + for (int i=0; i0.0 && t +#include +#include + +float nbl_glsl_light_deferred_pdf(in Light light, in Ray_t ray) +{ + const Rectangle rect = rectangles[Light_getObjectID(light)]; + + const ImmutableRay_t _immutable = ray._immutable; + const vec3 L = _immutable.direction; +#if POLYGON_METHOD==0 + const float dist = ray._mutable.intersectionT; + return dist*dist/abs(dot(Rectangle_getNormalTimesArea(rect),L)); +#else + #ifdef TRIANGLE_REFERENCE + const mat3 sphericalVertices[2] = + { + nbl_glsl_shapes_getSphericalTriangle(mat3(rect.offset,rect.offset+rect.edge0,rect.offset+rect.edge1),_immutable.origin), + nbl_glsl_shapes_getSphericalTriangle(mat3(rect.offset+rect.edge1,rect.offset+rect.edge0,rect.offset+rect.edge0+rect.edge1),_immutable.origin) + }; + float solidAngle[2]; + vec3 cos_vertices[2],sin_vertices[2]; + float cos_a[2],cos_c[2],csc_b[2],csc_c[2]; + for (uint i=0u; i<2u; i++) + solidAngle[i] = nbl_glsl_shapes_SolidAngleOfTriangle(sphericalVertices[i],cos_vertices[i],sin_vertices[i],cos_a[i],cos_c[i],csc_b[i],csc_c[i]); + const float rectSolidAngle = solidAngle[0]+solidAngle[1]; + #if POLYGON_METHOD==1 + return 1.f/rectSolidAngle; + #elif POLYGON_METHOD==2 + // TODO: figure out what breaks for a directly visible light under MIS + if (rectSolidAngle > nbl_glsl_FLT_MIN) + { + const vec2 bary = 
nbl_glsl_barycentric_reconstructBarycentrics(L*ray._mutable.intersectionT+_immutable.origin-rect.offset,mat2x3(rect.edge0,rect.edge1)); + const uint i = bary.x>=0.f&&bary.y>=0.f&&(bary.x+bary.y)<=1.f ? 0u:1u; + + float pdf = nbl_glsl_sampling_probProjectedSphericalTriangleSample(solidAngle[i],cos_vertices[i],sin_vertices[i],cos_a[i],cos_c[i],csc_b[i],csc_c[i],sphericalVertices[i],_immutable.normalAtOrigin,_immutable.wasBSDFAtOrigin,L); + pdf *= solidAngle[i]/rectSolidAngle; + return pdf; + } + else + return nbl_glsl_FLT_INF; + #endif + #else + float pdf; + mat3 rectNormalBasis; + vec2 rectExtents; + Rectangle_getNormalBasis(rect, rectNormalBasis, rectExtents); + vec3 sphR0 = nbl_glsl_shapes_getSphericalRectangle(_immutable.origin, rect.offset, rectNormalBasis); + float solidAngle = nbl_glsl_shapes_SolidAngleOfRectangle(sphR0, rectExtents); + if (solidAngle > nbl_glsl_FLT_MIN) + { + #if POLYGON_METHOD==1 + pdf = 1.f/solidAngle; + #else + #error + #endif + } + else + pdf = nbl_glsl_FLT_INF; + return pdf; + #endif +#endif +} + +vec3 nbl_glsl_light_generate_and_pdf(out float pdf, out float newRayMaxT, in vec3 origin, in nbl_glsl_AnisotropicViewSurfaceInteraction interaction, in bool isBSDF, in vec3 xi, in uint objectID) +{ + const Rectangle rect = rectangles[objectID]; + const vec3 N = Rectangle_getNormalTimesArea(rect); + + const vec3 origin2origin = rect.offset-origin; +#if POLYGON_METHOD==0 + vec3 L = origin2origin+rect.edge0*xi.x+rect.edge1*xi.y; // TODO: refactor + + const float distanceSq = dot(L,L); + const float rcpDistance = inversesqrt(distanceSq); + L *= rcpDistance; + + pdf = distanceSq/abs(dot(N,L)); + newRayMaxT = 1.0/rcpDistance; + return L; +#else + #ifdef TRIANGLE_REFERENCE + const mat3 sphericalVertices[2] = + { + nbl_glsl_shapes_getSphericalTriangle(mat3(rect.offset,rect.offset+rect.edge0,rect.offset+rect.edge1),origin), + 
nbl_glsl_shapes_getSphericalTriangle(mat3(rect.offset+rect.edge1,rect.offset+rect.edge0,rect.offset+rect.edge0+rect.edge1),origin) + }; + float solidAngle[2]; + vec3 cos_vertices[2],sin_vertices[2]; + float cos_a[2],cos_c[2],csc_b[2],csc_c[2]; + for (uint i=0u; i<2u; i++) + solidAngle[i] = nbl_glsl_shapes_SolidAngleOfTriangle(sphericalVertices[i],cos_vertices[i],sin_vertices[i],cos_a[i],cos_c[i],csc_b[i],csc_c[i]); + vec3 L = vec3(0.f,0.f,0.f); + const float rectangleSolidAngle = solidAngle[0]+solidAngle[1]; + if (rectangleSolidAngle > nbl_glsl_FLT_MIN) + { + float rcpTriangleChoiceProb; + const uint i = nbl_glsl_partitionRandVariable(solidAngle[0]/rectangleSolidAngle,xi.z,rcpTriangleChoiceProb) ? 1u:0u; + #if POLYGON_METHOD==1 + L = nbl_glsl_sampling_generateSphericalTriangleSample(solidAngle[i],cos_vertices[i],sin_vertices[i],cos_a[i],cos_c[i],csc_b[i],csc_c[i],sphericalVertices[i],xi.xy); + pdf = 1.f/rectangleSolidAngle; + #elif POLYGON_METHOD==2 + float rcpPdf; + L = nbl_glsl_sampling_generateProjectedSphericalTriangleSample(rcpPdf,solidAngle[i],cos_vertices[i],sin_vertices[i],cos_a[i],cos_c[i],csc_b[i],csc_c[i],sphericalVertices[i],interaction.isotropic.N,isBSDF,xi.xy); + pdf = 1.f/(rcpPdf*rcpTriangleChoiceProb); + #endif + } + else + pdf = nbl_glsl_FLT_INF; + #else + mat3 rectNormalBasis; + vec2 rectExtents; + Rectangle_getNormalBasis(rect, rectNormalBasis, rectExtents); + vec3 sphR0 = nbl_glsl_shapes_getSphericalRectangle(origin, rect.offset, rectNormalBasis); + vec3 L = vec3(0.f,0.f,0.f); + float solidAngle; + vec2 sphUv = nbl_glsl_sampling_generateSphericalRectangleSample(sphR0, rectExtents, xi.xy, solidAngle); + if (solidAngle > nbl_glsl_FLT_MIN) + { + #if POLYGON_METHOD==1 + vec3 sph_sample = sphUv[0] * rect.edge0 + sphUv[1] * rect.edge1 + rect.offset; + L = normalize(sph_sample - origin); + pdf = 1.f/solidAngle; + #else + #error + #endif + } + else + pdf = nbl_glsl_FLT_INF; + #endif + newRayMaxT = dot(N,origin2origin)/dot(N,L); + return L; +#endif +} + 
+ +uint getBSDFLightIDAndDetermineNormal(out vec3 normal, in uint objectID, in vec3 intersection) +{ + if (objectID0.0) + { + const float rcpDistance = inversesqrt(distanceSQ); + Z *= rcpDistance; + + const float cosThetaMax = sqrt(cosThetaMax2); + const float cosTheta = mix(1.0,cosThetaMax,xi.x); + + vec3 L = Z*cosTheta; + + const float cosTheta2 = cosTheta*cosTheta; + const float sinTheta = sqrt(1.0-cosTheta2); + float sinPhi,cosPhi; + nbl_glsl_sincos(2.0*nbl_glsl_PI*xi.y-nbl_glsl_PI,sinPhi,cosPhi); + mat2x3 XY = nbl_glsl_frisvad(Z); + + L += (XY[0]*cosPhi+XY[1]*sinPhi)*sinTheta; + + newRayMaxT = (cosTheta-sqrt(cosTheta2-cosThetaMax2))/rcpDistance; + pdf = 1.0/Sphere_getSolidAngle_impl(cosThetaMax); + return L; + } + pdf = 0.0; + return vec3(0.0,0.0,0.0); +} + +uint getBSDFLightIDAndDetermineNormal(out vec3 normal, in uint objectID, in vec3 intersection) +{ + Sphere sphere = spheres[objectID]; + normal = Sphere_getNormal(sphere,intersection); + return sphere.bsdfLightIDs; +} \ No newline at end of file diff --git a/31_HLSLPathTracer/app_resources/glsl/litByTriangle.comp b/31_HLSLPathTracer/app_resources/glsl/litByTriangle.comp new file mode 100644 index 000000000..ba23c82e5 --- /dev/null +++ b/31_HLSLPathTracer/app_resources/glsl/litByTriangle.comp @@ -0,0 +1,105 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h + +#version 430 core +#extension GL_GOOGLE_include_directive : require + +#define SPHERE_COUNT 8 +#define POLYGON_METHOD 1 // 0 area sampling, 1 solid angle sampling, 2 approximate projected solid angle sampling +#include "common.glsl" + +#define TRIANGLE_COUNT 1 +Triangle triangles[TRIANGLE_COUNT] = { + Triangle_Triangle(mat3(vec3(-1.8,0.35,0.3),vec3(-1.2,0.35,0.0),vec3(-1.5,0.8,-0.3))*10.0,INVALID_ID_16BIT,0u) +}; + +void traceRay_extraShape(inout int objectID, inout float intersectionT, in vec3 origin, in vec3 direction) +{ + for (int i=0; i0.0 && t +float nbl_glsl_light_deferred_pdf(in Light light, in Ray_t ray) +{ + const Triangle tri = triangles[Light_getObjectID(light)]; + + const vec3 L = ray._immutable.direction; +#if POLYGON_METHOD==0 + const float dist = ray._mutable.intersectionT; + return dist*dist/abs(dot(Triangle_getNormalTimesArea(tri),L)); +#else + const ImmutableRay_t _immutable = ray._immutable; + const mat3 sphericalVertices = nbl_glsl_shapes_getSphericalTriangle(mat3(tri.vertex0,tri.vertex1,tri.vertex2),_immutable.origin); + #if POLYGON_METHOD==1 + const float rcpProb = nbl_glsl_shapes_SolidAngleOfTriangle(sphericalVertices); + // if `rcpProb` is NAN then the triangle's solid angle was close to 0.0 + return rcpProb>nbl_glsl_FLT_MIN ? (1.0/rcpProb):nbl_glsl_FLT_MAX; + #elif POLYGON_METHOD==2 + const float pdf = nbl_glsl_sampling_probProjectedSphericalTriangleSample(sphericalVertices,_immutable.normalAtOrigin,_immutable.wasBSDFAtOrigin,L); + // if `pdf` is NAN then the triangle's projected solid angle was close to 0.0, if its close to INF then the triangle was very small + return pdfnbl_glsl_FLT_MIN ? 
(1.0/rcpPdf):0.0; + + const vec3 N = Triangle_getNormalTimesArea(tri); + newRayMaxT = dot(N,tri.vertex0-origin)/dot(N,L); + return L; +#endif +} + + +uint getBSDFLightIDAndDetermineNormal(out vec3 normal, in uint objectID, in vec3 intersection) +{ + if (objectID +using namespace nbl::hlsl; +using namespace ext::FullScreenTriangle; + +// binding 0 set 0 +[[vk::combinedImageSampler]] [[vk::binding(0, 0)]] Texture2D texture; +[[vk::combinedImageSampler]] [[vk::binding(0, 0)]] SamplerState samplerState; + +[[vk::location(0)]] float32_t4 main(SVertexAttributes vxAttr) : SV_Target0 +{ + return float32_t4(texture.Sample(samplerState, vxAttr.uv).rgb, 1.0f); +} \ No newline at end of file diff --git a/31_HLSLPathTracer/config.json.template b/31_HLSLPathTracer/config.json.template new file mode 100644 index 000000000..24adf54fb --- /dev/null +++ b/31_HLSLPathTracer/config.json.template @@ -0,0 +1,28 @@ +{ + "enableParallelBuild": true, + "threadsPerBuildProcess" : 2, + "isExecuted": false, + "scriptPath": "", + "cmake": { + "configurations": [ "Release", "Debug", "RelWithDebInfo" ], + "buildModes": [], + "requiredOptions": [] + }, + "profiles": [ + { + "backend": "vulkan", + "platform": "windows", + "buildModes": [], + "runConfiguration": "Release", + "gpuArchitectures": [] + } + ], + "dependencies": [], + "data": [ + { + "dependencies": [], + "command": [""], + "outputs": [] + } + ] +} diff --git a/31_HLSLPathTracer/include/nbl/this_example/common.hpp b/31_HLSLPathTracer/include/nbl/this_example/common.hpp new file mode 100644 index 000000000..ff3dd8095 --- /dev/null +++ b/31_HLSLPathTracer/include/nbl/this_example/common.hpp @@ -0,0 +1,17 @@ +#ifndef __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__ +#define __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__ + +#include + +// common api +#include "CCamera.hpp" +#include "SimpleWindowedApplication.hpp" +#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" +#include "CEventCallback.hpp" + +// example's own headers 
+#include "nbl/ui/ICursorControl.h" +#include "nbl/ext/ImGui/ImGui.h" +#include "imgui/imgui_internal.h" + +#endif // __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__ \ No newline at end of file diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp new file mode 100644 index 000000000..73434a852 --- /dev/null +++ b/31_HLSLPathTracer/main.cpp @@ -0,0 +1,1276 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#include "nbl/this_example/common.hpp" +#include "nbl/asset/interchange/IImageAssetHandlerBase.h" +#include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h" +#include "nbl/builtin/hlsl/surface_transform.h" + +using namespace nbl; +using namespace core; +using namespace hlsl; +using namespace system; +using namespace asset; +using namespace ui; +using namespace video; + +struct PTPushConstant { + matrix4SIMD invMVP; + int sampleCount; + int depth; +}; + +// TODO: Add a QueryPool for timestamping once its ready +// TODO: Do buffer creation using assConv +class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication +{ + using device_base_t = examples::SimpleWindowedApplication; + using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; + using clock_t = std::chrono::steady_clock; + + enum E_LIGHT_GEOMETRY : uint8_t + { + ELG_SPHERE, + ELG_TRIANGLE, + ELG_RECTANGLE, + ELG_COUNT + }; + + constexpr static inline uint32_t2 WindowDimensions = { 1280, 720 }; + constexpr static inline uint32_t MaxFramesInFlight = 5; + constexpr static inline clock_t::duration DisplayImageDuration = std::chrono::milliseconds(900); + constexpr static inline uint32_t DefaultWorkGroupSize = 16u; + constexpr static inline uint32_t MaxDescriptorCount = 256u; + constexpr static inline uint32_t MaxDepthLog2 = 4u; // 5 + 
constexpr static inline uint32_t MaxSamplesLog2 = 10u; // 18 + constexpr static inline uint32_t MaxBufferDimensions = 3u << MaxDepthLog2; + constexpr static inline uint32_t MaxBufferSamples = 1u << MaxSamplesLog2; + constexpr static inline uint8_t MaxUITextureCount = 1u; + static inline std::string DefaultImagePathsFile = "envmap/envmap_0.exr"; + static inline std::string OwenSamplerFilePath = "owen_sampler_buffer.bin"; + static inline std::array PTShaderPaths = { "app_resources/glsl/litBySphere.comp", "app_resources/glsl/litByTriangle.comp", "app_resources/glsl/litByRectangle.comp" }; + static inline std::string PresentShaderPath = "app_resources/hlsl/present.frag.hlsl"; + + const char* shaderNames[E_LIGHT_GEOMETRY::ELG_COUNT] = { + "ELG_SPHERE", + "ELG_TRIANGLE", + "ELG_RECTANGLE" + }; + + public: + inline ComputeShaderPathtracer(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) + : IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} + + inline bool isComputeOnly() const override { return false; } + + inline core::vector getSurfaces() const override + { + if (!m_surface) + { + { + auto windowCallback = core::make_smart_refctd_ptr(smart_refctd_ptr(m_inputSystem), smart_refctd_ptr(m_logger)); + IWindow::SCreationParams params = {}; + params.callback = core::make_smart_refctd_ptr(); + params.width = WindowDimensions.x; + params.height = WindowDimensions.y; + params.x = 32; + params.y = 32; + params.flags = ui::IWindow::ECF_HIDDEN | IWindow::ECF_BORDERLESS | IWindow::ECF_RESIZABLE; + params.windowCaption = "ComputeShaderPathtracer"; + params.callback = windowCallback; + const_cast&>(m_window) = m_winMgr->createWindow(std::move(params)); + } + + auto surface = CSurfaceVulkanWin32::create(smart_refctd_ptr(m_api), smart_refctd_ptr_static_cast(m_window)); + const_cast&>(m_surface) = nbl::video::CSimpleResizeSurface::create(std::move(surface)); + } + + if (m_surface) 
+ return { {m_surface->getSurface()/*,EQF_NONE*/} }; + + return {}; + } + + inline bool onAppInitialized(smart_refctd_ptr&& system) override + { + // Init systems + { + m_inputSystem = make_smart_refctd_ptr(logger_opt_smart_ptr(smart_refctd_ptr(m_logger))); + + // Remember to call the base class initialization! + if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + if (!asset_base_t::onAppInitialized(std::move(system))) + return false; + + m_semaphore = m_device->createSemaphore(m_realFrameIx); + + if (!m_semaphore) + return logFail("Failed to create semaphore!"); + } + + // Create renderpass and init surface + nbl::video::IGPURenderpass* renderpass; + { + ISwapchain::SCreationParams swapchainParams = { .surface = smart_refctd_ptr(m_surface->getSurface()) }; + if (!swapchainParams.deduceFormat(m_physicalDevice)) + return logFail("Could not choose a Surface Format for the Swapchain!"); + + const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] = + { + { + .srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, + .dstSubpass = 0, + .memoryBarrier = + { + .srcStageMask = asset::PIPELINE_STAGE_FLAGS::COPY_BIT, + .srcAccessMask = asset::ACCESS_FLAGS::TRANSFER_WRITE_BIT, + .dstStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, + .dstAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT + } + }, + { + .srcSubpass = 0, + .dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, + .memoryBarrier = + { + .srcStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, + .srcAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT + } + }, + IGPURenderpass::SCreationParams::DependenciesEnd + }; + + auto scResources = std::make_unique(m_device.get(), swapchainParams.surfaceFormat.format, dependencies); + renderpass = scResources->getRenderpass(); + + if (!renderpass) + return logFail("Failed to create Renderpass!"); + + auto gQueue = 
getGraphicsQueue(); + if (!m_surface || !m_surface->init(gQueue, std::move(scResources), swapchainParams.sharedParams)) + return logFail("Could not create Window & Surface or initialize the Surface!"); + } + + // image upload utils + { + m_scratchSemaphore = m_device->createSemaphore(0); + if (!m_scratchSemaphore) + return logFail("Could not create Scratch Semaphore"); + m_scratchSemaphore->setObjectDebugName("Scratch Semaphore"); + // we don't want to overcomplicate the example with multi-queue + m_intendedSubmit.queue = getGraphicsQueue(); + // wait for nothing before upload + m_intendedSubmit.waitSemaphores = {}; + m_intendedSubmit.waitSemaphores = {}; + // fill later + m_intendedSubmit.scratchCommandBuffers = {}; + m_intendedSubmit.scratchSemaphore = { + .semaphore = m_scratchSemaphore.get(), + .value = 0, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS + }; + } + + // Create command pool and buffers + { + auto gQueue = getGraphicsQueue(); + m_cmdPool = m_device->createCommandPool(gQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + if (!m_cmdPool) + return logFail("Couldn't create Command Pool!"); + + if (!m_cmdPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { m_cmdBufs.data(), MaxFramesInFlight })) + return logFail("Couldn't create Command Buffer!"); + } + + ISampler::SParams samplerParams = { + .AnisotropicFilter = 0 + }; + auto defaultSampler = m_device->createSampler(samplerParams); + + // Create descriptors and pipeline for the pathtracer + { + auto convertDSLayoutCPU2GPU = [&](smart_refctd_ptr cpuLayout) { + auto converter = CAssetConverter::create({ .device = m_device.get() }); + CAssetConverter::SInputs inputs = {}; + inputs.readCache = converter.get(); + inputs.logger = m_logger.get(); + CAssetConverter::SConvertParams params = {}; + params.utilities = m_utils.get(); + + std::get>(inputs.assets) = { &cpuLayout.get(),1 }; + // don't need to assert that we don't need to provide patches since 
layouts are not patchable + //assert(true); + auto reservation = converter->reserve(inputs); + // the `.value` is just a funny way to make the `smart_refctd_ptr` copyable + auto gpuLayout = reservation.getGPUObjects().front().value; + if (!gpuLayout) { + m_logger->log("Failed to convert %s into an IGPUDescriptorSetLayout handle", ILogger::ELL_ERROR); + std::exit(-1); + } + + return gpuLayout; + }; + auto convertDSCPU2GPU = [&](smart_refctd_ptr cpuDS) { + auto converter = CAssetConverter::create({ .device = m_device.get() }); + CAssetConverter::SInputs inputs = {}; + inputs.readCache = converter.get(); + inputs.logger = m_logger.get(); + CAssetConverter::SConvertParams params = {}; + params.utilities = m_utils.get(); + + std::get>(inputs.assets) = { &cpuDS.get(), 1 }; + // don't need to assert that we don't need to provide patches since layouts are not patchable + //assert(true); + auto reservation = converter->reserve(inputs); + // the `.value` is just a funny way to make the `smart_refctd_ptr` copyable + auto gpuDS = reservation.getGPUObjects().front().value; + if (!gpuDS) { + m_logger->log("Failed to convert %s into an IGPUDescriptorSet handle", ILogger::ELL_ERROR); + std::exit(-1); + } + + return gpuDS; + }; + + std::array descriptorSet0Bindings = {}; + std::array descriptorSet3Bindings = {}; + std::array presentDescriptorSetBindings; + + descriptorSet0Bindings[0] = { + .binding = 0u, + .type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE, + .createFlags = ICPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, + .count = 1u, + .immutableSamplers = nullptr + }; + descriptorSet3Bindings[0] = { + .binding = 0u, + .type = nbl::asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER, + .createFlags = ICPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, + .count = 1u, + .immutableSamplers = nullptr + }; + descriptorSet3Bindings[1] = { + .binding 
= 1u, + .type = nbl::asset::IDescriptor::E_TYPE::ET_UNIFORM_TEXEL_BUFFER, + .createFlags = ICPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, + .count = 1u, + .immutableSamplers = nullptr + }; + descriptorSet3Bindings[2] = { + .binding = 2u, + .type = nbl::asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER, + .createFlags = ICPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, + .count = 1u, + .immutableSamplers = nullptr + }; + presentDescriptorSetBindings[0] = { + .binding = 0u, + .type = nbl::asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER, + .createFlags = ICPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = IShader::E_SHADER_STAGE::ESS_FRAGMENT, + .count = 1u, + .immutableSamplers = &defaultSampler + }; + + auto cpuDescriptorSetLayout0 = make_smart_refctd_ptr(descriptorSet0Bindings); + auto cpuDescriptorSetLayout2 = make_smart_refctd_ptr(descriptorSet3Bindings); + + auto gpuDescriptorSetLayout0 = convertDSLayoutCPU2GPU(cpuDescriptorSetLayout0); + auto gpuDescriptorSetLayout2 = convertDSLayoutCPU2GPU(cpuDescriptorSetLayout2); + auto gpuPresentDescriptorSetLayout = m_device->createDescriptorSetLayout(presentDescriptorSetBindings); + + auto cpuDescriptorSet0 = make_smart_refctd_ptr(std::move(cpuDescriptorSetLayout0)); + auto cpuDescriptorSet2 = make_smart_refctd_ptr(std::move(cpuDescriptorSetLayout2)); + + m_descriptorSet0 = convertDSCPU2GPU(cpuDescriptorSet0); + m_descriptorSet2 = convertDSCPU2GPU(cpuDescriptorSet2); + + smart_refctd_ptr presentDSPool; + { + const video::IGPUDescriptorSetLayout* const layouts[] = { gpuPresentDescriptorSetLayout.get() }; + const uint32_t setCounts[] = { 1u }; + presentDSPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_NONE, layouts, setCounts); + } + m_presentDescriptorSet = 
presentDSPool->createDescriptorSet(gpuPresentDescriptorSetLayout); + + // Create Shaders + auto loadAndCompileShader = [&](std::string pathToShader) + { + IAssetLoader::SAssetLoadParams lp = {}; + lp.workingDirectory = localInputCWD; + auto assetBundle = m_assetMgr->getAsset(pathToShader, lp); + const auto assets = assetBundle.getContents(); + if (assets.empty()) + { + m_logger->log("Could not load shader: ", ILogger::ELL_ERROR, pathToShader); + std::exit(-1); + } + + auto source = IAsset::castDown(assets[0]); + // The down-cast should not fail! + assert(source); + + // this time we skip the use of the asset converter since the ICPUShader->IGPUShader path is quick and simple + auto shader = m_device->createShader(source.get()); + if (!shader) + { + m_logger->log("Shader creationed failed: %s!", ILogger::ELL_ERROR, pathToShader); + std::exit(-1); + } + + return shader; + }; + + // Create compute pipelines + { + for (int index = 0; index < E_LIGHT_GEOMETRY::ELG_COUNT; index++) { + auto ptShader = loadAndCompileShader(PTShaderPaths[index]); + const nbl::asset::SPushConstantRange pcRange = { + .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, + .offset = 0, + .size = sizeof(PTPushConstant) + }; + auto ptPipelineLayout = m_device->createPipelineLayout( + { &pcRange, 1 }, + core::smart_refctd_ptr(gpuDescriptorSetLayout0), + nullptr, + core::smart_refctd_ptr(gpuDescriptorSetLayout2), + nullptr + ); + if (!ptPipelineLayout) { + return logFail("Failed to create Pathtracing pipeline layout"); + } + + IGPUComputePipeline::SCreationParams params = {}; + params.layout = ptPipelineLayout.get(); + params.shader.shader = ptShader.get(); + params.shader.entryPoint = "main"; + params.shader.entries = nullptr; + params.shader.requireFullSubgroups = true; + params.shader.requiredSubgroupSize = static_cast(5); + if (!m_device->createComputePipelines(nullptr, { ¶ms, 1 }, m_PTPipelines.data() + index)) { + return logFail("Failed to create compute pipeline!\n"); + } + } + } + + // 
Create graphics pipeline + { + auto scRes = static_cast(m_surface->getSwapchainResources()); + ext::FullScreenTriangle::ProtoPipeline fsTriProtoPPln(m_assetMgr.get(), m_device.get(), m_logger.get()); + if (!fsTriProtoPPln) + return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!"); + + // Load Fragment Shader + auto fragmentShader = loadAndCompileShader(PresentShaderPath); + if (!fragmentShader) + return logFail("Failed to Load and Compile Fragment Shader: lumaMeterShader!"); + + const IGPUShader::SSpecInfo fragSpec = { + .entryPoint = "main", + .shader = fragmentShader.get() + }; + + auto presentLayout = m_device->createPipelineLayout( + {}, + core::smart_refctd_ptr(gpuPresentDescriptorSetLayout), + nullptr, + nullptr, + nullptr + ); + m_presentPipeline = fsTriProtoPPln.createPipeline(fragSpec, presentLayout.get(), scRes->getRenderpass()); + if (!m_presentPipeline) + return logFail("Could not create Graphics Pipeline!"); + + } + } + + // load CPUImages and convert to GPUImages + smart_refctd_ptr envMap, scrambleMap; + { + auto convertImgCPU2GPU = [&](std::span cpuImgs) { + auto queue = getGraphicsQueue(); + auto cmdbuf = m_cmdBufs[0].get(); + cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::NONE); + std::array commandBufferInfo = { cmdbuf }; + core::smart_refctd_ptr imgFillSemaphore = m_device->createSemaphore(0); + imgFillSemaphore->setObjectDebugName("Image Fill Semaphore"); + + auto converter = CAssetConverter::create({ .device = m_device.get() }); + // We don't want to generate mip-maps for these images, to ensure that we must override the default callbacks. 
+ struct SInputs final : CAssetConverter::SInputs + { + // we also need to override this to have concurrent sharing + inline std::span getSharedOwnershipQueueFamilies(const size_t groupCopyID, const asset::ICPUImage* buffer, const CAssetConverter::patch_t& patch) const override + { + if (familyIndices.size() > 1) + return familyIndices; + return {}; + } + + inline uint8_t getMipLevelCount(const size_t groupCopyID, const ICPUImage* image, const CAssetConverter::patch_t& patch) const override + { + return image->getCreationParameters().mipLevels; + } + inline uint16_t needToRecomputeMips(const size_t groupCopyID, const ICPUImage* image, const CAssetConverter::patch_t& patch) const override + { + return 0b0u; + } + + std::vector familyIndices; + } inputs = {}; + inputs.readCache = converter.get(); + inputs.logger = m_logger.get(); + { + const core::set uniqueFamilyIndices = { queue->getFamilyIndex(), queue->getFamilyIndex() }; + inputs.familyIndices = { uniqueFamilyIndices.begin(),uniqueFamilyIndices.end() }; + } + // scratch command buffers for asset converter transfer commands + SIntendedSubmitInfo transfer = { + .queue = queue, + .waitSemaphores = {}, + .prevCommandBuffers = {}, + .scratchCommandBuffers = commandBufferInfo, + .scratchSemaphore = { + .semaphore = imgFillSemaphore.get(), + .value = 0, + // because of layout transitions + .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS + } + }; + // as per the `SIntendedSubmitInfo` one commandbuffer must be begun + cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + // Normally we'd have to inherit and override the `getFinalOwnerQueueFamily` callback to ensure that the + // compute queue becomes the owner of the buffers and images post-transfer, but in this example we use concurrent sharing + CAssetConverter::SConvertParams params = {}; + params.transfer = &transfer; + params.utilities = m_utils.get(); + + std::get>(inputs.assets) = cpuImgs; + // assert that we don't need to provide patches + 
assert(cpuImgs[0]->getImageUsageFlags().hasFlags(ICPUImage::E_USAGE_FLAGS::EUF_SAMPLED_BIT)); + auto reservation = converter->reserve(inputs); + // the `.value` is just a funny way to make the `smart_refctd_ptr` copyable + auto gpuImgs = reservation.getGPUObjects(); + for (auto& gpuImg : gpuImgs) { + if (!gpuImg) { + m_logger->log("Failed to convert %s into an IGPUImage handle", ILogger::ELL_ERROR, DefaultImagePathsFile); + std::exit(-1); + } + } + + // and launch the conversions + m_api->startCapture(); + auto result = reservation.convert(params); + m_api->endCapture(); + if (!result.blocking() && result.copy() != IQueue::RESULT::SUCCESS) { + m_logger->log("Failed to record or submit conversions", ILogger::ELL_ERROR); + std::exit(-1); + } + + envMap = gpuImgs[0].value; + scrambleMap = gpuImgs[1].value; + }; + + smart_refctd_ptr envMapCPU, scrambleMapCPU; + { + IAssetLoader::SAssetLoadParams lp; + lp.workingDirectory = this->sharedInputCWD; + SAssetBundle bundle = m_assetMgr->getAsset(DefaultImagePathsFile, lp); + if (bundle.getContents().empty()) { + m_logger->log("Couldn't load an asset.", ILogger::ELL_ERROR); + std::exit(-1); + } + + envMapCPU = IAsset::castDown(bundle.getContents()[0]); + if (!envMapCPU) { + m_logger->log("Couldn't load an asset.", ILogger::ELL_ERROR); + std::exit(-1); + } + }; + { + asset::ICPUImage::SCreationParams info; + info.format = asset::E_FORMAT::EF_R32G32_UINT; + info.type = asset::ICPUImage::ET_2D; + auto extent = envMapCPU->getCreationParameters().extent; + info.extent.width = extent.width; + info.extent.height = extent.height; + info.extent.depth = 1u; + info.mipLevels = 1u; + info.arrayLayers = 1u; + info.samples = asset::ICPUImage::E_SAMPLE_COUNT_FLAGS::ESCF_1_BIT; + info.flags = static_cast(0u); + info.usage = asset::IImage::EUF_TRANSFER_SRC_BIT | asset::IImage::EUF_SAMPLED_BIT; + + scrambleMapCPU = ICPUImage::create(std::move(info)); + const uint32_t texelFormatByteSize = 
getTexelOrBlockBytesize(scrambleMapCPU->getCreationParameters().format); + const uint32_t texelBufferSize = scrambleMapCPU->getImageDataSizeInBytes(); + auto texelBuffer = ICPUBuffer::create({ texelBufferSize }); + + core::RandomSampler rng(0xbadc0ffeu); + auto out = reinterpret_cast(texelBuffer->getPointer()); + for (auto index = 0u; index < texelBufferSize / 4; index++) { + out[index] = rng.nextSample(); + } + + auto regions = core::make_refctd_dynamic_array>(1u); + ICPUImage::SBufferCopy& region = regions->front(); + region.imageSubresource.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + region.imageSubresource.mipLevel = 0u; + region.imageSubresource.baseArrayLayer = 0u; + region.imageSubresource.layerCount = 1u; + region.bufferOffset = 0u; + region.bufferRowLength = IImageAssetHandlerBase::calcPitchInBlocks(extent.width, texelFormatByteSize); + region.bufferImageHeight = 0u; + region.imageOffset = { 0u, 0u, 0u }; + region.imageExtent = scrambleMapCPU->getCreationParameters().extent; + + scrambleMapCPU->setBufferAndRegions(std::move(texelBuffer), regions); + } + + std::array cpuImgs = { envMapCPU.get(), scrambleMapCPU.get()}; + convertImgCPU2GPU(cpuImgs); + } + + // create views for textures + { + auto createHDRIImage = [this](const asset::E_FORMAT colorFormat, const uint32_t width, const uint32_t height) -> smart_refctd_ptr { + IGPUImage::SCreationParams imgInfo; + imgInfo.format = colorFormat; + imgInfo.type = IGPUImage::ET_2D; + imgInfo.extent.width = width; + imgInfo.extent.height = height; + imgInfo.extent.depth = 1u; + imgInfo.mipLevels = 1u; + imgInfo.arrayLayers = 1u; + imgInfo.samples = IGPUImage::ESCF_1_BIT; + imgInfo.flags = static_cast(0u); + imgInfo.usage = asset::IImage::EUF_STORAGE_BIT | asset::IImage::EUF_TRANSFER_DST_BIT | asset::IImage::EUF_SAMPLED_BIT; + + auto image = m_device->createImage(std::move(imgInfo)); + auto imageMemReqs = image->getMemoryReqs(); + imageMemReqs.memoryTypeBits &= 
m_device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); + m_device->allocate(imageMemReqs, image.get()); + + return image; + }; + auto createHDRIImageView = [this](smart_refctd_ptr img) -> smart_refctd_ptr + { + auto format = img->getCreationParameters().format; + IGPUImageView::SCreationParams imgViewInfo; + imgViewInfo.image = std::move(img); + imgViewInfo.format = format; + imgViewInfo.viewType = IGPUImageView::ET_2D; + imgViewInfo.flags = static_cast(0u); + imgViewInfo.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + imgViewInfo.subresourceRange.baseArrayLayer = 0u; + imgViewInfo.subresourceRange.baseMipLevel = 0u; + imgViewInfo.subresourceRange.layerCount = 1u; + imgViewInfo.subresourceRange.levelCount = 1u; + + return m_device->createImageView(std::move(imgViewInfo)); + }; + + auto params = envMap->getCreationParameters(); + auto extent = params.extent; + envMap->setObjectDebugName("Env Map"); + m_envMapView = createHDRIImageView(envMap); + m_envMapView->setObjectDebugName("Env Map View"); + scrambleMap->setObjectDebugName("Scramble Map"); + m_scrambleView = createHDRIImageView(scrambleMap); + m_scrambleView->setObjectDebugName("Scramble Map View"); + auto outImg = createHDRIImage(asset::E_FORMAT::EF_R16G16B16A16_SFLOAT, WindowDimensions.x, WindowDimensions.y); + outImg->setObjectDebugName("Output Image"); + m_outImgView = createHDRIImageView(outImg); + m_outImgView->setObjectDebugName("Output Image View"); + } + + // create sequence buffer view + { + // TODO: do this better use asset manager to get the ICPUBuffer from `.bin` + auto createBufferFromCacheFile = [this]( + system::path filename, + size_t bufferSize, + void *data, + smart_refctd_ptr& buffer + ) -> std::pair, bool> + { + ISystem::future_t> owenSamplerFileFuture; + ISystem::future_t owenSamplerFileReadFuture; + size_t owenSamplerFileBytesRead; + + m_system->createFile(owenSamplerFileFuture, localOutputCWD / filename, IFile::ECF_READ); + smart_refctd_ptr owenSamplerFile; 
+
+                if (owenSamplerFileFuture.wait())
+                {
+                    owenSamplerFileFuture.acquire().move_into(owenSamplerFile);
+                    if (!owenSamplerFile)
+                        return { nullptr, false };
+
+                    owenSamplerFile->read(owenSamplerFileReadFuture, data, 0, sizeof(uint32_t) * bufferSize); // `bufferSize` counts uint32_t elements, the file API takes bytes
+                    if (owenSamplerFileReadFuture.wait())
+                    {
+                        owenSamplerFileReadFuture.acquire().move_into(owenSamplerFileBytesRead);
+
+                        if (owenSamplerFileBytesRead < sizeof(uint32_t) * bufferSize) // short or stale cache file: report a miss so the caller regenerates the sequence
+                        {
+                            buffer = asset::ICPUBuffer::create({ sizeof(uint32_t) * bufferSize });
+                            return { owenSamplerFile, false };
+                        }
+
+                        buffer = asset::ICPUBuffer::create({ { sizeof(uint32_t) * bufferSize }, data });
+                    }
+                }
+
+                return { owenSamplerFile, true };
+            };
+            auto writeBufferIntoCacheFile = [this](smart_refctd_ptr<IFile> file, size_t bufferSize, void* data) // `bufferSize` is an element count, matching createBufferFromCacheFile
+            {
+                ISystem::future_t<size_t> owenSamplerFileWriteFuture;
+                size_t owenSamplerFileBytesWritten;
+
+                file->write(owenSamplerFileWriteFuture, data, 0, sizeof(uint32_t) * bufferSize); // write the whole sequence, not just its first quarter
+                if (owenSamplerFileWriteFuture.wait())
+                    owenSamplerFileWriteFuture.acquire().move_into(owenSamplerFileBytesWritten);
+            };
+
+            constexpr size_t bufferSize = MaxBufferDimensions * MaxBufferSamples; // number of uint32_t samples
+            std::array<uint32_t, bufferSize> data = {}; // staging storage the cache file is read into
+            smart_refctd_ptr<ICPUBuffer> sampleSeq;
+
+            auto cacheBufferResult = createBufferFromCacheFile(sharedOutputCWD/OwenSamplerFilePath, bufferSize, data.data(), sampleSeq);
+            if (!cacheBufferResult.second) // cache miss: generate the Owen-scrambled sequence on the CPU
+            {
+                core::OwenSampler sampler(MaxBufferDimensions, 0xdeadbeefu);
+
+                ICPUBuffer::SCreationParams params = {};
+                params.size = MaxBufferDimensions*MaxBufferSamples*sizeof(uint32_t);
+                sampleSeq = ICPUBuffer::create(std::move(params));
+
+                auto out = reinterpret_cast<uint32_t*>(sampleSeq->getPointer());
+                for (auto dim = 0u; dim < MaxBufferDimensions; dim++)
+                    for (uint32_t i = 0; i < MaxBufferSamples; i++)
+                    {
+                        out[i * MaxBufferDimensions + dim] = sampler.sample(dim, i); // sample-major layout: all dimensions of sample i are contiguous
+                    }
+                if (cacheBufferResult.first)
+                    writeBufferIntoCacheFile(cacheBufferResult.first, bufferSize, out);
+            }
+
+            IGPUBuffer::SCreationParams params = {};
+            params.usage = asset::IBuffer::EUF_TRANSFER_DST_BIT |
asset::IBuffer::EUF_UNIFORM_TEXEL_BUFFER_BIT; + params.size = sampleSeq->getSize(); + + // we don't want to overcomplicate the example with multi-queue + auto queue = getGraphicsQueue(); + auto cmdbuf = m_cmdBufs[0].get(); + cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::NONE); + IQueue::SSubmitInfo::SCommandBufferInfo cmdbufInfo = { cmdbuf }; + m_intendedSubmit.scratchCommandBuffers = { &cmdbufInfo, 1 }; + + cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + m_api->startCapture(); + auto bufferFuture = m_utils->createFilledDeviceLocalBufferOnDedMem( + m_intendedSubmit, + std::move(params), + sampleSeq->getPointer() + ); + m_api->endCapture(); + bufferFuture.wait(); + auto buffer = bufferFuture.get(); + + m_sequenceBufferView = m_device->createBufferView({ 0u, buffer->get()->getSize(), *buffer }, asset::E_FORMAT::EF_R32G32B32_UINT); + m_sequenceBufferView->setObjectDebugName("Sequence Buffer"); + } + + // Update Descriptors + { + ISampler::SParams samplerParams0 = { + ISampler::E_TEXTURE_CLAMP::ETC_CLAMP_TO_EDGE, + ISampler::E_TEXTURE_CLAMP::ETC_CLAMP_TO_EDGE, + ISampler::E_TEXTURE_CLAMP::ETC_CLAMP_TO_EDGE, + ISampler::ETBC_FLOAT_OPAQUE_BLACK, + ISampler::ETF_LINEAR, + ISampler::ETF_LINEAR, + ISampler::ESMM_LINEAR, + 0u, + false, + ECO_ALWAYS + }; + auto sampler0 = m_device->createSampler(samplerParams0); + ISampler::SParams samplerParams1 = { + ISampler::E_TEXTURE_CLAMP::ETC_CLAMP_TO_EDGE, + ISampler::E_TEXTURE_CLAMP::ETC_CLAMP_TO_EDGE, + ISampler::E_TEXTURE_CLAMP::ETC_CLAMP_TO_EDGE, + ISampler::ETBC_INT_OPAQUE_BLACK, + ISampler::ETF_NEAREST, + ISampler::ETF_NEAREST, + ISampler::ESMM_NEAREST, + 0u, + false, + ECO_ALWAYS + }; + auto sampler1 = m_device->createSampler(samplerParams1); + + std::array writeDSInfos = {}; + writeDSInfos[0].desc = m_outImgView; + writeDSInfos[0].info.image.imageLayout = IImage::LAYOUT::GENERAL; + writeDSInfos[1].desc = m_envMapView; + // ISampler::SParams samplerParams = { ISampler::ETC_CLAMP_TO_EDGE, 
ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETBC_FLOAT_OPAQUE_BLACK, ISampler::ETF_LINEAR, ISampler::ETF_LINEAR, ISampler::ESMM_LINEAR, 0u, false, ECO_ALWAYS }; + writeDSInfos[1].info.combinedImageSampler.sampler = sampler0; + writeDSInfos[1].info.combinedImageSampler.imageLayout = asset::IImage::LAYOUT::READ_ONLY_OPTIMAL; + writeDSInfos[2].desc = m_sequenceBufferView; + writeDSInfos[3].desc = m_scrambleView; + // ISampler::SParams samplerParams = { ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETBC_INT_OPAQUE_BLACK, ISampler::ETF_NEAREST, ISampler::ETF_NEAREST, ISampler::ESMM_NEAREST, 0u, false, ECO_ALWAYS }; + writeDSInfos[3].info.combinedImageSampler.sampler = sampler1; + writeDSInfos[3].info.combinedImageSampler.imageLayout = asset::IImage::LAYOUT::READ_ONLY_OPTIMAL; + writeDSInfos[4].desc = m_outImgView; + writeDSInfos[4].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL; + + std::array writeDescriptorSets = {}; + writeDescriptorSets[0] = { + .dstSet = m_descriptorSet0.get(), + .binding = 0, + .arrayElement = 0u, + .count = 1u, + .info = &writeDSInfos[0] + }; + writeDescriptorSets[1] = { + .dstSet = m_descriptorSet2.get(), + .binding = 0, + .arrayElement = 0u, + .count = 1u, + .info = &writeDSInfos[1] + }; + writeDescriptorSets[2] = { + .dstSet = m_descriptorSet2.get(), + .binding = 1, + .arrayElement = 0u, + .count = 1u, + .info = &writeDSInfos[2] + }; + writeDescriptorSets[3] = { + .dstSet = m_descriptorSet2.get(), + .binding = 2, + .arrayElement = 0u, + .count = 1u, + .info = &writeDSInfos[3] + }; + writeDescriptorSets[4] = { + .dstSet = m_presentDescriptorSet.get(), + .binding = 0, + .arrayElement = 0u, + .count = 1u, + .info = &writeDSInfos[4] + }; + + m_device->updateDescriptorSets(writeDescriptorSets, {}); + } + + // Create ui descriptors + { + using binding_flags_t = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS; + { + IGPUSampler::SParams params; + 
params.AnisotropicFilter = 1u; + params.TextureWrapU = ISampler::E_TEXTURE_CLAMP::ETC_REPEAT; + params.TextureWrapV = ISampler::E_TEXTURE_CLAMP::ETC_REPEAT; + params.TextureWrapW = ISampler::E_TEXTURE_CLAMP::ETC_REPEAT; + + m_ui.samplers.gui = m_device->createSampler(params); + m_ui.samplers.gui->setObjectDebugName("Nabla IMGUI UI Sampler"); + } + + std::array, 69u> immutableSamplers; + for (auto& it : immutableSamplers) + it = smart_refctd_ptr(m_ui.samplers.scene); + + immutableSamplers[nbl::ext::imgui::UI::FontAtlasTexId] = smart_refctd_ptr(m_ui.samplers.gui); + + nbl::ext::imgui::UI::SCreationParameters params; + + params.resources.texturesInfo = { .setIx = 0u, .bindingIx = 0u }; + params.resources.samplersInfo = { .setIx = 0u, .bindingIx = 1u }; + params.assetManager = m_assetMgr; + params.pipelineCache = nullptr; + params.pipelineLayout = nbl::ext::imgui::UI::createDefaultPipelineLayout(m_utils->getLogicalDevice(), params.resources.texturesInfo, params.resources.samplersInfo, MaxUITextureCount); + params.renderpass = smart_refctd_ptr(renderpass); + params.streamingBuffer = nullptr; + params.subpassIx = 0u; + params.transfer = getTransferUpQueue(); + params.utilities = m_utils; + { + m_ui.manager = ext::imgui::UI::create(std::move(params)); + + // note that we use default layout provided by our extension, but you are free to create your own by filling nbl::ext::imgui::UI::S_CREATION_PARAMETERS::resources + const auto* descriptorSetLayout = m_ui.manager->getPipeline()->getLayout()->getDescriptorSetLayout(0u); + const auto& params = m_ui.manager->getCreationParameters(); + + IDescriptorPool::SCreateInfo descriptorPoolInfo = {}; + descriptorPoolInfo.maxDescriptorCount[static_cast(asset::IDescriptor::E_TYPE::ET_SAMPLER)] = (uint32_t)nbl::ext::imgui::UI::DefaultSamplerIx::COUNT; + descriptorPoolInfo.maxDescriptorCount[static_cast(asset::IDescriptor::E_TYPE::ET_SAMPLED_IMAGE)] = MaxUITextureCount; + descriptorPoolInfo.maxSets = 1u; + descriptorPoolInfo.flags = 
IDescriptorPool::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT; + + m_guiDescriptorSetPool = m_device->createDescriptorPool(std::move(descriptorPoolInfo)); + assert(m_guiDescriptorSetPool); + + m_guiDescriptorSetPool->createDescriptorSets(1u, &descriptorSetLayout, &m_ui.descriptorSet); + assert(m_ui.descriptorSet); + } + } + m_ui.manager->registerListener( + [this]() -> void { + ImGuiIO& io = ImGui::GetIO(); + + m_camera.setProjectionMatrix([&]() + { + static matrix4SIMD projection; + + projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(core::radians(fov), io.DisplaySize.x / io.DisplaySize.y, zNear, zFar); + + return projection; + }()); + + ImGui::SetNextWindowPos(ImVec2(1024, 100), ImGuiCond_Appearing); + ImGui::SetNextWindowSize(ImVec2(256, 256), ImGuiCond_Appearing); + + // create a window and insert the inspector + ImGui::SetNextWindowPos(ImVec2(10, 10), ImGuiCond_Appearing); + ImGui::SetNextWindowSize(ImVec2(320, 340), ImGuiCond_Appearing); + ImGui::Begin("Controls"); + + ImGui::SameLine(); + + ImGui::Text("Camera"); + + ImGui::SliderFloat("Move speed", &moveSpeed, 0.1f, 10.f); + ImGui::SliderFloat("Rotate speed", &rotateSpeed, 0.1f, 10.f); + ImGui::SliderFloat("Fov", &fov, 20.f, 150.f); + ImGui::SliderFloat("zNear", &zNear, 0.1f, 100.f); + ImGui::SliderFloat("zFar", &zFar, 110.f, 10000.f); + ImGui::ListBox("Shader", &PTPipline, shaderNames, E_LIGHT_GEOMETRY::ELG_COUNT); + ImGui::SliderInt("SPP", &spp, 1, MaxBufferSamples); + ImGui::SliderInt("Depth", &depth, 1, MaxBufferDimensions / 3); + + ImGui::Text("X: %f Y: %f", io.MousePos.x, io.MousePos.y); + + ImGui::End(); + } + ); + + // Set Camera + { + core::vectorSIMDf cameraPosition(0, 5, -10); + matrix4SIMD proj = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH( + core::radians(60.0f), + WindowDimensions.x / WindowDimensions.y, + 0.01f, + 500.0f + ); + m_camera = Camera(cameraPosition, core::vectorSIMDf(0, 0, 0), proj); + } + + m_winMgr->setWindowSize(m_window.get(), WindowDimensions.x, 
WindowDimensions.y); + m_surface->recreateSwapchain(); + m_winMgr->show(m_window.get()); + m_oracle.reportBeginFrameRecord(); + m_camera.mapKeysToWASD(); + + return true; + } + + bool updateGUIDescriptorSet() + { + // texture atlas, note we don't create info & write pair for the font sampler because UI extension's is immutable and baked into DS layout + static std::array descriptorInfo; + static IGPUDescriptorSet::SWriteDescriptorSet writes[MaxUITextureCount]; + + descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL; + descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].desc = smart_refctd_ptr(m_ui.manager->getFontAtlasView()); + + for (uint32_t i = 0; i < descriptorInfo.size(); ++i) + { + writes[i].dstSet = m_ui.descriptorSet.get(); + writes[i].binding = 0u; + writes[i].arrayElement = i; + writes[i].count = 1u; + } + writes[nbl::ext::imgui::UI::FontAtlasTexId].info = descriptorInfo.data() + nbl::ext::imgui::UI::FontAtlasTexId; + + return m_device->updateDescriptorSets(writes, {}); + } + + inline void workLoopBody() override + { + // framesInFlight: ensuring safe execution of command buffers and acquires, `framesInFlight` only affect semaphore waits, don't use this to index your resources because it can change with swapchain recreation. + const uint32_t framesInFlight = core::min(MaxFramesInFlight, m_surface->getMaxAcquiresInFlight()); + // We block for semaphores for 2 reasons here: + // A) Resource: Can't use resource like a command buffer BEFORE previous use is finished! [MaxFramesInFlight] + // B) Acquire: Can't have more acquires in flight than a certain threshold returned by swapchain or your surface helper class. 
[MaxAcquiresInFlight] + if (m_realFrameIx >= framesInFlight) + { + const ISemaphore::SWaitInfo cbDonePending[] = + { + { + .semaphore = m_semaphore.get(), + .value = m_realFrameIx + 1 - framesInFlight + } + }; + if (m_device->blockForSemaphores(cbDonePending) != ISemaphore::WAIT_RESULT::SUCCESS) + return; + } + const auto resourceIx = m_realFrameIx % MaxFramesInFlight; + + m_api->startCapture(); + + // CPU events + update(); + + auto queue = getGraphicsQueue(); + auto cmdbuf = m_cmdBufs[resourceIx].get(); + + if (!keepRunning()) + return; + + // render whole scene to offline frame buffer & submit + { + cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::NONE); + // disregard surface/swapchain transformation for now + const auto viewProjectionMatrix = m_camera.getConcatenatedMatrix(); + PTPushConstant pc; + viewProjectionMatrix.getInverseTransform(pc.invMVP); + pc.sampleCount = spp; + pc.depth = depth; + + // safe to proceed + // upload buffer data + cmdbuf->beginDebugMarker("ComputeShaderPathtracer IMGUI Frame"); + cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + + // TRANSITION m_outImgView to GENERAL (because of descriptorSets0 -> ComputeShader Writes into the image) + { + const IGPUCommandBuffer::SImageMemoryBarrier imgBarriers[] = { + { + .barrier = { + .dep = { + .srcStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS, + .srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT, + .dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, + .dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS + } + }, + .image = m_outImgView->getCreationParameters().image.get(), + .subresourceRange = { + .aspectMask = IImage::EAF_COLOR_BIT, + .baseMipLevel = 0u, + .levelCount = 1u, + .baseArrayLayer = 0u, + .layerCount = 1u + }, + .oldLayout = IImage::LAYOUT::UNDEFINED, + .newLayout = IImage::LAYOUT::GENERAL + } + }; + cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imgBarriers }); + } + + // cube envmap handle + { + auto pipeline = 
m_PTPipelines[PTPipline].get(); + cmdbuf->bindComputePipeline(pipeline); + cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &m_descriptorSet0.get()); + cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 2u, 1u, &m_descriptorSet2.get()); + cmdbuf->pushConstants(pipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(PTPushConstant), &pc); + cmdbuf->dispatch(1 + (WindowDimensions.x - 1) / DefaultWorkGroupSize, 1 + (WindowDimensions.y - 1) / DefaultWorkGroupSize, 1u); + } + + // TRANSITION m_outImgView to READ (because of descriptorSets0 -> ComputeShader Writes into the image) + { + const IGPUCommandBuffer::SImageMemoryBarrier imgBarriers[] = { + { + .barrier = { + .dep = { + .srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, + .srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS, + .dstStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT, + .dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS + } + }, + .image = m_outImgView->getCreationParameters().image.get(), + .subresourceRange = { + .aspectMask = IImage::EAF_COLOR_BIT, + .baseMipLevel = 0u, + .levelCount = 1u, + .baseArrayLayer = 0u, + .layerCount = 1u + }, + .oldLayout = IImage::LAYOUT::GENERAL, + .newLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL + } + }; + cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imgBarriers }); + } + + // TODO: tone mapping and stuff + } + + asset::SViewport viewport; + { + viewport.minDepth = 1.f; + viewport.maxDepth = 0.f; + viewport.x = 0u; + viewport.y = 0u; + viewport.width = WindowDimensions.x; + viewport.height = WindowDimensions.y; + } + cmdbuf->setViewport(0u, 1u, &viewport); + + + VkRect2D defaultScisors[] = { {.offset = {(int32_t)viewport.x, (int32_t)viewport.y}, .extent = {(uint32_t)viewport.width, (uint32_t)viewport.height}} }; + cmdbuf->setScissor(defaultScisors); + + const VkRect2D currentRenderArea = + { + .offset = {0,0}, + .extent = {m_window->getWidth(),m_window->getHeight()} + }; + auto scRes = 
static_cast(m_surface->getSwapchainResources()); + + // Upload m_outImg to swapchain + UI + { + const IGPUCommandBuffer::SRenderpassBeginInfo info = + { + .framebuffer = scRes->getFramebuffer(m_currentImageAcquire.imageIndex), + .colorClearValues = &clearColor, + .depthStencilClearValues = nullptr, + .renderArea = currentRenderArea + }; + nbl::video::ISemaphore::SWaitInfo waitInfo = { .semaphore = m_semaphore.get(), .value = m_realFrameIx + 1u }; + + cmdbuf->beginRenderPass(info, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE); + + cmdbuf->bindGraphicsPipeline(m_presentPipeline.get()); + cmdbuf->bindDescriptorSets(EPBP_GRAPHICS, m_presentPipeline->getLayout(), 0, 1u, &m_presentDescriptorSet.get()); + ext::FullScreenTriangle::recordDrawCall(cmdbuf); + + const auto uiParams = m_ui.manager->getCreationParameters(); + auto* uiPipeline = m_ui.manager->getPipeline(); + cmdbuf->bindGraphicsPipeline(uiPipeline); + cmdbuf->bindDescriptorSets(EPBP_GRAPHICS, uiPipeline->getLayout(), uiParams.resources.texturesInfo.setIx, 1u, &m_ui.descriptorSet.get()); + m_ui.manager->render(cmdbuf, waitInfo); + + cmdbuf->endRenderPass(); + } + + cmdbuf->end(); + { + const IQueue::SSubmitInfo::SSemaphoreInfo rendered[] = + { + { + .semaphore = m_semaphore.get(), + .value = ++m_realFrameIx, + .stageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT + } + }; + { + { + const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] = + { + {.cmdbuf = cmdbuf } + }; + + const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = + { + { + .semaphore = m_currentImageAcquire.semaphore, + .value = m_currentImageAcquire.acquireCount, + .stageMask = PIPELINE_STAGE_FLAGS::NONE + } + }; + const IQueue::SSubmitInfo infos[] = + { + { + .waitSemaphores = acquired, + .commandBuffers = commandBuffers, + .signalSemaphores = rendered + } + }; + + updateGUIDescriptorSet(); + + if (queue->submit(infos) != IQueue::RESULT::SUCCESS) + m_realFrameIx--; + } + } + + m_window->setCaption("[Nabla Engine] Computer Path 
Tracer"); + m_surface->present(m_currentImageAcquire.imageIndex, rendered); + } + m_api->endCapture(); + } + + inline bool keepRunning() override + { + if (m_surface->irrecoverable()) + return false; + + return true; + } + + inline bool onAppTerminated() override + { + return device_base_t::onAppTerminated(); + } + + inline void update() + { + m_camera.setMoveSpeed(moveSpeed); + m_camera.setRotateSpeed(rotateSpeed); + + static std::chrono::microseconds previousEventTimestamp{}; + + m_inputSystem->getDefaultMouse(&mouse); + m_inputSystem->getDefaultKeyboard(&keyboard); + + auto updatePresentationTimestamp = [&]() + { + m_currentImageAcquire = m_surface->acquireNextImage(); + + m_oracle.reportEndFrameRecord(); + const auto timestamp = m_oracle.getNextPresentationTimeStamp(); + m_oracle.reportBeginFrameRecord(); + + return timestamp; + }; + + const auto nextPresentationTimestamp = updatePresentationTimestamp(); + + struct + { + std::vector mouse{}; + std::vector keyboard{}; + } capturedEvents; + + m_camera.beginInputProcessing(nextPresentationTimestamp); + { + mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void + { + m_camera.mouseProcess(events); // don't capture the events, only let camera handle them with its impl + + for (const auto& e : events) // here capture + { + if (e.timeStamp < previousEventTimestamp) + continue; + + previousEventTimestamp = e.timeStamp; + capturedEvents.mouse.emplace_back(e); + + if (e.type == nbl::ui::SMouseEvent::EET_SCROLL) + gcIndex = std::clamp(int16_t(gcIndex) + int16_t(core::sign(e.scrollEvent.verticalScroll)), int64_t(0), int64_t(ELG_COUNT - (uint8_t)1u)); + } + }, m_logger.get()); + + keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void + { + m_camera.keyboardProcess(events); // don't capture the events, only let camera handle them with its impl + + for (const auto& e : events) // here capture + { + if (e.timeStamp < previousEventTimestamp) + continue; + + previousEventTimestamp = 
e.timeStamp; + capturedEvents.keyboard.emplace_back(e); + } + }, m_logger.get()); + } + m_camera.endInputProcessing(nextPresentationTimestamp); + + const core::SRange mouseEvents(capturedEvents.mouse.data(), capturedEvents.mouse.data() + capturedEvents.mouse.size()); + const core::SRange keyboardEvents(capturedEvents.keyboard.data(), capturedEvents.keyboard.data() + capturedEvents.keyboard.size()); + const auto cursorPosition = m_window->getCursorControl()->getPosition(); + const auto mousePosition = float32_t2(cursorPosition.x, cursorPosition.y) - float32_t2(m_window->getX(), m_window->getY()); + + const ext::imgui::UI::SUpdateParameters params = + { + .mousePosition = mousePosition, + .displaySize = { m_window->getWidth(), m_window->getHeight() }, + .mouseEvents = mouseEvents, + .keyboardEvents = keyboardEvents + }; + + m_ui.manager->update(params); + } + + private: + smart_refctd_ptr m_window; + smart_refctd_ptr> m_surface; + + // gpu resources + smart_refctd_ptr m_cmdPool; + std::array, E_LIGHT_GEOMETRY::ELG_COUNT> m_PTPipelines; + smart_refctd_ptr m_presentPipeline; + uint64_t m_realFrameIx = 0; + std::array, MaxFramesInFlight> m_cmdBufs; + ISimpleManagedSurface::SAcquireResult m_currentImageAcquire = {}; + smart_refctd_ptr m_descriptorSet0, m_descriptorSet2, m_presentDescriptorSet; + + core::smart_refctd_ptr m_guiDescriptorSetPool; + + // system resources + core::smart_refctd_ptr m_inputSystem; + InputSystem::ChannelReader mouse; + InputSystem::ChannelReader keyboard; + + // pathtracer resources + smart_refctd_ptr m_envMapView, m_scrambleView; + smart_refctd_ptr m_sequenceBufferView; + smart_refctd_ptr m_outImgView; + + // sync + smart_refctd_ptr m_semaphore; + + // image upload resources + smart_refctd_ptr m_scratchSemaphore; + SIntendedSubmitInfo m_intendedSubmit; + + struct C_UI + { + nbl::core::smart_refctd_ptr manager; + + struct + { + core::smart_refctd_ptr gui, scene; + } samplers; + + core::smart_refctd_ptr descriptorSet; + } m_ui; + + Camera 
m_camera; + + video::CDumbPresentationOracle m_oracle; + + uint16_t gcIndex = {}; // note: this is dirty however since I assume only single object in scene I can leave it now, when this example is upgraded to support multiple objects this needs to be changed + + float fov = 60.f, zNear = 0.1f, zFar = 10000.f, moveSpeed = 1.f, rotateSpeed = 1.f; + float viewWidth = 10.f; + float camYAngle = 165.f / 180.f * 3.14159f; + float camXAngle = 32.f / 180.f * 3.14159f; + int PTPipline = E_LIGHT_GEOMETRY::ELG_SPHERE; + int spp = 32; + int depth = 3; + + bool m_firstFrame = true; + IGPUCommandBuffer::SClearColorValue clearColor = { .float32 = {0.f,0.f,0.f,1.f} }; +}; + +NBL_MAIN_FUNC(ComputeShaderPathtracer) diff --git a/31_HLSLPathTracer/pipeline.groovy b/31_HLSLPathTracer/pipeline.groovy new file mode 100644 index 000000000..955e77cec --- /dev/null +++ b/31_HLSLPathTracer/pipeline.groovy @@ -0,0 +1,50 @@ +import org.DevshGraphicsProgramming.Agent +import org.DevshGraphicsProgramming.BuilderInfo +import org.DevshGraphicsProgramming.IBuilder + +class CHLSLPathTracerBuilder extends IBuilder +{ + public CHLSLPathTracerBuilder(Agent _agent, _info) + { + super(_agent, _info) + } + + @Override + public boolean prepare(Map axisMapping) + { + return true + } + + @Override + public boolean build(Map axisMapping) + { + IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION") + IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE") + + def nameOfBuildDirectory = getNameOfBuildDirectory(buildType) + def nameOfConfig = getNameOfConfig(config) + + agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v") + + return true + } + + @Override + public boolean test(Map axisMapping) + { + return true + } + + @Override + public boolean install(Map axisMapping) + { + return true + } +} + +def create(Agent _agent, _info) +{ + return new 
CHLSLPathTracerBuilder(_agent, _info) +} + +return this diff --git a/CMakeLists.txt b/CMakeLists.txt index 935354ed7..aa84caa6b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -70,6 +70,8 @@ if(NBL_BUILD_EXAMPLES) # Showcase compute pathtracing add_subdirectory(30_ComputeShaderPathTracer EXCLUDE_FROM_ALL) + add_subdirectory(31_HLSLPathTracer EXCLUDE_FROM_ALL) + add_subdirectory(38_EXRSplit EXCLUDE_FROM_ALL) # if (NBL_BUILD_MITSUBA_LOADER AND NBL_BUILD_OPTIX) # add_subdirectory(39_DenoiserTonemapper EXCLUDE_FROM_ALL) From 85211238891bff05b749cda5f36c8b2610210666 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 3 Feb 2025 15:51:28 +0700 Subject: [PATCH 020/296] ignore events on imgui focus --- 31_HLSLPathTracer/main.cpp | 49 ++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp index 73434a852..018468e46 100644 --- a/31_HLSLPathTracer/main.cpp +++ b/31_HLSLPathTracer/main.cpp @@ -1112,7 +1112,7 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication } } - m_window->setCaption("[Nabla Engine] Computer Path Tracer"); + m_window->setCaption("[Nabla Engine] HLSL Compute Path Tracer"); m_surface->present(m_currentImageAcquire.imageIndex, rendered); } m_api->endCapture(); @@ -1162,36 +1162,39 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication m_camera.beginInputProcessing(nextPresentationTimestamp); { + const auto& io = ImGui::GetIO(); mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void - { - m_camera.mouseProcess(events); // don't capture the events, only let camera handle them with its impl - - for (const auto& e : events) // here capture { - if (e.timeStamp < previousEventTimestamp) - continue; + if (!io.WantCaptureMouse) + m_camera.mouseProcess(events); // don't capture the events, only let camera handle them with its impl - previousEventTimestamp = e.timeStamp; - 
capturedEvents.mouse.emplace_back(e); + for (const auto& e : events) // here capture + { + if (e.timeStamp < previousEventTimestamp) + continue; - if (e.type == nbl::ui::SMouseEvent::EET_SCROLL) - gcIndex = std::clamp(int16_t(gcIndex) + int16_t(core::sign(e.scrollEvent.verticalScroll)), int64_t(0), int64_t(ELG_COUNT - (uint8_t)1u)); - } - }, m_logger.get()); + previousEventTimestamp = e.timeStamp; + capturedEvents.mouse.emplace_back(e); + if (e.type == nbl::ui::SMouseEvent::EET_SCROLL) + gcIndex = std::clamp(int16_t(gcIndex) + int16_t(core::sign(e.scrollEvent.verticalScroll)), int64_t(0), int64_t(ELG_COUNT - (uint8_t)1u)); + } + }, m_logger.get()); + keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void - { - m_camera.keyboardProcess(events); // don't capture the events, only let camera handle them with its impl - - for (const auto& e : events) // here capture { - if (e.timeStamp < previousEventTimestamp) - continue; + if (!io.WantCaptureKeyboard) + m_camera.keyboardProcess(events); // don't capture the events, only let camera handle them with its impl - previousEventTimestamp = e.timeStamp; - capturedEvents.keyboard.emplace_back(e); - } - }, m_logger.get()); + for (const auto& e : events) // here capture + { + if (e.timeStamp < previousEventTimestamp) + continue; + + previousEventTimestamp = e.timeStamp; + capturedEvents.keyboard.emplace_back(e); + } + }, m_logger.get()); } m_camera.endInputProcessing(nextPresentationTimestamp); From b171724bb0db3bf6f144d6eb077e95ddea806cbd Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 4 Feb 2025 14:16:06 +0700 Subject: [PATCH 021/296] initial files for pathtracer --- .../app_resources/hlsl/common.hlsl | 49 ++++++++++++ .../app_resources/hlsl/intersector.hlsl | 27 +++++++ .../app_resources/hlsl/material_system.hlsl | 20 +++++ .../hlsl/next_event_estimator.hlsl | 20 +++++ .../app_resources/hlsl/pathtracer.hlsl | 32 ++++++++ .../app_resources/hlsl/rand_gen.hlsl | 38 +++++++++ 
.../app_resources/hlsl/ray_gen.hlsl | 80 +++++++++++++++++++ 7 files changed, 266 insertions(+) create mode 100644 31_HLSLPathTracer/app_resources/hlsl/common.hlsl create mode 100644 31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl create mode 100644 31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl create mode 100644 31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl create mode 100644 31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl create mode 100644 31_HLSLPathTracer/app_resources/hlsl/rand_gen.hlsl create mode 100644 31_HLSLPathTracer/app_resources/hlsl/ray_gen.hlsl diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl new file mode 100644 index 000000000..694defc08 --- /dev/null +++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl @@ -0,0 +1,49 @@ +#ifndef _NBL_HLSL_EXT_PATHTRACING_COMMON_INCLUDED_ +#define _NBL_HLSL_EXT_PATHTRACING_COMMON_INCLUDED_ + +namespace nbl +{ +namespace hlsl +{ +namespace ext +{ + +template +struct Payload +{ + using this_t = Payload; + using scalar_type = T; + using vector3_type = vector; + + vector3_type accumulation; + scalar_type otherTechniqueHeuristic; + vector3_type throughput; + // #ifdef KILL_DIFFUSE_SPECULAR_PATHS + // bool hasDiffuse; + // #endif +}; + +template +struct Ray +{ + using this_t = Ray; + using scalar_type = T; + using vector3_type = vector; + + // immutable + vector3_type origin; + vector3_type direction; + // TODO: polygon method == 2 stuff + + // mutable + scalar_type intersectionT; + uint32_t objectID; + + Payload payload; +}; + +} +} +} + +#endif \ No newline at end of file diff --git a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl new file mode 100644 index 000000000..5d12d6d18 --- /dev/null +++ b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl @@ -0,0 +1,27 @@ +#ifndef _NBL_HLSL_EXT_INTERSECTOR_INCLUDED_ +#define 
_NBL_HLSL_EXT_INTERSECTOR_INCLUDED_ + +namespace nbl +{ +namespace hlsl +{ +namespace ext +{ +namespace Intersector +{ + +// ray query method + +// ray tracing pipeline method + +struct Procedural +{ + +}; + +} +} +} +} + +#endif \ No newline at end of file diff --git a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl new file mode 100644 index 000000000..6f635ab68 --- /dev/null +++ b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl @@ -0,0 +1,20 @@ +#ifndef _NBL_HLSL_EXT_MATERIAL_SYSTEM_INCLUDED_ +#define _NBL_HLSL_EXT_MATERIAL_SYSTEM_INCLUDED_ + +namespace nbl +{ +namespace hlsl +{ +namespace ext +{ +namespace MaterialSystem +{ + + + +} +} +} +} + +#endif \ No newline at end of file diff --git a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl new file mode 100644 index 000000000..1afa8d12e --- /dev/null +++ b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl @@ -0,0 +1,20 @@ +#ifndef _NBL_HLSL_EXT_NEXT_EVENT_ESTIMATOR_INCLUDED_ +#define _NBL_HLSL_EXT_NEXT_EVENT_ESTIMATOR_INCLUDED_ + +namespace nbl +{ +namespace hlsl +{ +namespace ext +{ +namespace NextEventEstimator +{ + + + +} +} +} +} + +#endif \ No newline at end of file diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl new file mode 100644 index 000000000..9d2e8c260 --- /dev/null +++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl @@ -0,0 +1,32 @@ +#ifndef _NBL_HLSL_EXT_PATHTRACER_INCLUDED_ +#define _NBL_HLSL_EXT_PATHTRACER_INCLUDED_ + +namespace nbl +{ +namespace hlsl +{ +namespace ext +{ +namespace PathTracer +{ + +template +struct Unidirectional +{ + using this_t = Unidirectional; + + static this_t create(RandGen randGen, + RayGen rayGen, + Intersector intersector, + MaterialSystem materialSystem, + /* PathGuider pathGuider, */ + NextEventEstimator nee) + {} +}; 
+ +} +} +} +} + +#endif \ No newline at end of file diff --git a/31_HLSLPathTracer/app_resources/hlsl/rand_gen.hlsl b/31_HLSLPathTracer/app_resources/hlsl/rand_gen.hlsl new file mode 100644 index 000000000..949c2064b --- /dev/null +++ b/31_HLSLPathTracer/app_resources/hlsl/rand_gen.hlsl @@ -0,0 +1,38 @@ +#ifndef _NBL_HLSL_EXT_RANDGEN_INCLUDED_ +#define _NBL_HLSL_EXT_RANDGEN_INCLUDED_ + +namespace nbl +{ +namespace hlsl +{ +namespace ext +{ +namespace RandGen +{ + +template +struct Uniform3D +{ + using rng_type = RNG; + + static Uniform3D create(uint32_t2 seed) + { + Uniform3D retval; + retval.rng = rng_type::construct(seed); + return retval; + } + + float32_t3 operator()() + { + return float32_t3(uint32_t3(rng(), rng(), rng())); + } + + rng_type rng; +}; + +} +} +} +} + +#endif \ No newline at end of file diff --git a/31_HLSLPathTracer/app_resources/hlsl/ray_gen.hlsl b/31_HLSLPathTracer/app_resources/hlsl/ray_gen.hlsl new file mode 100644 index 000000000..467ef2bd4 --- /dev/null +++ b/31_HLSLPathTracer/app_resources/hlsl/ray_gen.hlsl @@ -0,0 +1,80 @@ +#ifndef _NBL_HLSL_EXT_RAYGEN_INCLUDED_ +#define _NBL_HLSL_EXT_RAYGEN_INCLUDED_ + +#include "common.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace ext +{ +namespace RayGen +{ + +template +struct Basic +{ + using this_t = Basic; + using ray_type = Ray; + using scalar_type = typename Ray::scalar_type; + using vector3_type = typename Ray::vector3_type; + + using vector2_type = vector; + using vector4_type = vector; + using matrix4x4_type = matrix; + + static this_t create(NBL_CONST_REF_ARG(vector2_type) pixOffsetParam, NBL_CONST_REF_ARG(vector3_type) camPos, NBL_CONST_REF_ARG(vector4_type) NDC, NBL_CONST_REF_ARG(matrix4x4_type) invMVP) + { + this_t retval; + retval.pixOffsetParam = pixOffsetParam; + retval.camPos = camPos; + retval.NDC = NDC; + retval.invMVP = invMVP; + return retval; + } + + ray_type generate(NBL_CONST_REF_ARG(vector3_type) randVec) + { + ray_type ray; + ray.origin = camPos; + + vector4_type tmp 
= NDC; + // apply stochastic reconstruction filter + const float gaussianFilterCutoff = 2.5; + const float truncation = nbl::hlsl::exp(-0.5 * gaussianFilterCutoff * gaussianFilterCutoff); + vec2 remappedRand = randVec.xy; + remappedRand.x *= 1.0 - truncation; + remappedRand.x += truncation; + tmp.xy += pixOffsetParam * nbl::hlsl::boxMullerTransform(remappedRand, 1.5); + // for depth of field we could do another stochastic point-pick + tmp = invMVP * tmp; + ray.direction = nbl::hlsl::normalize(tmp.xyz / tmp.w - camPos); + + // #if POLYGON_METHOD==2 + // ray._immutable.normalAtOrigin = vec3(0.0,0.0,0.0); + // ray._immutable.wasBSDFAtOrigin = false; + // #endif + + ray.payload.accumulation = (vector3_type)0.0; + ray.payload.otherTechniqueHeuristic = 0.0; // needed for direct eye-light paths + ray.payload.throughput = (vector3_type)1.0; + // #ifdef KILL_DIFFUSE_SPECULAR_PATHS + // ray._payload.hasDiffuse = false; + // #endif + + return ray; + } + + vector2_type pixOffsetParam; + vector3_type camPos; + vector4_type NDC; + matrix4x4_type invMVP; +}; + +} +} +} +} + +#endif \ No newline at end of file From af35393db518deca935259cab7c414dbad44be20 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 5 Feb 2025 14:16:08 +0700 Subject: [PATCH 022/296] intersection logic --- .../app_resources/hlsl/common.hlsl | 95 +++++++++++++++++++ .../app_resources/hlsl/intersector.hlsl | 39 +++++++- .../app_resources/hlsl/pathtracer.hlsl | 9 ++ 3 files changed, 142 insertions(+), 1 deletion(-) diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl index 694defc08..56a4cace7 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl @@ -1,6 +1,10 @@ #ifndef _NBL_HLSL_EXT_PATHTRACING_COMMON_INCLUDED_ #define _NBL_HLSL_EXT_PATHTRACING_COMMON_INCLUDED_ +#include +#include +#include + namespace nbl { namespace hlsl @@ -42,6 +46,97 @@ struct Ray Payload payload; }; +enum 
PTIntersectionType : uint16_t +{ + PIT_NONE = 0, + PIT_SPHERE, + PIT_TRIANGLE, + PIT_RECTANGLE +}; + +// TODO: check if this works for ambiguous arrays of Intersection +// unsure if calling correct method +struct IIntersection +{ + PTIntersectionType type = PIT_NONE; +}; + +template +struct Intersection : IIntersection +{ + PTIntersectionType type = PIT_NONE; +}; + +template<> +struct Intersection : IIntersection +{ + static Intersection create(NBL_CONST_REF_ARG(float32_t3) position, float32_t radius, uint32_t bsdfID, uint32_t lightID) + { + Intersection retval; + retval.type = PIT_SPHERE; + retval.position = position; + retval.radius2 = radius * radius; + retval.bsdfLightIDs = spirv::bitFieldInsert(bsdfID, lightID, 16, 16); + return retval; + } + + // return intersection distance if found, nan otherwise + float intersect(NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(float32_t3) direction) + { + float32_t3 relOrigin = origin - position; + float relOriginLen2 = nbl::hlsl::dot(relOrigin, relOrigin); + + float dirDotRelOrigin = nbl::hlsl::dot(direction, relOrigin); + float det = radius2 - relOriginLen2 + dirDotRelOrigin * dirDotRelOrigin; + + // do some speculative math here + float detsqrt = nbl::hlsl::sqrt(det); + return -dirDotRelOrigin + (relOriginLen2 > radius2 ? (-detsqrt) : detsqrt); + } + + float32_t3 getNormal(NBL_CONST_REF_ARG(float32_t3) hitPosition) + { + const float radiusRcp = spirv::inverseSqrt(radius2); + return (hitPosition - position) * radiusRcp; + } + + float getSolidAngle(NBL_CONST_REF_ARG(float32_t3) origin) + { + float32_t3 dist = position - origin; + float cosThetaMax = nbl::hlsl::sqrt(1.0 - radius2 / nbl::hlsl::dot(dist, dist)); + return 2.0 * numbers::pi * (1.0 - cosThetaMax); + } + + // should this be in material system? 
+ float deferredPdf(Light light, Ray ray) + { + return 1.0 / getSolidAngle(ray.origin); + } + + float generate_and_pdf() + { + // TODO + } + + float32_t3 generate_and + + float32_t3 position; + float32_t radius2; + uint32_t bsdfLightIDs; +}; + +template<> +struct Intersection : IIntersection +{ + +}; + +template<> +struct Intersection : IIntersection +{ + +}; + } } } diff --git a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl index 5d12d6d18..b2b3d0d2d 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl @@ -1,6 +1,9 @@ #ifndef _NBL_HLSL_EXT_INTERSECTOR_INCLUDED_ #define _NBL_HLSL_EXT_INTERSECTOR_INCLUDED_ +#include "common.hlsl" +#include + namespace nbl { namespace hlsl @@ -11,12 +14,46 @@ namespace Intersector { // ray query method +// ray query struct holds AS info +// pass in address to vertex/index buffers? // ray tracing pipeline method +// does everything in traceray in ex 30 +template struct Procedural { - + using scalar_type = typename Ray::scalar_type; + using ray_type = Ray; + + static int traceRay(NBL_REF_ARG(ray_type) ray, IIntersection objects[32], int objCount) + { + const bool anyHit = ray.intersectionT != numeric_limits::max; + + int objectID = -1; + for (int i = 0; i < objCount; i++) + { + float t; + if (objects[i].type == PIT_SPHERE) // we don't know what type of intersection it is so cast, there has to be a better way to do this + { + Intersection sphere = (Intersection)objects[i]; + t = sphere.intersect(ray.origin, ray.direction); + } + // TODO: other types + + bool closerIntersection = t > 0.0 && t < ray.intersectionT; + + ray.intersectionT = closerIntersection ? t : ray.intersectionT; + objectID = closerIntersection ? i : objectID; + + // allowing early out results in a performance regression, WTF!? + //if (anyHit && closerIntersection) + //break; + } + return objectID; + } + + // TODO? 
traceray with vertex/index buffer }; } diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl index 9d2e8c260..f28dc621b 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl @@ -22,6 +22,15 @@ struct Unidirectional /* PathGuider pathGuider, */ NextEventEstimator nee) {} + + // closest hit + + // Li + MaterialSystem::measure_t getMeasure() + { + // loop through bounces, do closest hit + // return ray.payload.accumulation --> color + } }; } From ab582180d9a1e735706d913deb0e078c97280d48 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 5 Feb 2025 23:53:52 +0700 Subject: [PATCH 023/296] Reorder hlsl datastructure to reduce padding and make it more compact --- 71_RayTracingPipeline/app_resources/common.hlsl | 9 ++++----- 71_RayTracingPipeline/main.cpp | 6 +++--- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl index 8c73fada3..af35cb731 100644 --- a/71_RayTracingPipeline/app_resources/common.hlsl +++ b/71_RayTracingPipeline/app_resources/common.hlsl @@ -17,23 +17,22 @@ struct Material struct SProceduralGeomInfo { + Material material; float32_t3 center; float32_t radius; - Material material; }; struct STriangleGeomInfo { + Material material; uint64_t vertexBufferAddress; uint64_t indexBufferAddress; - uint32_t vertexStride : 29; + uint32_t vertexStride : 26; + uint32_t objType: 3; uint32_t indexType : 2; // 16 bit, 32 bit or none uint32_t smoothNormals : 1; // flat for cube, rectangle, disk - uint32_t objType; - - Material material; }; enum E_GEOM_TYPE : uint16_t diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index 22c745635..d9186a9bc 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -1242,8 +1242,6 @@ class RaytracingPipelineApp final : public 
examples::SimpleWindowedApplication, { const auto middle_i = NumberOfProceduralGeometries / 2.0; SProceduralGeomInfo sphere = { - .center = float32_t3((i - middle_i) * 4.0, 2, 5.0), - .radius = 1, .material = { .ambient = {}, .diffuse = {0.3, 0.2 * i, 0.3}, @@ -1251,6 +1249,8 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, .shininess = 1.0f, .illum = 2 }, + .center = float32_t3((i - middle_i) * 4.0, 2, 5.0), + .radius = 1, }; proceduralGeoms.push_back(sphere); @@ -1415,7 +1415,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, // setup blas info for triangle geometries for (uint32_t i = 0; i < blasCount; i++) { - bool isProcedural = i == proceduralBlasIdx; + const auto isProcedural = i == proceduralBlasIdx; if (isProcedural) { aabbs.data.buffer = smart_refctd_ptr(m_proceduralAabbBuffer); From 6966942f6de8cc51be77b281687cbd8d922bcbac Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 5 Feb 2025 23:54:36 +0700 Subject: [PATCH 024/296] Adjust changes to SPhysicalDeviceLimit regarding shaderGroupHandleSize --- 71_RayTracingPipeline/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index d9186a9bc..c3aadeff7 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -1282,7 +1282,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, bool createShaderBindingTable(video::CThreadSafeQueueAdapter* queue, const smart_refctd_ptr& pipeline) { const auto& limits = m_device->getPhysicalDevice()->getLimits(); - const auto handleSize = limits.shaderGroupHandleSize; + const auto handleSize = SPhysicalDeviceLimits::ShaderGroupHandleSize; const auto handleSizeAligned = nbl::core::alignUp(handleSize, limits.shaderGroupHandleAlignment); auto& raygenRegion = m_shaderBindingTable.raygenGroupRegion; From ef02db2fda6fbd4e9652e67bcf946310e76a8103 Mon Sep 17 00:00:00 2001 From: 
kevyuu Date: Wed, 5 Feb 2025 23:55:11 +0700 Subject: [PATCH 025/296] Adjust changes to ray tracing SShaderGroupParams --- 71_RayTracingPipeline/main.cpp | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index c3aadeff7..218c6157d 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -353,42 +353,45 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, params.layout = pipelineLayout.get(); params.shaders = std::span(shaders); - auto& shaderGroups = params.cached.shaderGroups; + auto& shaderGroups = params.shaderGroups; shaderGroups.raygenGroup = { .shaderIndex = RTDS_RAYGEN }; - shaderGroups.missGroups.resize(E_MISS_TYPE::EMT_COUNT, {}); - shaderGroups.missGroups[EMT_PRIMARY] = { .shaderIndex = RTDS_MISS }; - shaderGroups.missGroups[EMT_OCCLUSION] = { .shaderIndex = RTDS_SHADOW_MISS }; + SGeneralShaderGroup missGroups[EMT_COUNT]; + missGroups[EMT_PRIMARY] = { .shaderIndex = RTDS_MISS }; + missGroups[EMT_OCCLUSION] = { .shaderIndex = RTDS_SHADOW_MISS }; + shaderGroups.missGroups = missGroups; auto getHitGroupIndex = [](E_GEOM_TYPE geomType, E_RAY_TYPE rayType) { return geomType * ERT_COUNT + rayType; }; - shaderGroups.hitGroups.resize(E_RAY_TYPE::ERT_COUNT * E_GEOM_TYPE::EGT_COUNT); - shaderGroups.hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_PRIMARY)] = { + SHitShaderGroup hitGroups[E_RAY_TYPE::ERT_COUNT * E_GEOM_TYPE::EGT_COUNT]; + hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_PRIMARY)] = { .closestHitShaderIndex = RTDS_CLOSEST_HIT, .anyHitShaderIndex = RTDS_ANYHIT_COLOR, }; - shaderGroups.hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_OCCLUSION)] = { + hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_OCCLUSION)] = { .closestHitShaderIndex = RTDS_CLOSEST_HIT, .anyHitShaderIndex = RTDS_ANYHIT_SHADOW, }; - shaderGroups.hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_PRIMARY)] = { + 
hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_PRIMARY)] = { .closestHitShaderIndex = RTDS_SPHERE_CLOSEST_HIT, .anyHitShaderIndex = RTDS_ANYHIT_COLOR, .intersectionShaderIndex = RTDS_INTERSECTION, }; - shaderGroups.hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_OCCLUSION)] = { + hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_OCCLUSION)] = { .closestHitShaderIndex = RTDS_CLOSEST_HIT, .anyHitShaderIndex = RTDS_ANYHIT_SHADOW, .intersectionShaderIndex = RTDS_INTERSECTION, }; + shaderGroups.hitGroups = hitGroups; - shaderGroups.callableGroups.resize(ELT_COUNT); - shaderGroups.callableGroups[ELT_DIRECTIONAL] = { .shaderIndex = RTDS_DIRECTIONAL_CALL }; - shaderGroups.callableGroups[ELT_POINT] = { .shaderIndex = RTDS_POINT_CALL }; - shaderGroups.callableGroups[ELT_SPOT] = { .shaderIndex = RTDS_SPOT_CALL }; + SGeneralShaderGroup callableGroups[ELT_COUNT]; + callableGroups[ELT_DIRECTIONAL] = { .shaderIndex = RTDS_DIRECTIONAL_CALL }; + callableGroups[ELT_POINT] = { .shaderIndex = RTDS_POINT_CALL }; + callableGroups[ELT_SPOT] = { .shaderIndex = RTDS_SPOT_CALL }; + shaderGroups.callableGroups = callableGroups; params.cached.maxRecursionDepth = 1; @@ -1213,13 +1216,13 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, const auto& gpuObject = m_gpuTriangleGeometries[i]; const uint64_t vertexBufferAddress = gpuObject.bindings.vertex.buffer->getDeviceAddress(); geomInfos[i] = { + .material = gpuObject.material, .vertexBufferAddress = vertexBufferAddress, .indexBufferAddress = gpuObject.useIndex() ? 
gpuObject.bindings.index.buffer->getDeviceAddress() : vertexBufferAddress, .vertexStride = gpuObject.vertexStride, + .objType = gpuObject.meta.type, .indexType = gpuObject.indexType, .smoothNormals = s_smoothNormals[gpuObject.meta.type], - .objType = gpuObject.meta.type, - .material = gpuObject.material, }; } } From 5a5fbfe55aa4cf062c562f19507ba30de085b7a6 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 6 Feb 2025 11:24:28 +0700 Subject: [PATCH 026/296] changes to intersection logic --- .../app_resources/hlsl/common.hlsl | 115 ++++++++++++++--- .../app_resources/hlsl/intersector.hlsl | 120 ++++++++++++++++-- 2 files changed, 208 insertions(+), 27 deletions(-) diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl index 56a4cace7..84933edfb 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl @@ -46,34 +46,22 @@ struct Ray Payload payload; }; -enum PTIntersectionType : uint16_t +enum ProceduralIntersectionType : uint16_t { - PIT_NONE = 0, PIT_SPHERE, PIT_TRIANGLE, PIT_RECTANGLE }; -// TODO: check if this works for ambiguous arrays of Intersection -// unsure if calling correct method -struct IIntersection -{ - PTIntersectionType type = PIT_NONE; -}; - -template -struct Intersection : IIntersection -{ - PTIntersectionType type = PIT_NONE; -}; +template +struct Intersection; template<> -struct Intersection : IIntersection +struct Intersection { static Intersection create(NBL_CONST_REF_ARG(float32_t3) position, float32_t radius, uint32_t bsdfID, uint32_t lightID) { Intersection retval; - retval.type = PIT_SPHERE; retval.position = position; retval.radius2 = radius * radius; retval.bsdfLightIDs = spirv::bitFieldInsert(bsdfID, lightID, 16, 16); @@ -118,7 +106,7 @@ struct Intersection : IIntersection // TODO } - float32_t3 generate_and + NBL_CONSTEXPR_STATIC_INLINE uint32_t ObjSize = 5; float32_t3 position; float32_t radius2; @@ -126,15 +114,104 @@ 
struct Intersection : IIntersection }; template<> -struct Intersection : IIntersection +struct Intersection { + static Intersection create(NBL_CONST_REF_ARG(float32_t3) vertex0, NBL_CONST_REF_ARG(float32_t3) vertex1, NBL_CONST_REF_ARG(float32_t3) vertex2, uint32_t bsdfID, uint32_t lightID) + { + Intersection retval; + retval.vertex0 = vertex0; + retval.vertex1 = vertex1; + retval.vertex2 = vertex2; + retval.bsdfLightIDs = spirv::bitFieldInsert(bsdfID, lightID, 16, 16); + return retval; + } + + float intersect(NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(float32_t3) direction) + { + const float32_t3 edges[2] = { vertex1 - vertex0, vertex2 - vertex0 }; + + const float32_t3 h = nbl::hlsl::cross(direction, edges[1]); + const float a = nbl::hlsl::dot(edges[0], h); + + const float32_t3 relOrigin = origin - vertex0; + + const float u = nbl::hlsl::dot(relOrigin, h) / a; + + const float32_t3 q = nbl::hlsl::cross(relOrigin, edges[0]); + const float v = nbl::hlsl::dot(direction, q) / a; + + const float t = nbl::hlsl::dot(edges[1], q) / a; + + const bool intersection = t > 0.f && u >= 0.f && v >= 0.f && (u + v) <= 1.f; + return intersection ? 
t : numeric_limits::infinity; + } + + float32_t3 getNormalTimesArea() + { + const float32_t3 edges[2] = { vertex1 - vertex0, vertex2 - vertex0 }; + return nbl::hlsl::cross(edges[0], edges[1]) * 0.5f; + } + + NBL_CONSTEXPR_STATIC_INLINE uint32_t ObjSize = 10; + float32_t3 vertex0; + float32_t3 vertex1; + float32_t3 vertex2; + uint32_t bsdfLightIDs; }; template<> -struct Intersection : IIntersection +struct Intersection { + static Intersection create(NBL_CONST_REF_ARG(float32_t3) offset, NBL_CONST_REF_ARG(float32_t3) edge0, NBL_CONST_REF_ARG(float32_t3) edge1, uint32_t bsdfID, uint32_t lightID) + { + Intersection retval; + retval.offset = offset; + retval.edge0 = edge0; + retval.edge1 = edge1; + retval.bsdfLightIDs = spirv::bitFieldInsert(bsdfID, lightID, 16, 16); + return retval; + } + + float intersect(NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(float32_t3) direction) + { + const float32_t3 h = nbl::hlsl::cross(direction, edge1); + const float a = nbl::hlsl::dot(edge0, h); + + const float32_t3 relOrigin = origin - offset; + + const float u = nbl::hlsl::dot(relOrigin,h)/a; + + const float32_t3 q = nbl::hlsl::cross(relOrigin, edge0); + const float v = nbl::hlsl::dot(direction, q) / a; + + const float t = nbl::hlsl::dot(edge1, q) / a; + const bool intersection = t > 0.f && u >= 0.f && v >= 0.f && u <= 1.f && v <= 1.f; + return intersection ? 
t : numeric_limits::infinity; + } + + float32_t3 getNormalTimesArea() + { + return nbl::hlsl::cross(edge0, edge1); + } + + void getNormalBasis(NBL_REF_ARG(float32_t3x3) basis, NBL_REF_ARG(float32_t2) extents) + { + extents = float32_t2(nbl::hlsl::length(edge0), nbl::hlsl::length(edge1)); + basis[0] = edge0 / extents[0]; + basis[1] = edge1 / extents[1]; + basis[2] = normalize(cross(basis[0],basis[1])); + + basis = nbl::hlsl::transpose(basis); // TODO: double check transpose + } + + NBL_CONSTEXPR_STATIC_INLINE uint32_t ObjSize = 10; + + float32_t3 offset; + float32_t3 edge0; + float32_t3 edge1; + uint32_t bsdfLightIDs; }; } diff --git a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl index b2b3d0d2d..d4b87196d 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl @@ -19,27 +19,71 @@ namespace Intersector // ray tracing pipeline method -// does everything in traceray in ex 30 +// procedural data store: [obj count] [intersect type] [obj1] [obj2] [...] 
+ +struct IntersectData +{ + enum class Mode : uint32_t + { + RAY_QUERY, + RAY_TRACING, + PROCEDURAL + }; + + NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 128; + + uint32_t mode : 1; + unit32_t unused : 31; // possible space for flags + uint32_t data[DataSize]; +}; + template -struct Procedural +struct Comprehensive { using scalar_type = typename Ray::scalar_type; using ray_type = Ray; - static int traceRay(NBL_REF_ARG(ray_type) ray, IIntersection objects[32], int objCount) + static int traceProcedural(NBL_REF_ARG(ray_type) ray, NBL_REF_ARG(IntersectData) intersect) { const bool anyHit = ray.intersectionT != numeric_limits::max; + const uint32_t objCount = intersect.data[0]; + const ProceduralIntersectionType type = intersect.data[1]; int objectID = -1; for (int i = 0; i < objCount; i++) { float t; - if (objects[i].type == PIT_SPHERE) // we don't know what type of intersection it is so cast, there has to be a better way to do this + switch (type) { - Intersection sphere = (Intersection)objects[i]; - t = sphere.intersect(ray.origin, ray.direction); + case PIT_SPHERE: + { + float32_t3 position = float32_t3(asfloat(intersect.data[2 + i * Intersection::ObjSize]), asfloat(intersect.data[2 + i * Intersection::ObjSize + 1]), asfloat(intersect.data[2 + i * Intersection::ObjSize + 2])); + Intersection sphere = Intersection::create(position, asfloat(intersect.data[2 + i * Intersection::ObjSize + 3]), intersect.data[2 + i * Intersection::ObjSize + 4]); + t = sphere.intersect(ray.origin, ray.direction); + } + break; + case PIT_TRIANGLE: + { + float32_t3 vertex0 = float32_t3(asfloat(intersect.data[2 + i * Intersection::ObjSize]), asfloat(intersect.data[2 + i * Intersection::ObjSize + 1]), asfloat(intersect.data[2 + i * Intersection::ObjSize + 2])); + float32_t3 vertex1 = float32_t3(asfloat(intersect.data[2 + i * Intersection::ObjSize + 3]), asfloat(intersect.data[2 + i * Intersection::ObjSize + 4]), asfloat(intersect.data[2 + i * Intersection::ObjSize + 5])); + float32_t3 
vertex2 = float32_t3(asfloat(intersect.data[2 + i * Intersection::ObjSize + 6]), asfloat(intersect.data[2 + i * Intersection::ObjSize + 7]), asfloat(intersect.data[2 + i * Intersection::ObjSize + 8])); + Intersection tri = Intersection::create(vertex0, vertex1, vertex2, intersect.data[2 + i * Intersection::ObjSize + 9]); + t = tri.intersect(ray.origin, ray.direction); + } + break; + case PIT_RECTANGLE: + { + float32_t3 offset = float32_t3(asfloat(intersect.data[2 + i * Intersection::ObjSize]), asfloat(intersect.data[2 + i * Intersection::ObjSize + 1]), asfloat(intersect.data[2 + i * Intersection::ObjSize + 2])); + float32_t3 edge0 = float32_t3(asfloat(intersect.data[2 + i * Intersection::ObjSize + 3]), asfloat(intersect.data[2 + i * Intersection::ObjSize + 4]), asfloat(intersect.data[2 + i * Intersection::ObjSize + 5])); + float32_t3 edge1 = float32_t3(asfloat(intersect.data[2 + i * Intersection::ObjSize + 6]), asfloat(intersect.data[2 + i * Intersection::ObjSize + 7]), asfloat(intersect.data[2 + i * Intersection::ObjSize + 8])); + Intersection rect = Intersection::create(offset, edge0, edge1, intersect.data[2 + i * Intersection::ObjSize + 9]); + t = rect.intersect(ray.origin, ray.direction); + } + break; + default: + t = numeric_limits::infinity; + break; } - // TODO: other types bool closerIntersection = t > 0.0 && t < ray.intersectionT; @@ -53,9 +97,69 @@ struct Procedural return objectID; } - // TODO? 
traceray with vertex/index buffer + static int traceRay(NBL_REF_ARG(ray_type) ray, NBL_REF_ARG(IntersectData) intersect) + { + const IntersectData::Mode mode = intersect.mode; + switch (mode) + { + case IntersectData::Mode::RAY_QUERY: + { + // TODO: do ray query stuff + } + break; + case IntersectData::Mode::RAY_TRACING: + { + // TODO: do ray tracing stuff + } + break; + case IntersectData::Mode::PROCEDURAL: + { + return traceProcedural(ray, intersect); + } + break; + default: + return -1; + } + } }; +// does everything in traceray in ex 30 +// template +// struct Procedural +// { +// using scalar_type = typename Ray::scalar_type; +// using ray_type = Ray; + +// static int traceRay(NBL_REF_ARG(ray_type) ray, IIntersection objects[32], int objCount) +// { +// const bool anyHit = ray.intersectionT != numeric_limits::max; + +// int objectID = -1; +// for (int i = 0; i < objCount; i++) +// { +// float t; +// if (objects[i].type == PIT_SPHERE) // we don't know what type of intersection it is so cast, there has to be a better way to do this +// { +// Intersection sphere = (Intersection)objects[i]; +// t = sphere.intersect(ray.origin, ray.direction); +// } +// // TODO: other types + +// bool closerIntersection = t > 0.0 && t < ray.intersectionT; + +// ray.intersectionT = closerIntersection ? t : ray.intersectionT; +// objectID = closerIntersection ? i : objectID; + +// // allowing early out results in a performance regression, WTF!? +// //if (anyHit && closerIntersection) +// //break; +// } +// return objectID; +// } + +// // TODO? 
traceray with vertex/index buffer +// }; + } } } From c810949a4a66eb6f0c614537404333c6394463de Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 6 Feb 2025 13:05:41 +0700 Subject: [PATCH 027/296] Adjust ray tracing pipeline demo to remove SStridedBufferRegion --- 71_RayTracingPipeline/main.cpp | 73 ++++++++++++++++++---------------- 1 file changed, 39 insertions(+), 34 deletions(-) diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index 218c6157d..e95032181 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -26,10 +26,14 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, struct ShaderBindingTable { - SStridedBufferRegion raygenGroupRegion; - SStridedBufferRegion hitGroupsRegion; - SStridedBufferRegion missGroupsRegion; - SStridedBufferRegion callableGroupsRegion; + SBufferRange raygenGroupRange; + uint32_t raygenGroupStride; + SBufferRange hitGroupsRange; + uint32_t hitGroupsStride; + SBufferRange missGroupsRange; + uint32_t missGroupsStride; + SBufferRange callableGroupsRange; + uint32_t callableGroupsStride; }; @@ -718,10 +722,11 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, cmdbuf->bindRayTracingPipeline(m_rayTracingPipeline.get()); cmdbuf->pushConstants(m_rayTracingPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING, 0, sizeof(SPushConstants), &pc); cmdbuf->bindDescriptorSets(EPBP_RAY_TRACING, m_rayTracingPipeline->getLayout(), 0, 1, &m_rayTracingDs.get()); - cmdbuf->traceRays(m_shaderBindingTable.raygenGroupRegion, - m_shaderBindingTable.missGroupsRegion, - m_shaderBindingTable.hitGroupsRegion, - m_shaderBindingTable.callableGroupsRegion, + cmdbuf->traceRays( + m_shaderBindingTable.raygenGroupRange, m_shaderBindingTable.raygenGroupStride, + m_shaderBindingTable.missGroupsRange, m_shaderBindingTable.missGroupsStride, + m_shaderBindingTable.hitGroupsRange, m_shaderBindingTable.hitGroupsStride, + 
m_shaderBindingTable.callableGroupsRange, m_shaderBindingTable.callableGroupsStride, WIN_W, WIN_H, 1); } @@ -1288,36 +1293,36 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, const auto handleSize = SPhysicalDeviceLimits::ShaderGroupHandleSize; const auto handleSizeAligned = nbl::core::alignUp(handleSize, limits.shaderGroupHandleAlignment); - auto& raygenRegion = m_shaderBindingTable.raygenGroupRegion; - auto& hitRegion = m_shaderBindingTable.hitGroupsRegion; - auto& missRegion = m_shaderBindingTable.missGroupsRegion; - auto& callableRegion = m_shaderBindingTable.callableGroupsRegion; + auto& raygenRange = m_shaderBindingTable.raygenGroupRange; + auto& hitRange = m_shaderBindingTable.hitGroupsRange; + auto& missRange = m_shaderBindingTable.missGroupsRange; + auto& callableRange = m_shaderBindingTable.callableGroupsRange; - raygenRegion = { + raygenRange = { .offset = 0, - .stride = core::alignUp(handleSizeAligned, limits.shaderGroupBaseAlignment), .size = core::alignUp(handleSizeAligned, limits.shaderGroupBaseAlignment) }; + m_shaderBindingTable.raygenGroupStride = core::alignUp(handleSizeAligned, limits.shaderGroupBaseAlignment); - missRegion = { - .offset = raygenRegion.size, - .stride = handleSizeAligned, + missRange = { + .offset = raygenRange.size, .size = core::alignUp(pipeline->getMissGroupCount() * handleSizeAligned, limits.shaderGroupBaseAlignment), }; + m_shaderBindingTable.missGroupsStride = handleSizeAligned; - hitRegion = { - .offset = missRegion.offset + missRegion.size, - .stride = handleSizeAligned, + hitRange = { + .offset = missRange.offset + missRange.size, .size = core::alignUp(pipeline->getHitGroupCount() * handleSizeAligned, limits.shaderGroupBaseAlignment), }; + m_shaderBindingTable.hitGroupsStride = handleSizeAligned; - callableRegion = { - .offset = hitRegion.offset + hitRegion.size, - .stride = handleSizeAligned, + callableRange = { + .offset = hitRange.offset + hitRange.size, .size = 
core::alignUp(pipeline->getCallableGroupCount() * handleSizeAligned, limits.shaderGroupBaseAlignment), }; + m_shaderBindingTable.callableGroupsStride = handleSizeAligned; - const auto bufferSize = raygenRegion.size + missRegion.size + hitRegion.size + callableRegion.size; + const auto bufferSize = raygenRange.size + missRange.size + hitRange.size + callableRange.size; ICPUBuffer::SCreationParams cpuBufferParams; cpuBufferParams.size = bufferSize; @@ -1328,37 +1333,37 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, memcpy(pData, pipeline->getRaygenGroupShaderHandle().data(), handleSize); // copy miss region - uint8_t* pMissData = pData + missRegion.offset; + uint8_t* pMissData = pData + missRange.offset; for (int32_t missIx = 0; missIx < pipeline->getMissGroupCount(); missIx++) { memcpy(pMissData, pipeline->getMissGroupShaderHandle(missIx).data(), handleSize); - pMissData += missRegion.stride; + pMissData += m_shaderBindingTable.missGroupsStride; } // copy hit region - uint8_t* pHitData = pData + hitRegion.offset; + uint8_t* pHitData = pData + hitRange.offset; for (int32_t hitIx = 0; hitIx < pipeline->getHitGroupCount(); hitIx++) { memcpy(pHitData, pipeline->getHitGroupShaderHandle(hitIx).data(), handleSize); - pHitData += hitRegion.stride; + pHitData += m_shaderBindingTable.hitGroupsStride; } // copy callable region - uint8_t* pCallableData = pData + callableRegion.offset; + uint8_t* pCallableData = pData + callableRange.offset; for (int32_t callableIx = 0; callableIx < pipeline->getCallableGroupCount(); callableIx++) { memcpy(pCallableData, pipeline->getCallableGroupShaderHandle(callableIx).data(), handleSize); - pCallableData += callableRegion.stride; + pCallableData += m_shaderBindingTable.callableGroupsStride; } { IGPUBuffer::SCreationParams params; params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_SHADER_BINDING_TABLE_BIT; 
params.size = bufferSize; - m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), pData).move_into(raygenRegion.buffer); - missRegion.buffer = core::smart_refctd_ptr(raygenRegion.buffer); - hitRegion.buffer = core::smart_refctd_ptr(raygenRegion.buffer); - callableRegion.buffer = core::smart_refctd_ptr(raygenRegion.buffer); + m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), pData).move_into(raygenRange.buffer); + missRange.buffer = core::smart_refctd_ptr(raygenRange.buffer); + hitRange.buffer = core::smart_refctd_ptr(raygenRange.buffer); + callableRange.buffer = core::smart_refctd_ptr(raygenRange.buffer); } return true; From 13251cac673b1c118c43d5f8fc24e8b9cee4492b Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 6 Feb 2025 16:50:06 +0700 Subject: [PATCH 028/296] Pack material into more compact representation before send to gpu. --- .../app_resources/common.hlsl | 72 ++++++++++++++++++- .../app_resources/raytrace.rahit.hlsl | 7 +- .../app_resources/raytrace.rchit.hlsl | 5 +- .../raytrace_procedural.rchit.hlsl | 6 +- 71_RayTracingPipeline/include/common.hpp | 2 +- 71_RayTracingPipeline/main.cpp | 10 +-- 6 files changed, 85 insertions(+), 17 deletions(-) diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl index af35cb731..eb66aa374 100644 --- a/71_RayTracingPipeline/app_resources/common.hlsl +++ b/71_RayTracingPipeline/app_resources/common.hlsl @@ -5,6 +5,38 @@ NBL_CONSTEXPR uint32_t WorkgroupSize = 16; +inline uint32_t packUnorm10(float32_t v) +{ + return trunc(v * 1023.0f + 0.5f); +} + +inline float32_t unpackUnorm10(uint32_t packed) +{ + return float32_t(packed & 0x3ff) * (1.0f / 1023.0f); +} + +inline uint32_t packUnorm18(float32_t v) +{ + const float maxValue = 262143; + return trunc(v * maxValue + 0.5f); +} + +inline float32_t unpackUnorm18(uint32_t packed) +{ + const float maxValue = 262143; + 
return float32_t(packed & 0x3ffff) * (1.0f / maxValue); +} + +inline uint32_t packUnorm3x10(float32_t3 v) +{ + return (packUnorm10(v.z) << 20 | (packUnorm10(v.y) << 10 | packUnorm10(v.x))); +} + +inline float32_t3 unpackUnorm3x10(uint32_t packed) +{ + return float32_t3(unpackUnorm10(packed), unpackUnorm10(packed >> 10), unpackUnorm10(packed >> 20)); +} + struct Material { float32_t3 ambient; @@ -15,16 +47,50 @@ struct Material uint32_t illum; // illumination model (see http://www.fileformat.info/format/material/) }; -struct SProceduralGeomInfo +struct MaterialPacked +{ + uint32_t ambient; + uint32_t diffuse; + uint32_t specular; + uint32_t shininess: 18; + uint32_t dissolve : 10; // 1 == opaque; 0 == fully transparent + uint32_t illum : 4; // illumination model (see http://www.fileformat.info/format/material/) +}; + +inline MaterialPacked packMaterial(Material material) +{ + MaterialPacked packed; + packed.ambient = packUnorm3x10(material.ambient); + packed.diffuse = packUnorm3x10(material.diffuse); + packed.specular = packUnorm3x10(material.specular); + packed.shininess = packUnorm18(material.shininess); + packed.dissolve = packUnorm10(material.dissolve); + packed.illum = material.illum; + return packed; +} + +inline Material unpackMaterial(MaterialPacked packed) { Material material; + material.ambient = unpackUnorm3x10(packed.ambient); + material.diffuse = unpackUnorm3x10(packed.diffuse); + material.specular = unpackUnorm3x10(packed.specular); + material.shininess = unpackUnorm18(packed.shininess); + material.dissolve = unpackUnorm10(packed.dissolve); + material.illum = packed.illum; + return material; +} + +struct SProceduralGeomInfo +{ + MaterialPacked material; float32_t3 center; float32_t radius; }; struct STriangleGeomInfo { - Material material; + MaterialPacked material; uint64_t vertexBufferAddress; uint64_t indexBufferAddress; @@ -89,7 +155,6 @@ struct SPushConstants uint32_t frameCounter; float32_t4x4 invMVP; - Light light; }; @@ -102,6 +167,7 @@ struct 
RayLight float32_t outIntensity; }; + #ifdef __HLSL_VERSION struct [raypayload] ColorPayload diff --git a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl index 5db6d70fa..7eb4efbf4 100644 --- a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl @@ -16,13 +16,14 @@ void main(inout AnyHitPayload p, in BuiltInTriangleIntersectionAttributes attrib { const int instID = InstanceID(); const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo)); + const Material material = unpackMaterial(geom.material); - if (geom.material.illum != 4) + if (material.illum != 4) return; uint32_t seed = p.seed; - if (geom.material.dissolve == 0.0) + if (material.dissolve == 0.0) IgnoreHit(); - else if (rnd(seed) > geom.material.dissolve) + else if (rnd(seed) > material.dissolve) IgnoreHit(); } diff --git a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl index 462287689..bee5429a8 100644 --- a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl @@ -120,12 +120,13 @@ void main(inout ColorPayload p, in BuiltInTriangleIntersectionAttributes attribs const VertexData vertexData = fetchVertexData(instID, primID, geom, attribs.barycentrics); const float32_t3 worldPosition = mul(ObjectToWorld3x4(), float32_t4(vertexData.position, 1)); const float32_t3 worldNormal = normalize(mul(vertexData.normal, WorldToObject3x4()).xyz); + const Material material = unpackMaterial(geom.material); RayLight cLight; cLight.inHitPosition = worldPosition; CallShader(pc.light.type, cLight); - float32_t3 diffuse = computeDiffuse(geom.material, cLight.outLightDir, worldNormal); + float32_t3 diffuse = computeDiffuse(material, cLight.outLightDir, worldNormal); float32_t3 specular = 
float32_t3(0, 0, 0); float32_t attenuation = 1; @@ -150,7 +151,7 @@ void main(inout ColorPayload p, in BuiltInTriangleIntersectionAttributes attribs } else { - specular = computeSpecular(geom.material, WorldRayDirection(), cLight.outLightDir, worldNormal); + specular = computeSpecular(material, WorldRayDirection(), cLight.outLightDir, worldNormal); } } p.hitValue = (cLight.outIntensity * attenuation * (diffuse + specular)); diff --git a/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl index dd5598105..c056f3925 100644 --- a/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl @@ -21,10 +21,10 @@ void main(inout ColorPayload p, in BuiltInTriangleIntersectionAttributes attribs CallShader(pc.light.type, cLight); // Material of the object - Material mat = sphere.material; + Material material = unpackMaterial(sphere.material); // Diffuse - float3 diffuse = computeDiffuse(sphere.material, cLight.outLightDir, worldNormal); + float3 diffuse = computeDiffuse(material, cLight.outLightDir, worldNormal); float3 specular = float3(0, 0, 0); float attenuation = 1; @@ -53,7 +53,7 @@ void main(inout ColorPayload p, in BuiltInTriangleIntersectionAttributes attribs } else { - specular = computeSpecular(sphere.material, WorldRayDirection(), cLight.outLightDir, worldNormal); + specular = computeSpecular(material, WorldRayDirection(), cLight.outLightDir, worldNormal); } } diff --git a/71_RayTracingPipeline/include/common.hpp b/71_RayTracingPipeline/include/common.hpp index 3a8411fd2..3b66fd3e9 100644 --- a/71_RayTracingPipeline/include/common.hpp +++ b/71_RayTracingPipeline/include/common.hpp @@ -84,7 +84,7 @@ struct ReferenceObjectGpu uint32_t vertexStride; nbl::asset::E_INDEX_TYPE indexType = nbl::asset::E_INDEX_TYPE::EIT_UNKNOWN; uint32_t indexCount = {}; - Material material; + MaterialPacked material; 
core::matrix3x4SIMD transform; const bool useIndex() const diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index e95032181..015f08a42 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -1064,7 +1064,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, ReferenceObjectCpu { .meta = {.type = OT_CUBE, .name = "Cube Mesh 2"}, .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)), - .material = { + .material = Material{ .ambient = {}, .diffuse = {0.2, 0.2, 0.8}, .specular = {0.8, 0.8, 0.8}, @@ -1076,7 +1076,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, ReferenceObjectCpu { .meta = {.type = OT_CUBE, .name = "Transparent Cube Mesh"}, .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)), - .material = { + .material = Material{ .ambient = {}, .diffuse = {0.2, 0.8, 0.2}, .specular = {0.8, 0.8, 0.8}, @@ -1211,7 +1211,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, .vertexStride = cpuObject.data.inputParams.bindings[0].stride, .indexType = cpuObject.data.indexType, .indexCount = cpuObject.data.indexCount, - .material = cpuObject.material, + .material = packMaterial(cpuObject.material), .transform = cpuObject.transform, }); } @@ -1250,13 +1250,13 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, { const auto middle_i = NumberOfProceduralGeometries / 2.0; SProceduralGeomInfo sphere = { - .material = { + .material = packMaterial({ .ambient = {}, .diffuse = {0.3, 0.2 * i, 0.3}, .specular = {0.8, 0.8, 0.8}, .shininess = 1.0f, .illum = 2 - }, + }), .center = float32_t3((i - middle_i) * 4.0, 2, 5.0), .radius = 1, }; From 85e67ad0c4012d7d8d2014489327036d89b0bf57 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 6 Feb 2025 16:51:56 +0700 Subject: [PATCH 029/296] completed material system? 
--- .../app_resources/hlsl/material_system.hlsl | 119 ++++++++++++++++++ .../app_resources/hlsl/pathtracer.hlsl | 2 +- 2 files changed, 120 insertions(+), 1 deletion(-) diff --git a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl index 6f635ab68..1f13198fa 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl @@ -1,6 +1,9 @@ #ifndef _NBL_HLSL_EXT_MATERIAL_SYSTEM_INCLUDED_ #define _NBL_HLSL_EXT_MATERIAL_SYSTEM_INCLUDED_ +#include +#include + namespace nbl { namespace hlsl @@ -10,7 +13,123 @@ namespace ext namespace MaterialSystem { +struct Material +{ + enum class Type : uint32_t + { + DIFFUSE, + CONDUCTOR, + DIELECTRIC + }; + + NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 32; + + uint32_t type : 1; + unit32_t unused : 31; // possible space for flags + uint32_t data[DataSize]; +}; + +template +struct System +{ + using this_t = System; + using scalar_type = typename DiffuseBxDF::scalar_type; // types should be same across all 3 bxdfs + using vector2_type = vector; + using vector3_type = vector; + using measure_type = typename DiffuseBxDF::spectral_type; + using quotient_pdf_type = typename DiffuseBxDF::quotient_pdf_type; + using anisotropic_type = typename DiffuseBxDF::anisotropic_type; + using anisocache_type = typename ConductorBxDF::anisocache_type; + using params_t = SBxDFParams; + + static this_t create(NBL_CONST_REF_ARG(SBxDFCreationParams) diffuseParams, NBL_CONST_REF_ARG(SBxDFCreationParams) conductorParams, NBL_CONST_REF_ARG(SBxDFCreationParams) dielectricParams) + { + diffuseBxDF = DiffuseBxDF::create(diffuseParams); + conductorBxDF = DiffuseBxDF::create(conductorParams); + dielectricBxDF = DiffuseBxDF::create(dielectricParams); + } + + static measure_type eval(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(params_t) params) + { + switch(material.type) + { + case DIFFUSE: + { + return 
(measure_type)diffuseBxDF.eval(params); + } + break; + case CONDUCTOR: + { + return conductorBxDF.eval(params); + } + break; + case DIELECTRIC: + { + return dielectricBxDF.eval(params); + } + break; + default: + return (measure_type)0.0; + } + } + + static vector3_type generate(NBL_CONST_REF_ARG(Material) material, anisotropic_type interaction, vector2_type u, NBL_REF_ARG(anisocache_type) cache) + { + switch(material.type) + { + case DIFFUSE: + { + return diffuseBxDF.generate(interaction, u); + } + break; + case CONDUCTOR: + { + return conductorBxDF.generate(interaction, u, cache); + } + break; + case DIELECTRIC: + { + return dielectricBxDF.generate(interaction, u, cache); + } + break; + default: + return (vector3_type)numeric_limits::infinity; + } + } + + static quotient_pdf_type quotient_and_pdf(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(params_t) params) + { + const float minimumProjVectorLen = 0.00000001; + if (params.NdotV > minimumProjVectorLen && params.NdotL > minimumProjVectorLen) + { + switch(material.type) + { + case DIFFUSE: + { + return diffuseBxDF.quotient_and_pdf(params); + } + break; + case CONDUCTOR: + { + return conductorBxDF.quotient_and_pdf(params); + } + break; + case DIELECTRIC: + { + return dielectricBxDF.quotient_and_pdf(params); + } + break; + default: + return quotient_pdf_type::create((measure_type)0.0, numeric_limits::infinity); + } + } + return quotient_pdf_type::create((measure_type)0.0, numeric_limits::infinity); + } + DiffuseBxDF diffuseBxDF; + ConductorBxDF conductorBxDF; + DielectricBxDF dielectricBxDF; +}; } } diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl index f28dc621b..9ca0f77e4 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl @@ -26,7 +26,7 @@ struct Unidirectional // closest hit // Li - MaterialSystem::measure_t getMeasure() + MaterialSystem::measure_type 
getMeasure() { // loop through bounces, do closest hit // return ray.payload.accumulation --> color From 2c500b1e06e3e83b2a427bf0aa1ef27878467e0b Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 7 Feb 2025 14:37:56 +0700 Subject: [PATCH 030/296] sphere nee stuff --- .../app_resources/hlsl/common.hlsl | 62 +++++++++++++------ .../app_resources/hlsl/intersector.hlsl | 30 ++++----- 2 files changed, 59 insertions(+), 33 deletions(-) diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl index 84933edfb..2b627523f 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl @@ -46,22 +46,22 @@ struct Ray Payload payload; }; -enum ProceduralIntersectionType : uint16_t +enum ProceduralShapeType : uint16_t { - PIT_SPHERE, - PIT_TRIANGLE, - PIT_RECTANGLE + PST_SPHERE, + PST_TRIANGLE, + PST_RECTANGLE }; -template -struct Intersection; +template +struct Shape; template<> -struct Intersection +struct Shape { - static Intersection create(NBL_CONST_REF_ARG(float32_t3) position, float32_t radius, uint32_t bsdfID, uint32_t lightID) + static Shape create(NBL_CONST_REF_ARG(float32_t3) position, float32_t radius, uint32_t bsdfID, uint32_t lightID) { - Intersection retval; + Shape retval; retval.position = position; retval.radius2 = radius * radius; retval.bsdfLightIDs = spirv::bitFieldInsert(bsdfID, lightID, 16, 16); @@ -95,15 +95,41 @@ struct Intersection return 2.0 * numbers::pi * (1.0 - cosThetaMax); } - // should this be in material system? 
float deferredPdf(Light light, Ray ray) { return 1.0 / getSolidAngle(ray.origin); } - float generate_and_pdf() + template + float generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, float32_t3 xi, uint32_t objectID) { - // TODO + float32_t3 Z = position - origin; + const float distanceSQ = nbl::hlsl::dot(Z,Z); + const float cosThetaMax2 = 1.0 - radius2 / distanceSQ; + if (cosThetaMax2 > 0.0) + { + const float rcpDistance = 1.0 / nbl::hlsl::sqrt(distanceSQ); + Z *= rcpDistance; + + const float cosThetaMax = nbl::hlsl::sqrt(cosThetaMax2); + const float cosTheta = nbl::hlsl::mix(1.0, cosThetaMax, xi.x); + + vec3 L = Z * cosTheta; + + const float cosTheta2 = cosTheta * cosTheta; + const float sinTheta = nbl::hlsl::sqrt(1.0 - cosTheta2); + float sinPhi, cosPhi; + math::sincos(2.0 * numbers::pi * xi.y - numbers::pi, sinPhi, cosPhi); + float32_t2x3 XY = math::frisvad(Z); + + L += (XY[0] * cosPhi + XY[1] * sinPhi) * sinTheta; + + newRayMaxT = (cosTheta - nbl::hlsl::sqrt(cosTheta2 - cosThetaMax2)) / rcpDistance; + pdf = 1.0 / (2.0 * numbers::pi * (1.0 - cosThetaMax)); + return L; + } + pdf = 0.0; + return float32_t3(0.0,0.0,0.0); } NBL_CONSTEXPR_STATIC_INLINE uint32_t ObjSize = 5; @@ -114,11 +140,11 @@ struct Intersection }; template<> -struct Intersection +struct Shape { - static Intersection create(NBL_CONST_REF_ARG(float32_t3) vertex0, NBL_CONST_REF_ARG(float32_t3) vertex1, NBL_CONST_REF_ARG(float32_t3) vertex2, uint32_t bsdfID, uint32_t lightID) + static Shape create(NBL_CONST_REF_ARG(float32_t3) vertex0, NBL_CONST_REF_ARG(float32_t3) vertex1, NBL_CONST_REF_ARG(float32_t3) vertex2, uint32_t bsdfID, uint32_t lightID) { - Intersection retval; + Shape retval; retval.vertex0 = vertex0; retval.vertex1 = vertex1; retval.vertex2 = vertex2; @@ -161,11 +187,11 @@ struct Intersection }; template<> -struct Intersection +struct Shape { - static Intersection 
create(NBL_CONST_REF_ARG(float32_t3) offset, NBL_CONST_REF_ARG(float32_t3) edge0, NBL_CONST_REF_ARG(float32_t3) edge1, uint32_t bsdfID, uint32_t lightID) + static Shape create(NBL_CONST_REF_ARG(float32_t3) offset, NBL_CONST_REF_ARG(float32_t3) edge0, NBL_CONST_REF_ARG(float32_t3) edge1, uint32_t bsdfID, uint32_t lightID) { - Intersection retval; + Shape retval; retval.offset = offset; retval.edge0 = edge0; retval.edge1 = edge1; diff --git a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl index d4b87196d..a694082fe 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl @@ -55,28 +55,28 @@ struct Comprehensive float t; switch (type) { - case PIT_SPHERE: + case PST_SPHERE: { - float32_t3 position = float32_t3(asfloat(intersect.data[2 + i * Intersection::ObjSize]), asfloat(intersect.data[2 + i * Intersection::ObjSize + 1]), asfloat(intersect.data[2 + i * Intersection::ObjSize + 2])); - Intersection sphere = Intersection::create(position, asfloat(intersect.data[2 + i * Intersection::ObjSize + 3]), intersect.data[2 + i * Intersection::ObjSize + 4]); + float32_t3 position = float32_t3(asfloat(intersect.data[2 + i * Shape::ObjSize]), asfloat(intersect.data[2 + i * Shape::ObjSize + 1]), asfloat(intersect.data[2 + i * Shape::ObjSize + 2])); + Shape sphere = Shape::create(position, asfloat(intersect.data[2 + i * Shape::ObjSize + 3]), intersect.data[2 + i * Shape::ObjSize + 4]); t = sphere.intersect(ray.origin, ray.direction); } break; - case PIT_TRIANGLE: + case PST_TRIANGLE: { - float32_t3 vertex0 = float32_t3(asfloat(intersect.data[2 + i * Intersection::ObjSize]), asfloat(intersect.data[2 + i * Intersection::ObjSize + 1]), asfloat(intersect.data[2 + i * Intersection::ObjSize + 2])); - float32_t3 vertex1 = float32_t3(asfloat(intersect.data[2 + i * Intersection::ObjSize + 3]), asfloat(intersect.data[2 + i * Intersection::ObjSize + 4]), 
asfloat(intersect.data[2 + i * Intersection::ObjSize + 5])); - float32_t3 vertex2 = float32_t3(asfloat(intersect.data[2 + i * Intersection::ObjSize + 6]), asfloat(intersect.data[2 + i * Intersection::ObjSize + 7]), asfloat(intersect.data[2 + i * Intersection::ObjSize + 8])); - Intersection tri = Intersection::create(vertex0, vertex1, vertex2, intersect.data[2 + i * Intersection::ObjSize + 9]); + float32_t3 vertex0 = float32_t3(asfloat(intersect.data[2 + i * Shape::ObjSize]), asfloat(intersect.data[2 + i * Shape::ObjSize + 1]), asfloat(intersect.data[2 + i * Shape::ObjSize + 2])); + float32_t3 vertex1 = float32_t3(asfloat(intersect.data[2 + i * Shape::ObjSize + 3]), asfloat(intersect.data[2 + i * Shape::ObjSize + 4]), asfloat(intersect.data[2 + i * Shape::ObjSize + 5])); + float32_t3 vertex2 = float32_t3(asfloat(intersect.data[2 + i * Shape::ObjSize + 6]), asfloat(intersect.data[2 + i * Shape::ObjSize + 7]), asfloat(intersect.data[2 + i * Shape::ObjSize + 8])); + Shape tri = Shape::create(vertex0, vertex1, vertex2, intersect.data[2 + i * Shape::ObjSize + 9]); t = tri.intersect(ray.origin, ray.direction); } break; - case PIT_RECTANGLE: + case PST_RECTANGLE: { - float32_t3 offset = float32_t3(asfloat(intersect.data[2 + i * Intersection::ObjSize]), asfloat(intersect.data[2 + i * Intersection::ObjSize + 1]), asfloat(intersect.data[2 + i * Intersection::ObjSize + 2])); - float32_t3 edge0 = float32_t3(asfloat(intersect.data[2 + i * Intersection::ObjSize + 3]), asfloat(intersect.data[2 + i * Intersection::ObjSize + 4]), asfloat(intersect.data[2 + i * Intersection::ObjSize + 5])); - float32_t3 edge1 = float32_t3(asfloat(intersect.data[2 + i * Intersection::ObjSize + 6]), asfloat(intersect.data[2 + i * Intersection::ObjSize + 7]), asfloat(intersect.data[2 + i * Intersection::ObjSize + 8])); - Intersection rect = Intersection::create(offset, edge0, edge1, intersect.data[2 + i * Intersection::ObjSize + 9]); + float32_t3 offset = float32_t3(asfloat(intersect.data[2 + i * 
Shape::ObjSize]), asfloat(intersect.data[2 + i * Shape::ObjSize + 1]), asfloat(intersect.data[2 + i * Shape::ObjSize + 2])); + float32_t3 edge0 = float32_t3(asfloat(intersect.data[2 + i * Shape::ObjSize + 3]), asfloat(intersect.data[2 + i * Shape::ObjSize + 4]), asfloat(intersect.data[2 + i * Shape::ObjSize + 5])); + float32_t3 edge1 = float32_t3(asfloat(intersect.data[2 + i * Shape::ObjSize + 6]), asfloat(intersect.data[2 + i * Shape::ObjSize + 7]), asfloat(intersect.data[2 + i * Shape::ObjSize + 8])); + Shape rect = Shape::create(offset, edge0, edge1, intersect.data[2 + i * Shape::ObjSize + 9]); t = rect.intersect(ray.origin, ray.direction); } break; @@ -138,9 +138,9 @@ struct Comprehensive // for (int i = 0; i < objCount; i++) // { // float t; -// if (objects[i].type == PIT_SPHERE) // we don't know what type of intersection it is so cast, there has to be a better way to do this +// if (objects[i].type == PST_SPHERE) // we don't know what type of intersection it is so cast, there has to be a better way to do this // { -// Intersection sphere = (Intersection)objects[i]; +// Shape sphere = (Shape)objects[i]; // t = sphere.intersect(ray.origin, ray.direction); // } // // TODO: other types From e6a99165c1b153977192f9722381fc24f566c9ca Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 10 Feb 2025 16:58:50 +0700 Subject: [PATCH 031/296] triangle sampling --- .../app_resources/hlsl/common.hlsl | 109 +++++++++++++++++- 1 file changed, 108 insertions(+), 1 deletion(-) diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl index 2b627523f..dfc500beb 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl @@ -4,6 +4,11 @@ #include #include #include +#include +#include +#include +#include +//#include namespace nbl { @@ -53,6 +58,13 @@ enum ProceduralShapeType : uint16_t PST_RECTANGLE }; +enum PTPolygonMethod : uint16_t +{ + PPM_AREA, + PPM_SOLID_ANGLE, + 
PPM_APPROX_PROJECTED_SOLID_ANGLE +}; + template struct Shape; @@ -95,7 +107,7 @@ struct Shape return 2.0 * numbers::pi * (1.0 - cosThetaMax); } - float deferredPdf(Light light, Ray ray) + float deferredPdf(NBL_CONST_REF_ARG(Light light), NBL_CONST_REF_ARG(Ray) ray) { return 1.0 / getSolidAngle(ray.origin); } @@ -149,6 +161,7 @@ struct Shape retval.vertex1 = vertex1; retval.vertex2 = vertex2; retval.bsdfLightIDs = spirv::bitFieldInsert(bsdfID, lightID, 16, 16); + retval.polygonMethod = PPM_SOLID_ANGLE; return retval; } @@ -178,12 +191,104 @@ struct Shape return nbl::hlsl::cross(edges[0], edges[1]) * 0.5f; } + float deferredPdf(NBL_CONST_REF_ARG(Light light), NBL_CONST_REF_ARG(Ray) ray) + { + const float32_t3 L = ray.direction; + switch (polygonMethod) + { + case PPM_AREA: + { + const float dist = ray.intersectionT; + return dist * dist / nbl::hlsl::abs(nbl::hlsl::dot(getNormalTimesArea()), L); + } + break; + case PPM_SOLID_ANGLE: + { + shapes::SphericalTriangle st = shapes::SphericalTriangle::create(vertex0, vertex1, vertex2, ray.origin); + const float rcpProb = st.solidAngleOfTriangle(); + // if `rcpProb` is NAN then the triangle's solid angle was close to 0.0 + return rcpProb > numeric_limits::min ? (1.0 / rcpProb) : numeric_limits::max; + } + break; + case PPM_APPROX_PROJECTED_SOLID_ANGLE: + { + shapes::SphericalTriangle st = shapes::SphericalTriangle::create(vertex0, vertex1, vertex2, ray.origin); + const float pdf = st.projectedSolidAngleOfTriangle(ray.normalAtOrigin, ray.wasBSDFAtOrigin, L); + // if `pdf` is NAN then the triangle's projected solid angle was close to 0.0, if its close to INF then the triangle was very small + return pdf < numeric_limits::max ? 
pdf : 0.0; + } + break; + default: + return 0.0; + } + } + + template + float32_t3 generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, float32_t3 xi, uint32_t objectID) + { + switch(polygonMethod) + { + case PPM_AREA: + { + const float32_t3 edge0 = vertex1 - vertex0; + const float32_t3 edge1 = vertex2 - vertex0; + const float sqrtU = nbl::hlsl::sqrt(xi.x); + float32_t3 pnt = vertex0 + edge0 * (1.0 - sqrtU) + edge1 * sqrtU * xi.y; + float32_t3 L = pnt - origin; + + const float distanceSq = nbl::hlsl::dot(L,L); + const float rcpDistance = 1.0 / nbl::hlsl::sqrt(distanceSq); + L *= rcpDistance; + + pdf = distanceSq / nbl::hlsl::abs(nbl::hlsl::dot(nbl::hlsl::cross(edge0, edge1) * 0.5f, L)); + newRayMaxT = 1.0 / rcpDistance; + return L; + } + break; + case PPM_SOLID_ANGLE: + { + float rcpPdf; + + shapes::SphericalTriangle st = shapes::SphericalTriangle::create(vertex0, vertex1, vertex2, ray.origin); + sampling::SphericalTriangle sst = sampling::SphericalTriangle::create(st); + + const float32_t3 L = sst.generate(rcpPdf, xi.xy); + + pdf = rcpPdf > numeric_limits::min ? (1.0 / rcpPdf) : 0.0; + + const float32_t3 N = getNormalTimesArea(); + newRayMaxT = nbl::hlsl::dot(N, vertex0 - origin) / nbl::hlsl::dot(N, L); + return L; + } + break; + case PPM_APPROX_PROJECTED_SOLID_ANGLE: + { + float rcpPdf; + + shapes::SphericalTriangle st = shapes::SphericalTriangle::create(vertex0, vertex1, vertex2, ray.origin); + sampling::ProjectedSphericalTriangle sst = sampling::ProjectedSphericalTriangle::create(st); + + const float32_t3 L = sst.generate(rcpPdf, interaction.N, isBSDF, xi.xy); + + pdf = rcpPdf > numeric_limits::min ? 
(1.0 / rcpPdf) : 0.0; + + const float32_t3 N = getNormalTimesArea(); + newRayMaxT = nbl::hlsl::dot(N, vertex0 - origin) / nbl::hlsl::dot(N, L); + return L; + } + break; + default: + return (float32_t3)0.0; + } + } + NBL_CONSTEXPR_STATIC_INLINE uint32_t ObjSize = 10; float32_t3 vertex0; float32_t3 vertex1; float32_t3 vertex2; uint32_t bsdfLightIDs; + PTPolygonMethod polygonMethod; }; template<> @@ -196,6 +301,7 @@ struct Shape retval.edge0 = edge0; retval.edge1 = edge1; retval.bsdfLightIDs = spirv::bitFieldInsert(bsdfID, lightID, 16, 16); + retval.polygonMethod = PPM_SOLID_ANGLE; return retval; } @@ -238,6 +344,7 @@ struct Shape float32_t3 edge0; float32_t3 edge1; uint32_t bsdfLightIDs; + PTPolygonMethod polygonMethod; }; } From c48b5b9015bd81230952d37667108e953c8a97f2 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 11 Feb 2025 11:56:09 +0700 Subject: [PATCH 032/296] Change ray tracing implementation from recursion to loop based --- .../app_resources/common.hlsl | 16 ++++--- .../app_resources/raytrace.rahit.hlsl | 2 +- .../app_resources/raytrace.rchit.hlsl | 39 ++------------- .../app_resources/raytrace.rgen.hlsl | 48 +++++++++++++++++-- .../app_resources/raytrace.rmiss.hlsl | 5 +- .../app_resources/raytraceShadow.rmiss.hlsl | 4 +- .../raytrace_procedural.rchit.hlsl | 46 ++---------------- 7 files changed, 66 insertions(+), 94 deletions(-) diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl index eb66aa374..b56155855 100644 --- a/71_RayTracingPipeline/app_resources/common.hlsl +++ b/71_RayTracingPipeline/app_resources/common.hlsl @@ -170,18 +170,20 @@ struct RayLight #ifdef __HLSL_VERSION -struct [raypayload] ColorPayload -{ - float32_t3 hitValue : read(caller) : write(closesthit,miss); - uint32_t seed : read(closesthit,anyhit) : write(caller); -}; - struct [raypayload] ShadowPayload { bool isShadowed : read(caller) : write(caller,miss); uint32_t seed : read(anyhit) : write(caller); }; +struct [raypayload] 
HitPayload +{ + MaterialPacked material : read(caller) : write(closesthit); + float32_t3 worldNormal : read(caller) : write(closesthit); + float32_t rayDistance : read(caller) : write(closesthit, miss); + uint32_t seed : read(closesthit, anyhit) : write(caller); +}; + enum ObjectType : uint32_t // matches c++ { OT_CUBE = 0, @@ -197,6 +199,7 @@ enum ObjectType : uint32_t // matches c++ }; static uint32_t s_offsetsToNormalBytes[OT_COUNT] = { 18, 24, 24, 20, 20, 24, 16, 12 }; // based on normals data position + float32_t3 computeDiffuse(Material mat, float32_t3 light_dir, float32_t3 normal) { // Lambertian @@ -213,7 +216,6 @@ float32_t3 computeSpecular(Material mat, float32_t3 view_dir, if (mat.illum < 2) return float32_t3(0, 0, 0); - // Compute specular only if not in shadow const float32_t kPi = 3.14159265; const float32_t kShininess = max(mat.shininess, 4.0); diff --git a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl index 7eb4efbf4..7df0c16ca 100644 --- a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl @@ -6,7 +6,7 @@ [[vk::binding(0, 0)]] RaytracingAccelerationStructure topLevelAS; #if defined(USE_COLOR_PAYLOAD) -using AnyHitPayload = ColorPayload; +using AnyHitPayload = HitPayload; #elif defined(USE_SHADOW_PAYLOAD) using AnyHitPayload = ShadowPayload; #endif diff --git a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl index bee5429a8..a0dd973e6 100644 --- a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl @@ -112,47 +112,16 @@ VertexData fetchVertexData(int instID, int primID, STriangleGeomInfo geom, float } [shader("closesthit")] -void main(inout ColorPayload p, in BuiltInTriangleIntersectionAttributes attribs) +void main(inout HitPayload payload, in BuiltInTriangleIntersectionAttributes 
attribs) { const int instID = InstanceID(); const int primID = PrimitiveIndex(); const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo)); const VertexData vertexData = fetchVertexData(instID, primID, geom, attribs.barycentrics); - const float32_t3 worldPosition = mul(ObjectToWorld3x4(), float32_t4(vertexData.position, 1)); const float32_t3 worldNormal = normalize(mul(vertexData.normal, WorldToObject3x4()).xyz); - const Material material = unpackMaterial(geom.material); - RayLight cLight; - cLight.inHitPosition = worldPosition; - CallShader(pc.light.type, cLight); + payload.material = geom.material; + payload.worldNormal = worldNormal; + payload.rayDistance = RayTCurrent(); - float32_t3 diffuse = computeDiffuse(material, cLight.outLightDir, worldNormal); - float32_t3 specular = float32_t3(0, 0, 0); - float32_t attenuation = 1; - - if (dot(worldNormal, cLight.outLightDir) > 0) - { - RayDesc rayDesc; - rayDesc.Origin = WorldRayOrigin() + WorldRayDirection() * RayTCurrent(); - rayDesc.Direction = cLight.outLightDir; - rayDesc.TMin = 0.01; - rayDesc.TMax = cLight.outLightDistance; - - uint flags = RAY_FLAG_SKIP_CLOSEST_HIT_SHADER; - ShadowPayload shadowPayload; - shadowPayload.isShadowed = true; - shadowPayload.seed = p.seed; - TraceRay(topLevelAS, flags, 0xFF, ERT_OCCLUSION, 0, EMT_OCCLUSION, rayDesc, shadowPayload); - - bool isShadowed = shadowPayload.isShadowed; - if (isShadowed) - { - attenuation = 0.3; - } - else - { - specular = computeSpecular(material, WorldRayDirection(), cLight.outLightDir, worldNormal); - } - } - p.hitValue = (cLight.outIntensity * attenuation * (diffuse + specular)); } \ No newline at end of file diff --git a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl index 43b052630..facba537c 100644 --- a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl +++ 
b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl @@ -41,19 +41,59 @@ void main() const float32_t4 tmp = mul(pc.invMVP, float32_t4(d.x, d.y, 1, 1)); const float32_t3 targetPos = tmp.xyz / tmp.w; - float32_t3 direction = normalize(targetPos - pc.camPos); + const float32_t3 camDirection = normalize(targetPos - pc.camPos); RayDesc rayDesc; rayDesc.Origin = pc.camPos; - rayDesc.Direction = direction; + rayDesc.Direction = camDirection; rayDesc.TMin = 0.001; rayDesc.TMax = 10000.0; - ColorPayload payload; + HitPayload payload; payload.seed = seed; TraceRay(topLevelAS, RAY_FLAG_NONE, 0xff, ERT_PRIMARY, 0, EMT_PRIMARY, rayDesc, payload); - hitValues += payload.hitValue; + if (payload.rayDistance < 0) + { + hitValues += float32_t3(0.3, 0.3, 0.3); + continue; + } + + const float32_t3 worldPosition = pc.camPos + (camDirection * payload.rayDistance); + const float32_t3 worldNormal = payload.worldNormal; + const Material material = unpackMaterial(payload.material); + RayLight cLight; + cLight.inHitPosition = worldPosition; + CallShader(pc.light.type, cLight); + + const float32_t3 diffuse = computeDiffuse(material, cLight.outLightDir, worldNormal); + float32_t3 specular = float32_t3(0, 0, 0); + float32_t attenuation = 1; + + if (dot(worldNormal, cLight.outLightDir) > 0) + { + RayDesc rayDesc; + rayDesc.Origin = worldPosition; + rayDesc.Direction = cLight.outLightDir; + rayDesc.TMin = 0.01; + rayDesc.TMax = 100000; + + ShadowPayload shadowPayload; + shadowPayload.isShadowed = true; + shadowPayload.seed = seed; + TraceRay(topLevelAS, RAY_FLAG_SKIP_CLOSEST_HIT_SHADER, 0xFF, ERT_PRIMARY, 0, EMT_OCCLUSION, rayDesc, shadowPayload); + + bool isShadowed = shadowPayload.isShadowed; + if (isShadowed) + { + attenuation = 0.3; + } + else + { + specular = computeSpecular(material, camDirection, cLight.outLightDir, worldNormal); + } + } + hitValues += (cLight.outIntensity * attenuation * (diffuse + specular)); } float32_t3 hitValue = hitValues / s_sampleCount; diff --git 
a/71_RayTracingPipeline/app_resources/raytrace.rmiss.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rmiss.hlsl index 70db3b0e4..602104a19 100644 --- a/71_RayTracingPipeline/app_resources/raytrace.rmiss.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace.rmiss.hlsl @@ -1,8 +1,7 @@ #include "common.hlsl" [shader("miss")] -void main(inout ColorPayload p) +void main(inout HitPayload payload) { - p.hitValue = float32_t3(0.3, 0.3, 0.6); - + payload.rayDistance = -1; } diff --git a/71_RayTracingPipeline/app_resources/raytraceShadow.rmiss.hlsl b/71_RayTracingPipeline/app_resources/raytraceShadow.rmiss.hlsl index 295e721f2..c1ea42173 100644 --- a/71_RayTracingPipeline/app_resources/raytraceShadow.rmiss.hlsl +++ b/71_RayTracingPipeline/app_resources/raytraceShadow.rmiss.hlsl @@ -1,7 +1,7 @@ #include "common.hlsl" [shader("miss")] -void main(inout ShadowPayload p) +void main(inout ShadowPayload payload) { - p.isShadowed = false; + payload.isShadowed = false; } diff --git a/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl index c056f3925..227bfa092 100644 --- a/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl @@ -5,7 +5,7 @@ [[vk::binding(0, 0)]] RaytracingAccelerationStructure topLevelAS; [shader("closesthit")] -void main(inout ColorPayload p, in BuiltInTriangleIntersectionAttributes attribs) +void main(inout HitPayload payload, in BuiltInTriangleIntersectionAttributes attribs) { const int instID = InstanceID(); const int primID = PrimitiveIndex(); @@ -16,46 +16,8 @@ void main(inout ColorPayload p, in BuiltInTriangleIntersectionAttributes attribs // Computing the normal at hit position float32_t3 worldNormal = normalize(worldPosition - sphere.center); - RayLight cLight; - cLight.inHitPosition = worldPosition; - CallShader(pc.light.type, cLight); + payload.material = sphere.material; + 
payload.worldNormal = worldNormal; + payload.rayDistance = RayTCurrent(); - // Material of the object - Material material = unpackMaterial(sphere.material); - - // Diffuse - float3 diffuse = computeDiffuse(material, cLight.outLightDir, worldNormal); - float3 specular = float3(0, 0, 0); - float attenuation = 1; - - // Tracing shadow ray only if the light is visible from the surface - if (dot(worldNormal, cLight.outLightDir) > 0) - { - RayDesc rayDesc; - rayDesc.Origin = WorldRayOrigin() + WorldRayDirection() * RayTCurrent(); - rayDesc.Direction = cLight.outLightDir; - rayDesc.TMin = 0.01; - rayDesc.TMax = cLight.outLightDistance; - - uint flags = - RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH | RAY_FLAG_FORCE_OPAQUE | - RAY_FLAG_SKIP_CLOSEST_HIT_SHADER; - - ShadowPayload shadowPayload; - shadowPayload.isShadowed = true; - shadowPayload.seed = p.seed; - TraceRay(topLevelAS, flags, 0xFF, ERT_OCCLUSION, 0, EMT_OCCLUSION, rayDesc, shadowPayload); - - bool isShadowed = shadowPayload.isShadowed; - if (isShadowed) - { - attenuation = 0.3; - } - else - { - specular = computeSpecular(material, WorldRayDirection(), cLight.outLightDir, worldNormal); - } - } - - p.hitValue = (cLight.outIntensity * attenuation * (diffuse + specular)); } \ No newline at end of file From 6dc25eb438bed7c9a729da6520b930d571410d20 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 11 Feb 2025 14:04:11 +0700 Subject: [PATCH 033/296] rectangle sampling --- .../app_resources/hlsl/common.hlsl | 102 +++++++++++++++++- 1 file changed, 101 insertions(+), 1 deletion(-) diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl index dfc500beb..9295b459b 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl @@ -51,6 +51,13 @@ struct Ray Payload payload; }; +template +struct Light +{ + Spectrum radiance; + uint32_t objectID; +}; + enum ProceduralShapeType : uint16_t { PST_SPHERE, @@ -191,7 +198,7 
@@ struct Shape return nbl::hlsl::cross(edges[0], edges[1]) * 0.5f; } - float deferredPdf(NBL_CONST_REF_ARG(Light light), NBL_CONST_REF_ARG(Ray) ray) + float deferredPdf(NBL_CONST_REF_ARG(Light light), NBL_CONST_REF_ARG(Ray) ray) { const float32_t3 L = ray.direction; switch (polygonMethod) @@ -338,6 +345,99 @@ struct Shape basis = nbl::hlsl::transpose(basis); // TODO: double check transpose } + float deferredPdf(NBL_CONST_REF_ARG(Light light), NBL_CONST_REF_ARG(Ray) ray) + { + switch (polygonMethod) + { + case PPM_AREA: + { + const float dist = ray.intersectionT; + return dist * dist / nbl::hlsl::abs(nbl::hlsl::dot(getNormalTimesArea(), L)); + } + break; + // #ifdef TRIANGLE_REFERENCE ? + case PPM_SOLID_ANGLE: + { + float pdf; + float32_t3x3 rectNormalBasis; + float32_t2 rectExtents; + getNormalBasis(rectNormalBasis, rectExtents); + shapes::SphericalRectangle sphR0 = shapes::SphericalRectangle::create(ray.origin, offset, rectNormalBasis); + float solidAngle = sphR0.solidAngleOfRectangle(rectExtents); + if (solidAngle > numeric_limits::min) + pdf = 1.f / solidAngle; + else + pdf = numeric_limits::infinity; + return pdf; + } + break; + case PPM_APPROX_PROJECTED_SOLID_ANGLE: + { + return numeric_limits::infinity; + } + break; + default: + return numeric_limits::infinity; + } + } + + template + float32_t3 generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, float32_t3 xi, uint32_t objectID) + { + const float32_t3 N = getNormalTimesArea(); + const float32_t3 origin2origin = offset - origin; + + switch (polygonMethod) + { + case PPM_AREA: + { + float32_t3 L = origin2origin + edge0 * xi.x + edge1 * xi.y; + const float distSq = nbl::hlsl::dot(L, L); + const float rcpDist = 1.0 / nbl::hlsl::sqrt(distSq); + L *= rcpDist; + pdf = distSq / nbl::hlsl::abs(nbl::hlsl::dot(N, L)); + newRayMaxT = 1.0 / rcpDist; + return L; + } + break; + // #ifdef TRIANGLE_REFERENCE ? 
+ case PPM_SOLID_ANGLE: + { + float pdf; + float32_t3x3 rectNormalBasis; + float32_t2 rectExtents; + getNormalBasis(rectNormalBasis, rectExtents); + shapes::SphericalRectangle sphR0 = shapes::SphericalRectangle::create(origin, offset, rectNormalBasis); + float32_t3 L = (float32_t3)0.0; + float solidAngle = sphR0.solidAngleOfRectangle(rectExtents); + + sampling::SphericalRectangle ssph = sampling::SphericalRectangle::create(sphR0); + float32_t2 sphUv = ssph.generate(rectExtents, xi.xy, solidAngle); + if (solidAngle > numeric_limits::min) + { + float32_t3 sph_sample = sphUv[0] * edge0 + sphUv[1] * edge1 + offset; + L = nbl::hlsl::normalize(sph_sample - origin); + pdf = 1.f / solidAngle; + } + else + pdf = numeric_limits::infinity; + + newRayMaxT = nbl::hlsl::dot(N, origin2origin) / nbl::hlsl::dot(N, L); + return L; + } + break; + case PPM_APPROX_PROJECTED_SOLID_ANGLE: + { + pdf = numeric_limits::infinity; + return (float32_t3)0.0; + } + break; + default: + pdf = numeric_limits::infinity; + return (float32_t3)0.0; + } + } + NBL_CONSTEXPR_STATIC_INLINE uint32_t ObjSize = 10; float32_t3 offset; From 73b3f9915da069a7efc8e1d6f5e617b85b06742b Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 11 Feb 2025 16:27:44 +0700 Subject: [PATCH 034/296] Assign Blas flag according the the primitives transparency --- 71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl | 2 -- 71_RayTracingPipeline/main.cpp | 4 +++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl index 7df0c16ca..598e271a5 100644 --- a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl @@ -18,8 +18,6 @@ void main(inout AnyHitPayload p, in BuiltInTriangleIntersectionAttributes attrib const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo)); const 
Material material = unpackMaterial(geom.material); - if (material.illum != 4) - return; uint32_t seed = p.seed; if (material.dissolve == 0.0) diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index 015f08a42..fb0dca14f 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -1451,7 +1451,9 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, triangles[i].vertexStride = vertexStride; triangles[i].vertexFormat = EF_R32G32B32_SFLOAT; triangles[i].indexType = gpuObject.indexType; - triangles[i].geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::NO_DUPLICATE_ANY_HIT_INVOCATION_BIT; + triangles[i].geometryFlags = gpuObject.material.illum == 4 ? + IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::NO_DUPLICATE_ANY_HIT_INVOCATION_BIT : + IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; blasBuildInfos[i].triangles = &triangles[i]; } From adb7bb612a19e5e4b9c932f5c7f77d9c0b26a3c6 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 11 Feb 2025 16:30:09 +0700 Subject: [PATCH 035/296] Store material information in intersection shader --- 71_RayTracingPipeline/app_resources/common.hlsl | 6 ++++++ .../app_resources/raytrace.rint.hlsl | 15 +++++++-------- .../app_resources/raytrace_procedural.rchit.hlsl | 14 ++++---------- 3 files changed, 17 insertions(+), 18 deletions(-) diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl index b56155855..a57fa82dd 100644 --- a/71_RayTracingPipeline/app_resources/common.hlsl +++ b/71_RayTracingPipeline/app_resources/common.hlsl @@ -167,6 +167,12 @@ struct RayLight float32_t outIntensity; }; +struct ProceduralHitAttribute +{ + MaterialPacked material; + float32_t3 center; +}; + #ifdef __HLSL_VERSION diff --git a/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl index f302543b6..b9941fc59 100644 --- 
a/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl @@ -8,11 +8,6 @@ struct Ray float32_t3 direction; }; -struct Attrib -{ - float3 HitAttribute; -}; - // Ray-Sphere intersection // http://viclw17.github.io/2018/07/16/raytracing-ray-sphere-intersection/ float32_t hitSphere(SProceduralGeomInfo s, Ray r) @@ -45,10 +40,14 @@ void main() // Sphere data SProceduralGeomInfo sphere = vk::RawBufferLoad < SProceduralGeomInfo > (pc.proceduralGeomInfoBuffer + primID * sizeof(SProceduralGeomInfo)); - float32_t tHit = hitSphere(sphere, ray); + const float32_t tHit = hitSphere(sphere, ray); - Attrib attrib; + ProceduralHitAttribute hitAttrib; // Report hit point if (tHit > 0) - ReportHit(tHit, 0, attrib); + { + hitAttrib.center = sphere.center; + hitAttrib.material = sphere.material; + ReportHit(tHit, 0, hitAttrib); + } } \ No newline at end of file diff --git a/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl index 227bfa092..48495f0fc 100644 --- a/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl @@ -5,18 +5,12 @@ [[vk::binding(0, 0)]] RaytracingAccelerationStructure topLevelAS; [shader("closesthit")] -void main(inout HitPayload payload, in BuiltInTriangleIntersectionAttributes attribs) +void main(inout HitPayload payload, in ProceduralHitAttribute attrib) { - const int instID = InstanceID(); - const int primID = PrimitiveIndex(); - float32_t3 worldPosition = WorldRayOrigin() + WorldRayDirection() * RayTCurrent(); + const float32_t3 worldPosition = WorldRayOrigin() + WorldRayDirection() * RayTCurrent(); + const float32_t3 worldNormal = normalize(worldPosition - attrib.center); - SProceduralGeomInfo sphere = vk::RawBufferLoad < SProceduralGeomInfo > (pc.proceduralGeomInfoBuffer + primID * sizeof(SProceduralGeomInfo)); - - // Computing the 
normal at hit position - float32_t3 worldNormal = normalize(worldPosition - sphere.center); - - payload.material = sphere.material; + payload.material = attrib.material; payload.worldNormal = worldNormal; payload.rayDistance = RayTCurrent(); From bc16d1bb717c3812ce3bb8e0e5253a45ba97ee43 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 11 Feb 2025 16:31:00 +0700 Subject: [PATCH 036/296] Refactor shadow ray tracing implementation to use opacity instead of stochastic method. --- .../app_resources/common.hlsl | 3 +-- .../app_resources/raytrace.rahit.hlsl | 15 ++++++--------- .../app_resources/raytrace.rgen.hlsl | 17 ++++++----------- .../app_resources/raytraceShadow.rahit.hlsl | 18 ++++++++++++++++++ .../app_resources/raytraceShadow.rmiss.hlsl | 2 +- 71_RayTracingPipeline/main.cpp | 4 ++-- 6 files changed, 34 insertions(+), 25 deletions(-) create mode 100644 71_RayTracingPipeline/app_resources/raytraceShadow.rahit.hlsl diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl index a57fa82dd..9eb7744f5 100644 --- a/71_RayTracingPipeline/app_resources/common.hlsl +++ b/71_RayTracingPipeline/app_resources/common.hlsl @@ -178,8 +178,7 @@ struct ProceduralHitAttribute struct [raypayload] ShadowPayload { - bool isShadowed : read(caller) : write(caller,miss); - uint32_t seed : read(anyhit) : write(caller); + float32_t attenuation : read(caller) : write(caller, miss, anyhit); }; struct [raypayload] HitPayload diff --git a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl index 598e271a5..d0f24c209 100644 --- a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl @@ -5,23 +5,20 @@ [[vk::binding(0, 0)]] RaytracingAccelerationStructure topLevelAS; -#if defined(USE_COLOR_PAYLOAD) -using AnyHitPayload = HitPayload; -#elif defined(USE_SHADOW_PAYLOAD) -using AnyHitPayload = ShadowPayload; -#endif - 
[shader("anyhit")] -void main(inout AnyHitPayload p, in BuiltInTriangleIntersectionAttributes attribs) +void main(inout HitPayload payload, in BuiltInTriangleIntersectionAttributes attribs) { const int instID = InstanceID(); const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo)); const Material material = unpackMaterial(geom.material); - - uint32_t seed = p.seed; + uint32_t seed = payload.seed; if (material.dissolve == 0.0) + { IgnoreHit(); + } else if (rnd(seed) > material.dissolve) + { IgnoreHit(); + } } diff --git a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl index facba537c..fb4cb45b9 100644 --- a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl @@ -68,7 +68,7 @@ void main() const float32_t3 diffuse = computeDiffuse(material, cLight.outLightDir, worldNormal); float32_t3 specular = float32_t3(0, 0, 0); - float32_t attenuation = 1; + float32_t attenuation = 0; if (dot(worldNormal, cLight.outLightDir) > 0) { @@ -76,21 +76,16 @@ void main() rayDesc.Origin = worldPosition; rayDesc.Direction = cLight.outLightDir; rayDesc.TMin = 0.01; - rayDesc.TMax = 100000; + rayDesc.TMax = cLight.outLightDistance; ShadowPayload shadowPayload; - shadowPayload.isShadowed = true; - shadowPayload.seed = seed; - TraceRay(topLevelAS, RAY_FLAG_SKIP_CLOSEST_HIT_SHADER, 0xFF, ERT_PRIMARY, 0, EMT_OCCLUSION, rayDesc, shadowPayload); + shadowPayload.attenuation = -1; // negative attenuation indicate occlusion happening. will be multiplied by -1 in miss shader. 
+ TraceRay(topLevelAS, RAY_FLAG_SKIP_CLOSEST_HIT_SHADER, 0xFF, ERT_OCCLUSION, 0, EMT_OCCLUSION, rayDesc, shadowPayload); - bool isShadowed = shadowPayload.isShadowed; - if (isShadowed) - { - attenuation = 0.3; - } - else + if (shadowPayload.attenuation > 0) { specular = computeSpecular(material, camDirection, cLight.outLightDir, worldNormal); + attenuation = shadowPayload.attenuation; } } hitValues += (cLight.outIntensity * attenuation * (diffuse + specular)); diff --git a/71_RayTracingPipeline/app_resources/raytraceShadow.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytraceShadow.rahit.hlsl new file mode 100644 index 000000000..5ac656a7b --- /dev/null +++ b/71_RayTracingPipeline/app_resources/raytraceShadow.rahit.hlsl @@ -0,0 +1,18 @@ +#include "common.hlsl" +#include "random.hlsl" + +[[vk::push_constant]] SPushConstants pc; + +[[vk::binding(0, 0)]] RaytracingAccelerationStructure topLevelAS; + +[shader("anyhit")] +void main(inout ShadowPayload payload, in BuiltInTriangleIntersectionAttributes attribs) +{ + const int instID = InstanceID(); + const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo)); + const Material material = unpackMaterial(geom.material); + + payload.attenuation = (1 - material.dissolve) * payload.attenuation; + IgnoreHit(); + +} diff --git a/71_RayTracingPipeline/app_resources/raytraceShadow.rmiss.hlsl b/71_RayTracingPipeline/app_resources/raytraceShadow.rmiss.hlsl index c1ea42173..287c38f55 100644 --- a/71_RayTracingPipeline/app_resources/raytraceShadow.rmiss.hlsl +++ b/71_RayTracingPipeline/app_resources/raytraceShadow.rmiss.hlsl @@ -3,5 +3,5 @@ [shader("miss")] void main(inout ShadowPayload payload) { - payload.isShadowed = false; + payload.attenuation = payload.attenuation * -1; } diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index fb0dca14f..1594ee6c4 100644 --- a/71_RayTracingPipeline/main.cpp +++ 
b/71_RayTracingPipeline/main.cpp @@ -166,8 +166,8 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, const auto closestHitShader = loadCompileAndCreateShader("app_resources/raytrace.rchit.hlsl"); const auto proceduralClosestHitShader = loadCompileAndCreateShader("app_resources/raytrace_procedural.rchit.hlsl"); const auto intersectionHitShader = loadCompileAndCreateShader("app_resources/raytrace.rint.hlsl"); - const auto anyHitShaderColorPayload = loadCompileAndCreateShader("app_resources/raytrace.rahit.hlsl", "#define USE_COLOR_PAYLOAD\n"); - const auto anyHitShaderShadowPayload = loadCompileAndCreateShader("app_resources/raytrace.rahit.hlsl", "#define USE_SHADOW_PAYLOAD\n"); + const auto anyHitShaderColorPayload = loadCompileAndCreateShader("app_resources/raytrace.rahit.hlsl"); + const auto anyHitShaderShadowPayload = loadCompileAndCreateShader("app_resources/raytraceShadow.rahit.hlsl"); const auto missShader = loadCompileAndCreateShader("app_resources/raytrace.rmiss.hlsl"); const auto shadowMissShader = loadCompileAndCreateShader("app_resources/raytraceShadow.rmiss.hlsl"); const auto directionalLightCallShader = loadCompileAndCreateShader("app_resources/light_directional.rcall.hlsl"); From dc7db460b7d9fea4bfdb29f6d891e05c370eb9ec Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 11 Feb 2025 20:06:50 +0700 Subject: [PATCH 037/296] Remove unnecessary read of attenuation by miss shader --- 71_RayTracingPipeline/app_resources/common.hlsl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl index 9eb7744f5..bef1cb674 100644 --- a/71_RayTracingPipeline/app_resources/common.hlsl +++ b/71_RayTracingPipeline/app_resources/common.hlsl @@ -178,7 +178,7 @@ struct ProceduralHitAttribute struct [raypayload] ShadowPayload { - float32_t attenuation : read(caller) : write(caller, miss, anyhit); + float32_t attenuation : read(caller) : 
write(caller, anyhit); }; struct [raypayload] HitPayload From 86f4f4bc84cc8f9f19974664d0731161ffd80d9c Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 11 Feb 2025 20:08:21 +0700 Subject: [PATCH 038/296] Remove unnecsarry topLevelAs binding declaration --- 71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl | 2 -- 71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl | 2 -- 71_RayTracingPipeline/app_resources/raytraceShadow.rahit.hlsl | 2 -- 3 files changed, 6 deletions(-) diff --git a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl index d0f24c209..e85d5b572 100644 --- a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl @@ -3,8 +3,6 @@ [[vk::push_constant]] SPushConstants pc; -[[vk::binding(0, 0)]] RaytracingAccelerationStructure topLevelAS; - [shader("anyhit")] void main(inout HitPayload payload, in BuiltInTriangleIntersectionAttributes attribs) { diff --git a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl index a0dd973e6..c5cf70185 100644 --- a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl @@ -2,8 +2,6 @@ [[vk::push_constant]] SPushConstants pc; -[[vk::binding(0, 0)]] RaytracingAccelerationStructure topLevelAS; - float3 unpackNormals3x10(uint32_t v) { // host side changes float32_t3 to EF_A2B10G10R10_SNORM_PACK32 diff --git a/71_RayTracingPipeline/app_resources/raytraceShadow.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytraceShadow.rahit.hlsl index 5ac656a7b..3f063daba 100644 --- a/71_RayTracingPipeline/app_resources/raytraceShadow.rahit.hlsl +++ b/71_RayTracingPipeline/app_resources/raytraceShadow.rahit.hlsl @@ -3,8 +3,6 @@ [[vk::push_constant]] SPushConstants pc; -[[vk::binding(0, 0)]] RaytracingAccelerationStructure topLevelAS; - [shader("anyhit")] void main(inout ShadowPayload 
payload, in BuiltInTriangleIntersectionAttributes attribs) { From 96f25cad0585b88bcd506bd93db47601a9aeee01 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 11 Feb 2025 20:09:11 +0700 Subject: [PATCH 039/296] Rework gpu random value generation to use pcg hash and xoroshiro --- .../app_resources/common.hlsl | 2 +- .../app_resources/random.hlsl | 34 ------------------ .../app_resources/raytrace.rahit.hlsl | 8 +---- .../app_resources/raytrace.rgen.hlsl | 36 ++++++++++++------- .../app_resources/raytraceShadow.rahit.hlsl | 13 +++++-- .../app_resources/raytraceShadow.rmiss.hlsl | 1 - 6 files changed, 36 insertions(+), 58 deletions(-) delete mode 100644 71_RayTracingPipeline/app_resources/random.hlsl diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl index bef1cb674..a089b152a 100644 --- a/71_RayTracingPipeline/app_resources/common.hlsl +++ b/71_RayTracingPipeline/app_resources/common.hlsl @@ -186,7 +186,7 @@ struct [raypayload] HitPayload MaterialPacked material : read(caller) : write(closesthit); float32_t3 worldNormal : read(caller) : write(closesthit); float32_t rayDistance : read(caller) : write(closesthit, miss); - uint32_t seed : read(closesthit, anyhit) : write(caller); + float32_t dissolveThreshold : read(closesthit, anyhit) : write(caller); }; enum ObjectType : uint32_t // matches c++ diff --git a/71_RayTracingPipeline/app_resources/random.hlsl b/71_RayTracingPipeline/app_resources/random.hlsl deleted file mode 100644 index e01d7ff6c..000000000 --- a/71_RayTracingPipeline/app_resources/random.hlsl +++ /dev/null @@ -1,34 +0,0 @@ -// Generate a random unsigned int from two unsigned int values, using 16 pairs -// of rounds of the Tiny Encryption Algorithm. 
See Zafar, Olano, and Curtis, -// "GPU Random Numbers via the Tiny Encryption Algorithm" -uint32_t tea(uint32_t val0, uint32_t val1) -{ - uint32_t v0 = val0; - uint32_t v1 = val1; - uint32_t s0 = 0; - - for(uint32_t n = 0; n < 16; n++) - { - s0 += 0x9e3779b9; - v0 += ((v1 << 4) + 0xa341316c) ^ (v1 + s0) ^ ((v1 >> 5) + 0xc8013ea4); - v1 += ((v0 << 4) + 0xad90777d) ^ (v0 + s0) ^ ((v0 >> 5) + 0x7e95761e); - } - - return v0; -} - -// Generate a random unsigned int in [0, 2^24) given the previous RNG state -// using the Numerical Recipes linear congruential generator -uint32_t lcg(inout uint32_t prev) -{ - uint32_t LCG_A = 1664525u; - uint32_t LCG_C = 1013904223u; - prev = (LCG_A * prev + LCG_C); - return prev & 0x00FFFFFF; -} - -// Generate a random float32_t in [0, 1) given the previous RNG state -float32_t rnd(inout uint32_t prev) -{ - return (float32_t(lcg(prev)) / float32_t(0x01000000)); -} diff --git a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl index e85d5b572..9fece3a2d 100644 --- a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl @@ -1,5 +1,4 @@ #include "common.hlsl" -#include "random.hlsl" [[vk::push_constant]] SPushConstants pc; @@ -10,12 +9,7 @@ void main(inout HitPayload payload, in BuiltInTriangleIntersectionAttributes att const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo)); const Material material = unpackMaterial(geom.material); - uint32_t seed = payload.seed; - if (material.dissolve == 0.0) - { - IgnoreHit(); - } - else if (rnd(seed) > material.dissolve) + if (material.dissolve == 0.0 || material.dissolve < payload.dissolveThreshold) { IgnoreHit(); } diff --git a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl index fb4cb45b9..a493e13af 100644 --- 
a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl @@ -1,12 +1,13 @@ #include "common.hlsl" -#include "random.hlsl" #include "nbl/builtin/hlsl/jit/device_capabilities.hlsl" +#include "nbl/builtin/hlsl/random/xoroshiro.hlsl" #include "nbl/builtin/hlsl/glsl_compat/core.hlsl" #include "nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl" static const int32_t s_sampleCount = 10; +static const float32_t3 s_clearColor = float32_t3(0.3, 0.3, 0.8); [[vk::push_constant]] SPushConstants pc; @@ -14,9 +15,16 @@ static const int32_t s_sampleCount = 10; [[vk::binding(1, 0)]] RWTexture2D colorImage; -float32_t3 reinhardTonemap(float32_t3 v) +uint32_t pcgHash(uint32_t v) { - return v / (1.0f + v); + uint32_t state = v * 747796405u + 2891336453u; + uint32_t word = ((state >> ((state >> 28u) + 4u)) ^ state) * 277803737u; + return (word >> 22u) ^ word; +} + +float32_t nextRandomUnorm(inout nbl::hlsl::Xoroshiro64StarStar rnd) +{ + return float32_t(rnd()) / float32_t(0xFFFFFFFF); } [shader("raygeneration")] @@ -25,13 +33,16 @@ void main() uint32_t3 launchID = DispatchRaysIndex(); uint32_t3 launchSize = DispatchRaysDimensions(); uint32_t2 coords = launchID.xy; - uint32_t seed = tea(launchID.y * launchSize.x + launchID.x, pc.frameCounter); + + uint32_t seed1 = pcgHash(pc.frameCounter); + uint32_t seed2 = pcgHash(launchID.y * launchSize.x + launchID.x); + nbl::hlsl::Xoroshiro64StarStar rnd = nbl::hlsl::Xoroshiro64StarStar::construct(uint32_t2(seed1, seed2)); float32_t3 hitValues = float32_t3(0, 0, 0); for (uint32_t sample_i = 0; sample_i < s_sampleCount; sample_i++) { - const float32_t r1 = rnd(seed); - const float32_t r2 = rnd(seed); + const float32_t r1 = nextRandomUnorm(rnd); + const float32_t r2 = nextRandomUnorm(rnd); const float32_t2 subpixelJitter = pc.frameCounter == 0 ? 
float32_t2(0.5f, 0.5f) : float32_t2(r1, r2); const float32_t2 pixelCenter = float32_t2(coords) + subpixelJitter; @@ -50,12 +61,12 @@ void main() rayDesc.TMax = 10000.0; HitPayload payload; - payload.seed = seed; + payload.dissolveThreshold = nextRandomUnorm(rnd); TraceRay(topLevelAS, RAY_FLAG_NONE, 0xff, ERT_PRIMARY, 0, EMT_PRIMARY, rayDesc, payload); if (payload.rayDistance < 0) { - hitValues += float32_t3(0.3, 0.3, 0.3); + hitValues += s_clearColor; continue; } @@ -68,7 +79,7 @@ void main() const float32_t3 diffuse = computeDiffuse(material, cLight.outLightDir, worldNormal); float32_t3 specular = float32_t3(0, 0, 0); - float32_t attenuation = 0; + float32_t attenuation = 1; if (dot(worldNormal, cLight.outLightDir) > 0) { @@ -78,14 +89,15 @@ void main() rayDesc.TMin = 0.01; rayDesc.TMax = cLight.outLightDistance; + uint32_t shadowRayFlags = RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH | RAY_FLAG_FORCE_NON_OPAQUE | RAY_FLAG_SKIP_CLOSEST_HIT_SHADER; ShadowPayload shadowPayload; - shadowPayload.attenuation = -1; // negative attenuation indicate occlusion happening. will be multiplied by -1 in miss shader. - TraceRay(topLevelAS, RAY_FLAG_SKIP_CLOSEST_HIT_SHADER, 0xFF, ERT_OCCLUSION, 0, EMT_OCCLUSION, rayDesc, shadowPayload); + shadowPayload.attenuation = 1; // negative attenuation indicate occlusion happening. will be multiplied by -1 in miss shader. 
+ TraceRay(topLevelAS, shadowRayFlags, 0xFF, ERT_OCCLUSION, 0, EMT_OCCLUSION, rayDesc, shadowPayload); + attenuation = shadowPayload.attenuation; if (shadowPayload.attenuation > 0) { specular = computeSpecular(material, camDirection, cLight.outLightDir, worldNormal); - attenuation = shadowPayload.attenuation; } } hitValues += (cLight.outIntensity * attenuation * (diffuse + specular)); diff --git a/71_RayTracingPipeline/app_resources/raytraceShadow.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytraceShadow.rahit.hlsl index 3f063daba..15ac009e7 100644 --- a/71_RayTracingPipeline/app_resources/raytraceShadow.rahit.hlsl +++ b/71_RayTracingPipeline/app_resources/raytraceShadow.rahit.hlsl @@ -1,5 +1,4 @@ #include "common.hlsl" -#include "random.hlsl" [[vk::push_constant]] SPushConstants pc; @@ -10,7 +9,15 @@ void main(inout ShadowPayload payload, in BuiltInTriangleIntersectionAttributes const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo)); const Material material = unpackMaterial(geom.material); - payload.attenuation = (1 - material.dissolve) * payload.attenuation; - IgnoreHit(); + if (material.illum != 4) + { + payload.attenuation = 0; + AcceptHitAndEndSearch(); + } + else + { + payload.attenuation = (1 - material.dissolve) * payload.attenuation; + IgnoreHit(); + } } diff --git a/71_RayTracingPipeline/app_resources/raytraceShadow.rmiss.hlsl b/71_RayTracingPipeline/app_resources/raytraceShadow.rmiss.hlsl index 287c38f55..aa8df4123 100644 --- a/71_RayTracingPipeline/app_resources/raytraceShadow.rmiss.hlsl +++ b/71_RayTracingPipeline/app_resources/raytraceShadow.rmiss.hlsl @@ -3,5 +3,4 @@ [shader("miss")] void main(inout ShadowPayload payload) { - payload.attenuation = payload.attenuation * -1; } From 583c0f9a9f44bd064f9341b3ef89e9579df075be Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 11 Feb 2025 22:38:44 +0700 Subject: [PATCH 040/296] Add trace ray indirect option --- 
71_RayTracingPipeline/main.cpp | 45 +++++++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 6 deletions(-) diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index 1594ee6c4..18600f604 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -4,6 +4,7 @@ #include "common.hpp" #include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h" +#include "nbl/builtin/hlsl/indirect_commands.hlsl" class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication { @@ -411,6 +412,9 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, auto assetManager = make_smart_refctd_ptr(smart_refctd_ptr(system)); auto* geometryCreator = assetManager->getGeometryCreator(); + if (!createIndirectBuffer(gQueue)) + return logFail("Could not create indirect buffer"); + // create geometry objects if (!createGeometries(gQueue, geometryCreator)) return logFail("Could not create geometries from geometry creator"); @@ -585,6 +589,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, m_light.outerCutoff = cos(radians(dOuterCutoff)); } } + ImGui::Checkbox("Use Indirect Command", &m_useIndirectCommand); if (m_light != m_oldLight) { m_frameAccumulationCounter = 0; @@ -722,12 +727,26 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, cmdbuf->bindRayTracingPipeline(m_rayTracingPipeline.get()); cmdbuf->pushConstants(m_rayTracingPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING, 0, sizeof(SPushConstants), &pc); cmdbuf->bindDescriptorSets(EPBP_RAY_TRACING, m_rayTracingPipeline->getLayout(), 0, 1, &m_rayTracingDs.get()); - cmdbuf->traceRays( - m_shaderBindingTable.raygenGroupRange, m_shaderBindingTable.raygenGroupStride, - m_shaderBindingTable.missGroupsRange, m_shaderBindingTable.missGroupsStride, - m_shaderBindingTable.hitGroupsRange, 
m_shaderBindingTable.hitGroupsStride, - m_shaderBindingTable.callableGroupsRange, m_shaderBindingTable.callableGroupsStride, - WIN_W, WIN_H, 1); + if (m_useIndirectCommand) + { + cmdbuf->traceRaysIndirect( + m_shaderBindingTable.raygenGroupRange, m_shaderBindingTable.raygenGroupStride, + m_shaderBindingTable.missGroupsRange, m_shaderBindingTable.missGroupsStride, + m_shaderBindingTable.hitGroupsRange, m_shaderBindingTable.hitGroupsStride, + m_shaderBindingTable.callableGroupsRange, m_shaderBindingTable.callableGroupsStride, + SBufferBinding{ + .offset = 0, + .buffer = m_indirectBuffer, + }); + }else + { + cmdbuf->traceRays( + m_shaderBindingTable.raygenGroupRange, m_shaderBindingTable.raygenGroupStride, + m_shaderBindingTable.missGroupsRange, m_shaderBindingTable.missGroupsStride, + m_shaderBindingTable.hitGroupsRange, m_shaderBindingTable.hitGroupsStride, + m_shaderBindingTable.callableGroupsRange, m_shaderBindingTable.callableGroupsStride, + WIN_W, WIN_H, 1); + } } // pipeline barrier @@ -1024,6 +1043,16 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, } } + bool createIndirectBuffer(video::CThreadSafeQueueAdapter* queue) + { + const auto command = TraceRaysIndirectCommand_t{ WIN_W, WIN_H, 1 }; + IGPUBuffer::SCreationParams params; + params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INDIRECT_BUFFER_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + params.size = sizeof(TraceRaysIndirectCommand_t); + m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), &command).move_into(m_indirectBuffer); + return true; + } + bool createGeometries(video::CThreadSafeQueueAdapter* queue, const IGeometryCreator* gc) { auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); @@ -1757,6 +1786,8 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, smart_refctd_ptr m_triangleGeomInfoBuffer; 
smart_refctd_ptr m_proceduralGeomInfoBuffer; smart_refctd_ptr m_proceduralAabbBuffer; + smart_refctd_ptr m_indirectBuffer; + smart_refctd_ptr m_hdrImage; smart_refctd_ptr m_hdrImageView; @@ -1771,5 +1802,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, smart_refctd_ptr m_converter; + bool m_useIndirectCommand = false; + }; NBL_MAIN_FUNC(RaytracingPipelineApp) From 0797b331ec2feacba4c03af7a26182994d443652 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 11 Feb 2025 22:58:10 +0700 Subject: [PATCH 041/296] Use matrix changes to reset frameAccumulationCounter --- 71_RayTracingPipeline/main.cpp | 14 +++++++++----- common/include/CCamera.hpp | 14 ++------------ 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index 18600f604..e471065c7 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -687,6 +687,11 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, modelMatrix.setRotation(quaternion(0, 0, 0)); core::matrix4SIMD modelViewProjectionMatrix = core::concatenateBFollowedByA(viewProjectionMatrix, modelMatrix); + if (m_cachedModelViewProjectionMatrix != modelViewProjectionMatrix) + { + m_frameAccumulationCounter = 0; + m_cachedModelViewProjectionMatrix = modelViewProjectionMatrix; + } core::matrix4SIMD invModelViewProjectionMatrix; modelViewProjectionMatrix.getInverseTransform(invModelViewProjectionMatrix); @@ -903,10 +908,9 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, m_camera.beginInputProcessing(nextPresentationTimestamp); { - bool camera_moved = false; m_mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void { - camera_moved |= m_camera.mouseProcess(events); // don't capture the events, only let camera handle them with its impl + m_camera.mouseProcess(events); // don't capture the events, only let camera handle them with its impl for 
(const auto& e : events) // here capture { @@ -921,7 +925,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, m_keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void { - camera_moved |= m_camera.keyboardProcess(events); // don't capture the events, only let camera handle them with its impl + m_camera.keyboardProcess(events); // don't capture the events, only let camera handle them with its impl for (const auto& e : events) // here capture { @@ -933,8 +937,6 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, } }, m_logger.get()); - if (camera_moved) - m_frameAccumulationCounter = 0; } m_camera.endInputProcessing(nextPresentationTimestamp); @@ -1802,6 +1804,8 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, smart_refctd_ptr m_converter; + + core::matrix4SIMD m_cachedModelViewProjectionMatrix; bool m_useIndirectCommand = false; }; diff --git a/common/include/CCamera.hpp b/common/include/CCamera.hpp index d9f31a260..797602a3b 100644 --- a/common/include/CCamera.hpp +++ b/common/include/CCamera.hpp @@ -133,9 +133,8 @@ class Camera public: // return whether camera is moved by mouse - bool mouseProcess(const nbl::ui::IMouseEventChannel::range_t& events) + void mouseProcess(const nbl::ui::IMouseEventChannel::range_t& events) { - bool cameraMoved = false; for (auto eventIt=events.begin(); eventIt!=events.end(); eventIt++) { auto ev = *eventIt; @@ -181,15 +180,11 @@ class Camera mat.transformVect(localTarget); setTarget(localTarget + pos); - - cameraMoved = true; } } - return cameraMoved; } - // return whether camera is moved by keyboard - bool keyboardProcess(const nbl::ui::IKeyboardEventChannel::range_t& events) + void keyboardProcess(const nbl::ui::IKeyboardEventChannel::range_t& events) { for(uint32_t k = 0; k < E_CAMERA_MOVE_KEYS::ECMK_COUNT; ++k) perActionDt[k] = 0.0; @@ -200,14 +195,12 @@ class Camera * And If an UP event was sent It will get 
subtracted it from this value. (Currently Disabled Because we Need better Oracle) */ - bool cameraMoved = false; for(uint32_t k = 0; k < E_CAMERA_MOVE_KEYS::ECMK_COUNT; ++k) if(keysDown[k]) { auto timeDiff = std::chrono::duration_cast(nextPresentationTimeStamp - lastVirtualUpTimeStamp).count(); assert(timeDiff >= 0); perActionDt[k] += timeDiff; - cameraMoved = true; } for (auto eventIt=events.begin(); eventIt!=events.end(); eventIt++) @@ -245,11 +238,8 @@ class Camera position = initialPosition; target = initialTarget; recomputeViewMatrix(); - cameraMoved = true; } } - - return cameraMoved; } void beginInputProcessing(std::chrono::microseconds _nextPresentationTimeStamp) From 7409e4b863a06a3a5ed5a9d099babd1129bfe806 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 11 Feb 2025 23:03:14 +0700 Subject: [PATCH 042/296] Remove comment in CCamera.hpp --- common/include/CCamera.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/common/include/CCamera.hpp b/common/include/CCamera.hpp index 797602a3b..1b0fe9c0f 100644 --- a/common/include/CCamera.hpp +++ b/common/include/CCamera.hpp @@ -132,7 +132,6 @@ class Camera public: - // return whether camera is moved by mouse void mouseProcess(const nbl::ui::IMouseEventChannel::range_t& events) { for (auto eventIt=events.begin(); eventIt!=events.end(); eventIt++) From f9c3fad711cef70d39ffff683c28fde2dc2f1199 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 11 Feb 2025 23:34:51 +0700 Subject: [PATCH 043/296] Remove unnecesarry calculation of vertex position --- .../app_resources/raytrace.rchit.hlsl | 25 ++++--------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl index c5cf70185..aedea08d2 100644 --- a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl @@ -11,13 +11,7 @@ float3 unpackNormals3x10(uint32_t v) return clamp(float3(pn) / 
511.0, -1.0, 1.0); } -struct VertexData -{ - float32_t3 position; - float32_t3 normal; -}; - -VertexData fetchVertexData(int instID, int primID, STriangleGeomInfo geom, float2 bary) +float32_t3 fetchVertexNormal(int instID, int primID, STriangleGeomInfo geom, float2 bary) { uint idxOffset = primID * 3; @@ -52,12 +46,7 @@ VertexData fetchVertexData(int instID, int primID, STriangleGeomInfo geom, float } } - const uint64_t vertexBufferAddress = geom.vertexBufferAddress; - float32_t3 p0 = vk::RawBufferLoad < float32_t3 > (vertexBufferAddress + i0 * vertexStride); - float32_t3 p1 = vk::RawBufferLoad < float32_t3 > (vertexBufferAddress + i1 * vertexStride); - float32_t3 p2 = vk::RawBufferLoad < float32_t3 > (vertexBufferAddress + i2 * vertexStride); - - const uint64_t normalVertexBufferAddress = vertexBufferAddress + s_offsetsToNormalBytes[objType]; + const uint64_t normalVertexBufferAddress = geom.vertexBufferAddress + s_offsetsToNormalBytes[objType]; float3 n0, n1, n2; switch (objType) { @@ -102,11 +91,7 @@ VertexData fetchVertexData(int instID, int primID, STriangleGeomInfo geom, float float3 barycentrics = float3(0.0, bary); barycentrics.x = 1.0 - barycentrics.y - barycentrics.z; - - VertexData data; - data.position = barycentrics.x * p0 + barycentrics.y * p1 + barycentrics.z * p2; - data.normal = normalize(barycentrics.x * n0 + barycentrics.y * n1 + barycentrics.z * n2); - return data; + return normalize(barycentrics.x * n0 + barycentrics.y * n1 + barycentrics.z * n2); } [shader("closesthit")] @@ -115,8 +100,8 @@ void main(inout HitPayload payload, in BuiltInTriangleIntersectionAttributes att const int instID = InstanceID(); const int primID = PrimitiveIndex(); const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo)); - const VertexData vertexData = fetchVertexData(instID, primID, geom, attribs.barycentrics); - const float32_t3 worldNormal = normalize(mul(vertexData.normal, 
WorldToObject3x4()).xyz); + const float32_t3 vertexNormal = fetchVertexNormal(instID, primID, geom, attribs.barycentrics); + const float32_t3 worldNormal = normalize(mul(vertexNormal, WorldToObject3x4()).xyz); payload.material = geom.material; payload.worldNormal = worldNormal; From 22bd6f970c3954049a899181085a8df06b64fbe7 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 12 Feb 2025 11:30:50 +0700 Subject: [PATCH 044/296] nee stuff --- .../app_resources/hlsl/common.hlsl | 35 +++- .../app_resources/hlsl/intersector.hlsl | 2 +- .../hlsl/next_event_estimator.hlsl | 171 ++++++++++++++++++ 3 files changed, 203 insertions(+), 5 deletions(-) diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl index 9295b459b..e5940aab0 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl @@ -54,10 +54,34 @@ struct Ray template struct Light { - Spectrum radiance; + using spectral_type = Spectrum; + + spectral_type radiance; uint32_t objectID; }; +template +struct Tolerance +{ + NBL_CONSTEXPR_STATIC_INLINE float INTERSECTION_ERROR_BOUND_LOG2 = -8.0; + + static T __common(uint32_t depth) + { + float depthRcp = 1.0 / float(depth); + return INTERSECTION_ERROR_BOUND_LOG2; + } + + static T getStart(uint32_t depth) + { + return nbl::hlsl::exp2(__common(depth)); + } + + static T getEnd(uint32_t depth) + { + return 1.0 - nbl::hlsl::exp2(__common(depth) + 1.0); + } +} + enum ProceduralShapeType : uint16_t { PST_SPHERE, @@ -114,7 +138,8 @@ struct Shape return 2.0 * numbers::pi * (1.0 - cosThetaMax); } - float deferredPdf(NBL_CONST_REF_ARG(Light light), NBL_CONST_REF_ARG(Ray) ray) + template + float deferredPdf(NBL_CONST_REF_ARG(Light) light, NBL_CONST_REF_ARG(Ray) ray) { return 1.0 / getSolidAngle(ray.origin); } @@ -198,7 +223,8 @@ struct Shape return nbl::hlsl::cross(edges[0], edges[1]) * 0.5f; } - float deferredPdf(NBL_CONST_REF_ARG(Light light), NBL_CONST_REF_ARG(Ray) 
ray) + template + float deferredPdf(NBL_CONST_REF_ARG(Light) light, NBL_CONST_REF_ARG(Ray) ray) { const float32_t3 L = ray.direction; switch (polygonMethod) @@ -345,7 +371,8 @@ struct Shape basis = nbl::hlsl::transpose(basis); // TODO: double check transpose } - float deferredPdf(NBL_CONST_REF_ARG(Light light), NBL_CONST_REF_ARG(Ray) ray) + template + float deferredPdf(NBL_CONST_REF_ARG(Light light), NBL_CONST_REF_ARG(Ray) ray) { switch (polygonMethod) { diff --git a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl index a694082fe..919816019 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl @@ -47,7 +47,7 @@ struct Comprehensive { const bool anyHit = ray.intersectionT != numeric_limits::max; const uint32_t objCount = intersect.data[0]; - const ProceduralIntersectionType type = intersect.data[1]; + const ProceduralShapeType type = intersect.data[1]; int objectID = -1; for (int i = 0; i < objCount; i++) diff --git a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl index 1afa8d12e..5d96ae13e 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl @@ -1,6 +1,8 @@ #ifndef _NBL_HLSL_EXT_NEXT_EVENT_ESTIMATOR_INCLUDED_ #define _NBL_HLSL_EXT_NEXT_EVENT_ESTIMATOR_INCLUDED_ +#include "common.hlsl" + namespace nbl { namespace hlsl @@ -10,7 +12,176 @@ namespace ext namespace NextEventEstimator { +// procedural data store: [light count] [intersect type] [obj] + +struct Event +{ + enum class Mode : uint32_t + { + RAY_QUERY, + RAY_TRACING, + PROCEDURAL + }; + + NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 128; + + uint32_t mode : 1; + unit32_t unused : 31; // possible space for flags + uint32_t data[DataSize]; +}; + +template +struct Estimator +{ + using scalar_type = typename 
Ray::scalar_type; + using ray_type = Ray; + using light_type = Light; + using spectral_type = typename Light::spectral_type; + using interaction_type = Aniso; + using quotient_pdf_type = quotient_and_pdf; + using sample_type = LightSample; + + static spectral_type proceduralDeferredEvalAndPdf(NBL_REF_ARG(scalar_type) pdf, NBL_CONST_REF_ARG(light_type) light, NBL_CONST_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(Event) event) + { + const uint32_t lightCount = event.data[0]; + const ProceduralShapeType type = event.data[1]; + + pdf = 1.0 / lightCount; + switch (type) + { + case PST_SPHERE: + { + float32_t3 position = float32_t3(asfloat(intersect.data[2 + Shape::ObjSize]), asfloat(intersect.data[2 + Shape::ObjSize + 1]), asfloat(intersect.data[2 + Shape::ObjSize + 2])); + Shape sphere = Shape::create(position, asfloat(intersect.data[2 + Shape::ObjSize + 3]), intersect.data[2 + Shape::ObjSize + 4]); + pdf *= sphere.template deferredPdf(light, ray); + } + break; + case PST_TRIANGLE: + { + float32_t3 vertex0 = float32_t3(asfloat(intersect.data[2 + Shape::ObjSize]), asfloat(intersect.data[2 + Shape::ObjSize + 1]), asfloat(intersect.data[2 + Shape::ObjSize + 2])); + float32_t3 vertex1 = float32_t3(asfloat(intersect.data[2 + Shape::ObjSize + 3]), asfloat(intersect.data[2 + Shape::ObjSize + 4]), asfloat(intersect.data[2 + Shape::ObjSize + 5])); + float32_t3 vertex2 = float32_t3(asfloat(intersect.data[2 + Shape::ObjSize + 6]), asfloat(intersect.data[2 + Shape::ObjSize + 7]), asfloat(intersect.data[2 + Shape::ObjSize + 8])); + Shape tri = Shape::create(vertex0, vertex1, vertex2, intersect.data[2 + Shape::ObjSize + 9]); + pdf *= tri.template deferredPdf(light, ray); + } + break; + case PST_RECTANGLE: + { + float32_t3 offset = float32_t3(asfloat(intersect.data[2 + Shape::ObjSize]), asfloat(intersect.data[2 + Shape::ObjSize + 1]), asfloat(intersect.data[2 + Shape::ObjSize + 2])); + float32_t3 edge0 = float32_t3(asfloat(intersect.data[2 + Shape::ObjSize + 3]), 
asfloat(intersect.data[2 + Shape::ObjSize + 4]), asfloat(intersect.data[2 + Shape::ObjSize + 5])); + float32_t3 edge1 = float32_t3(asfloat(intersect.data[2 + Shape::ObjSize + 6]), asfloat(intersect.data[2 + Shape::ObjSize + 7]), asfloat(intersect.data[2 + Shape::ObjSize + 8])); + Shape rect = Shape::create(offset, edge0, edge1, intersect.data[2 + Shape::ObjSize + 9]); + pdf *= rect.template deferredPdf(light, ray); + } + break; + default: + pdf = numeric_limits::infinity; + break; + } + + return light.radiance; + } + + static spectral_type deferredEvalAndPdf(NBL_REF_ARG(scalar_type) pdf, NBL_CONST_REF_ARG(light_type) light, NBL_CONST_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(Event) event) + { + const Event::Mode mode = event.mode; + switch (mode) + { + case Event::Mode::RAY_QUERY: + { + // TODO: do ray query stuff + } + break; + case Event::Mode::RAY_TRACING: + { + // TODO: do ray tracing stuff + } + break; + case Event::Mode::PROCEDURAL: + { + return proceduralDeferredEvalAndPdf(pdf, light, ray, event); + } + break; + default: + return (spectral_type)0.0; + } + } + + static sample_type procedural_generate_and_quotient_and_pdf(NBL_REF_ARG(quotient_pdf_type) quotient_pdf, NBL_REF_ARG(scalar_type) newRayMaxT, NBL_CONST_REF_ARG(vector3_type) origin, NBL_CONST_REF_ARG(interaction_type) interaction, bool isBSDF, NBL_CONST_REF_ARG(vector3_type) xi, unit32_t depth, NBL_CONST_REF_ARG(Event) event) + { + const uint32_t lightCount = event.data[0]; + const ProceduralShapeType type = event.data[1]; + + sample_type L; + scalar_type pdf; + switch (type) + { + case PST_SPHERE: + { + float32_t3 position = float32_t3(asfloat(intersect.data[2 + Shape::ObjSize]), asfloat(intersect.data[2 + Shape::ObjSize + 1]), asfloat(intersect.data[2 + Shape::ObjSize + 2])); + Shape sphere = Shape::create(position, asfloat(intersect.data[2 + Shape::ObjSize + 3]), intersect.data[2 + Shape::ObjSize + 4]); + L = sphere.template generate_and_pdf(pdf, newRayMaxT, origin, interaction, isBSDF, xi, 
objectID); + } + break; + case PST_TRIANGLE: + { + float32_t3 vertex0 = float32_t3(asfloat(intersect.data[2 + Shape::ObjSize]), asfloat(intersect.data[2 + Shape::ObjSize + 1]), asfloat(intersect.data[2 + Shape::ObjSize + 2])); + float32_t3 vertex1 = float32_t3(asfloat(intersect.data[2 + Shape::ObjSize + 3]), asfloat(intersect.data[2 + Shape::ObjSize + 4]), asfloat(intersect.data[2 + Shape::ObjSize + 5])); + float32_t3 vertex2 = float32_t3(asfloat(intersect.data[2 + Shape::ObjSize + 6]), asfloat(intersect.data[2 + Shape::ObjSize + 7]), asfloat(intersect.data[2 + Shape::ObjSize + 8])); + Shape tri = Shape::create(vertex0, vertex1, vertex2, intersect.data[2 + Shape::ObjSize + 9]); + L = tri.template generate_and_pdf(pdf, newRayMaxT, origin, interaction, isBSDF, xi, objectID); + } + break; + case PST_RECTANGLE: + { + float32_t3 offset = float32_t3(asfloat(intersect.data[2 + Shape::ObjSize]), asfloat(intersect.data[2 + Shape::ObjSize + 1]), asfloat(intersect.data[2 + Shape::ObjSize + 2])); + float32_t3 edge0 = float32_t3(asfloat(intersect.data[2 + Shape::ObjSize + 3]), asfloat(intersect.data[2 + Shape::ObjSize + 4]), asfloat(intersect.data[2 + Shape::ObjSize + 5])); + float32_t3 edge1 = float32_t3(asfloat(intersect.data[2 + Shape::ObjSize + 6]), asfloat(intersect.data[2 + Shape::ObjSize + 7]), asfloat(intersect.data[2 + Shape::ObjSize + 8])); + Shape rect = Shape::create(offset, edge0, edge1, intersect.data[2 + Shape::ObjSize + 9]); + L = rect.template generate_and_pdf(pdf, newRayMaxT, origin, interaction, isBSDF, xi, objectID); + } + break; + default: + pdf = numeric_limits::infinity; + break; + } + + newRayMaxT *= Tolerance::getEnd(depth); + pdf *= 1.0 / lightCount; + spectral_type quo = light.radiance / pdf; + quotient_pdf = quotient_pdf_type::create(quo, pdf); + + return L; + } + static sample_type generate_and_quotient_and_pdf(NBL_REF_ARG(quotient_pdf_type) quotient_pdf, NBL_REF_ARG(scalar_type) newRayMaxT, NBL_CONST_REF_ARG(vector3_type) origin, 
NBL_CONST_REF_ARG(interaction_type) interaction, bool isBSDF, NBL_CONST_REF_ARG(vector3_type) xi, unit32_t depth, NBL_CONST_REF_ARG(Event) event) + { + const Event::Mode mode = event.mode; + switch (mode) + { + case Event::Mode::RAY_QUERY: + { + // TODO: do ray query stuff + } + break; + case Event::Mode::RAY_TRACING: + { + // TODO: do ray tracing stuff + } + break; + case Event::Mode::PROCEDURAL: + { + return procedural_generate_and_quotient_and_pdf(newRayMaxT, origin, interaction, isBSDF, xi, depth, event); + } + break; + default: + { + sample_type L; + return L; + } + } + } +}; } } From 0da41df6fafe255caf47cf9dc1c6a363b12a0324 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 12 Feb 2025 20:04:18 +0700 Subject: [PATCH 045/296] Tidy up variable name and file name --- .../app_resources/common.hlsl | 56 ++++++------- .../app_resources/raytrace.rahit.hlsl | 4 +- .../app_resources/raytrace.rchit.hlsl | 2 +- .../app_resources/raytrace.rgen.hlsl | 34 ++++---- .../app_resources/raytrace.rmiss.hlsl | 2 +- .../raytrace_procedural.rchit.hlsl | 4 +- ....rahit.hlsl => raytrace_shadow.rahit.hlsl} | 12 +-- ....rmiss.hlsl => raytrace_shadow.rmiss.hlsl} | 2 +- 71_RayTracingPipeline/main.cpp | 81 ++++++++++--------- 9 files changed, 99 insertions(+), 98 deletions(-) rename 71_RayTracingPipeline/app_resources/{raytraceShadow.rahit.hlsl => raytrace_shadow.rahit.hlsl} (69%) rename 71_RayTracingPipeline/app_resources/{raytraceShadow.rmiss.hlsl => raytrace_shadow.rmiss.hlsl} (51%) diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl index a089b152a..5b69c4a76 100644 --- a/71_RayTracingPipeline/app_resources/common.hlsl +++ b/71_RayTracingPipeline/app_resources/common.hlsl @@ -4,27 +4,29 @@ #include "nbl/builtin/hlsl/cpp_compat.hlsl" NBL_CONSTEXPR uint32_t WorkgroupSize = 16; +NBL_CONSTEXPR uint32_t MAX_UNORM_10 = 1023; +NBL_CONSTEXPR uint32_t MAX_UNORM_22 = 4194303; inline uint32_t packUnorm10(float32_t v) { - return trunc(v * 
1023.0f + 0.5f); + return trunc(v * float32_t(MAX_UNORM_10) + 0.5f); } inline float32_t unpackUnorm10(uint32_t packed) { - return float32_t(packed & 0x3ff) * (1.0f / 1023.0f); + return float32_t(packed & 0x3ff) * (1.0f / float32_t(MAX_UNORM_10)); } -inline uint32_t packUnorm18(float32_t v) +inline uint32_t packUnorm22(float32_t v) { - const float maxValue = 262143; + const float maxValue = float32_t(MAX_UNORM_22); return trunc(v * maxValue + 0.5f); } -inline float32_t unpackUnorm18(uint32_t packed) +inline float32_t unpackUnorm22(uint32_t packed) { - const float maxValue = 262143; - return float32_t(packed & 0x3ffff) * (1.0f / maxValue); + const float maxValue = float32_t(MAX_UNORM_22); + return float32_t(packed & 0x3fffff) * (1.0f / maxValue); } inline uint32_t packUnorm3x10(float32_t3 v) @@ -43,8 +45,12 @@ struct Material float32_t3 diffuse; float32_t3 specular; float32_t shininess; - float32_t dissolve; // 1 == opaque; 0 == fully transparent - uint32_t illum; // illumination model (see http://www.fileformat.info/format/material/) + float32_t alpha; + + bool isTransparent() NBL_CONST_MEMBER_FUNC + { + return alpha < 1.0; + } }; struct MaterialPacked @@ -52,9 +58,13 @@ struct MaterialPacked uint32_t ambient; uint32_t diffuse; uint32_t specular; - uint32_t shininess: 18; - uint32_t dissolve : 10; // 1 == opaque; 0 == fully transparent - uint32_t illum : 4; // illumination model (see http://www.fileformat.info/format/material/) + uint32_t shininess: 22; + uint32_t alpha : 10; + + bool isTransparent() NBL_CONST_MEMBER_FUNC + { + return alpha != MAX_UNORM_10; +} }; inline MaterialPacked packMaterial(Material material) @@ -63,9 +73,8 @@ inline MaterialPacked packMaterial(Material material) packed.ambient = packUnorm3x10(material.ambient); packed.diffuse = packUnorm3x10(material.diffuse); packed.specular = packUnorm3x10(material.specular); - packed.shininess = packUnorm18(material.shininess); - packed.dissolve = packUnorm10(material.dissolve); - packed.illum = 
material.illum; + packed.shininess = packUnorm22(material.shininess); + packed.alpha = packUnorm10(material.alpha); return packed; } @@ -75,9 +84,8 @@ inline Material unpackMaterial(MaterialPacked packed) material.ambient = unpackUnorm3x10(packed.ambient); material.diffuse = unpackUnorm3x10(packed.diffuse); material.specular = unpackUnorm3x10(packed.specular); - material.shininess = unpackUnorm18(packed.shininess); - material.dissolve = unpackUnorm10(packed.dissolve); - material.illum = packed.illum; + material.shininess = unpackUnorm22(packed.shininess); + material.alpha = unpackUnorm10(packed.alpha); return material; } @@ -176,17 +184,17 @@ struct ProceduralHitAttribute #ifdef __HLSL_VERSION -struct [raypayload] ShadowPayload +struct [raypayload] OcclusionPayload { float32_t attenuation : read(caller) : write(caller, anyhit); }; -struct [raypayload] HitPayload +struct [raypayload] PrimaryPayload { MaterialPacked material : read(caller) : write(closesthit); float32_t3 worldNormal : read(caller) : write(closesthit); float32_t rayDistance : read(caller) : write(closesthit, miss); - float32_t dissolveThreshold : read(closesthit, anyhit) : write(caller); + float32_t alphaThreshold : read(closesthit, anyhit) : write(caller); }; enum ObjectType : uint32_t // matches c++ @@ -207,20 +215,14 @@ static uint32_t s_offsetsToNormalBytes[OT_COUNT] = { 18, 24, 24, 20, 20, 24, 16, float32_t3 computeDiffuse(Material mat, float32_t3 light_dir, float32_t3 normal) { - // Lambertian float32_t dotNL = max(dot(normal, light_dir), 0.0); float32_t3 c = mat.diffuse * dotNL; - if (mat.illum >= 1) - c += mat.ambient; return c; } float32_t3 computeSpecular(Material mat, float32_t3 view_dir, float32_t3 light_dir, float32_t3 normal) { - if (mat.illum < 2) - return float32_t3(0, 0, 0); - const float32_t kPi = 3.14159265; const float32_t kShininess = max(mat.shininess, 4.0); diff --git a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl 
b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl index 9fece3a2d..2923e95d9 100644 --- a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl @@ -3,13 +3,13 @@ [[vk::push_constant]] SPushConstants pc; [shader("anyhit")] -void main(inout HitPayload payload, in BuiltInTriangleIntersectionAttributes attribs) +void main(inout PrimaryPayload payload, in BuiltInTriangleIntersectionAttributes attribs) { const int instID = InstanceID(); const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo)); const Material material = unpackMaterial(geom.material); - if (material.dissolve == 0.0 || material.dissolve < payload.dissolveThreshold) + if (material.alpha > payload.alphaThreshold) { IgnoreHit(); } diff --git a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl index aedea08d2..fdb252cda 100644 --- a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl @@ -95,7 +95,7 @@ float32_t3 fetchVertexNormal(int instID, int primID, STriangleGeomInfo geom, flo } [shader("closesthit")] -void main(inout HitPayload payload, in BuiltInTriangleIntersectionAttributes attribs) +void main(inout PrimaryPayload payload, in BuiltInTriangleIntersectionAttributes attribs) { const int instID = InstanceID(); const int primID = PrimitiveIndex(); diff --git a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl index a493e13af..df6a5215d 100644 --- a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl @@ -17,8 +17,8 @@ static const float32_t3 s_clearColor = float32_t3(0.3, 0.3, 0.8); uint32_t pcgHash(uint32_t v) { - uint32_t state = v * 747796405u + 2891336453u; - uint32_t word = ((state >> ((state >> 28u) + 
4u)) ^ state) * 277803737u; + const uint32_t state = v * 747796405u + 2891336453u; + const uint32_t word = ((state >> ((state >> 28u) + 4u)) ^ state) * 277803737u; return (word >> 22u) ^ word; } @@ -30,12 +30,12 @@ float32_t nextRandomUnorm(inout nbl::hlsl::Xoroshiro64StarStar rnd) [shader("raygeneration")] void main() { - uint32_t3 launchID = DispatchRaysIndex(); - uint32_t3 launchSize = DispatchRaysDimensions(); - uint32_t2 coords = launchID.xy; + const uint32_t3 launchID = DispatchRaysIndex(); + const uint32_t3 launchSize = DispatchRaysDimensions(); + const uint32_t2 coords = launchID.xy; - uint32_t seed1 = pcgHash(pc.frameCounter); - uint32_t seed2 = pcgHash(launchID.y * launchSize.x + launchID.x); + const uint32_t seed1 = pcgHash(pc.frameCounter); + const uint32_t seed2 = pcgHash(launchID.y * launchSize.x + launchID.x); nbl::hlsl::Xoroshiro64StarStar rnd = nbl::hlsl::Xoroshiro64StarStar::construct(uint32_t2(seed1, seed2)); float32_t3 hitValues = float32_t3(0, 0, 0); @@ -57,11 +57,11 @@ void main() RayDesc rayDesc; rayDesc.Origin = pc.camPos; rayDesc.Direction = camDirection; - rayDesc.TMin = 0.001; + rayDesc.TMin = 0.01; rayDesc.TMax = 10000.0; - HitPayload payload; - payload.dissolveThreshold = nextRandomUnorm(rnd); + PrimaryPayload payload; + payload.alphaThreshold = nextRandomUnorm(rnd); TraceRay(topLevelAS, RAY_FLAG_NONE, 0xff, ERT_PRIMARY, 0, EMT_PRIMARY, rayDesc, payload); if (payload.rayDistance < 0) @@ -90,20 +90,20 @@ void main() rayDesc.TMax = cLight.outLightDistance; uint32_t shadowRayFlags = RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH | RAY_FLAG_FORCE_NON_OPAQUE | RAY_FLAG_SKIP_CLOSEST_HIT_SHADER; - ShadowPayload shadowPayload; - shadowPayload.attenuation = 1; // negative attenuation indicate occlusion happening. will be multiplied by -1 in miss shader. 
- TraceRay(topLevelAS, shadowRayFlags, 0xFF, ERT_OCCLUSION, 0, EMT_OCCLUSION, rayDesc, shadowPayload); + OcclusionPayload occlusionPayload; + occlusionPayload.attenuation = 1; // negative attenuation indicate occlusion happening. will be multiplied by -1 in miss shader. + TraceRay(topLevelAS, shadowRayFlags, 0xFF, ERT_OCCLUSION, 0, EMT_OCCLUSION, rayDesc, occlusionPayload); - attenuation = shadowPayload.attenuation; - if (shadowPayload.attenuation > 0) + attenuation = occlusionPayload.attenuation; + if (occlusionPayload.attenuation > 0) { specular = computeSpecular(material, camDirection, cLight.outLightDir, worldNormal); } } - hitValues += (cLight.outIntensity * attenuation * (diffuse + specular)); + hitValues += ((cLight.outIntensity * attenuation * (diffuse + specular)) + material.ambient); } - float32_t3 hitValue = hitValues / s_sampleCount; + const float32_t3 hitValue = hitValues / s_sampleCount; if (pc.frameCounter > 0) { diff --git a/71_RayTracingPipeline/app_resources/raytrace.rmiss.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rmiss.hlsl index 602104a19..5ccfed470 100644 --- a/71_RayTracingPipeline/app_resources/raytrace.rmiss.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace.rmiss.hlsl @@ -1,7 +1,7 @@ #include "common.hlsl" [shader("miss")] -void main(inout HitPayload payload) +void main(inout PrimaryPayload payload) { payload.rayDistance = -1; } diff --git a/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl index 48495f0fc..0a58ccba8 100644 --- a/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl @@ -2,10 +2,8 @@ [[vk::push_constant]] SPushConstants pc; -[[vk::binding(0, 0)]] RaytracingAccelerationStructure topLevelAS; - [shader("closesthit")] -void main(inout HitPayload payload, in ProceduralHitAttribute attrib) +void main(inout PrimaryPayload payload, in 
ProceduralHitAttribute attrib) { const float32_t3 worldPosition = WorldRayOrigin() + WorldRayDirection() * RayTCurrent(); const float32_t3 worldNormal = normalize(worldPosition - attrib.center); diff --git a/71_RayTracingPipeline/app_resources/raytraceShadow.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl similarity index 69% rename from 71_RayTracingPipeline/app_resources/raytraceShadow.rahit.hlsl rename to 71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl index 15ac009e7..c59f7367e 100644 --- a/71_RayTracingPipeline/app_resources/raytraceShadow.rahit.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl @@ -3,21 +3,21 @@ [[vk::push_constant]] SPushConstants pc; [shader("anyhit")] -void main(inout ShadowPayload payload, in BuiltInTriangleIntersectionAttributes attribs) +void main(inout OcclusionPayload payload, in BuiltInTriangleIntersectionAttributes attribs) { const int instID = InstanceID(); const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo)); const Material material = unpackMaterial(geom.material); - if (material.illum != 4) + if (material.isTransparent()) { - payload.attenuation = 0; - AcceptHitAndEndSearch(); + payload.attenuation = material.alpha * payload.attenuation; + IgnoreHit(); } else { - payload.attenuation = (1 - material.dissolve) * payload.attenuation; - IgnoreHit(); + payload.attenuation = 0; + AcceptHitAndEndSearch(); } } diff --git a/71_RayTracingPipeline/app_resources/raytraceShadow.rmiss.hlsl b/71_RayTracingPipeline/app_resources/raytrace_shadow.rmiss.hlsl similarity index 51% rename from 71_RayTracingPipeline/app_resources/raytraceShadow.rmiss.hlsl rename to 71_RayTracingPipeline/app_resources/raytrace_shadow.rmiss.hlsl index aa8df4123..baad9a3e9 100644 --- a/71_RayTracingPipeline/app_resources/raytraceShadow.rmiss.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace_shadow.rmiss.hlsl @@ -1,6 
+1,6 @@ #include "common.hlsl" [shader("miss")] -void main(inout ShadowPayload payload) +void main(inout OcclusionPayload payload) { } diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index e471065c7..d457e37dc 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -138,7 +138,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, } // Load Custom Shader - auto loadCompileAndCreateShader = [&](const std::string& relPath, const std::string& header = "") -> smart_refctd_ptr + auto loadCompileAndCreateShader = [&](const std::string& relPath) -> smart_refctd_ptr { IAssetLoader::SAssetLoadParams lp = {}; lp.logger = m_logger.get(); @@ -153,13 +153,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, if (!sourceRaw) return nullptr; - smart_refctd_ptr source = CHLSLCompiler::createOverridenCopy( - sourceRaw.get(), - "%s\n", - header.c_str() - ); - - return m_device->createShader({ source.get(), nullptr, shaderReadCache.get(), shaderWriteCache.get() }); + return m_device->createShader({ sourceRaw.get(), nullptr, shaderReadCache.get(), shaderWriteCache.get() }); }; // load shaders @@ -168,9 +162,9 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, const auto proceduralClosestHitShader = loadCompileAndCreateShader("app_resources/raytrace_procedural.rchit.hlsl"); const auto intersectionHitShader = loadCompileAndCreateShader("app_resources/raytrace.rint.hlsl"); const auto anyHitShaderColorPayload = loadCompileAndCreateShader("app_resources/raytrace.rahit.hlsl"); - const auto anyHitShaderShadowPayload = loadCompileAndCreateShader("app_resources/raytraceShadow.rahit.hlsl"); + const auto anyHitShaderShadowPayload = loadCompileAndCreateShader("app_resources/raytrace_shadow.rahit.hlsl"); const auto missShader = loadCompileAndCreateShader("app_resources/raytrace.rmiss.hlsl"); - const auto shadowMissShader = 
loadCompileAndCreateShader("app_resources/raytraceShadow.rmiss.hlsl"); + const auto shadowMissShader = loadCompileAndCreateShader("app_resources/raytrace_shadow.rmiss.hlsl"); const auto directionalLightCallShader = loadCompileAndCreateShader("app_resources/light_directional.rcall.hlsl"); const auto pointLightCallShader = loadCompileAndCreateShader("app_resources/light_point.rcall.hlsl"); const auto spotLightCallShader = loadCompileAndCreateShader("app_resources/light_spot.rcall.hlsl"); @@ -300,14 +294,14 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, .binding = 0, .type = asset::IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE, .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, - .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING, + .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_RAYGEN, .count = 1, }, { .binding = 1, .type = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE, .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, - .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING, + .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_RAYGEN, .count = 1, } }; @@ -333,7 +327,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, RTDS_SHADOW_MISS, RTDS_CLOSEST_HIT, RTDS_SPHERE_CLOSEST_HIT, - RTDS_ANYHIT_COLOR, + RTDS_ANYHIT_PRIMARY, RTDS_ANYHIT_SHADOW, RTDS_INTERSECTION, RTDS_DIRECTIONAL_CALL, @@ -348,7 +342,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, shaders[RTDS_SHADOW_MISS] = {.shader = shadowMissShader.get()}; shaders[RTDS_CLOSEST_HIT] = {.shader = closestHitShader.get()}; shaders[RTDS_SPHERE_CLOSEST_HIT] = {.shader = proceduralClosestHitShader.get()}; - shaders[RTDS_ANYHIT_COLOR] = {.shader = anyHitShaderColorPayload.get()}; + shaders[RTDS_ANYHIT_PRIMARY] = {.shader = anyHitShaderColorPayload.get()}; shaders[RTDS_ANYHIT_SHADOW] = {.shader = anyHitShaderShadowPayload.get()}; 
shaders[RTDS_INTERSECTION] = {.shader = intersectionHitShader.get() }; shaders[RTDS_DIRECTIONAL_CALL] = {.shader = directionalLightCallShader.get()}; @@ -374,19 +368,17 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, SHitShaderGroup hitGroups[E_RAY_TYPE::ERT_COUNT * E_GEOM_TYPE::EGT_COUNT]; hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_PRIMARY)] = { .closestHitShaderIndex = RTDS_CLOSEST_HIT, - .anyHitShaderIndex = RTDS_ANYHIT_COLOR, + .anyHitShaderIndex = RTDS_ANYHIT_PRIMARY, }; hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_OCCLUSION)] = { - .closestHitShaderIndex = RTDS_CLOSEST_HIT, .anyHitShaderIndex = RTDS_ANYHIT_SHADOW, }; hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_PRIMARY)] = { .closestHitShaderIndex = RTDS_SPHERE_CLOSEST_HIT, - .anyHitShaderIndex = RTDS_ANYHIT_COLOR, + .anyHitShaderIndex = RTDS_ANYHIT_PRIMARY, .intersectionShaderIndex = RTDS_INTERSECTION, }; hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_OCCLUSION)] = { - .closestHitShaderIndex = RTDS_CLOSEST_HIT, .anyHitShaderIndex = RTDS_ANYHIT_SHADOW, .intersectionShaderIndex = RTDS_INTERSECTION, }; @@ -546,7 +538,11 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, { static matrix4SIMD projection; - projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(core::radians(fov), io.DisplaySize.x / io.DisplaySize.y, zNear, zFar); + projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH( + core::radians(m_cameraSetting.fov), + io.DisplaySize.x / io.DisplaySize.y, + m_cameraSetting.zNear, + m_cameraSetting.zFar); return projection; }()); @@ -563,11 +559,11 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, ImGui::Text("Camera"); - ImGui::SliderFloat("Move speed", &moveSpeed, 0.1f, 10.f); - ImGui::SliderFloat("Rotate speed", &rotateSpeed, 0.1f, 10.f); - ImGui::SliderFloat("Fov", &fov, 20.f, 150.f); - ImGui::SliderFloat("zNear", &zNear, 0.1f, 100.f); - ImGui::SliderFloat("zFar", &zFar, 110.f, 
10000.f); + ImGui::SliderFloat("Move speed", &m_cameraSetting.moveSpeed, 0.1f, 10.f); + ImGui::SliderFloat("Rotate speed", &m_cameraSetting.rotateSpeed, 0.1f, 10.f); + ImGui::SliderFloat("Fov", &m_cameraSetting.fov, 20.f, 150.f); + ImGui::SliderFloat("zNear", &m_cameraSetting.zNear, 0.1f, 100.f); + ImGui::SliderFloat("zFar", &m_cameraSetting.zFar, 110.f, 10000.f); Light m_oldLight = m_light; int light_type = m_light.type; ImGui::ListBox("LightType", &light_type, s_lightTypeNames, ELT_COUNT); @@ -879,8 +875,8 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, inline void update() { - m_camera.setMoveSpeed(moveSpeed); - m_camera.setRotateSpeed(rotateSpeed); + m_camera.setMoveSpeed(m_cameraSetting.moveSpeed); + m_camera.setRotateSpeed(m_cameraSetting.rotateSpeed); static std::chrono::microseconds previousEventTimestamp{}; @@ -1062,11 +1058,11 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, return logFail("Couldn't create Command Pool for geometry creation!"); const auto defaultMaterial = Material{ - .ambient = {}, + .ambient = {0.2, 0.1, 0.1}, .diffuse = {0.8, 0.3, 0.3}, .specular = {0.8, 0.8, 0.8}, .shininess = 1.0f, - .illum = 2 + .alpha = 1.0f, }; auto getTranslationMatrix = [](float32_t x, float32_t y, float32_t z) @@ -1096,11 +1092,10 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, .meta = {.type = OT_CUBE, .name = "Cube Mesh 2"}, .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)), .material = Material{ - .ambient = {}, + .ambient = {0.1, 0.1, 0.2}, .diffuse = {0.2, 0.2, 0.8}, .specular = {0.8, 0.8, 0.8}, .shininess = 1.0f, - .illum = 2 }, .transform = getTranslationMatrix(-5.0f, 1.0f, 0), }, @@ -1108,12 +1103,11 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, .meta = {.type = OT_CUBE, .name = "Transparent Cube Mesh"}, .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)), .material = Material{ - .ambient 
= {}, + .ambient = {0.1, 0.2, 0.1}, .diffuse = {0.2, 0.8, 0.2}, .specular = {0.8, 0.8, 0.8}, .shininess = 1.0f, - .dissolve = 0.2, - .illum = 4 + .alpha = 0.8, }, .transform = getTranslationMatrix(5.0f, 1.0f, 0), }, @@ -1282,11 +1276,10 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, const auto middle_i = NumberOfProceduralGeometries / 2.0; SProceduralGeomInfo sphere = { .material = packMaterial({ - .ambient = {}, + .ambient = {0.1, 0.05 * i, 0.1}, .diffuse = {0.3, 0.2 * i, 0.3}, .specular = {0.8, 0.8, 0.8}, .shininess = 1.0f, - .illum = 2 }), .center = float32_t3((i - middle_i) * 4.0, 2, 5.0), .radius = 1, @@ -1482,7 +1475,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, triangles[i].vertexStride = vertexStride; triangles[i].vertexFormat = EF_R32G32B32_SFLOAT; triangles[i].indexType = gpuObject.indexType; - triangles[i].geometryFlags = gpuObject.material.illum == 4 ? + triangles[i].geometryFlags = gpuObject.material.isTransparent() ? 
IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::NO_DUPLICATE_ANY_HIT_INVOCATION_BIT : IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; @@ -1749,10 +1742,18 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, InputSystem::ChannelReader m_mouse; InputSystem::ChannelReader m_keyboard; - float fov = 60.f, zNear = 0.1f, zFar = 10000.f, moveSpeed = 1.f, rotateSpeed = 1.f; - float viewWidth = 10.f; - float camYAngle = 165.f / 180.f * 3.14159f; - float camXAngle = 32.f / 180.f * 3.14159f; + struct CameraSetting + { + float fov = 60.f; + float zNear = 0.1f; + float zFar = 10000.f; + float moveSpeed = 1.f; + float rotateSpeed = 1.f; + float viewWidth = 10.f; + float camYAngle = 165.f / 180.f * 3.14159f; + float camXAngle = 32.f / 180.f * 3.14159f; + + } m_cameraSetting; Camera m_camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD()); Light m_light = { From 3bb858b45166be7b8c4b48a3d465697c7a6aadc8 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 13 Feb 2025 17:01:20 +0700 Subject: [PATCH 046/296] scene representation, getmeasure for pt --- .../app_resources/hlsl/common.hlsl | 21 +++ .../app_resources/hlsl/intersector.hlsl | 72 ++++++++- .../app_resources/hlsl/pathtracer.hlsl | 144 ++++++++++++++++-- 3 files changed, 227 insertions(+), 10 deletions(-) diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl index e5940aab0..7289d508d 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl @@ -474,6 +474,27 @@ struct Shape PTPolygonMethod polygonMethod; }; +struct Scene +{ + NBL_CONSTEXPR_STATIC_INLINE uint32_t maxSphereCount = 25; + NBL_CONSTEXPR_STATIC_INLINE uint32_t maxTriangleCount = 12; + NBL_CONSTEXPR_STATIC_INLINE uint32_t maxRectangleCount = 12; + + Shape spheres[maxSphereCount]; + Shape triangles[maxTriangleCount]; + Shape rectangles[maxRectangleCount]; + + 
uint32_t sphereCount; + uint32_t triangleCount; + uint32_t rectangleCount; + + Light lights[]; + // Material materials[]; + // + obj count for each + + // AS ases; +}; + } } } diff --git a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl index 919816019..23706402a 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl @@ -23,6 +23,76 @@ namespace Intersector struct IntersectData { + static IntersectData encode(uint32_t mode, ProceduralShapeType type, NBL_CONST_REF_ARG(Scene) scene) + { + IntersectData retval; + retval.mode = mode; + + uint32_t objCount = (type == PST_SPHERE) ? scene.sphereCount : + (type == PST_TRIANGLE) ? scene.triangleCount : + (type == PST_RECTANGLE) ? scene.rectangleCount : + -1; + retval.data[0] = objCount; + retval.data[1] = type; + + switch (type) + { + case PST_SPHERE: + { + for (int i = 0; i < objCount; i++) + { + Shape sphere = scene.spheres[i]; + retval.data[2 + i * Shape::ObjSize] = asuint(sphere.position.x); + retval.data[2 + i * Shape::ObjSize + 1] = asuint(sphere.position.y); + retval.data[2 + i * Shape::ObjSize + 2] = asuint(sphere.position.z); + retval.data[2 + i * Shape::ObjSize + 3] = asuint(sphere.radius); + retval.data[2 + i * Shape::ObjSize + 4] = sphere.bsdfLightIDs; + } + } + break; + case PST_TRIANGLE: + { + for (int i = 0; i < objCount; i++) + { + Shape tri = scene.triangles[i]; + retval.data[2 + i * Shape::ObjSize] = asuint(tri.vertex0.x); + retval.data[2 + i * Shape::ObjSize + 1] = asuint(tri.vertex0.y); + retval.data[2 + i * Shape::ObjSize + 2] = asuint(tri.vertex0.z); + retval.data[2 + i * Shape::ObjSize + 3] = asuint(tri.vertex1.x); + retval.data[2 + i * Shape::ObjSize + 4] = asuint(tri.vertex1.y); + retval.data[2 + i * Shape::ObjSize + 5] = asuint(tri.vertex1.z); + retval.data[2 + i * Shape::ObjSize + 6] = asuint(tri.vertex2.x); + retval.data[2 + i * Shape::ObjSize + 7] = 
asuint(tri.vertex2.y); + retval.data[2 + i * Shape::ObjSize + 8] = asuint(tri.vertex2.z); + retval.data[2 + i * Shape::ObjSize + 9] = tri.bsdfLightIDs; + } + } + break; + case PST_RECTANGLE: + { + for (int i = 0; i < objCount; i++) + { + Shape rect = scene.rectangles[i]; + retval.data[2 + i * Shape::ObjSize] = asuint(rect.offset.x); + retval.data[2 + i * Shape::ObjSize + 1] = asuint(rect.offset.y); + retval.data[2 + i * Shape::ObjSize + 2] = asuint(rect.offset.z); + retval.data[2 + i * Shape::ObjSize + 3] = asuint(rect.edge0.x); + retval.data[2 + i * Shape::ObjSize + 4] = asuint(rect.edge0.y); + retval.data[2 + i * Shape::ObjSize + 5] = asuint(rect.edge0.z); + retval.data[2 + i * Shape::ObjSize + 6] = asuint(rect.edge1.x); + retval.data[2 + i * Shape::ObjSize + 7] = asuint(rect.edge1.y); + retval.data[2 + i * Shape::ObjSize + 8] = asuint(rect.edge1.z); + retval.data[2 + i * Shape::ObjSize + 9] = rect.bsdfLightIDs; + } + } + break; + default: + // for ASes + break; + } + return retval; + } + enum class Mode : uint32_t { RAY_QUERY, @@ -49,7 +119,7 @@ struct Comprehensive const uint32_t objCount = intersect.data[0]; const ProceduralShapeType type = intersect.data[1]; - int objectID = -1; + int objectID = ray.objectID; for (int i = 0; i < objCount; i++) { float t; diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl index 9ca0f77e4..06950b825 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl @@ -10,27 +10,153 @@ namespace ext namespace PathTracer { +template +struct PathTracerCreationParams +{ + // rng gen + uint32_t2 rngState; + + // ray gen + vector pixOffsetParam; + vector camPos; + vector NDC; + matrix invMVP; + + // mat + BxDFCreation diffuseParams; + BxDFCreation conductorParams; + BxDFCreation dielectricParams; +}; + template struct Unidirectional { using this_t = Unidirectional; + using randgen_type = RandGen; + using 
raygen_type = RayGen; + using intersector_type = Intersector; + using material_system_type = MaterialSystem; + using nee_type = NextEventEstimator; + + using scalar_type = typename MaterialSystem::scalar_type; + using vector3_type = vector; + using measure_type = typename MaterialSystem::measure_type; + using ray_type = typename RayGen::ray_type; - static this_t create(RandGen randGen, - RayGen rayGen, - Intersector intersector, - MaterialSystem materialSystem, - /* PathGuider pathGuider, */ - NextEventEstimator nee) - {} + // static this_t create(RandGen randGen, + // RayGen rayGen, + // Intersector intersector, + // MaterialSystem materialSystem, + // /* PathGuider pathGuider, */ + // NextEventEstimator nee) + // {} - // closest hit + static this_t create(NBL_CONST_REF_ARG(PathTracerCreationParams) params) + { + this_t retval; + retval.randGen = randgen_type::create(params.rngState); + retval.rayGen = raygen_type::create(params.pixOffsetParam, params.camPos, params.NDC, params.invMVP); + retval.materialSystem = material_system_type::create(diffuseParams, conductorParams, dielectricParams); + return retval; + } + + // TODO: get working, what is sampleSequence stuff + vector3_type rand3d(uint32_t protoDimension, uint32_t _sample) + { + uint32_t address = spirv::bitfieldInsert(protoDimension, _sample, MAX_DEPTH_LOG2, MAX_SAMPLES_LOG2); + unit32_t3 seqVal = texelFetch(sampleSequence, int(address) + i).xyz; + seqVal ^= unit32_t3(randGen(), randGen(), randGen()); + return vector3_type(seqVal) * asfloat(0x2f800004u); + } + + bool closestHitProgram(unit32_t depth, uint32_t _sample, NBL_REF_ARG(ray_type) ray) + { + const uint32_t objectID = ray.objectID; + const vector3_type intersection = ray.origin + ray.direction * ray.intersectionT; + + uint32_t bsdfLightIDs; + } + + void missProgram(NBL_REF_ARG(ray_type) ray) + { + vector3_type finalContribution = ray.payload.throughput; + // #ifdef USE_ENVMAP + // vec2 uv = SampleSphericalMap(_immutable.direction); + // 
finalContribution *= textureLod(envMap, uv, 0.0).rgb; + // #else + const vector3_type kConstantEnvLightRadiance = vector3_type(0.15, 0.21, 0.3); // TODO: match spectral_type + finalContribution *= kConstantEnvLightRadiance; + ray.payload.accumulation += finalContribution; + // #endif + } // Li - MaterialSystem::measure_type getMeasure() + measure_type getMeasure(uint32_t numSamples, uint32_t depth, NBL_CONST_REF_ARG(Scene) scene) { // loop through bounces, do closest hit // return ray.payload.accumulation --> color + + // TODO: not hardcode this, pass value from somewhere?, where to get objects? + Intersector::IntersectData data; + + measure_type Li = (measure_type)0.0; + scalar_type meanLumaSq = 0.0; + for (uint32_t i = 0; i < numSamples; i++) + { + vector3_type uvw = rand3d(0u, i); + ray_type ray = rayGen.generate(uvw); + + // bounces + bool hit = true; + bool rayAlive = true; + for (int d = 1; d <= depth && hit && rayAlive; d += 2) + { + ray.intersectionT = numeric_limits::max; + ray.objectID = -1; // start with no intersect + + // prodedural shapes + if (scene.sphereCount > 0) + { + data = Intersector::IntersectData::encode(Intersector::IntersectData::Mode::PROCEDURAL, PST_SPHERE, scene); + ray.objectID = intersector.traceRay(ray, data); + } + + if (scene.triangleCount > 0) + { + data = Intersector::IntersectData::encode(Intersector::IntersectData::Mode::PROCEDURAL, PST_TRIANGLE, scene); + ray.objectID = intersector.traceRay(ray, data); + } + + if (scene.rectangleCount > 0) + { + data = Intersector::IntersectData::encode(Intersector::IntersectData::Mode::PROCEDURAL, PST_RECTANGLE, scene); + ray.objectID = intersector.traceRay(ray, data); + } + + // TODO: trace AS + + hit = ray.objectID != -1; + if (hit) + rayAlive = closestHitProgram(d, i, ray); + } + if (!hit) + missProgram(ray); + + spectral_type accumulation = ray.payload.accumulation; + scalar_type rcpSampleSize = 1.0 / (i + 1); + Li += (accumulation - Li) * rcpSampleSize; + + // TODO: visualize high 
variance + } + + return Li; } + + randgen_type randGen; + raygen_type rayGen; + intersector_type intersector; + material_system_type materialSystem; + nee_type nee; }; } From f5adbf6494a28624db0b6204f34b0235a8687c3c Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 14 Feb 2025 16:35:04 +0700 Subject: [PATCH 047/296] moved scene rep out, some closest hit stuff --- .../app_resources/hlsl/common.hlsl | 68 +++---- .../app_resources/hlsl/intersector.hlsl | 84 +------- .../hlsl/next_event_estimator.hlsl | 8 +- .../app_resources/hlsl/pathtracer.hlsl | 60 +++++- .../app_resources/hlsl/scene.hlsl | 190 ++++++++++++++++++ 5 files changed, 289 insertions(+), 121 deletions(-) create mode 100644 31_HLSLPathTracer/app_resources/hlsl/scene.hlsl diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl index 7289d508d..00d35a2a9 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl @@ -8,7 +8,7 @@ #include #include #include -//#include +#include namespace nbl { @@ -32,6 +32,20 @@ struct Payload // #endif }; +enum ProceduralShapeType : uint16_t +{ + PST_SPHERE, + PST_TRIANGLE, + PST_RECTANGLE +}; + +struct ObjectID +{ + uint32_t id; + uint32_t mode; + ProceduralShapeType shapeType; +}; + template struct Ray { @@ -46,7 +60,7 @@ struct Ray // mutable scalar_type intersectionT; - uint32_t objectID; + ObjectID objectID; Payload payload; }; @@ -56,10 +70,24 @@ struct Light { using spectral_type = Spectrum; + NBL_CONSTEXPR_STATIC_INLINE uint32_t INVALID_ID = 0xffffu; + spectral_type radiance; - uint32_t objectID; + ObjectID objectID; }; +template +struct BxDFNode +{ + using spectral_type = Spectrum; + using params_type = bxdf::SBxDFCreationParams; + + NBL_CONSTEXPR_STATIC_INLINE uint32_t INVALID_ID = 0xffffu; + + params_type params; + ObjectID objectID; +} + template struct Tolerance { @@ -82,13 +110,6 @@ struct Tolerance } } -enum ProceduralShapeType : uint16_t -{ - 
PST_SPHERE, - PST_TRIANGLE, - PST_RECTANGLE -}; - enum PTPolygonMethod : uint16_t { PPM_AREA, @@ -145,7 +166,7 @@ struct Shape } template - float generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, float32_t3 xi, uint32_t objectID) + float generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, float32_t3 xi) { float32_t3 Z = position - origin; const float distanceSQ = nbl::hlsl::dot(Z,Z); @@ -257,7 +278,7 @@ struct Shape } template - float32_t3 generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, float32_t3 xi, uint32_t objectID) + float32_t3 generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, float32_t3 xi) { switch(polygonMethod) { @@ -409,7 +430,7 @@ struct Shape } template - float32_t3 generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, float32_t3 xi, uint32_t objectID) + float32_t3 generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, float32_t3 xi) { const float32_t3 N = getNormalTimesArea(); const float32_t3 origin2origin = offset - origin; @@ -474,27 +495,6 @@ struct Shape PTPolygonMethod polygonMethod; }; -struct Scene -{ - NBL_CONSTEXPR_STATIC_INLINE uint32_t maxSphereCount = 25; - NBL_CONSTEXPR_STATIC_INLINE uint32_t maxTriangleCount = 12; - NBL_CONSTEXPR_STATIC_INLINE uint32_t maxRectangleCount = 12; - - Shape spheres[maxSphereCount]; - Shape triangles[maxTriangleCount]; 
- Shape rectangles[maxRectangleCount]; - - uint32_t sphereCount; - uint32_t triangleCount; - uint32_t rectangleCount; - - Light lights[]; - // Material materials[]; - // + obj count for each - - // AS ases; -}; - } } } diff --git a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl index 23706402a..b2d858ef6 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl @@ -23,76 +23,6 @@ namespace Intersector struct IntersectData { - static IntersectData encode(uint32_t mode, ProceduralShapeType type, NBL_CONST_REF_ARG(Scene) scene) - { - IntersectData retval; - retval.mode = mode; - - uint32_t objCount = (type == PST_SPHERE) ? scene.sphereCount : - (type == PST_TRIANGLE) ? scene.triangleCount : - (type == PST_RECTANGLE) ? scene.rectangleCount : - -1; - retval.data[0] = objCount; - retval.data[1] = type; - - switch (type) - { - case PST_SPHERE: - { - for (int i = 0; i < objCount; i++) - { - Shape sphere = scene.spheres[i]; - retval.data[2 + i * Shape::ObjSize] = asuint(sphere.position.x); - retval.data[2 + i * Shape::ObjSize + 1] = asuint(sphere.position.y); - retval.data[2 + i * Shape::ObjSize + 2] = asuint(sphere.position.z); - retval.data[2 + i * Shape::ObjSize + 3] = asuint(sphere.radius); - retval.data[2 + i * Shape::ObjSize + 4] = sphere.bsdfLightIDs; - } - } - break; - case PST_TRIANGLE: - { - for (int i = 0; i < objCount; i++) - { - Shape tri = scene.triangles[i]; - retval.data[2 + i * Shape::ObjSize] = asuint(tri.vertex0.x); - retval.data[2 + i * Shape::ObjSize + 1] = asuint(tri.vertex0.y); - retval.data[2 + i * Shape::ObjSize + 2] = asuint(tri.vertex0.z); - retval.data[2 + i * Shape::ObjSize + 3] = asuint(tri.vertex1.x); - retval.data[2 + i * Shape::ObjSize + 4] = asuint(tri.vertex1.y); - retval.data[2 + i * Shape::ObjSize + 5] = asuint(tri.vertex1.z); - retval.data[2 + i * Shape::ObjSize + 6] = asuint(tri.vertex2.x); - retval.data[2 
+ i * Shape::ObjSize + 7] = asuint(tri.vertex2.y); - retval.data[2 + i * Shape::ObjSize + 8] = asuint(tri.vertex2.z); - retval.data[2 + i * Shape::ObjSize + 9] = tri.bsdfLightIDs; - } - } - break; - case PST_RECTANGLE: - { - for (int i = 0; i < objCount; i++) - { - Shape rect = scene.rectangles[i]; - retval.data[2 + i * Shape::ObjSize] = asuint(rect.offset.x); - retval.data[2 + i * Shape::ObjSize + 1] = asuint(rect.offset.y); - retval.data[2 + i * Shape::ObjSize + 2] = asuint(rect.offset.z); - retval.data[2 + i * Shape::ObjSize + 3] = asuint(rect.edge0.x); - retval.data[2 + i * Shape::ObjSize + 4] = asuint(rect.edge0.y); - retval.data[2 + i * Shape::ObjSize + 5] = asuint(rect.edge0.z); - retval.data[2 + i * Shape::ObjSize + 6] = asuint(rect.edge1.x); - retval.data[2 + i * Shape::ObjSize + 7] = asuint(rect.edge1.y); - retval.data[2 + i * Shape::ObjSize + 8] = asuint(rect.edge1.z); - retval.data[2 + i * Shape::ObjSize + 9] = rect.bsdfLightIDs; - } - } - break; - default: - // for ASes - break; - } - return retval; - } - enum class Mode : uint32_t { RAY_QUERY, @@ -113,13 +43,15 @@ struct Comprehensive using scalar_type = typename Ray::scalar_type; using ray_type = Ray; - static int traceProcedural(NBL_REF_ARG(ray_type) ray, NBL_REF_ARG(IntersectData) intersect) + static ObjectID traceProcedural(NBL_REF_ARG(ray_type) ray, NBL_REF_ARG(IntersectData) intersect) { const bool anyHit = ray.intersectionT != numeric_limits::max; const uint32_t objCount = intersect.data[0]; const ProceduralShapeType type = intersect.data[1]; - int objectID = ray.objectID; + ObjectID objectID = ray.objectID; + objectID.mode = IntersectData::Mode::PROCEDURAL; + objectID.type = type; for (int i = 0; i < objCount; i++) { float t; @@ -152,13 +84,13 @@ struct Comprehensive break; default: t = numeric_limits::infinity; - break; + break; } bool closerIntersection = t > 0.0 && t < ray.intersectionT; ray.intersectionT = closerIntersection ? t : ray.intersectionT; - objectID = closerIntersection ? 
i : objectID; + objectID.id = closerIntersection ? i : objectID.id; // allowing early out results in a performance regression, WTF!? //if (anyHit && closerIntersection) @@ -167,7 +99,7 @@ struct Comprehensive return objectID; } - static int traceRay(NBL_REF_ARG(ray_type) ray, NBL_REF_ARG(IntersectData) intersect) + static ObjectID traceRay(NBL_REF_ARG(ray_type) ray, NBL_REF_ARG(IntersectData) intersect) { const IntersectData::Mode mode = intersect.mode; switch (mode) @@ -188,7 +120,7 @@ struct Comprehensive } break; default: - return -1; + return ObjectID(-1, IntersectData::Mode::PROCEDURAL, PST_SPHERE); } } }; diff --git a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl index 5d96ae13e..74cf00926 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl @@ -23,7 +23,7 @@ struct Event PROCEDURAL }; - NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 128; + NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 16; uint32_t mode : 1; unit32_t unused : 31; // possible space for flags @@ -120,7 +120,7 @@ struct Estimator { float32_t3 position = float32_t3(asfloat(intersect.data[2 + Shape::ObjSize]), asfloat(intersect.data[2 + Shape::ObjSize + 1]), asfloat(intersect.data[2 + Shape::ObjSize + 2])); Shape sphere = Shape::create(position, asfloat(intersect.data[2 + Shape::ObjSize + 3]), intersect.data[2 + Shape::ObjSize + 4]); - L = sphere.template generate_and_pdf(pdf, newRayMaxT, origin, interaction, isBSDF, xi, objectID); + L = sphere.template generate_and_pdf(pdf, newRayMaxT, origin, interaction, isBSDF, xi); } break; case PST_TRIANGLE: @@ -129,7 +129,7 @@ struct Estimator float32_t3 vertex1 = float32_t3(asfloat(intersect.data[2 + Shape::ObjSize + 3]), asfloat(intersect.data[2 + Shape::ObjSize + 4]), asfloat(intersect.data[2 + Shape::ObjSize + 5])); float32_t3 vertex2 = float32_t3(asfloat(intersect.data[2 + 
Shape::ObjSize + 6]), asfloat(intersect.data[2 + Shape::ObjSize + 7]), asfloat(intersect.data[2 + Shape::ObjSize + 8])); Shape tri = Shape::create(vertex0, vertex1, vertex2, intersect.data[2 + Shape::ObjSize + 9]); - L = tri.template generate_and_pdf(pdf, newRayMaxT, origin, interaction, isBSDF, xi, objectID); + L = tri.template generate_and_pdf(pdf, newRayMaxT, origin, interaction, isBSDF, xi); } break; case PST_RECTANGLE: @@ -138,7 +138,7 @@ struct Estimator float32_t3 edge0 = float32_t3(asfloat(intersect.data[2 + Shape::ObjSize + 3]), asfloat(intersect.data[2 + Shape::ObjSize + 4]), asfloat(intersect.data[2 + Shape::ObjSize + 5])); float32_t3 edge1 = float32_t3(asfloat(intersect.data[2 + Shape::ObjSize + 6]), asfloat(intersect.data[2 + Shape::ObjSize + 7]), asfloat(intersect.data[2 + Shape::ObjSize + 8])); Shape rect = Shape::create(offset, edge0, edge1, intersect.data[2 + Shape::ObjSize + 9]); - L = rect.template generate_and_pdf(pdf, newRayMaxT, origin, interaction, isBSDF, xi, objectID); + L = rect.template generate_and_pdf(pdf, newRayMaxT, origin, interaction, isBSDF, xi); } break; default: diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl index 06950b825..80a342a86 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl @@ -42,6 +42,10 @@ struct Unidirectional using vector3_type = vector; using measure_type = typename MaterialSystem::measure_type; using ray_type = typename RayGen::ray_type; + using light_type = Light; + using bxdfnode_type = BxDFNode; + using anisotropic_type = typename MaterialSystem::anisotropic_type; + using isotropic_type = typename anisotropic_type::isotropic_type; // static this_t create(RandGen randGen, // RayGen rayGen, @@ -69,12 +73,54 @@ struct Unidirectional return vector3_type(seqVal) * asfloat(0x2f800004u); } - bool closestHitProgram(unit32_t depth, uint32_t _sample, NBL_REF_ARG(ray_type) 
ray) + // TODO: probably will only work with procedural shapes, do the other ones + bool closestHitProgram(unit32_t depth, uint32_t _sample, NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(Scene) scene) { const uint32_t objectID = ray.objectID; const vector3_type intersection = ray.origin + ray.direction * ray.intersectionT; uint32_t bsdfLightIDs; + anisotropic_type interaction; + switch (objectID.mode) + { + // TODO + case Intersector::IntersectData::Mode::RAY_QUERY: + case Intersector::IntersectData::Mode::RAY_TRACING: + break; + case Intersector::IntersectData::Mode::PROCEDURAL: + { + bsdfLightIDs = scene.getBsdfLightIDs(objectID.id); + vector3_type N = scene.getNormal(objectID.id) + N = nbl::hlsl::normalize(N); + typename isotropic_type::ray_dir_info_type V; + V.direction = nbl::hlsl::normalize(-ray.direction); + isotropic_type iso = isotropic_type::create(V, N); + interaction = anisotropic_type::create(iso); + } + break; + default: + break; + } + + vector3_type throughput = ray.payload.throughput; + + // emissive + const uint32_t lightID = spirv::bitfieldExtract(bsdfLightIDs, 16, 16); + if (lightID != light_type::INVALID_ID) + { + float pdf; + ray.payload.accumulation += nee.deferredEvalAndPdf(pdf, lights[lightID], ray, scene.toNextEvent(lightID)) * throughput / (1.0 + pdf * pdf * ray.payload.otherTechniqueHeuristic); + } + + const uint32_t bsdfID = spirv::bitfieldExtract(bsdfLightIDs, 0, 16); + if (bsdfID == bxdfnode_type::INVALID_ID) + return false; + + // TODO: ifdef kill diffuse specular paths + + // sample lights + + // sample BSDF } void missProgram(NBL_REF_ARG(ray_type) ray) @@ -112,32 +158,32 @@ struct Unidirectional for (int d = 1; d <= depth && hit && rayAlive; d += 2) { ray.intersectionT = numeric_limits::max; - ray.objectID = -1; // start with no intersect + ray.objectID.id = -1; // start with no intersect // prodedural shapes if (scene.sphereCount > 0) { - data = Intersector::IntersectData::encode(Intersector::IntersectData::Mode::PROCEDURAL, 
PST_SPHERE, scene); + data = scene.toIntersectData(Intersector::IntersectData::Mode::PROCEDURAL, PST_SPHERE); ray.objectID = intersector.traceRay(ray, data); } if (scene.triangleCount > 0) { - data = Intersector::IntersectData::encode(Intersector::IntersectData::Mode::PROCEDURAL, PST_TRIANGLE, scene); + data = scene.toIntersectData(Intersector::IntersectData::Mode::PROCEDURAL, PST_TRIANGLE); ray.objectID = intersector.traceRay(ray, data); } if (scene.rectangleCount > 0) { - data = Intersector::IntersectData::encode(Intersector::IntersectData::Mode::PROCEDURAL, PST_RECTANGLE, scene); + data = scene.toIntersectData(Intersector::IntersectData::Mode::PROCEDURAL, PST_RECTANGLE); ray.objectID = intersector.traceRay(ray, data); } // TODO: trace AS - hit = ray.objectID != -1; + hit = ray.objectID.id != -1; if (hit) - rayAlive = closestHitProgram(d, i, ray); + rayAlive = closestHitProgram(d, i, ray, scene); } if (!hit) missProgram(ray); diff --git a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl new file mode 100644 index 000000000..ea173e1a7 --- /dev/null +++ b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl @@ -0,0 +1,190 @@ +#ifndef _NBL_HLSL_EXT_PATHTRACING_SCENE_INCLUDED_ +#define _NBL_HLSL_EXT_PATHTRACING_SCENE_INCLUDED_ + +#include "common.hlsl" +#include "material_system.hlsl" +#include "next_event_estimator.hlsl" +#include "intersector.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace ext +{ + +struct Scene +{ + NBL_CONSTEXPR_STATIC_INLINE uint32_t maxSphereCount = 25; + NBL_CONSTEXPR_STATIC_INLINE uint32_t maxTriangleCount = 12; + NBL_CONSTEXPR_STATIC_INLINE uint32_t maxRectangleCount = 12; + + Shape spheres[maxSphereCount]; + Shape triangles[maxTriangleCount]; + Shape rectangles[maxRectangleCount]; + + uint32_t sphereCount; + uint32_t triangleCount; + uint32_t rectangleCount; + + NBL_CONSTEXPR_STATIC_INLINE uint32_t maxLightCount = 4; + + Light lights[maxLightCount]; + uint32_t lightCount; + // Material 
materials[]; + // + obj count for each + + // AS ases; + + Intersector::IntersectData toIntersectData(uint32_t mode, ProceduralShapeType type) + { + Intersector::IntersectData retval; + retval.mode = mode; + + uint32_t objCount = (type == PST_SPHERE) ? sphereCount : + (type == PST_TRIANGLE) ? triangleCount : + (type == PST_RECTANGLE) ? rectangleCount : + -1; + retval.data[0] = objCount; + retval.data[1] = type; + + switch (type) + { + case PST_SPHERE: + { + for (int i = 0; i < objCount; i++) + { + Shape sphere = spheres[i]; + retval.data[2 + i * Shape::ObjSize] = asuint(sphere.position.x); + retval.data[2 + i * Shape::ObjSize + 1] = asuint(sphere.position.y); + retval.data[2 + i * Shape::ObjSize + 2] = asuint(sphere.position.z); + retval.data[2 + i * Shape::ObjSize + 3] = asuint(sphere.radius); + retval.data[2 + i * Shape::ObjSize + 4] = sphere.bsdfLightIDs; + } + } + break; + case PST_TRIANGLE: + { + for (int i = 0; i < objCount; i++) + { + Shape tri = triangles[i]; + retval.data[2 + i * Shape::ObjSize] = asuint(tri.vertex0.x); + retval.data[2 + i * Shape::ObjSize + 1] = asuint(tri.vertex0.y); + retval.data[2 + i * Shape::ObjSize + 2] = asuint(tri.vertex0.z); + retval.data[2 + i * Shape::ObjSize + 3] = asuint(tri.vertex1.x); + retval.data[2 + i * Shape::ObjSize + 4] = asuint(tri.vertex1.y); + retval.data[2 + i * Shape::ObjSize + 5] = asuint(tri.vertex1.z); + retval.data[2 + i * Shape::ObjSize + 6] = asuint(tri.vertex2.x); + retval.data[2 + i * Shape::ObjSize + 7] = asuint(tri.vertex2.y); + retval.data[2 + i * Shape::ObjSize + 8] = asuint(tri.vertex2.z); + retval.data[2 + i * Shape::ObjSize + 9] = tri.bsdfLightIDs; + } + } + break; + case PST_RECTANGLE: + { + for (int i = 0; i < objCount; i++) + { + Shape rect = rectangles[i]; + retval.data[2 + i * Shape::ObjSize] = asuint(rect.offset.x); + retval.data[2 + i * Shape::ObjSize + 1] = asuint(rect.offset.y); + retval.data[2 + i * Shape::ObjSize + 2] = asuint(rect.offset.z); + retval.data[2 + i * Shape::ObjSize + 3] = 
asuint(rect.edge0.x); + retval.data[2 + i * Shape::ObjSize + 4] = asuint(rect.edge0.y); + retval.data[2 + i * Shape::ObjSize + 5] = asuint(rect.edge0.z); + retval.data[2 + i * Shape::ObjSize + 6] = asuint(rect.edge1.x); + retval.data[2 + i * Shape::ObjSize + 7] = asuint(rect.edge1.y); + retval.data[2 + i * Shape::ObjSize + 8] = asuint(rect.edge1.z); + retval.data[2 + i * Shape::ObjSize + 9] = rect.bsdfLightIDs; + } + } + break; + default: + // for ASes + break; + } + return retval; + } + + NextEventEstimator::Event toNextEvent(uint32_t lightID) + { + NextEventEstimator::Event retval; + + ObjectID objectID = lights[lightID].objectID; + retval.mode = objectID.mode; + + retval.data[0] = lightCount; + retval.data[1] = objectID.type; + + uint32_t id = objectID.id; + switch (type) + { + case PST_SPHERE: + { + Shape sphere = spheres[id]; + retval.data[2 + Shape::ObjSize] = asuint(sphere.position.x); + retval.data[2 + Shape::ObjSize + 1] = asuint(sphere.position.y); + retval.data[2 + Shape::ObjSize + 2] = asuint(sphere.position.z); + retval.data[2 + Shape::ObjSize + 3] = asuint(sphere.radius); + retval.data[2 + Shape::ObjSize + 4] = sphere.bsdfLightIDs; + } + break; + case PST_TRIANGLE: + { + Shape tri = triangles[id]; + retval.data[2 + Shape::ObjSize] = asuint(tri.vertex0.x); + retval.data[2 + Shape::ObjSize + 1] = asuint(tri.vertex0.y); + retval.data[2 + Shape::ObjSize + 2] = asuint(tri.vertex0.z); + retval.data[2 + Shape::ObjSize + 3] = asuint(tri.vertex1.x); + retval.data[2 + Shape::ObjSize + 4] = asuint(tri.vertex1.y); + retval.data[2 + Shape::ObjSize + 5] = asuint(tri.vertex1.z); + retval.data[2 + Shape::ObjSize + 6] = asuint(tri.vertex2.x); + retval.data[2 + Shape::ObjSize + 7] = asuint(tri.vertex2.y); + retval.data[2 + Shape::ObjSize + 8] = asuint(tri.vertex2.z); + retval.data[2 + Shape::ObjSize + 9] = tri.bsdfLightIDs; + } + break; + case PST_RECTANGLE: + { + Shape rect = rectangles[id]; + retval.data[2 + Shape::ObjSize] = asuint(rect.offset.x); + retval.data[2 + 
Shape::ObjSize + 1] = asuint(rect.offset.y); + retval.data[2 + Shape::ObjSize + 2] = asuint(rect.offset.z); + retval.data[2 + Shape::ObjSize + 3] = asuint(rect.edge0.x); + retval.data[2 + Shape::ObjSize + 4] = asuint(rect.edge0.y); + retval.data[2 + Shape::ObjSize + 5] = asuint(rect.edge0.z); + retval.data[2 + Shape::ObjSize + 6] = asuint(rect.edge1.x); + retval.data[2 + Shape::ObjSize + 7] = asuint(rect.edge1.y); + retval.data[2 + Shape::ObjSize + 8] = asuint(rect.edge1.z); + retval.data[2 + Shape::ObjSize + 9] = rect.bsdfLightIDs; + } + break; + default: + // for ASes + break; + } + return retval; + } + + // TODO: get these to work with AS types as well + uint32_t getBsdfLightIDs(uint32_t id) + { + return (objectID.type == PST_SPHERE) ? spheres[id].bsdfLightIDs : + (objectID.type == PST_TRIANGLE) ? triangles[id].bsdfLightIDs : + (objectID.type == PST_RECTANGLE) ? rectangles[id].bsdfLightIDs : -1; + } + + float32_t3 getNormal(uint32_t id, NBL_CONST_REF_ARG(float32_t3) intersection) + { + return (objectID.type == PST_SPHERE) ? scene.spheres[id].getNormal(intersection) : + (objectID.type == PST_TRIANGLE) ? scene.triangles[id].getNormalTimesArea() : + (objectID.type == PST_RECTANGLE) ? 
scene.rectangles[id].getNormalTimesArea() : + (float32_t3)0.0; + } +}; + +} +} +} + +#endif From 159d1533e8d82e3c5e82165e8b79ea67c0f23111 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 17 Feb 2025 16:58:03 +0700 Subject: [PATCH 048/296] sample light part of closest hit --- .../app_resources/hlsl/common.hlsl | 1 + .../app_resources/hlsl/intersector.hlsl | 2 +- .../app_resources/hlsl/material_system.hlsl | 8 +- .../hlsl/next_event_estimator.hlsl | 2 +- .../app_resources/hlsl/pathtracer.hlsl | 132 ++++++++++++++++-- .../app_resources/hlsl/scene.hlsl | 7 +- 6 files changed, 137 insertions(+), 15 deletions(-) diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl index 00d35a2a9..7d29dabd4 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl @@ -84,6 +84,7 @@ struct BxDFNode NBL_CONSTEXPR_STATIC_INLINE uint32_t INVALID_ID = 0xffffu; + uint32_t materialType; params_type params; ObjectID objectID; } diff --git a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl index b2d858ef6..60aa7143b 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl @@ -23,7 +23,7 @@ namespace Intersector struct IntersectData { - enum class Mode : uint32_t + enum Mode : uint32_t // enum class? { RAY_QUERY, RAY_TRACING, diff --git a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl index 1f13198fa..b89bfbd40 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl @@ -15,7 +15,7 @@ namespace MaterialSystem struct Material { - enum class Type : uint32_t + enum Type : uint32_t // enum class? 
{ DIFFUSE, CONDUCTOR, @@ -29,7 +29,7 @@ struct Material uint32_t data[DataSize]; }; -template +template // NOTE: these bxdfs should match the ones in Scene BxDFNode struct System { using this_t = System; @@ -42,6 +42,10 @@ struct System using anisocache_type = typename ConductorBxDF::anisocache_type; using params_t = SBxDFParams; + using diffuse_op_type = DiffuseBxDF; + using conductor_op_type = ConductorBxDF; + using dielectric_op_type = DielectricBxDF; + static this_t create(NBL_CONST_REF_ARG(SBxDFCreationParams) diffuseParams, NBL_CONST_REF_ARG(SBxDFCreationParams) conductorParams, NBL_CONST_REF_ARG(SBxDFCreationParams) dielectricParams) { diffuseBxDF = DiffuseBxDF::create(diffuseParams); diff --git a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl index 74cf00926..c6380094d 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl @@ -16,7 +16,7 @@ namespace NextEventEstimator struct Event { - enum class Mode : uint32_t + enum Mode : uint32_t // enum class? 
{ RAY_QUERY, RAY_TRACING, diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl index 80a342a86..e4638703a 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl @@ -1,6 +1,16 @@ #ifndef _NBL_HLSL_EXT_PATHTRACER_INCLUDED_ #define _NBL_HLSL_EXT_PATHTRACER_INCLUDED_ +#include +#include +#include + +#include "rand_gen.hlsl" +#include "ray_gen.hlsl" +#include "intersector.hlsl" +#include "material_system.hlsl" +#include "next_event_estimator.hlsl" + namespace nbl { namespace hlsl @@ -41,11 +51,20 @@ struct Unidirectional using scalar_type = typename MaterialSystem::scalar_type; using vector3_type = vector; using measure_type = typename MaterialSystem::measure_type; + using sample_type = typename NextEventEstimator::sample_type; using ray_type = typename RayGen::ray_type; using light_type = Light; using bxdfnode_type = BxDFNode; using anisotropic_type = typename MaterialSystem::anisotropic_type; using isotropic_type = typename anisotropic_type::isotropic_type; + using anisocache_type = typename MaterialSystem::anisocache_type; + using isocache_type = typename anisocache_type::isocache_type; + using quotient_pdf_type = typename NextEventEstimator::quotient_pdf_type; + using params_type = typename MaterialSystem::params_t; + + using diffuse_op_type = typename MaterialSystem::diffuse_op_type; + using conductor_op_type = typename MaterialSystem::conductor_op_type; + using dielectric_op_type = typename MaterialSystem::dielectric_op_type; // static this_t create(RandGen randGen, // RayGen rayGen, @@ -73,6 +92,11 @@ struct Unidirectional return vector3_type(seqVal) * asfloat(0x2f800004u); } + scalar_type getLuma(NBL_CONST_REF_ARG(vector3_type) col) + { + return nbl::hlsl::dot(nbl::hlsl::transpose(colorspace::scRGBtoXYZ)[1], col); + } + // TODO: probably will only work with procedural shapes, do the other ones bool closestHitProgram(unit32_t 
depth, uint32_t _sample, NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(Scene) scene) { @@ -81,21 +105,22 @@ struct Unidirectional uint32_t bsdfLightIDs; anisotropic_type interaction; + isotropic_type iso_interaction; switch (objectID.mode) { // TODO - case Intersector::IntersectData::Mode::RAY_QUERY: - case Intersector::IntersectData::Mode::RAY_TRACING: + case ext::Intersector::IntersectData::Mode::RAY_QUERY: + case ext::Intersector::IntersectData::Mode::RAY_TRACING: break; - case Intersector::IntersectData::Mode::PROCEDURAL: + case ext::Intersector::IntersectData::Mode::PROCEDURAL: { bsdfLightIDs = scene.getBsdfLightIDs(objectID.id); vector3_type N = scene.getNormal(objectID.id) N = nbl::hlsl::normalize(N); typename isotropic_type::ray_dir_info_type V; V.direction = nbl::hlsl::normalize(-ray.direction); - isotropic_type iso = isotropic_type::create(V, N); - interaction = anisotropic_type::create(iso); + isotropic_type iso_interaction = isotropic_type::create(V, N); + interaction = anisotropic_type::create(iso_interaction); } break; default: @@ -116,9 +141,98 @@ struct Unidirectional if (bsdfID == bxdfnode_type::INVALID_ID) return false; + BxDFNode bxdf = scene.bxdfs[bsdfID]; + // TODO: ifdef kill diffuse specular paths + const bool isBSDF = (bxdf.materialType == ext::MaterialSystem::Material::DIFFUSE) ? bxdf_traits::type == BT_BSDF : + (bxdf.materialType == ext::MaterialSystem::Material::CONDUCTOR) ? 
bxdf_traits::type == BT_BSDF : + bxdf_traits::type == BT_BSDF; + + vector3_type eps0 = rand3d(depth, _sample); + vector3_type eps1 = rand3d(depth, _sample); + vector3_type eps2 = rand3d(depth, _sample); + + // thresholds + const scalar_type bsdfPdfThreshold = 0.0001; + const scalar_type lumaContributionThreshold = getLuma(colorspace::eotf::sRGB((vector3_type)1.0 / 255.0)); // OETF smallest perceptible value + const vector3_type throughputCIE_Y = nbl::hlsl::transpose(colorspace::sRGBtoXYZ)[1] * throughput; // TODO: this only works if spectral_type is dim 3 + const scalar_type monochromeEta = nbl::hlsl::dot(throughputCIE_Y, BSDFNode_getEta(bsdf)[0]) / (throughputCIE_Y.r + throughputCIE_Y.g + throughputCIE_Y.b); // TODO: fix getEta, what is real eta + // sample lights + const scalar_type neeProbability = 1.0;// BSDFNode_getNEEProb(bsdf); + scalar_type rcpChoiceProb; + if (!math::partitionRandVariable(neeProbability, eps0.z, rcpChoiceProb) && depth < 2u) + { + quotient_pdf_type neeContrib_pdf; + scalar_type t; + sample_type nee_sample = nee.generate_and_quotient_and_pdf( + neeContrib_pdf, t, + intersection, interaction, + isBSDF, eps0, depth + ); + + // We don't allow non watertight transmitters in this renderer + bool validPath = nee_sample.NdotL > numeric_limits::min; + // but if we allowed non-watertight transmitters (single water surface), it would make sense just to apply this line by itself + anisocache_type _cache; + validPath = validPath && anisocache_type::compute(_cache, interaction, nee_sample, monochromeEta); + + if (neeContrib_pdf.pdf < numeric_limits::max) + { + if (nbl::hlsl::any(isnan(nee_sample.L))) + ray.payload.accumulation += vector3_type(1000.f, 0.f, 0.f); + else if (nbl::hlsl::all((vector3_type)69.f == nee_sample.L)) + ray.payload.accumulation += vector3_type(0.f, 1000.f, 0.f); + else if (validPath) + { + ext::MaterialSystem::Material material; + material.type = bxdf.materialType; + params_type params; + + // TODO: does not yet account for smooth 
dielectric + if (!isBSDF && bxdf.materialType == ext::MaterialSystem::Material::DIFFUSE) + { + params = params_type::template create(nee_sample, iso_interaction, bxdf::BCM_MAX); + } + else if (!isBSDF && bxdf.materialType != ext::MaterialSystem::Material::DIFFUSE) + { + if (bxdf.params.is_aniso) + params = params_type::template create(nee_sample, interaction, _cache, bxdf::BCM_MAX); + else + { + isocache = (iso_cache)_cache; + params = params_type::template create(nee_sample, iso_interaction, isocache, bxdf::BCM_MAX); + } + } + else if (isBSDF && bxdf.materialType == ext::MaterialSystem::Material::DIFFUSE) + { + params = params_type::template create(nee_sample, iso_interaction, bxdf::BCM_ABS); + } + else if (isBSDF && bxdf.materialType != ext::MaterialSystem::Material::DIFFUSE) + { + if (bxdf.params.is_aniso) + params = params_type::template create(nee_sample, interaction, _cache, bxdf::BCM_ABS); + else + { + isocache = (iso_cache)_cache; + params = params_type::template create(nee_sample, iso_interaction, isocache, bxdf::BCM_ABS); + } + } + + quotient_pdf_type bsdf_quotient_pdf = materialSystem.quotient_and_pdf(material, params) * throughput; + neeContrib_pdf.quotient *= bsdf_quotient_pdf.quotient; + const scalar_type otherGenOverChoice = bsdf_quotient_pdf.pdf * rcpChoiceProb; + const scalar_type otherGenOverLightAndChoice = otherGenOverChoice / bsdf_quotient_pdf.pdf; + neeContrib_pdf.quotient *= otherGenOverChoice/(1.f + otherGenOverLightAndChoice * otherGenOverLightAndChoice); // balance heuristic + + // TODO: ifdef NEE only + + if (bsdf_quotient_pdf.pdf < numeric_limits::max && getLuma(neeContrib_pdf.quotient) > lumaContributionThreshold && traceRay(t,intersection+nee_sample.L*t*getStartTolerance(depth),nee_sample.L)==-1) + ray._payload.accumulation += neeContrib_pdf.quotient; + } + } + } // sample BSDF } @@ -143,7 +257,7 @@ struct Unidirectional // return ray.payload.accumulation --> color // TODO: not hardcode this, pass value from somewhere?, where to get 
objects? - Intersector::IntersectData data; + ext::Intersector::IntersectData data; measure_type Li = (measure_type)0.0; scalar_type meanLumaSq = 0.0; @@ -163,19 +277,19 @@ struct Unidirectional // prodedural shapes if (scene.sphereCount > 0) { - data = scene.toIntersectData(Intersector::IntersectData::Mode::PROCEDURAL, PST_SPHERE); + data = scene.toIntersectData(ext::Intersector::IntersectData::Mode::PROCEDURAL, PST_SPHERE); ray.objectID = intersector.traceRay(ray, data); } if (scene.triangleCount > 0) { - data = scene.toIntersectData(Intersector::IntersectData::Mode::PROCEDURAL, PST_TRIANGLE); + data = scene.toIntersectData(ext::Intersector::IntersectData::Mode::PROCEDURAL, PST_TRIANGLE); ray.objectID = intersector.traceRay(ray, data); } if (scene.rectangleCount > 0) { - data = scene.toIntersectData(Intersector::IntersectData::Mode::PROCEDURAL, PST_RECTANGLE); + data = scene.toIntersectData(ext::Intersector::IntersectData::Mode::PROCEDURAL, PST_RECTANGLE); ray.objectID = intersector.traceRay(ray, data); } diff --git a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl index ea173e1a7..fe4dea8b3 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl @@ -31,8 +31,11 @@ struct Scene Light lights[maxLightCount]; uint32_t lightCount; - // Material materials[]; - // + obj count for each + + NBL_CONSTEXPR_STATIC_INLINE uint32_t maxBxdfCount = 16; // TODO: limit change? 
+ + BxDFNode bxdfs[maxBxdfCount]; + uint32_t bxdfCount; // AS ases; From a7350db7d7e422fa5086982b3327103c06cfbe44 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 18 Feb 2025 15:23:52 +0700 Subject: [PATCH 049/296] fix bugs, reorganize traceRay --- .../app_resources/hlsl/intersector.hlsl | 32 +++++++++++++ .../app_resources/hlsl/material_system.hlsl | 16 +++++-- .../app_resources/hlsl/pathtracer.hlsl | 47 ++++++------------- .../app_resources/hlsl/scene.hlsl | 8 +++- 4 files changed, 65 insertions(+), 38 deletions(-) diff --git a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl index 60aa7143b..cf2d3ae7c 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl @@ -123,6 +123,38 @@ struct Comprehensive return ObjectID(-1, IntersectData::Mode::PROCEDURAL, PST_SPHERE); } } + + template + static ObjectID traceRay(NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(Scene) scene) + { + IntersectData data; + + ObjectID objectID; + objectID.id = -1; // start with no intersect + + // prodedural shapes + if (scene.sphereCount > 0) + { + data = scene.toIntersectData(ext::Intersector::IntersectData::Mode::PROCEDURAL, PST_SPHERE); + objectID = intersector.traceRay(ray, data); + } + + if (scene.triangleCount > 0) + { + data = scene.toIntersectData(ext::Intersector::IntersectData::Mode::PROCEDURAL, PST_TRIANGLE); + objectID = intersector.traceRay(ray, data); + } + + if (scene.rectangleCount > 0) + { + data = scene.toIntersectData(ext::Intersector::IntersectData::Mode::PROCEDURAL, PST_RECTANGLE); + objectID = intersector.traceRay(ray, data); + } + + // TODO: trace AS + + return objectID; + } }; // does everything in traceray in ex 30 diff --git a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl index b89bfbd40..1d5587443 100644 --- 
a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl @@ -41,6 +41,7 @@ struct System using anisotropic_type = typename DiffuseBxDF::anisotropic_type; using anisocache_type = typename ConductorBxDF::anisocache_type; using params_t = SBxDFParams; + using create_params_t = SBxDFCreationParams; using diffuse_op_type = DiffuseBxDF; using conductor_op_type = ConductorBxDF; @@ -53,22 +54,25 @@ struct System dielectricBxDF = DiffuseBxDF::create(dielectricParams); } - static measure_type eval(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(params_t) params) + static measure_type eval(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(create_params_t) cparams, NBL_CONST_REF_ARG(params_t) params) { switch(material.type) { case DIFFUSE: { + diffuseBxDF.init(cparams); return (measure_type)diffuseBxDF.eval(params); } break; case CONDUCTOR: { + conductorBxDF.init(cparams); return conductorBxDF.eval(params); } break; case DIELECTRIC: { + dielectricBxDF.init(cparams); return dielectricBxDF.eval(params); } break; @@ -77,22 +81,25 @@ struct System } } - static vector3_type generate(NBL_CONST_REF_ARG(Material) material, anisotropic_type interaction, vector2_type u, NBL_REF_ARG(anisocache_type) cache) + static vector3_type generate(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(create_params_t) cparams, anisotropic_type interaction, vector2_type u, NBL_REF_ARG(anisocache_type) cache) { switch(material.type) { case DIFFUSE: { + diffuseBxDF.init(cparams); return diffuseBxDF.generate(interaction, u); } break; case CONDUCTOR: { + conductorBxDF.init(cparams); return conductorBxDF.generate(interaction, u, cache); } break; case DIELECTRIC: { + dielectricBxDF.init(cparams); return dielectricBxDF.generate(interaction, u, cache); } break; @@ -101,7 +108,7 @@ struct System } } - static quotient_pdf_type quotient_and_pdf(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(params_t) params) + static 
quotient_pdf_type quotient_and_pdf(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(create_params_t) cparams, NBL_CONST_REF_ARG(params_t) params) { const float minimumProjVectorLen = 0.00000001; if (params.NdotV > minimumProjVectorLen && params.NdotL > minimumProjVectorLen) @@ -110,16 +117,19 @@ struct System { case DIFFUSE: { + diffuseBxDF.init(cparams); return diffuseBxDF.quotient_and_pdf(params); } break; case CONDUCTOR: { + conductorBxDF.init(cparams); return conductorBxDF.quotient_and_pdf(params); } break; case DIELECTRIC: { + dielectricBxDF.init(cparams); return dielectricBxDF.quotient_and_pdf(params); } break; diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl index e4638703a..8d8d9a201 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl @@ -61,6 +61,7 @@ struct Unidirectional using isocache_type = typename anisocache_type::isocache_type; using quotient_pdf_type = typename NextEventEstimator::quotient_pdf_type; using params_type = typename MaterialSystem::params_t; + using scene_type = Scene; using diffuse_op_type = typename MaterialSystem::diffuse_op_type; using conductor_op_type = typename MaterialSystem::conductor_op_type; @@ -98,7 +99,7 @@ struct Unidirectional } // TODO: probably will only work with procedural shapes, do the other ones - bool closestHitProgram(unit32_t depth, uint32_t _sample, NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(Scene) scene) + bool closestHitProgram(unit32_t depth, uint32_t _sample, NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(scene_type) scene) { const uint32_t objectID = ray.objectID; const vector3_type intersection = ray.origin + ray.direction * ray.intersectionT; @@ -157,7 +158,8 @@ struct Unidirectional const scalar_type bsdfPdfThreshold = 0.0001; const scalar_type lumaContributionThreshold = getLuma(colorspace::eotf::sRGB((vector3_type)1.0 / 255.0)); // OETF smallest 
perceptible value const vector3_type throughputCIE_Y = nbl::hlsl::transpose(colorspace::sRGBtoXYZ)[1] * throughput; // TODO: this only works if spectral_type is dim 3 - const scalar_type monochromeEta = nbl::hlsl::dot(throughputCIE_Y, BSDFNode_getEta(bsdf)[0]) / (throughputCIE_Y.r + throughputCIE_Y.g + throughputCIE_Y.b); // TODO: fix getEta, what is real eta + const measure_type eta = bxdf.params.ior0 / bxdf.params.ior1; // assume it's real, not imaginary? + const scalar_type monochromeEta = nbl::hlsl::dot(throughputCIE_Y, eta) / (throughputCIE_Y.r + throughputCIE_Y.g + throughputCIE_Y.b); // TODO: imaginary eta? // sample lights const scalar_type neeProbability = 1.0;// BSDFNode_getNEEProb(bsdf); @@ -177,6 +179,8 @@ struct Unidirectional // but if we allowed non-watertight transmitters (single water surface), it would make sense just to apply this line by itself anisocache_type _cache; validPath = validPath && anisocache_type::compute(_cache, interaction, nee_sample, monochromeEta); + bxdf.params.A = nbl::hlsl::max(bxdf.params.A, vector(0,0)); + bxdf.params.eta = monochromeEta; if (neeContrib_pdf.pdf < numeric_limits::max) { @@ -220,7 +224,7 @@ struct Unidirectional } } - quotient_pdf_type bsdf_quotient_pdf = materialSystem.quotient_and_pdf(material, params) * throughput; + quotient_pdf_type bsdf_quotient_pdf = materialSystem.quotient_and_pdf(material, bxdf.params, params) * throughput; neeContrib_pdf.quotient *= bsdf_quotient_pdf.quotient; const scalar_type otherGenOverChoice = bsdf_quotient_pdf.pdf * rcpChoiceProb; const scalar_type otherGenOverLightAndChoice = otherGenOverChoice / bsdf_quotient_pdf.pdf; @@ -228,7 +232,11 @@ struct Unidirectional // TODO: ifdef NEE only - if (bsdf_quotient_pdf.pdf < numeric_limits::max && getLuma(neeContrib_pdf.quotient) > lumaContributionThreshold && traceRay(t,intersection+nee_sample.L*t*getStartTolerance(depth),nee_sample.L)==-1) + ray_type nee_ray; + nee_ray.origin = intersection + nee_sample.L * t * 
Tolerance::getStart(depth); + nee_ray.direction = nee_sample.L; + nee_ray.intersectionT = t; + if (bsdf_quotient_pdf.pdf < numeric_limits::max && getLuma(neeContrib_pdf.quotient) > lumaContributionThreshold && intersector.traceRay(nee_ray, scene).id == -1) ray._payload.accumulation += neeContrib_pdf.quotient; } } @@ -251,14 +259,8 @@ struct Unidirectional } // Li - measure_type getMeasure(uint32_t numSamples, uint32_t depth, NBL_CONST_REF_ARG(Scene) scene) + measure_type getMeasure(uint32_t numSamples, uint32_t depth, NBL_CONST_REF_ARG(scene_type) scene) { - // loop through bounces, do closest hit - // return ray.payload.accumulation --> color - - // TODO: not hardcode this, pass value from somewhere?, where to get objects? - ext::Intersector::IntersectData data; - measure_type Li = (measure_type)0.0; scalar_type meanLumaSq = 0.0; for (uint32_t i = 0; i < numSamples; i++) @@ -272,28 +274,7 @@ struct Unidirectional for (int d = 1; d <= depth && hit && rayAlive; d += 2) { ray.intersectionT = numeric_limits::max; - ray.objectID.id = -1; // start with no intersect - - // prodedural shapes - if (scene.sphereCount > 0) - { - data = scene.toIntersectData(ext::Intersector::IntersectData::Mode::PROCEDURAL, PST_SPHERE); - ray.objectID = intersector.traceRay(ray, data); - } - - if (scene.triangleCount > 0) - { - data = scene.toIntersectData(ext::Intersector::IntersectData::Mode::PROCEDURAL, PST_TRIANGLE); - ray.objectID = intersector.traceRay(ray, data); - } - - if (scene.rectangleCount > 0) - { - data = scene.toIntersectData(ext::Intersector::IntersectData::Mode::PROCEDURAL, PST_RECTANGLE); - ray.objectID = intersector.traceRay(ray, data); - } - - // TODO: trace AS + ray.objectID = intersector.traceRay(ray, scene); hit = ray.objectID.id != -1; if (hit) diff --git a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl index fe4dea8b3..cbc9d153c 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl +++ 
b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl @@ -13,8 +13,12 @@ namespace hlsl namespace ext { +template struct Scene { + using light_type = Light; + using bxdfnode_type = BxdfNode; + NBL_CONSTEXPR_STATIC_INLINE uint32_t maxSphereCount = 25; NBL_CONSTEXPR_STATIC_INLINE uint32_t maxTriangleCount = 12; NBL_CONSTEXPR_STATIC_INLINE uint32_t maxRectangleCount = 12; @@ -29,12 +33,12 @@ struct Scene NBL_CONSTEXPR_STATIC_INLINE uint32_t maxLightCount = 4; - Light lights[maxLightCount]; + light_type lights[maxLightCount]; uint32_t lightCount; NBL_CONSTEXPR_STATIC_INLINE uint32_t maxBxdfCount = 16; // TODO: limit change? - BxDFNode bxdfs[maxBxdfCount]; + bxdfnode_type bxdfs[maxBxdfCount]; uint32_t bxdfCount; // AS ases; From 8a4e0a94aab11c6eb0072ca0044db26ffe433a91 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 18 Feb 2025 16:45:20 +0700 Subject: [PATCH 050/296] sample bsdf in closest hit --- .../app_resources/hlsl/material_system.hlsl | 6 +- .../app_resources/hlsl/pathtracer.hlsl | 95 ++++++++++++++++--- 2 files changed, 86 insertions(+), 15 deletions(-) diff --git a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl index 1d5587443..038bd578a 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl @@ -81,20 +81,20 @@ struct System } } - static vector3_type generate(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(create_params_t) cparams, anisotropic_type interaction, vector2_type u, NBL_REF_ARG(anisocache_type) cache) + static vector3_type generate(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(create_params_t) cparams, anisotropic_type interaction, NBL_CONST_REF_ARG(vector3_type) u, NBL_REF_ARG(anisocache_type) cache) { switch(material.type) { case DIFFUSE: { diffuseBxDF.init(cparams); - return diffuseBxDF.generate(interaction, u); + return diffuseBxDF.generate(interaction, u.xy); } break; case 
CONDUCTOR: { conductorBxDF.init(cparams); - return conductorBxDF.generate(interaction, u, cache); + return conductorBxDF.generate(interaction, u.xy, cache); } break; case DIELECTRIC: diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl index 8d8d9a201..e20ef705b 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl @@ -61,6 +61,7 @@ struct Unidirectional using isocache_type = typename anisocache_type::isocache_type; using quotient_pdf_type = typename NextEventEstimator::quotient_pdf_type; using params_type = typename MaterialSystem::params_t; + using create_params_type = typename MaterialSystem::create_params_t; using scene_type = Scene; using diffuse_op_type = typename MaterialSystem::diffuse_op_type; @@ -75,17 +76,17 @@ struct Unidirectional // NextEventEstimator nee) // {} - static this_t create(NBL_CONST_REF_ARG(PathTracerCreationParams) params) + static this_t create(NBL_CONST_REF_ARG(PathTracerCreationParams) params, Buffer samplerSequence) { this_t retval; retval.randGen = randgen_type::create(params.rngState); retval.rayGen = raygen_type::create(params.pixOffsetParam, params.camPos, params.NDC, params.invMVP); retval.materialSystem = material_system_type::create(diffuseParams, conductorParams, dielectricParams); + retval.samplerSequence = samplerSequence; return retval; } - // TODO: get working, what is sampleSequence stuff - vector3_type rand3d(uint32_t protoDimension, uint32_t _sample) + vector3_type rand3d(uint32_t protoDimension, uint32_t _sample, uint32_t i) { uint32_t address = spirv::bitfieldInsert(protoDimension, _sample, MAX_DEPTH_LOG2, MAX_SAMPLES_LOG2); unit32_t3 seqVal = texelFetch(sampleSequence, int(address) + i).xyz; @@ -150,19 +151,18 @@ struct Unidirectional (bxdf.materialType == ext::MaterialSystem::Material::CONDUCTOR) ? 
bxdf_traits::type == BT_BSDF : bxdf_traits::type == BT_BSDF; - vector3_type eps0 = rand3d(depth, _sample); - vector3_type eps1 = rand3d(depth, _sample); - vector3_type eps2 = rand3d(depth, _sample); + vector3_type eps0 = rand3d(depth, _sample, 0u); + vector3_type eps1 = rand3d(depth, _sample, 1u); // thresholds - const scalar_type bsdfPdfThreshold = 0.0001; + const scalar_type bxdfPdfThreshold = 0.0001; const scalar_type lumaContributionThreshold = getLuma(colorspace::eotf::sRGB((vector3_type)1.0 / 255.0)); // OETF smallest perceptible value const vector3_type throughputCIE_Y = nbl::hlsl::transpose(colorspace::sRGBtoXYZ)[1] * throughput; // TODO: this only works if spectral_type is dim 3 const measure_type eta = bxdf.params.ior0 / bxdf.params.ior1; // assume it's real, not imaginary? const scalar_type monochromeEta = nbl::hlsl::dot(throughputCIE_Y, eta) / (throughputCIE_Y.r + throughputCIE_Y.g + throughputCIE_Y.b); // TODO: imaginary eta? // sample lights - const scalar_type neeProbability = 1.0;// BSDFNode_getNEEProb(bsdf); + const scalar_type neeProbability = 1.0; // BSDFNode_getNEEProb(bsdf); scalar_type rcpChoiceProb; if (!math::partitionRandVariable(neeProbability, eps0.z, rcpChoiceProb) && depth < 2u) { @@ -184,9 +184,9 @@ struct Unidirectional if (neeContrib_pdf.pdf < numeric_limits::max) { - if (nbl::hlsl::any(isnan(nee_sample.L))) + if (nbl::hlsl::any(isnan(nee_sample.L.direction))) ray.payload.accumulation += vector3_type(1000.f, 0.f, 0.f); - else if (nbl::hlsl::all((vector3_type)69.f == nee_sample.L)) + else if (nbl::hlsl::all((vector3_type)69.f == nee_sample.L.direction)) ray.payload.accumulation += vector3_type(0.f, 1000.f, 0.f); else if (validPath) { @@ -233,8 +233,8 @@ struct Unidirectional // TODO: ifdef NEE only ray_type nee_ray; - nee_ray.origin = intersection + nee_sample.L * t * Tolerance::getStart(depth); - nee_ray.direction = nee_sample.L; + nee_ray.origin = intersection + nee_sample.L.direction * t * Tolerance::getStart(depth); + 
nee_ray.direction = nee_sample.L.direction; nee_ray.intersectionT = t; if (bsdf_quotient_pdf.pdf < numeric_limits::max && getLuma(neeContrib_pdf.quotient) > lumaContributionThreshold && intersector.traceRay(nee_ray, scene).id == -1) ray._payload.accumulation += neeContrib_pdf.quotient; @@ -243,6 +243,70 @@ struct Unidirectional } // sample BSDF + scalar_type bxdfPdf; + vector3_type bxdfSample; + { + ext::MaterialSystem::Material material; + material.type = bxdf.materialType; + + anisocache_type _cache; + sample_type bsdf_sample = materialSystem.generate(material, bxdf.params, interaction, eps1, _cache); + + // TODO: does not yet account for smooth dielectric + params_type params; + if (!isBSDF && bxdf.materialType == ext::MaterialSystem::Material::DIFFUSE) + { + params = params_type::template create(bsdf_sample, iso_interaction, bxdf::BCM_MAX); + } + else if (!isBSDF && bxdf.materialType != ext::MaterialSystem::Material::DIFFUSE) + { + if (bxdf.params.is_aniso) + params = params_type::template create(bsdf_sample, interaction, _cache, bxdf::BCM_MAX); + else + { + isocache = (iso_cache)_cache; + params = params_type::template create(bsdf_sample, iso_interaction, isocache, bxdf::BCM_MAX); + } + } + else if (isBSDF && bxdf.materialType == ext::MaterialSystem::Material::DIFFUSE) + { + params = params_type::template create(bsdf_sample, iso_interaction, bxdf::BCM_ABS); + } + else if (isBSDF && bxdf.materialType != ext::MaterialSystem::Material::DIFFUSE) + { + if (bxdf.params.is_aniso) + params = params_type::template create(bsdf_sample, interaction, _cache, bxdf::BCM_ABS); + else + { + isocache = (iso_cache)_cache; + params = params_type::template create(bsdf_sample, iso_interaction, isocache, bxdf::BCM_ABS); + } + } + + // the value of the bsdf divided by the probability of the sample being generated + throughput *= materialSystem.quotient_and_pdf(material, bxdf.params, params); + bxdfSample = bsdf_sample.L.direction; + } + + // additional threshold + const float 
lumaThroughputThreshold = lumaContributionThreshold; + if (bxdfPdf > bxdfPdfThreshold && getLuma(throughput) > lumaThroughputThreshold) + { + ray.payload.throughput = throughput; + ray.payload.otherTechniqueHeuristic = neeProbability / bxdfPdf; // numerically stable, don't touch + ray.payload.otherTechniqueHeuristic *= ray.payload.otherTechniqueHeuristic; + + // trace new ray + ray.origin = intersection + bsdfSampleL * (1.0/*kSceneSize*/) * Tolerance::getStart(depth); + ray.direction = bxdfSample; + // #if POLYGON_METHOD==2 + // ray._immutable.normalAtOrigin = interaction.isotropic.N; + // ray._immutable.wasBSDFAtOrigin = isBSDF; + // #endif + return true; + } + + return false; } void missProgram(NBL_REF_ARG(ray_type) ray) @@ -288,16 +352,23 @@ struct Unidirectional Li += (accumulation - Li) * rcpSampleSize; // TODO: visualize high variance + + // TODO: russian roulette early exit? } return Li; } + NBL_CONSTEXPR_STATIC_INLINE uint32_t MAX_DEPTH_LOG2 = 4u; + NBL_CONSTEXPR_STATIC_INLINE uint32_t MAX_SAMPLES_LOG2 = 10u; + randgen_type randGen; raygen_type rayGen; intersector_type intersector; material_system_type materialSystem; nee_type nee; + + Buffer samplerSequence; }; } From 72104b8b192a447bf8bdce09b1826f4150ce1d6a Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 19 Feb 2025 16:19:54 +0700 Subject: [PATCH 051/296] set up path tracer render shader --- .../app_resources/hlsl/common.hlsl | 13 +- .../hlsl/next_event_estimator.hlsl | 6 +- .../app_resources/hlsl/pathtracer.hlsl | 6 +- .../app_resources/hlsl/rand_gen.hlsl | 4 +- .../app_resources/hlsl/render.comp.hlsl | 171 ++++++++++++++++++ 5 files changed, 185 insertions(+), 15 deletions(-) create mode 100644 31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl index 7d29dabd4..cc92a33ba 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl +++ 
b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl @@ -86,7 +86,6 @@ struct BxDFNode uint32_t materialType; params_type params; - ObjectID objectID; } template @@ -160,8 +159,8 @@ struct Shape return 2.0 * numbers::pi * (1.0 - cosThetaMax); } - template - float deferredPdf(NBL_CONST_REF_ARG(Light) light, NBL_CONST_REF_ARG(Ray) ray) + template + float deferredPdf(NBL_CONST_REF_ARG(Ray) ray) { return 1.0 / getSolidAngle(ray.origin); } @@ -245,8 +244,8 @@ struct Shape return nbl::hlsl::cross(edges[0], edges[1]) * 0.5f; } - template - float deferredPdf(NBL_CONST_REF_ARG(Light) light, NBL_CONST_REF_ARG(Ray) ray) + template + float deferredPdf(NBL_CONST_REF_ARG(Ray) ray) { const float32_t3 L = ray.direction; switch (polygonMethod) @@ -393,8 +392,8 @@ struct Shape basis = nbl::hlsl::transpose(basis); // TODO: double check transpose } - template - float deferredPdf(NBL_CONST_REF_ARG(Light light), NBL_CONST_REF_ARG(Ray) ray) + template + float deferredPdf(NBL_CONST_REF_ARG(Ray) ray) { switch (polygonMethod) { diff --git a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl index c6380094d..86c26a152 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl @@ -53,7 +53,7 @@ struct Estimator { float32_t3 position = float32_t3(asfloat(intersect.data[2 + Shape::ObjSize]), asfloat(intersect.data[2 + Shape::ObjSize + 1]), asfloat(intersect.data[2 + Shape::ObjSize + 2])); Shape sphere = Shape::create(position, asfloat(intersect.data[2 + Shape::ObjSize + 3]), intersect.data[2 + Shape::ObjSize + 4]); - pdf *= sphere.template deferredPdf(light, ray); + pdf *= sphere.template deferredPdf(ray); } break; case PST_TRIANGLE: @@ -62,7 +62,7 @@ struct Estimator float32_t3 vertex1 = float32_t3(asfloat(intersect.data[2 + Shape::ObjSize + 3]), asfloat(intersect.data[2 + Shape::ObjSize + 4]), asfloat(intersect.data[2 + Shape::ObjSize 
+ 5])); float32_t3 vertex2 = float32_t3(asfloat(intersect.data[2 + Shape::ObjSize + 6]), asfloat(intersect.data[2 + Shape::ObjSize + 7]), asfloat(intersect.data[2 + Shape::ObjSize + 8])); Shape tri = Shape::create(vertex0, vertex1, vertex2, intersect.data[2 + Shape::ObjSize + 9]); - pdf *= tri.template deferredPdf(light, ray); + pdf *= tri.template deferredPdf(ray); } break; case PST_RECTANGLE: @@ -71,7 +71,7 @@ struct Estimator float32_t3 edge0 = float32_t3(asfloat(intersect.data[2 + Shape::ObjSize + 3]), asfloat(intersect.data[2 + Shape::ObjSize + 4]), asfloat(intersect.data[2 + Shape::ObjSize + 5])); float32_t3 edge1 = float32_t3(asfloat(intersect.data[2 + Shape::ObjSize + 6]), asfloat(intersect.data[2 + Shape::ObjSize + 7]), asfloat(intersect.data[2 + Shape::ObjSize + 8])); Shape rect = Shape::create(offset, edge0, edge1, intersect.data[2 + Shape::ObjSize + 9]); - pdf *= rect.template deferredPdf(light, ray); + pdf *= rect.template deferredPdf(ray); } break; default: diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl index e20ef705b..350e5e404 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl @@ -90,7 +90,7 @@ struct Unidirectional { uint32_t address = spirv::bitfieldInsert(protoDimension, _sample, MAX_DEPTH_LOG2, MAX_SAMPLES_LOG2); unit32_t3 seqVal = texelFetch(sampleSequence, int(address) + i).xyz; - seqVal ^= unit32_t3(randGen(), randGen(), randGen()); + seqVal ^= randGen(); return vector3_type(seqVal) * asfloat(0x2f800004u); } @@ -147,8 +147,8 @@ struct Unidirectional // TODO: ifdef kill diffuse specular paths - const bool isBSDF = (bxdf.materialType == ext::MaterialSystem::Material::DIFFUSE) ? bxdf_traits::type == BT_BSDF : - (bxdf.materialType == ext::MaterialSystem::Material::CONDUCTOR) ? bxdf_traits::type == BT_BSDF : + const bool isBSDF = (bxdf.materialType == ext::MaterialSystem::Material::Type::DIFFUSE) ? 
bxdf_traits::type == BT_BSDF : + (bxdf.materialType == ext::MaterialSystem::Material::Type::CONDUCTOR) ? bxdf_traits::type == BT_BSDF : bxdf_traits::type == BT_BSDF; vector3_type eps0 = rand3d(depth, _sample, 0u); diff --git a/31_HLSLPathTracer/app_resources/hlsl/rand_gen.hlsl b/31_HLSLPathTracer/app_resources/hlsl/rand_gen.hlsl index 949c2064b..30125c687 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/rand_gen.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/rand_gen.hlsl @@ -22,9 +22,9 @@ struct Uniform3D return retval; } - float32_t3 operator()() + uint32_t3 operator()() { - return float32_t3(uint32_t3(rng(), rng(), rng())); + return uint32_t3(rng(), rng(), rng()); } rng_type rng; diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl new file mode 100644 index 000000000..306188fd0 --- /dev/null +++ b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl @@ -0,0 +1,171 @@ +#include "nbl/builtin/hlsl/cpp_compat.hlsl" +#include "nbl/builtin/hlsl/glsl_compat/core.hlsl" +#include "nbl/builtin/hlsl/random/pcg.hlsl" + +#include "pathtracer.hlsl" + +// add these defines (one at a time) using -D argument to dxc +// #define SPHERE_LIGHT +// #define TRIANGLE_LIGHT +// #define RECTANGLE_LIGHT + +#ifdef SPHERE_LIGHT +#define SPHERE_COUNT 9 +#define LIGHT_TYPE PST_SPHERE +#else +#define SPHERE_COUNT 8 +#endif + +using namespace nbl::hlsl; + +NBL_CONSTEXPR uint32_t WorkgroupSize = 32; + +struct SPushConstants +{ + float32_t4x4 invMVP; + int sampleCount; + int depth; +}; + +[[vk::push_constant]] SPushConstants pc; + +[[vk::combinedImageSampler]][[vk::binding(0, 2)]] Texture2D envMap; // unused +[[vk::combinedImageSampler]][[vk::binding(0, 2)]] SamplerState envSampler + +[[vk::binding(1, 2)]] Buffer sampleSequence; + +[[vk::combinedImageSampler]][[vk::binding(2, 2)]] Texture2D scramblebuf; // unused +[[vk::combinedImageSampler]][[vk::binding(2, 2)]] SamplerState scrambleSampler; + +[[vk::binding(0, 0)]] 
RWTexture2D outImage; + +int32_t2 getCoordinates() +{ + return int32_t2(glsl::gl_GlobalInvocationID.xy); +} + +float32_t2 getTexCoords() +{ + uint32_t width, height; + outImage.GetDimensions(width, height); + int32_t2 iCoords = getCoordinates(); + return float32_t2(float(iCoords.x) / width, 1.0 - float(iCoords.y) / height); +} + +using ray_dir_info_t = bxdf::ray_dir_info::SBasic; +using iso_interaction = bxdf::surface_interactions::SIsotropic; +using aniso_interaction = bxdf::surface_interactions::SAnisotropic; +using sample_t = bxdf::SLightSample; +using iso_cache = bxdf::SIsotropicMicrofacetCache; +using aniso_cache = bxdf::SAnisotropicMicrofacetCache; +using quotient_pdf_t = bxdf::quotient_and_pdf; +using spectral_t = vector; +using params_t = bxdf::SBxDFParams; +using create_params_t = SBxDFCreationParams; + +using diffuse_bxdf_type = bxdf::reflection::SOrenNayarBxDF; +using conductor_bxdf_type = bxdf::reflection::SGGXBxDF; +using dielectric_bxdf_type = bxdf::transmission::SGGXDielectricBxDF; + +using ray_type = ext::Ray; +using light_type = ext::Light; +using bxdfnode_type = ext::BxDFNode; +using randgen_type = ext::RandGen::Uniform3D; +using raygen_type = ext::RayGen::Basic; +using intersector_type = ext::Intersector::Comprehensive; +using material_system_type = ext::MaterialSystem::System; +using nee_type = ext::NextEventEstimator::Estimator; +using pathtracer_type = ext::PathTracer::Unidirectional; + +Shape spheres[SPHERE_COUNT] = { + Shape::create(float3(0.0, -100.5, -1.0), 100.0, 0u, light_type::INVALID_ID), + Shape::create(float3(2.0, 0.0, -1.0), 0.5, 1u, light_type::INVALID_ID), + Shape::create(float3(0.0, 0.0, -1.0), 0.5, 2u, light_type::INVALID_ID), + Shape::create(float3(-2.0, 0.0, -1.0), 0.5, 3u, light_type::INVALID_ID), + Shape::create(float3(2.0, 0.0, 1.0), 0.5, 4u, light_type::INVALID_ID), + Shape::create(float3(0.0, 0.0, 1.0), 0.5, 4u, light_type::INVALID_ID), + Shape::create(float3(-2.0, 0.0, 1.0), 0.5, 5u, light_type::INVALID_ID), + 
Shape::create(float3(0.5, 1.0, 0.5), 0.5, 6u, light_type::INVALID_ID) +#ifdef SPHERE_LIGHT + ,Shape::create(float3(-1.5, 1.5, 0.0), 0.3, bxdfnode_type::INVALID_ID, 0u) +#endif +}; + +#ifdef TRIANGLE_LIGHT +#define LIGHT_TYPE PST_TRIANGLE +#define TRIANGLE_COUNT 1 +Shape triangles[TRIANGLE_COUNT] = { + Shape::create(float3(-1.8,0.35,0.3) * 10.0, float3(-1.2,0.35,0.0) * 10.0, float3(-1.5,0.8,-0.3) * 10.0, bxdfnode_type::INVALID_ID, 0u) +}; +#endif + +#ifdef RECTANGLE_LIGHT +#define LIGHT_TYPE PST_RECTANGLE +#define RECTANGLE_COUNT 1 +Shape rectangles[RECTANGLE_COUNT] = { + Shape::create(float3(-3.8,0.35,1.3), normalize(float3(2,0,-1))*7.0, normalize(float3(2,-5,4))*0.1, bxdfnode_type::INVALID_ID, 0u) +}; +#endif + +#define LIGHT_COUNT 1 +light_type lights[LIGHT_COUNT] = { + light_type(spectral_t(30.0,25.0,15.0), ext::ObjectID(8u, ext::NextEventEstimator::Event::Mode::PROCEDURAL, LIGHT_TYPE)) +}; + +#define BSDF_COUNT 7 +bxdfnode_type bsdfs[BSDF_COUNT] = { + bxdfnode_type(ext::MaterialSystem::Material::Type::DIFFUSE, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(1.25,1.25,1.25))), + bxdfnode_type(ext::MaterialSystem::Material::Type::DIFFUSE, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(1.25,2.5,2.5))), + bxdfnode_type(ext::MaterialSystem::Material::Type::DIFFUSE, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(2.5,1.25,2.5))), + bxdfnode_type(ext::MaterialSystem::Material::Type::CONDUCTOR, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(0.98,0.98,0.77))), + bxdfnode_type(ext::MaterialSystem::Material::Type::CONDUCTOR, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(0.98,0.77,0.98))), + bxdfnode_type(ext::MaterialSystem::Material::Type::CONDUCTOR, create_params_t(false, float2(0.15,0.15), spectral_t(1,1,1), spectral_t(0.98,0.77,0.98))), + bxdfnode_type(ext::MaterialSystem::Material::Type::DIELECTRIC, create_params_t(false, float2(0.0625,0.0625), spectral_t(1,1,1), 
spectral_t(0.71,0.69,0.67))) +}; + +[numthreads(WorkgroupGridDim, WorkgroupGridDim, 1)] +void main(uint32_t3 threadID : SV_DispatchThreadID) +{ + uint32_t width, height; + outImage.GetDimensions(width, height); + const int32_t2 coords = getCoordinates(); + float32_t2 texCoord = float32_t2(coords) / float32_t2(width, height); + texCoord.y = 1.0 - texCoord.y; + + if (false == (all((int32_t2)0 < coords)) && all(int32_t2(width, height) < coords)) { + return; + } + + if (((pc.depth - 1) >> MAX_DEPTH_LOG2) > 0 || ((pc.sampleCount - 1) >> MAX_SAMPLES_LOG2) > 0) + { + float32_t4 pixelCol = float32_t4(1.0,0.0,0.0,1.0); + outImage[coords] = pixelCol; + return; + } + + int flatIdx = glsl::gl_GlobalInvocationID.y * glsl::gl_NumWorkGroups.x * WorkgroupSize + glsl::gl_GlobalInvocationID.x; + PCG32x2 pcg = PCG32x2::construct(flatIdx); // replaces scramblebuf? + + // set up path tracer + const PathTracerCreationParams ptCreateParams; + ptCreateParams.rngState = pcg(); + + uint2 scrambleDim; + scramblebuf.GetDimensions(scrambleDim.x, scrambleDim.y); + ptCreateParams.pixOffsetParam = (float2)1.0 / float2(scrambleDim); + + float4 NDC = float4(texCoord * float2(2.0, -2.0) + float2(-1.0, 1.0), 0.0, 1.0); + { + vec4 tmp = mul(pc.invMVP, NDC); + ptCreateParams.camPos = tmp.xyz / tmp.w; + NDC.z = 1.0; + } + + ptCreateParams.NDC = NDC; + ptCreateParams.invMVP = pc.invMVP; + + pathtracer_type pathtracer = pathtracer_type::create(ptCreateParams, samplerSequence); + + // set up scene (can do as global var?) 
+ Scene scene; +} From 202c645b6a43906589457bed95154c4f98785e67 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 20 Feb 2025 11:08:58 +0700 Subject: [PATCH 052/296] finish render shader --- .../app_resources/hlsl/render.comp.hlsl | 31 +++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl index 306188fd0..7beccd322 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl @@ -112,8 +112,8 @@ light_type lights[LIGHT_COUNT] = { light_type(spectral_t(30.0,25.0,15.0), ext::ObjectID(8u, ext::NextEventEstimator::Event::Mode::PROCEDURAL, LIGHT_TYPE)) }; -#define BSDF_COUNT 7 -bxdfnode_type bsdfs[BSDF_COUNT] = { +#define BXDF_COUNT 7 +bxdfnode_type bxdfs[BXDF_COUNT] = { bxdfnode_type(ext::MaterialSystem::Material::Type::DIFFUSE, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(1.25,1.25,1.25))), bxdfnode_type(ext::MaterialSystem::Material::Type::DIFFUSE, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(1.25,2.5,2.5))), bxdfnode_type(ext::MaterialSystem::Material::Type::DIFFUSE, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(2.5,1.25,2.5))), @@ -168,4 +168,31 @@ void main(uint32_t3 threadID : SV_DispatchThreadID) // set up scene (can do as global var?) 
Scene scene; + scene.sphereCount = SPHERE_COUNT; + for (uint32_t i = 0; i < SPHERE_COUNT; i++) + scene.spheres[i] = spheres[i]; +#ifdef TRIANGLE_LIGHT + scene.triangleCount = TRIANGLE_COUNT; + for (uint32_t i = 0; i < TRIANGLE_COUNT; i++) + scene.triangles[i] = triangles[i]; +#else + scene.triangleCount = 0; +#endif +#ifdef RECTANGLE_LIGHT + scene.rectangleCount = RECTANGLE_COUNT; + for (uint32_t i = 0; i < RECTANGLE_COUNT; i++) + scene.rectangles[i] = rectangles[i]; +#else + scene.rectangleCount = 0; +#endif + scene.lightCount = LIGHT_COUNT; + for (uint32_t i = 0; i < LIGHT_COUNT; i++) + scene.lights[i] = lights[i]; + scene.bxdfCount = BXDF_COUNT; + for (uint32_t i = 0; i < BXDF_COUNT; i++) + scene.bxdfs[i] = bxdfs[i]; + + float32_t3 color = pathtracer.getMeasure(pc.sampleCount, pc.depth, scene); + float32_t4 pixCol = float32_t4(color, 1.0); + outImage[coords] = pixCol; } From 2f77555ce484c2f8ecb390e68fc3f4c830b23ef7 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 20 Feb 2025 16:55:07 +0700 Subject: [PATCH 053/296] hlsl path tracer initial, bug fixes --- .../app_resources/hlsl/common.hlsl | 24 ++--- .../app_resources/hlsl/intersector.hlsl | 4 +- .../app_resources/hlsl/material_system.hlsl | 3 +- .../hlsl/next_event_estimator.hlsl | 2 +- .../app_resources/hlsl/pathtracer.hlsl | 2 +- .../app_resources/hlsl/rand_gen.hlsl | 2 +- .../app_resources/hlsl/ray_gen.hlsl | 6 +- .../app_resources/hlsl/render.comp.hlsl | 9 +- 31_HLSLPathTracer/main.cpp | 94 ++++++++++++++++--- 9 files changed, 109 insertions(+), 37 deletions(-) diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl index cc92a33ba..938e3ca22 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl @@ -86,7 +86,7 @@ struct BxDFNode uint32_t materialType; params_type params; -} +}; template struct Tolerance @@ -108,7 +108,7 @@ struct Tolerance { return 1.0 - nbl::hlsl::exp2(__common(depth) + 
1.0); } -} +}; enum PTPolygonMethod : uint16_t { @@ -166,7 +166,7 @@ struct Shape } template - float generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, float32_t3 xi) + float32_t3 generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, float32_t3 xi) { float32_t3 Z = position - origin; const float distanceSQ = nbl::hlsl::dot(Z,Z); @@ -179,7 +179,7 @@ struct Shape const float cosThetaMax = nbl::hlsl::sqrt(cosThetaMax2); const float cosTheta = nbl::hlsl::mix(1.0, cosThetaMax, xi.x); - vec3 L = Z * cosTheta; + float32_t3 L = Z * cosTheta; const float cosTheta2 = cosTheta * cosTheta; const float sinTheta = nbl::hlsl::sqrt(1.0 - cosTheta2); @@ -253,7 +253,8 @@ struct Shape case PPM_AREA: { const float dist = ray.intersectionT; - return dist * dist / nbl::hlsl::abs(nbl::hlsl::dot(getNormalTimesArea()), L); + const float32_t3 L = ray.direction; + return dist * dist / nbl::hlsl::abs(nbl::hlsl::dot(getNormalTimesArea(), L)); } break; case PPM_SOLID_ANGLE: @@ -303,7 +304,7 @@ struct Shape { float rcpPdf; - shapes::SphericalTriangle st = shapes::SphericalTriangle::create(vertex0, vertex1, vertex2, ray.origin); + shapes::SphericalTriangle st = shapes::SphericalTriangle::create(vertex0, vertex1, vertex2, origin); sampling::SphericalTriangle sst = sampling::SphericalTriangle::create(st); const float32_t3 L = sst.generate(rcpPdf, xi.xy); @@ -319,7 +320,7 @@ struct Shape { float rcpPdf; - shapes::SphericalTriangle st = shapes::SphericalTriangle::create(vertex0, vertex1, vertex2, ray.origin); + shapes::SphericalTriangle st = shapes::SphericalTriangle::create(vertex0, vertex1, vertex2, origin); sampling::ProjectedSphericalTriangle sst = sampling::ProjectedSphericalTriangle::create(st); const float32_t3 L = sst.generate(rcpPdf, interaction.N, isBSDF, 
xi.xy); @@ -348,9 +349,9 @@ struct Shape template<> struct Shape { - static Shape create(NBL_CONST_REF_ARG(float32_t3) offset, NBL_CONST_REF_ARG(float32_t3) edge0, NBL_CONST_REF_ARG(float32_t3) edge1, uint32_t bsdfID, uint32_t lightID) + static Shape create(NBL_CONST_REF_ARG(float32_t3) offset, NBL_CONST_REF_ARG(float32_t3) edge0, NBL_CONST_REF_ARG(float32_t3) edge1, uint32_t bsdfID, uint32_t lightID) { - Shape retval; + Shape retval; retval.offset = offset; retval.edge0 = edge0; retval.edge1 = edge1; @@ -389,7 +390,7 @@ struct Shape basis[1] = edge1 / extents[1]; basis[2] = normalize(cross(basis[0],basis[1])); - basis = nbl::hlsl::transpose(basis); // TODO: double check transpose + basis = nbl::hlsl::transpose(basis); // TODO: double check transpose } template @@ -400,6 +401,7 @@ struct Shape case PPM_AREA: { const float dist = ray.intersectionT; + const float32_t3 L = ray.direction; return dist * dist / nbl::hlsl::abs(nbl::hlsl::dot(getNormalTimesArea(), L)); } break; @@ -499,4 +501,4 @@ struct Shape } } -#endif \ No newline at end of file +#endif diff --git a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl index cf2d3ae7c..5151ea9c0 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl @@ -33,7 +33,7 @@ struct IntersectData NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 128; uint32_t mode : 1; - unit32_t unused : 31; // possible space for flags + uint32_t unused : 31; // possible space for flags uint32_t data[DataSize]; }; @@ -199,4 +199,4 @@ struct Comprehensive } } -#endif \ No newline at end of file +#endif diff --git a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl index 038bd578a..687c41dc0 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl @@ -3,6 +3,7 @@ #include 
#include +#include namespace nbl { @@ -150,4 +151,4 @@ struct System } } -#endif \ No newline at end of file +#endif diff --git a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl index 86c26a152..5695efc0d 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl @@ -188,4 +188,4 @@ struct Estimator } } -#endif \ No newline at end of file +#endif diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl index 350e5e404..b14c9baae 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl @@ -376,4 +376,4 @@ struct Unidirectional } } -#endif \ No newline at end of file +#endif diff --git a/31_HLSLPathTracer/app_resources/hlsl/rand_gen.hlsl b/31_HLSLPathTracer/app_resources/hlsl/rand_gen.hlsl index 30125c687..4f5302fea 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/rand_gen.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/rand_gen.hlsl @@ -35,4 +35,4 @@ struct Uniform3D } } -#endif \ No newline at end of file +#endif diff --git a/31_HLSLPathTracer/app_resources/hlsl/ray_gen.hlsl b/31_HLSLPathTracer/app_resources/hlsl/ray_gen.hlsl index 467ef2bd4..dcb695fbe 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/ray_gen.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/ray_gen.hlsl @@ -1,6 +1,8 @@ #ifndef _NBL_HLSL_EXT_RAYGEN_INCLUDED_ #define _NBL_HLSL_EXT_RAYGEN_INCLUDED_ +#include + #include "common.hlsl" namespace nbl @@ -43,7 +45,7 @@ struct Basic // apply stochastic reconstruction filter const float gaussianFilterCutoff = 2.5; const float truncation = nbl::hlsl::exp(-0.5 * gaussianFilterCutoff * gaussianFilterCutoff); - vec2 remappedRand = randVec.xy; + vector2_type remappedRand = randVec.xy; remappedRand.x *= 1.0 - truncation; remappedRand.x += truncation; tmp.xy += 
pixOffsetParam * nbl::hlsl::boxMullerTransform(remappedRand, 1.5); @@ -77,4 +79,4 @@ struct Basic } } -#endif \ No newline at end of file +#endif diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl index 7beccd322..1c8c15ec4 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl @@ -2,6 +2,9 @@ #include "nbl/builtin/hlsl/glsl_compat/core.hlsl" #include "nbl/builtin/hlsl/random/pcg.hlsl" +#include "nbl/builtin/hlsl/bxdf/reflection.hlsl" +#include "nbl/builtin/hlsl/bxdf/transmission.hlsl" + #include "pathtracer.hlsl" // add these defines (one at a time) using -D argument to dxc @@ -30,7 +33,7 @@ struct SPushConstants [[vk::push_constant]] SPushConstants pc; [[vk::combinedImageSampler]][[vk::binding(0, 2)]] Texture2D envMap; // unused -[[vk::combinedImageSampler]][[vk::binding(0, 2)]] SamplerState envSampler +[[vk::combinedImageSampler]][[vk::binding(0, 2)]] SamplerState envSampler; [[vk::binding(1, 2)]] Buffer sampleSequence; @@ -41,7 +44,7 @@ struct SPushConstants int32_t2 getCoordinates() { - return int32_t2(glsl::gl_GlobalInvocationID.xy); + return int32_t2(glsl::gl_GlobalInvocationID().xy); } float32_t2 getTexCoords() @@ -143,7 +146,7 @@ void main(uint32_t3 threadID : SV_DispatchThreadID) return; } - int flatIdx = glsl::gl_GlobalInvocationID.y * glsl::gl_NumWorkGroups.x * WorkgroupSize + glsl::gl_GlobalInvocationID.x; + int flatIdx = glsl::gl_GlobalInvocationID().y * glsl::gl_NumWorkGroups().x * WorkgroupSize + glsl::gl_GlobalInvocationID().x; PCG32x2 pcg = PCG32x2::construct(flatIdx); // replaces scramblebuf? 
// set up path tracer diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp index 018468e46..13aa59823 100644 --- a/31_HLSLPathTracer/main.cpp +++ b/31_HLSLPathTracer/main.cpp @@ -37,6 +37,14 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication ELG_COUNT }; + enum E_RENDER_MODE : uint8_t + { + ERM_GLSL, + ERM_HLSL, + ERM_CHECKERED, + ERM_COUNT + }; + constexpr static inline uint32_t2 WindowDimensions = { 1280, 720 }; constexpr static inline uint32_t MaxFramesInFlight = 5; constexpr static inline clock_t::duration DisplayImageDuration = std::chrono::milliseconds(900); @@ -49,7 +57,9 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication constexpr static inline uint8_t MaxUITextureCount = 1u; static inline std::string DefaultImagePathsFile = "envmap/envmap_0.exr"; static inline std::string OwenSamplerFilePath = "owen_sampler_buffer.bin"; - static inline std::array PTShaderPaths = { "app_resources/glsl/litBySphere.comp", "app_resources/glsl/litByTriangle.comp", "app_resources/glsl/litByRectangle.comp" }; + static inline std::array PTGLSLShaderPaths = { "app_resources/glsl/litBySphere.comp", "app_resources/glsl/litByTriangle.comp", "app_resources/glsl/litByRectangle.comp" }; + static inline std::string PTHLSLShaderPath = "app_resources/hlsl/render.comp.hlsl"; + static inline std::array PTHLSLShaderVariants = { "SPHERE_LIGHT", "TRIANGLE_LIGHT", "RECTANGLE_LIGHT" }; static inline std::string PresentShaderPath = "app_resources/hlsl/present.frag.hlsl"; const char* shaderNames[E_LIGHT_GEOMETRY::ELG_COUNT] = { @@ -301,7 +311,7 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication m_presentDescriptorSet = presentDSPool->createDescriptorSet(gpuPresentDescriptorSetLayout); // Create Shaders - auto loadAndCompileShader = [&](std::string pathToShader) + auto loadAndCompileGLSLShader = [&](const std::string& pathToShader) -> smart_refctd_ptr { IAssetLoader::SAssetLoadParams lp 
= {}; lp.workingDirectory = localInputCWD; @@ -328,10 +338,46 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication return shader; }; + auto loadAndCompileHLSLShader = [&](const std::string& pathToShader, const std::string& defineMacro) -> smart_refctd_ptr + { + IAssetLoader::SAssetLoadParams lp = {}; + lp.workingDirectory = localInputCWD; + auto assetBundle = m_assetMgr->getAsset(pathToShader, lp); + const auto assets = assetBundle.getContents(); + if (assets.empty()) + { + m_logger->log("Could not load shader: ", ILogger::ELL_ERROR, pathToShader); + std::exit(-1); + } + + auto source = IAsset::castDown(assets[0]); + // The down-cast should not fail! + assert(source); + + auto compiler = make_smart_refctd_ptr(smart_refctd_ptr(m_system)); + CHLSLCompiler::SOptions options = {}; + options.stage = IShader::E_SHADER_STAGE::ESS_COMPUTE; // should be compute + options.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion; + options.spirvOptimizer = nullptr; +#ifndef _NBL_DEBUG + ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO; + auto opt = make_smart_refctd_ptr(std::span(&optPasses, 1)); + options.spirvOptimizer = opt.get(); +#endif + options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_SOURCE_BIT; + options.preprocessorOptions.sourceIdentifier = source->getFilepathHint(); + options.preprocessorOptions.logger = m_logger.get(); + options.preprocessorOptions.includeFinder = compiler->getDefaultIncludeFinder(); + + //std::string dxcOptionStr[] = { "-D" + defineMacro }; + //options.dxcOptions = std::span(dxcOptionStr); + + source = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options); + }; + // Create compute pipelines { for (int index = 0; index < E_LIGHT_GEOMETRY::ELG_COUNT; index++) { - auto ptShader = loadAndCompileShader(PTShaderPaths[index]); const nbl::asset::SPushConstantRange pcRange = { .stageFlags = 
IShader::E_SHADER_STAGE::ESS_COMPUTE, .offset = 0, @@ -348,15 +394,31 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication return logFail("Failed to create Pathtracing pipeline layout"); } - IGPUComputePipeline::SCreationParams params = {}; - params.layout = ptPipelineLayout.get(); - params.shader.shader = ptShader.get(); - params.shader.entryPoint = "main"; - params.shader.entries = nullptr; - params.shader.requireFullSubgroups = true; - params.shader.requiredSubgroupSize = static_cast(5); - if (!m_device->createComputePipelines(nullptr, { ¶ms, 1 }, m_PTPipelines.data() + index)) { - return logFail("Failed to create compute pipeline!\n"); + { + auto ptShader = loadAndCompileGLSLShader(PTGLSLShaderPaths[index]); + + IGPUComputePipeline::SCreationParams params = {}; + params.layout = ptPipelineLayout.get(); + params.shader.shader = ptShader.get(); + params.shader.entryPoint = "main"; + params.shader.entries = nullptr; + params.shader.requireFullSubgroups = true; + params.shader.requiredSubgroupSize = static_cast(5); + if (!m_device->createComputePipelines(nullptr, { ¶ms, 1 }, m_PTGLSLPipelines.data() + index)) + return logFail("Failed to create GLSL compute pipeline!\n"); + } + { + auto ptShader = loadAndCompileHLSLShader(PTHLSLShaderPath, PTHLSLShaderVariants[index]); + + IGPUComputePipeline::SCreationParams params = {}; + params.layout = ptPipelineLayout.get(); + params.shader.shader = ptShader.get(); + params.shader.entryPoint = "main"; + params.shader.entries = nullptr; + params.shader.requireFullSubgroups = true; + params.shader.requiredSubgroupSize = static_cast(5); + if (!m_device->createComputePipelines(nullptr, { ¶ms, 1 }, m_PTHLSLPipelines.data() + index)) + return logFail("Failed to create HLSL compute pipeline!\n"); } } } @@ -369,7 +431,7 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!"); // 
Load Fragment Shader - auto fragmentShader = loadAndCompileShader(PresentShaderPath); + auto fragmentShader = loadAndCompileGLSLShader(PresentShaderPath); if (!fragmentShader) return logFail("Failed to Load and Compile Fragment Shader: lumaMeterShader!"); @@ -985,7 +1047,7 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication // cube envmap handle { - auto pipeline = m_PTPipelines[PTPipline].get(); + auto pipeline = renderMode == E_RENDER_MODE::ERM_HLSL ? m_PTHLSLPipelines[PTPipline].get() : m_PTGLSLPipelines[PTPipline].get(); cmdbuf->bindComputePipeline(pipeline); cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &m_descriptorSet0.get()); cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 2u, 1u, &m_descriptorSet2.get()); @@ -1220,7 +1282,8 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication // gpu resources smart_refctd_ptr m_cmdPool; - std::array, E_LIGHT_GEOMETRY::ELG_COUNT> m_PTPipelines; + std::array, E_LIGHT_GEOMETRY::ELG_COUNT> m_PTGLSLPipelines; + std::array, E_LIGHT_GEOMETRY::ELG_COUNT> m_PTHLSLPipelines; smart_refctd_ptr m_presentPipeline; uint64_t m_realFrameIx = 0; std::array, MaxFramesInFlight> m_cmdBufs; @@ -1269,6 +1332,7 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication float camYAngle = 165.f / 180.f * 3.14159f; float camXAngle = 32.f / 180.f * 3.14159f; int PTPipline = E_LIGHT_GEOMETRY::ELG_SPHERE; + int renderMode = E_RENDER_MODE::ERM_GLSL; int spp = 32; int depth = 3; From 99aed4777c208c5acc4e66bb7ea8dc48f814c8d0 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 21 Feb 2025 14:16:11 +0700 Subject: [PATCH 054/296] fix shader bugs --- .../app_resources/hlsl/common.hlsl | 30 +++++++-- .../app_resources/hlsl/intersector.hlsl | 33 +++++----- .../app_resources/hlsl/material_system.hlsl | 40 ++++++------ .../hlsl/next_event_estimator.hlsl | 61 ++++++++++--------- .../app_resources/hlsl/pathtracer.hlsl | 9 +-- 
.../app_resources/hlsl/render.comp.hlsl | 4 ++ .../app_resources/hlsl/scene.hlsl | 50 +++++++-------- 31_HLSLPathTracer/main.cpp | 4 +- 8 files changed, 131 insertions(+), 100 deletions(-) diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl index 938e3ca22..1b0aac72f 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl @@ -123,15 +123,21 @@ struct Shape; template<> struct Shape { - static Shape create(NBL_CONST_REF_ARG(float32_t3) position, float32_t radius, uint32_t bsdfID, uint32_t lightID) + static Shape create(NBL_CONST_REF_ARG(float32_t3) position, float32_t radius, uint32_t bsdfLightIDs) { Shape retval; retval.position = position; retval.radius2 = radius * radius; - retval.bsdfLightIDs = spirv::bitFieldInsert(bsdfID, lightID, 16, 16); + retval.bsdfLightIDs = bsdfLightIDs; return retval; } + static Shape create(NBL_CONST_REF_ARG(float32_t3) position, float32_t radius, uint32_t bsdfID, uint32_t lightID) + { + uint32_t bsdfLightIDs = spirv::bitFieldInsert(bsdfID, lightID, 16, 16); + return create(position, radius, bsdfLightIDs); + } + // return intersection distance if found, nan otherwise float intersect(NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(float32_t3) direction) { @@ -207,17 +213,23 @@ struct Shape template<> struct Shape { - static Shape create(NBL_CONST_REF_ARG(float32_t3) vertex0, NBL_CONST_REF_ARG(float32_t3) vertex1, NBL_CONST_REF_ARG(float32_t3) vertex2, uint32_t bsdfID, uint32_t lightID) + static Shape create(NBL_CONST_REF_ARG(float32_t3) vertex0, NBL_CONST_REF_ARG(float32_t3) vertex1, NBL_CONST_REF_ARG(float32_t3) vertex2, uint32_t bsdfLightIDs) { Shape retval; retval.vertex0 = vertex0; retval.vertex1 = vertex1; retval.vertex2 = vertex2; - retval.bsdfLightIDs = spirv::bitFieldInsert(bsdfID, lightID, 16, 16); + retval.bsdfLightIDs = bsdfLightIDs; retval.polygonMethod = PPM_SOLID_ANGLE; return retval; } + static 
Shape create(NBL_CONST_REF_ARG(float32_t3) vertex0, NBL_CONST_REF_ARG(float32_t3) vertex1, NBL_CONST_REF_ARG(float32_t3) vertex2, uint32_t bsdfID, uint32_t lightID) + { + uint32_t bsdfLightIDs = spirv::bitFieldInsert(bsdfID, lightID, 16, 16); + return create(vertex0, vertex1, vertex2, bsdfLightIDs); + } + float intersect(NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(float32_t3) direction) { const float32_t3 edges[2] = { vertex1 - vertex0, vertex2 - vertex0 }; @@ -349,17 +361,23 @@ struct Shape template<> struct Shape { - static Shape create(NBL_CONST_REF_ARG(float32_t3) offset, NBL_CONST_REF_ARG(float32_t3) edge0, NBL_CONST_REF_ARG(float32_t3) edge1, uint32_t bsdfID, uint32_t lightID) + static Shape create(NBL_CONST_REF_ARG(float32_t3) offset, NBL_CONST_REF_ARG(float32_t3) edge0, NBL_CONST_REF_ARG(float32_t3) edge1, uint32_t bsdfLightIDs) { Shape retval; retval.offset = offset; retval.edge0 = edge0; retval.edge1 = edge1; - retval.bsdfLightIDs = spirv::bitFieldInsert(bsdfID, lightID, 16, 16); + retval.bsdfLightIDs = bsdfLightIDs; retval.polygonMethod = PPM_SOLID_ANGLE; return retval; } + static Shape create(NBL_CONST_REF_ARG(float32_t3) offset, NBL_CONST_REF_ARG(float32_t3) edge0, NBL_CONST_REF_ARG(float32_t3) edge1, uint32_t bsdfID, uint32_t lightID) + { + uint32_t bsdfLightIDs = spirv::bitFieldInsert(bsdfID, lightID, 16, 16); + return create(offset, edge0, edge1, bsdfLightIDs); + } + float intersect(NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(float32_t3) direction) { const float32_t3 h = nbl::hlsl::cross(direction, edge1); diff --git a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl index 5151ea9c0..0bb6cb31c 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl @@ -41,17 +41,18 @@ template struct Comprehensive { using scalar_type = typename Ray::scalar_type; + using vector3_type = vector; using ray_type = 
Ray; static ObjectID traceProcedural(NBL_REF_ARG(ray_type) ray, NBL_REF_ARG(IntersectData) intersect) { const bool anyHit = ray.intersectionT != numeric_limits::max; const uint32_t objCount = intersect.data[0]; - const ProceduralShapeType type = intersect.data[1]; + const ProceduralShapeType type = (ProceduralShapeType)intersect.data[1]; ObjectID objectID = ray.objectID; objectID.mode = IntersectData::Mode::PROCEDURAL; - objectID.type = type; + objectID.shapeType = type; for (int i = 0; i < objCount; i++) { float t; @@ -59,25 +60,25 @@ struct Comprehensive { case PST_SPHERE: { - float32_t3 position = float32_t3(asfloat(intersect.data[2 + i * Shape::ObjSize]), asfloat(intersect.data[2 + i * Shape::ObjSize + 1]), asfloat(intersect.data[2 + i * Shape::ObjSize + 2])); + vector3_type position = vector3_type(asfloat(intersect.data[2 + i * Shape::ObjSize]), asfloat(intersect.data[2 + i * Shape::ObjSize + 1]), asfloat(intersect.data[2 + i * Shape::ObjSize + 2])); Shape sphere = Shape::create(position, asfloat(intersect.data[2 + i * Shape::ObjSize + 3]), intersect.data[2 + i * Shape::ObjSize + 4]); t = sphere.intersect(ray.origin, ray.direction); } break; case PST_TRIANGLE: { - float32_t3 vertex0 = float32_t3(asfloat(intersect.data[2 + i * Shape::ObjSize]), asfloat(intersect.data[2 + i * Shape::ObjSize + 1]), asfloat(intersect.data[2 + i * Shape::ObjSize + 2])); - float32_t3 vertex1 = float32_t3(asfloat(intersect.data[2 + i * Shape::ObjSize + 3]), asfloat(intersect.data[2 + i * Shape::ObjSize + 4]), asfloat(intersect.data[2 + i * Shape::ObjSize + 5])); - float32_t3 vertex2 = float32_t3(asfloat(intersect.data[2 + i * Shape::ObjSize + 6]), asfloat(intersect.data[2 + i * Shape::ObjSize + 7]), asfloat(intersect.data[2 + i * Shape::ObjSize + 8])); + vector3_type vertex0 = vector3_type(asfloat(intersect.data[2 + i * Shape::ObjSize]), asfloat(intersect.data[2 + i * Shape::ObjSize + 1]), asfloat(intersect.data[2 + i * Shape::ObjSize + 2])); + vector3_type vertex1 = 
vector3_type(asfloat(intersect.data[2 + i * Shape::ObjSize + 3]), asfloat(intersect.data[2 + i * Shape::ObjSize + 4]), asfloat(intersect.data[2 + i * Shape::ObjSize + 5])); + vector3_type vertex2 = vector3_type(asfloat(intersect.data[2 + i * Shape::ObjSize + 6]), asfloat(intersect.data[2 + i * Shape::ObjSize + 7]), asfloat(intersect.data[2 + i * Shape::ObjSize + 8])); Shape tri = Shape::create(vertex0, vertex1, vertex2, intersect.data[2 + i * Shape::ObjSize + 9]); t = tri.intersect(ray.origin, ray.direction); } break; case PST_RECTANGLE: { - float32_t3 offset = float32_t3(asfloat(intersect.data[2 + i * Shape::ObjSize]), asfloat(intersect.data[2 + i * Shape::ObjSize + 1]), asfloat(intersect.data[2 + i * Shape::ObjSize + 2])); - float32_t3 edge0 = float32_t3(asfloat(intersect.data[2 + i * Shape::ObjSize + 3]), asfloat(intersect.data[2 + i * Shape::ObjSize + 4]), asfloat(intersect.data[2 + i * Shape::ObjSize + 5])); - float32_t3 edge1 = float32_t3(asfloat(intersect.data[2 + i * Shape::ObjSize + 6]), asfloat(intersect.data[2 + i * Shape::ObjSize + 7]), asfloat(intersect.data[2 + i * Shape::ObjSize + 8])); + vector3_type offset = vector3_type(asfloat(intersect.data[2 + i * Shape::ObjSize]), asfloat(intersect.data[2 + i * Shape::ObjSize + 1]), asfloat(intersect.data[2 + i * Shape::ObjSize + 2])); + vector3_type edge0 = vector3_type(asfloat(intersect.data[2 + i * Shape::ObjSize + 3]), asfloat(intersect.data[2 + i * Shape::ObjSize + 4]), asfloat(intersect.data[2 + i * Shape::ObjSize + 5])); + vector3_type edge1 = vector3_type(asfloat(intersect.data[2 + i * Shape::ObjSize + 6]), asfloat(intersect.data[2 + i * Shape::ObjSize + 7]), asfloat(intersect.data[2 + i * Shape::ObjSize + 8])); Shape rect = Shape::create(offset, edge0, edge1, intersect.data[2 + i * Shape::ObjSize + 9]); t = rect.intersect(ray.origin, ray.direction); } @@ -101,7 +102,7 @@ struct Comprehensive static ObjectID traceRay(NBL_REF_ARG(ray_type) ray, NBL_REF_ARG(IntersectData) intersect) { - const 
IntersectData::Mode mode = intersect.mode; + const IntersectData::Mode mode = (IntersectData::Mode)intersect.mode; switch (mode) { case IntersectData::Mode::RAY_QUERY: @@ -120,7 +121,11 @@ struct Comprehensive } break; default: - return ObjectID(-1, IntersectData::Mode::PROCEDURAL, PST_SPHERE); + { + ObjectID objID; + objID.id = -1; + return objID; + } } } @@ -136,19 +141,19 @@ struct Comprehensive if (scene.sphereCount > 0) { data = scene.toIntersectData(ext::Intersector::IntersectData::Mode::PROCEDURAL, PST_SPHERE); - objectID = intersector.traceRay(ray, data); + objectID = traceRay(ray, data); } if (scene.triangleCount > 0) { data = scene.toIntersectData(ext::Intersector::IntersectData::Mode::PROCEDURAL, PST_TRIANGLE); - objectID = intersector.traceRay(ray, data); + objectID = traceRay(ray, data); } if (scene.rectangleCount > 0) { data = scene.toIntersectData(ext::Intersector::IntersectData::Mode::PROCEDURAL, PST_RECTANGLE); - objectID = intersector.traceRay(ray, data); + objectID = traceRay(ray, data); } // TODO: trace AS diff --git a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl index 687c41dc0..9d638c232 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl @@ -26,7 +26,7 @@ struct Material NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 32; uint32_t type : 1; - unit32_t unused : 31; // possible space for flags + uint32_t unused : 31; // possible space for flags uint32_t data[DataSize]; }; @@ -41,37 +41,39 @@ struct System using quotient_pdf_type = typename DiffuseBxDF::quotient_pdf_type; using anisotropic_type = typename DiffuseBxDF::anisotropic_type; using anisocache_type = typename ConductorBxDF::anisocache_type; - using params_t = SBxDFParams; - using create_params_t = SBxDFCreationParams; + using params_t = bxdf::SBxDFParams; + using create_params_t = bxdf::SBxDFCreationParams; using diffuse_op_type = 
DiffuseBxDF; using conductor_op_type = ConductorBxDF; using dielectric_op_type = DielectricBxDF; - static this_t create(NBL_CONST_REF_ARG(SBxDFCreationParams) diffuseParams, NBL_CONST_REF_ARG(SBxDFCreationParams) conductorParams, NBL_CONST_REF_ARG(SBxDFCreationParams) dielectricParams) + static this_t create(NBL_CONST_REF_ARG(create_params_t) diffuseParams, NBL_CONST_REF_ARG(create_params_t) conductorParams, NBL_CONST_REF_ARG(create_params_t) dielectricParams) { - diffuseBxDF = DiffuseBxDF::create(diffuseParams); - conductorBxDF = DiffuseBxDF::create(conductorParams); - dielectricBxDF = DiffuseBxDF::create(dielectricParams); + this_t retval; + retval.diffuseBxDF = DiffuseBxDF::create(diffuseParams); + retval.conductorBxDF = DiffuseBxDF::create(conductorParams); + retval.dielectricBxDF = DiffuseBxDF::create(dielectricParams); + return retval; } - static measure_type eval(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(create_params_t) cparams, NBL_CONST_REF_ARG(params_t) params) + measure_type eval(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(create_params_t) cparams, NBL_CONST_REF_ARG(params_t) params) { switch(material.type) { - case DIFFUSE: + case Material::Type::DIFFUSE: { diffuseBxDF.init(cparams); return (measure_type)diffuseBxDF.eval(params); } break; - case CONDUCTOR: + case Material::Type::CONDUCTOR: { conductorBxDF.init(cparams); return conductorBxDF.eval(params); } break; - case DIELECTRIC: + case Material::Type::DIELECTRIC: { dielectricBxDF.init(cparams); return dielectricBxDF.eval(params); @@ -82,23 +84,23 @@ struct System } } - static vector3_type generate(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(create_params_t) cparams, anisotropic_type interaction, NBL_CONST_REF_ARG(vector3_type) u, NBL_REF_ARG(anisocache_type) cache) + vector3_type generate(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(create_params_t) cparams, anisotropic_type interaction, NBL_CONST_REF_ARG(vector3_type) u, NBL_REF_ARG(anisocache_type) 
cache) { switch(material.type) { - case DIFFUSE: + case Material::Type::DIFFUSE: { diffuseBxDF.init(cparams); return diffuseBxDF.generate(interaction, u.xy); } break; - case CONDUCTOR: + case Material::Type::CONDUCTOR: { conductorBxDF.init(cparams); return conductorBxDF.generate(interaction, u.xy, cache); } break; - case DIELECTRIC: + case Material::Type::DIELECTRIC: { dielectricBxDF.init(cparams); return dielectricBxDF.generate(interaction, u, cache); @@ -109,26 +111,26 @@ struct System } } - static quotient_pdf_type quotient_and_pdf(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(create_params_t) cparams, NBL_CONST_REF_ARG(params_t) params) + quotient_pdf_type quotient_and_pdf(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(create_params_t) cparams, NBL_CONST_REF_ARG(params_t) params) { const float minimumProjVectorLen = 0.00000001; if (params.NdotV > minimumProjVectorLen && params.NdotL > minimumProjVectorLen) { switch(material.type) { - case DIFFUSE: + case Material::Type::DIFFUSE: { diffuseBxDF.init(cparams); return diffuseBxDF.quotient_and_pdf(params); } break; - case CONDUCTOR: + case Material::Type::CONDUCTOR: { conductorBxDF.init(cparams); return conductorBxDF.quotient_and_pdf(params); } break; - case DIELECTRIC: + case Material::Type::DIELECTRIC: { dielectricBxDF.init(cparams); return dielectricBxDF.quotient_and_pdf(params); diff --git a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl index 5695efc0d..c7573fbb3 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl @@ -12,7 +12,7 @@ namespace ext namespace NextEventEstimator { -// procedural data store: [light count] [intersect type] [obj] +// procedural data store: [light count] [event type] [obj] struct Event { @@ -26,7 +26,7 @@ struct Event NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 16; uint32_t mode : 1; - 
unit32_t unused : 31; // possible space for flags + uint32_t unused : 31; // possible space for flags uint32_t data[DataSize]; }; @@ -34,43 +34,44 @@ template struct Estimator { using scalar_type = typename Ray::scalar_type; + using vector3_type = vector; using ray_type = Ray; using light_type = Light; using spectral_type = typename Light::spectral_type; using interaction_type = Aniso; - using quotient_pdf_type = quotient_and_pdf; + using quotient_pdf_type = bxdf::quotient_and_pdf; using sample_type = LightSample; static spectral_type proceduralDeferredEvalAndPdf(NBL_REF_ARG(scalar_type) pdf, NBL_CONST_REF_ARG(light_type) light, NBL_CONST_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(Event) event) { const uint32_t lightCount = event.data[0]; - const ProceduralShapeType type = event.data[1]; + const ProceduralShapeType type = (ProceduralShapeType)event.data[1]; pdf = 1.0 / lightCount; switch (type) { case PST_SPHERE: { - float32_t3 position = float32_t3(asfloat(intersect.data[2 + Shape::ObjSize]), asfloat(intersect.data[2 + Shape::ObjSize + 1]), asfloat(intersect.data[2 + Shape::ObjSize + 2])); - Shape sphere = Shape::create(position, asfloat(intersect.data[2 + Shape::ObjSize + 3]), intersect.data[2 + Shape::ObjSize + 4]); + vector3_type position = vector3_type(asfloat(event.data[2]), asfloat(event.data[3]), asfloat(event.data[4])); + Shape sphere = Shape::create(position, asfloat(event.data[5]), event.data[6]); pdf *= sphere.template deferredPdf(ray); } break; case PST_TRIANGLE: { - float32_t3 vertex0 = float32_t3(asfloat(intersect.data[2 + Shape::ObjSize]), asfloat(intersect.data[2 + Shape::ObjSize + 1]), asfloat(intersect.data[2 + Shape::ObjSize + 2])); - float32_t3 vertex1 = float32_t3(asfloat(intersect.data[2 + Shape::ObjSize + 3]), asfloat(intersect.data[2 + Shape::ObjSize + 4]), asfloat(intersect.data[2 + Shape::ObjSize + 5])); - float32_t3 vertex2 = float32_t3(asfloat(intersect.data[2 + Shape::ObjSize + 6]), asfloat(intersect.data[2 + Shape::ObjSize + 7]), 
asfloat(intersect.data[2 + Shape::ObjSize + 8])); - Shape tri = Shape::create(vertex0, vertex1, vertex2, intersect.data[2 + Shape::ObjSize + 9]); + vector3_type vertex0 = vector3_type(asfloat(event.data[2]), asfloat(event.data[3]), asfloat(event.data[4])); + vector3_type vertex1 = vector3_type(asfloat(event.data[5]), asfloat(event.data[6]), asfloat(event.data[7])); + vector3_type vertex2 = vector3_type(asfloat(event.data[8]), asfloat(event.data[9]), asfloat(event.data[10])); + Shape tri = Shape::create(vertex0, vertex1, vertex2, event.data[11]); pdf *= tri.template deferredPdf(ray); } break; case PST_RECTANGLE: { - float32_t3 offset = float32_t3(asfloat(intersect.data[2 + Shape::ObjSize]), asfloat(intersect.data[2 + Shape::ObjSize + 1]), asfloat(intersect.data[2 + Shape::ObjSize + 2])); - float32_t3 edge0 = float32_t3(asfloat(intersect.data[2 + Shape::ObjSize + 3]), asfloat(intersect.data[2 + Shape::ObjSize + 4]), asfloat(intersect.data[2 + Shape::ObjSize + 5])); - float32_t3 edge1 = float32_t3(asfloat(intersect.data[2 + Shape::ObjSize + 6]), asfloat(intersect.data[2 + Shape::ObjSize + 7]), asfloat(intersect.data[2 + Shape::ObjSize + 8])); - Shape rect = Shape::create(offset, edge0, edge1, intersect.data[2 + Shape::ObjSize + 9]); + vector3_type offset = vector3_type(asfloat(event.data[2]), asfloat(event.data[3]), asfloat(event.data[4])); + vector3_type edge0 = vector3_type(asfloat(event.data[5]), asfloat(event.data[6]), asfloat(event.data[7])); + vector3_type edge1 = vector3_type(asfloat(event.data[8]), asfloat(event.data[9]), asfloat(event.data[10])); + Shape rect = Shape::create(offset, edge0, edge1, event.data[11]); pdf *= rect.template deferredPdf(ray); } break; @@ -84,7 +85,7 @@ struct Estimator static spectral_type deferredEvalAndPdf(NBL_REF_ARG(scalar_type) pdf, NBL_CONST_REF_ARG(light_type) light, NBL_CONST_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(Event) event) { - const Event::Mode mode = event.mode; + const Event::Mode mode = (Event::Mode)event.mode; 
switch (mode) { case Event::Mode::RAY_QUERY: @@ -107,10 +108,10 @@ struct Estimator } } - static sample_type procedural_generate_and_quotient_and_pdf(NBL_REF_ARG(quotient_pdf_type) quotient_pdf, NBL_REF_ARG(scalar_type) newRayMaxT, NBL_CONST_REF_ARG(vector3_type) origin, NBL_CONST_REF_ARG(interaction_type) interaction, bool isBSDF, NBL_CONST_REF_ARG(vector3_type) xi, unit32_t depth, NBL_CONST_REF_ARG(Event) event) + static sample_type procedural_generate_and_quotient_and_pdf(NBL_REF_ARG(quotient_pdf_type) quotient_pdf, NBL_REF_ARG(scalar_type) newRayMaxT, NBL_CONST_REF_ARG(light_type) light, NBL_CONST_REF_ARG(vector3_type) origin, NBL_CONST_REF_ARG(interaction_type) interaction, bool isBSDF, NBL_CONST_REF_ARG(vector3_type) xi, uint32_t depth, NBL_CONST_REF_ARG(Event) event) { const uint32_t lightCount = event.data[0]; - const ProceduralShapeType type = event.data[1]; + const ProceduralShapeType type = (ProceduralShapeType)event.data[1]; sample_type L; scalar_type pdf; @@ -118,26 +119,26 @@ struct Estimator { case PST_SPHERE: { - float32_t3 position = float32_t3(asfloat(intersect.data[2 + Shape::ObjSize]), asfloat(intersect.data[2 + Shape::ObjSize + 1]), asfloat(intersect.data[2 + Shape::ObjSize + 2])); - Shape sphere = Shape::create(position, asfloat(intersect.data[2 + Shape::ObjSize + 3]), intersect.data[2 + Shape::ObjSize + 4]); + vector3_type position = vector3_type(asfloat(event.data[2 + Shape::ObjSize]), asfloat(event.data[2 + Shape::ObjSize + 1]), asfloat(event.data[2 + Shape::ObjSize + 2])); + Shape sphere = Shape::create(position, asfloat(event.data[2 + Shape::ObjSize + 3]), event.data[2 + Shape::ObjSize + 4]); L = sphere.template generate_and_pdf(pdf, newRayMaxT, origin, interaction, isBSDF, xi); } break; case PST_TRIANGLE: { - float32_t3 vertex0 = float32_t3(asfloat(intersect.data[2 + Shape::ObjSize]), asfloat(intersect.data[2 + Shape::ObjSize + 1]), asfloat(intersect.data[2 + Shape::ObjSize + 2])); - float32_t3 vertex1 = 
float32_t3(asfloat(intersect.data[2 + Shape::ObjSize + 3]), asfloat(intersect.data[2 + Shape::ObjSize + 4]), asfloat(intersect.data[2 + Shape::ObjSize + 5])); - float32_t3 vertex2 = float32_t3(asfloat(intersect.data[2 + Shape::ObjSize + 6]), asfloat(intersect.data[2 + Shape::ObjSize + 7]), asfloat(intersect.data[2 + Shape::ObjSize + 8])); - Shape tri = Shape::create(vertex0, vertex1, vertex2, intersect.data[2 + Shape::ObjSize + 9]); + vector3_type vertex0 = vector3_type(asfloat(event.data[2 + Shape::ObjSize]), asfloat(event.data[2 + Shape::ObjSize + 1]), asfloat(event.data[2 + Shape::ObjSize + 2])); + vector3_type vertex1 = vector3_type(asfloat(event.data[2 + Shape::ObjSize + 3]), asfloat(event.data[2 + Shape::ObjSize + 4]), asfloat(event.data[2 + Shape::ObjSize + 5])); + vector3_type vertex2 = vector3_type(asfloat(event.data[2 + Shape::ObjSize + 6]), asfloat(event.data[2 + Shape::ObjSize + 7]), asfloat(event.data[2 + Shape::ObjSize + 8])); + Shape tri = Shape::create(vertex0, vertex1, vertex2, event.data[2 + Shape::ObjSize + 9]); L = tri.template generate_and_pdf(pdf, newRayMaxT, origin, interaction, isBSDF, xi); } break; case PST_RECTANGLE: { - float32_t3 offset = float32_t3(asfloat(intersect.data[2 + Shape::ObjSize]), asfloat(intersect.data[2 + Shape::ObjSize + 1]), asfloat(intersect.data[2 + Shape::ObjSize + 2])); - float32_t3 edge0 = float32_t3(asfloat(intersect.data[2 + Shape::ObjSize + 3]), asfloat(intersect.data[2 + Shape::ObjSize + 4]), asfloat(intersect.data[2 + Shape::ObjSize + 5])); - float32_t3 edge1 = float32_t3(asfloat(intersect.data[2 + Shape::ObjSize + 6]), asfloat(intersect.data[2 + Shape::ObjSize + 7]), asfloat(intersect.data[2 + Shape::ObjSize + 8])); - Shape rect = Shape::create(offset, edge0, edge1, intersect.data[2 + Shape::ObjSize + 9]); + vector3_type offset = vector3_type(asfloat(event.data[2 + Shape::ObjSize]), asfloat(event.data[2 + Shape::ObjSize + 1]), asfloat(event.data[2 + Shape::ObjSize + 2])); + vector3_type edge0 = 
vector3_type(asfloat(event.data[2 + Shape::ObjSize + 3]), asfloat(event.data[2 + Shape::ObjSize + 4]), asfloat(event.data[2 + Shape::ObjSize + 5])); + vector3_type edge1 = vector3_type(asfloat(event.data[2 + Shape::ObjSize + 6]), asfloat(event.data[2 + Shape::ObjSize + 7]), asfloat(event.data[2 + Shape::ObjSize + 8])); + Shape rect = Shape::create(offset, edge0, edge1, event.data[2 + Shape::ObjSize + 9]); L = rect.template generate_and_pdf(pdf, newRayMaxT, origin, interaction, isBSDF, xi); } break; @@ -154,9 +155,9 @@ struct Estimator return L; } - static sample_type generate_and_quotient_and_pdf(NBL_REF_ARG(quotient_pdf_type) quotient_pdf, NBL_REF_ARG(scalar_type) newRayMaxT, NBL_CONST_REF_ARG(vector3_type) origin, NBL_CONST_REF_ARG(interaction_type) interaction, bool isBSDF, NBL_CONST_REF_ARG(vector3_type) xi, unit32_t depth, NBL_CONST_REF_ARG(Event) event) + static sample_type generate_and_quotient_and_pdf(NBL_REF_ARG(quotient_pdf_type) quotient_pdf, NBL_REF_ARG(scalar_type) newRayMaxT, NBL_CONST_REF_ARG(light_type) light, NBL_CONST_REF_ARG(vector3_type) origin, NBL_CONST_REF_ARG(interaction_type) interaction, bool isBSDF, NBL_CONST_REF_ARG(vector3_type) xi, uint32_t depth, NBL_CONST_REF_ARG(Event) event) { - const Event::Mode mode = event.mode; + const Event::Mode mode = (Event::Mode)event.mode; switch (mode) { case Event::Mode::RAY_QUERY: @@ -171,7 +172,7 @@ struct Estimator break; case Event::Mode::PROCEDURAL: { - return procedural_generate_and_quotient_and_pdf(newRayMaxT, origin, interaction, isBSDF, xi, depth, event); + return procedural_generate_and_quotient_and_pdf(quotient_pdf, newRayMaxT, light, origin, interaction, isBSDF, xi, depth, event); } break; default: diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl index b14c9baae..a740ec388 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl @@ -10,6 +10,7 @@ 
#include "intersector.hlsl" #include "material_system.hlsl" #include "next_event_estimator.hlsl" +#include "scene.hlsl" namespace nbl { @@ -170,8 +171,8 @@ struct Unidirectional scalar_type t; sample_type nee_sample = nee.generate_and_quotient_and_pdf( neeContrib_pdf, t, - intersection, interaction, - isBSDF, eps0, depth + lights[lightID], intersection, interaction, + isBSDF, eps0, depth, scene.toNextEvent(lightID) ); // We don't allow non watertight transmitters in this renderer @@ -236,7 +237,7 @@ struct Unidirectional nee_ray.origin = intersection + nee_sample.L.direction * t * Tolerance::getStart(depth); nee_ray.direction = nee_sample.L.direction; nee_ray.intersectionT = t; - if (bsdf_quotient_pdf.pdf < numeric_limits::max && getLuma(neeContrib_pdf.quotient) > lumaContributionThreshold && intersector.traceRay(nee_ray, scene).id == -1) + if (bsdf_quotient_pdf.pdf < numeric_limits::max && getLuma(neeContrib_pdf.quotient) > lumaContributionThreshold && intersector::traceRay(nee_ray, scene).id == -1) ray._payload.accumulation += neeContrib_pdf.quotient; } } @@ -338,7 +339,7 @@ struct Unidirectional for (int d = 1; d <= depth && hit && rayAlive; d += 2) { ray.intersectionT = numeric_limits::max; - ray.objectID = intersector.traceRay(ray, scene); + ray.objectID = intersector::traceRay(ray, scene); hit = ray.objectID.id != -1; if (hit) diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl index 1c8c15ec4..f9558c3d1 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl @@ -167,6 +167,10 @@ void main(uint32_t3 threadID : SV_DispatchThreadID) ptCreateParams.NDC = NDC; ptCreateParams.invMVP = pc.invMVP; + ptCreateParams.diffuseParams = bxdfs[0].params; + ptCreateParams.conductorParams = bxdfs[3].params; + ptCreateParams.dielectricParams = bxdfs[6].params; + pathtracer_type pathtracer = pathtracer_type::create(ptCreateParams, 
samplerSequence); // set up scene (can do as global var?) diff --git a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl index cbc9d153c..fc10d906c 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl @@ -129,41 +129,41 @@ struct Scene case PST_SPHERE: { Shape sphere = spheres[id]; - retval.data[2 + Shape::ObjSize] = asuint(sphere.position.x); - retval.data[2 + Shape::ObjSize + 1] = asuint(sphere.position.y); - retval.data[2 + Shape::ObjSize + 2] = asuint(sphere.position.z); - retval.data[2 + Shape::ObjSize + 3] = asuint(sphere.radius); - retval.data[2 + Shape::ObjSize + 4] = sphere.bsdfLightIDs; + retval.data[2] = asuint(sphere.position.x); + retval.data[3] = asuint(sphere.position.y); + retval.data[4] = asuint(sphere.position.z); + retval.data[5] = asuint(sphere.radius); + retval.data[6] = sphere.bsdfLightIDs; } break; case PST_TRIANGLE: { Shape tri = triangles[id]; - retval.data[2 + Shape::ObjSize] = asuint(tri.vertex0.x); - retval.data[2 + Shape::ObjSize + 1] = asuint(tri.vertex0.y); - retval.data[2 + Shape::ObjSize + 2] = asuint(tri.vertex0.z); - retval.data[2 + Shape::ObjSize + 3] = asuint(tri.vertex1.x); - retval.data[2 + Shape::ObjSize + 4] = asuint(tri.vertex1.y); - retval.data[2 + Shape::ObjSize + 5] = asuint(tri.vertex1.z); - retval.data[2 + Shape::ObjSize + 6] = asuint(tri.vertex2.x); - retval.data[2 + Shape::ObjSize + 7] = asuint(tri.vertex2.y); - retval.data[2 + Shape::ObjSize + 8] = asuint(tri.vertex2.z); - retval.data[2 + Shape::ObjSize + 9] = tri.bsdfLightIDs; + retval.data[2] = asuint(tri.vertex0.x); + retval.data[3] = asuint(tri.vertex0.y); + retval.data[4] = asuint(tri.vertex0.z); + retval.data[5] = asuint(tri.vertex1.x); + retval.data[6] = asuint(tri.vertex1.y); + retval.data[7] = asuint(tri.vertex1.z); + retval.data[8] = asuint(tri.vertex2.x); + retval.data[9] = asuint(tri.vertex2.y); + retval.data[10] = asuint(tri.vertex2.z); + 
retval.data[11] = tri.bsdfLightIDs; } break; case PST_RECTANGLE: { Shape rect = rectangles[id]; - retval.data[2 + Shape::ObjSize] = asuint(rect.offset.x); - retval.data[2 + Shape::ObjSize + 1] = asuint(rect.offset.y); - retval.data[2 + Shape::ObjSize + 2] = asuint(rect.offset.z); - retval.data[2 + Shape::ObjSize + 3] = asuint(rect.edge0.x); - retval.data[2 + Shape::ObjSize + 4] = asuint(rect.edge0.y); - retval.data[2 + Shape::ObjSize + 5] = asuint(rect.edge0.z); - retval.data[2 + Shape::ObjSize + 6] = asuint(rect.edge1.x); - retval.data[2 + Shape::ObjSize + 7] = asuint(rect.edge1.y); - retval.data[2 + Shape::ObjSize + 8] = asuint(rect.edge1.z); - retval.data[2 + Shape::ObjSize + 9] = rect.bsdfLightIDs; + retval.data[2] = asuint(rect.offset.x); + retval.data[3] = asuint(rect.offset.y); + retval.data[4] = asuint(rect.offset.z); + retval.data[5] = asuint(rect.edge0.x); + retval.data[6] = asuint(rect.edge0.y); + retval.data[7] = asuint(rect.edge0.z); + retval.data[8] = asuint(rect.edge1.x); + retval.data[9] = asuint(rect.edge1.y); + retval.data[10] = asuint(rect.edge1.z); + retval.data[11] = rect.bsdfLightIDs; } break; default: diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp index 13aa59823..5aff6bde7 100644 --- a/31_HLSLPathTracer/main.cpp +++ b/31_HLSLPathTracer/main.cpp @@ -369,8 +369,8 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication options.preprocessorOptions.logger = m_logger.get(); options.preprocessorOptions.includeFinder = compiler->getDefaultIncludeFinder(); - //std::string dxcOptionStr[] = { "-D" + defineMacro }; - //options.dxcOptions = std::span(dxcOptionStr); + std::string dxcOptionStr[] = { "-D" + defineMacro }; + options.dxcOptions = std::span(dxcOptionStr); source = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options); }; From a802a97943bd9e17187a306f8058c21d2774678b Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 21 Feb 2025 16:57:15 +0700 Subject: [PATCH 
055/296] bug fixes #3 --- .../app_resources/hlsl/common.hlsl | 7 ++++--- .../hlsl/next_event_estimator.hlsl | 20 +++++++++---------- .../app_resources/hlsl/pathtracer.hlsl | 16 +++++++-------- .../app_resources/hlsl/render.comp.hlsl | 10 +++++----- .../app_resources/hlsl/scene.hlsl | 18 ++++++++--------- 5 files changed, 36 insertions(+), 35 deletions(-) diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl index 1b0aac72f..f12b72b5d 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -134,7 +135,7 @@ struct Shape static Shape create(NBL_CONST_REF_ARG(float32_t3) position, float32_t radius, uint32_t bsdfID, uint32_t lightID) { - uint32_t bsdfLightIDs = spirv::bitFieldInsert(bsdfID, lightID, 16, 16); + uint32_t bsdfLightIDs = glsl::bitfieldInsert(bsdfID, lightID, 16, 16); return create(position, radius, bsdfLightIDs); } @@ -226,7 +227,7 @@ struct Shape static Shape create(NBL_CONST_REF_ARG(float32_t3) vertex0, NBL_CONST_REF_ARG(float32_t3) vertex1, NBL_CONST_REF_ARG(float32_t3) vertex2, uint32_t bsdfID, uint32_t lightID) { - uint32_t bsdfLightIDs = spirv::bitFieldInsert(bsdfID, lightID, 16, 16); + uint32_t bsdfLightIDs = glsl::bitfieldInsert(bsdfID, lightID, 16, 16); return create(vertex0, vertex1, vertex2, bsdfLightIDs); } @@ -374,7 +375,7 @@ struct Shape static Shape create(NBL_CONST_REF_ARG(float32_t3) offset, NBL_CONST_REF_ARG(float32_t3) edge0, NBL_CONST_REF_ARG(float32_t3) edge1, uint32_t bsdfID, uint32_t lightID) { - uint32_t bsdfLightIDs = spirv::bitFieldInsert(bsdfID, lightID, 16, 16); + uint32_t bsdfLightIDs = glsl::bitfieldInsert(bsdfID, lightID, 16, 16); return create(offset, edge0, edge1, bsdfLightIDs); } diff --git a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl index 
c7573fbb3..32a7b7476 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl @@ -119,26 +119,26 @@ struct Estimator { case PST_SPHERE: { - vector3_type position = vector3_type(asfloat(event.data[2 + Shape::ObjSize]), asfloat(event.data[2 + Shape::ObjSize + 1]), asfloat(event.data[2 + Shape::ObjSize + 2])); - Shape sphere = Shape::create(position, asfloat(event.data[2 + Shape::ObjSize + 3]), event.data[2 + Shape::ObjSize + 4]); + vector3_type position = vector3_type(asfloat(event.data[2]), asfloat(event.data[3]), asfloat(event.data[4])); + Shape sphere = Shape::create(position, asfloat(event.data[5]), event.data[6]); L = sphere.template generate_and_pdf(pdf, newRayMaxT, origin, interaction, isBSDF, xi); } break; case PST_TRIANGLE: { - vector3_type vertex0 = vector3_type(asfloat(event.data[2 + Shape::ObjSize]), asfloat(event.data[2 + Shape::ObjSize + 1]), asfloat(event.data[2 + Shape::ObjSize + 2])); - vector3_type vertex1 = vector3_type(asfloat(event.data[2 + Shape::ObjSize + 3]), asfloat(event.data[2 + Shape::ObjSize + 4]), asfloat(event.data[2 + Shape::ObjSize + 5])); - vector3_type vertex2 = vector3_type(asfloat(event.data[2 + Shape::ObjSize + 6]), asfloat(event.data[2 + Shape::ObjSize + 7]), asfloat(event.data[2 + Shape::ObjSize + 8])); - Shape tri = Shape::create(vertex0, vertex1, vertex2, event.data[2 + Shape::ObjSize + 9]); + vector3_type vertex0 = vector3_type(asfloat(event.data[2]), asfloat(event.data[3]), asfloat(event.data[4])); + vector3_type vertex1 = vector3_type(asfloat(event.data[5]), asfloat(event.data[6]), asfloat(event.data[7])); + vector3_type vertex2 = vector3_type(asfloat(event.data[8]), asfloat(event.data[9]), asfloat(event.data[10])); + Shape tri = Shape::create(vertex0, vertex1, vertex2, event.data[11]); L = tri.template generate_and_pdf(pdf, newRayMaxT, origin, interaction, isBSDF, xi); } break; case PST_RECTANGLE: { - vector3_type offset = 
vector3_type(asfloat(event.data[2 + Shape::ObjSize]), asfloat(event.data[2 + Shape::ObjSize + 1]), asfloat(event.data[2 + Shape::ObjSize + 2])); - vector3_type edge0 = vector3_type(asfloat(event.data[2 + Shape::ObjSize + 3]), asfloat(event.data[2 + Shape::ObjSize + 4]), asfloat(event.data[2 + Shape::ObjSize + 5])); - vector3_type edge1 = vector3_type(asfloat(event.data[2 + Shape::ObjSize + 6]), asfloat(event.data[2 + Shape::ObjSize + 7]), asfloat(event.data[2 + Shape::ObjSize + 8])); - Shape rect = Shape::create(offset, edge0, edge1, event.data[2 + Shape::ObjSize + 9]); + vector3_type offset = vector3_type(asfloat(event.data[2]), asfloat(event.data[3]), asfloat(event.data[4])); + vector3_type edge0 = vector3_type(asfloat(event.data[5]), asfloat(event.data[6]), asfloat(event.data[7])); + vector3_type edge1 = vector3_type(asfloat(event.data[8]), asfloat(event.data[9]), asfloat(event.data[10])); + Shape rect = Shape::create(offset, edge0, edge1, event.data[11]); L = rect.template generate_and_pdf(pdf, newRayMaxT, origin, interaction, isBSDF, xi); } break; diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl index a740ec388..c47f24753 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl @@ -82,15 +82,15 @@ struct Unidirectional this_t retval; retval.randGen = randgen_type::create(params.rngState); retval.rayGen = raygen_type::create(params.pixOffsetParam, params.camPos, params.NDC, params.invMVP); - retval.materialSystem = material_system_type::create(diffuseParams, conductorParams, dielectricParams); + retval.materialSystem = material_system_type::create(params.diffuseParams, params.conductorParams, params.dielectricParams); retval.samplerSequence = samplerSequence; return retval; } vector3_type rand3d(uint32_t protoDimension, uint32_t _sample, uint32_t i) { - uint32_t address = spirv::bitfieldInsert(protoDimension, _sample, 
MAX_DEPTH_LOG2, MAX_SAMPLES_LOG2); - unit32_t3 seqVal = texelFetch(sampleSequence, int(address) + i).xyz; + uint32_t address = glsl::bitfieldInsert(protoDimension, _sample, MAX_DEPTH_LOG2, MAX_SAMPLES_LOG2); + uint32_t3 seqVal = texelFetch(sampleSequence, int(address) + i).xyz; seqVal ^= randGen(); return vector3_type(seqVal) * asfloat(0x2f800004u); } @@ -101,7 +101,7 @@ struct Unidirectional } // TODO: probably will only work with procedural shapes, do the other ones - bool closestHitProgram(unit32_t depth, uint32_t _sample, NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(scene_type) scene) + bool closestHitProgram(uint32_t depth, uint32_t _sample, NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(scene_type) scene) { const uint32_t objectID = ray.objectID; const vector3_type intersection = ray.origin + ray.direction * ray.intersectionT; @@ -117,8 +117,8 @@ struct Unidirectional break; case ext::Intersector::IntersectData::Mode::PROCEDURAL: { - bsdfLightIDs = scene.getBsdfLightIDs(objectID.id); - vector3_type N = scene.getNormal(objectID.id) + bsdfLightIDs = scene.getBsdfLightIDs(objectID); + vector3_type N = scene.getNormal(objectID); N = nbl::hlsl::normalize(N); typename isotropic_type::ray_dir_info_type V; V.direction = nbl::hlsl::normalize(-ray.direction); @@ -133,14 +133,14 @@ struct Unidirectional vector3_type throughput = ray.payload.throughput; // emissive - const uint32_t lightID = spirv::bitfieldExtract(bsdfLightIDs, 16, 16); + const uint32_t lightID = glsl::bitfieldExtract(bsdfLightIDs, 16, 16); if (lightID != light_type::INVALID_ID) { float pdf; ray.payload.accumulation += nee.deferredEvalAndPdf(pdf, lights[lightID], ray, scene.toNextEvent(lightID)) * throughput / (1.0 + pdf * pdf * ray.payload.otherTechniqueHeuristic); } - const uint32_t bsdfID = spirv::bitfieldExtract(bsdfLightIDs, 0, 16); + const uint32_t bsdfID = glsl::bitfieldExtract(bsdfLightIDs, 0, 16); if (bsdfID == bxdfnode_type::INVALID_ID) return false; diff --git 
a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl index f9558c3d1..4143b973d 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl @@ -80,7 +80,7 @@ using material_system_type = ext::MaterialSystem::System; using pathtracer_type = ext::PathTracer::Unidirectional; -Shape spheres[SPHERE_COUNT] = { +static const Shape spheres[SPHERE_COUNT] = { Shape::create(float3(0.0, -100.5, -1.0), 100.0, 0u, light_type::INVALID_ID), Shape::create(float3(2.0, 0.0, -1.0), 0.5, 1u, light_type::INVALID_ID), Shape::create(float3(0.0, 0.0, -1.0), 0.5, 2u, light_type::INVALID_ID), @@ -97,7 +97,7 @@ Shape spheres[SPHERE_COUNT] = { #ifdef TRIANGLE_LIGHT #define LIGHT_TYPE PST_TRIANGLE #define TRIANGLE_COUNT 1 -Shape triangles[TRIANGLE_COUNT] = { +static const Shape triangles[TRIANGLE_COUNT] = { Shape::create(float3(-1.8,0.35,0.3) * 10.0, float3(-1.2,0.35,0.0) * 10.0, float3(-1.5,0.8,-0.3) * 10.0, bxdfnode_type::INVALID_ID, 0u) }; #endif @@ -105,18 +105,18 @@ Shape triangles[TRIANGLE_COUNT] = { #ifdef RECTANGLE_LIGHT #define LIGHT_TYPE PST_RECTANGLE #define RECTANGLE_COUNT 1 -Shape rectangles[RECTANGLE_COUNT] = { +static const Shape rectangles[RECTANGLE_COUNT] = { Shape::create(float3(-3.8,0.35,1.3), normalize(float3(2,0,-1))*7.0, normalize(float3(2,-5,4))*0.1, bxdfnode_type::INVALID_ID, 0u) }; #endif #define LIGHT_COUNT 1 -light_type lights[LIGHT_COUNT] = { +static const light_type lights[LIGHT_COUNT] = { light_type(spectral_t(30.0,25.0,15.0), ext::ObjectID(8u, ext::NextEventEstimator::Event::Mode::PROCEDURAL, LIGHT_TYPE)) }; #define BXDF_COUNT 7 -bxdfnode_type bxdfs[BXDF_COUNT] = { +static const bxdfnode_type bxdfs[BXDF_COUNT] = { bxdfnode_type(ext::MaterialSystem::Material::Type::DIFFUSE, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(1.25,1.25,1.25))), bxdfnode_type(ext::MaterialSystem::Material::Type::DIFFUSE, create_params_t(false, 
float2(0,0), spectral_t(1,1,1), spectral_t(1.25,2.5,2.5))), bxdfnode_type(ext::MaterialSystem::Material::Type::DIFFUSE, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(2.5,1.25,2.5))), diff --git a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl index fc10d906c..88940c54d 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl @@ -65,7 +65,7 @@ struct Scene retval.data[2 + i * Shape::ObjSize] = asuint(sphere.position.x); retval.data[2 + i * Shape::ObjSize + 1] = asuint(sphere.position.y); retval.data[2 + i * Shape::ObjSize + 2] = asuint(sphere.position.z); - retval.data[2 + i * Shape::ObjSize + 3] = asuint(sphere.radius); + retval.data[2 + i * Shape::ObjSize + 3] = asuint(sphere.radius2); retval.data[2 + i * Shape::ObjSize + 4] = sphere.bsdfLightIDs; } } @@ -174,18 +174,18 @@ struct Scene } // TODO: get these to work with AS types as well - uint32_t getBsdfLightIDs(uint32_t id) + uint32_t getBsdfLightIDs(NBL_CONST_REF_ARG(ObjectID) objectID) { - return (objectID.type == PST_SPHERE) ? spheres[id].bsdfLightIDs : - (objectID.type == PST_TRIANGLE) ? triangles[id].bsdfLightIDs : - (objectID.type == PST_RECTANGLE) ? rectangles[id].bsdfLightIDs : -1; + return (objectID.type == PST_SPHERE) ? spheres[objectID.id].bsdfLightIDs : + (objectID.type == PST_TRIANGLE) ? triangles[objectID.id].bsdfLightIDs : + (objectID.type == PST_RECTANGLE) ? rectangles[objectID.id].bsdfLightIDs : -1; } - float32_t3 getNormal(uint32_t id, NBL_CONST_REF_ARG(float32_t3) intersection) + float32_t3 getNormal(NBL_CONST_REF_ARG(ObjectID) objectID, NBL_CONST_REF_ARG(float32_t3) intersection) { - return (objectID.type == PST_SPHERE) ? scene.spheres[id].getNormal(intersection) : - (objectID.type == PST_TRIANGLE) ? scene.triangles[id].getNormalTimesArea() : - (objectID.type == PST_RECTANGLE) ? scene.rectangles[id].getNormalTimesArea() : + return (objectID.type == PST_SPHERE) ? 
scene.spheres[objectID.id].getNormal(intersection) : + (objectID.type == PST_TRIANGLE) ? scene.triangles[objectID.id].getNormalTimesArea() : + (objectID.type == PST_RECTANGLE) ? scene.rectangles[objectID.id].getNormalTimesArea() : (float32_t3)0.0; } }; From eed47e73c53be25cb9be67924ca0d075897b64bc Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 24 Feb 2025 10:41:08 +0700 Subject: [PATCH 056/296] fix include when embed resources off --- 31_HLSLPathTracer/app_resources/glsl/litByRectangle.comp | 2 +- 31_HLSLPathTracer/app_resources/glsl/litBySphere.comp | 2 +- 31_HLSLPathTracer/app_resources/glsl/litByTriangle.comp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/31_HLSLPathTracer/app_resources/glsl/litByRectangle.comp b/31_HLSLPathTracer/app_resources/glsl/litByRectangle.comp index 300cef559..d898655c4 100644 --- a/31_HLSLPathTracer/app_resources/glsl/litByRectangle.comp +++ b/31_HLSLPathTracer/app_resources/glsl/litByRectangle.comp @@ -7,7 +7,7 @@ #define SPHERE_COUNT 8 #define POLYGON_METHOD 1 // 0 area sampling, 1 solid angle sampling, 2 approximate projected solid angle sampling -#include "common.glsl" +#include "app_resources/glsl/common.glsl" #define RECTANGLE_COUNT 1 const vec3 edge0 = normalize(vec3(2,0,-1)); diff --git a/31_HLSLPathTracer/app_resources/glsl/litBySphere.comp b/31_HLSLPathTracer/app_resources/glsl/litBySphere.comp index bd1a48575..c8ebb9f08 100644 --- a/31_HLSLPathTracer/app_resources/glsl/litBySphere.comp +++ b/31_HLSLPathTracer/app_resources/glsl/litBySphere.comp @@ -6,7 +6,7 @@ #extension GL_GOOGLE_include_directive : require #define SPHERE_COUNT 9 -#include "common.glsl" +#include "app_resources/glsl/common.glsl" void traceRay_extraShape(inout int objectID, inout float intersectionT, in vec3 origin, in vec3 direction) diff --git a/31_HLSLPathTracer/app_resources/glsl/litByTriangle.comp b/31_HLSLPathTracer/app_resources/glsl/litByTriangle.comp index ba23c82e5..36fe522f2 100644 --- 
a/31_HLSLPathTracer/app_resources/glsl/litByTriangle.comp +++ b/31_HLSLPathTracer/app_resources/glsl/litByTriangle.comp @@ -7,7 +7,7 @@ #define SPHERE_COUNT 8 #define POLYGON_METHOD 1 // 0 area sampling, 1 solid angle sampling, 2 approximate projected solid angle sampling -#include "common.glsl" +#include "app_resources/glsl/common.glsl" #define TRIANGLE_COUNT 1 Triangle triangles[TRIANGLE_COUNT] = { From 6e26dae254d190ea66e812fa0789e958716edacc Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 24 Feb 2025 17:11:42 +0700 Subject: [PATCH 057/296] fixed more bugs #4 --- .../app_resources/hlsl/common.hlsl | 2 +- .../app_resources/hlsl/pathtracer.hlsl | 35 +-- .../app_resources/hlsl/render.comp.hlsl | 273 +++++++++--------- .../app_resources/hlsl/scene.hlsl | 18 +- 4 files changed, 165 insertions(+), 163 deletions(-) diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl index f12b72b5d..cd2310fbf 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl @@ -184,7 +184,7 @@ struct Shape Z *= rcpDistance; const float cosThetaMax = nbl::hlsl::sqrt(cosThetaMax2); - const float cosTheta = nbl::hlsl::mix(1.0, cosThetaMax, xi.x); + const float cosTheta = nbl::hlsl::mix(1.0, cosThetaMax, xi.x); float32_t3 L = Z * cosTheta; diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl index c47f24753..f1237006c 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl @@ -4,6 +4,7 @@ #include #include #include +#include #include "rand_gen.hlsl" #include "ray_gen.hlsl" @@ -77,13 +78,13 @@ struct Unidirectional // NextEventEstimator nee) // {} - static this_t create(NBL_CONST_REF_ARG(PathTracerCreationParams) params, Buffer samplerSequence) + static this_t create(NBL_CONST_REF_ARG(PathTracerCreationParams) params, Buffer sampleSequence) 
{ this_t retval; retval.randGen = randgen_type::create(params.rngState); retval.rayGen = raygen_type::create(params.pixOffsetParam, params.camPos, params.NDC, params.invMVP); retval.materialSystem = material_system_type::create(params.diffuseParams, params.conductorParams, params.dielectricParams); - retval.samplerSequence = samplerSequence; + retval.sampleSequence = sampleSequence; return retval; } @@ -103,13 +104,14 @@ struct Unidirectional // TODO: probably will only work with procedural shapes, do the other ones bool closestHitProgram(uint32_t depth, uint32_t _sample, NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(scene_type) scene) { - const uint32_t objectID = ray.objectID; + const ObjectID objectID = ray.objectID; const vector3_type intersection = ray.origin + ray.direction * ray.intersectionT; uint32_t bsdfLightIDs; anisotropic_type interaction; isotropic_type iso_interaction; - switch (objectID.mode) + ext::Intersector::IntersectData::Mode mode = (ext::Intersector::IntersectData::Mode)objectID.mode; + switch (mode) { // TODO case ext::Intersector::IntersectData::Mode::RAY_QUERY: @@ -137,14 +139,14 @@ struct Unidirectional if (lightID != light_type::INVALID_ID) { float pdf; - ray.payload.accumulation += nee.deferredEvalAndPdf(pdf, lights[lightID], ray, scene.toNextEvent(lightID)) * throughput / (1.0 + pdf * pdf * ray.payload.otherTechniqueHeuristic); + ray.payload.accumulation += nee.deferredEvalAndPdf(pdf, scene.lights[lightID], ray, scene.toNextEvent(lightID)) * throughput / (1.0 + pdf * pdf * ray.payload.otherTechniqueHeuristic); } const uint32_t bsdfID = glsl::bitfieldExtract(bsdfLightIDs, 0, 16); if (bsdfID == bxdfnode_type::INVALID_ID) return false; - BxDFNode bxdf = scene.bxdfs[bsdfID]; + bxdfnode_type bxdf = scene.bxdfs[bsdfID]; // TODO: ifdef kill diffuse specular paths @@ -171,7 +173,7 @@ struct Unidirectional scalar_type t; sample_type nee_sample = nee.generate_and_quotient_and_pdf( neeContrib_pdf, t, - lights[lightID], intersection, interaction, + 
scene.lights[lightID], intersection, interaction, isBSDF, eps0, depth, scene.toNextEvent(lightID) ); @@ -206,7 +208,7 @@ struct Unidirectional params = params_type::template create(nee_sample, interaction, _cache, bxdf::BCM_MAX); else { - isocache = (iso_cache)_cache; + isocache_type isocache = (isocache_type)_cache; params = params_type::template create(nee_sample, iso_interaction, isocache, bxdf::BCM_MAX); } } @@ -220,7 +222,7 @@ struct Unidirectional params = params_type::template create(nee_sample, interaction, _cache, bxdf::BCM_ABS); else { - isocache = (iso_cache)_cache; + isocache_type isocache = (isocache_type)_cache; params = params_type::template create(nee_sample, iso_interaction, isocache, bxdf::BCM_ABS); } } @@ -237,7 +239,7 @@ struct Unidirectional nee_ray.origin = intersection + nee_sample.L.direction * t * Tolerance::getStart(depth); nee_ray.direction = nee_sample.L.direction; nee_ray.intersectionT = t; - if (bsdf_quotient_pdf.pdf < numeric_limits::max && getLuma(neeContrib_pdf.quotient) > lumaContributionThreshold && intersector::traceRay(nee_ray, scene).id == -1) + if (bsdf_quotient_pdf.pdf < numeric_limits::max && getLuma(neeContrib_pdf.quotient) > lumaContributionThreshold && intersector_type::traceRay(nee_ray, scene).id == -1) ray._payload.accumulation += neeContrib_pdf.quotient; } } @@ -265,7 +267,7 @@ struct Unidirectional params = params_type::template create(bsdf_sample, interaction, _cache, bxdf::BCM_MAX); else { - isocache = (iso_cache)_cache; + isocache_type isocache = (isocache_type)_cache; params = params_type::template create(bsdf_sample, iso_interaction, isocache, bxdf::BCM_MAX); } } @@ -279,7 +281,7 @@ struct Unidirectional params = params_type::template create(bsdf_sample, interaction, _cache, bxdf::BCM_ABS); else { - isocache = (iso_cache)_cache; + isocache_type isocache = (isocache_type)_cache; params = params_type::template create(bsdf_sample, iso_interaction, isocache, bxdf::BCM_ABS); } } @@ -298,7 +300,7 @@ struct 
Unidirectional ray.payload.otherTechniqueHeuristic *= ray.payload.otherTechniqueHeuristic; // trace new ray - ray.origin = intersection + bsdfSampleL * (1.0/*kSceneSize*/) * Tolerance::getStart(depth); + ray.origin = intersection + bxdfSample * (1.0/*kSceneSize*/) * Tolerance::getStart(depth); ray.direction = bxdfSample; // #if POLYGON_METHOD==2 // ray._immutable.normalAtOrigin = interaction.isotropic.N; @@ -339,7 +341,7 @@ struct Unidirectional for (int d = 1; d <= depth && hit && rayAlive; d += 2) { ray.intersectionT = numeric_limits::max; - ray.objectID = intersector::traceRay(ray, scene); + ray.objectID = intersector_type::traceRay(ray, scene); hit = ray.objectID.id != -1; if (hit) @@ -348,7 +350,7 @@ struct Unidirectional if (!hit) missProgram(ray); - spectral_type accumulation = ray.payload.accumulation; + measure_type accumulation = ray.payload.accumulation; scalar_type rcpSampleSize = 1.0 / (i + 1); Li += (accumulation - Li) * rcpSampleSize; @@ -365,11 +367,10 @@ struct Unidirectional randgen_type randGen; raygen_type rayGen; - intersector_type intersector; material_system_type materialSystem; nee_type nee; - Buffer samplerSequence; + Buffer sampleSequence; }; } diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl index 4143b973d..cc64de33c 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl @@ -1,6 +1,7 @@ #include "nbl/builtin/hlsl/cpp_compat.hlsl" #include "nbl/builtin/hlsl/glsl_compat/core.hlsl" #include "nbl/builtin/hlsl/random/pcg.hlsl" +#include "nbl/builtin/hlsl/random/xoroshiro.hlsl" #include "nbl/builtin/hlsl/bxdf/reflection.hlsl" #include "nbl/builtin/hlsl/bxdf/transmission.hlsl" @@ -40,7 +41,7 @@ struct SPushConstants [[vk::combinedImageSampler]][[vk::binding(2, 2)]] Texture2D scramblebuf; // unused [[vk::combinedImageSampler]][[vk::binding(2, 2)]] SamplerState scrambleSampler; -[[vk::binding(0, 0)]] 
RWTexture2D outImage; +[[vk::image_format("rgba16f")]][[vk::binding(0, 0)]] RWTexture2D outImage; int32_t2 getCoordinates() { @@ -64,142 +65,142 @@ using aniso_cache = bxdf::SAnisotropicMicrofacetCache; using quotient_pdf_t = bxdf::quotient_and_pdf; using spectral_t = vector; using params_t = bxdf::SBxDFParams; -using create_params_t = SBxDFCreationParams; - -using diffuse_bxdf_type = bxdf::reflection::SOrenNayarBxDF; -using conductor_bxdf_type = bxdf::reflection::SGGXBxDF; -using dielectric_bxdf_type = bxdf::transmission::SGGXDielectricBxDF; - -using ray_type = ext::Ray; -using light_type = ext::Light; -using bxdfnode_type = ext::BxDFNode; -using randgen_type = ext::RandGen::Uniform3D; -using raygen_type = ext::RayGen::Basic; -using intersector_type = ext::Intersector::Comprehensive; -using material_system_type = ext::MaterialSystem::System; -using nee_type = ext::NextEventEstimator::Estimator; -using pathtracer_type = ext::PathTracer::Unidirectional; - -static const Shape spheres[SPHERE_COUNT] = { - Shape::create(float3(0.0, -100.5, -1.0), 100.0, 0u, light_type::INVALID_ID), - Shape::create(float3(2.0, 0.0, -1.0), 0.5, 1u, light_type::INVALID_ID), - Shape::create(float3(0.0, 0.0, -1.0), 0.5, 2u, light_type::INVALID_ID), - Shape::create(float3(-2.0, 0.0, -1.0), 0.5, 3u, light_type::INVALID_ID), - Shape::create(float3(2.0, 0.0, 1.0), 0.5, 4u, light_type::INVALID_ID), - Shape::create(float3(0.0, 0.0, 1.0), 0.5, 4u, light_type::INVALID_ID), - Shape::create(float3(-2.0, 0.0, 1.0), 0.5, 5u, light_type::INVALID_ID), - Shape::create(float3(0.5, 1.0, 0.5), 0.5, 6u, light_type::INVALID_ID) -#ifdef SPHERE_LIGHT - ,Shape::create(float3(-1.5, 1.5, 0.0), 0.3, bxdfnode_type::INVALID_ID, 0u) -#endif -}; - -#ifdef TRIANGLE_LIGHT -#define LIGHT_TYPE PST_TRIANGLE -#define TRIANGLE_COUNT 1 -static const Shape triangles[TRIANGLE_COUNT] = { - Shape::create(float3(-1.8,0.35,0.3) * 10.0, float3(-1.2,0.35,0.0) * 10.0, float3(-1.5,0.8,-0.3) * 10.0, bxdfnode_type::INVALID_ID, 0u) -}; 
-#endif - -#ifdef RECTANGLE_LIGHT -#define LIGHT_TYPE PST_RECTANGLE -#define RECTANGLE_COUNT 1 -static const Shape rectangles[RECTANGLE_COUNT] = { - Shape::create(float3(-3.8,0.35,1.3), normalize(float3(2,0,-1))*7.0, normalize(float3(2,-5,4))*0.1, bxdfnode_type::INVALID_ID, 0u) -}; -#endif - -#define LIGHT_COUNT 1 -static const light_type lights[LIGHT_COUNT] = { - light_type(spectral_t(30.0,25.0,15.0), ext::ObjectID(8u, ext::NextEventEstimator::Event::Mode::PROCEDURAL, LIGHT_TYPE)) -}; - -#define BXDF_COUNT 7 -static const bxdfnode_type bxdfs[BXDF_COUNT] = { - bxdfnode_type(ext::MaterialSystem::Material::Type::DIFFUSE, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(1.25,1.25,1.25))), - bxdfnode_type(ext::MaterialSystem::Material::Type::DIFFUSE, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(1.25,2.5,2.5))), - bxdfnode_type(ext::MaterialSystem::Material::Type::DIFFUSE, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(2.5,1.25,2.5))), - bxdfnode_type(ext::MaterialSystem::Material::Type::CONDUCTOR, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(0.98,0.98,0.77))), - bxdfnode_type(ext::MaterialSystem::Material::Type::CONDUCTOR, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(0.98,0.77,0.98))), - bxdfnode_type(ext::MaterialSystem::Material::Type::CONDUCTOR, create_params_t(false, float2(0.15,0.15), spectral_t(1,1,1), spectral_t(0.98,0.77,0.98))), - bxdfnode_type(ext::MaterialSystem::Material::Type::DIELECTRIC, create_params_t(false, float2(0.0625,0.0625), spectral_t(1,1,1), spectral_t(0.71,0.69,0.67))) -}; - -[numthreads(WorkgroupGridDim, WorkgroupGridDim, 1)] +using create_params_t = bxdf::SBxDFCreationParams; + +// using diffuse_bxdf_type = bxdf::reflection::SOrenNayarBxDF; +// using conductor_bxdf_type = bxdf::reflection::SGGXBxDF; +// using dielectric_bxdf_type = bxdf::transmission::SGGXDielectricBxDF; + +// using ray_type = ext::Ray; +// using light_type = ext::Light; +// 
using bxdfnode_type = ext::BxDFNode; +// using randgen_type = ext::RandGen::Uniform3D; +// using raygen_type = ext::RayGen::Basic; +// using intersector_type = ext::Intersector::Comprehensive; +// using material_system_type = ext::MaterialSystem::System; +// using nee_type = ext::NextEventEstimator::Estimator; +// using pathtracer_type = ext::PathTracer::Unidirectional; + +// static const Shape spheres[SPHERE_COUNT] = { +// Shape::create(float3(0.0, -100.5, -1.0), 100.0, 0u, light_type::INVALID_ID), +// Shape::create(float3(2.0, 0.0, -1.0), 0.5, 1u, light_type::INVALID_ID), +// Shape::create(float3(0.0, 0.0, -1.0), 0.5, 2u, light_type::INVALID_ID), +// Shape::create(float3(-2.0, 0.0, -1.0), 0.5, 3u, light_type::INVALID_ID), +// Shape::create(float3(2.0, 0.0, 1.0), 0.5, 4u, light_type::INVALID_ID), +// Shape::create(float3(0.0, 0.0, 1.0), 0.5, 4u, light_type::INVALID_ID), +// Shape::create(float3(-2.0, 0.0, 1.0), 0.5, 5u, light_type::INVALID_ID), +// Shape::create(float3(0.5, 1.0, 0.5), 0.5, 6u, light_type::INVALID_ID) +// #ifdef SPHERE_LIGHT +// ,Shape::create(float3(-1.5, 1.5, 0.0), 0.3, bxdfnode_type::INVALID_ID, 0u) +// #endif +// }; + +// #ifdef TRIANGLE_LIGHT +// #define LIGHT_TYPE PST_TRIANGLE +// #define TRIANGLE_COUNT 1 +// static const Shape triangles[TRIANGLE_COUNT] = { +// Shape::create(float3(-1.8,0.35,0.3) * 10.0, float3(-1.2,0.35,0.0) * 10.0, float3(-1.5,0.8,-0.3) * 10.0, bxdfnode_type::INVALID_ID, 0u) +// }; +// #endif + +// #ifdef RECTANGLE_LIGHT +// #define LIGHT_TYPE PST_RECTANGLE +// #define RECTANGLE_COUNT 1 +// static const Shape rectangles[RECTANGLE_COUNT] = { +// Shape::create(float3(-3.8,0.35,1.3), normalize(float3(2,0,-1))*7.0, normalize(float3(2,-5,4))*0.1, bxdfnode_type::INVALID_ID, 0u) +// }; +// #endif + +// #define LIGHT_COUNT 1 +// static const light_type lights[LIGHT_COUNT] = { +// light_type(spectral_t(30.0,25.0,15.0), ext::ObjectID(8u, ext::NextEventEstimator::Event::Mode::PROCEDURAL, LIGHT_TYPE)) +// }; + +// #define BXDF_COUNT 7 
+// static const bxdfnode_type bxdfs[BXDF_COUNT] = { +// bxdfnode_type(ext::MaterialSystem::Material::Type::DIFFUSE, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(1.25,1.25,1.25))), +// bxdfnode_type(ext::MaterialSystem::Material::Type::DIFFUSE, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(1.25,2.5,2.5))), +// bxdfnode_type(ext::MaterialSystem::Material::Type::DIFFUSE, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(2.5,1.25,2.5))), +// bxdfnode_type(ext::MaterialSystem::Material::Type::CONDUCTOR, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(0.98,0.98,0.77))), +// bxdfnode_type(ext::MaterialSystem::Material::Type::CONDUCTOR, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(0.98,0.77,0.98))), +// bxdfnode_type(ext::MaterialSystem::Material::Type::CONDUCTOR, create_params_t(false, float2(0.15,0.15), spectral_t(1,1,1), spectral_t(0.98,0.77,0.98))), +// bxdfnode_type(ext::MaterialSystem::Material::Type::DIELECTRIC, create_params_t(false, float2(0.0625,0.0625), spectral_t(1,1,1), spectral_t(0.71,0.69,0.67))) +// }; + +[numthreads(WorkgroupSize, WorkgroupSize, 1)] void main(uint32_t3 threadID : SV_DispatchThreadID) { - uint32_t width, height; - outImage.GetDimensions(width, height); - const int32_t2 coords = getCoordinates(); - float32_t2 texCoord = float32_t2(coords) / float32_t2(width, height); - texCoord.y = 1.0 - texCoord.y; - - if (false == (all((int32_t2)0 < coords)) && all(int32_t2(width, height) < coords)) { - return; - } - - if (((pc.depth - 1) >> MAX_DEPTH_LOG2) > 0 || ((pc.sampleCount - 1) >> MAX_SAMPLES_LOG2) > 0) - { - float32_t4 pixelCol = float32_t4(1.0,0.0,0.0,1.0); - outImage[coords] = pixelCol; - return; - } - - int flatIdx = glsl::gl_GlobalInvocationID().y * glsl::gl_NumWorkGroups().x * WorkgroupSize + glsl::gl_GlobalInvocationID().x; - PCG32x2 pcg = PCG32x2::construct(flatIdx); // replaces scramblebuf? 
- - // set up path tracer - const PathTracerCreationParams ptCreateParams; - ptCreateParams.rngState = pcg(); - - uint2 scrambleDim; - scramblebuf.GetDimensions(scrambleDim.x, scrambleDim.y); - ptCreateParams.pixOffsetParam = (float2)1.0 / float2(scrambleDim); - - float4 NDC = float4(texCoord * float2(2.0, -2.0) + float2(-1.0, 1.0), 0.0, 1.0); - { - vec4 tmp = mul(pc.invMVP, NDC); - ptCreateParams.camPos = tmp.xyz / tmp.w; - NDC.z = 1.0; - } +// uint32_t width, height; +// outImage.GetDimensions(width, height); +// const int32_t2 coords = getCoordinates(); +// float32_t2 texCoord = float32_t2(coords) / float32_t2(width, height); +// texCoord.y = 1.0 - texCoord.y; + +// if (false == (all((int32_t2)0 < coords)) && all(int32_t2(width, height) < coords)) { +// return; +// } + +// if (((pc.depth - 1) >> MAX_DEPTH_LOG2) > 0 || ((pc.sampleCount - 1) >> MAX_SAMPLES_LOG2) > 0) +// { +// float32_t4 pixelCol = float32_t4(1.0,0.0,0.0,1.0); +// outImage[coords] = pixelCol; +// return; +// } + +// int flatIdx = glsl::gl_GlobalInvocationID().y * glsl::gl_NumWorkGroups().x * WorkgroupSize + glsl::gl_GlobalInvocationID().x; +// PCG32x2 pcg = PCG32x2::construct(flatIdx); // replaces scramblebuf? 
+ +// // set up path tracer +// const PathTracerCreationParams ptCreateParams; +// ptCreateParams.rngState = pcg(); + +// uint2 scrambleDim; +// scramblebuf.GetDimensions(scrambleDim.x, scrambleDim.y); +// ptCreateParams.pixOffsetParam = (float2)1.0 / float2(scrambleDim); + +// float4 NDC = float4(texCoord * float2(2.0, -2.0) + float2(-1.0, 1.0), 0.0, 1.0); +// { +// vec4 tmp = mul(pc.invMVP, NDC); +// ptCreateParams.camPos = tmp.xyz / tmp.w; +// NDC.z = 1.0; +// } - ptCreateParams.NDC = NDC; - ptCreateParams.invMVP = pc.invMVP; - - ptCreateParams.diffuseParams = bxdfs[0].params; - ptCreateParams.conductorParams = bxdfs[3].params; - ptCreateParams.dielectricParams = bxdfs[6].params; - - pathtracer_type pathtracer = pathtracer_type::create(ptCreateParams, samplerSequence); - - // set up scene (can do as global var?) - Scene scene; - scene.sphereCount = SPHERE_COUNT; - for (uint32_t i = 0; i < SPHERE_COUNT; i++) - scene.spheres[i] = spheres[i]; -#ifdef TRIANGLE_LIGHT - scene.triangleCount = TRIANGLE_COUNT; - for (uint32_t i = 0; i < TRIANGLE_COUNT; i++) - scene.triangles[i] = triangles[i]; -#else - scene.triangleCount = 0; -#endif -#ifdef RECTANGLE_LIGHT - scene.rectangleCount = RECTANGLE_COUNT; - for (uint32_t i = 0; i < RECTANGLE_COUNT; i++) - scene.rectangles[i] = rectangles[i]; -#else - scene.rectangleCount = 0; -#endif - scene.lightCount = LIGHT_COUNT; - for (uint32_t i = 0; i < LIGHT_COUNT; i++) - scene.lights[i] = lights[i]; - scene.bxdfCount = BXDF_COUNT; - for (uint32_t i = 0; i < BXDF_COUNT; i++) - scene.bxdfs[i] = bxdfs[i]; - - float32_t3 color = pathtracer.getMeasure(pc.sampleCount, pc.depth, scene); - float32_t4 pixCol = float32_t4(color, 1.0); - outImage[coords] = pixCol; +// ptCreateParams.NDC = NDC; +// ptCreateParams.invMVP = pc.invMVP; + +// ptCreateParams.diffuseParams = bxdfs[0].params; +// ptCreateParams.conductorParams = bxdfs[3].params; +// ptCreateParams.dielectricParams = bxdfs[6].params; + +// pathtracer_type pathtracer = 
pathtracer_type::create(ptCreateParams, sampleSequence); + +// // set up scene (can do as global var?) +// Scene scene; +// scene.sphereCount = SPHERE_COUNT; +// for (uint32_t i = 0; i < SPHERE_COUNT; i++) +// scene.spheres[i] = spheres[i]; +// #ifdef TRIANGLE_LIGHT +// scene.triangleCount = TRIANGLE_COUNT; +// for (uint32_t i = 0; i < TRIANGLE_COUNT; i++) +// scene.triangles[i] = triangles[i]; +// #else +// scene.triangleCount = 0; +// #endif +// #ifdef RECTANGLE_LIGHT +// scene.rectangleCount = RECTANGLE_COUNT; +// for (uint32_t i = 0; i < RECTANGLE_COUNT; i++) +// scene.rectangles[i] = rectangles[i]; +// #else +// scene.rectangleCount = 0; +// #endif +// scene.lightCount = LIGHT_COUNT; +// for (uint32_t i = 0; i < LIGHT_COUNT; i++) +// scene.lights[i] = lights[i]; +// scene.bxdfCount = BXDF_COUNT; +// for (uint32_t i = 0; i < BXDF_COUNT; i++) +// scene.bxdfs[i] = bxdfs[i]; + +// float32_t3 color = pathtracer.getMeasure(pc.sampleCount, pc.depth, scene); +// float32_t4 pixCol = float32_t4(color, 1.0); +// outImage[coords] = pixCol; } diff --git a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl index 88940c54d..ed0c612f1 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl @@ -121,10 +121,10 @@ struct Scene retval.mode = objectID.mode; retval.data[0] = lightCount; - retval.data[1] = objectID.type; + retval.data[1] = objectID.shapeType; uint32_t id = objectID.id; - switch (type) + switch (objectID.shapeType) { case PST_SPHERE: { @@ -132,7 +132,7 @@ struct Scene retval.data[2] = asuint(sphere.position.x); retval.data[3] = asuint(sphere.position.y); retval.data[4] = asuint(sphere.position.z); - retval.data[5] = asuint(sphere.radius); + retval.data[5] = asuint(sphere.radius2); retval.data[6] = sphere.bsdfLightIDs; } break; @@ -176,16 +176,16 @@ struct Scene // TODO: get these to work with AS types as well uint32_t getBsdfLightIDs(NBL_CONST_REF_ARG(ObjectID) 
objectID) { - return (objectID.type == PST_SPHERE) ? spheres[objectID.id].bsdfLightIDs : - (objectID.type == PST_TRIANGLE) ? triangles[objectID.id].bsdfLightIDs : - (objectID.type == PST_RECTANGLE) ? rectangles[objectID.id].bsdfLightIDs : -1; + return (objectID.shapeType == PST_SPHERE) ? spheres[objectID.id].bsdfLightIDs : + (objectID.shapeType == PST_TRIANGLE) ? triangles[objectID.id].bsdfLightIDs : + (objectID.shapeType == PST_RECTANGLE) ? rectangles[objectID.id].bsdfLightIDs : -1; } float32_t3 getNormal(NBL_CONST_REF_ARG(ObjectID) objectID, NBL_CONST_REF_ARG(float32_t3) intersection) { - return (objectID.type == PST_SPHERE) ? scene.spheres[objectID.id].getNormal(intersection) : - (objectID.type == PST_TRIANGLE) ? scene.triangles[objectID.id].getNormalTimesArea() : - (objectID.type == PST_RECTANGLE) ? scene.rectangles[objectID.id].getNormalTimesArea() : + return (objectID.shapeType == PST_SPHERE) ? spheres[objectID.id].getNormal(intersection) : + (objectID.shapeType == PST_TRIANGLE) ? triangles[objectID.id].getNormalTimesArea() : + (objectID.shapeType == PST_RECTANGLE) ? 
rectangles[objectID.id].getNormalTimesArea() : (float32_t3)0.0; } }; From da661c08d50eb60b8e95fe4a0028aac653a10c4b Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 25 Feb 2025 12:12:38 +0700 Subject: [PATCH 058/296] fix compile hlsl shader bug --- 31_HLSLPathTracer/main.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp index 5aff6bde7..4a2c1110b 100644 --- a/31_HLSLPathTracer/main.cpp +++ b/31_HLSLPathTracer/main.cpp @@ -331,7 +331,7 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication auto shader = m_device->createShader(source.get()); if (!shader) { - m_logger->log("Shader creationed failed: %s!", ILogger::ELL_ERROR, pathToShader); + m_logger->log("GLSL shader creationed failed: %s!", ILogger::ELL_ERROR, pathToShader); std::exit(-1); } @@ -373,6 +373,15 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication options.dxcOptions = std::span(dxcOptionStr); source = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options); + + auto shader = m_device->createShader(source.get()); + if (!shader) + { + m_logger->log("HLSL shader creationed failed: %s!", ILogger::ELL_ERROR, pathToShader); + std::exit(-1); + } + + return shader; }; // Create compute pipelines From f97757bffcc28ad208a10dfb485214b8d9e1fdd1 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 25 Feb 2025 16:55:48 +0700 Subject: [PATCH 059/296] more bug fixes #5 --- .../app_resources/hlsl/common.hlsl | 68 +++++ .../app_resources/hlsl/intersector.hlsl | 42 +-- .../app_resources/hlsl/material_system.hlsl | 6 +- .../hlsl/next_event_estimator.hlsl | 18 -- .../app_resources/hlsl/pathtracer.hlsl | 6 +- .../app_resources/hlsl/ray_gen.hlsl | 2 +- .../app_resources/hlsl/render.comp.hlsl | 266 +++++++++--------- .../app_resources/hlsl/scene.hlsl | 3 - 31_HLSLPathTracer/main.cpp | 6 +- 9 files changed, 222 insertions(+), 195 deletions(-) diff 
--git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl index cd2310fbf..a264fabd5 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl @@ -42,6 +42,15 @@ enum ProceduralShapeType : uint16_t struct ObjectID { + static ObjectID create(uint32_t id, uint32_t mode, ProceduralShapeType shapeType) + { + ObjectID retval; + retval.id = id; + retval.mode = mode; + retval.shapeType = shapeType; + return retval; + } + uint32_t id; uint32_t mode; ProceduralShapeType shapeType; @@ -85,6 +94,17 @@ struct BxDFNode NBL_CONSTEXPR_STATIC_INLINE uint32_t INVALID_ID = 0xffffu; + static BxDFNode create(uint32_t materialType, bool isAniso, NBL_CONST_REF_ARG(float32_t2) A, NBL_CONST_REF_ARG(spectral_type) ior0, NBL_CONST_REF_ARG(spectral_type) ior1) + { + BxDFNode retval; + retval.materialType = materialType; + retval.params.is_aniso = isAniso; + retval.params.A = A; + retval.params.ior0 = ior0; + retval.params.ior1 = ior1; + return retval; + } + uint32_t materialType; params_type params; }; @@ -118,6 +138,54 @@ enum PTPolygonMethod : uint16_t PPM_APPROX_PROJECTED_SOLID_ANGLE }; +namespace Intersector +{ +// ray query method +// ray query struct holds AS info +// pass in address to vertex/index buffers? + +// ray tracing pipeline method + +// procedural data store: [obj count] [intersect type] [obj1] [obj2] [...] + +struct IntersectData +{ + enum Mode : uint32_t // enum class? + { + RAY_QUERY, + RAY_TRACING, + PROCEDURAL + }; + + NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 128; + + uint32_t mode : 1; + uint32_t unused : 31; // possible space for flags + uint32_t data[DataSize]; +}; +} + +namespace NextEventEstimator +{ +// procedural data store: [light count] [event type] [obj] + +struct Event +{ + enum Mode : uint32_t // enum class? 
+ { + RAY_QUERY, + RAY_TRACING, + PROCEDURAL + }; + + NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 16; + + uint32_t mode : 1; + uint32_t unused : 31; // possible space for flags + uint32_t data[DataSize]; +}; +} + template struct Shape; diff --git a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl index 0bb6cb31c..880ae1169 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl @@ -2,6 +2,7 @@ #define _NBL_HLSL_EXT_INTERSECTOR_INCLUDED_ #include "common.hlsl" +#include "scene.hlsl" #include namespace nbl @@ -13,38 +14,18 @@ namespace ext namespace Intersector { -// ray query method -// ray query struct holds AS info -// pass in address to vertex/index buffers? - -// ray tracing pipeline method - -// procedural data store: [obj count] [intersect type] [obj1] [obj2] [...] - -struct IntersectData -{ - enum Mode : uint32_t // enum class? - { - RAY_QUERY, - RAY_TRACING, - PROCEDURAL - }; - - NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 128; - - uint32_t mode : 1; - uint32_t unused : 31; // possible space for flags - uint32_t data[DataSize]; -}; - -template +template struct Comprehensive { using scalar_type = typename Ray::scalar_type; using vector3_type = vector; using ray_type = Ray; - static ObjectID traceProcedural(NBL_REF_ARG(ray_type) ray, NBL_REF_ARG(IntersectData) intersect) + using light_type = Light; + using bxdfnode_type = BxdfNode; + using scene_type = Scene; + + static ObjectID traceProcedural(NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(IntersectData) intersect) { const bool anyHit = ray.intersectionT != numeric_limits::max; const uint32_t objCount = intersect.data[0]; @@ -100,7 +81,7 @@ struct Comprehensive return objectID; } - static ObjectID traceRay(NBL_REF_ARG(ray_type) ray, NBL_REF_ARG(IntersectData) intersect) + static ObjectID traceRay(NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(IntersectData) intersect) { const 
IntersectData::Mode mode = (IntersectData::Mode)intersect.mode; switch (mode) @@ -122,15 +103,12 @@ struct Comprehensive break; default: { - ObjectID objID; - objID.id = -1; - return objID; + return ObjectID::create(-1, 0, PST_SPHERE); } } } - template - static ObjectID traceRay(NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(Scene) scene) + static ObjectID traceRay(NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(scene_type) scene) { IntersectData data; diff --git a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl index 9d638c232..16f8dcabf 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl @@ -51,9 +51,9 @@ struct System static this_t create(NBL_CONST_REF_ARG(create_params_t) diffuseParams, NBL_CONST_REF_ARG(create_params_t) conductorParams, NBL_CONST_REF_ARG(create_params_t) dielectricParams) { this_t retval; - retval.diffuseBxDF = DiffuseBxDF::create(diffuseParams); - retval.conductorBxDF = DiffuseBxDF::create(conductorParams); - retval.dielectricBxDF = DiffuseBxDF::create(dielectricParams); + retval.diffuseBxDF = diffuse_op_type::create(diffuseParams); + retval.conductorBxDF = conductor_op_type::create(conductorParams); + retval.dielectricBxDF = dielectric_op_type::create(dielectricParams); return retval; } diff --git a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl index 32a7b7476..f0eeb0885 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl @@ -12,24 +12,6 @@ namespace ext namespace NextEventEstimator { -// procedural data store: [light count] [event type] [obj] - -struct Event -{ - enum Mode : uint32_t // enum class? 
- { - RAY_QUERY, - RAY_TRACING, - PROCEDURAL - }; - - NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 16; - - uint32_t mode : 1; - uint32_t unused : 31; // possible space for flags - uint32_t data[DataSize]; -}; - template struct Estimator { diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl index f1237006c..460744940 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl @@ -91,7 +91,7 @@ struct Unidirectional vector3_type rand3d(uint32_t protoDimension, uint32_t _sample, uint32_t i) { uint32_t address = glsl::bitfieldInsert(protoDimension, _sample, MAX_DEPTH_LOG2, MAX_SAMPLES_LOG2); - uint32_t3 seqVal = texelFetch(sampleSequence, int(address) + i).xyz; + uint32_t3 seqVal = sampleSequence[address + i].xyz; seqVal ^= randGen(); return vector3_type(seqVal) * asfloat(0x2f800004u); } @@ -120,7 +120,7 @@ struct Unidirectional case ext::Intersector::IntersectData::Mode::PROCEDURAL: { bsdfLightIDs = scene.getBsdfLightIDs(objectID); - vector3_type N = scene.getNormal(objectID); + vector3_type N = scene.getNormal(objectID, intersection); N = nbl::hlsl::normalize(N); typename isotropic_type::ray_dir_info_type V; V.direction = nbl::hlsl::normalize(-ray.direction); @@ -332,7 +332,7 @@ struct Unidirectional scalar_type meanLumaSq = 0.0; for (uint32_t i = 0; i < numSamples; i++) { - vector3_type uvw = rand3d(0u, i); + vector3_type uvw = rand3d(0u, i, randGen.rng()); // TODO: take from scramblebuf? 
ray_type ray = rayGen.generate(uvw); // bounces diff --git a/31_HLSLPathTracer/app_resources/hlsl/ray_gen.hlsl b/31_HLSLPathTracer/app_resources/hlsl/ray_gen.hlsl index dcb695fbe..0759b1cd3 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/ray_gen.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/ray_gen.hlsl @@ -50,7 +50,7 @@ struct Basic remappedRand.x += truncation; tmp.xy += pixOffsetParam * nbl::hlsl::boxMullerTransform(remappedRand, 1.5); // for depth of field we could do another stochastic point-pick - tmp = invMVP * tmp; + tmp = nbl::hlsl::mul(invMVP, tmp); ray.direction = nbl::hlsl::normalize(tmp.xyz / tmp.w - camPos); // #if POLYGON_METHOD==2 diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl index cc64de33c..5be6adf78 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl @@ -15,7 +15,7 @@ #ifdef SPHERE_LIGHT #define SPHERE_COUNT 9 -#define LIGHT_TYPE PST_SPHERE +#define LIGHT_TYPE ext::PST_SPHERE #else #define SPHERE_COUNT 8 #endif @@ -23,6 +23,8 @@ using namespace nbl::hlsl; NBL_CONSTEXPR uint32_t WorkgroupSize = 32; +NBL_CONSTEXPR uint32_t MAX_DEPTH_LOG2 = 4; +NBL_CONSTEXPR uint32_t MAX_SAMPLES_LOG2 = 10; struct SPushConstants { @@ -67,140 +69,140 @@ using spectral_t = vector; using params_t = bxdf::SBxDFParams; using create_params_t = bxdf::SBxDFCreationParams; -// using diffuse_bxdf_type = bxdf::reflection::SOrenNayarBxDF; -// using conductor_bxdf_type = bxdf::reflection::SGGXBxDF; -// using dielectric_bxdf_type = bxdf::transmission::SGGXDielectricBxDF; - -// using ray_type = ext::Ray; -// using light_type = ext::Light; -// using bxdfnode_type = ext::BxDFNode; -// using randgen_type = ext::RandGen::Uniform3D; -// using raygen_type = ext::RayGen::Basic; -// using intersector_type = ext::Intersector::Comprehensive; -// using material_system_type = ext::MaterialSystem::System; -// using nee_type = 
ext::NextEventEstimator::Estimator; -// using pathtracer_type = ext::PathTracer::Unidirectional; - -// static const Shape spheres[SPHERE_COUNT] = { -// Shape::create(float3(0.0, -100.5, -1.0), 100.0, 0u, light_type::INVALID_ID), -// Shape::create(float3(2.0, 0.0, -1.0), 0.5, 1u, light_type::INVALID_ID), -// Shape::create(float3(0.0, 0.0, -1.0), 0.5, 2u, light_type::INVALID_ID), -// Shape::create(float3(-2.0, 0.0, -1.0), 0.5, 3u, light_type::INVALID_ID), -// Shape::create(float3(2.0, 0.0, 1.0), 0.5, 4u, light_type::INVALID_ID), -// Shape::create(float3(0.0, 0.0, 1.0), 0.5, 4u, light_type::INVALID_ID), -// Shape::create(float3(-2.0, 0.0, 1.0), 0.5, 5u, light_type::INVALID_ID), -// Shape::create(float3(0.5, 1.0, 0.5), 0.5, 6u, light_type::INVALID_ID) -// #ifdef SPHERE_LIGHT -// ,Shape::create(float3(-1.5, 1.5, 0.0), 0.3, bxdfnode_type::INVALID_ID, 0u) -// #endif -// }; - -// #ifdef TRIANGLE_LIGHT -// #define LIGHT_TYPE PST_TRIANGLE -// #define TRIANGLE_COUNT 1 -// static const Shape triangles[TRIANGLE_COUNT] = { -// Shape::create(float3(-1.8,0.35,0.3) * 10.0, float3(-1.2,0.35,0.0) * 10.0, float3(-1.5,0.8,-0.3) * 10.0, bxdfnode_type::INVALID_ID, 0u) -// }; -// #endif - -// #ifdef RECTANGLE_LIGHT -// #define LIGHT_TYPE PST_RECTANGLE -// #define RECTANGLE_COUNT 1 -// static const Shape rectangles[RECTANGLE_COUNT] = { -// Shape::create(float3(-3.8,0.35,1.3), normalize(float3(2,0,-1))*7.0, normalize(float3(2,-5,4))*0.1, bxdfnode_type::INVALID_ID, 0u) -// }; -// #endif - -// #define LIGHT_COUNT 1 -// static const light_type lights[LIGHT_COUNT] = { -// light_type(spectral_t(30.0,25.0,15.0), ext::ObjectID(8u, ext::NextEventEstimator::Event::Mode::PROCEDURAL, LIGHT_TYPE)) -// }; - -// #define BXDF_COUNT 7 -// static const bxdfnode_type bxdfs[BXDF_COUNT] = { -// bxdfnode_type(ext::MaterialSystem::Material::Type::DIFFUSE, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(1.25,1.25,1.25))), -// bxdfnode_type(ext::MaterialSystem::Material::Type::DIFFUSE, 
create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(1.25,2.5,2.5))), -// bxdfnode_type(ext::MaterialSystem::Material::Type::DIFFUSE, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(2.5,1.25,2.5))), -// bxdfnode_type(ext::MaterialSystem::Material::Type::CONDUCTOR, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(0.98,0.98,0.77))), -// bxdfnode_type(ext::MaterialSystem::Material::Type::CONDUCTOR, create_params_t(false, float2(0,0), spectral_t(1,1,1), spectral_t(0.98,0.77,0.98))), -// bxdfnode_type(ext::MaterialSystem::Material::Type::CONDUCTOR, create_params_t(false, float2(0.15,0.15), spectral_t(1,1,1), spectral_t(0.98,0.77,0.98))), -// bxdfnode_type(ext::MaterialSystem::Material::Type::DIELECTRIC, create_params_t(false, float2(0.0625,0.0625), spectral_t(1,1,1), spectral_t(0.71,0.69,0.67))) -// }; +using diffuse_bxdf_type = bxdf::reflection::SOrenNayarBxDF; +using conductor_bxdf_type = bxdf::reflection::SGGXBxDF; +using dielectric_bxdf_type = bxdf::transmission::SGGXDielectricBxDF; + +using ray_type = ext::Ray; +using light_type = ext::Light; +using bxdfnode_type = ext::BxDFNode; +using randgen_type = ext::RandGen::Uniform3D; +using raygen_type = ext::RayGen::Basic; +using intersector_type = ext::Intersector::Comprehensive; +using material_system_type = ext::MaterialSystem::System; +using nee_type = ext::NextEventEstimator::Estimator; +using pathtracer_type = ext::PathTracer::Unidirectional; + +static const ext::Shape spheres[SPHERE_COUNT] = { + ext::Shape::create(float3(0.0, -100.5, -1.0), 100.0, 0u, light_type::INVALID_ID), + ext::Shape::create(float3(2.0, 0.0, -1.0), 0.5, 1u, light_type::INVALID_ID), + ext::Shape::create(float3(0.0, 0.0, -1.0), 0.5, 2u, light_type::INVALID_ID), + ext::Shape::create(float3(-2.0, 0.0, -1.0), 0.5, 3u, light_type::INVALID_ID), + ext::Shape::create(float3(2.0, 0.0, 1.0), 0.5, 4u, light_type::INVALID_ID), + ext::Shape::create(float3(0.0, 0.0, 1.0), 0.5, 4u, light_type::INVALID_ID), + 
ext::Shape::create(float3(-2.0, 0.0, 1.0), 0.5, 5u, light_type::INVALID_ID), + ext::Shape::create(float3(0.5, 1.0, 0.5), 0.5, 6u, light_type::INVALID_ID) +#ifdef SPHERE_LIGHT + ,ext::Shape::create(float3(-1.5, 1.5, 0.0), 0.3, bxdfnode_type::INVALID_ID, 0u) +#endif +}; + +#ifdef TRIANGLE_LIGHT +#define LIGHT_TYPE ext::PST_TRIANGLE +#define TRIANGLE_COUNT 1 +static const ext::Shape triangles[TRIANGLE_COUNT] = { + ext::Shape::create(float3(-1.8,0.35,0.3) * 10.0, float3(-1.2,0.35,0.0) * 10.0, float3(-1.5,0.8,-0.3) * 10.0, bxdfnode_type::INVALID_ID, 0u) +}; +#endif + +#ifdef RECTANGLE_LIGHT +#define LIGHT_TYPE ext::PST_RECTANGLE +#define RECTANGLE_COUNT 1 +static const ext::Shape rectangles[RECTANGLE_COUNT] = { + ext::Shape::create(float3(-3.8,0.35,1.3), normalize(float3(2,0,-1))*7.0, normalize(float3(2,-5,4))*0.1, bxdfnode_type::INVALID_ID, 0u) +}; +#endif + +#define LIGHT_COUNT 1 +static const light_type lights[LIGHT_COUNT] = { + light_type(spectral_t(30.0,25.0,15.0), ext::ObjectID(8u, ext::NextEventEstimator::Event::Mode::PROCEDURAL, LIGHT_TYPE)) +}; + +#define BXDF_COUNT 7 +static const bxdfnode_type bxdfs[BXDF_COUNT] = { + bxdfnode_type::create(ext::MaterialSystem::Material::Type::DIFFUSE, false, float2(0,0), spectral_t(1,1,1), spectral_t(1.25,1.25,1.25)), + bxdfnode_type::create(ext::MaterialSystem::Material::Type::DIFFUSE, false, float2(0,0), spectral_t(1,1,1), spectral_t(1.25,2.5,2.5)), + bxdfnode_type::create(ext::MaterialSystem::Material::Type::DIFFUSE, false, float2(0,0), spectral_t(1,1,1), spectral_t(2.5,1.25,2.5)), + bxdfnode_type::create(ext::MaterialSystem::Material::Type::CONDUCTOR, false, float2(0,0), spectral_t(1,1,1), spectral_t(0.98,0.98,0.77)), + bxdfnode_type::create(ext::MaterialSystem::Material::Type::CONDUCTOR, false, float2(0,0), spectral_t(1,1,1), spectral_t(0.98,0.77,0.98)), + bxdfnode_type::create(ext::MaterialSystem::Material::Type::CONDUCTOR, false, float2(0.15,0.15), spectral_t(1,1,1), spectral_t(0.98,0.77,0.98)), + 
bxdfnode_type::create(ext::MaterialSystem::Material::Type::DIELECTRIC, false, float2(0.0625,0.0625), spectral_t(1,1,1), spectral_t(0.71,0.69,0.67)) +}; [numthreads(WorkgroupSize, WorkgroupSize, 1)] void main(uint32_t3 threadID : SV_DispatchThreadID) { -// uint32_t width, height; -// outImage.GetDimensions(width, height); -// const int32_t2 coords = getCoordinates(); -// float32_t2 texCoord = float32_t2(coords) / float32_t2(width, height); -// texCoord.y = 1.0 - texCoord.y; - -// if (false == (all((int32_t2)0 < coords)) && all(int32_t2(width, height) < coords)) { -// return; -// } - -// if (((pc.depth - 1) >> MAX_DEPTH_LOG2) > 0 || ((pc.sampleCount - 1) >> MAX_SAMPLES_LOG2) > 0) -// { -// float32_t4 pixelCol = float32_t4(1.0,0.0,0.0,1.0); -// outImage[coords] = pixelCol; -// return; -// } - -// int flatIdx = glsl::gl_GlobalInvocationID().y * glsl::gl_NumWorkGroups().x * WorkgroupSize + glsl::gl_GlobalInvocationID().x; -// PCG32x2 pcg = PCG32x2::construct(flatIdx); // replaces scramblebuf? 
- -// // set up path tracer -// const PathTracerCreationParams ptCreateParams; -// ptCreateParams.rngState = pcg(); - -// uint2 scrambleDim; -// scramblebuf.GetDimensions(scrambleDim.x, scrambleDim.y); -// ptCreateParams.pixOffsetParam = (float2)1.0 / float2(scrambleDim); - -// float4 NDC = float4(texCoord * float2(2.0, -2.0) + float2(-1.0, 1.0), 0.0, 1.0); -// { -// vec4 tmp = mul(pc.invMVP, NDC); -// ptCreateParams.camPos = tmp.xyz / tmp.w; -// NDC.z = 1.0; -// } + uint32_t width, height; + outImage.GetDimensions(width, height); + const int32_t2 coords = getCoordinates(); + float32_t2 texCoord = float32_t2(coords) / float32_t2(width, height); + texCoord.y = 1.0 - texCoord.y; + + if (false == (all((int32_t2)0 < coords)) && all(int32_t2(width, height) < coords)) { + return; + } + + if (((pc.depth - 1) >> MAX_DEPTH_LOG2) > 0 || ((pc.sampleCount - 1) >> MAX_SAMPLES_LOG2) > 0) + { + float32_t4 pixelCol = float32_t4(1.0,0.0,0.0,1.0); + outImage[coords] = pixelCol; + return; + } + + int flatIdx = glsl::gl_GlobalInvocationID().y * glsl::gl_NumWorkGroups().x * WorkgroupSize + glsl::gl_GlobalInvocationID().x; + PCG32x2 pcg = PCG32x2::construct(flatIdx); // replaces scramblebuf? 
+ + // set up path tracer + ext::PathTracer::PathTracerCreationParams ptCreateParams; + ptCreateParams.rngState = pcg(); + + uint2 scrambleDim; + scramblebuf.GetDimensions(scrambleDim.x, scrambleDim.y); + ptCreateParams.pixOffsetParam = (float2)1.0 / float2(scrambleDim); + + float4 NDC = float4(texCoord * float2(2.0, -2.0) + float2(-1.0, 1.0), 0.0, 1.0); + { + float4 tmp = mul(pc.invMVP, NDC); + ptCreateParams.camPos = tmp.xyz / tmp.w; + NDC.z = 1.0; + } -// ptCreateParams.NDC = NDC; -// ptCreateParams.invMVP = pc.invMVP; - -// ptCreateParams.diffuseParams = bxdfs[0].params; -// ptCreateParams.conductorParams = bxdfs[3].params; -// ptCreateParams.dielectricParams = bxdfs[6].params; - -// pathtracer_type pathtracer = pathtracer_type::create(ptCreateParams, sampleSequence); - -// // set up scene (can do as global var?) -// Scene scene; -// scene.sphereCount = SPHERE_COUNT; -// for (uint32_t i = 0; i < SPHERE_COUNT; i++) -// scene.spheres[i] = spheres[i]; -// #ifdef TRIANGLE_LIGHT -// scene.triangleCount = TRIANGLE_COUNT; -// for (uint32_t i = 0; i < TRIANGLE_COUNT; i++) -// scene.triangles[i] = triangles[i]; -// #else -// scene.triangleCount = 0; -// #endif -// #ifdef RECTANGLE_LIGHT -// scene.rectangleCount = RECTANGLE_COUNT; -// for (uint32_t i = 0; i < RECTANGLE_COUNT; i++) -// scene.rectangles[i] = rectangles[i]; -// #else -// scene.rectangleCount = 0; -// #endif -// scene.lightCount = LIGHT_COUNT; -// for (uint32_t i = 0; i < LIGHT_COUNT; i++) -// scene.lights[i] = lights[i]; -// scene.bxdfCount = BXDF_COUNT; -// for (uint32_t i = 0; i < BXDF_COUNT; i++) -// scene.bxdfs[i] = bxdfs[i]; - -// float32_t3 color = pathtracer.getMeasure(pc.sampleCount, pc.depth, scene); -// float32_t4 pixCol = float32_t4(color, 1.0); -// outImage[coords] = pixCol; + ptCreateParams.NDC = NDC; + ptCreateParams.invMVP = pc.invMVP; + + ptCreateParams.diffuseParams = bxdfs[0].params; + ptCreateParams.conductorParams = bxdfs[3].params; + ptCreateParams.dielectricParams = bxdfs[6].params; + 
+ pathtracer_type pathtracer = pathtracer_type::create(ptCreateParams, sampleSequence); + + // set up scene (can do as global var?) + ext::Scene scene; + scene.sphereCount = SPHERE_COUNT; + for (uint32_t i = 0; i < SPHERE_COUNT; i++) + scene.spheres[i] = spheres[i]; +#ifdef TRIANGLE_LIGHT + scene.triangleCount = TRIANGLE_COUNT; + for (uint32_t i = 0; i < TRIANGLE_COUNT; i++) + scene.triangles[i] = triangles[i]; +#else + scene.triangleCount = 0; +#endif +#ifdef RECTANGLE_LIGHT + scene.rectangleCount = RECTANGLE_COUNT; + for (uint32_t i = 0; i < RECTANGLE_COUNT; i++) + scene.rectangles[i] = rectangles[i]; +#else + scene.rectangleCount = 0; +#endif + scene.lightCount = LIGHT_COUNT; + for (uint32_t i = 0; i < LIGHT_COUNT; i++) + scene.lights[i] = lights[i]; + scene.bxdfCount = BXDF_COUNT; + for (uint32_t i = 0; i < BXDF_COUNT; i++) + scene.bxdfs[i] = bxdfs[i]; + + float32_t3 color = pathtracer.getMeasure(pc.sampleCount, pc.depth, scene); + float32_t4 pixCol = float32_t4(color, 1.0); + outImage[coords] = pixCol; } diff --git a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl index ed0c612f1..48be039a7 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl @@ -2,9 +2,6 @@ #define _NBL_HLSL_EXT_PATHTRACING_SCENE_INCLUDED_ #include "common.hlsl" -#include "material_system.hlsl" -#include "next_event_estimator.hlsl" -#include "intersector.hlsl" namespace nbl { diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp index 4a2c1110b..4bb260b09 100644 --- a/31_HLSLPathTracer/main.cpp +++ b/31_HLSLPathTracer/main.cpp @@ -368,9 +368,9 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication options.preprocessorOptions.sourceIdentifier = source->getFilepathHint(); options.preprocessorOptions.logger = m_logger.get(); options.preprocessorOptions.includeFinder = compiler->getDefaultIncludeFinder(); - - std::string dxcOptionStr[] = { 
"-D" + defineMacro }; - options.dxcOptions = std::span(dxcOptionStr); + + const IShaderCompiler::SMacroDefinition variantDefine = { defineMacro, "" }; + options.preprocessorOptions.extraDefines = { &variantDefine, &variantDefine + 1 }; source = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options); From 890c99297f59f19bfef5d7a9b64c68de4b1488f6 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 25 Feb 2025 18:34:32 +0700 Subject: [PATCH 060/296] Update demo to use SShaderGroupHandle type --- 71_RayTracingPipeline/main.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index d457e37dc..0c5473b73 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -356,7 +356,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, shaderGroups.raygenGroup = { .shaderIndex = RTDS_RAYGEN }; - SGeneralShaderGroup missGroups[EMT_COUNT]; + IRayTracingPipelineBase::SGeneralShaderGroup missGroups[EMT_COUNT]; missGroups[EMT_PRIMARY] = { .shaderIndex = RTDS_MISS }; missGroups[EMT_OCCLUSION] = { .shaderIndex = RTDS_SHADOW_MISS }; shaderGroups.missGroups = missGroups; @@ -365,7 +365,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, { return geomType * ERT_COUNT + rayType; }; - SHitShaderGroup hitGroups[E_RAY_TYPE::ERT_COUNT * E_GEOM_TYPE::EGT_COUNT]; + IRayTracingPipelineBase::SHitShaderGroup hitGroups[E_RAY_TYPE::ERT_COUNT * E_GEOM_TYPE::EGT_COUNT]; hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_PRIMARY)] = { .closestHitShaderIndex = RTDS_CLOSEST_HIT, .anyHitShaderIndex = RTDS_ANYHIT_PRIMARY, @@ -384,7 +384,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, }; shaderGroups.hitGroups = hitGroups; - SGeneralShaderGroup callableGroups[ELT_COUNT]; + IRayTracingPipelineBase::SGeneralShaderGroup callableGroups[ELT_COUNT]; 
callableGroups[ELT_DIRECTIONAL] = { .shaderIndex = RTDS_DIRECTIONAL_CALL }; callableGroups[ELT_POINT] = { .shaderIndex = RTDS_POINT_CALL }; callableGroups[ELT_SPOT] = { .shaderIndex = RTDS_SPOT_CALL }; @@ -1354,13 +1354,13 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, uint8_t* pData = reinterpret_cast(cpuBuffer->getPointer()); // copy raygen region - memcpy(pData, pipeline->getRaygenGroupShaderHandle().data(), handleSize); + memcpy(pData, &pipeline->getRaygen(), handleSize); // copy miss region uint8_t* pMissData = pData + missRange.offset; for (int32_t missIx = 0; missIx < pipeline->getMissGroupCount(); missIx++) { - memcpy(pMissData, pipeline->getMissGroupShaderHandle(missIx).data(), handleSize); + memcpy(pMissData, &pipeline->getMiss(missIx), handleSize); pMissData += m_shaderBindingTable.missGroupsStride; } @@ -1368,7 +1368,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, uint8_t* pHitData = pData + hitRange.offset; for (int32_t hitIx = 0; hitIx < pipeline->getHitGroupCount(); hitIx++) { - memcpy(pHitData, pipeline->getHitGroupShaderHandle(hitIx).data(), handleSize); + memcpy(pHitData, &pipeline->getHit(hitIx), handleSize); pHitData += m_shaderBindingTable.hitGroupsStride; } @@ -1376,7 +1376,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, uint8_t* pCallableData = pData + callableRange.offset; for (int32_t callableIx = 0; callableIx < pipeline->getCallableGroupCount(); callableIx++) { - memcpy(pCallableData, pipeline->getCallableGroupShaderHandle(callableIx).data(), handleSize); + memcpy(pCallableData, &pipeline->getCallable(callableIx), handleSize); pCallableData += m_shaderBindingTable.callableGroupsStride; } From 19ad8b03480ffbac7d4d28c4f9f7f73a06d3a841 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 25 Feb 2025 23:23:26 +0700 Subject: [PATCH 061/296] Fix spot light --- 71_RayTracingPipeline/app_resources/light_spot.rcall.hlsl | 2 +- 1 file changed, 
1 insertion(+), 1 deletion(-) diff --git a/71_RayTracingPipeline/app_resources/light_spot.rcall.hlsl b/71_RayTracingPipeline/app_resources/light_spot.rcall.hlsl index f1357d30b..fcb130104 100644 --- a/71_RayTracingPipeline/app_resources/light_spot.rcall.hlsl +++ b/71_RayTracingPipeline/app_resources/light_spot.rcall.hlsl @@ -10,7 +10,7 @@ void main(inout RayLight cLight) cLight.outIntensity = LightIntensity / (cLight.outLightDistance * cLight.outLightDistance); cLight.outLightDir = normalize(lDir); float theta = dot(cLight.outLightDir, normalize(-pc.light.direction)); - float epsilon = - pc.light.outerCutoff; + float epsilon = 1 - pc.light.outerCutoff; float spotIntensity = clamp((theta - pc.light.outerCutoff) / epsilon, 0.0, 1.0); cLight.outIntensity *= spotIntensity; } From 8e759f24d5b386291660f50af1c04efbff3eff08 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 26 Feb 2025 16:53:31 +0700 Subject: [PATCH 062/296] more bug fixes #6 --- .../app_resources/hlsl/common.hlsl | 28 +++++++++++++------ .../app_resources/hlsl/intersector.hlsl | 11 ++++---- .../app_resources/hlsl/material_system.hlsl | 5 ++-- .../hlsl/next_event_estimator.hlsl | 15 ++++++++-- .../app_resources/hlsl/pathtracer.hlsl | 16 ++++++----- .../app_resources/hlsl/render.comp.hlsl | 4 +-- .../app_resources/hlsl/scene.hlsl | 4 +-- 7 files changed, 54 insertions(+), 29 deletions(-) diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl index a264fabd5..913225f8b 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl @@ -66,7 +66,10 @@ struct Ray // immutable vector3_type origin; vector3_type direction; + // TODO: polygon method == 2 stuff + vector3_type normalAtOrigin; + bool wasBSDFAtOrigin; // mutable scalar_type intersectionT; @@ -82,6 +85,14 @@ struct Light NBL_CONSTEXPR_STATIC_INLINE uint32_t INVALID_ID = 0xffffu; + static Light create(NBL_CONST_REF_ARG(spectral_type) 
radiance, NBL_CONST_REF_ARG(ObjectID) objectID) + { + Light retval; + retval.radiance = radiance; + retval.objectID = objectID; + return retval; + } + spectral_type radiance; ObjectID objectID; }; @@ -250,7 +261,7 @@ struct Shape { const float rcpDistance = 1.0 / nbl::hlsl::sqrt(distanceSQ); Z *= rcpDistance; - + const float cosThetaMax = nbl::hlsl::sqrt(cosThetaMax2); const float cosTheta = nbl::hlsl::mix(1.0, cosThetaMax, xi.x); @@ -261,9 +272,9 @@ struct Shape float sinPhi, cosPhi; math::sincos(2.0 * numbers::pi * xi.y - numbers::pi, sinPhi, cosPhi); float32_t2x3 XY = math::frisvad(Z); - + L += (XY[0] * cosPhi + XY[1] * sinPhi) * sinTheta; - + newRayMaxT = (cosTheta - nbl::hlsl::sqrt(cosTheta2 - cosThetaMax2)) / rcpDistance; pdf = 1.0 / (2.0 * numbers::pi * (1.0 - cosThetaMax)); return L; @@ -342,14 +353,15 @@ struct Shape { shapes::SphericalTriangle st = shapes::SphericalTriangle::create(vertex0, vertex1, vertex2, ray.origin); const float rcpProb = st.solidAngleOfTriangle(); - // if `rcpProb` is NAN then the triangle's solid angle was close to 0.0 + // if `rcpProb` is NAN then the triangle's solid angle was close to 0.0 return rcpProb > numeric_limits::min ? (1.0 / rcpProb) : numeric_limits::max; } break; case PPM_APPROX_PROJECTED_SOLID_ANGLE: { shapes::SphericalTriangle st = shapes::SphericalTriangle::create(vertex0, vertex1, vertex2, ray.origin); - const float pdf = st.projectedSolidAngleOfTriangle(ray.normalAtOrigin, ray.wasBSDFAtOrigin, L); + sampling::ProjectedSphericalTriangle pst = sampling::ProjectedSphericalTriangle::create(st); + const float pdf = pst.pdf(ray.normalAtOrigin, ray.wasBSDFAtOrigin, L); // if `pdf` is NAN then the triangle's projected solid angle was close to 0.0, if its close to INF then the triangle was very small return pdf < numeric_limits::max ? 
pdf : 0.0; } @@ -371,11 +383,11 @@ struct Shape const float sqrtU = nbl::hlsl::sqrt(xi.x); float32_t3 pnt = vertex0 + edge0 * (1.0 - sqrtU) + edge1 * sqrtU * xi.y; float32_t3 L = pnt - origin; - + const float distanceSq = nbl::hlsl::dot(L,L); const float rcpDistance = 1.0 / nbl::hlsl::sqrt(distanceSq); L *= rcpDistance; - + pdf = distanceSq / nbl::hlsl::abs(nbl::hlsl::dot(nbl::hlsl::cross(edge0, edge1) * 0.5f, L)); newRayMaxT = 1.0 / rcpDistance; return L; @@ -403,7 +415,7 @@ struct Shape shapes::SphericalTriangle st = shapes::SphericalTriangle::create(vertex0, vertex1, vertex2, origin); sampling::ProjectedSphericalTriangle sst = sampling::ProjectedSphericalTriangle::create(st); - + const float32_t3 L = sst.generate(rcpPdf, interaction.N, isBSDF, xi.xy); pdf = rcpPdf > numeric_limits::min ? (1.0 / rcpPdf) : 0.0; diff --git a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl index 880ae1169..525af5525 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl @@ -68,12 +68,12 @@ struct Comprehensive t = numeric_limits::infinity; break; } - + bool closerIntersection = t > 0.0 && t < ray.intersectionT; ray.intersectionT = closerIntersection ? t : ray.intersectionT; objectID.id = closerIntersection ? i : objectID.id; - + // allowing early out results in a performance regression, WTF!? 
//if (anyHit && closerIntersection) //break; @@ -106,6 +106,7 @@ struct Comprehensive return ObjectID::create(-1, 0, PST_SPHERE); } } + return ObjectID::create(-1, 0, PST_SPHERE); } static ObjectID traceRay(NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(scene_type) scene) @@ -114,7 +115,7 @@ struct Comprehensive ObjectID objectID; objectID.id = -1; // start with no intersect - + // prodedural shapes if (scene.sphereCount > 0) { @@ -161,12 +162,12 @@ struct Comprehensive // t = sphere.intersect(ray.origin, ray.direction); // } // // TODO: other types - + // bool closerIntersection = t > 0.0 && t < ray.intersectionT; // ray.intersectionT = closerIntersection ? t : ray.intersectionT; // objectID = closerIntersection ? i : objectID; - + // // allowing early out results in a performance regression, WTF!? // //if (anyHit && closerIntersection) // //break; diff --git a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl index 16f8dcabf..1a613080f 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl @@ -38,6 +38,7 @@ struct System using vector2_type = vector; using vector3_type = vector; using measure_type = typename DiffuseBxDF::spectral_type; + using sample_type = typename DiffuseBxDF::sample_type; using quotient_pdf_type = typename DiffuseBxDF::quotient_pdf_type; using anisotropic_type = typename DiffuseBxDF::anisotropic_type; using anisocache_type = typename ConductorBxDF::anisocache_type; @@ -84,7 +85,7 @@ struct System } } - vector3_type generate(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(create_params_t) cparams, anisotropic_type interaction, NBL_CONST_REF_ARG(vector3_type) u, NBL_REF_ARG(anisocache_type) cache) + sample_type generate(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(create_params_t) cparams, anisotropic_type interaction, NBL_CONST_REF_ARG(vector3_type) u, NBL_REF_ARG(anisocache_type) 
cache) { switch(material.type) { @@ -107,7 +108,7 @@ struct System } break; default: - return (vector3_type)numeric_limits::infinity; + return (sample_type)numeric_limits::infinity; } } diff --git a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl index f0eeb0885..15dbf3a9b 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl @@ -103,7 +103,10 @@ struct Estimator { vector3_type position = vector3_type(asfloat(event.data[2]), asfloat(event.data[3]), asfloat(event.data[4])); Shape sphere = Shape::create(position, asfloat(event.data[5]), event.data[6]); - L = sphere.template generate_and_pdf(pdf, newRayMaxT, origin, interaction, isBSDF, xi); + const vector3_type sampleL = sphere.template generate_and_pdf(pdf, newRayMaxT, origin, interaction, isBSDF, xi); + const vector3_type V = interaction.V.getDirection(); + const scalar_type VdotL = nbl::hlsl::dot(V, sampleL); + L = sample_type::create(sampleL,VdotL,interaction.T,interaction.B,interaction.N); } break; case PST_TRIANGLE: @@ -112,7 +115,10 @@ struct Estimator vector3_type vertex1 = vector3_type(asfloat(event.data[5]), asfloat(event.data[6]), asfloat(event.data[7])); vector3_type vertex2 = vector3_type(asfloat(event.data[8]), asfloat(event.data[9]), asfloat(event.data[10])); Shape tri = Shape::create(vertex0, vertex1, vertex2, event.data[11]); - L = tri.template generate_and_pdf(pdf, newRayMaxT, origin, interaction, isBSDF, xi); + const vector3_type sampleL = tri.template generate_and_pdf(pdf, newRayMaxT, origin, interaction, isBSDF, xi); + const vector3_type V = interaction.V.getDirection(); + const scalar_type VdotL = nbl::hlsl::dot(V, sampleL); + L = sample_type::create(sampleL,VdotL,interaction.T,interaction.B,interaction.N); } break; case PST_RECTANGLE: @@ -121,7 +127,10 @@ struct Estimator vector3_type edge0 = 
vector3_type(asfloat(event.data[5]), asfloat(event.data[6]), asfloat(event.data[7])); vector3_type edge1 = vector3_type(asfloat(event.data[8]), asfloat(event.data[9]), asfloat(event.data[10])); Shape rect = Shape::create(offset, edge0, edge1, event.data[11]); - L = rect.template generate_and_pdf(pdf, newRayMaxT, origin, interaction, isBSDF, xi); + const vector3_type sampleL = rect.template generate_and_pdf(pdf, newRayMaxT, origin, interaction, isBSDF, xi); + const vector3_type V = interaction.V.getDirection(); + const scalar_type VdotL = nbl::hlsl::dot(V, sampleL); + L = sample_type::create(sampleL,VdotL,interaction.T,interaction.B,interaction.N); } break; default: diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl index 460744940..62398a58e 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl @@ -227,11 +227,12 @@ struct Unidirectional } } - quotient_pdf_type bsdf_quotient_pdf = materialSystem.quotient_and_pdf(material, bxdf.params, params) * throughput; + quotient_pdf_type bsdf_quotient_pdf = materialSystem.quotient_and_pdf(material, bxdf.params, params); + bsdf_quotient_pdf.quotient *= throughput; neeContrib_pdf.quotient *= bsdf_quotient_pdf.quotient; const scalar_type otherGenOverChoice = bsdf_quotient_pdf.pdf * rcpChoiceProb; const scalar_type otherGenOverLightAndChoice = otherGenOverChoice / bsdf_quotient_pdf.pdf; - neeContrib_pdf.quotient *= otherGenOverChoice/(1.f + otherGenOverLightAndChoice * otherGenOverLightAndChoice); // balance heuristic + neeContrib_pdf.quotient *= otherGenOverChoice / (1.f + otherGenOverLightAndChoice * otherGenOverLightAndChoice); // balance heuristic // TODO: ifdef NEE only @@ -240,7 +241,7 @@ struct Unidirectional nee_ray.direction = nee_sample.L.direction; nee_ray.intersectionT = t; if (bsdf_quotient_pdf.pdf < numeric_limits::max && getLuma(neeContrib_pdf.quotient) > 
lumaContributionThreshold && intersector_type::traceRay(nee_ray, scene).id == -1) - ray._payload.accumulation += neeContrib_pdf.quotient; + ray.payload.accumulation += neeContrib_pdf.quotient; } } } @@ -256,7 +257,7 @@ struct Unidirectional sample_type bsdf_sample = materialSystem.generate(material, bxdf.params, interaction, eps1, _cache); // TODO: does not yet account for smooth dielectric - params_type params; + params_type params; if (!isBSDF && bxdf.materialType == ext::MaterialSystem::Material::DIFFUSE) { params = params_type::template create(bsdf_sample, iso_interaction, bxdf::BCM_MAX); @@ -287,7 +288,8 @@ struct Unidirectional } // the value of the bsdf divided by the probability of the sample being generated - throughput *= materialSystem.quotient_and_pdf(material, bxdf.params, params); + quotient_pdf_type bsdf_quotient_pdf = materialSystem.quotient_and_pdf(material, bxdf.params, params); + throughput *= bsdf_quotient_pdf.quotient; bxdfSample = bsdf_sample.L.direction; } @@ -298,7 +300,7 @@ struct Unidirectional ray.payload.throughput = throughput; ray.payload.otherTechniqueHeuristic = neeProbability / bxdfPdf; // numerically stable, don't touch ray.payload.otherTechniqueHeuristic *= ray.payload.otherTechniqueHeuristic; - + // trace new ray ray.origin = intersection + bxdfSample * (1.0/*kSceneSize*/) * Tolerance::getStart(depth); ray.direction = bxdfSample; @@ -314,7 +316,7 @@ struct Unidirectional void missProgram(NBL_REF_ARG(ray_type) ray) { - vector3_type finalContribution = ray.payload.throughput; + vector3_type finalContribution = ray.payload.throughput; // #ifdef USE_ENVMAP // vec2 uv = SampleSphericalMap(_immutable.direction); // finalContribution *= textureLod(envMap, uv, 0.0).rgb; diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl index 5be6adf78..360d085a6 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl +++ 
b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl @@ -115,7 +115,7 @@ static const ext::Shape rectangles[RECTANGLE_COUNT] = { #define LIGHT_COUNT 1 static const light_type lights[LIGHT_COUNT] = { - light_type(spectral_t(30.0,25.0,15.0), ext::ObjectID(8u, ext::NextEventEstimator::Event::Mode::PROCEDURAL, LIGHT_TYPE)) + light_type::create(spectral_t(30.0,25.0,15.0), ext::ObjectID::create(8u, ext::NextEventEstimator::Event::Mode::PROCEDURAL, LIGHT_TYPE)) }; #define BXDF_COUNT 7 @@ -166,7 +166,7 @@ void main(uint32_t3 threadID : SV_DispatchThreadID) ptCreateParams.camPos = tmp.xyz / tmp.w; NDC.z = 1.0; } - + ptCreateParams.NDC = NDC; ptCreateParams.invMVP = pc.invMVP; diff --git a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl index 48be039a7..79b66dbfb 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl @@ -32,7 +32,7 @@ struct Scene light_type lights[maxLightCount]; uint32_t lightCount; - + NBL_CONSTEXPR_STATIC_INLINE uint32_t maxBxdfCount = 16; // TODO: limit change? 
bxdfnode_type bxdfs[maxBxdfCount]; @@ -51,7 +51,7 @@ struct Scene -1; retval.data[0] = objCount; retval.data[1] = type; - + switch (type) { case PST_SPHERE: From eb38ef5169ffd2508dc74ff05632394c0100cb93 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 3 Mar 2025 18:21:21 +0700 Subject: [PATCH 063/296] Adjust demo to ray_tracing_pipeline_demo fixes --- 71_RayTracingPipeline/main.cpp | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index 0c5473b73..5793ff8d3 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -354,12 +354,12 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, auto& shaderGroups = params.shaderGroups; - shaderGroups.raygenGroup = { .shaderIndex = RTDS_RAYGEN }; + shaderGroups.raygen = { .index = RTDS_RAYGEN }; IRayTracingPipelineBase::SGeneralShaderGroup missGroups[EMT_COUNT]; - missGroups[EMT_PRIMARY] = { .shaderIndex = RTDS_MISS }; - missGroups[EMT_OCCLUSION] = { .shaderIndex = RTDS_SHADOW_MISS }; - shaderGroups.missGroups = missGroups; + missGroups[EMT_PRIMARY] = { .index = RTDS_MISS }; + missGroups[EMT_OCCLUSION] = { .index = RTDS_SHADOW_MISS }; + shaderGroups.misses = missGroups; auto getHitGroupIndex = [](E_GEOM_TYPE geomType, E_RAY_TYPE rayType) { @@ -367,28 +367,28 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, }; IRayTracingPipelineBase::SHitShaderGroup hitGroups[E_RAY_TYPE::ERT_COUNT * E_GEOM_TYPE::EGT_COUNT]; hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_PRIMARY)] = { - .closestHitShaderIndex = RTDS_CLOSEST_HIT, - .anyHitShaderIndex = RTDS_ANYHIT_PRIMARY, + .closestHit = RTDS_CLOSEST_HIT, + .anyHit = RTDS_ANYHIT_PRIMARY, }; hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_OCCLUSION)] = { - .anyHitShaderIndex = RTDS_ANYHIT_SHADOW, + .anyHit = RTDS_ANYHIT_SHADOW, }; hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_PRIMARY)] = { - 
.closestHitShaderIndex = RTDS_SPHERE_CLOSEST_HIT, - .anyHitShaderIndex = RTDS_ANYHIT_PRIMARY, - .intersectionShaderIndex = RTDS_INTERSECTION, + .closestHit = RTDS_SPHERE_CLOSEST_HIT, + .anyHit = RTDS_ANYHIT_PRIMARY, + .intersectionShader = RTDS_INTERSECTION, }; hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_OCCLUSION)] = { - .anyHitShaderIndex = RTDS_ANYHIT_SHADOW, - .intersectionShaderIndex = RTDS_INTERSECTION, + .anyHit = RTDS_ANYHIT_SHADOW, + .intersectionShader = RTDS_INTERSECTION, }; - shaderGroups.hitGroups = hitGroups; + shaderGroups.hits = hitGroups; IRayTracingPipelineBase::SGeneralShaderGroup callableGroups[ELT_COUNT]; - callableGroups[ELT_DIRECTIONAL] = { .shaderIndex = RTDS_DIRECTIONAL_CALL }; - callableGroups[ELT_POINT] = { .shaderIndex = RTDS_POINT_CALL }; - callableGroups[ELT_SPOT] = { .shaderIndex = RTDS_SPOT_CALL }; - shaderGroups.callableGroups = callableGroups; + callableGroups[ELT_DIRECTIONAL] = { .index = RTDS_DIRECTIONAL_CALL }; + callableGroups[ELT_POINT] = { .index = RTDS_POINT_CALL }; + callableGroups[ELT_SPOT] = { .index = RTDS_SPOT_CALL }; + shaderGroups.callables = callableGroups; params.cached.maxRecursionDepth = 1; From b1831d983d2f8d8df7641d44d8cde857c6977a2c Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 4 Mar 2025 16:56:49 +0700 Subject: [PATCH 064/296] refactor to use new frisvad --- 31_HLSLPathTracer/app_resources/hlsl/common.hlsl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl index 913225f8b..2482806e2 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl @@ -271,9 +271,10 @@ struct Shape const float sinTheta = nbl::hlsl::sqrt(1.0 - cosTheta2); float sinPhi, cosPhi; math::sincos(2.0 * numbers::pi * xi.y - numbers::pi, sinPhi, cosPhi); - float32_t2x3 XY = math::frisvad(Z); + float32_t3 X, Y; + math::frisvad(Z, X, Y); - L += (XY[0] * cosPhi + 
XY[1] * sinPhi) * sinTheta; + L += (X * cosPhi + Y * sinPhi) * sinTheta; newRayMaxT = (cosTheta - nbl::hlsl::sqrt(cosTheta2 - cosThetaMax2)) / rcpDistance; pdf = 1.0 / (2.0 * numbers::pi * (1.0 - cosThetaMax)); From cb5662a63b6d24dbdc620aeb3606582dc51a4f9f Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 7 Mar 2025 10:44:08 +0700 Subject: [PATCH 065/296] fix bugs again --- .../app_resources/hlsl/common.hlsl | 4 ++-- .../app_resources/hlsl/material_system.hlsl | 17 ++++++++++---- .../hlsl/next_event_estimator.hlsl | 23 +++++++++++++------ .../app_resources/hlsl/pathtracer.hlsl | 7 +++--- 31_HLSLPathTracer/main.cpp | 6 ++--- 5 files changed, 38 insertions(+), 19 deletions(-) diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl index 2482806e2..244a92107 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl @@ -66,7 +66,7 @@ struct Ray // immutable vector3_type origin; vector3_type direction; - + // TODO: polygon method == 2 stuff vector3_type normalAtOrigin; bool wasBSDFAtOrigin; @@ -417,7 +417,7 @@ struct Shape shapes::SphericalTriangle st = shapes::SphericalTriangle::create(vertex0, vertex1, vertex2, origin); sampling::ProjectedSphericalTriangle sst = sampling::ProjectedSphericalTriangle::create(st); - const float32_t3 L = sst.generate(rcpPdf, interaction.N, isBSDF, xi.xy); + const float32_t3 L = sst.generate(rcpPdf, interaction.isotropic.N, isBSDF, xi.xy); pdf = rcpPdf > numeric_limits::min ? 
(1.0 / rcpPdf) : 0.0; diff --git a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl index 1a613080f..09236c85e 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl @@ -39,6 +39,7 @@ struct System using vector3_type = vector; using measure_type = typename DiffuseBxDF::spectral_type; using sample_type = typename DiffuseBxDF::sample_type; + using ray_dir_info_type = typename sample_type::ray_dir_info_type; using quotient_pdf_type = typename DiffuseBxDF::quotient_pdf_type; using anisotropic_type = typename DiffuseBxDF::anisotropic_type; using anisocache_type = typename ConductorBxDF::anisocache_type; @@ -85,7 +86,7 @@ struct System } } - sample_type generate(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(create_params_t) cparams, anisotropic_type interaction, NBL_CONST_REF_ARG(vector3_type) u, NBL_REF_ARG(anisocache_type) cache) + sample_type generate(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(create_params_t) cparams, NBL_CONST_REF_ARG(anisotropic_type) interaction, NBL_CONST_REF_ARG(vector3_type) u, NBL_REF_ARG(anisocache_type) _cache) { switch(material.type) { @@ -98,18 +99,26 @@ struct System case Material::Type::CONDUCTOR: { conductorBxDF.init(cparams); - return conductorBxDF.generate(interaction, u.xy, cache); + return conductorBxDF.generate(interaction, u.xy, _cache); } break; case Material::Type::DIELECTRIC: { dielectricBxDF.init(cparams); - return dielectricBxDF.generate(interaction, u, cache); + return dielectricBxDF.generate(interaction, u, _cache); } break; default: - return (sample_type)numeric_limits::infinity; + { + ray_dir_info_type L; + L.direction = (vector3_type)0; + return sample_type::create(L, 0, (vector3_type)0); + } } + + ray_dir_info_type L; + L.direction = (vector3_type)0; + return sample_type::create(L, 0, (vector3_type)0); } quotient_pdf_type 
quotient_and_pdf(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(create_params_t) cparams, NBL_CONST_REF_ARG(params_t) params) diff --git a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl index 15dbf3a9b..38a5fae15 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl @@ -23,6 +23,7 @@ struct Estimator using interaction_type = Aniso; using quotient_pdf_type = bxdf::quotient_and_pdf; using sample_type = LightSample; + using ray_dir_info_type = typename sample_type::ray_dir_info_type; static spectral_type proceduralDeferredEvalAndPdf(NBL_REF_ARG(scalar_type) pdf, NBL_CONST_REF_ARG(light_type) light, NBL_CONST_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(Event) event) { @@ -88,6 +89,7 @@ struct Estimator default: return (spectral_type)0.0; } + return (spectral_type)0.0; } static sample_type procedural_generate_and_quotient_and_pdf(NBL_REF_ARG(quotient_pdf_type) quotient_pdf, NBL_REF_ARG(scalar_type) newRayMaxT, NBL_CONST_REF_ARG(light_type) light, NBL_CONST_REF_ARG(vector3_type) origin, NBL_CONST_REF_ARG(interaction_type) interaction, bool isBSDF, NBL_CONST_REF_ARG(vector3_type) xi, uint32_t depth, NBL_CONST_REF_ARG(Event) event) @@ -104,9 +106,11 @@ struct Estimator vector3_type position = vector3_type(asfloat(event.data[2]), asfloat(event.data[3]), asfloat(event.data[4])); Shape sphere = Shape::create(position, asfloat(event.data[5]), event.data[6]); const vector3_type sampleL = sphere.template generate_and_pdf(pdf, newRayMaxT, origin, interaction, isBSDF, xi); - const vector3_type V = interaction.V.getDirection(); + const vector3_type V = interaction.isotropic.V.getDirection(); const scalar_type VdotL = nbl::hlsl::dot(V, sampleL); - L = sample_type::create(sampleL,VdotL,interaction.T,interaction.B,interaction.N); + ray_dir_info_type rayL; + rayL.direction = sampleL; + L = 
sample_type::create(rayL,VdotL,interaction.T,interaction.B,interaction.isotropic.N); } break; case PST_TRIANGLE: @@ -116,9 +120,11 @@ struct Estimator vector3_type vertex2 = vector3_type(asfloat(event.data[8]), asfloat(event.data[9]), asfloat(event.data[10])); Shape tri = Shape::create(vertex0, vertex1, vertex2, event.data[11]); const vector3_type sampleL = tri.template generate_and_pdf(pdf, newRayMaxT, origin, interaction, isBSDF, xi); - const vector3_type V = interaction.V.getDirection(); + const vector3_type V = interaction.isotropic.V.getDirection(); const scalar_type VdotL = nbl::hlsl::dot(V, sampleL); - L = sample_type::create(sampleL,VdotL,interaction.T,interaction.B,interaction.N); + ray_dir_info_type rayL; + rayL.direction = sampleL; + L = sample_type::create(rayL,VdotL,interaction.T,interaction.B,interaction.isotropic.N); } break; case PST_RECTANGLE: @@ -128,9 +134,11 @@ struct Estimator vector3_type edge1 = vector3_type(asfloat(event.data[8]), asfloat(event.data[9]), asfloat(event.data[10])); Shape rect = Shape::create(offset, edge0, edge1, event.data[11]); const vector3_type sampleL = rect.template generate_and_pdf(pdf, newRayMaxT, origin, interaction, isBSDF, xi); - const vector3_type V = interaction.V.getDirection(); + const vector3_type V = interaction.isotropic.V.getDirection(); const scalar_type VdotL = nbl::hlsl::dot(V, sampleL); - L = sample_type::create(sampleL,VdotL,interaction.T,interaction.B,interaction.N); + ray_dir_info_type rayL; + rayL.direction = sampleL; + L = sample_type::create(rayL,VdotL,interaction.T,interaction.B,interaction.isotropic.N); } break; default: @@ -149,6 +157,7 @@ struct Estimator static sample_type generate_and_quotient_and_pdf(NBL_REF_ARG(quotient_pdf_type) quotient_pdf, NBL_REF_ARG(scalar_type) newRayMaxT, NBL_CONST_REF_ARG(light_type) light, NBL_CONST_REF_ARG(vector3_type) origin, NBL_CONST_REF_ARG(interaction_type) interaction, bool isBSDF, NBL_CONST_REF_ARG(vector3_type) xi, uint32_t depth, 
NBL_CONST_REF_ARG(Event) event) { const Event::Mode mode = (Event::Mode)event.mode; + sample_type L; switch (mode) { case Event::Mode::RAY_QUERY: @@ -168,10 +177,10 @@ struct Estimator break; default: { - sample_type L; return L; } } + return L; } }; diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl index 62398a58e..ba683d443 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl @@ -54,6 +54,7 @@ struct Unidirectional using vector3_type = vector; using measure_type = typename MaterialSystem::measure_type; using sample_type = typename NextEventEstimator::sample_type; + using ray_dir_info_type = typename sample_type::ray_dir_info_type; using ray_type = typename RayGen::ray_type; using light_type = Light; using bxdfnode_type = BxDFNode; @@ -181,7 +182,7 @@ struct Unidirectional bool validPath = nee_sample.NdotL > numeric_limits::min; // but if we allowed non-watertight transmitters (single water surface), it would make sense just to apply this line by itself anisocache_type _cache; - validPath = validPath && anisocache_type::compute(_cache, interaction, nee_sample, monochromeEta); + validPath = validPath && anisocache_type::template compute(_cache, interaction, nee_sample, monochromeEta); bxdf.params.A = nbl::hlsl::max(bxdf.params.A, vector(0,0)); bxdf.params.eta = monochromeEta; @@ -268,7 +269,7 @@ struct Unidirectional params = params_type::template create(bsdf_sample, interaction, _cache, bxdf::BCM_MAX); else { - isocache_type isocache = (isocache_type)_cache; + isocache_type isocache = _cache.iso_cache; params = params_type::template create(bsdf_sample, iso_interaction, isocache, bxdf::BCM_MAX); } } @@ -282,7 +283,7 @@ struct Unidirectional params = params_type::template create(bsdf_sample, interaction, _cache, bxdf::BCM_ABS); else { - isocache_type isocache = (isocache_type)_cache; + isocache_type isocache = 
_cache.iso_cache; params = params_type::template create(bsdf_sample, iso_interaction, isocache, bxdf::BCM_ABS); } } diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp index 4bb260b09..036fcdb79 100644 --- a/31_HLSLPathTracer/main.cpp +++ b/31_HLSLPathTracer/main.cpp @@ -23,7 +23,7 @@ struct PTPushConstant { // TODO: Add a QueryPool for timestamping once its ready // TODO: Do buffer creation using assConv -class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication +class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication { using device_base_t = examples::SimpleWindowedApplication; using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; @@ -69,7 +69,7 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication }; public: - inline ComputeShaderPathtracer(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) + inline HLSLComputePathtracer(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} inline bool isComputeOnly() const override { return false; } @@ -1349,4 +1349,4 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication IGPUCommandBuffer::SClearColorValue clearColor = { .float32 = {0.f,0.f,0.f,1.f} }; }; -NBL_MAIN_FUNC(ComputeShaderPathtracer) +NBL_MAIN_FUNC(HLSLComputePathtracer) From 8eaa71463bfc1cf1cda0002e67f0f67e1d3a4ba5 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 7 Mar 2025 16:58:44 +0700 Subject: [PATCH 066/296] fix intersector, no use intersectdata --- .../app_resources/hlsl/common.hlsl | 82 +++++----- 
.../app_resources/hlsl/intersector.hlsl | 105 ++++++++++--- .../app_resources/hlsl/pathtracer.hlsl | 18 ++- .../app_resources/hlsl/render.comp.hlsl | 2 +- .../app_resources/hlsl/scene.hlsl | 140 +++++++++--------- 31_HLSLPathTracer/main.cpp | 2 +- 6 files changed, 207 insertions(+), 142 deletions(-) diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl index 244a92107..0fd595bca 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl @@ -203,11 +203,11 @@ struct Shape; template<> struct Shape { - static Shape create(NBL_CONST_REF_ARG(float32_t3) position, float32_t radius, uint32_t bsdfLightIDs) + static Shape create(NBL_CONST_REF_ARG(float32_t3) position, float32_t radius2, uint32_t bsdfLightIDs) { Shape retval; retval.position = position; - retval.radius2 = radius * radius; + retval.radius2 = radius2; retval.bsdfLightIDs = bsdfLightIDs; return retval; } @@ -215,20 +215,20 @@ struct Shape static Shape create(NBL_CONST_REF_ARG(float32_t3) position, float32_t radius, uint32_t bsdfID, uint32_t lightID) { uint32_t bsdfLightIDs = glsl::bitfieldInsert(bsdfID, lightID, 16, 16); - return create(position, radius, bsdfLightIDs); + return create(position, radius * radius, bsdfLightIDs); } // return intersection distance if found, nan otherwise float intersect(NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(float32_t3) direction) { float32_t3 relOrigin = origin - position; - float relOriginLen2 = nbl::hlsl::dot(relOrigin, relOrigin); + float relOriginLen2 = hlsl::dot(relOrigin, relOrigin); - float dirDotRelOrigin = nbl::hlsl::dot(direction, relOrigin); + float dirDotRelOrigin = hlsl::dot(direction, relOrigin); float det = radius2 - relOriginLen2 + dirDotRelOrigin * dirDotRelOrigin; // do some speculative math here - float detsqrt = nbl::hlsl::sqrt(det); + float detsqrt = hlsl::sqrt(det); return -dirDotRelOrigin + (relOriginLen2 > radius2 ? 
(-detsqrt) : detsqrt); } @@ -241,7 +241,7 @@ struct Shape float getSolidAngle(NBL_CONST_REF_ARG(float32_t3) origin) { float32_t3 dist = position - origin; - float cosThetaMax = nbl::hlsl::sqrt(1.0 - radius2 / nbl::hlsl::dot(dist, dist)); + float cosThetaMax = hlsl::sqrt(1.0 - radius2 / hlsl::dot(dist, dist)); return 2.0 * numbers::pi * (1.0 - cosThetaMax); } @@ -255,28 +255,28 @@ struct Shape float32_t3 generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, float32_t3 xi) { float32_t3 Z = position - origin; - const float distanceSQ = nbl::hlsl::dot(Z,Z); + const float distanceSQ = hlsl::dot(Z,Z); const float cosThetaMax2 = 1.0 - radius2 / distanceSQ; if (cosThetaMax2 > 0.0) { - const float rcpDistance = 1.0 / nbl::hlsl::sqrt(distanceSQ); + const float rcpDistance = 1.0 / hlsl::sqrt(distanceSQ); Z *= rcpDistance; - const float cosThetaMax = nbl::hlsl::sqrt(cosThetaMax2); + const float cosThetaMax = hlsl::sqrt(cosThetaMax2); const float cosTheta = nbl::hlsl::mix(1.0, cosThetaMax, xi.x); float32_t3 L = Z * cosTheta; const float cosTheta2 = cosTheta * cosTheta; - const float sinTheta = nbl::hlsl::sqrt(1.0 - cosTheta2); + const float sinTheta = hlsl::sqrt(1.0 - cosTheta2); float sinPhi, cosPhi; - math::sincos(2.0 * numbers::pi * xi.y - numbers::pi, sinPhi, cosPhi); + math::sincos(2.0 * numbers::pi * xi.y - numbers::pi, sinPhi, cosPhi); float32_t3 X, Y; math::frisvad(Z, X, Y); L += (X * cosPhi + Y * sinPhi) * sinTheta; - newRayMaxT = (cosTheta - nbl::hlsl::sqrt(cosTheta2 - cosThetaMax2)) / rcpDistance; + newRayMaxT = (cosTheta - hlsl::sqrt(cosTheta2 - cosThetaMax2)) / rcpDistance; pdf = 1.0 / (2.0 * numbers::pi * (1.0 - cosThetaMax)); return L; } @@ -315,26 +315,26 @@ struct Shape { const float32_t3 edges[2] = { vertex1 - vertex0, vertex2 - vertex0 }; - const float32_t3 h = nbl::hlsl::cross(direction, edges[1]); - const float a = nbl::hlsl::dot(edges[0], h); 
+ const float32_t3 h = hlsl::cross(direction, edges[1]); + const float a = hlsl::dot(edges[0], h); const float32_t3 relOrigin = origin - vertex0; - const float u = nbl::hlsl::dot(relOrigin, h) / a; + const float u = hlsl::dot(relOrigin, h) / a; - const float32_t3 q = nbl::hlsl::cross(relOrigin, edges[0]); - const float v = nbl::hlsl::dot(direction, q) / a; + const float32_t3 q = hlsl::cross(relOrigin, edges[0]); + const float v = hlsl::dot(direction, q) / a; - const float t = nbl::hlsl::dot(edges[1], q) / a; + const float t = hlsl::dot(edges[1], q) / a; const bool intersection = t > 0.f && u >= 0.f && v >= 0.f && (u + v) <= 1.f; - return intersection ? t : numeric_limits::infinity; + return intersection ? t : bit_cast(numeric_limits::infinity); } float32_t3 getNormalTimesArea() { const float32_t3 edges[2] = { vertex1 - vertex0, vertex2 - vertex0 }; - return nbl::hlsl::cross(edges[0], edges[1]) * 0.5f; + return hlsl::cross(edges[0], edges[1]) * 0.5f; } template @@ -347,7 +347,7 @@ struct Shape { const float dist = ray.intersectionT; const float32_t3 L = ray.direction; - return dist * dist / nbl::hlsl::abs(nbl::hlsl::dot(getNormalTimesArea(), L)); + return dist * dist / hlsl::abs(hlsl::dot(getNormalTimesArea(), L)); } break; case PPM_SOLID_ANGLE: @@ -381,15 +381,15 @@ struct Shape { const float32_t3 edge0 = vertex1 - vertex0; const float32_t3 edge1 = vertex2 - vertex0; - const float sqrtU = nbl::hlsl::sqrt(xi.x); + const float sqrtU = hlsl::sqrt(xi.x); float32_t3 pnt = vertex0 + edge0 * (1.0 - sqrtU) + edge1 * sqrtU * xi.y; float32_t3 L = pnt - origin; - const float distanceSq = nbl::hlsl::dot(L,L); - const float rcpDistance = 1.0 / nbl::hlsl::sqrt(distanceSq); + const float distanceSq = hlsl::dot(L,L); + const float rcpDistance = 1.0 / hlsl::sqrt(distanceSq); L *= rcpDistance; - pdf = distanceSq / nbl::hlsl::abs(nbl::hlsl::dot(nbl::hlsl::cross(edge0, edge1) * 0.5f, L)); + pdf = distanceSq / hlsl::abs(hlsl::dot(hlsl::cross(edge0, edge1) * 0.5f, L)); newRayMaxT = 1.0 
/ rcpDistance; return L; } @@ -406,7 +406,7 @@ struct Shape pdf = rcpPdf > numeric_limits::min ? (1.0 / rcpPdf) : 0.0; const float32_t3 N = getNormalTimesArea(); - newRayMaxT = nbl::hlsl::dot(N, vertex0 - origin) / nbl::hlsl::dot(N, L); + newRayMaxT = hlsl::dot(N, vertex0 - origin) / hlsl::dot(N, L); return L; } break; @@ -422,7 +422,7 @@ struct Shape pdf = rcpPdf > numeric_limits::min ? (1.0 / rcpPdf) : 0.0; const float32_t3 N = getNormalTimesArea(); - newRayMaxT = nbl::hlsl::dot(N, vertex0 - origin) / nbl::hlsl::dot(N, L); + newRayMaxT = hlsl::dot(N, vertex0 - origin) / hlsl::dot(N, L); return L; } break; @@ -462,25 +462,25 @@ struct Shape float intersect(NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(float32_t3) direction) { - const float32_t3 h = nbl::hlsl::cross(direction, edge1); - const float a = nbl::hlsl::dot(edge0, h); + const float32_t3 h = hlsl::cross(direction, edge1); + const float a = hlsl::dot(edge0, h); const float32_t3 relOrigin = origin - offset; - const float u = nbl::hlsl::dot(relOrigin,h)/a; + const float u = hlsl::dot(relOrigin,h)/a; - const float32_t3 q = nbl::hlsl::cross(relOrigin, edge0); - const float v = nbl::hlsl::dot(direction, q) / a; + const float32_t3 q = hlsl::cross(relOrigin, edge0); + const float v = hlsl::dot(direction, q) / a; - const float t = nbl::hlsl::dot(edge1, q) / a; + const float t = hlsl::dot(edge1, q) / a; const bool intersection = t > 0.f && u >= 0.f && v >= 0.f && u <= 1.f && v <= 1.f; - return intersection ? t : numeric_limits::infinity; + return intersection ? 
t : bit_cast(numeric_limits::infinity); } float32_t3 getNormalTimesArea() { - return nbl::hlsl::cross(edge0, edge1); + return hlsl::cross(edge0, edge1); } void getNormalBasis(NBL_REF_ARG(float32_t3x3) basis, NBL_REF_ARG(float32_t2) extents) @@ -502,7 +502,7 @@ struct Shape { const float dist = ray.intersectionT; const float32_t3 L = ray.direction; - return dist * dist / nbl::hlsl::abs(nbl::hlsl::dot(getNormalTimesArea(), L)); + return dist * dist / hlsl::abs(hlsl::dot(getNormalTimesArea(), L)); } break; // #ifdef TRIANGLE_REFERENCE ? @@ -542,10 +542,10 @@ struct Shape case PPM_AREA: { float32_t3 L = origin2origin + edge0 * xi.x + edge1 * xi.y; - const float distSq = nbl::hlsl::dot(L, L); - const float rcpDist = 1.0 / nbl::hlsl::sqrt(distSq); + const float distSq = hlsl::dot(L, L); + const float rcpDist = 1.0 / hlsl::sqrt(distSq); L *= rcpDist; - pdf = distSq / nbl::hlsl::abs(nbl::hlsl::dot(N, L)); + pdf = distSq / hlsl::abs(hlsl::dot(N, L)); newRayMaxT = 1.0 / rcpDist; return L; } @@ -572,7 +572,7 @@ struct Shape else pdf = numeric_limits::infinity; - newRayMaxT = nbl::hlsl::dot(N, origin2origin) / nbl::hlsl::dot(N, L); + newRayMaxT = hlsl::dot(N, origin2origin) / hlsl::dot(N, L); return L; } break; diff --git a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl index 525af5525..68ea75dd3 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl @@ -25,6 +25,62 @@ struct Comprehensive using bxdfnode_type = BxdfNode; using scene_type = Scene; + static ObjectID traceRay(NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(scene_type) scene) + { + ObjectID objectID; + objectID.id = -1; + + // prodedural shapes + for (int i = 0; i < scene.sphereCount; i++) + { + float t = scene.spheres[i].intersect(ray.origin, ray.direction); + + bool closerIntersection = t > 0.0 && t < ray.intersectionT; + + if (closerIntersection) + { + ray.intersectionT = t; + 
objectID.id = i; + objectID.mode = IntersectData::Mode::PROCEDURAL; + objectID.shapeType = PST_SPHERE; + } + } + for (int i = 0; i < scene.triangleCount; i++) + { + float t = scene.triangles[i].intersect(ray.origin, ray.direction); + + bool closerIntersection = t > 0.0 && t < ray.intersectionT; + + if (closerIntersection) + { + ray.intersectionT = t; + objectID.id = i; + objectID.mode = IntersectData::Mode::PROCEDURAL; + objectID.shapeType = PST_TRIANGLE; + } + } + for (int i = 0; i < scene.rectangleCount; i++) + { + float t = scene.rectangles[i].intersect(ray.origin, ray.direction); + + bool closerIntersection = t > 0.0 && t < ray.intersectionT; + + if (closerIntersection) + { + ray.intersectionT = t; + objectID.id = i; + objectID.mode = IntersectData::Mode::PROCEDURAL; + objectID.shapeType = PST_TRIANGLE; + } + } + + // TODO: trace AS + + return objectID; + } + + // note for future consideration: still need to encode to IntersectData? + // obsolete? static ObjectID traceProcedural(NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(IntersectData) intersect) { const bool anyHit = ray.intersectionT != numeric_limits::max; @@ -81,6 +137,7 @@ struct Comprehensive return objectID; } + // obsolete? 
static ObjectID traceRay(NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(IntersectData) intersect) { const IntersectData::Mode mode = (IntersectData::Mode)intersect.mode; @@ -109,36 +166,36 @@ struct Comprehensive return ObjectID::create(-1, 0, PST_SPHERE); } - static ObjectID traceRay(NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(scene_type) scene) - { - IntersectData data; + // static ObjectID traceRay(NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(scene_type) scene) + // { + // IntersectData data; - ObjectID objectID; - objectID.id = -1; // start with no intersect + // ObjectID objectID; + // objectID.id = -1; // start with no intersect - // prodedural shapes - if (scene.sphereCount > 0) - { - data = scene.toIntersectData(ext::Intersector::IntersectData::Mode::PROCEDURAL, PST_SPHERE); - objectID = traceRay(ray, data); - } + // // prodedural shapes + // if (scene.sphereCount > 0) + // { + // data = scene.toIntersectData(ext::Intersector::IntersectData::Mode::PROCEDURAL, PST_SPHERE); + // objectID = traceRay(ray, data); + // } - if (scene.triangleCount > 0) - { - data = scene.toIntersectData(ext::Intersector::IntersectData::Mode::PROCEDURAL, PST_TRIANGLE); - objectID = traceRay(ray, data); - } + // if (scene.triangleCount > 0) + // { + // data = scene.toIntersectData(ext::Intersector::IntersectData::Mode::PROCEDURAL, PST_TRIANGLE); + // objectID = traceRay(ray, data); + // } - if (scene.rectangleCount > 0) - { - data = scene.toIntersectData(ext::Intersector::IntersectData::Mode::PROCEDURAL, PST_RECTANGLE); - objectID = traceRay(ray, data); - } + // if (scene.rectangleCount > 0) + // { + // data = scene.toIntersectData(ext::Intersector::IntersectData::Mode::PROCEDURAL, PST_RECTANGLE); + // objectID = traceRay(ray, data); + // } - // TODO: trace AS + // // TODO: trace AS - return objectID; - } + // return objectID; + // } }; // does everything in traceray in ex 30 diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl 
b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl index ba683d443..6b49bc758 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl @@ -79,7 +79,7 @@ struct Unidirectional // NextEventEstimator nee) // {} - static this_t create(NBL_CONST_REF_ARG(PathTracerCreationParams) params, Buffer sampleSequence) + static this_t create(NBL_CONST_REF_ARG(PathTracerCreationParams) params, Buffer sampleSequence) { this_t retval; retval.randGen = randgen_type::create(params.rngState); @@ -341,15 +341,21 @@ struct Unidirectional // bounces bool hit = true; bool rayAlive = true; - for (int d = 1; d <= depth && hit && rayAlive; d += 2) - { + // TODO for (int d = 1; d <= depth && hit && rayAlive; d += 2) + // TODO { ray.intersectionT = numeric_limits::max; + ray.objectID.id = -1; ray.objectID = intersector_type::traceRay(ray, scene); hit = ray.objectID.id != -1; if (hit) - rayAlive = closestHitProgram(d, i, ray, scene); - } + { + float pp = float(ray.objectID.id) / 10.0; + ray.payload.accumulation = measure_type(pp, 1.0-pp, 0.3); + // TODO rayAlive = closestHitProgram(1, i, ray, scene); + } + + // TODO } if (!hit) missProgram(ray); @@ -373,7 +379,7 @@ struct Unidirectional material_system_type materialSystem; nee_type nee; - Buffer sampleSequence; + Buffer sampleSequence; }; } diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl index 360d085a6..f23d042a8 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl @@ -38,7 +38,7 @@ struct SPushConstants [[vk::combinedImageSampler]][[vk::binding(0, 2)]] Texture2D envMap; // unused [[vk::combinedImageSampler]][[vk::binding(0, 2)]] SamplerState envSampler; -[[vk::binding(1, 2)]] Buffer sampleSequence; +[[vk::binding(1, 2)]] Buffer sampleSequence; [[vk::combinedImageSampler]][[vk::binding(2, 2)]] Texture2D scramblebuf; // unused 
[[vk::combinedImageSampler]][[vk::binding(2, 2)]] SamplerState scrambleSampler; diff --git a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl index 79b66dbfb..1c17e2531 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl @@ -40,75 +40,77 @@ struct Scene // AS ases; - Intersector::IntersectData toIntersectData(uint32_t mode, ProceduralShapeType type) - { - Intersector::IntersectData retval; - retval.mode = mode; - - uint32_t objCount = (type == PST_SPHERE) ? sphereCount : - (type == PST_TRIANGLE) ? triangleCount : - (type == PST_RECTANGLE) ? rectangleCount : - -1; - retval.data[0] = objCount; - retval.data[1] = type; - - switch (type) - { - case PST_SPHERE: - { - for (int i = 0; i < objCount; i++) - { - Shape sphere = spheres[i]; - retval.data[2 + i * Shape::ObjSize] = asuint(sphere.position.x); - retval.data[2 + i * Shape::ObjSize + 1] = asuint(sphere.position.y); - retval.data[2 + i * Shape::ObjSize + 2] = asuint(sphere.position.z); - retval.data[2 + i * Shape::ObjSize + 3] = asuint(sphere.radius2); - retval.data[2 + i * Shape::ObjSize + 4] = sphere.bsdfLightIDs; - } - } - break; - case PST_TRIANGLE: - { - for (int i = 0; i < objCount; i++) - { - Shape tri = triangles[i]; - retval.data[2 + i * Shape::ObjSize] = asuint(tri.vertex0.x); - retval.data[2 + i * Shape::ObjSize + 1] = asuint(tri.vertex0.y); - retval.data[2 + i * Shape::ObjSize + 2] = asuint(tri.vertex0.z); - retval.data[2 + i * Shape::ObjSize + 3] = asuint(tri.vertex1.x); - retval.data[2 + i * Shape::ObjSize + 4] = asuint(tri.vertex1.y); - retval.data[2 + i * Shape::ObjSize + 5] = asuint(tri.vertex1.z); - retval.data[2 + i * Shape::ObjSize + 6] = asuint(tri.vertex2.x); - retval.data[2 + i * Shape::ObjSize + 7] = asuint(tri.vertex2.y); - retval.data[2 + i * Shape::ObjSize + 8] = asuint(tri.vertex2.z); - retval.data[2 + i * Shape::ObjSize + 9] = tri.bsdfLightIDs; - } - } - break; - case 
PST_RECTANGLE: - { - for (int i = 0; i < objCount; i++) - { - Shape rect = rectangles[i]; - retval.data[2 + i * Shape::ObjSize] = asuint(rect.offset.x); - retval.data[2 + i * Shape::ObjSize + 1] = asuint(rect.offset.y); - retval.data[2 + i * Shape::ObjSize + 2] = asuint(rect.offset.z); - retval.data[2 + i * Shape::ObjSize + 3] = asuint(rect.edge0.x); - retval.data[2 + i * Shape::ObjSize + 4] = asuint(rect.edge0.y); - retval.data[2 + i * Shape::ObjSize + 5] = asuint(rect.edge0.z); - retval.data[2 + i * Shape::ObjSize + 6] = asuint(rect.edge1.x); - retval.data[2 + i * Shape::ObjSize + 7] = asuint(rect.edge1.y); - retval.data[2 + i * Shape::ObjSize + 8] = asuint(rect.edge1.z); - retval.data[2 + i * Shape::ObjSize + 9] = rect.bsdfLightIDs; - } - } - break; - default: - // for ASes - break; - } - return retval; - } + // obsolete? + // Intersector::IntersectData toIntersectData(uint32_t mode, ProceduralShapeType type) + // { + // Intersector::IntersectData retval; + // retval.mode = mode; + + // uint32_t objCount = (type == PST_SPHERE) ? sphereCount : + // (type == PST_TRIANGLE) ? triangleCount : + // (type == PST_RECTANGLE) ? 
rectangleCount : + // -1; + // retval.data[0] = objCount; + // retval.data[1] = type; + + // switch (type) + // { + // case PST_SPHERE: + // { + // for (int i = 0; i < objCount; i++) + // { + // Shape sphere = spheres[i]; + // uint32_t3 uintPos = bit_cast(sphere.position); + // retval.data[2 + i * Shape::ObjSize] = uintPos.x; + // retval.data[2 + i * Shape::ObjSize + 1] = uintPos.y; + // retval.data[2 + i * Shape::ObjSize + 2] = uintPos.z; + // retval.data[2 + i * Shape::ObjSize + 3] = bit_cast(sphere.radius2); + // retval.data[2 + i * Shape::ObjSize + 4] = sphere.bsdfLightIDs; + // } + // } + // break; + // case PST_TRIANGLE: + // { + // for (int i = 0; i < objCount; i++) + // { + // Shape tri = triangles[i]; + // retval.data[2 + i * Shape::ObjSize] = asuint(tri.vertex0.x); + // retval.data[2 + i * Shape::ObjSize + 1] = asuint(tri.vertex0.y); + // retval.data[2 + i * Shape::ObjSize + 2] = asuint(tri.vertex0.z); + // retval.data[2 + i * Shape::ObjSize + 3] = asuint(tri.vertex1.x); + // retval.data[2 + i * Shape::ObjSize + 4] = asuint(tri.vertex1.y); + // retval.data[2 + i * Shape::ObjSize + 5] = asuint(tri.vertex1.z); + // retval.data[2 + i * Shape::ObjSize + 6] = asuint(tri.vertex2.x); + // retval.data[2 + i * Shape::ObjSize + 7] = asuint(tri.vertex2.y); + // retval.data[2 + i * Shape::ObjSize + 8] = asuint(tri.vertex2.z); + // retval.data[2 + i * Shape::ObjSize + 9] = tri.bsdfLightIDs; + // } + // } + // break; + // case PST_RECTANGLE: + // { + // for (int i = 0; i < objCount; i++) + // { + // Shape rect = rectangles[i]; + // retval.data[2 + i * Shape::ObjSize] = asuint(rect.offset.x); + // retval.data[2 + i * Shape::ObjSize + 1] = asuint(rect.offset.y); + // retval.data[2 + i * Shape::ObjSize + 2] = asuint(rect.offset.z); + // retval.data[2 + i * Shape::ObjSize + 3] = asuint(rect.edge0.x); + // retval.data[2 + i * Shape::ObjSize + 4] = asuint(rect.edge0.y); + // retval.data[2 + i * Shape::ObjSize + 5] = asuint(rect.edge0.z); + // retval.data[2 + i * 
Shape::ObjSize + 6] = asuint(rect.edge1.x); + // retval.data[2 + i * Shape::ObjSize + 7] = asuint(rect.edge1.y); + // retval.data[2 + i * Shape::ObjSize + 8] = asuint(rect.edge1.z); + // retval.data[2 + i * Shape::ObjSize + 9] = rect.bsdfLightIDs; + // } + // } + // break; + // default: + // // for ASes + // break; + // } + // return retval; + // } NextEventEstimator::Event toNextEvent(uint32_t lightID) { diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp index 036fcdb79..8da32083e 100644 --- a/31_HLSLPathTracer/main.cpp +++ b/31_HLSLPathTracer/main.cpp @@ -1341,7 +1341,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, float camYAngle = 165.f / 180.f * 3.14159f; float camXAngle = 32.f / 180.f * 3.14159f; int PTPipline = E_LIGHT_GEOMETRY::ELG_SPHERE; - int renderMode = E_RENDER_MODE::ERM_GLSL; + int renderMode = E_RENDER_MODE::ERM_HLSL; int spp = 32; int depth = 3; From 4d3e04698b85f31910d8d28b82925fe2b641adae Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 10 Mar 2025 14:55:22 +0700 Subject: [PATCH 067/296] removed intersectdata usage, fix emissive bug --- .../app_resources/hlsl/common.hlsl | 76 ++++--- .../app_resources/hlsl/intersector.hlsl | 207 +++++++----------- .../app_resources/hlsl/material_system.hlsl | 6 +- .../hlsl/next_event_estimator.hlsl | 40 ++-- .../app_resources/hlsl/pathtracer.hlsl | 38 ++-- .../app_resources/hlsl/render.comp.hlsl | 8 +- .../app_resources/hlsl/scene.hlsl | 51 +++-- 7 files changed, 214 insertions(+), 212 deletions(-) diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl index 0fd595bca..f67716060 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl @@ -105,17 +105,33 @@ struct BxDFNode NBL_CONSTEXPR_STATIC_INLINE uint32_t INVALID_ID = 0xffffu; + // for diffuse bxdfs + static BxDFNode create(uint32_t materialType, bool isAniso, 
NBL_CONST_REF_ARG(float32_t2) A, NBL_CONST_REF_ARG(spectral_type) albedo) + { + BxDFNode retval; + retval.albedo = albedo; + retval.materialType = materialType; + retval.params.is_aniso = isAniso; + retval.params.A = hlsl::max(A, 1e-4); + retval.params.ior0 = (spectral_type)1.0; + retval.params.ior1 = (spectral_type)1.0; + return retval; + } + + // for conductor + dielectric static BxDFNode create(uint32_t materialType, bool isAniso, NBL_CONST_REF_ARG(float32_t2) A, NBL_CONST_REF_ARG(spectral_type) ior0, NBL_CONST_REF_ARG(spectral_type) ior1) { BxDFNode retval; + retval.albedo = (spectral_type)1.0; retval.materialType = materialType; retval.params.is_aniso = isAniso; - retval.params.A = A; + retval.params.A = hlsl::max(A, 1e-4); retval.params.ior0 = ior0; retval.params.ior1 = ior1; return retval; } + spectral_type albedo; uint32_t materialType; params_type params; }; @@ -149,32 +165,39 @@ enum PTPolygonMethod : uint16_t PPM_APPROX_PROJECTED_SOLID_ANGLE }; -namespace Intersector -{ -// ray query method -// ray query struct holds AS info -// pass in address to vertex/index buffers? +// namespace Intersector +// { +// // ray query method +// // ray query struct holds AS info +// // pass in address to vertex/index buffers? -// ray tracing pipeline method +// // ray tracing pipeline method -// procedural data store: [obj count] [intersect type] [obj1] [obj2] [...] +// // procedural data store: [obj count] [intersect type] [obj1] [obj2] [...] -struct IntersectData -{ - enum Mode : uint32_t // enum class? - { - RAY_QUERY, - RAY_TRACING, - PROCEDURAL - }; +// struct IntersectData +// { +// enum Mode : uint32_t // enum class? 
+// { +// RAY_QUERY, +// RAY_TRACING, +// PROCEDURAL +// }; - NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 128; +// NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 128; - uint32_t mode : 1; - uint32_t unused : 31; // possible space for flags - uint32_t data[DataSize]; +// uint32_t mode : 2; +// uint32_t unused : 30; // possible space for flags +// uint32_t data[DataSize]; +// }; +// } + +enum IntersectMode : uint32_t +{ + IM_RAY_QUERY, + IM_RAY_TRACING, + IM_PROCEDURAL }; -} namespace NextEventEstimator { @@ -182,17 +205,10 @@ namespace NextEventEstimator struct Event { - enum Mode : uint32_t // enum class? - { - RAY_QUERY, - RAY_TRACING, - PROCEDURAL - }; - NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 16; - uint32_t mode : 1; - uint32_t unused : 31; // possible space for flags + uint32_t mode : 2; + uint32_t unused : 30; // possible space for flags uint32_t data[DataSize]; }; } diff --git a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl index 68ea75dd3..03a45f866 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl @@ -41,7 +41,7 @@ struct Comprehensive { ray.intersectionT = t; objectID.id = i; - objectID.mode = IntersectData::Mode::PROCEDURAL; + objectID.mode = IM_PROCEDURAL; objectID.shapeType = PST_SPHERE; } } @@ -55,7 +55,7 @@ struct Comprehensive { ray.intersectionT = t; objectID.id = i; - objectID.mode = IntersectData::Mode::PROCEDURAL; + objectID.mode = IM_PROCEDURAL; objectID.shapeType = PST_TRIANGLE; } } @@ -69,7 +69,7 @@ struct Comprehensive { ray.intersectionT = t; objectID.id = i; - objectID.mode = IntersectData::Mode::PROCEDURAL; + objectID.mode = IM_PROCEDURAL; objectID.shapeType = PST_TRIANGLE; } } @@ -81,90 +81,90 @@ struct Comprehensive // note for future consideration: still need to encode to IntersectData? // obsolete? 
- static ObjectID traceProcedural(NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(IntersectData) intersect) - { - const bool anyHit = ray.intersectionT != numeric_limits::max; - const uint32_t objCount = intersect.data[0]; - const ProceduralShapeType type = (ProceduralShapeType)intersect.data[1]; - - ObjectID objectID = ray.objectID; - objectID.mode = IntersectData::Mode::PROCEDURAL; - objectID.shapeType = type; - for (int i = 0; i < objCount; i++) - { - float t; - switch (type) - { - case PST_SPHERE: - { - vector3_type position = vector3_type(asfloat(intersect.data[2 + i * Shape::ObjSize]), asfloat(intersect.data[2 + i * Shape::ObjSize + 1]), asfloat(intersect.data[2 + i * Shape::ObjSize + 2])); - Shape sphere = Shape::create(position, asfloat(intersect.data[2 + i * Shape::ObjSize + 3]), intersect.data[2 + i * Shape::ObjSize + 4]); - t = sphere.intersect(ray.origin, ray.direction); - } - break; - case PST_TRIANGLE: - { - vector3_type vertex0 = vector3_type(asfloat(intersect.data[2 + i * Shape::ObjSize]), asfloat(intersect.data[2 + i * Shape::ObjSize + 1]), asfloat(intersect.data[2 + i * Shape::ObjSize + 2])); - vector3_type vertex1 = vector3_type(asfloat(intersect.data[2 + i * Shape::ObjSize + 3]), asfloat(intersect.data[2 + i * Shape::ObjSize + 4]), asfloat(intersect.data[2 + i * Shape::ObjSize + 5])); - vector3_type vertex2 = vector3_type(asfloat(intersect.data[2 + i * Shape::ObjSize + 6]), asfloat(intersect.data[2 + i * Shape::ObjSize + 7]), asfloat(intersect.data[2 + i * Shape::ObjSize + 8])); - Shape tri = Shape::create(vertex0, vertex1, vertex2, intersect.data[2 + i * Shape::ObjSize + 9]); - t = tri.intersect(ray.origin, ray.direction); - } - break; - case PST_RECTANGLE: - { - vector3_type offset = vector3_type(asfloat(intersect.data[2 + i * Shape::ObjSize]), asfloat(intersect.data[2 + i * Shape::ObjSize + 1]), asfloat(intersect.data[2 + i * Shape::ObjSize + 2])); - vector3_type edge0 = vector3_type(asfloat(intersect.data[2 + i * Shape::ObjSize + 3]), 
asfloat(intersect.data[2 + i * Shape::ObjSize + 4]), asfloat(intersect.data[2 + i * Shape::ObjSize + 5])); - vector3_type edge1 = vector3_type(asfloat(intersect.data[2 + i * Shape::ObjSize + 6]), asfloat(intersect.data[2 + i * Shape::ObjSize + 7]), asfloat(intersect.data[2 + i * Shape::ObjSize + 8])); - Shape rect = Shape::create(offset, edge0, edge1, intersect.data[2 + i * Shape::ObjSize + 9]); - t = rect.intersect(ray.origin, ray.direction); - } - break; - default: - t = numeric_limits::infinity; - break; - } - - bool closerIntersection = t > 0.0 && t < ray.intersectionT; - - ray.intersectionT = closerIntersection ? t : ray.intersectionT; - objectID.id = closerIntersection ? i : objectID.id; - - // allowing early out results in a performance regression, WTF!? - //if (anyHit && closerIntersection) - //break; - } - return objectID; - } + // static ObjectID traceProcedural(NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(IntersectData) intersect) + // { + // const bool anyHit = ray.intersectionT != numeric_limits::max; + // const uint32_t objCount = intersect.data[0]; + // const ProceduralShapeType type = (ProceduralShapeType)intersect.data[1]; + + // ObjectID objectID = ray.objectID; + // objectID.mode = IM_PROCEDURAL; + // objectID.shapeType = type; + // for (int i = 0; i < objCount; i++) + // { + // float t; + // switch (type) + // { + // case PST_SPHERE: + // { + // vector3_type position = vector3_type(asfloat(intersect.data[2 + i * Shape::ObjSize]), asfloat(intersect.data[2 + i * Shape::ObjSize + 1]), asfloat(intersect.data[2 + i * Shape::ObjSize + 2])); + // Shape sphere = Shape::create(position, asfloat(intersect.data[2 + i * Shape::ObjSize + 3]), intersect.data[2 + i * Shape::ObjSize + 4]); + // t = sphere.intersect(ray.origin, ray.direction); + // } + // break; + // case PST_TRIANGLE: + // { + // vector3_type vertex0 = vector3_type(asfloat(intersect.data[2 + i * Shape::ObjSize]), asfloat(intersect.data[2 + i * Shape::ObjSize + 1]), asfloat(intersect.data[2 + i 
* Shape::ObjSize + 2])); + // vector3_type vertex1 = vector3_type(asfloat(intersect.data[2 + i * Shape::ObjSize + 3]), asfloat(intersect.data[2 + i * Shape::ObjSize + 4]), asfloat(intersect.data[2 + i * Shape::ObjSize + 5])); + // vector3_type vertex2 = vector3_type(asfloat(intersect.data[2 + i * Shape::ObjSize + 6]), asfloat(intersect.data[2 + i * Shape::ObjSize + 7]), asfloat(intersect.data[2 + i * Shape::ObjSize + 8])); + // Shape tri = Shape::create(vertex0, vertex1, vertex2, intersect.data[2 + i * Shape::ObjSize + 9]); + // t = tri.intersect(ray.origin, ray.direction); + // } + // break; + // case PST_RECTANGLE: + // { + // vector3_type offset = vector3_type(asfloat(intersect.data[2 + i * Shape::ObjSize]), asfloat(intersect.data[2 + i * Shape::ObjSize + 1]), asfloat(intersect.data[2 + i * Shape::ObjSize + 2])); + // vector3_type edge0 = vector3_type(asfloat(intersect.data[2 + i * Shape::ObjSize + 3]), asfloat(intersect.data[2 + i * Shape::ObjSize + 4]), asfloat(intersect.data[2 + i * Shape::ObjSize + 5])); + // vector3_type edge1 = vector3_type(asfloat(intersect.data[2 + i * Shape::ObjSize + 6]), asfloat(intersect.data[2 + i * Shape::ObjSize + 7]), asfloat(intersect.data[2 + i * Shape::ObjSize + 8])); + // Shape rect = Shape::create(offset, edge0, edge1, intersect.data[2 + i * Shape::ObjSize + 9]); + // t = rect.intersect(ray.origin, ray.direction); + // } + // break; + // default: + // t = numeric_limits::infinity; + // break; + // } + + // bool closerIntersection = t > 0.0 && t < ray.intersectionT; + + // ray.intersectionT = closerIntersection ? t : ray.intersectionT; + // objectID.id = closerIntersection ? i : objectID.id; + + // // allowing early out results in a performance regression, WTF!? + // //if (anyHit && closerIntersection) + // //break; + // } + // return objectID; + // } // obsolete? 
- static ObjectID traceRay(NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(IntersectData) intersect) - { - const IntersectData::Mode mode = (IntersectData::Mode)intersect.mode; - switch (mode) - { - case IntersectData::Mode::RAY_QUERY: - { - // TODO: do ray query stuff - } - break; - case IntersectData::Mode::RAY_TRACING: - { - // TODO: do ray tracing stuff - } - break; - case IntersectData::Mode::PROCEDURAL: - { - return traceProcedural(ray, intersect); - } - break; - default: - { - return ObjectID::create(-1, 0, PST_SPHERE); - } - } - return ObjectID::create(-1, 0, PST_SPHERE); - } + // static ObjectID traceRay(NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(IntersectData) intersect) + // { + // const uint32_t mode = intersect.mode; + // switch (mode) + // { + // case IM_RAY_QUERY: + // { + // // TODO: do ray query stuff + // } + // break; + // case IM_RAY_TRACING: + // { + // // TODO: do ray tracing stuff + // } + // break; + // case IM_PROCEDURAL: + // { + // return traceProcedural(ray, intersect); + // } + // break; + // default: + // { + // return ObjectID::create(-1, 0, PST_SPHERE); + // } + // } + // return ObjectID::create(-1, 0, PST_SPHERE); + // } // static ObjectID traceRay(NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(scene_type) scene) // { @@ -198,43 +198,6 @@ struct Comprehensive // } }; -// does everything in traceray in ex 30 -// template -// struct Procedural -// { -// using scalar_type = typename Ray::scalar_type; -// using ray_type = Ray; - -// static int traceRay(NBL_REF_ARG(ray_type) ray, IIntersection objects[32], int objCount) -// { -// const bool anyHit = ray.intersectionT != numeric_limits::max; - -// int objectID = -1; -// for (int i = 0; i < objCount; i++) -// { -// float t; -// if (objects[i].type == PST_SPHERE) // we don't know what type of intersection it is so cast, there has to be a better way to do this -// { -// Shape sphere = (Shape)objects[i]; -// t = sphere.intersect(ray.origin, ray.direction); -// } -// // TODO: other types - -// 
bool closerIntersection = t > 0.0 && t < ray.intersectionT; - -// ray.intersectionT = closerIntersection ? t : ray.intersectionT; -// objectID = closerIntersection ? i : objectID; - -// // allowing early out results in a performance regression, WTF!? -// //if (anyHit && closerIntersection) -// //break; -// } -// return objectID; -// } - -// // TODO? traceray with vertex/index buffer -// }; - } } } diff --git a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl index 09236c85e..7fb153791 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl @@ -25,8 +25,8 @@ struct Material NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 32; - uint32_t type : 1; - uint32_t unused : 31; // possible space for flags + uint32_t type : 2; + uint32_t unused : 30; // possible space for flags uint32_t data[DataSize]; }; @@ -66,7 +66,7 @@ struct System case Material::Type::DIFFUSE: { diffuseBxDF.init(cparams); - return (measure_type)diffuseBxDF.eval(params); + return cparams.albedo * (measure_type)diffuseBxDF.eval(params); } break; case Material::Type::CONDUCTOR: diff --git a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl index 38a5fae15..949db8456 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl @@ -35,16 +35,28 @@ struct Estimator { case PST_SPHERE: { - vector3_type position = vector3_type(asfloat(event.data[2]), asfloat(event.data[3]), asfloat(event.data[4])); - Shape sphere = Shape::create(position, asfloat(event.data[5]), event.data[6]); + const vector3_type position = vector3_type( + bit_cast(event.data[2]), + bit_cast(event.data[3]), + bit_cast(event.data[4])); + Shape sphere = Shape::create(position, bit_cast(event.data[5]), event.data[6]); pdf *= sphere.template 
deferredPdf(ray); } break; case PST_TRIANGLE: { - vector3_type vertex0 = vector3_type(asfloat(event.data[2]), asfloat(event.data[3]), asfloat(event.data[4])); - vector3_type vertex1 = vector3_type(asfloat(event.data[5]), asfloat(event.data[6]), asfloat(event.data[7])); - vector3_type vertex2 = vector3_type(asfloat(event.data[8]), asfloat(event.data[9]), asfloat(event.data[10])); + const vector3_type vertex0 = vector3_type( + bit_cast(event.data[2]), + bit_cast(event.data[3]), + bit_cast(event.data[4])); + const vector3_type vertex1 = vector3_type( + bit_cast(event.data[5]), + bit_cast(event.data[6]), + bit_cast(event.data[7])); + const vector3_type vertex2 = vector3_type( + bit_cast(event.data[8]), + bit_cast(event.data[9]), + bit_cast(event.data[10])); Shape tri = Shape::create(vertex0, vertex1, vertex2, event.data[11]); pdf *= tri.template deferredPdf(ray); } @@ -59,7 +71,7 @@ struct Estimator } break; default: - pdf = numeric_limits::infinity; + pdf = bit_cast(numeric_limits::infinity); break; } @@ -68,20 +80,20 @@ struct Estimator static spectral_type deferredEvalAndPdf(NBL_REF_ARG(scalar_type) pdf, NBL_CONST_REF_ARG(light_type) light, NBL_CONST_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(Event) event) { - const Event::Mode mode = (Event::Mode)event.mode; + const uint32_t mode = event.mode; switch (mode) { - case Event::Mode::RAY_QUERY: + case IM_RAY_QUERY: { // TODO: do ray query stuff } break; - case Event::Mode::RAY_TRACING: + case IM_RAY_TRACING: { // TODO: do ray tracing stuff } break; - case Event::Mode::PROCEDURAL: + case IM_PROCEDURAL: { return proceduralDeferredEvalAndPdf(pdf, light, ray, event); } @@ -156,21 +168,21 @@ struct Estimator static sample_type generate_and_quotient_and_pdf(NBL_REF_ARG(quotient_pdf_type) quotient_pdf, NBL_REF_ARG(scalar_type) newRayMaxT, NBL_CONST_REF_ARG(light_type) light, NBL_CONST_REF_ARG(vector3_type) origin, NBL_CONST_REF_ARG(interaction_type) interaction, bool isBSDF, NBL_CONST_REF_ARG(vector3_type) xi, uint32_t depth, 
NBL_CONST_REF_ARG(Event) event) { - const Event::Mode mode = (Event::Mode)event.mode; + const uint32_t mode = event.mode; sample_type L; switch (mode) { - case Event::Mode::RAY_QUERY: + case IM_RAY_QUERY: { // TODO: do ray query stuff } break; - case Event::Mode::RAY_TRACING: + case IM_RAY_TRACING: { // TODO: do ray tracing stuff } break; - case Event::Mode::PROCEDURAL: + case IM_PROCEDURAL: { return procedural_generate_and_quotient_and_pdf(quotient_pdf, newRayMaxT, light, origin, interaction, isBSDF, xi, depth, event); } diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl index 6b49bc758..df4792a9c 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl @@ -111,14 +111,14 @@ struct Unidirectional uint32_t bsdfLightIDs; anisotropic_type interaction; isotropic_type iso_interaction; - ext::Intersector::IntersectData::Mode mode = (ext::Intersector::IntersectData::Mode)objectID.mode; + uint32_t mode = objectID.mode; switch (mode) { // TODO - case ext::Intersector::IntersectData::Mode::RAY_QUERY: - case ext::Intersector::IntersectData::Mode::RAY_TRACING: + case IM_RAY_QUERY: + case IM_RAY_TRACING: break; - case ext::Intersector::IntersectData::Mode::PROCEDURAL: + case IM_PROCEDURAL: { bsdfLightIDs = scene.getBsdfLightIDs(objectID); vector3_type N = scene.getNormal(objectID, intersection); @@ -139,10 +139,12 @@ struct Unidirectional const uint32_t lightID = glsl::bitfieldExtract(bsdfLightIDs, 16, 16); if (lightID != light_type::INVALID_ID) { - float pdf; - ray.payload.accumulation += nee.deferredEvalAndPdf(pdf, scene.lights[lightID], ray, scene.toNextEvent(lightID)) * throughput / (1.0 + pdf * pdf * ray.payload.otherTechniqueHeuristic); + float _pdf; + ray.payload.accumulation += nee.deferredEvalAndPdf(_pdf, scene.lights[lightID], ray, scene.toNextEvent(lightID)) * throughput / (1.0 + _pdf * _pdf * ray.payload.otherTechniqueHeuristic); 
} + return false; // emissive only + const uint32_t bsdfID = glsl::bitfieldExtract(bsdfLightIDs, 0, 16); if (bsdfID == bxdfnode_type::INVALID_ID) return false; @@ -209,7 +211,7 @@ struct Unidirectional params = params_type::template create(nee_sample, interaction, _cache, bxdf::BCM_MAX); else { - isocache_type isocache = (isocache_type)_cache; + isocache_type isocache = _cache.iso_cache; params = params_type::template create(nee_sample, iso_interaction, isocache, bxdf::BCM_MAX); } } @@ -223,7 +225,7 @@ struct Unidirectional params = params_type::template create(nee_sample, interaction, _cache, bxdf::BCM_ABS); else { - isocache_type isocache = (isocache_type)_cache; + isocache_type isocache = _cache.iso_cache; params = params_type::template create(nee_sample, iso_interaction, isocache, bxdf::BCM_ABS); } } @@ -232,10 +234,11 @@ struct Unidirectional bsdf_quotient_pdf.quotient *= throughput; neeContrib_pdf.quotient *= bsdf_quotient_pdf.quotient; const scalar_type otherGenOverChoice = bsdf_quotient_pdf.pdf * rcpChoiceProb; - const scalar_type otherGenOverLightAndChoice = otherGenOverChoice / bsdf_quotient_pdf.pdf; - neeContrib_pdf.quotient *= otherGenOverChoice / (1.f + otherGenOverLightAndChoice * otherGenOverLightAndChoice); // balance heuristic + // const scalar_type otherGenOverLightAndChoice = otherGenOverChoice / bsdf_quotient_pdf.pdf; + // neeContrib_pdf.quotient *= otherGenOverChoice / (1.f + otherGenOverLightAndChoice * otherGenOverLightAndChoice); // balance heuristic // TODO: ifdef NEE only + neeContrib_pdf.quotient *= otherGenOverChoice; ray_type nee_ray; nee_ray.origin = intersection + nee_sample.L.direction * t * Tolerance::getStart(depth); @@ -247,6 +250,8 @@ struct Unidirectional } } + return false; // NEE only + // sample BSDF scalar_type bxdfPdf; vector3_type bxdfSample; @@ -341,21 +346,20 @@ struct Unidirectional // bounces bool hit = true; bool rayAlive = true; - // TODO for (int d = 1; d <= depth && hit && rayAlive; d += 2) - // TODO { + for (int d 
= 1; d <= depth && hit && rayAlive; d += 2) + { ray.intersectionT = numeric_limits::max; - ray.objectID.id = -1; ray.objectID = intersector_type::traceRay(ray, scene); hit = ray.objectID.id != -1; if (hit) { - float pp = float(ray.objectID.id) / 10.0; - ray.payload.accumulation = measure_type(pp, 1.0-pp, 0.3); - // TODO rayAlive = closestHitProgram(1, i, ray, scene); + // float pp = float(ray.objectID.id) / 10.0; + // ray.payload.accumulation = measure_type(pp, 1.0-pp, 0.3); + rayAlive = closestHitProgram(1, i, ray, scene); } - // TODO } + } if (!hit) missProgram(ray); diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl index f23d042a8..e25961b56 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl @@ -115,14 +115,14 @@ static const ext::Shape rectangles[RECTANGLE_COUNT] = { #define LIGHT_COUNT 1 static const light_type lights[LIGHT_COUNT] = { - light_type::create(spectral_t(30.0,25.0,15.0), ext::ObjectID::create(8u, ext::NextEventEstimator::Event::Mode::PROCEDURAL, LIGHT_TYPE)) + light_type::create(spectral_t(30.0,25.0,15.0), ext::ObjectID::create(8u, ext::IntersectMode::IM_PROCEDURAL, LIGHT_TYPE)) }; #define BXDF_COUNT 7 static const bxdfnode_type bxdfs[BXDF_COUNT] = { - bxdfnode_type::create(ext::MaterialSystem::Material::Type::DIFFUSE, false, float2(0,0), spectral_t(1,1,1), spectral_t(1.25,1.25,1.25)), - bxdfnode_type::create(ext::MaterialSystem::Material::Type::DIFFUSE, false, float2(0,0), spectral_t(1,1,1), spectral_t(1.25,2.5,2.5)), - bxdfnode_type::create(ext::MaterialSystem::Material::Type::DIFFUSE, false, float2(0,0), spectral_t(1,1,1), spectral_t(2.5,1.25,2.5)), + bxdfnode_type::create(ext::MaterialSystem::Material::Type::DIFFUSE, false, float2(0,0), spectral_t(0.8,0.8,0.8)), + bxdfnode_type::create(ext::MaterialSystem::Material::Type::DIFFUSE, false, float2(0,0), spectral_t(0.8,0.4,0.4)), + 
bxdfnode_type::create(ext::MaterialSystem::Material::Type::DIFFUSE, false, float2(0,0), spectral_t(0.4,0.8,0.4)), bxdfnode_type::create(ext::MaterialSystem::Material::Type::CONDUCTOR, false, float2(0,0), spectral_t(1,1,1), spectral_t(0.98,0.98,0.77)), bxdfnode_type::create(ext::MaterialSystem::Material::Type::CONDUCTOR, false, float2(0,0), spectral_t(1,1,1), spectral_t(0.98,0.77,0.98)), bxdfnode_type::create(ext::MaterialSystem::Material::Type::CONDUCTOR, false, float2(0.15,0.15), spectral_t(1,1,1), spectral_t(0.98,0.77,0.98)), diff --git a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl index 1c17e2531..5b4178ec4 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl @@ -128,40 +128,47 @@ struct Scene case PST_SPHERE: { Shape sphere = spheres[id]; - retval.data[2] = asuint(sphere.position.x); - retval.data[3] = asuint(sphere.position.y); - retval.data[4] = asuint(sphere.position.z); - retval.data[5] = asuint(sphere.radius2); + uint32_t3 position = bit_cast(sphere.position); + retval.data[2] = position.x; + retval.data[3] = position.y; + retval.data[4] = position.z; + retval.data[5] = bit_cast(sphere.radius2); retval.data[6] = sphere.bsdfLightIDs; } break; case PST_TRIANGLE: { Shape tri = triangles[id]; - retval.data[2] = asuint(tri.vertex0.x); - retval.data[3] = asuint(tri.vertex0.y); - retval.data[4] = asuint(tri.vertex0.z); - retval.data[5] = asuint(tri.vertex1.x); - retval.data[6] = asuint(tri.vertex1.y); - retval.data[7] = asuint(tri.vertex1.z); - retval.data[8] = asuint(tri.vertex2.x); - retval.data[9] = asuint(tri.vertex2.y); - retval.data[10] = asuint(tri.vertex2.z); + uint32_t3 vertex = bit_cast(tri.vertex0); + retval.data[2] = vertex.x; + retval.data[3] = vertex.y; + retval.data[4] = vertex.z; + vertex = bit_cast(tri.vertex1); + retval.data[5] = vertex.x; + retval.data[6] = vertex.y; + retval.data[7] = vertex.z; + vertex = bit_cast(tri.vertex2); + 
retval.data[8] = vertex.x; + retval.data[9] = vertex.y; + retval.data[10] = vertex.z; retval.data[11] = tri.bsdfLightIDs; } break; case PST_RECTANGLE: { Shape rect = rectangles[id]; - retval.data[2] = asuint(rect.offset.x); - retval.data[3] = asuint(rect.offset.y); - retval.data[4] = asuint(rect.offset.z); - retval.data[5] = asuint(rect.edge0.x); - retval.data[6] = asuint(rect.edge0.y); - retval.data[7] = asuint(rect.edge0.z); - retval.data[8] = asuint(rect.edge1.x); - retval.data[9] = asuint(rect.edge1.y); - retval.data[10] = asuint(rect.edge1.z); + uint32_t3 tmp = bit_cast(rect.offset); + retval.data[2] = tmp.x; + retval.data[3] = tmp.y; + retval.data[4] = tmp.z; + tmp = bit_cast(rect.edge0); + retval.data[5] = tmp.x; + retval.data[6] = tmp.y; + retval.data[7] = tmp.z; + tmp = bit_cast(rect.edge1); + retval.data[8] = tmp.x; + retval.data[9] = tmp.y; + retval.data[10] = tmp.z; retval.data[11] = rect.bsdfLightIDs; } break; From e7d4670fca8009843abde9cf4fc1ddd6aedc9290 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 10 Mar 2025 17:42:58 +0700 Subject: [PATCH 068/296] fixed light sampling nee --- .../app_resources/glsl/common.glsl | 2 +- .../app_resources/hlsl/material_system.hlsl | 15 +++-- .../hlsl/next_event_estimator.hlsl | 57 ++++++++++++---- .../app_resources/hlsl/pathtracer.hlsl | 67 +++++++++---------- 4 files changed, 86 insertions(+), 55 deletions(-) diff --git a/31_HLSLPathTracer/app_resources/glsl/common.glsl b/31_HLSLPathTracer/app_resources/glsl/common.glsl index 2463f82cf..15b3662d0 100644 --- a/31_HLSLPathTracer/app_resources/glsl/common.glsl +++ b/31_HLSLPathTracer/app_resources/glsl/common.glsl @@ -7,7 +7,7 @@ //#define VISUALIZE_HIGH_VARIANCE // debug -//#define NEE_ONLY +#define NEE_ONLY 1 layout(set = 2, binding = 0) uniform sampler2D envMap; layout(set = 2, binding = 1) uniform usamplerBuffer sampleSequence; diff --git a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl 
index 7fb153791..af8d5b131 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl @@ -23,7 +23,7 @@ struct Material DIELECTRIC }; - NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 32; + NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 1; uint32_t type : 2; uint32_t unused : 30; // possible space for flags @@ -66,7 +66,7 @@ struct System case Material::Type::DIFFUSE: { diffuseBxDF.init(cparams); - return cparams.albedo * (measure_type)diffuseBxDF.eval(params); + return (measure_type)diffuseBxDF.eval(params); } break; case Material::Type::CONDUCTOR: @@ -123,8 +123,13 @@ struct System quotient_pdf_type quotient_and_pdf(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(create_params_t) cparams, NBL_CONST_REF_ARG(params_t) params) { + + const bool transmissive = material.type == Material::Type::DIELECTRIC; + const float clampedNdotV = math::conditionalAbsOrMax(transmissive, params.uNdotV, 0.0); + const float clampedNdotL = math::conditionalAbsOrMax(transmissive, params.uNdotL, 0.0); + const float minimumProjVectorLen = 0.00000001; - if (params.NdotV > minimumProjVectorLen && params.NdotL > minimumProjVectorLen) + if (clampedNdotV > minimumProjVectorLen && clampedNdotL > minimumProjVectorLen) { switch(material.type) { @@ -147,10 +152,10 @@ struct System } break; default: - return quotient_pdf_type::create((measure_type)0.0, numeric_limits::infinity); + return quotient_pdf_type::create((measure_type)0.0, 0.0); } } - return quotient_pdf_type::create((measure_type)0.0, numeric_limits::infinity); + return quotient_pdf_type::create((measure_type)0.0, 0.0); } DiffuseBxDF diffuseBxDF; diff --git a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl index 949db8456..65646b3c1 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl @@ 
-63,9 +63,18 @@ struct Estimator break; case PST_RECTANGLE: { - vector3_type offset = vector3_type(asfloat(event.data[2]), asfloat(event.data[3]), asfloat(event.data[4])); - vector3_type edge0 = vector3_type(asfloat(event.data[5]), asfloat(event.data[6]), asfloat(event.data[7])); - vector3_type edge1 = vector3_type(asfloat(event.data[8]), asfloat(event.data[9]), asfloat(event.data[10])); + const vector3_type offset = vector3_type( + bit_cast(event.data[2]), + bit_cast(event.data[3]), + bit_cast(event.data[4])); + const vector3_type edge0 = vector3_type( + bit_cast(event.data[5]), + bit_cast(event.data[6]), + bit_cast(event.data[7])); + const vector3_type edge1 = vector3_type( + bit_cast(event.data[8]), + bit_cast(event.data[9]), + bit_cast(event.data[10])); Shape rect = Shape::create(offset, edge0, edge1, event.data[11]); pdf *= rect.template deferredPdf(ray); } @@ -115,8 +124,12 @@ struct Estimator { case PST_SPHERE: { - vector3_type position = vector3_type(asfloat(event.data[2]), asfloat(event.data[3]), asfloat(event.data[4])); - Shape sphere = Shape::create(position, asfloat(event.data[5]), event.data[6]); + const vector3_type position = vector3_type( + bit_cast(event.data[2]), + bit_cast(event.data[3]), + bit_cast(event.data[4])); + Shape sphere = Shape::create(position, bit_cast(event.data[5]), event.data[6]); + const vector3_type sampleL = sphere.template generate_and_pdf(pdf, newRayMaxT, origin, interaction, isBSDF, xi); const vector3_type V = interaction.isotropic.V.getDirection(); const scalar_type VdotL = nbl::hlsl::dot(V, sampleL); @@ -127,10 +140,20 @@ struct Estimator break; case PST_TRIANGLE: { - vector3_type vertex0 = vector3_type(asfloat(event.data[2]), asfloat(event.data[3]), asfloat(event.data[4])); - vector3_type vertex1 = vector3_type(asfloat(event.data[5]), asfloat(event.data[6]), asfloat(event.data[7])); - vector3_type vertex2 = vector3_type(asfloat(event.data[8]), asfloat(event.data[9]), asfloat(event.data[10])); + const vector3_type vertex0 
= vector3_type( + bit_cast(event.data[2]), + bit_cast(event.data[3]), + bit_cast(event.data[4])); + const vector3_type vertex1 = vector3_type( + bit_cast(event.data[5]), + bit_cast(event.data[6]), + bit_cast(event.data[7])); + const vector3_type vertex2 = vector3_type( + bit_cast(event.data[8]), + bit_cast(event.data[9]), + bit_cast(event.data[10])); Shape tri = Shape::create(vertex0, vertex1, vertex2, event.data[11]); + const vector3_type sampleL = tri.template generate_and_pdf(pdf, newRayMaxT, origin, interaction, isBSDF, xi); const vector3_type V = interaction.isotropic.V.getDirection(); const scalar_type VdotL = nbl::hlsl::dot(V, sampleL); @@ -141,10 +164,20 @@ struct Estimator break; case PST_RECTANGLE: { - vector3_type offset = vector3_type(asfloat(event.data[2]), asfloat(event.data[3]), asfloat(event.data[4])); - vector3_type edge0 = vector3_type(asfloat(event.data[5]), asfloat(event.data[6]), asfloat(event.data[7])); - vector3_type edge1 = vector3_type(asfloat(event.data[8]), asfloat(event.data[9]), asfloat(event.data[10])); + const vector3_type offset = vector3_type( + bit_cast(event.data[2]), + bit_cast(event.data[3]), + bit_cast(event.data[4])); + const vector3_type edge0 = vector3_type( + bit_cast(event.data[5]), + bit_cast(event.data[6]), + bit_cast(event.data[7])); + const vector3_type edge1 = vector3_type( + bit_cast(event.data[8]), + bit_cast(event.data[9]), + bit_cast(event.data[10])); Shape rect = Shape::create(offset, edge0, edge1, event.data[11]); + const vector3_type sampleL = rect.template generate_and_pdf(pdf, newRayMaxT, origin, interaction, isBSDF, xi); const vector3_type V = interaction.isotropic.V.getDirection(); const scalar_type VdotL = nbl::hlsl::dot(V, sampleL); @@ -154,7 +187,7 @@ struct Estimator } break; default: - pdf = numeric_limits::infinity; + pdf = bit_cast(numeric_limits::infinity); break; } diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl index 
df4792a9c..6f1518a46 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl @@ -99,7 +99,7 @@ struct Unidirectional scalar_type getLuma(NBL_CONST_REF_ARG(vector3_type) col) { - return nbl::hlsl::dot(nbl::hlsl::transpose(colorspace::scRGBtoXYZ)[1], col); + return hlsl::dot(hlsl::transpose(colorspace::scRGBtoXYZ)[1], col); } // TODO: probably will only work with procedural shapes, do the other ones @@ -123,8 +123,8 @@ struct Unidirectional bsdfLightIDs = scene.getBsdfLightIDs(objectID); vector3_type N = scene.getNormal(objectID, intersection); N = nbl::hlsl::normalize(N); - typename isotropic_type::ray_dir_info_type V; - V.direction = nbl::hlsl::normalize(-ray.direction); + ray_dir_info_type V; + V.direction = -ray.direction; isotropic_type iso_interaction = isotropic_type::create(V, N); interaction = anisotropic_type::create(iso_interaction); } @@ -143,8 +143,6 @@ struct Unidirectional ray.payload.accumulation += nee.deferredEvalAndPdf(_pdf, scene.lights[lightID], ray, scene.toNextEvent(lightID)) * throughput / (1.0 + _pdf * _pdf * ray.payload.otherTechniqueHeuristic); } - return false; // emissive only - const uint32_t bsdfID = glsl::bitfieldExtract(bsdfLightIDs, 0, 16); if (bsdfID == bxdfnode_type::INVALID_ID) return false; @@ -163,9 +161,9 @@ struct Unidirectional // thresholds const scalar_type bxdfPdfThreshold = 0.0001; const scalar_type lumaContributionThreshold = getLuma(colorspace::eotf::sRGB((vector3_type)1.0 / 255.0)); // OETF smallest perceptible value - const vector3_type throughputCIE_Y = nbl::hlsl::transpose(colorspace::sRGBtoXYZ)[1] * throughput; // TODO: this only works if spectral_type is dim 3 + const vector3_type throughputCIE_Y = hlsl::transpose(colorspace::sRGBtoXYZ)[1] * throughput; // TODO: this only works if spectral_type is dim 3 const measure_type eta = bxdf.params.ior0 / bxdf.params.ior1; // assume it's real, not imaginary? 
- const scalar_type monochromeEta = nbl::hlsl::dot(throughputCIE_Y, eta) / (throughputCIE_Y.r + throughputCIE_Y.g + throughputCIE_Y.b); // TODO: imaginary eta? + const scalar_type monochromeEta = hlsl::dot(throughputCIE_Y, eta) / (throughputCIE_Y.r + throughputCIE_Y.g + throughputCIE_Y.b); // TODO: imaginary eta? // sample lights const scalar_type neeProbability = 1.0; // BSDFNode_getNEEProb(bsdf); @@ -185,7 +183,6 @@ struct Unidirectional // but if we allowed non-watertight transmitters (single water surface), it would make sense just to apply this line by itself anisocache_type _cache; validPath = validPath && anisocache_type::template compute(_cache, interaction, nee_sample, monochromeEta); - bxdf.params.A = nbl::hlsl::max(bxdf.params.A, vector(0,0)); bxdf.params.eta = monochromeEta; if (neeContrib_pdf.pdf < numeric_limits::max) @@ -231,7 +228,7 @@ struct Unidirectional } quotient_pdf_type bsdf_quotient_pdf = materialSystem.quotient_and_pdf(material, bxdf.params, params); - bsdf_quotient_pdf.quotient *= throughput; + bsdf_quotient_pdf.quotient *= bxdf.albedo * throughput; neeContrib_pdf.quotient *= bsdf_quotient_pdf.quotient; const scalar_type otherGenOverChoice = bsdf_quotient_pdf.pdf * rcpChoiceProb; // const scalar_type otherGenOverLightAndChoice = otherGenOverChoice / bsdf_quotient_pdf.pdf; @@ -268,30 +265,30 @@ struct Unidirectional { params = params_type::template create(bsdf_sample, iso_interaction, bxdf::BCM_MAX); } - else if (!isBSDF && bxdf.materialType != ext::MaterialSystem::Material::DIFFUSE) - { - if (bxdf.params.is_aniso) - params = params_type::template create(bsdf_sample, interaction, _cache, bxdf::BCM_MAX); - else - { - isocache_type isocache = _cache.iso_cache; - params = params_type::template create(bsdf_sample, iso_interaction, isocache, bxdf::BCM_MAX); - } - } - else if (isBSDF && bxdf.materialType == ext::MaterialSystem::Material::DIFFUSE) - { - params = params_type::template create(bsdf_sample, iso_interaction, bxdf::BCM_ABS); - } - else 
if (isBSDF && bxdf.materialType != ext::MaterialSystem::Material::DIFFUSE) - { - if (bxdf.params.is_aniso) - params = params_type::template create(bsdf_sample, interaction, _cache, bxdf::BCM_ABS); - else - { - isocache_type isocache = _cache.iso_cache; - params = params_type::template create(bsdf_sample, iso_interaction, isocache, bxdf::BCM_ABS); - } - } + // else if (!isBSDF && bxdf.materialType != ext::MaterialSystem::Material::DIFFUSE) + // { + // if (bxdf.params.is_aniso) + // params = params_type::template create(bsdf_sample, interaction, _cache, bxdf::BCM_MAX); + // else + // { + // isocache_type isocache = _cache.iso_cache; + // params = params_type::template create(bsdf_sample, iso_interaction, isocache, bxdf::BCM_MAX); + // } + // } + // else if (isBSDF && bxdf.materialType == ext::MaterialSystem::Material::DIFFUSE) + // { + // params = params_type::template create(bsdf_sample, iso_interaction, bxdf::BCM_ABS); + // } + // else if (isBSDF && bxdf.materialType != ext::MaterialSystem::Material::DIFFUSE) + // { + // if (bxdf.params.is_aniso) + // params = params_type::template create(bsdf_sample, interaction, _cache, bxdf::BCM_ABS); + // else + // { + // isocache_type isocache = _cache.iso_cache; + // params = params_type::template create(bsdf_sample, iso_interaction, isocache, bxdf::BCM_ABS); + // } + // } // the value of the bsdf divided by the probability of the sample being generated quotient_pdf_type bsdf_quotient_pdf = materialSystem.quotient_and_pdf(material, bxdf.params, params); @@ -353,11 +350,7 @@ struct Unidirectional hit = ray.objectID.id != -1; if (hit) - { - // float pp = float(ray.objectID.id) / 10.0; - // ray.payload.accumulation = measure_type(pp, 1.0-pp, 0.3); rayAlive = closestHitProgram(1, i, ray, scene); - } } if (!hit) From 077d150bded805ce9ed10bc7711544576779ad39 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 11 Mar 2025 17:02:59 +0700 Subject: [PATCH 069/296] 1st working ver, sort of --- .../app_resources/glsl/common.glsl | 2 +- 
.../app_resources/hlsl/common.hlsl | 4 +- .../app_resources/hlsl/material_system.hlsl | 7 +- .../app_resources/hlsl/pathtracer.hlsl | 92 ++++++++++--------- 31_HLSLPathTracer/imgui.ini | 8 ++ 5 files changed, 63 insertions(+), 50 deletions(-) create mode 100644 31_HLSLPathTracer/imgui.ini diff --git a/31_HLSLPathTracer/app_resources/glsl/common.glsl b/31_HLSLPathTracer/app_resources/glsl/common.glsl index 15b3662d0..2463f82cf 100644 --- a/31_HLSLPathTracer/app_resources/glsl/common.glsl +++ b/31_HLSLPathTracer/app_resources/glsl/common.glsl @@ -7,7 +7,7 @@ //#define VISUALIZE_HIGH_VARIANCE // debug -#define NEE_ONLY 1 +//#define NEE_ONLY layout(set = 2, binding = 0) uniform sampler2D envMap; layout(set = 2, binding = 1) uniform usamplerBuffer sampleSequence; diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl index f67716060..ac1e0f09a 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl @@ -268,7 +268,7 @@ struct Shape } template - float32_t3 generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, float32_t3 xi) + float32_t3 generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, NBL_CONST_REF_ARG(float32_t3) xi) { float32_t3 Z = position - origin; const float distanceSQ = hlsl::dot(Z,Z); @@ -279,7 +279,7 @@ struct Shape Z *= rcpDistance; const float cosThetaMax = hlsl::sqrt(cosThetaMax2); - const float cosTheta = nbl::hlsl::mix(1.0, cosThetaMax, xi.x); + const float cosTheta = hlsl::mix(1.0, cosThetaMax, xi.x); float32_t3 L = Z * cosTheta; diff --git a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl index af8d5b131..0d739d9ec 100644 --- 
a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl @@ -123,13 +123,8 @@ struct System quotient_pdf_type quotient_and_pdf(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(create_params_t) cparams, NBL_CONST_REF_ARG(params_t) params) { - - const bool transmissive = material.type == Material::Type::DIELECTRIC; - const float clampedNdotV = math::conditionalAbsOrMax(transmissive, params.uNdotV, 0.0); - const float clampedNdotL = math::conditionalAbsOrMax(transmissive, params.uNdotL, 0.0); - const float minimumProjVectorLen = 0.00000001; - if (clampedNdotV > minimumProjVectorLen && clampedNdotL > minimumProjVectorLen) + if (params.NdotV > minimumProjVectorLen && params.NdotL > minimumProjVectorLen) { switch(material.type) { diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl index 6f1518a46..6ed89de7a 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl @@ -94,7 +94,7 @@ struct Unidirectional uint32_t address = glsl::bitfieldInsert(protoDimension, _sample, MAX_DEPTH_LOG2, MAX_SAMPLES_LOG2); uint32_t3 seqVal = sampleSequence[address + i].xyz; seqVal ^= randGen(); - return vector3_type(seqVal) * asfloat(0x2f800004u); + return vector3_type(seqVal) * bit_cast(0x2f800004u); } scalar_type getLuma(NBL_CONST_REF_ARG(vector3_type) col) @@ -177,6 +177,7 @@ struct Unidirectional scene.lights[lightID], intersection, interaction, isBSDF, eps0, depth, scene.toNextEvent(lightID) ); + //printf("%f %f %f\n", nee_sample.L.direction.x, nee_sample.L.direction.y, nee_sample.L.direction.z); // We don't allow non watertight transmitters in this renderer bool validPath = nee_sample.NdotL > numeric_limits::min; @@ -195,47 +196,51 @@ struct Unidirectional { ext::MaterialSystem::Material material; material.type = bxdf.materialType; - params_type params; + + bxdf::BxDFClampMode 
_clamp; + _clamp = (bxdf.materialType == ext::MaterialSystem::Material::Type::DIELECTRIC) ? bxdf::BxDFClampMode::BCM_ABS : bxdf::BxDFClampMode::BCM_MAX; + // example only uses isotropic bxdfs + params_type params = params_type::template create(nee_sample, interaction.isotropic, _cache.iso_cache, _clamp); // TODO: does not yet account for smooth dielectric - if (!isBSDF && bxdf.materialType == ext::MaterialSystem::Material::DIFFUSE) - { - params = params_type::template create(nee_sample, iso_interaction, bxdf::BCM_MAX); - } - else if (!isBSDF && bxdf.materialType != ext::MaterialSystem::Material::DIFFUSE) - { - if (bxdf.params.is_aniso) - params = params_type::template create(nee_sample, interaction, _cache, bxdf::BCM_MAX); - else - { - isocache_type isocache = _cache.iso_cache; - params = params_type::template create(nee_sample, iso_interaction, isocache, bxdf::BCM_MAX); - } - } - else if (isBSDF && bxdf.materialType == ext::MaterialSystem::Material::DIFFUSE) - { - params = params_type::template create(nee_sample, iso_interaction, bxdf::BCM_ABS); - } - else if (isBSDF && bxdf.materialType != ext::MaterialSystem::Material::DIFFUSE) - { - if (bxdf.params.is_aniso) - params = params_type::template create(nee_sample, interaction, _cache, bxdf::BCM_ABS); - else - { - isocache_type isocache = _cache.iso_cache; - params = params_type::template create(nee_sample, iso_interaction, isocache, bxdf::BCM_ABS); - } - } + // if (!isBSDF && bxdf.materialType == ext::MaterialSystem::Material::DIFFUSE) + // { + // params = params_type::template create(nee_sample, interaction.isotropic, bxdf::BCM_MAX); + // } + // else if (!isBSDF && bxdf.materialType != ext::MaterialSystem::Material::DIFFUSE) + // { + // if (bxdf.params.is_aniso) + // params = params_type::template create(nee_sample, interaction, _cache, bxdf::BCM_MAX); + // else + // { + // isocache_type isocache = _cache.iso_cache; + // params = params_type::template create(nee_sample, interaction.isotropic, _cache.iso_cache, 
bxdf::BCM_MAX); + // } + // } + // else if (isBSDF && bxdf.materialType == ext::MaterialSystem::Material::DIFFUSE) + // { + // params = params_type::template create(nee_sample, interaction.isotropic, bxdf::BCM_ABS); + // } + // else if (isBSDF && bxdf.materialType != ext::MaterialSystem::Material::DIFFUSE) + // { + // if (bxdf.params.is_aniso) + // params = params_type::template create(nee_sample, interaction, _cache, bxdf::BCM_ABS); + // else + // { + // isocache_type isocache = _cache.iso_cache; + // params = params_type::template create(nee_sample, interaction.isotropic, _cache.iso_cache, bxdf::BCM_ABS); + // } + // } quotient_pdf_type bsdf_quotient_pdf = materialSystem.quotient_and_pdf(material, bxdf.params, params); bsdf_quotient_pdf.quotient *= bxdf.albedo * throughput; neeContrib_pdf.quotient *= bsdf_quotient_pdf.quotient; const scalar_type otherGenOverChoice = bsdf_quotient_pdf.pdf * rcpChoiceProb; - // const scalar_type otherGenOverLightAndChoice = otherGenOverChoice / bsdf_quotient_pdf.pdf; - // neeContrib_pdf.quotient *= otherGenOverChoice / (1.f + otherGenOverLightAndChoice * otherGenOverLightAndChoice); // balance heuristic + const scalar_type otherGenOverLightAndChoice = otherGenOverChoice / bsdf_quotient_pdf.pdf; + neeContrib_pdf.quotient *= otherGenOverChoice / (1.f + otherGenOverLightAndChoice * otherGenOverLightAndChoice); // balance heuristic // TODO: ifdef NEE only - neeContrib_pdf.quotient *= otherGenOverChoice; + // neeContrib_pdf.quotient *= otherGenOverChoice; ray_type nee_ray; nee_ray.origin = intersection + nee_sample.L.direction * t * Tolerance::getStart(depth); @@ -247,7 +252,7 @@ struct Unidirectional } } - return false; // NEE only + //return false; // NEE only // sample BSDF scalar_type bxdfPdf; @@ -259,12 +264,17 @@ struct Unidirectional anisocache_type _cache; sample_type bsdf_sample = materialSystem.generate(material, bxdf.params, interaction, eps1, _cache); + bxdf::BxDFClampMode _clamp; + _clamp = (bxdf.materialType == 
ext::MaterialSystem::Material::Type::DIELECTRIC) ? bxdf::BxDFClampMode::BCM_ABS : bxdf::BxDFClampMode::BCM_MAX; + // example only uses isotropic bxdfs + params_type params = params_type::template create(bsdf_sample, interaction.isotropic, _cache.iso_cache, _clamp); + // TODO: does not yet account for smooth dielectric - params_type params; - if (!isBSDF && bxdf.materialType == ext::MaterialSystem::Material::DIFFUSE) - { - params = params_type::template create(bsdf_sample, iso_interaction, bxdf::BCM_MAX); - } + // params_type params; + // if (!isBSDF && bxdf.materialType == ext::MaterialSystem::Material::DIFFUSE) + // { + // params = params_type::template create(bsdf_sample, iso_interaction, bxdf::BCM_MAX); + // } // else if (!isBSDF && bxdf.materialType != ext::MaterialSystem::Material::DIFFUSE) // { // if (bxdf.params.is_aniso) @@ -292,7 +302,8 @@ struct Unidirectional // the value of the bsdf divided by the probability of the sample being generated quotient_pdf_type bsdf_quotient_pdf = materialSystem.quotient_and_pdf(material, bxdf.params, params); - throughput *= bsdf_quotient_pdf.quotient; + throughput *= bxdf.albedo * bsdf_quotient_pdf.quotient; + bxdfPdf = bsdf_quotient_pdf.pdf; bxdfSample = bsdf_sample.L.direction; } @@ -351,7 +362,6 @@ struct Unidirectional hit = ray.objectID.id != -1; if (hit) rayAlive = closestHitProgram(1, i, ray, scene); - } if (!hit) missProgram(ray); diff --git a/31_HLSLPathTracer/imgui.ini b/31_HLSLPathTracer/imgui.ini new file mode 100644 index 000000000..e60624929 --- /dev/null +++ b/31_HLSLPathTracer/imgui.ini @@ -0,0 +1,8 @@ +[Window][Debug##Default] +Pos=60,60 +Size=400,400 + +[Window][Controls] +Pos=10,10 +Size=320,340 + From beac3283c9680c1ff153556fbb44791b6d3c6578 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 12 Mar 2025 22:31:19 +0700 Subject: [PATCH 070/296] Remove raygenGroupStride --- 71_RayTracingPipeline/main.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/71_RayTracingPipeline/main.cpp 
b/71_RayTracingPipeline/main.cpp index 5793ff8d3..e8cc6f947 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -28,7 +28,6 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, struct ShaderBindingTable { SBufferRange raygenGroupRange; - uint32_t raygenGroupStride; SBufferRange hitGroupsRange; uint32_t hitGroupsStride; SBufferRange missGroupsRange; @@ -742,7 +741,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, }else { cmdbuf->traceRays( - m_shaderBindingTable.raygenGroupRange, m_shaderBindingTable.raygenGroupStride, + m_shaderBindingTable.raygenGroupRange, m_shaderBindingTable.missGroupsRange, m_shaderBindingTable.missGroupsStride, m_shaderBindingTable.hitGroupsRange, m_shaderBindingTable.hitGroupsStride, m_shaderBindingTable.callableGroupsRange, m_shaderBindingTable.callableGroupsStride, @@ -1326,7 +1325,6 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, .offset = 0, .size = core::alignUp(handleSizeAligned, limits.shaderGroupBaseAlignment) }; - m_shaderBindingTable.raygenGroupStride = core::alignUp(handleSizeAligned, limits.shaderGroupBaseAlignment); missRange = { .offset = raygenRange.size, From 6517442ad2e8a592e3d5778eeee2ea607d6bb51f Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 12 Mar 2025 22:56:38 +0700 Subject: [PATCH 071/296] Fix merge bug. 
--- 71_RayTracingPipeline/main.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index e8cc6f947..26618d2b2 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -482,9 +482,9 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, { IGPUSampler::SParams params; params.AnisotropicFilter = 1u; - params.TextureWrapU = ISampler::ETC_REPEAT; - params.TextureWrapV = ISampler::ETC_REPEAT; - params.TextureWrapW = ISampler::ETC_REPEAT; + params.TextureWrapU = ETC_REPEAT; + params.TextureWrapV = ETC_REPEAT; + params.TextureWrapW = ETC_REPEAT; m_ui.samplers.gui = m_device->createSampler(params); m_ui.samplers.gui->setObjectDebugName("Nabla IMGUI UI Sampler"); @@ -578,10 +578,10 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, ImGui::SliderFloat3("Light Direction", &m_light.direction.x, -1.f, 1.f); ImGui::SliderFloat3("Light Position", &m_light.position.x, -20.f, 20.f); - float32_t dOuterCutoff = degrees(acos(m_light.outerCutoff)); + float32_t dOuterCutoff = hlsl::degrees(acos(m_light.outerCutoff)); if (ImGui::SliderFloat("Light Outer Cutoff", &dOuterCutoff, 0.0f, 45.0f)) { - m_light.outerCutoff = cos(radians(dOuterCutoff)); + m_light.outerCutoff = cos(hlsl::radians(dOuterCutoff)); } } ImGui::Checkbox("Use Indirect Command", &m_useIndirectCommand); From 64653126f9bb5b13a8150376717662b977bdf5e4 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 12 Mar 2025 23:21:32 +0700 Subject: [PATCH 072/296] Fix demo to use the new traceRayIndirect --- 71_RayTracingPipeline/main.cpp | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index 26618d2b2..4106a958f 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -730,10 +730,6 @@ class RaytracingPipelineApp final : public 
examples::SimpleWindowedApplication, if (m_useIndirectCommand) { cmdbuf->traceRaysIndirect( - m_shaderBindingTable.raygenGroupRange, m_shaderBindingTable.raygenGroupStride, - m_shaderBindingTable.missGroupsRange, m_shaderBindingTable.missGroupsStride, - m_shaderBindingTable.hitGroupsRange, m_shaderBindingTable.hitGroupsStride, - m_shaderBindingTable.callableGroupsRange, m_shaderBindingTable.callableGroupsStride, SBufferBinding{ .offset = 0, .buffer = m_indirectBuffer, @@ -1042,7 +1038,26 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, bool createIndirectBuffer(video::CThreadSafeQueueAdapter* queue) { - const auto command = TraceRaysIndirectCommand_t{ WIN_W, WIN_H, 1 }; + const auto getBufferRangeAddress = [](const SBufferRange& range) + { + return range.buffer->getDeviceAddress() + range.offset; + }; + const auto command = TraceRaysIndirectCommand_t{ + .raygenShaderRecordAddress = getBufferRangeAddress(m_shaderBindingTable.raygenGroupRange), + .raygenShaderRecordSize = m_shaderBindingTable.raygenGroupRange.size, + .missShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.missGroupsRange), + .missShaderBindingTableSize = m_shaderBindingTable.missGroupsRange.size, + .missShaderBindingTableStride = m_shaderBindingTable.missGroupsStride, + .hitShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.hitGroupsRange), + .hitShaderBindingTableSize = m_shaderBindingTable.hitGroupsRange.size, + .hitShaderBindingTableStride = m_shaderBindingTable.hitGroupsStride, + .callableShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.callableGroupsRange), + .callableShaderBindingTableSize = m_shaderBindingTable.callableGroupsRange.size, + .callableShaderBindingTableStride = m_shaderBindingTable.callableGroupsStride, + .width = WIN_W, + .height = WIN_H, + .depth = 1, + }; IGPUBuffer::SCreationParams params; params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INDIRECT_BUFFER_BIT | 
IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; params.size = sizeof(TraceRaysIndirectCommand_t); From 6abb635b5d47e689e8f8ad3eb1ef35887ed53df6 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 13 Mar 2025 15:56:08 +0700 Subject: [PATCH 073/296] fixed nan and accumulation going black problem --- .../app_resources/hlsl/common.hlsl | 10 ++++++- .../hlsl/next_event_estimator.hlsl | 6 ++--- .../app_resources/hlsl/pathtracer.hlsl | 22 +++++++--------- .../app_resources/hlsl/render.comp.hlsl | 26 +++---------------- .../app_resources/hlsl/render_common.hlsl | 23 ++++++++++++++++ 31_HLSLPathTracer/main.cpp | 19 +++++++++----- 6 files changed, 61 insertions(+), 45 deletions(-) create mode 100644 31_HLSLPathTracer/app_resources/hlsl/render_common.hlsl diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl index ac1e0f09a..9e2249732 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl @@ -18,7 +18,7 @@ namespace hlsl namespace ext { -template +template // TODO make type T Spectrum struct Payload { using this_t = Payload; @@ -85,6 +85,14 @@ struct Light NBL_CONSTEXPR_STATIC_INLINE uint32_t INVALID_ID = 0xffffu; + static Light create(NBL_CONST_REF_ARG(spectral_type) radiance, uint32_t objId, uint32_t mode, ProceduralShapeType shapeType) + { + Light retval; + retval.radiance = radiance; + retval.objectID = ObjectID::create(objId, mode, shapeType); + return retval; + } + static Light create(NBL_CONST_REF_ARG(spectral_type) radiance, NBL_CONST_REF_ARG(ObjectID) objectID) { Light retval; diff --git a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl index 65646b3c1..c1528216d 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl @@ -89,7 +89,7 @@ struct Estimator static spectral_type 
deferredEvalAndPdf(NBL_REF_ARG(scalar_type) pdf, NBL_CONST_REF_ARG(light_type) light, NBL_CONST_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(Event) event) { - const uint32_t mode = event.mode; + const IntersectMode mode = (IntersectMode)event.mode; switch (mode) { case IM_RAY_QUERY: @@ -192,7 +192,7 @@ struct Estimator } newRayMaxT *= Tolerance::getEnd(depth); - pdf *= 1.0 / lightCount; + pdf *= 1.0 / scalar_type(lightCount); spectral_type quo = light.radiance / pdf; quotient_pdf = quotient_pdf_type::create(quo, pdf); @@ -201,7 +201,7 @@ struct Estimator static sample_type generate_and_quotient_and_pdf(NBL_REF_ARG(quotient_pdf_type) quotient_pdf, NBL_REF_ARG(scalar_type) newRayMaxT, NBL_CONST_REF_ARG(light_type) light, NBL_CONST_REF_ARG(vector3_type) origin, NBL_CONST_REF_ARG(interaction_type) interaction, bool isBSDF, NBL_CONST_REF_ARG(vector3_type) xi, uint32_t depth, NBL_CONST_REF_ARG(Event) event) { - const uint32_t mode = event.mode; + const IntersectMode mode = (IntersectMode)event.mode; sample_type L; switch (mode) { diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl index 6ed89de7a..5c01db852 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl @@ -79,13 +79,12 @@ struct Unidirectional // NextEventEstimator nee) // {} - static this_t create(NBL_CONST_REF_ARG(PathTracerCreationParams) params, Buffer sampleSequence) + static this_t create(NBL_CONST_REF_ARG(PathTracerCreationParams) params) { this_t retval; retval.randGen = randgen_type::create(params.rngState); retval.rayGen = raygen_type::create(params.pixOffsetParam, params.camPos, params.NDC, params.invMVP); retval.materialSystem = material_system_type::create(params.diffuseParams, params.conductorParams, params.dielectricParams); - retval.sampleSequence = sampleSequence; return retval; } @@ -170,14 +169,14 @@ struct Unidirectional scalar_type rcpChoiceProb; if 
(!math::partitionRandVariable(neeProbability, eps0.z, rcpChoiceProb) && depth < 2u) { + uint32_t randLightID = uint32_t(float32_t(randGen().x) / numeric_limits::max) * scene.lightCount; quotient_pdf_type neeContrib_pdf; scalar_type t; sample_type nee_sample = nee.generate_and_quotient_and_pdf( neeContrib_pdf, t, - scene.lights[lightID], intersection, interaction, - isBSDF, eps0, depth, scene.toNextEvent(lightID) + scene.lights[randLightID], intersection, interaction, + isBSDF, eps0, depth, scene.toNextEvent(randLightID) ); - //printf("%f %f %f\n", nee_sample.L.direction.x, nee_sample.L.direction.y, nee_sample.L.direction.z); // We don't allow non watertight transmitters in this renderer bool validPath = nee_sample.NdotL > numeric_limits::min; @@ -233,8 +232,7 @@ struct Unidirectional // } quotient_pdf_type bsdf_quotient_pdf = materialSystem.quotient_and_pdf(material, bxdf.params, params); - bsdf_quotient_pdf.quotient *= bxdf.albedo * throughput; - neeContrib_pdf.quotient *= bsdf_quotient_pdf.quotient; + neeContrib_pdf.quotient *= bxdf.albedo * throughput * bsdf_quotient_pdf.quotient; const scalar_type otherGenOverChoice = bsdf_quotient_pdf.pdf * rcpChoiceProb; const scalar_type otherGenOverLightAndChoice = otherGenOverChoice / bsdf_quotient_pdf.pdf; neeContrib_pdf.quotient *= otherGenOverChoice / (1.f + otherGenOverLightAndChoice * otherGenOverLightAndChoice); // balance heuristic @@ -252,7 +250,7 @@ struct Unidirectional } } - //return false; // NEE only + // return false; // NEE only // sample BSDF scalar_type bxdfPdf; @@ -312,8 +310,8 @@ struct Unidirectional if (bxdfPdf > bxdfPdfThreshold && getLuma(throughput) > lumaThroughputThreshold) { ray.payload.throughput = throughput; - ray.payload.otherTechniqueHeuristic = neeProbability / bxdfPdf; // numerically stable, don't touch - ray.payload.otherTechniqueHeuristic *= ray.payload.otherTechniqueHeuristic; + scalar_type otherTechniqueHeuristic = neeProbability / bxdfPdf; // numerically stable, don't touch + 
ray.payload.otherTechniqueHeuristic = otherTechniqueHeuristic * otherTechniqueHeuristic; // trace new ray ray.origin = intersection + bxdfSample * (1.0/*kSceneSize*/) * Tolerance::getStart(depth); @@ -354,7 +352,7 @@ struct Unidirectional // bounces bool hit = true; bool rayAlive = true; - for (int d = 1; d <= depth && hit && rayAlive; d += 2) + for (int d = 1; (d <= depth) && hit && rayAlive; d += 2) { ray.intersectionT = numeric_limits::max; ray.objectID = intersector_type::traceRay(ray, scene); @@ -385,8 +383,6 @@ struct Unidirectional raygen_type rayGen; material_system_type materialSystem; nee_type nee; - - Buffer sampleSequence; }; } diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl index e25961b56..d19007dd4 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl @@ -6,6 +6,7 @@ #include "nbl/builtin/hlsl/bxdf/reflection.hlsl" #include "nbl/builtin/hlsl/bxdf/transmission.hlsl" +#include "render_common.hlsl" #include "pathtracer.hlsl" // add these defines (one at a time) using -D argument to dxc @@ -26,25 +27,6 @@ NBL_CONSTEXPR uint32_t WorkgroupSize = 32; NBL_CONSTEXPR uint32_t MAX_DEPTH_LOG2 = 4; NBL_CONSTEXPR uint32_t MAX_SAMPLES_LOG2 = 10; -struct SPushConstants -{ - float32_t4x4 invMVP; - int sampleCount; - int depth; -}; - -[[vk::push_constant]] SPushConstants pc; - -[[vk::combinedImageSampler]][[vk::binding(0, 2)]] Texture2D envMap; // unused -[[vk::combinedImageSampler]][[vk::binding(0, 2)]] SamplerState envSampler; - -[[vk::binding(1, 2)]] Buffer sampleSequence; - -[[vk::combinedImageSampler]][[vk::binding(2, 2)]] Texture2D scramblebuf; // unused -[[vk::combinedImageSampler]][[vk::binding(2, 2)]] SamplerState scrambleSampler; - -[[vk::image_format("rgba16f")]][[vk::binding(0, 0)]] RWTexture2D outImage; - int32_t2 getCoordinates() { return int32_t2(glsl::gl_GlobalInvocationID().xy); @@ -115,7 +97,7 @@ static 
const ext::Shape rectangles[RECTANGLE_COUNT] = { #define LIGHT_COUNT 1 static const light_type lights[LIGHT_COUNT] = { - light_type::create(spectral_t(30.0,25.0,15.0), ext::ObjectID::create(8u, ext::IntersectMode::IM_PROCEDURAL, LIGHT_TYPE)) + light_type::create(spectral_t(30.0,25.0,15.0), 8u, ext::IntersectMode::IM_PROCEDURAL, LIGHT_TYPE) }; #define BXDF_COUNT 7 @@ -154,7 +136,7 @@ void main(uint32_t3 threadID : SV_DispatchThreadID) // set up path tracer ext::PathTracer::PathTracerCreationParams ptCreateParams; - ptCreateParams.rngState = pcg(); + ptCreateParams.rngState = scramblebuf[coords].rg; uint2 scrambleDim; scramblebuf.GetDimensions(scrambleDim.x, scrambleDim.y); @@ -174,7 +156,7 @@ void main(uint32_t3 threadID : SV_DispatchThreadID) ptCreateParams.conductorParams = bxdfs[3].params; ptCreateParams.dielectricParams = bxdfs[6].params; - pathtracer_type pathtracer = pathtracer_type::create(ptCreateParams, sampleSequence); + pathtracer_type pathtracer = pathtracer_type::create(ptCreateParams); // set up scene (can do as global var?) 
ext::Scene scene; diff --git a/31_HLSLPathTracer/app_resources/hlsl/render_common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render_common.hlsl new file mode 100644 index 000000000..5e5cf89da --- /dev/null +++ b/31_HLSLPathTracer/app_resources/hlsl/render_common.hlsl @@ -0,0 +1,23 @@ +#ifndef _NBL_HLSL_PATHTRACER_RENDER_COMMON_INCLUDED_ +#define _NBL_HLSL_PATHTRACER_RENDER_COMMON_INCLUDED_ + +struct SPushConstants +{ + float32_t4x4 invMVP; + int sampleCount; + int depth; +}; + +[[vk::push_constant]] SPushConstants pc; + +[[vk::combinedImageSampler]][[vk::binding(0, 2)]] Texture2D envMap; // unused +[[vk::combinedImageSampler]][[vk::binding(0, 2)]] SamplerState envSampler; + +[[vk::binding(1, 2)]] Buffer sampleSequence; + +[[vk::combinedImageSampler]][[vk::binding(2, 2)]] Texture2D scramblebuf; // unused +[[vk::combinedImageSampler]][[vk::binding(2, 2)]] SamplerState scrambleSampler; + +[[vk::image_format("rgba16f")]][[vk::binding(0, 0)]] RWTexture2D outImage; + +#endif diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp index 8da32083e..30a0fad8d 100644 --- a/31_HLSLPathTracer/main.cpp +++ b/31_HLSLPathTracer/main.cpp @@ -74,6 +74,13 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, inline bool isComputeOnly() const override { return false; } + //inline video::IAPIConnection::SFeatures getAPIFeaturesToEnable() override + //{ + // auto retval = device_base_t::getAPIFeaturesToEnable(); + // retval.synchronizationValidation = true; + // return retval; + //} + inline core::vector getSurfaces() const override { if (!m_surface) @@ -359,11 +366,11 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, options.stage = IShader::E_SHADER_STAGE::ESS_COMPUTE; // should be compute options.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion; options.spirvOptimizer = nullptr; -#ifndef _NBL_DEBUG - ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = 
ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO; - auto opt = make_smart_refctd_ptr(std::span(&optPasses, 1)); - options.spirvOptimizer = opt.get(); -#endif +//#ifndef _NBL_DEBUG +// ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO; +// auto opt = make_smart_refctd_ptr(std::span(&optPasses, 1)); +// options.spirvOptimizer = opt.get(); +//#endif options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_SOURCE_BIT; options.preprocessorOptions.sourceIdentifier = source->getFilepathHint(); options.preprocessorOptions.logger = m_logger.get(); @@ -1343,7 +1350,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, int PTPipline = E_LIGHT_GEOMETRY::ELG_SPHERE; int renderMode = E_RENDER_MODE::ERM_HLSL; int spp = 32; - int depth = 3; + int depth = 1; bool m_firstFrame = true; IGPUCommandBuffer::SClearColorValue clearColor = { .float32 = {0.f,0.f,0.f,1.f} }; From 1eee3ca8ade05d2afdd1ae3eeb1033edee372a66 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 13 Mar 2025 16:06:04 +0700 Subject: [PATCH 074/296] fixed triangle light, rectangle needs checking --- 31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl index d19007dd4..065d93b7b 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl @@ -97,7 +97,13 @@ static const ext::Shape rectangles[RECTANGLE_COUNT] = { #define LIGHT_COUNT 1 static const light_type lights[LIGHT_COUNT] = { - light_type::create(spectral_t(30.0,25.0,15.0), 8u, ext::IntersectMode::IM_PROCEDURAL, LIGHT_TYPE) + light_type::create(spectral_t(30.0,25.0,15.0), +#ifdef SPHERE_LIGHT + 8u, +#else + 0u, +#endif + ext::IntersectMode::IM_PROCEDURAL, LIGHT_TYPE) }; #define BXDF_COUNT 7 From 011fbfb376ef9d926b74960a05a5bfcfaf851fbf Mon Sep 17 00:00:00 
2001 From: keptsecret Date: Thu, 13 Mar 2025 16:24:12 +0700 Subject: [PATCH 075/296] simplified material data --- .../app_resources/hlsl/material_system.hlsl | 61 +++++++++++-------- .../app_resources/hlsl/pathtracer.hlsl | 20 +++--- .../app_resources/hlsl/render.comp.hlsl | 14 ++--- 31_HLSLPathTracer/main.cpp | 2 +- 4 files changed, 49 insertions(+), 48 deletions(-) diff --git a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl index 0d739d9ec..feffee9ef 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl @@ -14,20 +14,27 @@ namespace ext namespace MaterialSystem { -struct Material -{ - enum Type : uint32_t // enum class? - { - DIFFUSE, - CONDUCTOR, - DIELECTRIC - }; +// struct Material +// { +// enum Type : uint32_t // enum class? +// { +// DIFFUSE, +// CONDUCTOR, +// DIELECTRIC +// }; - NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 1; +// NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 1; - uint32_t type : 2; - uint32_t unused : 30; // possible space for flags - uint32_t data[DataSize]; +// uint32_t type : 2; +// uint32_t unused : 30; // possible space for flags +// uint32_t data[DataSize]; +// }; + +enum MaterialType : uint32_t // enum class? 
+{ + DIFFUSE, + CONDUCTOR, + DIELECTRIC }; template // NOTE: these bxdfs should match the ones in Scene BxDFNode @@ -59,23 +66,23 @@ struct System return retval; } - measure_type eval(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(create_params_t) cparams, NBL_CONST_REF_ARG(params_t) params) + measure_type eval(uint32_t material, NBL_CONST_REF_ARG(create_params_t) cparams, NBL_CONST_REF_ARG(params_t) params) { - switch(material.type) + switch(material) { - case Material::Type::DIFFUSE: + case MaterialType::DIFFUSE: { diffuseBxDF.init(cparams); return (measure_type)diffuseBxDF.eval(params); } break; - case Material::Type::CONDUCTOR: + case MaterialType::CONDUCTOR: { conductorBxDF.init(cparams); return conductorBxDF.eval(params); } break; - case Material::Type::DIELECTRIC: + case MaterialType::DIELECTRIC: { dielectricBxDF.init(cparams); return dielectricBxDF.eval(params); @@ -86,23 +93,23 @@ struct System } } - sample_type generate(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(create_params_t) cparams, NBL_CONST_REF_ARG(anisotropic_type) interaction, NBL_CONST_REF_ARG(vector3_type) u, NBL_REF_ARG(anisocache_type) _cache) + sample_type generate(uint32_t material, NBL_CONST_REF_ARG(create_params_t) cparams, NBL_CONST_REF_ARG(anisotropic_type) interaction, NBL_CONST_REF_ARG(vector3_type) u, NBL_REF_ARG(anisocache_type) _cache) { - switch(material.type) + switch(material) { - case Material::Type::DIFFUSE: + case MaterialType::DIFFUSE: { diffuseBxDF.init(cparams); return diffuseBxDF.generate(interaction, u.xy); } break; - case Material::Type::CONDUCTOR: + case MaterialType::CONDUCTOR: { conductorBxDF.init(cparams); return conductorBxDF.generate(interaction, u.xy, _cache); } break; - case Material::Type::DIELECTRIC: + case MaterialType::DIELECTRIC: { dielectricBxDF.init(cparams); return dielectricBxDF.generate(interaction, u, _cache); @@ -121,26 +128,26 @@ struct System return sample_type::create(L, 0, (vector3_type)0); } - quotient_pdf_type 
quotient_and_pdf(NBL_CONST_REF_ARG(Material) material, NBL_CONST_REF_ARG(create_params_t) cparams, NBL_CONST_REF_ARG(params_t) params) + quotient_pdf_type quotient_and_pdf(uint32_t material, NBL_CONST_REF_ARG(create_params_t) cparams, NBL_CONST_REF_ARG(params_t) params) { const float minimumProjVectorLen = 0.00000001; if (params.NdotV > minimumProjVectorLen && params.NdotL > minimumProjVectorLen) { - switch(material.type) + switch(material) { - case Material::Type::DIFFUSE: + case MaterialType::DIFFUSE: { diffuseBxDF.init(cparams); return diffuseBxDF.quotient_and_pdf(params); } break; - case Material::Type::CONDUCTOR: + case MaterialType::CONDUCTOR: { conductorBxDF.init(cparams); return conductorBxDF.quotient_and_pdf(params); } break; - case Material::Type::DIELECTRIC: + case MaterialType::DIELECTRIC: { dielectricBxDF.init(cparams); return dielectricBxDF.quotient_and_pdf(params); diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl index 5c01db852..553094e21 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl @@ -150,8 +150,8 @@ struct Unidirectional // TODO: ifdef kill diffuse specular paths - const bool isBSDF = (bxdf.materialType == ext::MaterialSystem::Material::Type::DIFFUSE) ? bxdf_traits::type == BT_BSDF : - (bxdf.materialType == ext::MaterialSystem::Material::Type::CONDUCTOR) ? bxdf_traits::type == BT_BSDF : + const bool isBSDF = (bxdf.materialType == ext::MaterialSystem::MaterialType::DIFFUSE) ? bxdf_traits::type == BT_BSDF : + (bxdf.materialType == ext::MaterialSystem::MaterialType::CONDUCTOR) ? 
bxdf_traits::type == BT_BSDF : bxdf_traits::type == BT_BSDF; vector3_type eps0 = rand3d(depth, _sample, 0u); @@ -193,11 +193,8 @@ struct Unidirectional ray.payload.accumulation += vector3_type(0.f, 1000.f, 0.f); else if (validPath) { - ext::MaterialSystem::Material material; - material.type = bxdf.materialType; - bxdf::BxDFClampMode _clamp; - _clamp = (bxdf.materialType == ext::MaterialSystem::Material::Type::DIELECTRIC) ? bxdf::BxDFClampMode::BCM_ABS : bxdf::BxDFClampMode::BCM_MAX; + _clamp = (bxdf.materialType == ext::MaterialSystem::MaterialType::DIELECTRIC) ? bxdf::BxDFClampMode::BCM_ABS : bxdf::BxDFClampMode::BCM_MAX; // example only uses isotropic bxdfs params_type params = params_type::template create(nee_sample, interaction.isotropic, _cache.iso_cache, _clamp); @@ -231,7 +228,7 @@ struct Unidirectional // } // } - quotient_pdf_type bsdf_quotient_pdf = materialSystem.quotient_and_pdf(material, bxdf.params, params); + quotient_pdf_type bsdf_quotient_pdf = materialSystem.quotient_and_pdf(bxdf.materialType, bxdf.params, params); neeContrib_pdf.quotient *= bxdf.albedo * throughput * bsdf_quotient_pdf.quotient; const scalar_type otherGenOverChoice = bsdf_quotient_pdf.pdf * rcpChoiceProb; const scalar_type otherGenOverLightAndChoice = otherGenOverChoice / bsdf_quotient_pdf.pdf; @@ -256,14 +253,11 @@ struct Unidirectional scalar_type bxdfPdf; vector3_type bxdfSample; { - ext::MaterialSystem::Material material; - material.type = bxdf.materialType; - anisocache_type _cache; - sample_type bsdf_sample = materialSystem.generate(material, bxdf.params, interaction, eps1, _cache); + sample_type bsdf_sample = materialSystem.generate(bxdf.materialType, bxdf.params, interaction, eps1, _cache); bxdf::BxDFClampMode _clamp; - _clamp = (bxdf.materialType == ext::MaterialSystem::Material::Type::DIELECTRIC) ? bxdf::BxDFClampMode::BCM_ABS : bxdf::BxDFClampMode::BCM_MAX; + _clamp = (bxdf.materialType == ext::MaterialSystem::MaterialType::DIELECTRIC) ? 
bxdf::BxDFClampMode::BCM_ABS : bxdf::BxDFClampMode::BCM_MAX; // example only uses isotropic bxdfs params_type params = params_type::template create(bsdf_sample, interaction.isotropic, _cache.iso_cache, _clamp); @@ -299,7 +293,7 @@ struct Unidirectional // } // the value of the bsdf divided by the probability of the sample being generated - quotient_pdf_type bsdf_quotient_pdf = materialSystem.quotient_and_pdf(material, bxdf.params, params); + quotient_pdf_type bsdf_quotient_pdf = materialSystem.quotient_and_pdf(bxdf.materialType, bxdf.params, params); throughput *= bxdf.albedo * bsdf_quotient_pdf.quotient; bxdfPdf = bsdf_quotient_pdf.pdf; bxdfSample = bsdf_sample.L.direction; diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl index 065d93b7b..5dea2d1bf 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl @@ -108,13 +108,13 @@ static const light_type lights[LIGHT_COUNT] = { #define BXDF_COUNT 7 static const bxdfnode_type bxdfs[BXDF_COUNT] = { - bxdfnode_type::create(ext::MaterialSystem::Material::Type::DIFFUSE, false, float2(0,0), spectral_t(0.8,0.8,0.8)), - bxdfnode_type::create(ext::MaterialSystem::Material::Type::DIFFUSE, false, float2(0,0), spectral_t(0.8,0.4,0.4)), - bxdfnode_type::create(ext::MaterialSystem::Material::Type::DIFFUSE, false, float2(0,0), spectral_t(0.4,0.8,0.4)), - bxdfnode_type::create(ext::MaterialSystem::Material::Type::CONDUCTOR, false, float2(0,0), spectral_t(1,1,1), spectral_t(0.98,0.98,0.77)), - bxdfnode_type::create(ext::MaterialSystem::Material::Type::CONDUCTOR, false, float2(0,0), spectral_t(1,1,1), spectral_t(0.98,0.77,0.98)), - bxdfnode_type::create(ext::MaterialSystem::Material::Type::CONDUCTOR, false, float2(0.15,0.15), spectral_t(1,1,1), spectral_t(0.98,0.77,0.98)), - bxdfnode_type::create(ext::MaterialSystem::Material::Type::DIELECTRIC, false, float2(0.0625,0.0625), spectral_t(1,1,1), 
spectral_t(0.71,0.69,0.67)) + bxdfnode_type::create(ext::MaterialSystem::MaterialType::DIFFUSE, false, float2(0,0), spectral_t(0.8,0.8,0.8)), + bxdfnode_type::create(ext::MaterialSystem::MaterialType::DIFFUSE, false, float2(0,0), spectral_t(0.8,0.4,0.4)), + bxdfnode_type::create(ext::MaterialSystem::MaterialType::DIFFUSE, false, float2(0,0), spectral_t(0.4,0.8,0.4)), + bxdfnode_type::create(ext::MaterialSystem::MaterialType::CONDUCTOR, false, float2(0,0), spectral_t(1,1,1), spectral_t(0.98,0.98,0.77)), + bxdfnode_type::create(ext::MaterialSystem::MaterialType::CONDUCTOR, false, float2(0,0), spectral_t(1,1,1), spectral_t(0.98,0.77,0.98)), + bxdfnode_type::create(ext::MaterialSystem::MaterialType::CONDUCTOR, false, float2(0.15,0.15), spectral_t(1,1,1), spectral_t(0.98,0.77,0.98)), + bxdfnode_type::create(ext::MaterialSystem::MaterialType::DIELECTRIC, false, float2(0.0625,0.0625), spectral_t(1,1,1), spectral_t(0.71,0.69,0.67)) }; [numthreads(WorkgroupSize, WorkgroupSize, 1)] diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp index 30a0fad8d..46597d738 100644 --- a/31_HLSLPathTracer/main.cpp +++ b/31_HLSLPathTracer/main.cpp @@ -1350,7 +1350,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, int PTPipline = E_LIGHT_GEOMETRY::ELG_SPHERE; int renderMode = E_RENDER_MODE::ERM_HLSL; int spp = 32; - int depth = 1; + int depth = 3; bool m_firstFrame = true; IGPUCommandBuffer::SClearColorValue clearColor = { .float32 = {0.f,0.f,0.f,1.f} }; From 4d1dca47c8e95081341cbf5fb31ab64b27fa8e1b Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 13 Mar 2025 23:54:12 +0700 Subject: [PATCH 076/296] Small fixes on ray trace pipeline demo. 
--- 71_RayTracingPipeline/app_resources/common.hlsl | 2 +- 71_RayTracingPipeline/app_resources/light_spot.rcall.hlsl | 2 +- 71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl index 5b69c4a76..6c052dff1 100644 --- a/71_RayTracingPipeline/app_resources/common.hlsl +++ b/71_RayTracingPipeline/app_resources/common.hlsl @@ -64,7 +64,7 @@ struct MaterialPacked bool isTransparent() NBL_CONST_MEMBER_FUNC { return alpha != MAX_UNORM_10; -} + } }; inline MaterialPacked packMaterial(Material material) diff --git a/71_RayTracingPipeline/app_resources/light_spot.rcall.hlsl b/71_RayTracingPipeline/app_resources/light_spot.rcall.hlsl index fcb130104..f298e4643 100644 --- a/71_RayTracingPipeline/app_resources/light_spot.rcall.hlsl +++ b/71_RayTracingPipeline/app_resources/light_spot.rcall.hlsl @@ -10,7 +10,7 @@ void main(inout RayLight cLight) cLight.outIntensity = LightIntensity / (cLight.outLightDistance * cLight.outLightDistance); cLight.outLightDir = normalize(lDir); float theta = dot(cLight.outLightDir, normalize(-pc.light.direction)); - float epsilon = 1 - pc.light.outerCutoff; + float epsilon = 1.f - pc.light.outerCutoff; float spotIntensity = clamp((theta - pc.light.outerCutoff) / epsilon, 0.0, 1.0); cLight.outIntensity *= spotIntensity; } diff --git a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl index df6a5215d..f15b424ea 100644 --- a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl @@ -91,7 +91,7 @@ void main() uint32_t shadowRayFlags = RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH | RAY_FLAG_FORCE_NON_OPAQUE | RAY_FLAG_SKIP_CLOSEST_HIT_SHADER; OcclusionPayload occlusionPayload; - occlusionPayload.attenuation = 1; // negative attenuation indicate occlusion happening. 
will be multiplied by -1 in miss shader. + occlusionPayload.attenuation = 1; TraceRay(topLevelAS, shadowRayFlags, 0xFF, ERT_OCCLUSION, 0, EMT_OCCLUSION, rayDesc, occlusionPayload); attenuation = occlusionPayload.attenuation; From cc84091d68ceedeb954c1867b3420e0e5119789b Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 13 Mar 2025 23:55:45 +0700 Subject: [PATCH 077/296] Optimize ray tracing demo occlusion tracing --- .../app_resources/raytrace.rgen.hlsl | 2 +- .../app_resources/raytrace_shadow.rahit.hlsl | 13 ++----------- .../raytrace_shadow_triangle.rchit.hlsl | 7 +++++++ 71_RayTracingPipeline/main.cpp | 8 +++++++- 4 files changed, 17 insertions(+), 13 deletions(-) create mode 100644 71_RayTracingPipeline/app_resources/raytrace_shadow_triangle.rchit.hlsl diff --git a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl index f15b424ea..c74774880 100644 --- a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl @@ -89,7 +89,7 @@ void main() rayDesc.TMin = 0.01; rayDesc.TMax = cLight.outLightDistance; - uint32_t shadowRayFlags = RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH | RAY_FLAG_FORCE_NON_OPAQUE | RAY_FLAG_SKIP_CLOSEST_HIT_SHADER; + uint32_t shadowRayFlags = RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH; OcclusionPayload occlusionPayload; occlusionPayload.attenuation = 1; TraceRay(topLevelAS, shadowRayFlags, 0xFF, ERT_OCCLUSION, 0, EMT_OCCLUSION, rayDesc, occlusionPayload); diff --git a/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl index c59f7367e..e76f1da55 100644 --- a/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl @@ -9,15 +9,6 @@ void main(inout OcclusionPayload payload, in BuiltInTriangleIntersectionAttribut const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > 
(pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo)); const Material material = unpackMaterial(geom.material); - if (material.isTransparent()) - { - payload.attenuation = material.alpha * payload.attenuation; - IgnoreHit(); - } - else - { - payload.attenuation = 0; - AcceptHitAndEndSearch(); - } - + payload.attenuation = material.alpha * payload.attenuation; + IgnoreHit(); } diff --git a/71_RayTracingPipeline/app_resources/raytrace_shadow_triangle.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace_shadow_triangle.rchit.hlsl new file mode 100644 index 000000000..c85c7c32d --- /dev/null +++ b/71_RayTracingPipeline/app_resources/raytrace_shadow_triangle.rchit.hlsl @@ -0,0 +1,7 @@ +#include "common.hlsl" + +[shader("closesthit")] +void main(inout OcclusionPayload payload, in BuiltInTriangleIntersectionAttributes attribs) +{ + payload.attenuation = 0; +} diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index 4106a958f..cb7ef1d8e 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -164,6 +164,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, const auto anyHitShaderShadowPayload = loadCompileAndCreateShader("app_resources/raytrace_shadow.rahit.hlsl"); const auto missShader = loadCompileAndCreateShader("app_resources/raytrace.rmiss.hlsl"); const auto shadowMissShader = loadCompileAndCreateShader("app_resources/raytrace_shadow.rmiss.hlsl"); + const auto shadowClosestHitShader = loadCompileAndCreateShader("app_resources/raytrace_shadow_triangle.rchit.hlsl"); const auto directionalLightCallShader = loadCompileAndCreateShader("app_resources/light_directional.rcall.hlsl"); const auto pointLightCallShader = loadCompileAndCreateShader("app_resources/light_point.rcall.hlsl"); const auto spotLightCallShader = loadCompileAndCreateShader("app_resources/light_spot.rcall.hlsl"); @@ -324,6 +325,7 @@ class RaytracingPipelineApp final : public 
examples::SimpleWindowedApplication, RTDS_RAYGEN, RTDS_MISS, RTDS_SHADOW_MISS, + RTDS_CLOSEST_HIT_SHADOW, RTDS_CLOSEST_HIT, RTDS_SPHERE_CLOSEST_HIT, RTDS_ANYHIT_PRIMARY, @@ -339,6 +341,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, shaders[RTDS_RAYGEN] = {.shader = raygenShader.get()}; shaders[RTDS_MISS] = {.shader = missShader.get()}; shaders[RTDS_SHADOW_MISS] = {.shader = shadowMissShader.get()}; + shaders[RTDS_CLOSEST_HIT_SHADOW] = { .shader = shadowClosestHitShader.get() }; shaders[RTDS_CLOSEST_HIT] = {.shader = closestHitShader.get()}; shaders[RTDS_SPHERE_CLOSEST_HIT] = {.shader = proceduralClosestHitShader.get()}; shaders[RTDS_ANYHIT_PRIMARY] = {.shader = anyHitShaderColorPayload.get()}; @@ -350,6 +353,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, params.layout = pipelineLayout.get(); params.shaders = std::span(shaders); + params.flags = IGPURayTracingPipeline::SCreationParams::FLAGS::NO_NULL_INTERSECTION_SHADERS; auto& shaderGroups = params.shaderGroups; @@ -357,7 +361,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, IRayTracingPipelineBase::SGeneralShaderGroup missGroups[EMT_COUNT]; missGroups[EMT_PRIMARY] = { .index = RTDS_MISS }; - missGroups[EMT_OCCLUSION] = { .index = RTDS_SHADOW_MISS }; + missGroups[EMT_OCCLUSION] = { .index = IGPURayTracingPipeline::SGeneralShaderGroup::Unused }; shaderGroups.misses = missGroups; auto getHitGroupIndex = [](E_GEOM_TYPE geomType, E_RAY_TYPE rayType) @@ -370,6 +374,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, .anyHit = RTDS_ANYHIT_PRIMARY, }; hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_OCCLUSION)] = { + .closestHit = RTDS_CLOSEST_HIT_SHADOW, .anyHit = RTDS_ANYHIT_SHADOW, }; hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_PRIMARY)] = { @@ -378,6 +383,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, .intersectionShader = 
RTDS_INTERSECTION, }; hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_OCCLUSION)] = { + .closestHit = RTDS_CLOSEST_HIT_SHADOW, .anyHit = RTDS_ANYHIT_SHADOW, .intersectionShader = RTDS_INTERSECTION, }; From 63b64e3182dc395d83ada3fe95f46b5febc41d29 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 14 Mar 2025 11:14:01 +0700 Subject: [PATCH 078/296] made scene a static global var --- .../app_resources/hlsl/render.comp.hlsl | 70 +++++++++---------- .../app_resources/hlsl/scene.hlsl | 67 +++++++++++++++--- 2 files changed, 91 insertions(+), 46 deletions(-) diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl index 5dea2d1bf..f8cf2ae22 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl @@ -6,9 +6,6 @@ #include "nbl/builtin/hlsl/bxdf/reflection.hlsl" #include "nbl/builtin/hlsl/bxdf/transmission.hlsl" -#include "render_common.hlsl" -#include "pathtracer.hlsl" - // add these defines (one at a time) using -D argument to dxc // #define SPHERE_LIGHT // #define TRIANGLE_LIGHT @@ -17,10 +14,33 @@ #ifdef SPHERE_LIGHT #define SPHERE_COUNT 9 #define LIGHT_TYPE ext::PST_SPHERE -#else + +#define TRIANGLE_COUNT 0 +#define RECTANGLE_COUNT 0 +#endif + +#ifdef TRIANGLE_LIGHT +#define TRIANGLE_COUNT 1 +#define LIGHT_TYPE ext::PST_TRIANGLE + +#define SPHERE_COUNT 8 +#define RECTANGLE_COUNT 0 +#endif + +#ifdef RECTANGLE_LIGHT +#define RECTANGLE_COUNT 1 +#define LIGHT_TYPE ext::PST_RECTANGLE + #define SPHERE_COUNT 8 +#define TRIANGLE_COUNT 0 #endif +#define LIGHT_COUNT 1 +#define BXDF_COUNT 7 + +#include "render_common.hlsl" +#include "pathtracer.hlsl" + using namespace nbl::hlsl; NBL_CONSTEXPR uint32_t WorkgroupSize = 32; @@ -80,22 +100,21 @@ static const ext::Shape spheres[SPHERE_COUNT] = { }; #ifdef TRIANGLE_LIGHT -#define LIGHT_TYPE ext::PST_TRIANGLE -#define TRIANGLE_COUNT 1 static const ext::Shape triangles[TRIANGLE_COUNT] = { 
ext::Shape::create(float3(-1.8,0.35,0.3) * 10.0, float3(-1.2,0.35,0.0) * 10.0, float3(-1.5,0.8,-0.3) * 10.0, bxdfnode_type::INVALID_ID, 0u) }; +#else +static const ext::Shape triangles[1]; #endif #ifdef RECTANGLE_LIGHT -#define LIGHT_TYPE ext::PST_RECTANGLE -#define RECTANGLE_COUNT 1 static const ext::Shape rectangles[RECTANGLE_COUNT] = { ext::Shape::create(float3(-3.8,0.35,1.3), normalize(float3(2,0,-1))*7.0, normalize(float3(2,-5,4))*0.1, bxdfnode_type::INVALID_ID, 0u) }; +#else +static const ext::Shape rectangles[1]; #endif -#define LIGHT_COUNT 1 static const light_type lights[LIGHT_COUNT] = { light_type::create(spectral_t(30.0,25.0,15.0), #ifdef SPHERE_LIGHT @@ -106,7 +125,6 @@ static const light_type lights[LIGHT_COUNT] = { ext::IntersectMode::IM_PROCEDURAL, LIGHT_TYPE) }; -#define BXDF_COUNT 7 static const bxdfnode_type bxdfs[BXDF_COUNT] = { bxdfnode_type::create(ext::MaterialSystem::MaterialType::DIFFUSE, false, float2(0,0), spectral_t(0.8,0.8,0.8)), bxdfnode_type::create(ext::MaterialSystem::MaterialType::DIFFUSE, false, float2(0,0), spectral_t(0.8,0.4,0.4)), @@ -117,6 +135,12 @@ static const bxdfnode_type bxdfs[BXDF_COUNT] = { bxdfnode_type::create(ext::MaterialSystem::MaterialType::DIELECTRIC, false, float2(0.0625,0.0625), spectral_t(1,1,1), spectral_t(0.71,0.69,0.67)) }; +static const ext::Scene scene = ext::Scene::create( + spheres, triangles, rectangles, + SPHERE_COUNT, TRIANGLE_COUNT, RECTANGLE_COUNT, + lights, LIGHT_COUNT, bxdfs, BXDF_COUNT +); + [numthreads(WorkgroupSize, WorkgroupSize, 1)] void main(uint32_t3 threadID : SV_DispatchThreadID) { @@ -164,32 +188,6 @@ void main(uint32_t3 threadID : SV_DispatchThreadID) pathtracer_type pathtracer = pathtracer_type::create(ptCreateParams); - // set up scene (can do as global var?) 
- ext::Scene scene; - scene.sphereCount = SPHERE_COUNT; - for (uint32_t i = 0; i < SPHERE_COUNT; i++) - scene.spheres[i] = spheres[i]; -#ifdef TRIANGLE_LIGHT - scene.triangleCount = TRIANGLE_COUNT; - for (uint32_t i = 0; i < TRIANGLE_COUNT; i++) - scene.triangles[i] = triangles[i]; -#else - scene.triangleCount = 0; -#endif -#ifdef RECTANGLE_LIGHT - scene.rectangleCount = RECTANGLE_COUNT; - for (uint32_t i = 0; i < RECTANGLE_COUNT; i++) - scene.rectangles[i] = rectangles[i]; -#else - scene.rectangleCount = 0; -#endif - scene.lightCount = LIGHT_COUNT; - for (uint32_t i = 0; i < LIGHT_COUNT; i++) - scene.lights[i] = lights[i]; - scene.bxdfCount = BXDF_COUNT; - for (uint32_t i = 0; i < BXDF_COUNT; i++) - scene.bxdfs[i] = bxdfs[i]; - float32_t3 color = pathtracer.getMeasure(pc.sampleCount, pc.depth, scene); float32_t4 pixCol = float32_t4(color, 1.0); outImage[coords] = pixCol; diff --git a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl index 5b4178ec4..887d20c48 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl @@ -15,31 +15,78 @@ struct Scene { using light_type = Light; using bxdfnode_type = BxdfNode; + using this_t = Scene; - NBL_CONSTEXPR_STATIC_INLINE uint32_t maxSphereCount = 25; - NBL_CONSTEXPR_STATIC_INLINE uint32_t maxTriangleCount = 12; - NBL_CONSTEXPR_STATIC_INLINE uint32_t maxRectangleCount = 12; + // NBL_CONSTEXPR_STATIC_INLINE uint32_t maxSphereCount = 25; + // NBL_CONSTEXPR_STATIC_INLINE uint32_t maxTriangleCount = 12; + // NBL_CONSTEXPR_STATIC_INLINE uint32_t maxRectangleCount = 12; - Shape spheres[maxSphereCount]; - Shape triangles[maxTriangleCount]; - Shape rectangles[maxRectangleCount]; +#if SPHERE_COUNT < 1 +#define SCENE_SPHERE_COUNT 1 +#else +#define SCENE_SPHERE_COUNT SPHERE_COUNT +#endif + +#if TRIANGLE_COUNT < 1 +#define SCENE_TRIANGLE_COUNT 1 +#else +#define SCENE_TRIANGLE_COUNT TRIANGLE_COUNT +#endif + +#if RECTANGLE_COUNT < 1 
+#define SCENE_RECTANGLE_COUNT 1 +#else +#define SCENE_RECTANGLE_COUNT RECTANGLE_COUNT +#endif + + Shape spheres[SCENE_SPHERE_COUNT]; + Shape triangles[SCENE_TRIANGLE_COUNT]; + Shape rectangles[SCENE_RECTANGLE_COUNT]; uint32_t sphereCount; uint32_t triangleCount; uint32_t rectangleCount; - NBL_CONSTEXPR_STATIC_INLINE uint32_t maxLightCount = 4; + // NBL_CONSTEXPR_STATIC_INLINE uint32_t maxLightCount = 4; - light_type lights[maxLightCount]; + light_type lights[LIGHT_COUNT]; uint32_t lightCount; - NBL_CONSTEXPR_STATIC_INLINE uint32_t maxBxdfCount = 16; // TODO: limit change? + // NBL_CONSTEXPR_STATIC_INLINE uint32_t maxBxdfCount = 16; - bxdfnode_type bxdfs[maxBxdfCount]; + bxdfnode_type bxdfs[BXDF_COUNT]; uint32_t bxdfCount; // AS ases; + static this_t create( + NBL_CONST_REF_ARG(Shape) spheres[SCENE_SPHERE_COUNT], + NBL_CONST_REF_ARG(Shape) triangles[SCENE_TRIANGLE_COUNT], + NBL_CONST_REF_ARG(Shape) rectangles[SCENE_RECTANGLE_COUNT], + uint32_t sphereCount, uint32_t triangleCount, uint32_t rectangleCount, + NBL_CONST_REF_ARG(light_type) lights[LIGHT_COUNT], uint32_t lightCount, + NBL_CONST_REF_ARG(bxdfnode_type) bxdfs[BXDF_COUNT], uint32_t bxdfCount) + { + this_t retval; + retval.spheres = spheres; + retval.triangles = triangles; + retval.rectangles = rectangles; + retval.sphereCount = sphereCount; + retval.triangleCount = triangleCount; + retval.rectangleCount = rectangleCount; + + retval.lights = lights; + retval.lightCount = lightCount; + + retval.bxdfs = bxdfs; + retval.bxdfCount = bxdfCount; + return retval; + } + +#undef SCENE_SPHERE_COUNT +#undef SCENE_TRIANGLE_COUNT +#undef SCENE_RECTANGLE_COUNT + // obsolete? 
// Intersector::IntersectData toIntersectData(uint32_t mode, ProceduralShapeType type) // { From 7bd69e96d5512998df8efd85e5cdac33e1bde18d Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 14 Mar 2025 16:56:18 +0700 Subject: [PATCH 079/296] fixed most of rectangle light issues, still red pixels --- .../app_resources/hlsl/common.hlsl | 23 +++-- .../hlsl/next_event_estimator.hlsl | 86 +++++++++---------- 31_HLSLPathTracer/main.cpp | 4 +- 3 files changed, 56 insertions(+), 57 deletions(-) diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl index 9e2249732..28261a634 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl @@ -388,7 +388,7 @@ struct Shape sampling::ProjectedSphericalTriangle pst = sampling::ProjectedSphericalTriangle::create(st); const float pdf = pst.pdf(ray.normalAtOrigin, ray.wasBSDFAtOrigin, L); // if `pdf` is NAN then the triangle's projected solid angle was close to 0.0, if its close to INF then the triangle was very small - return pdf < numeric_limits::max ? pdf : 0.0; + return pdf < numeric_limits::max ? pdf : numeric_limits::max; } break; default: @@ -427,7 +427,7 @@ struct Shape const float32_t3 L = sst.generate(rcpPdf, xi.xy); - pdf = rcpPdf > numeric_limits::min ? (1.0 / rcpPdf) : 0.0; + pdf = rcpPdf > numeric_limits::min ? (1.0 / rcpPdf) : numeric_limits::max; const float32_t3 N = getNormalTimesArea(); newRayMaxT = hlsl::dot(N, vertex0 - origin) / hlsl::dot(N, L); @@ -443,7 +443,7 @@ struct Shape const float32_t3 L = sst.generate(rcpPdf, interaction.isotropic.N, isBSDF, xi.xy); - pdf = rcpPdf > numeric_limits::min ? (1.0 / rcpPdf) : 0.0; + pdf = rcpPdf > numeric_limits::min ? 
(1.0 / rcpPdf) : numeric_limits::max; const float32_t3 N = getNormalTimesArea(); newRayMaxT = hlsl::dot(N, vertex0 - origin) / hlsl::dot(N, L); @@ -513,8 +513,6 @@ struct Shape basis[0] = edge0 / extents[0]; basis[1] = edge1 / extents[1]; basis[2] = normalize(cross(basis[0],basis[1])); - - basis = nbl::hlsl::transpose(basis); // TODO: double check transpose } template @@ -541,17 +539,18 @@ struct Shape if (solidAngle > numeric_limits::min) pdf = 1.f / solidAngle; else - pdf = numeric_limits::infinity; + pdf = bit_cast(numeric_limits::infinity); return pdf; } break; case PPM_APPROX_PROJECTED_SOLID_ANGLE: { - return numeric_limits::infinity; + // currently broken + return bit_cast(numeric_limits::infinity); } break; default: - return numeric_limits::infinity; + return bit_cast(numeric_limits::infinity); } } @@ -577,7 +576,6 @@ struct Shape // #ifdef TRIANGLE_REFERENCE ? case PPM_SOLID_ANGLE: { - float pdf; float32_t3x3 rectNormalBasis; float32_t2 rectExtents; getNormalBasis(rectNormalBasis, rectExtents); @@ -594,7 +592,7 @@ struct Shape pdf = 1.f / solidAngle; } else - pdf = numeric_limits::infinity; + pdf = bit_cast(numeric_limits::infinity); newRayMaxT = hlsl::dot(N, origin2origin) / hlsl::dot(N, L); return L; @@ -602,12 +600,13 @@ struct Shape break; case PPM_APPROX_PROJECTED_SOLID_ANGLE: { - pdf = numeric_limits::infinity; + // currently broken + pdf = bit_cast(numeric_limits::infinity); return (float32_t3)0.0; } break; default: - pdf = numeric_limits::infinity; + pdf = bit_cast(numeric_limits::infinity); return (float32_t3)0.0; } } diff --git a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl index c1528216d..9c41f6627 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl @@ -89,28 +89,28 @@ struct Estimator static spectral_type deferredEvalAndPdf(NBL_REF_ARG(scalar_type) pdf, 
NBL_CONST_REF_ARG(light_type) light, NBL_CONST_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(Event) event) { - const IntersectMode mode = (IntersectMode)event.mode; - switch (mode) - { - case IM_RAY_QUERY: - { - // TODO: do ray query stuff - } - break; - case IM_RAY_TRACING: - { - // TODO: do ray tracing stuff - } - break; - case IM_PROCEDURAL: - { + // const IntersectMode mode = (IntersectMode)event.mode; + // switch (mode) + // { + // case IM_RAY_QUERY: + // { + // // TODO: do ray query stuff + // } + // break; + // case IM_RAY_TRACING: + // { + // // TODO: do ray tracing stuff + // } + // break; + // case IM_PROCEDURAL: + // { return proceduralDeferredEvalAndPdf(pdf, light, ray, event); - } - break; - default: - return (spectral_type)0.0; - } - return (spectral_type)0.0; + // } + // break; + // default: + // return (spectral_type)0.0; + // } + // return (spectral_type)0.0; } static sample_type procedural_generate_and_quotient_and_pdf(NBL_REF_ARG(quotient_pdf_type) quotient_pdf, NBL_REF_ARG(scalar_type) newRayMaxT, NBL_CONST_REF_ARG(light_type) light, NBL_CONST_REF_ARG(vector3_type) origin, NBL_CONST_REF_ARG(interaction_type) interaction, bool isBSDF, NBL_CONST_REF_ARG(vector3_type) xi, uint32_t depth, NBL_CONST_REF_ARG(Event) event) @@ -203,29 +203,29 @@ struct Estimator { const IntersectMode mode = (IntersectMode)event.mode; sample_type L; - switch (mode) - { - case IM_RAY_QUERY: - { - // TODO: do ray query stuff - } - break; - case IM_RAY_TRACING: - { - // TODO: do ray tracing stuff - } - break; - case IM_PROCEDURAL: - { + // switch (mode) + // { + // case IM_RAY_QUERY: + // { + // // TODO: do ray query stuff + // } + // break; + // case IM_RAY_TRACING: + // { + // // TODO: do ray tracing stuff + // } + // break; + // case IM_PROCEDURAL: + // { return procedural_generate_and_quotient_and_pdf(quotient_pdf, newRayMaxT, light, origin, interaction, isBSDF, xi, depth, event); - } - break; - default: - { - return L; - } - } - return L; + // } + // break; + // default: + 
// { + // return L; + // } + // } + // return L; } }; diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp index 46597d738..b8e3ea044 100644 --- a/31_HLSLPathTracer/main.cpp +++ b/31_HLSLPathTracer/main.cpp @@ -1347,10 +1347,10 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, float viewWidth = 10.f; float camYAngle = 165.f / 180.f * 3.14159f; float camXAngle = 32.f / 180.f * 3.14159f; - int PTPipline = E_LIGHT_GEOMETRY::ELG_SPHERE; + int PTPipline = E_LIGHT_GEOMETRY::ELG_RECTANGLE; int renderMode = E_RENDER_MODE::ERM_HLSL; int spp = 32; - int depth = 3; + int depth = 1; bool m_firstFrame = true; IGPUCommandBuffer::SClearColorValue clearColor = { .float32 = {0.f,0.f,0.f,1.f} }; From 38d8285dc7101dece86fa5d3733b6056ff0a6266 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 14 Mar 2025 21:47:50 +0700 Subject: [PATCH 080/296] Use unused shader instead of stub shader for occlusion ray miss shader. --- .../app_resources/raytrace_shadow.rmiss.hlsl | 6 ------ 71_RayTracingPipeline/main.cpp | 3 --- 2 files changed, 9 deletions(-) delete mode 100644 71_RayTracingPipeline/app_resources/raytrace_shadow.rmiss.hlsl diff --git a/71_RayTracingPipeline/app_resources/raytrace_shadow.rmiss.hlsl b/71_RayTracingPipeline/app_resources/raytrace_shadow.rmiss.hlsl deleted file mode 100644 index baad9a3e9..000000000 --- a/71_RayTracingPipeline/app_resources/raytrace_shadow.rmiss.hlsl +++ /dev/null @@ -1,6 +0,0 @@ -#include "common.hlsl" - -[shader("miss")] -void main(inout OcclusionPayload payload) -{ -} diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index cb7ef1d8e..9a85ea423 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -163,7 +163,6 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, const auto anyHitShaderColorPayload = loadCompileAndCreateShader("app_resources/raytrace.rahit.hlsl"); const auto anyHitShaderShadowPayload = 
loadCompileAndCreateShader("app_resources/raytrace_shadow.rahit.hlsl"); const auto missShader = loadCompileAndCreateShader("app_resources/raytrace.rmiss.hlsl"); - const auto shadowMissShader = loadCompileAndCreateShader("app_resources/raytrace_shadow.rmiss.hlsl"); const auto shadowClosestHitShader = loadCompileAndCreateShader("app_resources/raytrace_shadow_triangle.rchit.hlsl"); const auto directionalLightCallShader = loadCompileAndCreateShader("app_resources/light_directional.rcall.hlsl"); const auto pointLightCallShader = loadCompileAndCreateShader("app_resources/light_point.rcall.hlsl"); @@ -324,7 +323,6 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, { RTDS_RAYGEN, RTDS_MISS, - RTDS_SHADOW_MISS, RTDS_CLOSEST_HIT_SHADOW, RTDS_CLOSEST_HIT, RTDS_SPHERE_CLOSEST_HIT, @@ -340,7 +338,6 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, IGPUShader::SSpecInfo shaders[RTDS_COUNT]; shaders[RTDS_RAYGEN] = {.shader = raygenShader.get()}; shaders[RTDS_MISS] = {.shader = missShader.get()}; - shaders[RTDS_SHADOW_MISS] = {.shader = shadowMissShader.get()}; shaders[RTDS_CLOSEST_HIT_SHADOW] = { .shader = shadowClosestHitShader.get() }; shaders[RTDS_CLOSEST_HIT] = {.shader = closestHitShader.get()}; shaders[RTDS_SPHERE_CLOSEST_HIT] = {.shader = proceduralClosestHitShader.get()}; From ab0aa1231e1fd0eb24ade01a918f139a7c6f758a Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 17 Mar 2025 13:56:17 +0700 Subject: [PATCH 081/296] fix for nan samples --- 31_HLSLPathTracer/app_resources/hlsl/common.hlsl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl index 28261a634..d5cbbea81 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl @@ -588,7 +588,8 @@ struct Shape if (solidAngle > numeric_limits::min) { float32_t3 sph_sample = sphUv[0] * 
edge0 + sphUv[1] * edge1 + offset; - L = nbl::hlsl::normalize(sph_sample - origin); + L = sph_sample - origin; + L = hlsl::mix(nbl::hlsl::normalize(L), (float32_t3)0.0, hlsl::abs(L) > (float32_t3)numeric_limits::min); // TODO? sometimes L is vec3(0), find cause pdf = 1.f / solidAngle; } else From 96c7497430a12d064c932e53644b85a7b1984e1d Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 17 Mar 2025 14:07:21 +0700 Subject: [PATCH 082/296] revert to initial scene settings --- 31_HLSLPathTracer/main.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp index b8e3ea044..46597d738 100644 --- a/31_HLSLPathTracer/main.cpp +++ b/31_HLSLPathTracer/main.cpp @@ -1347,10 +1347,10 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, float viewWidth = 10.f; float camYAngle = 165.f / 180.f * 3.14159f; float camXAngle = 32.f / 180.f * 3.14159f; - int PTPipline = E_LIGHT_GEOMETRY::ELG_RECTANGLE; + int PTPipline = E_LIGHT_GEOMETRY::ELG_SPHERE; int renderMode = E_RENDER_MODE::ERM_HLSL; int spp = 32; - int depth = 1; + int depth = 3; bool m_firstFrame = true; IGPUCommandBuffer::SClearColorValue clearColor = { .float32 = {0.f,0.f,0.f,1.f} }; From f96dfcc01a02cb3d3b80368386155e5c287f8f5c Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 17 Mar 2025 22:23:21 +0700 Subject: [PATCH 083/296] Use nbl::hlsl::_static_cast for converting MaterialPacked to Material and vice versa --- .../app_resources/common.hlsl | 64 ++++++++++++------- .../app_resources/raytrace.rahit.hlsl | 2 +- .../app_resources/raytrace.rgen.hlsl | 2 +- .../app_resources/raytrace_shadow.rahit.hlsl | 2 +- 71_RayTracingPipeline/main.cpp | 4 +- 5 files changed, 47 insertions(+), 27 deletions(-) diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl index 6c052dff1..0b5f4b170 100644 --- a/71_RayTracingPipeline/app_resources/common.hlsl +++
b/71_RayTracingPipeline/app_resources/common.hlsl @@ -2,6 +2,7 @@ #define RQG_COMMON_HLSL #include "nbl/builtin/hlsl/cpp_compat.hlsl" +#include "nbl/builtin/hlsl/cpp_compat/basic.h" NBL_CONSTEXPR uint32_t WorkgroupSize = 16; NBL_CONSTEXPR uint32_t MAX_UNORM_10 = 1023; @@ -67,28 +68,6 @@ struct MaterialPacked } }; -inline MaterialPacked packMaterial(Material material) -{ - MaterialPacked packed; - packed.ambient = packUnorm3x10(material.ambient); - packed.diffuse = packUnorm3x10(material.diffuse); - packed.specular = packUnorm3x10(material.specular); - packed.shininess = packUnorm22(material.shininess); - packed.alpha = packUnorm10(material.alpha); - return packed; -} - -inline Material unpackMaterial(MaterialPacked packed) -{ - Material material; - material.ambient = unpackUnorm3x10(packed.ambient); - material.diffuse = unpackUnorm3x10(packed.diffuse); - material.specular = unpackUnorm3x10(packed.specular); - material.shininess = unpackUnorm22(packed.shininess); - material.alpha = unpackUnorm10(packed.alpha); - return material; -} - struct SProceduralGeomInfo { MaterialPacked material; @@ -236,4 +215,45 @@ float32_t3 computeSpecular(Material mat, float32_t3 view_dir, } #endif +namespace nbl +{ +namespace hlsl +{ +namespace impl +{ + +template<> +struct static_cast_helper +{ + static inline Material cast(MaterialPacked packed) + { + Material material; + material.ambient = unpackUnorm3x10(packed.ambient); + material.diffuse = unpackUnorm3x10(packed.diffuse); + material.specular = unpackUnorm3x10(packed.specular); + material.shininess = unpackUnorm22(packed.shininess); + material.alpha = unpackUnorm10(packed.alpha); + return material; + } +}; + +template<> +struct static_cast_helper +{ + static inline MaterialPacked cast(Material material) + { + MaterialPacked packed; + packed.ambient = packUnorm3x10(material.ambient); + packed.diffuse = packUnorm3x10(material.diffuse); + packed.specular = packUnorm3x10(material.specular); + packed.shininess = 
packUnorm22(material.shininess); + packed.alpha = packUnorm10(material.alpha); + return packed; + } +}; + +} +} +} + #endif // RQG_COMMON_HLSL diff --git a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl index 2923e95d9..c499e0506 100644 --- a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl @@ -7,7 +7,7 @@ void main(inout PrimaryPayload payload, in BuiltInTriangleIntersectionAttributes { const int instID = InstanceID(); const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo)); - const Material material = unpackMaterial(geom.material); + const Material material = nbl::hlsl::_static_cast(geom.material); if (material.alpha > payload.alphaThreshold) { diff --git a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl index c74774880..bd8f6dcba 100644 --- a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl @@ -72,7 +72,7 @@ void main() const float32_t3 worldPosition = pc.camPos + (camDirection * payload.rayDistance); const float32_t3 worldNormal = payload.worldNormal; - const Material material = unpackMaterial(payload.material); + const Material material = nbl::hlsl::_static_cast(payload.material); RayLight cLight; cLight.inHitPosition = worldPosition; CallShader(pc.light.type, cLight); diff --git a/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl index e76f1da55..88a9b79db 100644 --- a/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl @@ -7,7 +7,7 @@ void main(inout OcclusionPayload payload, in BuiltInTriangleIntersectionAttribut { const int instID = InstanceID(); const 
STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo)); - const Material material = unpackMaterial(geom.material); + const Material material = nbl::hlsl::_static_cast(geom.material); payload.attenuation = material.alpha * payload.attenuation; IgnoreHit(); diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index 9a85ea423..363d3b59f 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -1253,7 +1253,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, .vertexStride = cpuObject.data.inputParams.bindings[0].stride, .indexType = cpuObject.data.indexType, .indexCount = cpuObject.data.indexCount, - .material = packMaterial(cpuObject.material), + .material = hlsl::_static_cast(cpuObject.material), .transform = cpuObject.transform, }); } @@ -1292,7 +1292,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, { const auto middle_i = NumberOfProceduralGeometries / 2.0; SProceduralGeomInfo sphere = { - .material = packMaterial({ + .material = hlsl::_static_cast(Material{ .ambient = {0.1, 0.05 * i, 0.1}, .diffuse = {0.3, 0.2 * i, 0.3}, .specular = {0.8, 0.8, 0.8}, From 88b3275c0e40b7b78a4e842a7fb493be572155af Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 17 Mar 2025 22:23:52 +0700 Subject: [PATCH 084/296] Fix create shader binding table to use the new span api --- 71_RayTracingPipeline/main.cpp | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index 363d3b59f..036acd510 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -3,6 +3,7 @@ // For conditions of distribution and use, see copyright notice in nabla.h #include "common.hpp" +#include "nbl/builtin/builtinResources.h" #include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h" #include 
"nbl/builtin/hlsl/indirect_commands.hlsl" @@ -377,12 +378,12 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_PRIMARY)] = { .closestHit = RTDS_SPHERE_CLOSEST_HIT, .anyHit = RTDS_ANYHIT_PRIMARY, - .intersectionShader = RTDS_INTERSECTION, + .intersection = RTDS_INTERSECTION, }; hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_OCCLUSION)] = { .closestHit = RTDS_CLOSEST_HIT_SHADOW, .anyHit = RTDS_ANYHIT_SHADOW, - .intersectionShader = RTDS_INTERSECTION, + .intersection = RTDS_INTERSECTION, }; shaderGroups.hits = hitGroups; @@ -1335,9 +1336,15 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, const auto handleSizeAligned = nbl::core::alignUp(handleSize, limits.shaderGroupHandleAlignment); auto& raygenRange = m_shaderBindingTable.raygenGroupRange; + auto& hitRange = m_shaderBindingTable.hitGroupsRange; + const auto hitHandles = pipeline->getHitHandles(); + auto& missRange = m_shaderBindingTable.missGroupsRange; + const auto missHandles = pipeline->getMissHandles(); + auto& callableRange = m_shaderBindingTable.callableGroupsRange; + const auto callableHandles = pipeline->getCallableHandles(); raygenRange = { .offset = 0, @@ -1346,19 +1353,19 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, missRange = { .offset = raygenRange.size, - .size = core::alignUp(pipeline->getMissGroupCount() * handleSizeAligned, limits.shaderGroupBaseAlignment), + .size = core::alignUp(missHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment), }; m_shaderBindingTable.missGroupsStride = handleSizeAligned; hitRange = { .offset = missRange.offset + missRange.size, - .size = core::alignUp(pipeline->getHitGroupCount() * handleSizeAligned, limits.shaderGroupBaseAlignment), + .size = core::alignUp(hitHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment), }; m_shaderBindingTable.hitGroupsStride = handleSizeAligned; callableRange = 
{ .offset = hitRange.offset + hitRange.size, - .size = core::alignUp(pipeline->getCallableGroupCount() * handleSizeAligned, limits.shaderGroupBaseAlignment), + .size = core::alignUp(callableHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment), }; m_shaderBindingTable.callableGroupsStride = handleSizeAligned; @@ -1374,25 +1381,25 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, // copy miss region uint8_t* pMissData = pData + missRange.offset; - for (int32_t missIx = 0; missIx < pipeline->getMissGroupCount(); missIx++) + for (const auto& handle : missHandles) { - memcpy(pMissData, &pipeline->getMiss(missIx), handleSize); + memcpy(pMissData, &handle, handleSize); pMissData += m_shaderBindingTable.missGroupsStride; } // copy hit region uint8_t* pHitData = pData + hitRange.offset; - for (int32_t hitIx = 0; hitIx < pipeline->getHitGroupCount(); hitIx++) + for (const auto& handle : hitHandles) { - memcpy(pHitData, &pipeline->getHit(hitIx), handleSize); + memcpy(pHitData, &handle, handleSize); pHitData += m_shaderBindingTable.hitGroupsStride; } // copy callable region uint8_t* pCallableData = pData + callableRange.offset; - for (int32_t callableIx = 0; callableIx < pipeline->getCallableGroupCount(); callableIx++) + for (const auto& handle : callableHandles) { - memcpy(pCallableData, &pipeline->getCallable(callableIx), handleSize); + memcpy(pCallableData, &handle, handleSize); pCallableData += m_shaderBindingTable.callableGroupsStride; } From cca8f7248bc63c4ba2fe786e9c77a594d02c99e9 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 18 Mar 2025 21:10:05 +0700 Subject: [PATCH 085/296] Some optimization on ray tracing demo --- .../app_resources/raytrace.rgen.hlsl | 22 +++++++------------ 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl index bd8f6dcba..fc6383dcf 100644 --- 
a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl @@ -2,6 +2,7 @@ #include "nbl/builtin/hlsl/jit/device_capabilities.hlsl" #include "nbl/builtin/hlsl/random/xoroshiro.hlsl" +#include "nbl/builtin/hlsl/random/pcg.hlsl" #include "nbl/builtin/hlsl/glsl_compat/core.hlsl" #include "nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl" @@ -15,13 +16,6 @@ static const float32_t3 s_clearColor = float32_t3(0.3, 0.3, 0.8); [[vk::binding(1, 0)]] RWTexture2D colorImage; -uint32_t pcgHash(uint32_t v) -{ - const uint32_t state = v * 747796405u + 2891336453u; - const uint32_t word = ((state >> ((state >> 28u) + 4u)) ^ state) * 277803737u; - return (word >> 22u) ^ word; -} - float32_t nextRandomUnorm(inout nbl::hlsl::Xoroshiro64StarStar rnd) { return float32_t(rnd()) / float32_t(0xFFFFFFFF); @@ -34,8 +28,8 @@ void main() const uint32_t3 launchSize = DispatchRaysDimensions(); const uint32_t2 coords = launchID.xy; - const uint32_t seed1 = pcgHash(pc.frameCounter); - const uint32_t seed2 = pcgHash(launchID.y * launchSize.x + launchID.x); + const uint32_t seed1 = nbl::hlsl::Pcg::construct(pc.frameCounter)(); + const uint32_t seed2 = nbl::hlsl::Pcg::construct(launchID.y * launchSize.x + launchID.x)(); nbl::hlsl::Xoroshiro64StarStar rnd = nbl::hlsl::Xoroshiro64StarStar::construct(uint32_t2(seed1, seed2)); float32_t3 hitValues = float32_t3(0, 0, 0); @@ -77,8 +71,6 @@ void main() cLight.inHitPosition = worldPosition; CallShader(pc.light.type, cLight); - const float32_t3 diffuse = computeDiffuse(material, cLight.outLightDir, worldNormal); - float32_t3 specular = float32_t3(0, 0, 0); float32_t attenuation = 1; if (dot(worldNormal, cLight.outLightDir) > 0) @@ -95,12 +87,14 @@ void main() TraceRay(topLevelAS, shadowRayFlags, 0xFF, ERT_OCCLUSION, 0, EMT_OCCLUSION, rayDesc, occlusionPayload); attenuation = occlusionPayload.attenuation; - if (occlusionPayload.attenuation > 0) + if (occlusionPayload.attenuation > 0.0001) { - 
specular = computeSpecular(material, camDirection, cLight.outLightDir, worldNormal); + const float32_t3 diffuse = computeDiffuse(material, cLight.outLightDir, worldNormal); + const float32_t3 specular = computeSpecular(material, camDirection, cLight.outLightDir, worldNormal); + hitValues += (cLight.outIntensity * attenuation * (diffuse + specular)); } } - hitValues += ((cLight.outIntensity * attenuation * (diffuse + specular)) + material.ambient); + hitValues += material.ambient; } const float32_t3 hitValue = hitValues / s_sampleCount; From b483aa6b7474526bb6c89e05c1965ff43880cfde Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 19 Mar 2025 10:42:37 +0700 Subject: [PATCH 086/296] better hlsl dispatch --- .../app_resources/hlsl/render.comp.hlsl | 8 ++++--- 31_HLSLPathTracer/main.cpp | 21 +++++++++++-------- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl index f8cf2ae22..b54f5721d 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl @@ -43,13 +43,15 @@ using namespace nbl::hlsl; -NBL_CONSTEXPR uint32_t WorkgroupSize = 32; +NBL_CONSTEXPR uint32_t WorkgroupSize = 256; NBL_CONSTEXPR uint32_t MAX_DEPTH_LOG2 = 4; NBL_CONSTEXPR uint32_t MAX_SAMPLES_LOG2 = 10; int32_t2 getCoordinates() { - return int32_t2(glsl::gl_GlobalInvocationID().xy); + uint32_t width, height; + outImage.GetDimensions(width, height); + return int32_t2(glsl::gl_GlobalInvocationID().x % width, glsl::gl_GlobalInvocationID().x / width); } float32_t2 getTexCoords() @@ -141,7 +143,7 @@ static const ext::Scene scene = ext::ScenegetPhysicalDevice()->getLimits().spirvVersion; options.spirvOptimizer = nullptr; -//#ifndef _NBL_DEBUG -// ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO; -// auto opt = make_smart_refctd_ptr(std::span(&optPasses, 1)); -// options.spirvOptimizer = 
opt.get(); -//#endif - options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_SOURCE_BIT; +#ifndef _NBL_DEBUG + ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO; + auto opt = make_smart_refctd_ptr(std::span(&optPasses, 1)); + options.spirvOptimizer = opt.get(); +#endif + options.debugInfoFlags = IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_NONE; options.preprocessorOptions.sourceIdentifier = source->getFilepathHint(); options.preprocessorOptions.logger = m_logger.get(); options.preprocessorOptions.includeFinder = compiler->getDefaultIncludeFinder(); @@ -418,8 +418,8 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, params.shader.shader = ptShader.get(); params.shader.entryPoint = "main"; params.shader.entries = nullptr; - params.shader.requireFullSubgroups = true; - params.shader.requiredSubgroupSize = static_cast(5); + params.shader.requireFullSubgroups = true; + params.shader.requiredSubgroupSize = static_cast(5); if (!m_device->createComputePipelines(nullptr, { ¶ms, 1 }, m_PTGLSLPipelines.data() + index)) return logFail("Failed to create GLSL compute pipeline!\n"); } @@ -1068,7 +1068,10 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &m_descriptorSet0.get()); cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 2u, 1u, &m_descriptorSet2.get()); cmdbuf->pushConstants(pipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(PTPushConstant), &pc); - cmdbuf->dispatch(1 + (WindowDimensions.x - 1) / DefaultWorkGroupSize, 1 + (WindowDimensions.y - 1) / DefaultWorkGroupSize, 1u); + if (renderMode == E_RENDER_MODE::ERM_HLSL) + cmdbuf->dispatch(1 + (WindowDimensions.x * WindowDimensions.y - 1) / 256u, 1u, 1u); + else + cmdbuf->dispatch(1 + (WindowDimensions.x - 1) / DefaultWorkGroupSize, 1 + (WindowDimensions.y - 1) / DefaultWorkGroupSize, 1u); } // TRANSITION 
m_outImgView to READ (because of descriptorSets0 -> ComputeShader Writes into the image) From 773733d3bed7f17073ff02af29d16700767988a9 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 19 Mar 2025 15:23:06 +0700 Subject: [PATCH 087/296] refactor NEE to use templated light types and sampling --- .../app_resources/hlsl/common.hlsl | 272 +------- .../hlsl/next_event_estimator.hlsl | 579 ++++++++++++------ .../app_resources/hlsl/pathtracer.hlsl | 75 +-- .../app_resources/hlsl/render.comp.hlsl | 22 +- .../app_resources/hlsl/scene.hlsl | 139 ----- 31_HLSLPathTracer/main.cpp | 9 +- 6 files changed, 413 insertions(+), 683 deletions(-) diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl index d5cbbea81..dea682c8b 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl @@ -35,6 +35,7 @@ struct Payload enum ProceduralShapeType : uint16_t { + PST_NONE = 0, PST_SPHERE, PST_TRIANGLE, PST_RECTANGLE @@ -173,33 +174,6 @@ enum PTPolygonMethod : uint16_t PPM_APPROX_PROJECTED_SOLID_ANGLE }; -// namespace Intersector -// { -// // ray query method -// // ray query struct holds AS info -// // pass in address to vertex/index buffers? - -// // ray tracing pipeline method - -// // procedural data store: [obj count] [intersect type] [obj1] [obj2] [...] - -// struct IntersectData -// { -// enum Mode : uint32_t // enum class? 
-// { -// RAY_QUERY, -// RAY_TRACING, -// PROCEDURAL -// }; - -// NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 128; - -// uint32_t mode : 2; -// uint32_t unused : 30; // possible space for flags -// uint32_t data[DataSize]; -// }; -// } - enum IntersectMode : uint32_t { IM_RAY_QUERY, @@ -207,20 +181,6 @@ enum IntersectMode : uint32_t IM_PROCEDURAL }; -namespace NextEventEstimator -{ -// procedural data store: [light count] [event type] [obj] - -struct Event -{ - NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 16; - - uint32_t mode : 2; - uint32_t unused : 30; // possible space for flags - uint32_t data[DataSize]; -}; -} - template struct Shape; @@ -269,45 +229,6 @@ struct Shape return 2.0 * numbers::pi * (1.0 - cosThetaMax); } - template - float deferredPdf(NBL_CONST_REF_ARG(Ray) ray) - { - return 1.0 / getSolidAngle(ray.origin); - } - - template - float32_t3 generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, NBL_CONST_REF_ARG(float32_t3) xi) - { - float32_t3 Z = position - origin; - const float distanceSQ = hlsl::dot(Z,Z); - const float cosThetaMax2 = 1.0 - radius2 / distanceSQ; - if (cosThetaMax2 > 0.0) - { - const float rcpDistance = 1.0 / hlsl::sqrt(distanceSQ); - Z *= rcpDistance; - - const float cosThetaMax = hlsl::sqrt(cosThetaMax2); - const float cosTheta = hlsl::mix(1.0, cosThetaMax, xi.x); - - float32_t3 L = Z * cosTheta; - - const float cosTheta2 = cosTheta * cosTheta; - const float sinTheta = hlsl::sqrt(1.0 - cosTheta2); - float sinPhi, cosPhi; - math::sincos(2.0 * numbers::pi * xi.y - numbers::pi, sinPhi, cosPhi); - float32_t3 X, Y; - math::frisvad(Z, X, Y); - - L += (X * cosPhi + Y * sinPhi) * sinTheta; - - newRayMaxT = (cosTheta - hlsl::sqrt(cosTheta2 - cosThetaMax2)) / rcpDistance; - pdf = 1.0 / (2.0 * numbers::pi * (1.0 - cosThetaMax)); - return L; - } - pdf = 0.0; - return float32_t3(0.0,0.0,0.0); - } - 
NBL_CONSTEXPR_STATIC_INLINE uint32_t ObjSize = 5; float32_t3 position; @@ -361,100 +282,6 @@ struct Shape return hlsl::cross(edges[0], edges[1]) * 0.5f; } - template - float deferredPdf(NBL_CONST_REF_ARG(Ray) ray) - { - const float32_t3 L = ray.direction; - switch (polygonMethod) - { - case PPM_AREA: - { - const float dist = ray.intersectionT; - const float32_t3 L = ray.direction; - return dist * dist / hlsl::abs(hlsl::dot(getNormalTimesArea(), L)); - } - break; - case PPM_SOLID_ANGLE: - { - shapes::SphericalTriangle st = shapes::SphericalTriangle::create(vertex0, vertex1, vertex2, ray.origin); - const float rcpProb = st.solidAngleOfTriangle(); - // if `rcpProb` is NAN then the triangle's solid angle was close to 0.0 - return rcpProb > numeric_limits::min ? (1.0 / rcpProb) : numeric_limits::max; - } - break; - case PPM_APPROX_PROJECTED_SOLID_ANGLE: - { - shapes::SphericalTriangle st = shapes::SphericalTriangle::create(vertex0, vertex1, vertex2, ray.origin); - sampling::ProjectedSphericalTriangle pst = sampling::ProjectedSphericalTriangle::create(st); - const float pdf = pst.pdf(ray.normalAtOrigin, ray.wasBSDFAtOrigin, L); - // if `pdf` is NAN then the triangle's projected solid angle was close to 0.0, if its close to INF then the triangle was very small - return pdf < numeric_limits::max ? 
pdf : numeric_limits::max; - } - break; - default: - return 0.0; - } - } - - template - float32_t3 generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, float32_t3 xi) - { - switch(polygonMethod) - { - case PPM_AREA: - { - const float32_t3 edge0 = vertex1 - vertex0; - const float32_t3 edge1 = vertex2 - vertex0; - const float sqrtU = hlsl::sqrt(xi.x); - float32_t3 pnt = vertex0 + edge0 * (1.0 - sqrtU) + edge1 * sqrtU * xi.y; - float32_t3 L = pnt - origin; - - const float distanceSq = hlsl::dot(L,L); - const float rcpDistance = 1.0 / hlsl::sqrt(distanceSq); - L *= rcpDistance; - - pdf = distanceSq / hlsl::abs(hlsl::dot(hlsl::cross(edge0, edge1) * 0.5f, L)); - newRayMaxT = 1.0 / rcpDistance; - return L; - } - break; - case PPM_SOLID_ANGLE: - { - float rcpPdf; - - shapes::SphericalTriangle st = shapes::SphericalTriangle::create(vertex0, vertex1, vertex2, origin); - sampling::SphericalTriangle sst = sampling::SphericalTriangle::create(st); - - const float32_t3 L = sst.generate(rcpPdf, xi.xy); - - pdf = rcpPdf > numeric_limits::min ? (1.0 / rcpPdf) : numeric_limits::max; - - const float32_t3 N = getNormalTimesArea(); - newRayMaxT = hlsl::dot(N, vertex0 - origin) / hlsl::dot(N, L); - return L; - } - break; - case PPM_APPROX_PROJECTED_SOLID_ANGLE: - { - float rcpPdf; - - shapes::SphericalTriangle st = shapes::SphericalTriangle::create(vertex0, vertex1, vertex2, origin); - sampling::ProjectedSphericalTriangle sst = sampling::ProjectedSphericalTriangle::create(st); - - const float32_t3 L = sst.generate(rcpPdf, interaction.isotropic.N, isBSDF, xi.xy); - - pdf = rcpPdf > numeric_limits::min ? 
(1.0 / rcpPdf) : numeric_limits::max; - - const float32_t3 N = getNormalTimesArea(); - newRayMaxT = hlsl::dot(N, vertex0 - origin) / hlsl::dot(N, L); - return L; - } - break; - default: - return (float32_t3)0.0; - } - } - NBL_CONSTEXPR_STATIC_INLINE uint32_t ObjSize = 10; float32_t3 vertex0; @@ -515,103 +342,6 @@ struct Shape basis[2] = normalize(cross(basis[0],basis[1])); } - template - float deferredPdf(NBL_CONST_REF_ARG(Ray) ray) - { - switch (polygonMethod) - { - case PPM_AREA: - { - const float dist = ray.intersectionT; - const float32_t3 L = ray.direction; - return dist * dist / hlsl::abs(hlsl::dot(getNormalTimesArea(), L)); - } - break; - // #ifdef TRIANGLE_REFERENCE ? - case PPM_SOLID_ANGLE: - { - float pdf; - float32_t3x3 rectNormalBasis; - float32_t2 rectExtents; - getNormalBasis(rectNormalBasis, rectExtents); - shapes::SphericalRectangle sphR0 = shapes::SphericalRectangle::create(ray.origin, offset, rectNormalBasis); - float solidAngle = sphR0.solidAngleOfRectangle(rectExtents); - if (solidAngle > numeric_limits::min) - pdf = 1.f / solidAngle; - else - pdf = bit_cast(numeric_limits::infinity); - return pdf; - } - break; - case PPM_APPROX_PROJECTED_SOLID_ANGLE: - { - // currently broken - return bit_cast(numeric_limits::infinity); - } - break; - default: - return bit_cast(numeric_limits::infinity); - } - } - - template - float32_t3 generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, float32_t3 xi) - { - const float32_t3 N = getNormalTimesArea(); - const float32_t3 origin2origin = offset - origin; - - switch (polygonMethod) - { - case PPM_AREA: - { - float32_t3 L = origin2origin + edge0 * xi.x + edge1 * xi.y; - const float distSq = hlsl::dot(L, L); - const float rcpDist = 1.0 / hlsl::sqrt(distSq); - L *= rcpDist; - pdf = distSq / hlsl::abs(hlsl::dot(N, L)); - newRayMaxT = 1.0 / rcpDist; - return L; - } - break; - // #ifdef TRIANGLE_REFERENCE 
? - case PPM_SOLID_ANGLE: - { - float32_t3x3 rectNormalBasis; - float32_t2 rectExtents; - getNormalBasis(rectNormalBasis, rectExtents); - shapes::SphericalRectangle sphR0 = shapes::SphericalRectangle::create(origin, offset, rectNormalBasis); - float32_t3 L = (float32_t3)0.0; - float solidAngle = sphR0.solidAngleOfRectangle(rectExtents); - - sampling::SphericalRectangle ssph = sampling::SphericalRectangle::create(sphR0); - float32_t2 sphUv = ssph.generate(rectExtents, xi.xy, solidAngle); - if (solidAngle > numeric_limits::min) - { - float32_t3 sph_sample = sphUv[0] * edge0 + sphUv[1] * edge1 + offset; - L = sph_sample - origin; - L = hlsl::mix(nbl::hlsl::normalize(L), (float32_t3)0.0, hlsl::abs(L) > (float32_t3)numeric_limits::min); // TODO? sometimes L is vec3(0), find cause - pdf = 1.f / solidAngle; - } - else - pdf = bit_cast(numeric_limits::infinity); - - newRayMaxT = hlsl::dot(N, origin2origin) / hlsl::dot(N, L); - return L; - } - break; - case PPM_APPROX_PROJECTED_SOLID_ANGLE: - { - // currently broken - pdf = bit_cast(numeric_limits::infinity); - return (float32_t3)0.0; - } - break; - default: - pdf = bit_cast(numeric_limits::infinity); - return (float32_t3)0.0; - } - } - NBL_CONSTEXPR_STATIC_INLINE uint32_t ObjSize = 10; float32_t3 offset; diff --git a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl index 9c41f6627..7c157aadf 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl @@ -12,220 +12,425 @@ namespace ext namespace NextEventEstimator { -template -struct Estimator +template +struct ShapeSampling; + +template +struct ShapeSampling +{ + static ShapeSampling create(NBL_CONST_REF_ARG(Shape) sphere) + { + ShapeSampling retval; + retval.sphere = sphere; + return retval; + } + + template + float deferredPdf(NBL_CONST_REF_ARG(Ray) ray) + { + return 1.0 / sphere.getSolidAngle(ray.origin); + 
} + + template + float32_t3 generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, NBL_CONST_REF_ARG(float32_t3) xi) + { + float32_t3 Z = sphere.position - origin; + const float distanceSQ = hlsl::dot(Z,Z); + const float cosThetaMax2 = 1.0 - sphere.radius2 / distanceSQ; + if (cosThetaMax2 > 0.0) + { + const float rcpDistance = 1.0 / hlsl::sqrt(distanceSQ); + Z *= rcpDistance; + + const float cosThetaMax = hlsl::sqrt(cosThetaMax2); + const float cosTheta = hlsl::mix(1.0, cosThetaMax, xi.x); + + float32_t3 L = Z * cosTheta; + + const float cosTheta2 = cosTheta * cosTheta; + const float sinTheta = hlsl::sqrt(1.0 - cosTheta2); + float sinPhi, cosPhi; + math::sincos(2.0 * numbers::pi * xi.y - numbers::pi, sinPhi, cosPhi); + float32_t3 X, Y; + math::frisvad(Z, X, Y); + + L += (X * cosPhi + Y * sinPhi) * sinTheta; + + newRayMaxT = (cosTheta - hlsl::sqrt(cosTheta2 - cosThetaMax2)) / rcpDistance; + pdf = 1.0 / (2.0 * numbers::pi * (1.0 - cosThetaMax)); + return L; + } + pdf = 0.0; + return float32_t3(0.0,0.0,0.0); + } + + Shape sphere; +}; + +template<> +struct ShapeSampling +{ + static ShapeSampling create(NBL_CONST_REF_ARG(Shape) tri) + { + ShapeSampling retval; + retval.tri = tri; + return retval; + } + + template + float deferredPdf(NBL_CONST_REF_ARG(Ray) ray) + { + const float dist = ray.intersectionT; + const float32_t3 L = ray.direction; + return dist * dist / hlsl::abs(hlsl::dot(tri.getNormalTimesArea(), L)); + } + + template + float32_t3 generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, NBL_CONST_REF_ARG(float32_t3) xi) + { + const float32_t3 edge0 = tri.vertex1 - tri.vertex0; + const float32_t3 edge1 = tri.vertex2 - tri.vertex0; + const float sqrtU = hlsl::sqrt(xi.x); + float32_t3 pnt = tri.vertex0 + edge0 * (1.0 - sqrtU) + edge1 * 
sqrtU * xi.y; + float32_t3 L = pnt - origin; + + const float distanceSq = hlsl::dot(L,L); + const float rcpDistance = 1.0 / hlsl::sqrt(distanceSq); + L *= rcpDistance; + + pdf = distanceSq / hlsl::abs(hlsl::dot(hlsl::cross(edge0, edge1) * 0.5f, L)); + newRayMaxT = 1.0 / rcpDistance; + return L; + } + + Shape tri; +}; + +template<> +struct ShapeSampling +{ + static ShapeSampling create(NBL_CONST_REF_ARG(Shape) tri) + { + ShapeSampling retval; + retval.tri = tri; + return retval; + } + + template + float deferredPdf(NBL_CONST_REF_ARG(Ray) ray) + { + shapes::SphericalTriangle st = shapes::SphericalTriangle::create(tri.vertex0, tri.vertex1, tri.vertex2, ray.origin); + const float rcpProb = st.solidAngleOfTriangle(); + // if `rcpProb` is NAN then the triangle's solid angle was close to 0.0 + return rcpProb > numeric_limits::min ? (1.0 / rcpProb) : numeric_limits::max; + } + + template + float32_t3 generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, NBL_CONST_REF_ARG(float32_t3) xi) + { + float rcpPdf; + shapes::SphericalTriangle st = shapes::SphericalTriangle::create(tri.vertex0, tri.vertex1, tri.vertex2, origin); + sampling::SphericalTriangle sst = sampling::SphericalTriangle::create(st); + + const float32_t3 L = sst.generate(rcpPdf, xi.xy); + + pdf = rcpPdf > numeric_limits::min ? 
(1.0 / rcpPdf) : numeric_limits::max; + + const float32_t3 N = tri.getNormalTimesArea(); + newRayMaxT = hlsl::dot(N, tri.vertex0 - origin) / hlsl::dot(N, L); + return L; + } + + Shape tri; +}; + +template<> +struct ShapeSampling +{ + static ShapeSampling create(NBL_CONST_REF_ARG(Shape) tri) + { + ShapeSampling retval; + retval.tri = tri; + return retval; + } + + template + float deferredPdf(NBL_CONST_REF_ARG(Ray) ray) + { + const float32_t3 L = ray.direction; + shapes::SphericalTriangle st = shapes::SphericalTriangle::create(tri.vertex0, tri.vertex1, tri.vertex2, ray.origin); + sampling::ProjectedSphericalTriangle pst = sampling::ProjectedSphericalTriangle::create(st); + const float pdf = pst.pdf(ray.normalAtOrigin, ray.wasBSDFAtOrigin, L); + // if `pdf` is NAN then the triangle's projected solid angle was close to 0.0, if its close to INF then the triangle was very small + return pdf < numeric_limits::max ? pdf : numeric_limits::max; + } + + template + float32_t3 generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, NBL_CONST_REF_ARG(float32_t3) xi) + { + float rcpPdf; + shapes::SphericalTriangle st = shapes::SphericalTriangle::create(tri.vertex0, tri.vertex1, tri.vertex2, origin); + sampling::ProjectedSphericalTriangle sst = sampling::ProjectedSphericalTriangle::create(st); + + const float32_t3 L = sst.generate(rcpPdf, interaction.isotropic.N, isBSDF, xi.xy); + + pdf = rcpPdf > numeric_limits::min ? 
(1.0 / rcpPdf) : numeric_limits::max; + + const float32_t3 N = tri.getNormalTimesArea(); + newRayMaxT = hlsl::dot(N, tri.vertex0 - origin) / hlsl::dot(N, L); + return L; + } + + Shape tri; +}; + +template<> +struct ShapeSampling +{ + static ShapeSampling create(NBL_CONST_REF_ARG(Shape) rect) + { + ShapeSampling retval; + retval.rect = rect; + return retval; + } + + template + float deferredPdf(NBL_CONST_REF_ARG(Ray) ray) + { + const float dist = ray.intersectionT; + const float32_t3 L = ray.direction; + return dist * dist / hlsl::abs(hlsl::dot(rect.getNormalTimesArea(), L)); + } + + template + float32_t3 generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, NBL_CONST_REF_ARG(float32_t3) xi) + { + const float32_t3 N = rect.getNormalTimesArea(); + const float32_t3 origin2origin = rect.offset - origin; + + float32_t3 L = origin2origin + rect.edge0 * xi.x + rect.edge1 * xi.y; + const float distSq = hlsl::dot(L, L); + const float rcpDist = 1.0 / hlsl::sqrt(distSq); + L *= rcpDist; + pdf = distSq / hlsl::abs(hlsl::dot(N, L)); + newRayMaxT = 1.0 / rcpDist; + return L; + } + + Shape rect; +}; + +template<> +struct ShapeSampling +{ + static ShapeSampling create(NBL_CONST_REF_ARG(Shape) rect) + { + ShapeSampling retval; + retval.rect = rect; + return retval; + } + + template + float deferredPdf(NBL_CONST_REF_ARG(Ray) ray) + { + float pdf; + float32_t3x3 rectNormalBasis; + float32_t2 rectExtents; + rect.getNormalBasis(rectNormalBasis, rectExtents); + shapes::SphericalRectangle sphR0 = shapes::SphericalRectangle::create(ray.origin, rect.offset, rectNormalBasis); + float solidAngle = sphR0.solidAngleOfRectangle(rectExtents); + if (solidAngle > numeric_limits::min) + pdf = 1.f / solidAngle; + else + pdf = bit_cast(numeric_limits::infinity); + return pdf; + } + + template + float32_t3 generate_and_pdf(NBL_REF_ARG(float32_t) pdf, NBL_REF_ARG(float32_t) newRayMaxT, 
NBL_CONST_REF_ARG(float32_t3) origin, NBL_CONST_REF_ARG(Aniso) interaction, bool isBSDF, NBL_CONST_REF_ARG(float32_t3) xi) + { + const float32_t3 N = rect.getNormalTimesArea(); + const float32_t3 origin2origin = rect.offset - origin; + + float32_t3x3 rectNormalBasis; + float32_t2 rectExtents; + rect.getNormalBasis(rectNormalBasis, rectExtents); + shapes::SphericalRectangle sphR0 = shapes::SphericalRectangle::create(origin, rect.offset, rectNormalBasis); + float32_t3 L = (float32_t3)0.0; + float solidAngle = sphR0.solidAngleOfRectangle(rectExtents); + + sampling::SphericalRectangle ssph = sampling::SphericalRectangle::create(sphR0); + float32_t2 sphUv = ssph.generate(rectExtents, xi.xy, solidAngle); + if (solidAngle > numeric_limits::min) + { + float32_t3 sph_sample = sphUv[0] * rect.edge0 + sphUv[1] * rect.edge1 + rect.offset; + L = sph_sample - origin; + L = hlsl::mix(nbl::hlsl::normalize(L), (float32_t3)0.0, hlsl::abs(L) > (float32_t3)numeric_limits::min); // TODO? sometimes L is vec3(0), find cause + pdf = 1.f / solidAngle; + } + else + pdf = bit_cast(numeric_limits::infinity); + + newRayMaxT = hlsl::dot(N, origin2origin) / hlsl::dot(N, L); + return L; + } + + Shape rect; +}; + +// PPM_APPROX_PROJECTED_SOLID_ANGLE not available for PST_TRIANGLE + + +template +struct Estimator; + +template +struct Estimator { using scalar_type = typename Ray::scalar_type; using vector3_type = vector; using ray_type = Ray; - using light_type = Light; - using spectral_type = typename Light::spectral_type; + using scene_type = Scene; + using light_type = typename Scene::light_type; + using spectral_type = typename light_type::spectral_type; using interaction_type = Aniso; using quotient_pdf_type = bxdf::quotient_and_pdf; using sample_type = LightSample; using ray_dir_info_type = typename sample_type::ray_dir_info_type; - static spectral_type proceduralDeferredEvalAndPdf(NBL_REF_ARG(scalar_type) pdf, NBL_CONST_REF_ARG(light_type) light, NBL_CONST_REF_ARG(ray_type) ray, 
NBL_CONST_REF_ARG(Event) event) + static spectral_type deferredEvalAndPdf(NBL_REF_ARG(scalar_type) pdf, NBL_CONST_REF_ARG(scene_type) scene, uint32_t lightID, NBL_CONST_REF_ARG(ray_type) ray) { - const uint32_t lightCount = event.data[0]; - const ProceduralShapeType type = (ProceduralShapeType)event.data[1]; - - pdf = 1.0 / lightCount; - switch (type) - { - case PST_SPHERE: - { - const vector3_type position = vector3_type( - bit_cast(event.data[2]), - bit_cast(event.data[3]), - bit_cast(event.data[4])); - Shape sphere = Shape::create(position, bit_cast(event.data[5]), event.data[6]); - pdf *= sphere.template deferredPdf(ray); - } - break; - case PST_TRIANGLE: - { - const vector3_type vertex0 = vector3_type( - bit_cast(event.data[2]), - bit_cast(event.data[3]), - bit_cast(event.data[4])); - const vector3_type vertex1 = vector3_type( - bit_cast(event.data[5]), - bit_cast(event.data[6]), - bit_cast(event.data[7])); - const vector3_type vertex2 = vector3_type( - bit_cast(event.data[8]), - bit_cast(event.data[9]), - bit_cast(event.data[10])); - Shape tri = Shape::create(vertex0, vertex1, vertex2, event.data[11]); - pdf *= tri.template deferredPdf(ray); - } - break; - case PST_RECTANGLE: - { - const vector3_type offset = vector3_type( - bit_cast(event.data[2]), - bit_cast(event.data[3]), - bit_cast(event.data[4])); - const vector3_type edge0 = vector3_type( - bit_cast(event.data[5]), - bit_cast(event.data[6]), - bit_cast(event.data[7])); - const vector3_type edge1 = vector3_type( - bit_cast(event.data[8]), - bit_cast(event.data[9]), - bit_cast(event.data[10])); - Shape rect = Shape::create(offset, edge0, edge1, event.data[11]); - pdf *= rect.template deferredPdf(ray); - } - break; - default: - pdf = bit_cast(numeric_limits::infinity); - break; - } + pdf = 1.0 / scene.lightCount; + const light_type light = scene.lights[lightID]; + const Shape sphere = scene.spheres[light.objectID.id]; + const ShapeSampling sampling = ShapeSampling::create(sphere); + pdf *= 
sampling.template deferredPdf(ray); return light.radiance; } - static spectral_type deferredEvalAndPdf(NBL_REF_ARG(scalar_type) pdf, NBL_CONST_REF_ARG(light_type) light, NBL_CONST_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(Event) event) - { - // const IntersectMode mode = (IntersectMode)event.mode; - // switch (mode) - // { - // case IM_RAY_QUERY: - // { - // // TODO: do ray query stuff - // } - // break; - // case IM_RAY_TRACING: - // { - // // TODO: do ray tracing stuff - // } - // break; - // case IM_PROCEDURAL: - // { - return proceduralDeferredEvalAndPdf(pdf, light, ray, event); - // } - // break; - // default: - // return (spectral_type)0.0; - // } - // return (spectral_type)0.0; - } - - static sample_type procedural_generate_and_quotient_and_pdf(NBL_REF_ARG(quotient_pdf_type) quotient_pdf, NBL_REF_ARG(scalar_type) newRayMaxT, NBL_CONST_REF_ARG(light_type) light, NBL_CONST_REF_ARG(vector3_type) origin, NBL_CONST_REF_ARG(interaction_type) interaction, bool isBSDF, NBL_CONST_REF_ARG(vector3_type) xi, uint32_t depth, NBL_CONST_REF_ARG(Event) event) - { - const uint32_t lightCount = event.data[0]; - const ProceduralShapeType type = (ProceduralShapeType)event.data[1]; + static sample_type generate_and_quotient_and_pdf(NBL_REF_ARG(quotient_pdf_type) quotient_pdf, NBL_REF_ARG(scalar_type) newRayMaxT, NBL_CONST_REF_ARG(scene_type) scene, uint32_t lightID, NBL_CONST_REF_ARG(vector3_type) origin, NBL_CONST_REF_ARG(interaction_type) interaction, bool isBSDF, NBL_CONST_REF_ARG(vector3_type) xi, uint32_t depth) + { + sample_type L; + scalar_type pdf; + + const light_type light = scene.lights[lightID]; + const Shape sphere = scene.spheres[light.objectID.id]; + const ShapeSampling sampling = ShapeSampling::create(sphere); + + const vector3_type sampleL = sampling.template generate_and_pdf(pdf, newRayMaxT, origin, interaction, isBSDF, xi); + const vector3_type V = interaction.isotropic.V.getDirection(); + const scalar_type VdotL = nbl::hlsl::dot(V, sampleL); + 
ray_dir_info_type rayL; + rayL.direction = sampleL; + L = sample_type::create(rayL,VdotL,interaction.T,interaction.B,interaction.isotropic.N); + + newRayMaxT *= Tolerance::getEnd(depth); + pdf *= 1.0 / scalar_type(scene.lightCount); + spectral_type quo = light.radiance / pdf; + quotient_pdf = quotient_pdf_type::create(quo, pdf); + + return L; + } +}; + +template +struct Estimator +{ + using scalar_type = typename Ray::scalar_type; + using vector3_type = vector; + using ray_type = Ray; + using scene_type = Scene; + using light_type = typename Scene::light_type; + using spectral_type = typename light_type::spectral_type; + using interaction_type = Aniso; + using quotient_pdf_type = bxdf::quotient_and_pdf; + using sample_type = LightSample; + using ray_dir_info_type = typename sample_type::ray_dir_info_type; + + static spectral_type deferredEvalAndPdf(NBL_REF_ARG(scalar_type) pdf, NBL_CONST_REF_ARG(scene_type) scene, uint32_t lightID, NBL_CONST_REF_ARG(ray_type) ray) + { + pdf = 1.0 / scene.lightCount; + const light_type light = scene.lights[lightID]; + const Shape tri = scene.triangles[light.objectID.id]; + const ShapeSampling sampling = ShapeSampling::create(tri); + pdf *= sampling.template deferredPdf(ray); + + return light.radiance; + } + static sample_type generate_and_quotient_and_pdf(NBL_REF_ARG(quotient_pdf_type) quotient_pdf, NBL_REF_ARG(scalar_type) newRayMaxT, NBL_CONST_REF_ARG(scene_type) scene, uint32_t lightID, NBL_CONST_REF_ARG(vector3_type) origin, NBL_CONST_REF_ARG(interaction_type) interaction, bool isBSDF, NBL_CONST_REF_ARG(vector3_type) xi, uint32_t depth) + { sample_type L; scalar_type pdf; - switch (type) - { - case PST_SPHERE: - { - const vector3_type position = vector3_type( - bit_cast(event.data[2]), - bit_cast(event.data[3]), - bit_cast(event.data[4])); - Shape sphere = Shape::create(position, bit_cast(event.data[5]), event.data[6]); - - const vector3_type sampleL = sphere.template generate_and_pdf(pdf, newRayMaxT, origin, interaction, 
isBSDF, xi); - const vector3_type V = interaction.isotropic.V.getDirection(); - const scalar_type VdotL = nbl::hlsl::dot(V, sampleL); - ray_dir_info_type rayL; - rayL.direction = sampleL; - L = sample_type::create(rayL,VdotL,interaction.T,interaction.B,interaction.isotropic.N); - } - break; - case PST_TRIANGLE: - { - const vector3_type vertex0 = vector3_type( - bit_cast(event.data[2]), - bit_cast(event.data[3]), - bit_cast(event.data[4])); - const vector3_type vertex1 = vector3_type( - bit_cast(event.data[5]), - bit_cast(event.data[6]), - bit_cast(event.data[7])); - const vector3_type vertex2 = vector3_type( - bit_cast(event.data[8]), - bit_cast(event.data[9]), - bit_cast(event.data[10])); - Shape tri = Shape::create(vertex0, vertex1, vertex2, event.data[11]); - - const vector3_type sampleL = tri.template generate_and_pdf(pdf, newRayMaxT, origin, interaction, isBSDF, xi); - const vector3_type V = interaction.isotropic.V.getDirection(); - const scalar_type VdotL = nbl::hlsl::dot(V, sampleL); - ray_dir_info_type rayL; - rayL.direction = sampleL; - L = sample_type::create(rayL,VdotL,interaction.T,interaction.B,interaction.isotropic.N); - } - break; - case PST_RECTANGLE: - { - const vector3_type offset = vector3_type( - bit_cast(event.data[2]), - bit_cast(event.data[3]), - bit_cast(event.data[4])); - const vector3_type edge0 = vector3_type( - bit_cast(event.data[5]), - bit_cast(event.data[6]), - bit_cast(event.data[7])); - const vector3_type edge1 = vector3_type( - bit_cast(event.data[8]), - bit_cast(event.data[9]), - bit_cast(event.data[10])); - Shape rect = Shape::create(offset, edge0, edge1, event.data[11]); - - const vector3_type sampleL = rect.template generate_and_pdf(pdf, newRayMaxT, origin, interaction, isBSDF, xi); - const vector3_type V = interaction.isotropic.V.getDirection(); - const scalar_type VdotL = nbl::hlsl::dot(V, sampleL); - ray_dir_info_type rayL; - rayL.direction = sampleL; - L = 
sample_type::create(rayL,VdotL,interaction.T,interaction.B,interaction.isotropic.N); - } - break; - default: - pdf = bit_cast(numeric_limits::infinity); - break; - } + + const light_type light = scene.lights[lightID]; + const Shape tri = scene.triangles[light.objectID.id]; + const ShapeSampling sampling = ShapeSampling::create(tri); + + const vector3_type sampleL = sampling.template generate_and_pdf(pdf, newRayMaxT, origin, interaction, isBSDF, xi); + const vector3_type V = interaction.isotropic.V.getDirection(); + const scalar_type VdotL = nbl::hlsl::dot(V, sampleL); + ray_dir_info_type rayL; + rayL.direction = sampleL; + L = sample_type::create(rayL,VdotL,interaction.T,interaction.B,interaction.isotropic.N); newRayMaxT *= Tolerance::getEnd(depth); - pdf *= 1.0 / scalar_type(lightCount); + pdf *= 1.0 / scalar_type(scene.lightCount); spectral_type quo = light.radiance / pdf; quotient_pdf = quotient_pdf_type::create(quo, pdf); return L; } +}; + +template +struct Estimator +{ + using scalar_type = typename Ray::scalar_type; + using vector3_type = vector; + using ray_type = Ray; + using scene_type = Scene; + using light_type = typename Scene::light_type; + using spectral_type = typename light_type::spectral_type; + using interaction_type = Aniso; + using quotient_pdf_type = bxdf::quotient_and_pdf; + using sample_type = LightSample; + using ray_dir_info_type = typename sample_type::ray_dir_info_type; + + static spectral_type deferredEvalAndPdf(NBL_REF_ARG(scalar_type) pdf, NBL_CONST_REF_ARG(scene_type) scene, uint32_t lightID, NBL_CONST_REF_ARG(ray_type) ray) + { + pdf = 1.0 / scene.lightCount; + const light_type light = scene.lights[lightID]; + const Shape rect = scene.rectangles[light.objectID.id]; + const ShapeSampling sampling = ShapeSampling::create(rect); + pdf *= sampling.template deferredPdf(ray); + + return light.radiance; + } - static sample_type generate_and_quotient_and_pdf(NBL_REF_ARG(quotient_pdf_type) quotient_pdf, NBL_REF_ARG(scalar_type) newRayMaxT, 
NBL_CONST_REF_ARG(light_type) light, NBL_CONST_REF_ARG(vector3_type) origin, NBL_CONST_REF_ARG(interaction_type) interaction, bool isBSDF, NBL_CONST_REF_ARG(vector3_type) xi, uint32_t depth, NBL_CONST_REF_ARG(Event) event) + static sample_type generate_and_quotient_and_pdf(NBL_REF_ARG(quotient_pdf_type) quotient_pdf, NBL_REF_ARG(scalar_type) newRayMaxT, NBL_CONST_REF_ARG(scene_type) scene, uint32_t lightID, NBL_CONST_REF_ARG(vector3_type) origin, NBL_CONST_REF_ARG(interaction_type) interaction, bool isBSDF, NBL_CONST_REF_ARG(vector3_type) xi, uint32_t depth) { - const IntersectMode mode = (IntersectMode)event.mode; sample_type L; - // switch (mode) - // { - // case IM_RAY_QUERY: - // { - // // TODO: do ray query stuff - // } - // break; - // case IM_RAY_TRACING: - // { - // // TODO: do ray tracing stuff - // } - // break; - // case IM_PROCEDURAL: - // { - return procedural_generate_and_quotient_and_pdf(quotient_pdf, newRayMaxT, light, origin, interaction, isBSDF, xi, depth, event); - // } - // break; - // default: - // { - // return L; - // } - // } - // return L; + scalar_type pdf; + + const light_type light = scene.lights[lightID]; + const Shape rect = scene.rectangles[light.objectID.id]; + const ShapeSampling sampling = ShapeSampling::create(rect); + + const vector3_type sampleL = sampling.template generate_and_pdf(pdf, newRayMaxT, origin, interaction, isBSDF, xi); + const vector3_type V = interaction.isotropic.V.getDirection(); + const scalar_type VdotL = nbl::hlsl::dot(V, sampleL); + ray_dir_info_type rayL; + rayL.direction = sampleL; + L = sample_type::create(rayL,VdotL,interaction.T,interaction.B,interaction.isotropic.N); + + newRayMaxT *= Tolerance::getEnd(depth); + pdf *= 1.0 / scalar_type(scene.lightCount); + spectral_type quo = light.radiance / pdf; + quotient_pdf = quotient_pdf_type::create(quo, pdf); + + return L; } }; diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl index 
553094e21..3082e599e 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl @@ -71,14 +71,6 @@ struct Unidirectional using conductor_op_type = typename MaterialSystem::conductor_op_type; using dielectric_op_type = typename MaterialSystem::dielectric_op_type; - // static this_t create(RandGen randGen, - // RayGen rayGen, - // Intersector intersector, - // MaterialSystem materialSystem, - // /* PathGuider pathGuider, */ - // NextEventEstimator nee) - // {} - static this_t create(NBL_CONST_REF_ARG(PathTracerCreationParams) params) { this_t retval; @@ -139,7 +131,7 @@ struct Unidirectional if (lightID != light_type::INVALID_ID) { float _pdf; - ray.payload.accumulation += nee.deferredEvalAndPdf(_pdf, scene.lights[lightID], ray, scene.toNextEvent(lightID)) * throughput / (1.0 + _pdf * _pdf * ray.payload.otherTechniqueHeuristic); + ray.payload.accumulation += nee.deferredEvalAndPdf(_pdf, scene, lightID, ray) * throughput / (1.0 + _pdf * _pdf * ray.payload.otherTechniqueHeuristic); } const uint32_t bsdfID = glsl::bitfieldExtract(bsdfLightIDs, 0, 16); @@ -174,8 +166,8 @@ struct Unidirectional scalar_type t; sample_type nee_sample = nee.generate_and_quotient_and_pdf( neeContrib_pdf, t, - scene.lights[randLightID], intersection, interaction, - isBSDF, eps0, depth, scene.toNextEvent(randLightID) + scene, randLightID, intersection, interaction, + isBSDF, eps0, depth ); // We don't allow non watertight transmitters in this renderer @@ -198,36 +190,6 @@ struct Unidirectional // example only uses isotropic bxdfs params_type params = params_type::template create(nee_sample, interaction.isotropic, _cache.iso_cache, _clamp); - // TODO: does not yet account for smooth dielectric - // if (!isBSDF && bxdf.materialType == ext::MaterialSystem::Material::DIFFUSE) - // { - // params = params_type::template create(nee_sample, interaction.isotropic, bxdf::BCM_MAX); - // } - // else if (!isBSDF && bxdf.materialType != 
ext::MaterialSystem::Material::DIFFUSE) - // { - // if (bxdf.params.is_aniso) - // params = params_type::template create(nee_sample, interaction, _cache, bxdf::BCM_MAX); - // else - // { - // isocache_type isocache = _cache.iso_cache; - // params = params_type::template create(nee_sample, interaction.isotropic, _cache.iso_cache, bxdf::BCM_MAX); - // } - // } - // else if (isBSDF && bxdf.materialType == ext::MaterialSystem::Material::DIFFUSE) - // { - // params = params_type::template create(nee_sample, interaction.isotropic, bxdf::BCM_ABS); - // } - // else if (isBSDF && bxdf.materialType != ext::MaterialSystem::Material::DIFFUSE) - // { - // if (bxdf.params.is_aniso) - // params = params_type::template create(nee_sample, interaction, _cache, bxdf::BCM_ABS); - // else - // { - // isocache_type isocache = _cache.iso_cache; - // params = params_type::template create(nee_sample, interaction.isotropic, _cache.iso_cache, bxdf::BCM_ABS); - // } - // } - quotient_pdf_type bsdf_quotient_pdf = materialSystem.quotient_and_pdf(bxdf.materialType, bxdf.params, params); neeContrib_pdf.quotient *= bxdf.albedo * throughput * bsdf_quotient_pdf.quotient; const scalar_type otherGenOverChoice = bsdf_quotient_pdf.pdf * rcpChoiceProb; @@ -261,37 +223,6 @@ struct Unidirectional // example only uses isotropic bxdfs params_type params = params_type::template create(bsdf_sample, interaction.isotropic, _cache.iso_cache, _clamp); - // TODO: does not yet account for smooth dielectric - // params_type params; - // if (!isBSDF && bxdf.materialType == ext::MaterialSystem::Material::DIFFUSE) - // { - // params = params_type::template create(bsdf_sample, iso_interaction, bxdf::BCM_MAX); - // } - // else if (!isBSDF && bxdf.materialType != ext::MaterialSystem::Material::DIFFUSE) - // { - // if (bxdf.params.is_aniso) - // params = params_type::template create(bsdf_sample, interaction, _cache, bxdf::BCM_MAX); - // else - // { - // isocache_type isocache = _cache.iso_cache; - // params = 
params_type::template create(bsdf_sample, iso_interaction, isocache, bxdf::BCM_MAX); - // } - // } - // else if (isBSDF && bxdf.materialType == ext::MaterialSystem::Material::DIFFUSE) - // { - // params = params_type::template create(bsdf_sample, iso_interaction, bxdf::BCM_ABS); - // } - // else if (isBSDF && bxdf.materialType != ext::MaterialSystem::Material::DIFFUSE) - // { - // if (bxdf.params.is_aniso) - // params = params_type::template create(bsdf_sample, interaction, _cache, bxdf::BCM_ABS); - // else - // { - // isocache_type isocache = _cache.iso_cache; - // params = params_type::template create(bsdf_sample, iso_interaction, isocache, bxdf::BCM_ABS); - // } - // } - // the value of the bsdf divided by the probability of the sample being generated quotient_pdf_type bsdf_quotient_pdf = materialSystem.quotient_and_pdf(bxdf.materialType, bxdf.params, params); throughput *= bxdf.albedo * bsdf_quotient_pdf.quotient; diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl index b54f5721d..5e8102f6f 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl @@ -13,24 +13,18 @@ #ifdef SPHERE_LIGHT #define SPHERE_COUNT 9 -#define LIGHT_TYPE ext::PST_SPHERE - #define TRIANGLE_COUNT 0 #define RECTANGLE_COUNT 0 #endif #ifdef TRIANGLE_LIGHT #define TRIANGLE_COUNT 1 -#define LIGHT_TYPE ext::PST_TRIANGLE - #define SPHERE_COUNT 8 #define RECTANGLE_COUNT 0 #endif #ifdef RECTANGLE_LIGHT #define RECTANGLE_COUNT 1 -#define LIGHT_TYPE ext::PST_RECTANGLE - #define SPHERE_COUNT 8 #define TRIANGLE_COUNT 0 #endif @@ -47,6 +41,18 @@ NBL_CONSTEXPR uint32_t WorkgroupSize = 256; NBL_CONSTEXPR uint32_t MAX_DEPTH_LOG2 = 4; NBL_CONSTEXPR uint32_t MAX_SAMPLES_LOG2 = 10; +#ifdef SPHERE_LIGHT +NBL_CONSTEXPR ext::ProceduralShapeType LIGHT_TYPE = ext::PST_SPHERE; +#endif +#ifdef TRIANGLE_LIGHT +NBL_CONSTEXPR ext::ProceduralShapeType LIGHT_TYPE = 
ext::PST_TRIANGLE; +#endif +#ifdef RECTANGLE_LIGHT +NBL_CONSTEXPR ext::ProceduralShapeType LIGHT_TYPE = ext::PST_RECTANGLE; +#endif + +NBL_CONSTEXPR ext::PTPolygonMethod POLYGON_METHOD = ext::PPM_SOLID_ANGLE; + int32_t2 getCoordinates() { uint32_t width, height; @@ -80,11 +86,12 @@ using dielectric_bxdf_type = bxdf::transmission::SGGXDielectricBxDF; using light_type = ext::Light; using bxdfnode_type = ext::BxDFNode; +using scene_type = ext::Scene; using randgen_type = ext::RandGen::Uniform3D; using raygen_type = ext::RayGen::Basic; using intersector_type = ext::Intersector::Comprehensive; using material_system_type = ext::MaterialSystem::System; -using nee_type = ext::NextEventEstimator::Estimator; +using nee_type = ext::NextEventEstimator::Estimator; using pathtracer_type = ext::PathTracer::Unidirectional; static const ext::Shape spheres[SPHERE_COUNT] = { @@ -164,7 +171,6 @@ void main(uint32_t3 threadID : SV_DispatchThreadID) } int flatIdx = glsl::gl_GlobalInvocationID().y * glsl::gl_NumWorkGroups().x * WorkgroupSize + glsl::gl_GlobalInvocationID().x; - PCG32x2 pcg = PCG32x2::construct(flatIdx); // replaces scramblebuf? // set up path tracer ext::PathTracer::PathTracerCreationParams ptCreateParams; diff --git a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl index 887d20c48..40fb01057 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/scene.hlsl @@ -87,145 +87,6 @@ struct Scene #undef SCENE_TRIANGLE_COUNT #undef SCENE_RECTANGLE_COUNT - // obsolete? - // Intersector::IntersectData toIntersectData(uint32_t mode, ProceduralShapeType type) - // { - // Intersector::IntersectData retval; - // retval.mode = mode; - - // uint32_t objCount = (type == PST_SPHERE) ? sphereCount : - // (type == PST_TRIANGLE) ? triangleCount : - // (type == PST_RECTANGLE) ? 
rectangleCount : - // -1; - // retval.data[0] = objCount; - // retval.data[1] = type; - - // switch (type) - // { - // case PST_SPHERE: - // { - // for (int i = 0; i < objCount; i++) - // { - // Shape sphere = spheres[i]; - // uint32_t3 uintPos = bit_cast(sphere.position); - // retval.data[2 + i * Shape::ObjSize] = uintPos.x; - // retval.data[2 + i * Shape::ObjSize + 1] = uintPos.y; - // retval.data[2 + i * Shape::ObjSize + 2] = uintPos.z; - // retval.data[2 + i * Shape::ObjSize + 3] = bit_cast(sphere.radius2); - // retval.data[2 + i * Shape::ObjSize + 4] = sphere.bsdfLightIDs; - // } - // } - // break; - // case PST_TRIANGLE: - // { - // for (int i = 0; i < objCount; i++) - // { - // Shape tri = triangles[i]; - // retval.data[2 + i * Shape::ObjSize] = asuint(tri.vertex0.x); - // retval.data[2 + i * Shape::ObjSize + 1] = asuint(tri.vertex0.y); - // retval.data[2 + i * Shape::ObjSize + 2] = asuint(tri.vertex0.z); - // retval.data[2 + i * Shape::ObjSize + 3] = asuint(tri.vertex1.x); - // retval.data[2 + i * Shape::ObjSize + 4] = asuint(tri.vertex1.y); - // retval.data[2 + i * Shape::ObjSize + 5] = asuint(tri.vertex1.z); - // retval.data[2 + i * Shape::ObjSize + 6] = asuint(tri.vertex2.x); - // retval.data[2 + i * Shape::ObjSize + 7] = asuint(tri.vertex2.y); - // retval.data[2 + i * Shape::ObjSize + 8] = asuint(tri.vertex2.z); - // retval.data[2 + i * Shape::ObjSize + 9] = tri.bsdfLightIDs; - // } - // } - // break; - // case PST_RECTANGLE: - // { - // for (int i = 0; i < objCount; i++) - // { - // Shape rect = rectangles[i]; - // retval.data[2 + i * Shape::ObjSize] = asuint(rect.offset.x); - // retval.data[2 + i * Shape::ObjSize + 1] = asuint(rect.offset.y); - // retval.data[2 + i * Shape::ObjSize + 2] = asuint(rect.offset.z); - // retval.data[2 + i * Shape::ObjSize + 3] = asuint(rect.edge0.x); - // retval.data[2 + i * Shape::ObjSize + 4] = asuint(rect.edge0.y); - // retval.data[2 + i * Shape::ObjSize + 5] = asuint(rect.edge0.z); - // retval.data[2 + i * 
Shape::ObjSize + 6] = asuint(rect.edge1.x); - // retval.data[2 + i * Shape::ObjSize + 7] = asuint(rect.edge1.y); - // retval.data[2 + i * Shape::ObjSize + 8] = asuint(rect.edge1.z); - // retval.data[2 + i * Shape::ObjSize + 9] = rect.bsdfLightIDs; - // } - // } - // break; - // default: - // // for ASes - // break; - // } - // return retval; - // } - - NextEventEstimator::Event toNextEvent(uint32_t lightID) - { - NextEventEstimator::Event retval; - - ObjectID objectID = lights[lightID].objectID; - retval.mode = objectID.mode; - - retval.data[0] = lightCount; - retval.data[1] = objectID.shapeType; - - uint32_t id = objectID.id; - switch (objectID.shapeType) - { - case PST_SPHERE: - { - Shape sphere = spheres[id]; - uint32_t3 position = bit_cast(sphere.position); - retval.data[2] = position.x; - retval.data[3] = position.y; - retval.data[4] = position.z; - retval.data[5] = bit_cast(sphere.radius2); - retval.data[6] = sphere.bsdfLightIDs; - } - break; - case PST_TRIANGLE: - { - Shape tri = triangles[id]; - uint32_t3 vertex = bit_cast(tri.vertex0); - retval.data[2] = vertex.x; - retval.data[3] = vertex.y; - retval.data[4] = vertex.z; - vertex = bit_cast(tri.vertex1); - retval.data[5] = vertex.x; - retval.data[6] = vertex.y; - retval.data[7] = vertex.z; - vertex = bit_cast(tri.vertex2); - retval.data[8] = vertex.x; - retval.data[9] = vertex.y; - retval.data[10] = vertex.z; - retval.data[11] = tri.bsdfLightIDs; - } - break; - case PST_RECTANGLE: - { - Shape rect = rectangles[id]; - uint32_t3 tmp = bit_cast(rect.offset); - retval.data[2] = tmp.x; - retval.data[3] = tmp.y; - retval.data[4] = tmp.z; - tmp = bit_cast(rect.edge0); - retval.data[5] = tmp.x; - retval.data[6] = tmp.y; - retval.data[7] = tmp.z; - tmp = bit_cast(rect.edge1); - retval.data[8] = tmp.x; - retval.data[9] = tmp.y; - retval.data[10] = tmp.z; - retval.data[11] = rect.bsdfLightIDs; - } - break; - default: - // for ASes - break; - } - return retval; - } - // TODO: get these to work with AS types as well 
uint32_t getBsdfLightIDs(NBL_CONST_REF_ARG(ObjectID) objectID) { diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp index 10889f37f..ae9f162a4 100644 --- a/31_HLSLPathTracer/main.cpp +++ b/31_HLSLPathTracer/main.cpp @@ -48,7 +48,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, constexpr static inline uint32_t2 WindowDimensions = { 1280, 720 }; constexpr static inline uint32_t MaxFramesInFlight = 5; constexpr static inline clock_t::duration DisplayImageDuration = std::chrono::milliseconds(900); - constexpr static inline uint32_t DefaultWorkGroupSize = 16u; + constexpr static inline uint32_t DefaultWorkGroupSize = 256u; constexpr static inline uint32_t MaxDescriptorCount = 256u; constexpr static inline uint32_t MaxDepthLog2 = 4u; // 5 constexpr static inline uint32_t MaxSamplesLog2 = 10u; // 18 @@ -1068,10 +1068,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &m_descriptorSet0.get()); cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 2u, 1u, &m_descriptorSet2.get()); cmdbuf->pushConstants(pipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(PTPushConstant), &pc); - if (renderMode == E_RENDER_MODE::ERM_HLSL) - cmdbuf->dispatch(1 + (WindowDimensions.x * WindowDimensions.y - 1) / 256u, 1u, 1u); - else - cmdbuf->dispatch(1 + (WindowDimensions.x - 1) / DefaultWorkGroupSize, 1 + (WindowDimensions.y - 1) / DefaultWorkGroupSize, 1u); + cmdbuf->dispatch(1 + (WindowDimensions.x * WindowDimensions.y - 1) / DefaultWorkGroupSize, 1u, 1u); } // TRANSITION m_outImgView to READ (because of descriptorSets0 -> ComputeShader Writes into the image) @@ -1351,7 +1348,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, float camYAngle = 165.f / 180.f * 3.14159f; float camXAngle = 32.f / 180.f * 3.14159f; int PTPipline = E_LIGHT_GEOMETRY::ELG_SPHERE; - int renderMode 
= E_RENDER_MODE::ERM_HLSL; + int renderMode = E_RENDER_MODE::ERM_GLSL; int spp = 32; int depth = 3; From b889b60e4db77dbc435ea6c5baefbcba0089e01c Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 19 Mar 2025 15:24:14 +0700 Subject: [PATCH 088/296] use 1D workgroup dispatch --- 31_HLSLPathTracer/app_resources/glsl/common.glsl | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/31_HLSLPathTracer/app_resources/glsl/common.glsl b/31_HLSLPathTracer/app_resources/glsl/common.glsl index 2463f82cf..b09c90824 100644 --- a/31_HLSLPathTracer/app_resources/glsl/common.glsl +++ b/31_HLSLPathTracer/app_resources/glsl/common.glsl @@ -16,12 +16,13 @@ layout(set = 2, binding = 2) uniform usampler2D scramblebuf; layout(set=0, binding=0, rgba16f) uniform image2D outImage; #ifndef _NBL_GLSL_WORKGROUP_SIZE_ -#define _NBL_GLSL_WORKGROUP_SIZE_ 32 -layout(local_size_x=_NBL_GLSL_WORKGROUP_SIZE_, local_size_y=_NBL_GLSL_WORKGROUP_SIZE_, local_size_z=1) in; +#define _NBL_GLSL_WORKGROUP_SIZE_ 256 +layout(local_size_x=_NBL_GLSL_WORKGROUP_SIZE_, local_size_y=1, local_size_z=1) in; #endif ivec2 getCoordinates() { - return ivec2(gl_GlobalInvocationID.xy); + ivec2 imageSize = imageSize(outImage); + return ivec2(gl_GlobalInvocationID.x % imageSize.x, gl_GlobalInvocationID.x / imageSize.x); } vec2 getTexCoords() { From 79ee9da780900a7977d630ef156128f7287d2222 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 19 Mar 2025 15:24:57 +0700 Subject: [PATCH 089/296] removed obsolete commented sections --- .../app_resources/hlsl/intersector.hlsl | 118 ------------------ 1 file changed, 118 deletions(-) diff --git a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl index 03a45f866..e59fdc2c3 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/intersector.hlsl @@ -78,124 +78,6 @@ struct Comprehensive return objectID; } - - // note for future consideration: still need 
to encode to IntersectData? - // obsolete? - // static ObjectID traceProcedural(NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(IntersectData) intersect) - // { - // const bool anyHit = ray.intersectionT != numeric_limits::max; - // const uint32_t objCount = intersect.data[0]; - // const ProceduralShapeType type = (ProceduralShapeType)intersect.data[1]; - - // ObjectID objectID = ray.objectID; - // objectID.mode = IM_PROCEDURAL; - // objectID.shapeType = type; - // for (int i = 0; i < objCount; i++) - // { - // float t; - // switch (type) - // { - // case PST_SPHERE: - // { - // vector3_type position = vector3_type(asfloat(intersect.data[2 + i * Shape::ObjSize]), asfloat(intersect.data[2 + i * Shape::ObjSize + 1]), asfloat(intersect.data[2 + i * Shape::ObjSize + 2])); - // Shape sphere = Shape::create(position, asfloat(intersect.data[2 + i * Shape::ObjSize + 3]), intersect.data[2 + i * Shape::ObjSize + 4]); - // t = sphere.intersect(ray.origin, ray.direction); - // } - // break; - // case PST_TRIANGLE: - // { - // vector3_type vertex0 = vector3_type(asfloat(intersect.data[2 + i * Shape::ObjSize]), asfloat(intersect.data[2 + i * Shape::ObjSize + 1]), asfloat(intersect.data[2 + i * Shape::ObjSize + 2])); - // vector3_type vertex1 = vector3_type(asfloat(intersect.data[2 + i * Shape::ObjSize + 3]), asfloat(intersect.data[2 + i * Shape::ObjSize + 4]), asfloat(intersect.data[2 + i * Shape::ObjSize + 5])); - // vector3_type vertex2 = vector3_type(asfloat(intersect.data[2 + i * Shape::ObjSize + 6]), asfloat(intersect.data[2 + i * Shape::ObjSize + 7]), asfloat(intersect.data[2 + i * Shape::ObjSize + 8])); - // Shape tri = Shape::create(vertex0, vertex1, vertex2, intersect.data[2 + i * Shape::ObjSize + 9]); - // t = tri.intersect(ray.origin, ray.direction); - // } - // break; - // case PST_RECTANGLE: - // { - // vector3_type offset = vector3_type(asfloat(intersect.data[2 + i * Shape::ObjSize]), asfloat(intersect.data[2 + i * Shape::ObjSize + 1]), asfloat(intersect.data[2 + i * 
Shape::ObjSize + 2])); - // vector3_type edge0 = vector3_type(asfloat(intersect.data[2 + i * Shape::ObjSize + 3]), asfloat(intersect.data[2 + i * Shape::ObjSize + 4]), asfloat(intersect.data[2 + i * Shape::ObjSize + 5])); - // vector3_type edge1 = vector3_type(asfloat(intersect.data[2 + i * Shape::ObjSize + 6]), asfloat(intersect.data[2 + i * Shape::ObjSize + 7]), asfloat(intersect.data[2 + i * Shape::ObjSize + 8])); - // Shape rect = Shape::create(offset, edge0, edge1, intersect.data[2 + i * Shape::ObjSize + 9]); - // t = rect.intersect(ray.origin, ray.direction); - // } - // break; - // default: - // t = numeric_limits::infinity; - // break; - // } - - // bool closerIntersection = t > 0.0 && t < ray.intersectionT; - - // ray.intersectionT = closerIntersection ? t : ray.intersectionT; - // objectID.id = closerIntersection ? i : objectID.id; - - // // allowing early out results in a performance regression, WTF!? - // //if (anyHit && closerIntersection) - // //break; - // } - // return objectID; - // } - - // obsolete? 
- // static ObjectID traceRay(NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(IntersectData) intersect) - // { - // const uint32_t mode = intersect.mode; - // switch (mode) - // { - // case IM_RAY_QUERY: - // { - // // TODO: do ray query stuff - // } - // break; - // case IM_RAY_TRACING: - // { - // // TODO: do ray tracing stuff - // } - // break; - // case IM_PROCEDURAL: - // { - // return traceProcedural(ray, intersect); - // } - // break; - // default: - // { - // return ObjectID::create(-1, 0, PST_SPHERE); - // } - // } - // return ObjectID::create(-1, 0, PST_SPHERE); - // } - - // static ObjectID traceRay(NBL_REF_ARG(ray_type) ray, NBL_CONST_REF_ARG(scene_type) scene) - // { - // IntersectData data; - - // ObjectID objectID; - // objectID.id = -1; // start with no intersect - - // // prodedural shapes - // if (scene.sphereCount > 0) - // { - // data = scene.toIntersectData(ext::Intersector::IntersectData::Mode::PROCEDURAL, PST_SPHERE); - // objectID = traceRay(ray, data); - // } - - // if (scene.triangleCount > 0) - // { - // data = scene.toIntersectData(ext::Intersector::IntersectData::Mode::PROCEDURAL, PST_TRIANGLE); - // objectID = traceRay(ray, data); - // } - - // if (scene.rectangleCount > 0) - // { - // data = scene.toIntersectData(ext::Intersector::IntersectData::Mode::PROCEDURAL, PST_RECTANGLE); - // objectID = traceRay(ray, data); - // } - - // // TODO: trace AS - - // return objectID; - // } }; } From ca8f2ec8fa84a2bd1bfeb4348263f82d14026bca Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 19 Mar 2025 16:01:15 +0700 Subject: [PATCH 090/296] some minor corrections --- .../app_resources/hlsl/common.hlsl | 6 +--- .../hlsl/next_event_estimator.hlsl | 28 +++++++++++-------- .../app_resources/hlsl/pathtracer.hlsl | 9 +++--- 31_HLSLPathTracer/main.cpp | 2 +- 4 files changed, 23 insertions(+), 22 deletions(-) diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl index dea682c8b..2e2561345 100644 
--- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl @@ -68,7 +68,7 @@ struct Ray vector3_type origin; vector3_type direction; - // TODO: polygon method == 2 stuff + // polygon method == PPM_APPROX_PROJECTED_SOLID_ANGLE vector3_type normalAtOrigin; bool wasBSDFAtOrigin; @@ -246,7 +246,6 @@ struct Shape retval.vertex1 = vertex1; retval.vertex2 = vertex2; retval.bsdfLightIDs = bsdfLightIDs; - retval.polygonMethod = PPM_SOLID_ANGLE; return retval; } @@ -288,7 +287,6 @@ struct Shape float32_t3 vertex1; float32_t3 vertex2; uint32_t bsdfLightIDs; - PTPolygonMethod polygonMethod; }; template<> @@ -301,7 +299,6 @@ struct Shape retval.edge0 = edge0; retval.edge1 = edge1; retval.bsdfLightIDs = bsdfLightIDs; - retval.polygonMethod = PPM_SOLID_ANGLE; return retval; } @@ -348,7 +345,6 @@ struct Shape float32_t3 edge0; float32_t3 edge1; uint32_t bsdfLightIDs; - PTPolygonMethod polygonMethod; }; } diff --git a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl index 7c157aadf..51c018ac5 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl @@ -298,6 +298,10 @@ struct Estimator using sample_type = LightSample; using ray_dir_info_type = typename sample_type::ray_dir_info_type; + // affected by https://github.com/microsoft/DirectXShaderCompiler/issues/7007 + // NBL_CONSTEXPR_STATIC_INLINE PTPolygonMethod PolygonMethod = PPM; + enum : uint16_t { PolygonMethod = PPM }; + static spectral_type deferredEvalAndPdf(NBL_REF_ARG(scalar_type) pdf, NBL_CONST_REF_ARG(scene_type) scene, uint32_t lightID, NBL_CONST_REF_ARG(ray_type) ray) { pdf = 1.0 / scene.lightCount; @@ -311,19 +315,17 @@ struct Estimator static sample_type generate_and_quotient_and_pdf(NBL_REF_ARG(quotient_pdf_type) quotient_pdf, NBL_REF_ARG(scalar_type) newRayMaxT, NBL_CONST_REF_ARG(scene_type) scene, 
uint32_t lightID, NBL_CONST_REF_ARG(vector3_type) origin, NBL_CONST_REF_ARG(interaction_type) interaction, bool isBSDF, NBL_CONST_REF_ARG(vector3_type) xi, uint32_t depth) { - sample_type L; - scalar_type pdf; - const light_type light = scene.lights[lightID]; const Shape sphere = scene.spheres[light.objectID.id]; const ShapeSampling sampling = ShapeSampling::create(sphere); + scalar_type pdf; const vector3_type sampleL = sampling.template generate_and_pdf(pdf, newRayMaxT, origin, interaction, isBSDF, xi); const vector3_type V = interaction.isotropic.V.getDirection(); const scalar_type VdotL = nbl::hlsl::dot(V, sampleL); ray_dir_info_type rayL; rayL.direction = sampleL; - L = sample_type::create(rayL,VdotL,interaction.T,interaction.B,interaction.isotropic.N); + sample_type L = sample_type::create(rayL,VdotL,interaction.T,interaction.B,interaction.isotropic.N); newRayMaxT *= Tolerance::getEnd(depth); pdf *= 1.0 / scalar_type(scene.lightCount); @@ -348,6 +350,9 @@ struct Estimator tri = scene.triangles[light.objectID.id]; const ShapeSampling sampling = ShapeSampling::create(tri); + scalar_type pdf; const vector3_type sampleL = sampling.template generate_and_pdf(pdf, newRayMaxT, origin, interaction, isBSDF, xi); const vector3_type V = interaction.isotropic.V.getDirection(); const scalar_type VdotL = nbl::hlsl::dot(V, sampleL); ray_dir_info_type rayL; rayL.direction = sampleL; - L = sample_type::create(rayL,VdotL,interaction.T,interaction.B,interaction.isotropic.N); + sample_type L = sample_type::create(rayL,VdotL,interaction.T,interaction.B,interaction.isotropic.N); newRayMaxT *= Tolerance::getEnd(depth); pdf *= 1.0 / scalar_type(scene.lightCount); @@ -398,6 +401,9 @@ struct Estimator rect = scene.rectangles[light.objectID.id]; const ShapeSampling sampling = ShapeSampling::create(rect); + scalar_type pdf; const vector3_type sampleL = sampling.template generate_and_pdf(pdf, newRayMaxT, origin, interaction, isBSDF, xi); const vector3_type V = 
interaction.isotropic.V.getDirection(); const scalar_type VdotL = nbl::hlsl::dot(V, sampleL); ray_dir_info_type rayL; rayL.direction = sampleL; - L = sample_type::create(rayL,VdotL,interaction.T,interaction.B,interaction.isotropic.N); + sample_type L = sample_type::create(rayL,VdotL,interaction.T,interaction.B,interaction.isotropic.N); newRayMaxT *= Tolerance::getEnd(depth); pdf *= 1.0 / scalar_type(scene.lightCount); diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl index 3082e599e..f5d5206dc 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl @@ -241,10 +241,11 @@ struct Unidirectional // trace new ray ray.origin = intersection + bxdfSample * (1.0/*kSceneSize*/) * Tolerance::getStart(depth); ray.direction = bxdfSample; - // #if POLYGON_METHOD==2 - // ray._immutable.normalAtOrigin = interaction.isotropic.N; - // ray._immutable.wasBSDFAtOrigin = isBSDF; - // #endif + if ((PTPolygonMethod)nee_type::PolygonMethod == PPM_APPROX_PROJECTED_SOLID_ANGLE) + { + ray.normalAtOrigin = interaction.isotropic.N; + ray.wasBSDFAtOrigin = isBSDF; + } return true; } diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp index ae9f162a4..e3e0b7d7a 100644 --- a/31_HLSLPathTracer/main.cpp +++ b/31_HLSLPathTracer/main.cpp @@ -1348,7 +1348,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, float camYAngle = 165.f / 180.f * 3.14159f; float camXAngle = 32.f / 180.f * 3.14159f; int PTPipline = E_LIGHT_GEOMETRY::ELG_SPHERE; - int renderMode = E_RENDER_MODE::ERM_GLSL; + int renderMode = E_RENDER_MODE::ERM_HLSL; int spp = 32; int depth = 3; From c9041343c0ca756c07aa94008f2da790d8ec5b3b Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 20 Mar 2025 20:34:14 +0700 Subject: [PATCH 091/296] Return materialId instead of materialPacked from rchit --- .../app_resources/common.hlsl | 140 +++++++++++++++++- 
.../app_resources/raytrace.rchit.hlsl | 94 +----------- .../app_resources/raytrace.rgen.hlsl | 20 ++- .../app_resources/raytrace.rint.hlsl | 4 +- .../raytrace_procedural.rchit.hlsl | 3 +- 5 files changed, 155 insertions(+), 106 deletions(-) diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl index 0b5f4b170..d64851b17 100644 --- a/71_RayTracingPipeline/app_resources/common.hlsl +++ b/71_RayTracingPipeline/app_resources/common.hlsl @@ -75,6 +75,7 @@ struct SProceduralGeomInfo float32_t radius; }; + struct STriangleGeomInfo { MaterialPacked material; @@ -154,13 +155,6 @@ struct RayLight float32_t outIntensity; }; -struct ProceduralHitAttribute -{ - MaterialPacked material; - float32_t3 center; -}; - - #ifdef __HLSL_VERSION struct [raypayload] OcclusionPayload @@ -168,12 +162,50 @@ struct [raypayload] OcclusionPayload float32_t attenuation : read(caller) : write(caller, anyhit); }; +struct MaterialId +{ + const static uint32_t PROCEDURAL_FLAG = (1 << 31); + const static uint32_t PROCEDURAL_MASK = ~PROCEDURAL_FLAG; + + uint32_t data; + + static MaterialId createProcedural(uint32_t index) + { + MaterialId id; + id.data = index | PROCEDURAL_FLAG; + return id; + } + + static MaterialId createTriangle(uint32_t index) + { + MaterialId id; + id.data = index; + return id; + } + + uint32_t getMaterialIndex() + { + return data & PROCEDURAL_MASK; + } + + bool isHitProceduralGeom() + { + return data & PROCEDURAL_FLAG; + } +}; + struct [raypayload] PrimaryPayload { - MaterialPacked material : read(caller) : write(closesthit); float32_t3 worldNormal : read(caller) : write(closesthit); float32_t rayDistance : read(caller) : write(closesthit, miss); float32_t alphaThreshold : read(closesthit, anyhit) : write(caller); + MaterialId materialId : read(caller) : write(closesthit); + +}; + +struct ProceduralHitAttribute +{ + float32_t3 center; }; enum ObjectType : uint32_t // matches c++ @@ -213,6 +245,98 @@ float32_t3 
computeSpecular(Material mat, float32_t3 view_dir, return float32_t3(mat.specular * specular); } + +float3 unpackNormals3x10(uint32_t v) +{ + // host side changes float32_t3 to EF_A2B10G10R10_SNORM_PACK32 + // follows unpacking scheme from https://github.com/KhronosGroup/SPIRV-Cross/blob/main/reference/shaders-hlsl/frag/unorm-snorm-packing.frag + int signedValue = int(v); + int3 pn = int3(signedValue << 22, signedValue << 12, signedValue << 2) >> 22; + return clamp(float3(pn) / 511.0, -1.0, 1.0); +} + +float32_t3 fetchVertexNormal(int instID, int primID, STriangleGeomInfo geom, float2 bary) +{ + uint idxOffset = primID * 3; + + const uint indexType = geom.indexType; + const uint vertexStride = geom.vertexStride; + + const uint32_t objType = geom.objType; + const uint64_t indexBufferAddress = geom.indexBufferAddress; + + uint i0, i1, i2; + switch (indexType) + { + case 0: // EIT_16BIT + { + i0 = uint32_t(vk::RawBufferLoad < uint16_t > (indexBufferAddress + (idxOffset + 0) * sizeof(uint16_t), 2u)); + i1 = uint32_t(vk::RawBufferLoad < uint16_t > (indexBufferAddress + (idxOffset + 1) * sizeof(uint16_t), 2u)); + i2 = uint32_t(vk::RawBufferLoad < uint16_t > (indexBufferAddress + (idxOffset + 2) * sizeof(uint16_t), 2u)); + } + break; + case 1: // EIT_32BIT + { + i0 = vk::RawBufferLoad < uint32_t > (indexBufferAddress + (idxOffset + 0) * sizeof(uint32_t)); + i1 = vk::RawBufferLoad < uint32_t > (indexBufferAddress + (idxOffset + 1) * sizeof(uint32_t)); + i2 = vk::RawBufferLoad < uint32_t > (indexBufferAddress + (idxOffset + 2) * sizeof(uint32_t)); + } + break; + default: // EIT_NONE + { + i0 = idxOffset; + i1 = idxOffset + 1; + i2 = idxOffset + 2; + } + } + + const uint64_t normalVertexBufferAddress = geom.vertexBufferAddress + s_offsetsToNormalBytes[objType]; + float3 n0, n1, n2; + switch (objType) + { + case OT_CUBE: + { + uint32_t v0 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i0 * vertexStride, 2u); + uint32_t v1 = vk::RawBufferLoad < uint32_t > 
(normalVertexBufferAddress + i1 * vertexStride, 2u); + uint32_t v2 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i2 * vertexStride, 2u); + + n0 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v0).xyz); + n1 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v1).xyz); + n2 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v2).xyz); + } + break; + case OT_SPHERE: + case OT_CYLINDER: + case OT_ARROW: + case OT_CONE: + { + uint32_t v0 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i0 * vertexStride); + uint32_t v1 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i1 * vertexStride); + uint32_t v2 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i2 * vertexStride); + + n0 = normalize(unpackNormals3x10(v0)); + n1 = normalize(unpackNormals3x10(v1)); + n2 = normalize(unpackNormals3x10(v2)); + } + break; + case OT_RECTANGLE: + case OT_DISK: + case OT_ICOSPHERE: + default: + { + n0 = normalize(vk::RawBufferLoad < + float3 > (normalVertexBufferAddress + i0 * vertexStride)); + n1 = normalize(vk::RawBufferLoad < + float3 > (normalVertexBufferAddress + i1 * vertexStride)); + n2 = normalize(vk::RawBufferLoad < + float3 > (normalVertexBufferAddress + i2 * vertexStride)); + } + } + + float3 barycentrics = float3(0.0, bary); + barycentrics.x = 1.0 - barycentrics.y - barycentrics.z; + return normalize(barycentrics.x * n0 + barycentrics.y * n1 + barycentrics.z * n2); +} #endif namespace nbl diff --git a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl index fdb252cda..cf68e52eb 100644 --- a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl @@ -2,97 +2,6 @@ [[vk::push_constant]] SPushConstants pc; -float3 unpackNormals3x10(uint32_t v) -{ - // host side changes float32_t3 to EF_A2B10G10R10_SNORM_PACK32 - // follows unpacking scheme from 
https://github.com/KhronosGroup/SPIRV-Cross/blob/main/reference/shaders-hlsl/frag/unorm-snorm-packing.frag - int signedValue = int(v); - int3 pn = int3(signedValue << 22, signedValue << 12, signedValue << 2) >> 22; - return clamp(float3(pn) / 511.0, -1.0, 1.0); -} - -float32_t3 fetchVertexNormal(int instID, int primID, STriangleGeomInfo geom, float2 bary) -{ - uint idxOffset = primID * 3; - - const uint indexType = geom.indexType; - const uint vertexStride = geom.vertexStride; - - const uint32_t objType = geom.objType; - const uint64_t indexBufferAddress = geom.indexBufferAddress; - - uint i0, i1, i2; - switch (indexType) - { - case 0: // EIT_16BIT - { - i0 = uint32_t(vk::RawBufferLoad < uint16_t > (indexBufferAddress + (idxOffset + 0) * sizeof(uint16_t), 2u)); - i1 = uint32_t(vk::RawBufferLoad < uint16_t > (indexBufferAddress + (idxOffset + 1) * sizeof(uint16_t), 2u)); - i2 = uint32_t(vk::RawBufferLoad < uint16_t > (indexBufferAddress + (idxOffset + 2) * sizeof(uint16_t), 2u)); - } - break; - case 1: // EIT_32BIT - { - i0 = vk::RawBufferLoad < uint32_t > (indexBufferAddress + (idxOffset + 0) * sizeof(uint32_t)); - i1 = vk::RawBufferLoad < uint32_t > (indexBufferAddress + (idxOffset + 1) * sizeof(uint32_t)); - i2 = vk::RawBufferLoad < uint32_t > (indexBufferAddress + (idxOffset + 2) * sizeof(uint32_t)); - } - break; - default: // EIT_NONE - { - i0 = idxOffset; - i1 = idxOffset + 1; - i2 = idxOffset + 2; - } - } - - const uint64_t normalVertexBufferAddress = geom.vertexBufferAddress + s_offsetsToNormalBytes[objType]; - float3 n0, n1, n2; - switch (objType) - { - case OT_CUBE: - { - uint32_t v0 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i0 * vertexStride, 2u); - uint32_t v1 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i1 * vertexStride, 2u); - uint32_t v2 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i2 * vertexStride, 2u); - - n0 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v0).xyz); - n1 = 
normalize(nbl::hlsl::spirv::unpackSnorm4x8(v1).xyz); - n2 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v2).xyz); - } - break; - case OT_SPHERE: - case OT_CYLINDER: - case OT_ARROW: - case OT_CONE: - { - uint32_t v0 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i0 * vertexStride); - uint32_t v1 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i1 * vertexStride); - uint32_t v2 = vk::RawBufferLoad < uint32_t > (normalVertexBufferAddress + i2 * vertexStride); - - n0 = normalize(unpackNormals3x10(v0)); - n1 = normalize(unpackNormals3x10(v1)); - n2 = normalize(unpackNormals3x10(v2)); - } - break; - case OT_RECTANGLE: - case OT_DISK: - case OT_ICOSPHERE: - default: - { - n0 = normalize(vk::RawBufferLoad < - float3 > (normalVertexBufferAddress + i0 * vertexStride)); - n1 = normalize(vk::RawBufferLoad < - float3 > (normalVertexBufferAddress + i1 * vertexStride)); - n2 = normalize(vk::RawBufferLoad < - float3 > (normalVertexBufferAddress + i2 * vertexStride)); - } - } - - float3 barycentrics = float3(0.0, bary); - barycentrics.x = 1.0 - barycentrics.y - barycentrics.z; - return normalize(barycentrics.x * n0 + barycentrics.y * n1 + barycentrics.z * n2); -} [shader("closesthit")] void main(inout PrimaryPayload payload, in BuiltInTriangleIntersectionAttributes attribs) @@ -103,7 +12,8 @@ void main(inout PrimaryPayload payload, in BuiltInTriangleIntersectionAttributes const float32_t3 vertexNormal = fetchVertexNormal(instID, primID, geom, attribs.barycentrics); const float32_t3 worldNormal = normalize(mul(vertexNormal, WorldToObject3x4()).xyz); - payload.material = geom.material; + payload.materialId = MaterialId::createTriangle(instID); + payload.worldNormal = worldNormal; payload.rayDistance = RayTCurrent(); diff --git a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl index fc6383dcf..ef84ced3e 100644 --- a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl +++ 
b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl @@ -58,15 +58,29 @@ void main() payload.alphaThreshold = nextRandomUnorm(rnd); TraceRay(topLevelAS, RAY_FLAG_NONE, 0xff, ERT_PRIMARY, 0, EMT_PRIMARY, rayDesc, payload); - if (payload.rayDistance < 0) + const float32_t rayDistance = payload.rayDistance; + if (rayDistance < 0) { hitValues += s_clearColor; continue; } - const float32_t3 worldPosition = pc.camPos + (camDirection * payload.rayDistance); + const float32_t3 worldPosition = pc.camPos + (camDirection * rayDistance); const float32_t3 worldNormal = payload.worldNormal; - const Material material = nbl::hlsl::_static_cast(payload.material); + + Material material; + MaterialId materialId = payload.materialId; + // we use negative index to indicate that this is a procedural geometry + if (materialId.isHitProceduralGeom()) + { + const MaterialPacked materialPacked = vk::RawBufferLoad(pc.proceduralGeomInfoBuffer + materialId.getMaterialIndex() * sizeof(SProceduralGeomInfo)); + material = nbl::hlsl::_static_cast(materialPacked); + } + else + { + const MaterialPacked materialPacked = vk::RawBufferLoad(pc.triangleGeomInfoBuffer + materialId.getMaterialIndex() * sizeof(STriangleGeomInfo)); + material = nbl::hlsl::_static_cast(materialPacked); + } RayLight cLight; cLight.inHitPosition = worldPosition; CallShader(pc.light.type, cLight); diff --git a/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl index b9941fc59..ab623382d 100644 --- a/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl @@ -38,16 +38,16 @@ void main() const int primID = PrimitiveIndex(); // Sphere data - SProceduralGeomInfo sphere = vk::RawBufferLoad < SProceduralGeomInfo > (pc.proceduralGeomInfoBuffer + primID * sizeof(SProceduralGeomInfo)); + SProceduralGeomInfo sphere = vk::RawBufferLoad(pc.proceduralGeomInfoBuffer + primID * sizeof(SProceduralGeomInfo)); const 
float32_t tHit = hitSphere(sphere, ray); ProceduralHitAttribute hitAttrib; + // Report hit point if (tHit > 0) { hitAttrib.center = sphere.center; - hitAttrib.material = sphere.material; ReportHit(tHit, 0, hitAttrib); } } \ No newline at end of file diff --git a/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl index 0a58ccba8..df9ef9623 100644 --- a/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace_procedural.rchit.hlsl @@ -8,7 +8,8 @@ void main(inout PrimaryPayload payload, in ProceduralHitAttribute attrib) const float32_t3 worldPosition = WorldRayOrigin() + WorldRayDirection() * RayTCurrent(); const float32_t3 worldNormal = normalize(worldPosition - attrib.center); - payload.material = attrib.material; + payload.materialId = MaterialId::createProcedural(PrimitiveIndex()); // we use negative value to indicate that this is procedural + payload.worldNormal = worldNormal; payload.rayDistance = RayTCurrent(); From 6b3ae5402a0c2b7c85506d9c905a89ddd7257e14 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 20 Mar 2025 20:38:51 +0700 Subject: [PATCH 092/296] Add ray tracing no null flags --- 71_RayTracingPipeline/main.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index 036acd510..5fe6f8847 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -351,7 +351,10 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, params.layout = pipelineLayout.get(); params.shaders = std::span(shaders); - params.flags = IGPURayTracingPipeline::SCreationParams::FLAGS::NO_NULL_INTERSECTION_SHADERS; + using RayTracingFlags = IGPURayTracingPipeline::SCreationParams::FLAGS; + params.flags = core::bitflag(RayTracingFlags::NO_NULL_INTERSECTION_SHADERS) | + RayTracingFlags::NO_NULL_ANY_HIT_SHADERS | 
+ RayTracingFlags::NO_NULL_CLOSEST_HIT_SHADERS; auto& shaderGroups = params.shaderGroups; From b2abf0042c60d1524ef14a93ebe384172f3de3d1 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 20 Mar 2025 20:58:48 +0700 Subject: [PATCH 093/296] Add setRayTracingStackSize in the demo --- 71_RayTracingPipeline/main.cpp | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index 5fe6f8847..1e4619b46 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -397,11 +397,13 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, shaderGroups.callables = callableGroups; params.cached.maxRecursionDepth = 1; + params.cached.dynamicStackSize = true; if (!m_device->createRayTracingPipelines(nullptr, { ¶ms, 1 }, &m_rayTracingPipeline)) return logFail("Failed to create ray tracing pipeline"); - m_logger->log("Ray Tracing Pipeline Created!", system::ILogger::ELL_INFO); + calculateRayTracingStackSize(m_rayTracingPipeline); + if (!createShaderBindingTable(gQueue, m_rayTracingPipeline)) return logFail("Could not create shader binding table"); @@ -732,6 +734,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, memcpy(&pc.invMVP, invModelViewProjectionMatrix.pointer(), sizeof(pc.invMVP)); cmdbuf->bindRayTracingPipeline(m_rayTracingPipeline.get()); + cmdbuf->setRayTracingPipelineStackSize(m_rayTracingStackSize); cmdbuf->pushConstants(m_rayTracingPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING, 0, sizeof(SPushConstants), &pc); cmdbuf->bindDescriptorSets(EPBP_RAY_TRACING, m_rayTracingPipeline->getLayout(), 0, 1, &m_rayTracingDs.get()); if (m_useIndirectCommand) @@ -1332,6 +1335,29 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, return true; } + void calculateRayTracingStackSize(const smart_refctd_ptr& pipeline) + { + const auto raygenStackSize = 
pipeline->getRaygenStackSize(); + auto getMaxSize = [&](auto ranges, auto valProj) -> uint16_t + { + auto maxValue = 0; + for (const auto& val : ranges) + { + maxValue = std::max(maxValue, std::invoke(valProj, val)); + } + return maxValue; + }; + + const auto closestHitStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::closestHit); + const auto anyHitStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::anyHit); + const auto intersectionStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::intersection); + const auto missStackMax = getMaxSize(pipeline->getMissStackSizes(), std::identity{}); + const auto callableStackMax = getMaxSize(pipeline->getCallableStackSizes(), std::identity{}); + auto firstDepthStackSizeMax = std::max(closestHitStackMax, missStackMax); + firstDepthStackSizeMax = std::max(firstDepthStackSizeMax, intersectionStackMax + anyHitStackMax); + m_rayTracingStackSize = raygenStackSize + std::max(firstDepthStackSizeMax, callableStackMax); + } + bool createShaderBindingTable(video::CThreadSafeQueueAdapter* queue, const smart_refctd_ptr& pipeline) { const auto& limits = m_device->getPhysicalDevice()->getLimits(); @@ -1823,6 +1849,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, smart_refctd_ptr m_rayTracingDsPool; smart_refctd_ptr m_rayTracingDs; smart_refctd_ptr m_rayTracingPipeline; + uint64_t m_rayTracingStackSize; ShaderBindingTable m_shaderBindingTable; smart_refctd_ptr m_presentDs; From e95f09d5d20181c4107064cec08bddc689a7f399 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 21 Mar 2025 16:48:41 +0700 Subject: [PATCH 094/296] changed workgroup size to 512 --- 31_HLSLPathTracer/app_resources/glsl/common.glsl | 2 +- .../app_resources/hlsl/render.comp.hlsl | 2 +- 31_HLSLPathTracer/imgui.ini | 8 -------- 31_HLSLPathTracer/main.cpp | 16 ++++++++-------- 4 files changed, 10 
insertions(+), 18 deletions(-) delete mode 100644 31_HLSLPathTracer/imgui.ini diff --git a/31_HLSLPathTracer/app_resources/glsl/common.glsl b/31_HLSLPathTracer/app_resources/glsl/common.glsl index b09c90824..9015f755d 100644 --- a/31_HLSLPathTracer/app_resources/glsl/common.glsl +++ b/31_HLSLPathTracer/app_resources/glsl/common.glsl @@ -16,7 +16,7 @@ layout(set = 2, binding = 2) uniform usampler2D scramblebuf; layout(set=0, binding=0, rgba16f) uniform image2D outImage; #ifndef _NBL_GLSL_WORKGROUP_SIZE_ -#define _NBL_GLSL_WORKGROUP_SIZE_ 256 +#define _NBL_GLSL_WORKGROUP_SIZE_ 1024 layout(local_size_x=_NBL_GLSL_WORKGROUP_SIZE_, local_size_y=1, local_size_z=1) in; #endif diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl index 5e8102f6f..d0c969b8b 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl @@ -37,7 +37,7 @@ using namespace nbl::hlsl; -NBL_CONSTEXPR uint32_t WorkgroupSize = 256; +NBL_CONSTEXPR uint32_t WorkgroupSize = 1024; NBL_CONSTEXPR uint32_t MAX_DEPTH_LOG2 = 4; NBL_CONSTEXPR uint32_t MAX_SAMPLES_LOG2 = 10; diff --git a/31_HLSLPathTracer/imgui.ini b/31_HLSLPathTracer/imgui.ini deleted file mode 100644 index e60624929..000000000 --- a/31_HLSLPathTracer/imgui.ini +++ /dev/null @@ -1,8 +0,0 @@ -[Window][Debug##Default] -Pos=60,60 -Size=400,400 - -[Window][Controls] -Pos=10,10 -Size=320,340 - diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp index e3e0b7d7a..add980078 100644 --- a/31_HLSLPathTracer/main.cpp +++ b/31_HLSLPathTracer/main.cpp @@ -48,7 +48,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, constexpr static inline uint32_t2 WindowDimensions = { 1280, 720 }; constexpr static inline uint32_t MaxFramesInFlight = 5; constexpr static inline clock_t::duration DisplayImageDuration = std::chrono::milliseconds(900); - constexpr static inline uint32_t 
DefaultWorkGroupSize = 256u; + constexpr static inline uint32_t DefaultWorkGroupSize = 1024u; constexpr static inline uint32_t MaxDescriptorCount = 256u; constexpr static inline uint32_t MaxDepthLog2 = 4u; // 5 constexpr static inline uint32_t MaxSamplesLog2 = 10u; // 18 @@ -366,12 +366,12 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, options.stage = IShader::E_SHADER_STAGE::ESS_COMPUTE; // should be compute options.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion; options.spirvOptimizer = nullptr; -#ifndef _NBL_DEBUG - ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO; - auto opt = make_smart_refctd_ptr(std::span(&optPasses, 1)); - options.spirvOptimizer = opt.get(); -#endif - options.debugInfoFlags = IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_NONE; +//#ifndef _NBL_DEBUG +// ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO; +// auto opt = make_smart_refctd_ptr(std::span(&optPasses, 1)); +// options.spirvOptimizer = opt.get(); +//#endif + options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_LINE_BIT; options.preprocessorOptions.sourceIdentifier = source->getFilepathHint(); options.preprocessorOptions.logger = m_logger.get(); options.preprocessorOptions.includeFinder = compiler->getDefaultIncludeFinder(); @@ -1348,7 +1348,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, float camYAngle = 165.f / 180.f * 3.14159f; float camXAngle = 32.f / 180.f * 3.14159f; int PTPipline = E_LIGHT_GEOMETRY::ELG_SPHERE; - int renderMode = E_RENDER_MODE::ERM_HLSL; + int renderMode = E_RENDER_MODE::ERM_GLSL; int spp = 32; int depth = 3; From 56994a9d36ae0e21e54a07aa76e1e5bbe2e2d959 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 21 Mar 2025 16:51:39 +0700 Subject: [PATCH 095/296] workgroup size 512 for sure this time --- 31_HLSLPathTracer/app_resources/glsl/common.glsl | 2 +- 
.../app_resources/hlsl/render.comp.hlsl | 2 +- 31_HLSLPathTracer/main.cpp | 14 +++++++------- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/31_HLSLPathTracer/app_resources/glsl/common.glsl b/31_HLSLPathTracer/app_resources/glsl/common.glsl index 9015f755d..1a1594e6a 100644 --- a/31_HLSLPathTracer/app_resources/glsl/common.glsl +++ b/31_HLSLPathTracer/app_resources/glsl/common.glsl @@ -16,7 +16,7 @@ layout(set = 2, binding = 2) uniform usampler2D scramblebuf; layout(set=0, binding=0, rgba16f) uniform image2D outImage; #ifndef _NBL_GLSL_WORKGROUP_SIZE_ -#define _NBL_GLSL_WORKGROUP_SIZE_ 1024 +#define _NBL_GLSL_WORKGROUP_SIZE_ 512 layout(local_size_x=_NBL_GLSL_WORKGROUP_SIZE_, local_size_y=1, local_size_z=1) in; #endif diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl index d0c969b8b..b0d221a20 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl @@ -37,7 +37,7 @@ using namespace nbl::hlsl; -NBL_CONSTEXPR uint32_t WorkgroupSize = 1024; +NBL_CONSTEXPR uint32_t WorkgroupSize = 512; NBL_CONSTEXPR uint32_t MAX_DEPTH_LOG2 = 4; NBL_CONSTEXPR uint32_t MAX_SAMPLES_LOG2 = 10; diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp index add980078..8394889db 100644 --- a/31_HLSLPathTracer/main.cpp +++ b/31_HLSLPathTracer/main.cpp @@ -48,7 +48,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, constexpr static inline uint32_t2 WindowDimensions = { 1280, 720 }; constexpr static inline uint32_t MaxFramesInFlight = 5; constexpr static inline clock_t::duration DisplayImageDuration = std::chrono::milliseconds(900); - constexpr static inline uint32_t DefaultWorkGroupSize = 1024u; + constexpr static inline uint32_t DefaultWorkGroupSize = 512u; constexpr static inline uint32_t MaxDescriptorCount = 256u; constexpr static inline uint32_t MaxDepthLog2 = 4u; // 5 constexpr static 
inline uint32_t MaxSamplesLog2 = 10u; // 18 @@ -366,11 +366,11 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, options.stage = IShader::E_SHADER_STAGE::ESS_COMPUTE; // should be compute options.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion; options.spirvOptimizer = nullptr; -//#ifndef _NBL_DEBUG -// ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO; -// auto opt = make_smart_refctd_ptr(std::span(&optPasses, 1)); -// options.spirvOptimizer = opt.get(); -//#endif +#ifndef _NBL_DEBUG + ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO; + auto opt = make_smart_refctd_ptr(std::span(&optPasses, 1)); + options.spirvOptimizer = opt.get(); +#endif options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_LINE_BIT; options.preprocessorOptions.sourceIdentifier = source->getFilepathHint(); options.preprocessorOptions.logger = m_logger.get(); @@ -1348,7 +1348,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, float camYAngle = 165.f / 180.f * 3.14159f; float camXAngle = 32.f / 180.f * 3.14159f; int PTPipline = E_LIGHT_GEOMETRY::ELG_SPHERE; - int renderMode = E_RENDER_MODE::ERM_GLSL; + int renderMode = E_RENDER_MODE::ERM_HLSL; int spp = 32; int depth = 3; From 3cdfb4baf2df319643620a8189c277dec20cb163 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 24 Mar 2025 14:43:11 +0700 Subject: [PATCH 096/296] use morton and virtual indexing --- .../app_resources/glsl/common.glsl | 199 +++++++++--------- .../app_resources/hlsl/render.comp.hlsl | 93 ++++---- 31_HLSLPathTracer/main.cpp | 3 +- 3 files changed, 156 insertions(+), 139 deletions(-) diff --git a/31_HLSLPathTracer/app_resources/glsl/common.glsl b/31_HLSLPathTracer/app_resources/glsl/common.glsl index 1a1594e6a..c04ad2b11 100644 --- a/31_HLSLPathTracer/app_resources/glsl/common.glsl +++ b/31_HLSLPathTracer/app_resources/glsl/common.glsl @@ -9,7 +9,7 @@ 
// debug //#define NEE_ONLY -layout(set = 2, binding = 0) uniform sampler2D envMap; +layout(set = 2, binding = 0) uniform sampler2D envMap; layout(set = 2, binding = 1) uniform usamplerBuffer sampleSequence; layout(set = 2, binding = 2) uniform usampler2D scramblebuf; @@ -35,6 +35,7 @@ vec2 getTexCoords() { #include #include #include +#include #include @@ -51,7 +52,7 @@ struct Sphere vec3 position; float radius2; uint bsdfLightIDs; -}; +}; Sphere Sphere_Sphere(in vec3 position, in float radius, in uint bsdfID, in uint lightID) { @@ -188,7 +189,7 @@ void Rectangle_getNormalBasis(in Rectangle rect, out mat3 basis, out vec2 extent basis[0] = rect.edge0/extents[0]; basis[1] = rect.edge1/extents[1]; basis[2] = normalize(cross(basis[0],basis[1])); -} +} // return intersection distance if found, nbl_glsl_FLT_NAN otherwise float Rectangle_intersect(in Rectangle rect, in vec3 origin, in vec3 direction) @@ -222,7 +223,7 @@ vec3 Rectangle_getNormalTimesArea(in Rectangle rect) #define OP_BITS_OFFSET 0 #define OP_BITS_SIZE 2 struct BSDFNode -{ +{ uvec4 data[2]; }; @@ -386,13 +387,13 @@ vec2 SampleSphericalMap(vec3 v) { vec2 uv = vec2(atan(v.z, v.x), asin(v.y)); uv *= nbl_glsl_RECIPROCAL_PI*0.5; - uv += 0.5; + uv += 0.5; return uv; } void missProgram(in ImmutableRay_t _immutable, inout Payload_t _payload) { - vec3 finalContribution = _payload.throughput; + vec3 finalContribution = _payload.throughput; // #define USE_ENVMAP #ifdef USE_ENVMAP vec2 uv = SampleSphericalMap(_immutable.direction); @@ -415,7 +416,7 @@ nbl_glsl_LightSample nbl_glsl_bsdf_cos_generate(in nbl_glsl_AnisotropicViewSurfa { const float a = BSDFNode_getRoughness(bsdf); const mat2x3 ior = BSDFNode_getEta(bsdf); - + // fresnel stuff for dielectrics float orientedEta, rcpOrientedEta; const bool viewerInsideMedium = nbl_glsl_getOrientedEtas(orientedEta,rcpOrientedEta,interaction.isotropic.NdotV,monochromeEta); @@ -519,7 +520,7 @@ int traceRay(inout float intersectionT, in vec3 origin, in vec3 direction) 
intersectionT = closerIntersection ? t : intersectionT; objectID = closerIntersection ? i:objectID; - + // allowing early out results in a performance regression, WTF!? //if (anyHit && closerIntersection) //break; @@ -543,7 +544,7 @@ nbl_glsl_LightSample nbl_glsl_light_generate_and_remainder_and_pdf(out vec3 rema { // normally we'd pick from set of lights, using `xi.z` const Light light = lights[0]; - + vec3 L = nbl_glsl_light_generate_and_pdf(pdf,newRayMaxT,origin,interaction,isBSDF,xi,Light_getObjectID(light)); newRayMaxT *= getEndTolerance(depth); @@ -663,7 +664,7 @@ bool closestHitProgram(in uint depth, in uint _sample, inout Ray_t ray, inout nb // bsdfSampleL = bsdf_sample.L; } - + // additional threshold const float lumaThroughputThreshold = lumaContributionThreshold; if (bsdfPdf>bsdfPdfThreshold && getLuma(throughput)>lumaThroughputThreshold) @@ -671,7 +672,7 @@ bool closestHitProgram(in uint depth, in uint _sample, inout Ray_t ray, inout nb ray._payload.throughput = throughput; ray._payload.otherTechniqueHeuristic = neeProbability/bsdfPdf; // numerically stable, don't touch ray._payload.otherTechniqueHeuristic *= ray._payload.otherTechniqueHeuristic; - + // trace new ray ray._immutable.origin = intersection+bsdfSampleL*(1.0/*kSceneSize*/)*getStartTolerance(depth); ray._immutable.direction = bsdfSampleL; @@ -688,109 +689,115 @@ bool closestHitProgram(in uint depth, in uint _sample, inout Ray_t ray, inout nb void main() { const ivec2 imageExtents = imageSize(outImage); - const ivec2 coords = getCoordinates(); - vec2 texCoord = vec2(coords) / vec2(imageExtents); - texCoord.y = 1.0 - texCoord.y; - - if (false == (all(lessThanEqual(ivec2(0),coords)) && all(greaterThan(imageExtents,coords)))) { - return; - } - if (((PTPushConstant.depth-1)>>MAX_DEPTH_LOG2)>0 || ((PTPushConstant.sampleCount-1)>>MAX_SAMPLES_LOG2)>0) + uint virtualThreadIndex; + for (uint virtualThreadBase = gl_WorkGroupID.x * _NBL_GLSL_WORKGROUP_SIZE_; virtualThreadBase < 1920*1080; 
virtualThreadBase += gl_NumWorkGroups.x * _NBL_GLSL_WORKGROUP_SIZE_) // not sure why 1280*720 doesn't cover entire window { - vec4 pixelCol = vec4(1.0,0.0,0.0,1.0); - imageStore(outImage, coords, pixelCol); - return; - } - - nbl_glsl_xoroshiro64star_state_t scramble_start_state = texelFetch(scramblebuf,coords,0).rg; - const vec2 pixOffsetParam = vec2(1.0)/vec2(textureSize(scramblebuf,0)); + virtualThreadIndex = virtualThreadBase + gl_LocalInvocationIndex.x; + const ivec2 coords = ivec2(nbl_glsl_morton_decode2d32b(virtualThreadIndex)); // getCoordinates(); + vec2 texCoord = vec2(coords) / vec2(imageExtents); + texCoord.y = 1.0 - texCoord.y; + if (false == (all(lessThanEqual(ivec2(0),coords)) && all(greaterThan(imageExtents,coords)))) { + continue; + } - const mat4 invMVP = PTPushConstant.invMVP; - - vec4 NDC = vec4(texCoord*vec2(2.0,-2.0)+vec2(-1.0,1.0),0.0,1.0); - vec3 camPos; - { - vec4 tmp = invMVP*NDC; - camPos = tmp.xyz/tmp.w; - NDC.z = 1.0; - } + if (((PTPushConstant.depth-1)>>MAX_DEPTH_LOG2)>0 || ((PTPushConstant.sampleCount-1)>>MAX_SAMPLES_LOG2)>0) + { + vec4 pixelCol = vec4(1.0,0.0,0.0,1.0); + imageStore(outImage, coords, pixelCol); + continue; + } - vec3 color = vec3(0.0); - float meanLumaSquared = 0.0; - // TODO: if we collapse the nested for loop, then all GPUs will get `PTPushConstant.depth` factor speedup, not just NV with separate PC - for (int i=0; i5.0) + color = vec3(1.0,0.0,0.0); #endif - } - #ifdef VISUALIZE_HIGH_VARIANCE - float variance = getLuma(color); - variance *= variance; - variance = meanLumaSquared-variance; - if (variance>5.0) - color = vec3(1.0,0.0,0.0); - #endif - - vec4 pixelCol = vec4(color, 1.0); - imageStore(outImage, coords, pixelCol); + vec4 pixelCol = vec4(color, 1.0); + imageStore(outImage, coords, pixelCol); + } } /** TODO: Improving Rendering diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl index b0d221a20..ed7e4a85e 100644 --- 
a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl @@ -2,6 +2,7 @@ #include "nbl/builtin/hlsl/glsl_compat/core.hlsl" #include "nbl/builtin/hlsl/random/pcg.hlsl" #include "nbl/builtin/hlsl/random/xoroshiro.hlsl" +#include "nbl/builtin/hlsl/math/morton.hlsl" #include "nbl/builtin/hlsl/bxdf/reflection.hlsl" #include "nbl/builtin/hlsl/bxdf/transmission.hlsl" @@ -35,7 +36,8 @@ #include "render_common.hlsl" #include "pathtracer.hlsl" -using namespace nbl::hlsl; +using namespace nbl; +using namespace hlsl; NBL_CONSTEXPR uint32_t WorkgroupSize = 512; NBL_CONSTEXPR uint32_t MAX_DEPTH_LOG2 = 4; @@ -155,48 +157,55 @@ void main(uint32_t3 threadID : SV_DispatchThreadID) { uint32_t width, height; outImage.GetDimensions(width, height); - const int32_t2 coords = getCoordinates(); - float32_t2 texCoord = float32_t2(coords) / float32_t2(width, height); - texCoord.y = 1.0 - texCoord.y; - if (false == (all((int32_t2)0 < coords)) && all(int32_t2(width, height) < coords)) { - return; - } - - if (((pc.depth - 1) >> MAX_DEPTH_LOG2) > 0 || ((pc.sampleCount - 1) >> MAX_SAMPLES_LOG2) > 0) - { - float32_t4 pixelCol = float32_t4(1.0,0.0,0.0,1.0); - outImage[coords] = pixelCol; - return; - } - - int flatIdx = glsl::gl_GlobalInvocationID().y * glsl::gl_NumWorkGroups().x * WorkgroupSize + glsl::gl_GlobalInvocationID().x; - - // set up path tracer - ext::PathTracer::PathTracerCreationParams ptCreateParams; - ptCreateParams.rngState = scramblebuf[coords].rg; - - uint2 scrambleDim; - scramblebuf.GetDimensions(scrambleDim.x, scrambleDim.y); - ptCreateParams.pixOffsetParam = (float2)1.0 / float2(scrambleDim); - - float4 NDC = float4(texCoord * float2(2.0, -2.0) + float2(-1.0, 1.0), 0.0, 1.0); + uint32_t virtualThreadIndex; + [loop] + for (uint32_t virtualThreadBase = glsl::gl_WorkGroupID().x * WorkgroupSize; virtualThreadBase < 1920*1080; virtualThreadBase += glsl::gl_NumWorkGroups().x * WorkgroupSize) // not sure why 1280*720 doesn't 
cover entire window { - float4 tmp = mul(pc.invMVP, NDC); - ptCreateParams.camPos = tmp.xyz / tmp.w; - NDC.z = 1.0; + virtualThreadIndex = virtualThreadBase + glsl::gl_LocalInvocationIndex().x; + const int32_t2 coords = (int32_t2)math::Morton::decode2d(virtualThreadIndex); // getCoordinates(); + float32_t2 texCoord = float32_t2(coords) / float32_t2(width, height); + texCoord.y = 1.0 - texCoord.y; + + if (false == (hlsl::all((int32_t2)0 < coords)) && hlsl::all(int32_t2(width, height) < coords)) { + continue; + } + + if (((pc.depth - 1) >> MAX_DEPTH_LOG2) > 0 || ((pc.sampleCount - 1) >> MAX_SAMPLES_LOG2) > 0) + { + float32_t4 pixelCol = float32_t4(1.0,0.0,0.0,1.0); + outImage[coords] = pixelCol; + continue; + } + + int flatIdx = glsl::gl_GlobalInvocationID().y * glsl::gl_NumWorkGroups().x * WorkgroupSize + glsl::gl_GlobalInvocationID().x; + + // set up path tracer + ext::PathTracer::PathTracerCreationParams ptCreateParams; + ptCreateParams.rngState = scramblebuf[coords].rg; + + uint2 scrambleDim; + scramblebuf.GetDimensions(scrambleDim.x, scrambleDim.y); + ptCreateParams.pixOffsetParam = (float2)1.0 / float2(scrambleDim); + + float4 NDC = float4(texCoord * float2(2.0, -2.0) + float2(-1.0, 1.0), 0.0, 1.0); + { + float4 tmp = hlsl::mul(pc.invMVP, NDC); + ptCreateParams.camPos = tmp.xyz / tmp.w; + NDC.z = 1.0; + } + + ptCreateParams.NDC = NDC; + ptCreateParams.invMVP = pc.invMVP; + + ptCreateParams.diffuseParams = bxdfs[0].params; + ptCreateParams.conductorParams = bxdfs[3].params; + ptCreateParams.dielectricParams = bxdfs[6].params; + + pathtracer_type pathtracer = pathtracer_type::create(ptCreateParams); + + float32_t3 color = pathtracer.getMeasure(pc.sampleCount, pc.depth, scene); + float32_t4 pixCol = float32_t4(color, 1.0); + outImage[coords] = pixCol; } - - ptCreateParams.NDC = NDC; - ptCreateParams.invMVP = pc.invMVP; - - ptCreateParams.diffuseParams = bxdfs[0].params; - ptCreateParams.conductorParams = bxdfs[3].params; - ptCreateParams.dielectricParams = 
bxdfs[6].params; - - pathtracer_type pathtracer = pathtracer_type::create(ptCreateParams); - - float32_t3 color = pathtracer.getMeasure(pc.sampleCount, pc.depth, scene); - float32_t4 pixCol = float32_t4(color, 1.0); - outImage[coords] = pixCol; } diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp index 8394889db..db1e198c5 100644 --- a/31_HLSLPathTracer/main.cpp +++ b/31_HLSLPathTracer/main.cpp @@ -1068,7 +1068,8 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &m_descriptorSet0.get()); cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 2u, 1u, &m_descriptorSet2.get()); cmdbuf->pushConstants(pipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(PTPushConstant), &pc); - cmdbuf->dispatch(1 + (WindowDimensions.x * WindowDimensions.y - 1) / DefaultWorkGroupSize, 1u, 1u); + uint32_t dispatchSize = m_physicalDevice->getLimits().computeOptimalPersistentWorkgroupDispatchSize(WindowDimensions.x * WindowDimensions.y, DefaultWorkGroupSize); + cmdbuf->dispatch(dispatchSize, 1u, 1u); } // TRANSITION m_outImgView to READ (because of descriptorSets0 -> ComputeShader Writes into the image) From 5f93cec878eafcd03a0af1b3d1e4a136deb9bade Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 24 Mar 2025 15:32:09 +0700 Subject: [PATCH 097/296] reverted virtual index, fix hlsl colors --- .../app_resources/glsl/common.glsl | 173 +++++++++--------- .../app_resources/hlsl/render.comp.hlsl | 96 +++++----- 31_HLSLPathTracer/main.cpp | 3 +- 3 files changed, 128 insertions(+), 144 deletions(-) diff --git a/31_HLSLPathTracer/app_resources/glsl/common.glsl b/31_HLSLPathTracer/app_resources/glsl/common.glsl index c04ad2b11..6c2b5f42f 100644 --- a/31_HLSLPathTracer/app_resources/glsl/common.glsl +++ b/31_HLSLPathTracer/app_resources/glsl/common.glsl @@ -35,7 +35,6 @@ vec2 getTexCoords() { #include #include #include -#include #include @@ -689,115 
+688,109 @@ bool closestHitProgram(in uint depth, in uint _sample, inout Ray_t ray, inout nb void main() { const ivec2 imageExtents = imageSize(outImage); + const ivec2 coords = getCoordinates(); + vec2 texCoord = vec2(coords) / vec2(imageExtents); + texCoord.y = 1.0 - texCoord.y; - uint virtualThreadIndex; - for (uint virtualThreadBase = gl_WorkGroupID.x * _NBL_GLSL_WORKGROUP_SIZE_; virtualThreadBase < 1920*1080; virtualThreadBase += gl_NumWorkGroups.x * _NBL_GLSL_WORKGROUP_SIZE_) // not sure why 1280*720 doesn't cover entire window + if (false == (all(lessThanEqual(ivec2(0),coords)) && all(greaterThan(imageExtents,coords)))) { + return; + } + + if (((PTPushConstant.depth-1)>>MAX_DEPTH_LOG2)>0 || ((PTPushConstant.sampleCount-1)>>MAX_SAMPLES_LOG2)>0) { - virtualThreadIndex = virtualThreadBase + gl_LocalInvocationIndex.x; - const ivec2 coords = ivec2(nbl_glsl_morton_decode2d32b(virtualThreadIndex)); // getCoordinates(); - vec2 texCoord = vec2(coords) / vec2(imageExtents); - texCoord.y = 1.0 - texCoord.y; + vec4 pixelCol = vec4(1.0,0.0,0.0,1.0); + imageStore(outImage, coords, pixelCol); + return; + } - if (false == (all(lessThanEqual(ivec2(0),coords)) && all(greaterThan(imageExtents,coords)))) { - continue; - } + nbl_glsl_xoroshiro64star_state_t scramble_start_state = texelFetch(scramblebuf,coords,0).rg; + const vec2 pixOffsetParam = vec2(1.0)/vec2(textureSize(scramblebuf,0)); - if (((PTPushConstant.depth-1)>>MAX_DEPTH_LOG2)>0 || ((PTPushConstant.sampleCount-1)>>MAX_SAMPLES_LOG2)>0) - { - vec4 pixelCol = vec4(1.0,0.0,0.0,1.0); - imageStore(outImage, coords, pixelCol); - continue; - } - nbl_glsl_xoroshiro64star_state_t scramble_start_state = texelFetch(scramblebuf,coords,0).rg; - const vec2 pixOffsetParam = vec2(1.0)/vec2(textureSize(scramblebuf,0)); + const mat4 invMVP = PTPushConstant.invMVP; + vec4 NDC = vec4(texCoord*vec2(2.0,-2.0)+vec2(-1.0,1.0),0.0,1.0); + vec3 camPos; + { + vec4 tmp = invMVP*NDC; + camPos = tmp.xyz/tmp.w; + NDC.z = 1.0; + } - const mat4 invMVP 
= PTPushConstant.invMVP; + vec3 color = vec3(0.0); + float meanLumaSquared = 0.0; + // TODO: if we collapse the nested for loop, then all GPUs will get `PTPushConstant.depth` factor speedup, not just NV with separate PC + for (int i=0; i5.0) - color = vec3(1.0,0.0,0.0); + float luma = getLuma(accumulation); + meanLumaSquared += (luma*luma-meanLumaSquared)*rcpSampleSize; #endif - - vec4 pixelCol = vec4(color, 1.0); - imageStore(outImage, coords, pixelCol); } + + #ifdef VISUALIZE_HIGH_VARIANCE + float variance = getLuma(color); + variance *= variance; + variance = meanLumaSquared-variance; + if (variance>5.0) + color = vec3(1.0,0.0,0.0); + #endif + + vec4 pixelCol = vec4(color, 1.0); + imageStore(outImage, coords, pixelCol); } /** TODO: Improving Rendering diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl index ed7e4a85e..b187a1b33 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl @@ -2,7 +2,6 @@ #include "nbl/builtin/hlsl/glsl_compat/core.hlsl" #include "nbl/builtin/hlsl/random/pcg.hlsl" #include "nbl/builtin/hlsl/random/xoroshiro.hlsl" -#include "nbl/builtin/hlsl/math/morton.hlsl" #include "nbl/builtin/hlsl/bxdf/reflection.hlsl" #include "nbl/builtin/hlsl/bxdf/transmission.hlsl" @@ -140,9 +139,9 @@ static const bxdfnode_type bxdfs[BXDF_COUNT] = { bxdfnode_type::create(ext::MaterialSystem::MaterialType::DIFFUSE, false, float2(0,0), spectral_t(0.8,0.8,0.8)), bxdfnode_type::create(ext::MaterialSystem::MaterialType::DIFFUSE, false, float2(0,0), spectral_t(0.8,0.4,0.4)), bxdfnode_type::create(ext::MaterialSystem::MaterialType::DIFFUSE, false, float2(0,0), spectral_t(0.4,0.8,0.4)), - bxdfnode_type::create(ext::MaterialSystem::MaterialType::CONDUCTOR, false, float2(0,0), spectral_t(1,1,1), spectral_t(0.98,0.98,0.77)), - bxdfnode_type::create(ext::MaterialSystem::MaterialType::CONDUCTOR, false, float2(0,0), spectral_t(1,1,1), 
spectral_t(0.98,0.77,0.98)), - bxdfnode_type::create(ext::MaterialSystem::MaterialType::CONDUCTOR, false, float2(0.15,0.15), spectral_t(1,1,1), spectral_t(0.98,0.77,0.98)), + bxdfnode_type::create(ext::MaterialSystem::MaterialType::CONDUCTOR, false, float2(0,0), spectral_t(1.02,1.02,1.3), spectral_t(1.0,1.0,2.0)), + bxdfnode_type::create(ext::MaterialSystem::MaterialType::CONDUCTOR, false, float2(0,0), spectral_t(1.02,1.3,1.02), spectral_t(1.0,2.0,1.0)), + bxdfnode_type::create(ext::MaterialSystem::MaterialType::CONDUCTOR, false, float2(0.15,0.15), spectral_t(1.02,1.3,1.02), spectral_t(1.0,2.0,1.0)), bxdfnode_type::create(ext::MaterialSystem::MaterialType::DIELECTRIC, false, float2(0.0625,0.0625), spectral_t(1,1,1), spectral_t(0.71,0.69,0.67)) }; @@ -157,55 +156,48 @@ void main(uint32_t3 threadID : SV_DispatchThreadID) { uint32_t width, height; outImage.GetDimensions(width, height); + const int32_t2 coords = getCoordinates(); + float32_t2 texCoord = float32_t2(coords) / float32_t2(width, height); + texCoord.y = 1.0 - texCoord.y; - uint32_t virtualThreadIndex; - [loop] - for (uint32_t virtualThreadBase = glsl::gl_WorkGroupID().x * WorkgroupSize; virtualThreadBase < 1920*1080; virtualThreadBase += glsl::gl_NumWorkGroups().x * WorkgroupSize) // not sure why 1280*720 doesn't cover entire window + if (false == (all((int32_t2)0 < coords)) && all(int32_t2(width, height) < coords)) { + return; + } + + if (((pc.depth - 1) >> MAX_DEPTH_LOG2) > 0 || ((pc.sampleCount - 1) >> MAX_SAMPLES_LOG2) > 0) + { + float32_t4 pixelCol = float32_t4(1.0,0.0,0.0,1.0); + outImage[coords] = pixelCol; + return; + } + + int flatIdx = glsl::gl_GlobalInvocationID().y * glsl::gl_NumWorkGroups().x * WorkgroupSize + glsl::gl_GlobalInvocationID().x; + + // set up path tracer + ext::PathTracer::PathTracerCreationParams ptCreateParams; + ptCreateParams.rngState = scramblebuf[coords].rg; + + uint2 scrambleDim; + scramblebuf.GetDimensions(scrambleDim.x, scrambleDim.y); + ptCreateParams.pixOffsetParam = 
(float2)1.0 / float2(scrambleDim); + + float4 NDC = float4(texCoord * float2(2.0, -2.0) + float2(-1.0, 1.0), 0.0, 1.0); { - virtualThreadIndex = virtualThreadBase + glsl::gl_LocalInvocationIndex().x; - const int32_t2 coords = (int32_t2)math::Morton::decode2d(virtualThreadIndex); // getCoordinates(); - float32_t2 texCoord = float32_t2(coords) / float32_t2(width, height); - texCoord.y = 1.0 - texCoord.y; - - if (false == (hlsl::all((int32_t2)0 < coords)) && hlsl::all(int32_t2(width, height) < coords)) { - continue; - } - - if (((pc.depth - 1) >> MAX_DEPTH_LOG2) > 0 || ((pc.sampleCount - 1) >> MAX_SAMPLES_LOG2) > 0) - { - float32_t4 pixelCol = float32_t4(1.0,0.0,0.0,1.0); - outImage[coords] = pixelCol; - continue; - } - - int flatIdx = glsl::gl_GlobalInvocationID().y * glsl::gl_NumWorkGroups().x * WorkgroupSize + glsl::gl_GlobalInvocationID().x; - - // set up path tracer - ext::PathTracer::PathTracerCreationParams ptCreateParams; - ptCreateParams.rngState = scramblebuf[coords].rg; - - uint2 scrambleDim; - scramblebuf.GetDimensions(scrambleDim.x, scrambleDim.y); - ptCreateParams.pixOffsetParam = (float2)1.0 / float2(scrambleDim); - - float4 NDC = float4(texCoord * float2(2.0, -2.0) + float2(-1.0, 1.0), 0.0, 1.0); - { - float4 tmp = hlsl::mul(pc.invMVP, NDC); - ptCreateParams.camPos = tmp.xyz / tmp.w; - NDC.z = 1.0; - } - - ptCreateParams.NDC = NDC; - ptCreateParams.invMVP = pc.invMVP; - - ptCreateParams.diffuseParams = bxdfs[0].params; - ptCreateParams.conductorParams = bxdfs[3].params; - ptCreateParams.dielectricParams = bxdfs[6].params; - - pathtracer_type pathtracer = pathtracer_type::create(ptCreateParams); - - float32_t3 color = pathtracer.getMeasure(pc.sampleCount, pc.depth, scene); - float32_t4 pixCol = float32_t4(color, 1.0); - outImage[coords] = pixCol; + float4 tmp = mul(pc.invMVP, NDC); + ptCreateParams.camPos = tmp.xyz / tmp.w; + NDC.z = 1.0; } + + ptCreateParams.NDC = NDC; + ptCreateParams.invMVP = pc.invMVP; + + ptCreateParams.diffuseParams = 
bxdfs[0].params; + ptCreateParams.conductorParams = bxdfs[3].params; + ptCreateParams.dielectricParams = bxdfs[6].params; + + pathtracer_type pathtracer = pathtracer_type::create(ptCreateParams); + + float32_t3 color = pathtracer.getMeasure(pc.sampleCount, pc.depth, scene); + float32_t4 pixCol = float32_t4(color, 1.0); + outImage[coords] = pixCol; } diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp index db1e198c5..8394889db 100644 --- a/31_HLSLPathTracer/main.cpp +++ b/31_HLSLPathTracer/main.cpp @@ -1068,8 +1068,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &m_descriptorSet0.get()); cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 2u, 1u, &m_descriptorSet2.get()); cmdbuf->pushConstants(pipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(PTPushConstant), &pc); - uint32_t dispatchSize = m_physicalDevice->getLimits().computeOptimalPersistentWorkgroupDispatchSize(WindowDimensions.x * WindowDimensions.y, DefaultWorkGroupSize); - cmdbuf->dispatch(dispatchSize, 1u, 1u); + cmdbuf->dispatch(1 + (WindowDimensions.x * WindowDimensions.y - 1) / DefaultWorkGroupSize, 1u, 1u); } // TRANSITION m_outImgView to READ (because of descriptorSets0 -> ComputeShader Writes into the image) From 78de4f546a100b78ce6998f4cd49099b604176fa Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 25 Mar 2025 15:45:07 +0700 Subject: [PATCH 098/296] fixed some bugs for cpp compat --- 31_HLSLPathTracer/app_resources/hlsl/common.hlsl | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl index 2e2561345..31bcca26a 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/common.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/common.hlsl @@ -10,6 +10,8 @@ #include #include #include +#include +#include namespace nbl { @@ -121,7 
+123,7 @@ struct BxDFNode retval.albedo = albedo; retval.materialType = materialType; retval.params.is_aniso = isAniso; - retval.params.A = hlsl::max(A, 1e-4); + retval.params.A = hlsl::max(A, (float32_t2)1e-4); retval.params.ior0 = (spectral_type)1.0; retval.params.ior1 = (spectral_type)1.0; return retval; @@ -134,7 +136,7 @@ struct BxDFNode retval.albedo = (spectral_type)1.0; retval.materialType = materialType; retval.params.is_aniso = isAniso; - retval.params.A = hlsl::max(A, 1e-4); + retval.params.A = hlsl::max(A, (float32_t2)1e-4); retval.params.ior0 = ior0; retval.params.ior1 = ior1; return retval; @@ -218,7 +220,7 @@ struct Shape float32_t3 getNormal(NBL_CONST_REF_ARG(float32_t3) hitPosition) { - const float radiusRcp = spirv::inverseSqrt(radius2); + const float radiusRcp = hlsl::rsqrt(radius2); return (hitPosition - position) * radiusRcp; } From 8090a2d5afc1b33eb6259ef5d20e0402fce682c5 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 27 Mar 2025 15:26:02 +0700 Subject: [PATCH 099/296] initial benchmark example copy --- 71_ArithmeticBench/CMakeLists.txt | 25 + 71_ArithmeticBench/app_resources/common.hlsl | 96 ++++ .../app_resources/shaderCommon.hlsl | 55 +++ .../app_resources/testSubgroup.comp.hlsl | 18 + .../app_resources/testWorkgroup.comp.hlsl | 107 ++++ 71_ArithmeticBench/config.json.template | 28 ++ 71_ArithmeticBench/main.cpp | 462 ++++++++++++++++++ 71_ArithmeticBench/pipeline.groovy | 50 ++ CMakeLists.txt | 4 +- 9 files changed, 844 insertions(+), 1 deletion(-) create mode 100644 71_ArithmeticBench/CMakeLists.txt create mode 100644 71_ArithmeticBench/app_resources/common.hlsl create mode 100644 71_ArithmeticBench/app_resources/shaderCommon.hlsl create mode 100644 71_ArithmeticBench/app_resources/testSubgroup.comp.hlsl create mode 100644 71_ArithmeticBench/app_resources/testWorkgroup.comp.hlsl create mode 100644 71_ArithmeticBench/config.json.template create mode 100644 71_ArithmeticBench/main.cpp create mode 100644 
71_ArithmeticBench/pipeline.groovy diff --git a/71_ArithmeticBench/CMakeLists.txt b/71_ArithmeticBench/CMakeLists.txt new file mode 100644 index 000000000..0724366c9 --- /dev/null +++ b/71_ArithmeticBench/CMakeLists.txt @@ -0,0 +1,25 @@ + +include(common RESULT_VARIABLE RES) +if(NOT RES) + message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory") +endif() + +nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") + +if(NBL_EMBED_BUILTIN_RESOURCES) + set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData) + set(RESOURCE_DIR "app_resources") + + get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE) + + file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*") + foreach(RES_FILE ${BUILTIN_RESOURCE_FILES}) + LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}") + endforeach() + + ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}") + + LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_}) +endif() \ No newline at end of file diff --git a/71_ArithmeticBench/app_resources/common.hlsl b/71_ArithmeticBench/app_resources/common.hlsl new file mode 100644 index 000000000..10892a2b9 --- /dev/null +++ b/71_ArithmeticBench/app_resources/common.hlsl @@ -0,0 +1,96 @@ +#include "nbl/builtin/hlsl/cpp_compat.hlsl" +#include "nbl/builtin/hlsl/functional.hlsl" + +template +struct Output +{ + NBL_CONSTEXPR_STATIC_INLINE uint32_t ScanElementCount = kScanElementCount; + + uint32_t subgroupSize; + uint32_t data[ScanElementCount]; +}; + +// 
Thanks to our unified HLSL/C++ STD lib we're able to remove a whole load of code +template +struct bit_and : nbl::hlsl::bit_and +{ + using base_t = nbl::hlsl::bit_and; + + NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 0; +#ifndef __HLSL_VERSION + static inline constexpr const char* name = "bit_and"; +#endif +}; +template +struct bit_or : nbl::hlsl::bit_or +{ + using base_t = nbl::hlsl::bit_or; + + NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 1; +#ifndef __HLSL_VERSION + static inline constexpr const char* name = "bit_xor"; +#endif +}; +template +struct bit_xor : nbl::hlsl::bit_xor +{ + using base_t = nbl::hlsl::bit_xor; + + NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 2; +#ifndef __HLSL_VERSION + static inline constexpr const char* name = "bit_or"; +#endif +}; +template +struct plus : nbl::hlsl::plus +{ + using base_t = nbl::hlsl::plus; + + NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 3; +#ifndef __HLSL_VERSION + static inline constexpr const char* name = "plus"; +#endif +}; +template +struct multiplies : nbl::hlsl::multiplies +{ + using base_t = nbl::hlsl::multiplies; + + NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 4; +#ifndef __HLSL_VERSION + static inline constexpr const char* name = "multiplies"; +#endif +}; +template +struct minimum : nbl::hlsl::minimum +{ + using base_t = nbl::hlsl::minimum; + + NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 5; +#ifndef __HLSL_VERSION + static inline constexpr const char* name = "minimum"; +#endif +}; +template +struct maximum : nbl::hlsl::maximum +{ + using base_t = nbl::hlsl::maximum; + + NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 6; +#ifndef __HLSL_VERSION + static inline constexpr const char* name = "maximum"; +#endif +}; + +template +struct ballot : nbl::hlsl::plus +{ + using base_t = nbl::hlsl::plus; + + NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 7; +#ifndef __HLSL_VERSION + static inline constexpr const char* name = "bitcount"; +#endif +}; + +#include 
"nbl/builtin/hlsl/subgroup/basic.hlsl" \ No newline at end of file diff --git a/71_ArithmeticBench/app_resources/shaderCommon.hlsl b/71_ArithmeticBench/app_resources/shaderCommon.hlsl new file mode 100644 index 000000000..13ee8d21e --- /dev/null +++ b/71_ArithmeticBench/app_resources/shaderCommon.hlsl @@ -0,0 +1,55 @@ +#include "common.hlsl" + +#include "nbl/builtin/hlsl/glsl_compat/core.hlsl" +#include "nbl/builtin/hlsl/subgroup/basic.hlsl" +#include "nbl/builtin/hlsl/subgroup/arithmetic_portability.hlsl" + +#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl" + +// https://github.com/microsoft/DirectXShaderCompiler/issues/6144 +uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);} + +// unfortunately DXC chokes on descriptors as static members +// https://github.com/microsoft/DirectXShaderCompiler/issues/5940 +[[vk::binding(0, 0)]] StructuredBuffer inputValue; +[[vk::binding(1, 0)]] RWByteAddressBuffer output[8]; + +// because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way +uint32_t globalIndex(); +// since we test ITEMS_PER_WG class binop> +static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) +{ + if (globalIndex()==0u) + output[binop::BindingIndex].template Store(0,nbl::hlsl::glsl::gl_SubgroupSize()); + + operation_t::base_t,nbl::hlsl::jit::device_capabilities> func; + if (canStore()) + output[binop::BindingIndex].template Store(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func(sourceVal)); +} + + +type_t test() +{ + const type_t sourceVal = inputValue[globalIndex()]; + + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + return sourceVal; +} + +#include "nbl/builtin/hlsl/workgroup/basic.hlsl" \ No newline at end of file diff --git a/71_ArithmeticBench/app_resources/testSubgroup.comp.hlsl b/71_ArithmeticBench/app_resources/testSubgroup.comp.hlsl new file mode 
100644 index 000000000..479265d73 --- /dev/null +++ b/71_ArithmeticBench/app_resources/testSubgroup.comp.hlsl @@ -0,0 +1,18 @@ +#pragma shader_stage(compute) + +#define operation_t nbl::hlsl::OPERATION + +#include "shaderCommon.hlsl" + +uint32_t globalIndex() +{ + return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex(); +} + +bool canStore() {return true;} + +[numthreads(WORKGROUP_SIZE,1,1)] +void main() +{ + test(); +} \ No newline at end of file diff --git a/71_ArithmeticBench/app_resources/testWorkgroup.comp.hlsl b/71_ArithmeticBench/app_resources/testWorkgroup.comp.hlsl new file mode 100644 index 000000000..9bafae47f --- /dev/null +++ b/71_ArithmeticBench/app_resources/testWorkgroup.comp.hlsl @@ -0,0 +1,107 @@ +#pragma shader_stage(compute) + + +#include "nbl/builtin/hlsl/workgroup/scratch_size.hlsl" + +static const uint32_t ArithmeticSz = nbl::hlsl::workgroup::scratch_size_arithmetic::value; +static const uint32_t BallotSz = nbl::hlsl::workgroup::scratch_size_ballot::value; +static const uint32_t ScratchSz = ArithmeticSz+BallotSz; + +// TODO: Can we make it a static variable in the ScratchProxy struct? 
+groupshared uint32_t scratch[ScratchSz]; + + +#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl" + + +template +struct ScratchProxy +{ + void get(const uint32_t ix, NBL_REF_ARG(uint32_t) value) + { + value = scratch[ix+offset]; + } + void set(const uint32_t ix, const uint32_t value) + { + scratch[ix+offset] = value; + } + + uint32_t atomicOr(const uint32_t ix, const uint32_t value) + { + return nbl::hlsl::glsl::atomicOr(scratch[ix],value); + } + + void workgroupExecutionAndMemoryBarrier() + { + nbl::hlsl::glsl::barrier(); + //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above + } +}; + +static ScratchProxy<0> arithmeticAccessor; + + +#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl" + + +template +struct operation_t +{ + using type_t = typename Binop::type_t; + + type_t operator()(type_t value) + { + type_t retval = nbl::hlsl::OPERATION::template __call >(value,arithmeticAccessor); + // we barrier before because we alias the accessors for Binop + arithmeticAccessor.workgroupExecutionAndMemoryBarrier(); + return retval; + } +}; + + +#include "shaderCommon.hlsl" + +static ScratchProxy ballotAccessor; + + +uint32_t globalIndex() +{ + return nbl::hlsl::glsl::gl_WorkGroupID().x*ITEMS_PER_WG+nbl::hlsl::workgroup::SubgroupContiguousIndex(); +} + +bool canStore() +{ + return nbl::hlsl::workgroup::SubgroupContiguousIndex()::BindingIndex].template Store(0,nbl::hlsl::glsl::gl_SubgroupSize()); + + // we can only ballot booleans, so low bit + nbl::hlsl::workgroup::ballot >(bool(sourceVal & 0x1u), ballotAccessor); + // need to barrier between ballot and usages of a ballot by myself + ballotAccessor.workgroupExecutionAndMemoryBarrier(); + + uint32_t destVal = 0xdeadbeefu; +#define CONSTEXPR_OP_TYPE_TEST(IS_OP) nbl::hlsl::is_same,0x45>,nbl::hlsl::workgroup::IS_OP,0x45> >::value +#define BALLOT_TEMPLATE_ARGS ITEMS_PER_WG,decltype(ballotAccessor),decltype(arithmeticAccessor),nbl::hlsl::jit::device_capabilities + if (CONSTEXPR_OP_TYPE_TEST(reduction)) + destVal = 
nbl::hlsl::workgroup::ballotBitCount(ballotAccessor,arithmeticAccessor); + else if (CONSTEXPR_OP_TYPE_TEST(inclusive_scan)) + destVal = nbl::hlsl::workgroup::ballotInclusiveBitCount(ballotAccessor,arithmeticAccessor); + else if (CONSTEXPR_OP_TYPE_TEST(exclusive_scan)) + destVal = nbl::hlsl::workgroup::ballotExclusiveBitCount(ballotAccessor,arithmeticAccessor); + else + { + assert(false); + } +#undef BALLOT_TEMPLATE_ARGS +#undef CONSTEXPR_OP_TYPE_TEST + + if (canStore()) + output[ballot::BindingIndex].template Store(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),destVal); +} \ No newline at end of file diff --git a/71_ArithmeticBench/config.json.template b/71_ArithmeticBench/config.json.template new file mode 100644 index 000000000..f961745c1 --- /dev/null +++ b/71_ArithmeticBench/config.json.template @@ -0,0 +1,28 @@ +{ + "enableParallelBuild": true, + "threadsPerBuildProcess" : 2, + "isExecuted": false, + "scriptPath": "", + "cmake": { + "configurations": [ "Release", "Debug", "RelWithDebInfo" ], + "buildModes": [], + "requiredOptions": [] + }, + "profiles": [ + { + "backend": "vulkan", + "platform": "windows", + "buildModes": [], + "runConfiguration": "Release", + "gpuArchitectures": [] + } + ], + "dependencies": [], + "data": [ + { + "dependencies": [], + "command": [""], + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/71_ArithmeticBench/main.cpp b/71_ArithmeticBench/main.cpp new file mode 100644 index 000000000..0952d2b57 --- /dev/null +++ b/71_ArithmeticBench/main.cpp @@ -0,0 +1,462 @@ +#include "nbl/application_templates/BasicMultiQueueApplication.hpp" +#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" +#include "app_resources/common.hlsl" + +using namespace nbl; +using namespace core; +using namespace asset; +using namespace system; +using namespace video; + +// method emulations on the CPU, to verify the results of the GPU methods +template +struct emulatedReduction +{ + using type_t = typename 
Binop::type_t; + + static inline void impl(type_t* out, const type_t* in, const uint32_t itemCount) + { + const type_t red = std::reduce(in,in+itemCount,Binop::identity,Binop()); + std::fill(out,out+itemCount,red); + } + + static inline constexpr const char* name = "reduction"; +}; +template +struct emulatedScanInclusive +{ + using type_t = typename Binop::type_t; + + static inline void impl(type_t* out, const type_t* in, const uint32_t itemCount) + { + std::inclusive_scan(in,in+itemCount,out,Binop()); + } + static inline constexpr const char* name = "inclusive_scan"; +}; +template +struct emulatedScanExclusive +{ + using type_t = typename Binop::type_t; + + static inline void impl(type_t* out, const type_t* in, const uint32_t itemCount) + { + std::exclusive_scan(in,in+itemCount,out,Binop::identity,Binop()); + } + static inline constexpr const char* name = "exclusive_scan"; +}; + +class ArithmeticBenchApp final : public application_templates::BasicMultiQueueApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication +{ + using device_base_t = application_templates::BasicMultiQueueApplication; + using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; + +public: + ArithmeticBenchApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : + system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} + + bool onAppInitialized(smart_refctd_ptr&& system) override + { + if (!device_base_t::onAppInitialized(std::move(system))) + return false; + if (!asset_base_t::onAppInitialized(std::move(system))) + return false; + + transferDownQueue = getTransferDownQueue(); + computeQueue = getComputeQueue(); + + // TODO: get the element count from argv + const uint32_t elementCount = Output<>::ScanElementCount; + // populate our random data buffer on the CPU and create a GPU copy + inputData = new 
uint32_t[elementCount]; + smart_refctd_ptr gpuinputDataBuffer; + { + std::mt19937 randGenerator(0xdeadbeefu); + for (uint32_t i = 0u; i < elementCount; i++) + inputData[i] = randGenerator(); // TODO: change to using xoroshiro, then we can skip having the input buffer at all + + IGPUBuffer::SCreationParams inputDataBufferCreationParams = {}; + inputDataBufferCreationParams.size = sizeof(Output<>::data[0]) * elementCount; + inputDataBufferCreationParams.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT; + m_utils->createFilledDeviceLocalBufferOnDedMem( + SIntendedSubmitInfo{.queue=getTransferUpQueue()}, + std::move(inputDataBufferCreationParams), + inputData + ).move_into(gpuinputDataBuffer); + } + + // create 8 buffers for 8 operations + for (auto i=0u; igetSize(); + params.usage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_SRC_BIT; + + outputBuffers[i] = m_device->createBuffer(std::move(params)); + auto mreq = outputBuffers[i]->getMemoryReqs(); + mreq.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits(); + assert(mreq.memoryTypeBits); + + auto bufferMem = m_device->allocate(mreq, outputBuffers[i].get()); + assert(bufferMem.isValid()); + } + + // create Descriptor Set and Pipeline Layout + { + // create Descriptor Set Layout + smart_refctd_ptr dsLayout; + { + IGPUDescriptorSetLayout::SBinding binding[2]; + for (uint32_t i = 0u; i < 2; i++) + binding[i] = {{},i,IDescriptor::E_TYPE::ET_STORAGE_BUFFER,IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,IShader::E_SHADER_STAGE::ESS_COMPUTE,1u,nullptr }; + binding[1].count = OutputBufferCount; + dsLayout = m_device->createDescriptorSetLayout(binding); + } + + // set and transient pool + auto descPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE,{&dsLayout.get(),1}); + descriptorSet = descPool->createDescriptorSet(smart_refctd_ptr(dsLayout)); + { + IGPUDescriptorSet::SDescriptorInfo infos[1+OutputBufferCount]; + 
infos[0].desc = gpuinputDataBuffer; + infos[0].info.buffer = { 0u,gpuinputDataBuffer->getSize() }; + for (uint32_t i = 1u; i <= OutputBufferCount; i++) + { + auto buff = outputBuffers[i - 1]; + infos[i].info.buffer = { 0u,buff->getSize() }; + infos[i].desc = std::move(buff); // save an atomic in the refcount + + } + + IGPUDescriptorSet::SWriteDescriptorSet writes[2]; + for (uint32_t i=0u; i<2; i++) + writes[i] = {descriptorSet.get(),i,0u,1u,infos+i}; + writes[1].count = OutputBufferCount; + + m_device->updateDescriptorSets(2, writes, 0u, nullptr); + } + + pipelineLayout = m_device->createPipelineLayout({},std::move(dsLayout)); + } + + const auto spirv_isa_cache_path = localOutputCWD/"spirv_isa_cache.bin"; + // enclose to make sure file goes out of scope and we can reopen it + { + smart_refctd_ptr spirv_isa_cache_input; + // try to load SPIR-V to ISA cache + { + ISystem::future_t> fileCreate; + m_system->createFile(fileCreate,spirv_isa_cache_path,IFile::ECF_READ|IFile::ECF_MAPPABLE|IFile::ECF_COHERENT); + if (auto lock=fileCreate.acquire()) + spirv_isa_cache_input = *lock; + } + // create the cache + { + std::span spirv_isa_cache_data = {}; + if (spirv_isa_cache_input) + spirv_isa_cache_data = {reinterpret_cast(spirv_isa_cache_input->getMappedPointer()),spirv_isa_cache_input->getSize()}; + else + m_logger->log("Failed to load SPIR-V 2 ISA cache!",ILogger::ELL_PERFORMANCE); + // Normally we'd deserialize a `ICPUPipelineCache` properly and pass that instead + m_spirv_isa_cache = m_device->createPipelineCache(spirv_isa_cache_data); + } + } + { + // TODO: rename `deleteDirectory` to just `delete`? and a `IFile::setSize()` ? + m_system->deleteDirectory(spirv_isa_cache_path); + ISystem::future_t> fileCreate; + m_system->createFile(fileCreate,spirv_isa_cache_path,IFile::ECF_WRITE); + // I can be relatively sure I'll succeed to acquire the future, the pointer to created file might be null though. 
+ m_spirv_isa_cache_output=*fileCreate.acquire(); + if (!m_spirv_isa_cache_output) + logFail("Failed to Create SPIR-V to ISA cache file."); + } + + // load shader source from file + auto getShaderSource = [&](const char* filePath) -> auto + { + IAssetLoader::SAssetLoadParams lparams = {}; + lparams.logger = m_logger.get(); + lparams.workingDirectory = ""; + auto bundle = m_assetMgr->getAsset(filePath, lparams); + if (bundle.getContents().empty() || bundle.getAssetType()!=IAsset::ET_SHADER) + { + m_logger->log("Shader %s not found!", ILogger::ELL_ERROR, filePath); + exit(-1); + } + auto firstAssetInBundle = bundle.getContents()[0]; + return smart_refctd_ptr_static_cast(firstAssetInBundle); + }; + + auto subgroupTestSource = getShaderSource("app_resources/testSubgroup.comp.hlsl"); + auto workgroupTestSource = getShaderSource("app_resources/testWorkgroup.comp.hlsl"); + // now create or retrieve final resources to run our tests + sema = m_device->createSemaphore(timelineValue); + resultsBuffer = ICPUBuffer::create({ outputBuffers[0]->getSize() }); + { + smart_refctd_ptr cmdpool = m_device->createCommandPool(computeQueue->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{&cmdbuf,1})) + { + logFail("Failed to create Command Buffers!\n"); + return false; + } + } + + const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations; + const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize; + const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize; + for (auto subgroupSize=MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u) + { + const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize); + for (uint32_t workgroupSize = subgroupSize; workgroupSize <= MaxWorkgroupSize; workgroupSize += subgroupSize) + { + // make sure renderdoc captures everything for debugging + m_api->startCapture(); + 
m_logger->log("Testing Workgroup Size %u with Subgroup Size %u", ILogger::ELL_INFO, workgroupSize, subgroupSize); + + bool passed = true; + // TODO async the testing + passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed; + logTestOutcome(passed, workgroupSize); + passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed; + logTestOutcome(passed, workgroupSize); + passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed; + logTestOutcome(passed, workgroupSize); + for (uint32_t itemsPerWG = workgroupSize; itemsPerWG > workgroupSize - subgroupSize; itemsPerWG--) + { + m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG); + passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; + logTestOutcome(passed, itemsPerWG); + passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; + logTestOutcome(passed, itemsPerWG); + passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; + logTestOutcome(passed, itemsPerWG); + } + m_api->endCapture(); + + // save cache every now and then + { + auto cpu = m_spirv_isa_cache->convertToCPUCache(); + // Normally we'd beautifully JSON serialize the thing, allow multiple devices & drivers + metadata + auto bin = cpu->getEntries().begin()->second.bin; + IFile::success_t success; + m_spirv_isa_cache_output->write(success,bin->data(),0ull,bin->size()); + if (!success) + logFail("Could not write Create SPIR-V to ISA cache to disk!"); + } + } + } + + return true; + } + + virtual bool onAppTerminated() override + { + m_logger->log("==========Result==========", ILogger::ELL_INFO); + m_logger->log("Fail Count: %u", ILogger::ELL_INFO, totalFailCount); + delete[] inputData; + return true; + } + + // the unit test is carried out on init + void workLoopBody() override {} + + 
// + bool keepRunning() override { return false; } + +private: + void logTestOutcome(bool passed, uint32_t workgroupSize) + { + if (passed) + m_logger->log("Passed test #%u", ILogger::ELL_INFO, workgroupSize); + else + { + totalFailCount++; + m_logger->log("Failed test #%u", ILogger::ELL_ERROR, workgroupSize); + } + } + + // create pipeline (specialized every test) [TODO: turn into a future/async] + smart_refctd_ptr createPipeline(const ICPUShader* overridenUnspecialized, const uint8_t subgroupSizeLog2) + { + auto shader = m_device->createShader(overridenUnspecialized); + IGPUComputePipeline::SCreationParams params = {}; + params.layout = pipelineLayout.get(); + params.shader = { + .entryPoint = "main", + .shader = shader.get(), + .entries = nullptr, + .requiredSubgroupSize = static_cast(subgroupSizeLog2), + .requireFullSubgroups = true + }; + core::smart_refctd_ptr pipeline; + if (!m_device->createComputePipelines(m_spirv_isa_cache.get(),{¶ms,1},&pipeline)) + return nullptr; + return pipeline; + } + + /*template class Arithmetic, bool WorkgroupTest> + bool runTest(const smart_refctd_ptr& source, const uint32_t elementCount, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u) + { + return true; + }*/ + + template class Arithmetic, bool WorkgroupTest> + bool runTest(const smart_refctd_ptr& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u) + { + std::string arith_name = Arithmetic>::name; + + smart_refctd_ptr overridenUnspecialized; + if constexpr (WorkgroupTest) + { + overridenUnspecialized = CHLSLCompiler::createOverridenCopy( + source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n#define ITEMS_PER_WG %d\n", + (("workgroup::") + arith_name).c_str(), workgroupSize, itemsPerWG + ); + } + else + { + itemsPerWG = workgroupSize; + overridenUnspecialized = CHLSLCompiler::createOverridenCopy( + source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n", + (("subgroup::") + 
arith_name).c_str(), workgroupSize + ); + } + auto pipeline = createPipeline(overridenUnspecialized.get(),subgroupSizeLog2); + + // TODO: overlap dispatches with memory readbacks (requires multiple copies of `buffers`) + const uint32_t workgroupCount = elementCount / itemsPerWG; + cmdbuf->begin(IGPUCommandBuffer::USAGE::NONE); + cmdbuf->bindComputePipeline(pipeline.get()); + cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &descriptorSet.get()); + cmdbuf->dispatch(workgroupCount, 1, 1); + { + IGPUCommandBuffer::SPipelineBarrierDependencyInfo::buffer_barrier_t memoryBarrier[OutputBufferCount]; + for (auto i=0u; igetSize(),outputBuffers[i]} + }; + } + IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = {.memBarriers={},.bufBarriers=memoryBarrier}; + cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,info); + } + cmdbuf->end(); + + const IQueue::SSubmitInfo::SSemaphoreInfo signal[1] = {{.semaphore=sema.get(),.value=++timelineValue}}; + const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[1] = {{.cmdbuf=cmdbuf.get()}}; + const IQueue::SSubmitInfo submits[1] = {{.commandBuffers=cmdbufs,.signalSemaphores=signal}}; + computeQueue->submit(submits); + const ISemaphore::SWaitInfo wait[1] = {{.semaphore=sema.get(),.value=timelineValue}}; + m_device->blockForSemaphores(wait); + + // check results + bool passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount); + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; + if constexpr (WorkgroupTest) + passed = validateResults, WorkgroupTest>(itemsPerWG, 
workgroupCount) && passed; + + return passed; + } + + //returns true if result matches + template class Arithmetic, class Binop, bool WorkgroupTest> + bool validateResults(const uint32_t itemsPerWG, const uint32_t workgroupCount) + { + bool success = true; + + // download data + const SBufferRange bufferRange = {0u, resultsBuffer->getSize(), outputBuffers[Binop::BindingIndex]}; + m_utils->downloadBufferRangeViaStagingBufferAutoSubmit(SIntendedSubmitInfo{.queue=transferDownQueue},bufferRange,resultsBuffer->getPointer()); + + using type_t = typename Binop::type_t; + const auto dataFromBuffer = reinterpret_cast(resultsBuffer->getPointer()); + const auto subgroupSize = dataFromBuffer[0]; + if (subgroupSizenbl::hlsl::subgroup::MaxSubgroupSize) + { + m_logger->log("Unexpected Subgroup Size %u", ILogger::ELL_ERROR, subgroupSize); + return false; + } + + const auto testData = reinterpret_cast(dataFromBuffer + 1); + // TODO: parallel for (the temporary values need to be threadlocal or what?) + // now check if the data obtained has valid values + type_t* tmp = new type_t[itemsPerWG]; + type_t* ballotInput = new type_t[itemsPerWG]; + for (uint32_t workgroupID = 0u; success && workgroupID < workgroupCount; workgroupID++) + { + const auto workgroupOffset = workgroupID * itemsPerWG; + + if constexpr (WorkgroupTest) + { + if constexpr (std::is_same_v, Binop>) + { + for (auto i = 0u; i < itemsPerWG; i++) + ballotInput[i] = inputData[i + workgroupOffset] & 0x1u; + Arithmetic::impl(tmp, ballotInput, itemsPerWG); + } + else + Arithmetic::impl(tmp, inputData + workgroupOffset, itemsPerWG); + } + else + { + for (uint32_t pseudoSubgroupID = 0u; pseudoSubgroupID < itemsPerWG; pseudoSubgroupID += subgroupSize) + Arithmetic::impl(tmp + pseudoSubgroupID, inputData + workgroupOffset + pseudoSubgroupID, subgroupSize); + } + + for (uint32_t localInvocationIndex = 0u; localInvocationIndex < itemsPerWG; localInvocationIndex++) + { + const auto globalInvocationIndex = workgroupOffset + 
localInvocationIndex; + const auto cpuVal = tmp[localInvocationIndex]; + const auto gpuVal = testData[globalInvocationIndex]; + if (cpuVal != gpuVal) + { + m_logger->log( + "Failed test #%d (%s) (%s) Expected %u got %u for workgroup %d and localinvoc %d", + ILogger::ELL_ERROR, itemsPerWG, WorkgroupTest ? "workgroup" : "subgroup", Binop::name, + cpuVal, gpuVal, workgroupID, localInvocationIndex + ); + success = false; + break; + } + } + } + delete[] ballotInput; + delete[] tmp; + + return success; + } + + IQueue* transferDownQueue; + IQueue* computeQueue; + smart_refctd_ptr m_spirv_isa_cache; + smart_refctd_ptr m_spirv_isa_cache_output; + + uint32_t* inputData = nullptr; + constexpr static inline uint32_t OutputBufferCount = 8u; + smart_refctd_ptr outputBuffers[OutputBufferCount]; + smart_refctd_ptr descriptorSet; + smart_refctd_ptr pipelineLayout; + + smart_refctd_ptr sema; + uint64_t timelineValue = 0; + smart_refctd_ptr cmdbuf; + smart_refctd_ptr resultsBuffer; + + uint32_t totalFailCount = 0; +}; + +NBL_MAIN_FUNC(ArithmeticBenchApp) \ No newline at end of file diff --git a/71_ArithmeticBench/pipeline.groovy b/71_ArithmeticBench/pipeline.groovy new file mode 100644 index 000000000..7ea9947e0 --- /dev/null +++ b/71_ArithmeticBench/pipeline.groovy @@ -0,0 +1,50 @@ +import org.DevshGraphicsProgramming.Agent +import org.DevshGraphicsProgramming.BuilderInfo +import org.DevshGraphicsProgramming.IBuilder + +class CArithemticUnitTestBuilder extends IBuilder +{ + public CArithemticUnitTestBuilder(Agent _agent, _info) + { + super(_agent, _info) + } + + @Override + public boolean prepare(Map axisMapping) + { + return true + } + + @Override + public boolean build(Map axisMapping) + { + IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION") + IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE") + + def nameOfBuildDirectory = getNameOfBuildDirectory(buildType) + def nameOfConfig = getNameOfConfig(config) + + agent.execute("cmake --build 
${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v") + + return true + } + + @Override + public boolean test(Map axisMapping) + { + return true + } + + @Override + public boolean install(Map axisMapping) + { + return true + } +} + +def create(Agent _agent, _info) +{ + return new CArithemticUnitTestBuilder(_agent, _info) +} + +return this \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index fb03f95a4..4434eacc1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -95,7 +95,9 @@ if(NBL_BUILD_EXAMPLES) add_subdirectory(67_RayQueryGeometry EXCLUDE_FROM_ALL) add_subdirectory(68_JpegLoading EXCLUDE_FROM_ALL) - add_subdirectory(70_FLIPFluids EXCLUDE_FROM_ALL) + add_subdirectory(70_FLIPFluids EXCLUDE_FROM_ALL) + + add_subdirectory(71_ArithmeticBench EXCLUDE_FROM_ALL) NBL_HOOK_COMMON_API("${NBL_COMMON_API_TARGETS}") endif() From f8237715997821ec0bf5f7fa2fed92dbabe56e52 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 28 Mar 2025 10:46:10 +0700 Subject: [PATCH 100/296] use dropdown, more options --- 31_HLSLPathTracer/main.cpp | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp index 8394889db..706a0f713 100644 --- a/31_HLSLPathTracer/main.cpp +++ b/31_HLSLPathTracer/main.cpp @@ -41,7 +41,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, { ERM_GLSL, ERM_HLSL, - ERM_CHECKERED, + // ERM_CHECKERED, ERM_COUNT }; @@ -68,6 +68,11 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, "ELG_RECTANGLE" }; + const char* shaderTypes[E_RENDER_MODE::ERM_COUNT] = { + "ERM_GLSL", + "ERM_HLSL" + }; + public: inline HLSLComputePathtracer(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : IApplicationFramework(_localInputCWD, _localOutputCWD, 
_sharedInputCWD, _sharedOutputCWD) {} @@ -935,7 +940,8 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, ImGui::SliderFloat("Fov", &fov, 20.f, 150.f); ImGui::SliderFloat("zNear", &zNear, 0.1f, 100.f); ImGui::SliderFloat("zFar", &zFar, 110.f, 10000.f); - ImGui::ListBox("Shader", &PTPipline, shaderNames, E_LIGHT_GEOMETRY::ELG_COUNT); + ImGui::Combo("Shader", &PTPipeline, shaderNames, E_LIGHT_GEOMETRY::ELG_COUNT); + ImGui::Combo("Render Mode", &renderMode, shaderTypes, E_RENDER_MODE::ERM_COUNT); ImGui::SliderInt("SPP", &spp, 1, MaxBufferSamples); ImGui::SliderInt("Depth", &depth, 1, MaxBufferDimensions / 3); @@ -1063,7 +1069,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, // cube envmap handle { - auto pipeline = renderMode == E_RENDER_MODE::ERM_HLSL ? m_PTHLSLPipelines[PTPipline].get() : m_PTGLSLPipelines[PTPipline].get(); + auto pipeline = renderMode == E_RENDER_MODE::ERM_HLSL ? m_PTHLSLPipelines[PTPipeline].get() : m_PTGLSLPipelines[PTPipeline].get(); cmdbuf->bindComputePipeline(pipeline); cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &m_descriptorSet0.get()); cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 2u, 1u, &m_descriptorSet2.get()); @@ -1347,7 +1353,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, float viewWidth = 10.f; float camYAngle = 165.f / 180.f * 3.14159f; float camXAngle = 32.f / 180.f * 3.14159f; - int PTPipline = E_LIGHT_GEOMETRY::ELG_SPHERE; + int PTPipeline = E_LIGHT_GEOMETRY::ELG_SPHERE; int renderMode = E_RENDER_MODE::ERM_HLSL; int spp = 32; int depth = 3; From 1535561525c1df59d227969692ae7405b507962b Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 28 Mar 2025 12:45:14 +0700 Subject: [PATCH 101/296] added persistent workgroup toggle --- .../app_resources/glsl/common.glsl | 25 ++++++ .../app_resources/hlsl/render.comp.hlsl | 24 ++++++ 31_HLSLPathTracer/main.cpp | 82 +++++++++++++++++-- 3 files 
changed, 123 insertions(+), 8 deletions(-) diff --git a/31_HLSLPathTracer/app_resources/glsl/common.glsl b/31_HLSLPathTracer/app_resources/glsl/common.glsl index 6c2b5f42f..6b6e96710 100644 --- a/31_HLSLPathTracer/app_resources/glsl/common.glsl +++ b/31_HLSLPathTracer/app_resources/glsl/common.glsl @@ -35,6 +35,9 @@ vec2 getTexCoords() { #include #include #include +#ifdef PERSISTENT_WORKGROUPS +#include +#endif #include @@ -688,19 +691,37 @@ bool closestHitProgram(in uint depth, in uint _sample, inout Ray_t ray, inout nb void main() { const ivec2 imageExtents = imageSize(outImage); + +#ifdef PERSISTENT_WORKGROUPS + uint virtualThreadIndex; + for (uint virtualThreadBase = gl_WorkGroupID.x * _NBL_GLSL_WORKGROUP_SIZE_; virtualThreadBase < 1920*1080; virtualThreadBase += gl_NumWorkGroups.x * _NBL_GLSL_WORKGROUP_SIZE_) // not sure why 1280*720 doesn't cover draw surface + { + virtualThreadIndex = virtualThreadBase + gl_LocalInvocationIndex.x; + const ivec2 coords = ivec2(nbl_glsl_morton_decode2d32b(virtualThreadIndex)); +#else const ivec2 coords = getCoordinates(); +#endif + vec2 texCoord = vec2(coords) / vec2(imageExtents); texCoord.y = 1.0 - texCoord.y; if (false == (all(lessThanEqual(ivec2(0),coords)) && all(greaterThan(imageExtents,coords)))) { +#ifdef PERSISTENT_WORKGROUPS + continue; +#else return; +#endif } if (((PTPushConstant.depth-1)>>MAX_DEPTH_LOG2)>0 || ((PTPushConstant.sampleCount-1)>>MAX_SAMPLES_LOG2)>0) { vec4 pixelCol = vec4(1.0,0.0,0.0,1.0); imageStore(outImage, coords, pixelCol); +#ifdef PERSISTENT_WORKGROUPS + continue; +#else return; +#endif } nbl_glsl_xoroshiro64star_state_t scramble_start_state = texelFetch(scramblebuf,coords,0).rg; @@ -791,6 +812,10 @@ void main() vec4 pixelCol = vec4(color, 1.0); imageStore(outImage, coords, pixelCol); + +#ifdef PERSISTENT_WORKGROUPS + } +#endif } /** TODO: Improving Rendering diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl index 
b187a1b33..81736f508 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl @@ -2,6 +2,9 @@ #include "nbl/builtin/hlsl/glsl_compat/core.hlsl" #include "nbl/builtin/hlsl/random/pcg.hlsl" #include "nbl/builtin/hlsl/random/xoroshiro.hlsl" +#ifdef PERSISTENT_WORKGROUPS +#include "nbl/builtin/hlsl/math/morton.hlsl" +#endif #include "nbl/builtin/hlsl/bxdf/reflection.hlsl" #include "nbl/builtin/hlsl/bxdf/transmission.hlsl" @@ -156,19 +159,36 @@ void main(uint32_t3 threadID : SV_DispatchThreadID) { uint32_t width, height; outImage.GetDimensions(width, height); +#ifdef PERSISTENT_WORKGROUPS + uint32_t virtualThreadIndex; + [loop] + for (uint32_t virtualThreadBase = glsl::gl_WorkGroupID().x * WorkgroupSize; virtualThreadBase < 1920*1080; virtualThreadBase += glsl::gl_NumWorkGroups().x * WorkgroupSize) // not sure why 1280*720 doesn't cover draw surface + { + virtualThreadIndex = virtualThreadBase + glsl::gl_LocalInvocationIndex().x; + const int32_t2 coords = (int32_t2)math::Morton::decode2d(virtualThreadIndex); +#else const int32_t2 coords = getCoordinates(); +#endif float32_t2 texCoord = float32_t2(coords) / float32_t2(width, height); texCoord.y = 1.0 - texCoord.y; if (false == (all((int32_t2)0 < coords)) && all(int32_t2(width, height) < coords)) { +#ifdef PERSISTENT_WORKGROUPS + continue; +#else return; +#endif } if (((pc.depth - 1) >> MAX_DEPTH_LOG2) > 0 || ((pc.sampleCount - 1) >> MAX_SAMPLES_LOG2) > 0) { float32_t4 pixelCol = float32_t4(1.0,0.0,0.0,1.0); outImage[coords] = pixelCol; +#ifdef PERSISTENT_WORKGROUPS + continue; +#else return; +#endif } int flatIdx = glsl::gl_GlobalInvocationID().y * glsl::gl_NumWorkGroups().x * WorkgroupSize + glsl::gl_GlobalInvocationID().x; @@ -200,4 +220,8 @@ void main(uint32_t3 threadID : SV_DispatchThreadID) float32_t3 color = pathtracer.getMeasure(pc.sampleCount, pc.depth, scene); float32_t4 pixCol = float32_t4(color, 1.0); outImage[coords] = pixCol; + +#ifdef 
PERSISTENT_WORKGROUPS + } +#endif } diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp index 706a0f713..0dc5fc053 100644 --- a/31_HLSLPathTracer/main.cpp +++ b/31_HLSLPathTracer/main.cpp @@ -323,7 +323,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, m_presentDescriptorSet = presentDSPool->createDescriptorSet(gpuPresentDescriptorSetLayout); // Create Shaders - auto loadAndCompileGLSLShader = [&](const std::string& pathToShader) -> smart_refctd_ptr + auto loadAndCompileGLSLShader = [&](const std::string& pathToShader, bool persistentWorkGroups = false) -> smart_refctd_ptr { IAssetLoader::SAssetLoadParams lp = {}; lp.workingDirectory = localInputCWD; @@ -339,6 +339,27 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, // The down-cast should not fail! assert(source); + auto compiler = make_smart_refctd_ptr(smart_refctd_ptr(m_system)); + CGLSLCompiler::SOptions options = {}; + options.stage = IShader::E_SHADER_STAGE::ESS_COMPUTE; // should be compute + options.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion; + options.spirvOptimizer = nullptr; +#ifndef _NBL_DEBUG + ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO; + auto opt = make_smart_refctd_ptr(std::span(&optPasses, 1)); + options.spirvOptimizer = opt.get(); +#endif + options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_LINE_BIT; + options.preprocessorOptions.sourceIdentifier = source->getFilepathHint(); + options.preprocessorOptions.logger = m_logger.get(); + options.preprocessorOptions.includeFinder = compiler->getDefaultIncludeFinder(); + + const IShaderCompiler::SMacroDefinition persistentDefine = { "PERSISTENT_WORKGROUPS", "1" }; + if (persistentWorkGroups) + options.preprocessorOptions.extraDefines = { &persistentDefine, &persistentDefine + 1 }; + + source = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options); + // 
this time we skip the use of the asset converter since the ICPUShader->IGPUShader path is quick and simple auto shader = m_device->createShader(source.get()); if (!shader) @@ -350,7 +371,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, return shader; }; - auto loadAndCompileHLSLShader = [&](const std::string& pathToShader, const std::string& defineMacro) -> smart_refctd_ptr + auto loadAndCompileHLSLShader = [&](const std::string& pathToShader, const std::string& defineMacro = "", bool persistentWorkGroups = false) -> smart_refctd_ptr { IAssetLoader::SAssetLoadParams lp = {}; lp.workingDirectory = localInputCWD; @@ -368,7 +389,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, auto compiler = make_smart_refctd_ptr(smart_refctd_ptr(m_system)); CHLSLCompiler::SOptions options = {}; - options.stage = IShader::E_SHADER_STAGE::ESS_COMPUTE; // should be compute + options.stage = IShader::E_SHADER_STAGE::ESS_COMPUTE; options.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion; options.spirvOptimizer = nullptr; #ifndef _NBL_DEBUG @@ -381,8 +402,11 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, options.preprocessorOptions.logger = m_logger.get(); options.preprocessorOptions.includeFinder = compiler->getDefaultIncludeFinder(); - const IShaderCompiler::SMacroDefinition variantDefine = { defineMacro, "" }; - options.preprocessorOptions.extraDefines = { &variantDefine, &variantDefine + 1 }; + const IShaderCompiler::SMacroDefinition defines[2] = { {defineMacro, ""}, { "PERSISTENT_WORKGROUPS", "1" } }; + if (!defineMacro.empty() && persistentWorkGroups) + options.preprocessorOptions.extraDefines = { defines, defines + 2 }; + else if (!defineMacro.empty() && !persistentWorkGroups) + options.preprocessorOptions.extraDefines = { defines, defines + 1 }; source = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options); @@ -441,6 
+465,34 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, if (!m_device->createComputePipelines(nullptr, { ¶ms, 1 }, m_PTHLSLPipelines.data() + index)) return logFail("Failed to create HLSL compute pipeline!\n"); } + + // persistent wg pipelines + { + auto ptShader = loadAndCompileGLSLShader(PTGLSLShaderPaths[index], true); + + IGPUComputePipeline::SCreationParams params = {}; + params.layout = ptPipelineLayout.get(); + params.shader.shader = ptShader.get(); + params.shader.entryPoint = "main"; + params.shader.entries = nullptr; + params.shader.requireFullSubgroups = true; + params.shader.requiredSubgroupSize = static_cast(5); + if (!m_device->createComputePipelines(nullptr, { ¶ms, 1 }, m_PTGLSLPersistentWGPipelines.data() + index)) + return logFail("Failed to create GLSL PersistentWG compute pipeline!\n"); + } + { + auto ptShader = loadAndCompileHLSLShader(PTHLSLShaderPath, PTHLSLShaderVariants[index], true); + + IGPUComputePipeline::SCreationParams params = {}; + params.layout = ptPipelineLayout.get(); + params.shader.shader = ptShader.get(); + params.shader.entryPoint = "main"; + params.shader.entries = nullptr; + params.shader.requireFullSubgroups = true; + params.shader.requiredSubgroupSize = static_cast(5); + if (!m_device->createComputePipelines(nullptr, { ¶ms, 1 }, m_PTHLSLPersistentWGPipelines.data() + index)) + return logFail("Failed to create HLSL PersistentWG compute pipeline!\n"); + } } } @@ -452,7 +504,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!"); // Load Fragment Shader - auto fragmentShader = loadAndCompileGLSLShader(PresentShaderPath); + auto fragmentShader = loadAndCompileHLSLShader(PresentShaderPath); if (!fragmentShader) return logFail("Failed to Load and Compile Fragment Shader: lumaMeterShader!"); @@ -944,6 +996,7 @@ class HLSLComputePathtracer final : public 
examples::SimpleWindowedApplication, ImGui::Combo("Render Mode", &renderMode, shaderTypes, E_RENDER_MODE::ERM_COUNT); ImGui::SliderInt("SPP", &spp, 1, MaxBufferSamples); ImGui::SliderInt("Depth", &depth, 1, MaxBufferDimensions / 3); + ImGui::Checkbox("Persistent WorkGroups", &usePersistentWorkGroups); ImGui::Text("X: %f Y: %f", io.MousePos.x, io.MousePos.y); @@ -1069,12 +1122,22 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, // cube envmap handle { - auto pipeline = renderMode == E_RENDER_MODE::ERM_HLSL ? m_PTHLSLPipelines[PTPipeline].get() : m_PTGLSLPipelines[PTPipeline].get(); + IGPUComputePipeline* pipeline; + if (usePersistentWorkGroups) + pipeline = renderMode == E_RENDER_MODE::ERM_HLSL ? m_PTHLSLPersistentWGPipelines[PTPipeline].get() : m_PTGLSLPersistentWGPipelines[PTPipeline].get(); + else + pipeline = renderMode == E_RENDER_MODE::ERM_HLSL ? m_PTHLSLPipelines[PTPipeline].get() : m_PTGLSLPipelines[PTPipeline].get(); cmdbuf->bindComputePipeline(pipeline); cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &m_descriptorSet0.get()); cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 2u, 1u, &m_descriptorSet2.get()); cmdbuf->pushConstants(pipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(PTPushConstant), &pc); - cmdbuf->dispatch(1 + (WindowDimensions.x * WindowDimensions.y - 1) / DefaultWorkGroupSize, 1u, 1u); + if (usePersistentWorkGroups) + { + uint32_t dispatchSize = m_physicalDevice->getLimits().computeOptimalPersistentWorkgroupDispatchSize(WindowDimensions.x * WindowDimensions.y, DefaultWorkGroupSize); + cmdbuf->dispatch(dispatchSize, 1u, 1u); + } + else + cmdbuf->dispatch(1 + (WindowDimensions.x * WindowDimensions.y - 1) / DefaultWorkGroupSize, 1u, 1u); } // TRANSITION m_outImgView to READ (because of descriptorSets0 -> ComputeShader Writes into the image) @@ -1306,6 +1369,8 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, 
smart_refctd_ptr m_cmdPool; std::array, E_LIGHT_GEOMETRY::ELG_COUNT> m_PTGLSLPipelines; std::array, E_LIGHT_GEOMETRY::ELG_COUNT> m_PTHLSLPipelines; + std::array, E_LIGHT_GEOMETRY::ELG_COUNT> m_PTGLSLPersistentWGPipelines; + std::array, E_LIGHT_GEOMETRY::ELG_COUNT> m_PTHLSLPersistentWGPipelines; smart_refctd_ptr m_presentPipeline; uint64_t m_realFrameIx = 0; std::array, MaxFramesInFlight> m_cmdBufs; @@ -1357,6 +1422,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, int renderMode = E_RENDER_MODE::ERM_HLSL; int spp = 32; int depth = 3; + bool usePersistentWorkGroups = false; bool m_firstFrame = true; IGPUCommandBuffer::SClearColorValue clearColor = { .float32 = {0.f,0.f,0.f,1.f} }; From 3a2ff1421f81089749694db3032ce52068731551 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 31 Mar 2025 14:55:00 +0700 Subject: [PATCH 102/296] test subgroup2 funcs correct --- 71_ArithmeticBench/app_resources/common.hlsl | 3 +- .../app_resources/shaderCommon.hlsl | 62 ++++++++++++++----- .../app_resources/testSubgroup.comp.hlsl | 2 +- 71_ArithmeticBench/main.cpp | 57 +++++++++-------- 4 files changed, 78 insertions(+), 46 deletions(-) diff --git a/71_ArithmeticBench/app_resources/common.hlsl b/71_ArithmeticBench/app_resources/common.hlsl index 10892a2b9..8921659db 100644 --- a/71_ArithmeticBench/app_resources/common.hlsl +++ b/71_ArithmeticBench/app_resources/common.hlsl @@ -10,7 +10,6 @@ struct Output uint32_t data[ScanElementCount]; }; -// Thanks to our unified HLSL/C++ STD lib we're able to remove a whole load of code template struct bit_and : nbl::hlsl::bit_and { @@ -93,4 +92,4 @@ struct ballot : nbl::hlsl::plus #endif }; -#include "nbl/builtin/hlsl/subgroup/basic.hlsl" \ No newline at end of file +#include "nbl/builtin/hlsl/subgroup/basic.hlsl" diff --git a/71_ArithmeticBench/app_resources/shaderCommon.hlsl b/71_ArithmeticBench/app_resources/shaderCommon.hlsl index 13ee8d21e..e7105da62 100644 --- 
a/71_ArithmeticBench/app_resources/shaderCommon.hlsl +++ b/71_ArithmeticBench/app_resources/shaderCommon.hlsl @@ -2,7 +2,7 @@ #include "nbl/builtin/hlsl/glsl_compat/core.hlsl" #include "nbl/builtin/hlsl/subgroup/basic.hlsl" -#include "nbl/builtin/hlsl/subgroup/arithmetic_portability.hlsl" +#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" #include "nbl/builtin/hlsl/jit/device_capabilities.hlsl" @@ -19,37 +19,67 @@ uint32_t globalIndex(); // since we test ITEMS_PER_WG type_t; #ifndef OPERATION #error "Define OPERATION!" #endif -template class binop> +// template class binop> +// static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) +// { +// if (globalIndex()==0u) +// output[binop::BindingIndex].template Store(0,nbl::hlsl::glsl::gl_SubgroupSize()); + +// operation_t::base_t,nbl::hlsl::jit::device_capabilities> func; +// if (canStore()) +// output[binop::BindingIndex].template Store(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func(sourceVal)); +// } + +#ifndef SUBGROUP_SIZE_LOG2 +#error "Define SUBGROUP_SIZE_LOG2!" 
+#endif +template class binop, typename T, uint32_t N> static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) { + // TODO static assert vector == type_t + //using type_t = vector; + using config_t = nbl::hlsl::subgroup::Configuration; + using params_t = nbl::hlsl::subgroup2::ArithmeticParams::base_t, N, nbl::hlsl::jit::device_capabilities>; + if (globalIndex()==0u) - output[binop::BindingIndex].template Store(0,nbl::hlsl::glsl::gl_SubgroupSize()); + output[binop::BindingIndex].template Store(0,nbl::hlsl::glsl::gl_SubgroupSize()); - operation_t::base_t,nbl::hlsl::jit::device_capabilities> func; + operation_t func; if (canStore()) - output[binop::BindingIndex].template Store(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func(sourceVal)); + output[binop::BindingIndex].template Store(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func(sourceVal)); } type_t test() { - const type_t sourceVal = inputValue[globalIndex()]; - - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); + const uint32_t idx = globalIndex() * ITEMS_PER_INVOCATION; + type_t sourceVal; + [unroll] + for (uint32_t i = 0; i < ITEMS_PER_INVOCATION; i++) + { + sourceVal[i] = inputValue[idx + i]; + } + + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); return sourceVal; } -#include "nbl/builtin/hlsl/workgroup/basic.hlsl" \ No newline at end of file +#include "nbl/builtin/hlsl/workgroup/basic.hlsl" diff --git a/71_ArithmeticBench/app_resources/testSubgroup.comp.hlsl b/71_ArithmeticBench/app_resources/testSubgroup.comp.hlsl index 479265d73..50173ce42 100644 --- a/71_ArithmeticBench/app_resources/testSubgroup.comp.hlsl +++ b/71_ArithmeticBench/app_resources/testSubgroup.comp.hlsl @@ -15,4 +15,4 @@ bool canStore() {return true;} void main() { test(); -} \ No newline at end of file +} diff --git 
a/71_ArithmeticBench/main.cpp b/71_ArithmeticBench/main.cpp index 0952d2b57..00cfbcf35 100644 --- a/71_ArithmeticBench/main.cpp +++ b/71_ArithmeticBench/main.cpp @@ -200,14 +200,17 @@ class ArithmeticBenchApp final : public application_templates::BasicMultiQueueAp return false; } } - - const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations; + + // TODO variable items per invocation? + const uint32_t ItemsPerInvocation = 4u; + const std::array workgroupSizes = { 64, 128, 256, 512, 1024 }; + // const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations; const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize; const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize; for (auto subgroupSize=MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u) { const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize); - for (uint32_t workgroupSize = subgroupSize; workgroupSize <= MaxWorkgroupSize; workgroupSize += subgroupSize) + for (const auto& workgroupSize : workgroupSizes) { // make sure renderdoc captures everything for debugging m_api->startCapture(); @@ -221,16 +224,16 @@ class ArithmeticBenchApp final : public application_templates::BasicMultiQueueAp logTestOutcome(passed, workgroupSize); passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed; logTestOutcome(passed, workgroupSize); - for (uint32_t itemsPerWG = workgroupSize; itemsPerWG > workgroupSize - subgroupSize; itemsPerWG--) - { - m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG); - passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; - logTestOutcome(passed, itemsPerWG); - passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; - logTestOutcome(passed, itemsPerWG); - passed = runTest(workgroupTestSource, elementCount, 
subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; - logTestOutcome(passed, itemsPerWG); - } + //for (uint32_t itemsPerWG = workgroupSize; itemsPerWG > workgroupSize - subgroupSize; itemsPerWG--) + //{ + // m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG); + // passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; + // logTestOutcome(passed, itemsPerWG); + // passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; + // logTestOutcome(passed, itemsPerWG); + // passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; + // logTestOutcome(passed, itemsPerWG); + //} m_api->endCapture(); // save cache every now and then @@ -301,30 +304,30 @@ class ArithmeticBenchApp final : public application_templates::BasicMultiQueueAp }*/ template class Arithmetic, bool WorkgroupTest> - bool runTest(const smart_refctd_ptr& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u) + bool runTest(const smart_refctd_ptr& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u, uint32_t itemsPerInvoc = 1u) { std::string arith_name = Arithmetic>::name; smart_refctd_ptr overridenUnspecialized; - if constexpr (WorkgroupTest) - { - overridenUnspecialized = CHLSLCompiler::createOverridenCopy( - source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n#define ITEMS_PER_WG %d\n", - (("workgroup::") + arith_name).c_str(), workgroupSize, itemsPerWG - ); - } - else - { + //if constexpr (WorkgroupTest) + //{ + // overridenUnspecialized = CHLSLCompiler::createOverridenCopy( + // source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n#define ITEMS_PER_WG %d\n", + // (("workgroup::") + arith_name).c_str(), workgroupSize, itemsPerWG + // ); + //} + //else + //{ itemsPerWG = 
workgroupSize; overridenUnspecialized = CHLSLCompiler::createOverridenCopy( - source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n", - (("subgroup::") + arith_name).c_str(), workgroupSize + source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n#define ITEMS_PER_INVOCATION %d\n#define SUBGROUP_SIZE_LOG2 %d\n", + (("subgroup2::") + arith_name).c_str(), workgroupSize, itemsPerInvoc, subgroupSizeLog2 ); - } + //} auto pipeline = createPipeline(overridenUnspecialized.get(),subgroupSizeLog2); // TODO: overlap dispatches with memory readbacks (requires multiple copies of `buffers`) - const uint32_t workgroupCount = elementCount / itemsPerWG; + const uint32_t workgroupCount = elementCount / (itemsPerWG * itemsPerInvoc); cmdbuf->begin(IGPUCommandBuffer::USAGE::NONE); cmdbuf->bindComputePipeline(pipeline.get()); cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &descriptorSet.get()); From dd021a05605fa48ac5962db22d0a591c0ff7691d Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 31 Mar 2025 16:39:41 +0700 Subject: [PATCH 103/296] fix test --- 71_ArithmeticBench/main.cpp | 95 +++++++++++++++++++------------------ 1 file changed, 50 insertions(+), 45 deletions(-) diff --git a/71_ArithmeticBench/main.cpp b/71_ArithmeticBench/main.cpp index 00cfbcf35..c03700e2a 100644 --- a/71_ArithmeticBench/main.cpp +++ b/71_ArithmeticBench/main.cpp @@ -203,7 +203,7 @@ class ArithmeticBenchApp final : public application_templates::BasicMultiQueueAp // TODO variable items per invocation? 
const uint32_t ItemsPerInvocation = 4u; - const std::array workgroupSizes = { 64, 128, 256, 512, 1024 }; + const std::array workgroupSizes = { 256, 512, 1024 }; // const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations; const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize; const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize; @@ -218,11 +218,11 @@ class ArithmeticBenchApp final : public application_templates::BasicMultiQueueAp bool passed = true; // TODO async the testing - passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed; + passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, ItemsPerInvocation) && passed; logTestOutcome(passed, workgroupSize); - passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed; + passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, ItemsPerInvocation) && passed; logTestOutcome(passed, workgroupSize); - passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed; + passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, ItemsPerInvocation) && passed; logTestOutcome(passed, workgroupSize); //for (uint32_t itemsPerWG = workgroupSize; itemsPerWG > workgroupSize - subgroupSize; itemsPerWG--) //{ @@ -362,22 +362,22 @@ class ArithmeticBenchApp final : public application_templates::BasicMultiQueueAp m_device->blockForSemaphores(wait); // check results - bool passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount); - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; - 
passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; - if constexpr (WorkgroupTest) - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; + bool passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc); + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed; + //if constexpr (WorkgroupTest) + // passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; return passed; } //returns true if result matches template class Arithmetic, class Binop, bool WorkgroupTest> - bool validateResults(const uint32_t itemsPerWG, const uint32_t workgroupCount) + bool validateResults(const uint32_t itemsPerWG, const uint32_t workgroupCount, uint32_t itemsPerInvoc = 1u) { bool success = true; @@ -397,47 +397,52 @@ class ArithmeticBenchApp final : public application_templates::BasicMultiQueueAp const auto testData = reinterpret_cast(dataFromBuffer + 1); // TODO: parallel for (the temporary values need to be threadlocal or what?) 
// now check if the data obtained has valid values - type_t* tmp = new type_t[itemsPerWG]; - type_t* ballotInput = new type_t[itemsPerWG]; + type_t* tmp = new type_t[itemsPerWG * itemsPerInvoc]; + //type_t* ballotInput = new type_t[itemsPerWG]; for (uint32_t workgroupID = 0u; success && workgroupID < workgroupCount; workgroupID++) { - const auto workgroupOffset = workgroupID * itemsPerWG; - - if constexpr (WorkgroupTest) - { - if constexpr (std::is_same_v, Binop>) - { - for (auto i = 0u; i < itemsPerWG; i++) - ballotInput[i] = inputData[i + workgroupOffset] & 0x1u; - Arithmetic::impl(tmp, ballotInput, itemsPerWG); - } - else - Arithmetic::impl(tmp, inputData + workgroupOffset, itemsPerWG); - } - else - { + const auto workgroupOffset = workgroupID * itemsPerWG * itemsPerInvoc; + + //if constexpr (WorkgroupTest) + //{ + // if constexpr (std::is_same_v, Binop>) + // { + // for (auto i = 0u; i < itemsPerWG; i++) + // ballotInput[i] = inputData[i + workgroupOffset] & 0x1u; + // Arithmetic::impl(tmp, ballotInput, itemsPerWG); + // } + // else + // Arithmetic::impl(tmp, inputData + workgroupOffset, itemsPerWG); + //} + //else + //{ for (uint32_t pseudoSubgroupID = 0u; pseudoSubgroupID < itemsPerWG; pseudoSubgroupID += subgroupSize) - Arithmetic::impl(tmp + pseudoSubgroupID, inputData + workgroupOffset + pseudoSubgroupID, subgroupSize); - } + Arithmetic::impl(tmp + pseudoSubgroupID * itemsPerInvoc, inputData + workgroupOffset + pseudoSubgroupID * itemsPerInvoc, subgroupSize * itemsPerInvoc); + //} for (uint32_t localInvocationIndex = 0u; localInvocationIndex < itemsPerWG; localInvocationIndex++) { - const auto globalInvocationIndex = workgroupOffset + localInvocationIndex; - const auto cpuVal = tmp[localInvocationIndex]; - const auto gpuVal = testData[globalInvocationIndex]; - if (cpuVal != gpuVal) + const auto localOffset = localInvocationIndex * itemsPerInvoc; + const auto globalInvocationIndex = workgroupOffset + localOffset; + + for (uint32_t itemInvocationIndex = 0u; 
itemInvocationIndex < itemsPerInvoc; itemInvocationIndex++) { - m_logger->log( - "Failed test #%d (%s) (%s) Expected %u got %u for workgroup %d and localinvoc %d", - ILogger::ELL_ERROR, itemsPerWG, WorkgroupTest ? "workgroup" : "subgroup", Binop::name, - cpuVal, gpuVal, workgroupID, localInvocationIndex - ); - success = false; - break; + const auto cpuVal = tmp[localOffset + itemInvocationIndex]; + const auto gpuVal = testData[globalInvocationIndex + itemInvocationIndex]; + if (cpuVal != gpuVal) + { + m_logger->log( + "Failed test #%d (%s) (%s) Expected %u got %u for workgroup %d and localinvoc %d and iteminvoc %d", + ILogger::ELL_ERROR, itemsPerWG, WorkgroupTest ? "workgroup" : "subgroup", Binop::name, + cpuVal, gpuVal, workgroupID, localInvocationIndex, itemInvocationIndex + ); + success = false; + break; + } } } } - delete[] ballotInput; + //delete[] ballotInput; delete[] tmp; return success; From 6766420f6cc2c09d6eafcb8d519235846cb66fe5 Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 31 Mar 2025 12:25:20 +0200 Subject: [PATCH 104/296] make RT pipeline example work when `NBL_EMED_RESOURCES=OFF` --- 71_RayTracingPipeline/main.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index 1e4619b46..e4d53008e 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -3,7 +3,6 @@ // For conditions of distribution and use, see copyright notice in nabla.h #include "common.hpp" -#include "nbl/builtin/builtinResources.h" #include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h" #include "nbl/builtin/hlsl/indirect_commands.hlsl" From 09050536fc442a68b3da11d308432f2c4f2d375b Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 31 Mar 2025 13:13:16 +0200 Subject: [PATCH 105/296] Correct Alpha handling in RT Pipeline example, also work around https://github.com/microsoft/DirectXShaderCompiler/issues/6464 --- .../app_resources/common.hlsl | 19 +++++++++++++++---- 
.../app_resources/raytrace.rahit.hlsl | 10 +++++----- .../app_resources/raytrace.rgen.hlsl | 7 +++---- .../app_resources/raytrace_shadow.rahit.hlsl | 5 ++++- 71_RayTracingPipeline/main.cpp | 2 +- 5 files changed, 28 insertions(+), 15 deletions(-) diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl index d64851b17..32e9de671 100644 --- a/71_RayTracingPipeline/app_resources/common.hlsl +++ b/71_RayTracingPipeline/app_resources/common.hlsl @@ -3,6 +3,7 @@ #include "nbl/builtin/hlsl/cpp_compat.hlsl" #include "nbl/builtin/hlsl/cpp_compat/basic.h" +#include "nbl/builtin/hlsl/random/pcg.hlsl" NBL_CONSTEXPR uint32_t WorkgroupSize = 16; NBL_CONSTEXPR uint32_t MAX_UNORM_10 = 1023; @@ -196,10 +197,20 @@ struct MaterialId struct [raypayload] PrimaryPayload { - float32_t3 worldNormal : read(caller) : write(closesthit); - float32_t rayDistance : read(caller) : write(closesthit, miss); - float32_t alphaThreshold : read(closesthit, anyhit) : write(caller); - MaterialId materialId : read(caller) : write(closesthit); + using generator_t = nbl::hlsl::random::Pcg; +/* bugged out by https://github.com/microsoft/DirectXShaderCompiler/issues/6464 + bool nextDiscard(const float32_t alpha) + { + const uint32_t bitpattern = pcg(); + const float32_t xi = (float32_t(bitpattern)+0.5f)/float32_t(0xFFFFFFFF); + return xi > alpha; + } +*/ + + float32_t3 worldNormal : read(caller) : write(closesthit); + float32_t rayDistance : read(caller) : write(closesthit, miss); + generator_t pcg : read(anyhit) : write(caller,anyhit); + MaterialId materialId : read(caller) : write(closesthit); }; diff --git a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl index c499e0506..16f7551b1 100644 --- a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl @@ -7,10 +7,10 @@ void main(inout PrimaryPayload payload, in 
BuiltInTriangleIntersectionAttributes { const int instID = InstanceID(); const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo)); - const Material material = nbl::hlsl::_static_cast(geom.material); - - if (material.alpha > payload.alphaThreshold) - { + + // Should have been a method of the payload but https://github.com/microsoft/DirectXShaderCompiler/issues/6464 stops it + // alpha is quantized to 10 bits + const uint32_t bitpattern = payload.pcg()>>22; + if (bitpattern > geom.material.alpha) IgnoreHit(); - } } diff --git a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl index ef84ced3e..c182d961e 100644 --- a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl @@ -2,7 +2,6 @@ #include "nbl/builtin/hlsl/jit/device_capabilities.hlsl" #include "nbl/builtin/hlsl/random/xoroshiro.hlsl" -#include "nbl/builtin/hlsl/random/pcg.hlsl" #include "nbl/builtin/hlsl/glsl_compat/core.hlsl" #include "nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl" @@ -28,8 +27,8 @@ void main() const uint32_t3 launchSize = DispatchRaysDimensions(); const uint32_t2 coords = launchID.xy; - const uint32_t seed1 = nbl::hlsl::Pcg::construct(pc.frameCounter)(); - const uint32_t seed2 = nbl::hlsl::Pcg::construct(launchID.y * launchSize.x + launchID.x)(); + const uint32_t seed1 = nbl::hlsl::random::Pcg::create(pc.frameCounter)(); + const uint32_t seed2 = nbl::hlsl::random::Pcg::create(launchID.y * launchSize.x + launchID.x)(); nbl::hlsl::Xoroshiro64StarStar rnd = nbl::hlsl::Xoroshiro64StarStar::construct(uint32_t2(seed1, seed2)); float32_t3 hitValues = float32_t3(0, 0, 0); @@ -55,7 +54,7 @@ void main() rayDesc.TMax = 10000.0; PrimaryPayload payload; - payload.alphaThreshold = nextRandomUnorm(rnd); + payload.pcg = PrimaryPayload::generator_t::create(rnd()); TraceRay(topLevelAS, RAY_FLAG_NONE, 0xff, 
ERT_PRIMARY, 0, EMT_PRIMARY, rayDesc, payload); const float32_t rayDistance = payload.rayDistance; diff --git a/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl index 88a9b79db..2357bb830 100644 --- a/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl @@ -9,6 +9,9 @@ void main(inout OcclusionPayload payload, in BuiltInTriangleIntersectionAttribut const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo)); const Material material = nbl::hlsl::_static_cast(geom.material); - payload.attenuation = material.alpha * payload.attenuation; + payload.attenuation = (1.f-material.alpha) * payload.attenuation; + // arbitrary constant +// if (payload.attenuation < 1.f/1024.f) +// TerminateRay(); IgnoreHit(); } diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index e4d53008e..73225d083 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -1130,7 +1130,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, .diffuse = {0.2, 0.8, 0.2}, .specular = {0.8, 0.8, 0.8}, .shininess = 1.0f, - .alpha = 0.8, + .alpha = 0.2, }, .transform = getTranslationMatrix(5.0f, 1.0f, 0), }, From 699f2632e96ce772f40e6715c1216c7cde96026d Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 31 Mar 2025 13:33:28 +0200 Subject: [PATCH 106/296] clean up the code a bit more, address comments in https://github.com/Devsh-Graphics-Programming/Nabla-Examples-and-Tests/pull/173 --- .../app_resources/common.hlsl | 27 +++++++++---------- .../app_resources/raytrace.rahit.hlsl | 6 ++--- .../app_resources/raytrace.rgen.hlsl | 9 ++++--- .../app_resources/raytrace.rint.hlsl | 10 ++----- 4 files changed, 23 insertions(+), 29 deletions(-) diff --git a/71_RayTracingPipeline/app_resources/common.hlsl 
b/71_RayTracingPipeline/app_resources/common.hlsl index 32e9de671..a5916812d 100644 --- a/71_RayTracingPipeline/app_resources/common.hlsl +++ b/71_RayTracingPipeline/app_resources/common.hlsl @@ -53,6 +53,11 @@ struct Material { return alpha < 1.0; } + + bool alphaTest(const float32_t xi) NBL_CONST_MEMBER_FUNC + { + return xi > alpha; + } }; struct MaterialPacked @@ -67,6 +72,11 @@ struct MaterialPacked { return alpha != MAX_UNORM_10; } + + bool alphaTest(const uint32_t xi) NBL_CONST_MEMBER_FUNC + { + return (xi>>22) > alpha; + } }; struct SProceduralGeomInfo @@ -198,14 +208,6 @@ struct MaterialId struct [raypayload] PrimaryPayload { using generator_t = nbl::hlsl::random::Pcg; -/* bugged out by https://github.com/microsoft/DirectXShaderCompiler/issues/6464 - bool nextDiscard(const float32_t alpha) - { - const uint32_t bitpattern = pcg(); - const float32_t xi = (float32_t(bitpattern)+0.5f)/float32_t(0xFFFFFFFF); - return xi > alpha; - } -*/ float32_t3 worldNormal : read(caller) : write(closesthit); float32_t rayDistance : read(caller) : write(closesthit, miss); @@ -335,12 +337,9 @@ float32_t3 fetchVertexNormal(int instID, int primID, STriangleGeomInfo geom, flo case OT_ICOSPHERE: default: { - n0 = normalize(vk::RawBufferLoad < - float3 > (normalVertexBufferAddress + i0 * vertexStride)); - n1 = normalize(vk::RawBufferLoad < - float3 > (normalVertexBufferAddress + i1 * vertexStride)); - n2 = normalize(vk::RawBufferLoad < - float3 > (normalVertexBufferAddress + i2 * vertexStride)); + n0 = vk::RawBufferLoad < float3 > (normalVertexBufferAddress + i0 * vertexStride); + n1 = vk::RawBufferLoad < float3 > (normalVertexBufferAddress + i1 * vertexStride); + n2 = vk::RawBufferLoad < float3 > (normalVertexBufferAddress + i2 * vertexStride); } } diff --git a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl index 16f7551b1..97713b3ec 100644 --- a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl +++ 
b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl @@ -8,9 +8,7 @@ void main(inout PrimaryPayload payload, in BuiltInTriangleIntersectionAttributes const int instID = InstanceID(); const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo)); - // Should have been a method of the payload but https://github.com/microsoft/DirectXShaderCompiler/issues/6464 stops it - // alpha is quantized to 10 bits - const uint32_t bitpattern = payload.pcg()>>22; - if (bitpattern > geom.material.alpha) + const uint32_t bitpattern = payload.pcg(); + if (geom.material.alphaTest(bitpattern)) IgnoreHit(); } diff --git a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl index c182d961e..3e2c45bfe 100644 --- a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl @@ -65,6 +65,12 @@ void main() } const float32_t3 worldPosition = pc.camPos + (camDirection * rayDistance); + + // make sure to call with least live state + RayLight cLight; + cLight.inHitPosition = worldPosition; + CallShader(pc.light.type, cLight); + const float32_t3 worldNormal = payload.worldNormal; Material material; @@ -80,9 +86,6 @@ void main() const MaterialPacked materialPacked = vk::RawBufferLoad(pc.triangleGeomInfoBuffer + materialId.getMaterialIndex() * sizeof(STriangleGeomInfo)); material = nbl::hlsl::_static_cast(materialPacked); } - RayLight cLight; - cLight.inHitPosition = worldPosition; - CallShader(pc.light.type, cLight); float32_t attenuation = 1; diff --git a/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl index ab623382d..d081c9248 100644 --- a/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl @@ -18,14 +18,8 @@ float32_t hitSphere(SProceduralGeomInfo s, Ray r) float32_t c = 
dot(oc, oc) - s.radius * s.radius; float32_t discriminant = b * b - 4 * a * c; - if (discriminant < 0) - { - return -1.0; - } - else - { - return (-b - sqrt(discriminant)) / (2.0 * a); - } + // return whatever, if the discriminant is negative, it will produce a NaN, and NaN will compare false + return (-b - sqrt(discriminant)) / (2.0 * a); } [shader("intersection")] From bb2fd06626342be02dad8e043a4ea395fef7049d Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 31 Mar 2025 14:28:34 +0200 Subject: [PATCH 107/296] do shadows without any closest hit shaders, one miss shader instead --- .../app_resources/common.hlsl | 7 +++++-- .../app_resources/raytrace.rgen.hlsl | 8 +++++--- .../app_resources/raytrace_shadow.rahit.hlsl | 11 +++++++---- .../app_resources/raytrace_shadow.rmiss.hlsl | 8 ++++++++ .../raytrace_shadow_triangle.rchit.hlsl | 7 ------- 71_RayTracingPipeline/main.cpp | 18 +++++++++--------- 6 files changed, 34 insertions(+), 25 deletions(-) create mode 100644 71_RayTracingPipeline/app_resources/raytrace_shadow.rmiss.hlsl delete mode 100644 71_RayTracingPipeline/app_resources/raytrace_shadow_triangle.rchit.hlsl diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl index a5916812d..18b67085a 100644 --- a/71_RayTracingPipeline/app_resources/common.hlsl +++ b/71_RayTracingPipeline/app_resources/common.hlsl @@ -170,7 +170,10 @@ struct RayLight struct [raypayload] OcclusionPayload { - float32_t attenuation : read(caller) : write(caller, anyhit); + // TODO: will this break DXC? 
Tbh should come from push constant or some autoexposure feedback + // NBL_CONSTEXPR_STATIC_INLINE float32_t MinAttenuation = 1.f/1024.f; + + float32_t attenuation : read(caller,anyhit,miss) : write(caller,anyhit,miss); }; struct MaterialId @@ -210,7 +213,7 @@ struct [raypayload] PrimaryPayload using generator_t = nbl::hlsl::random::Pcg; float32_t3 worldNormal : read(caller) : write(closesthit); - float32_t rayDistance : read(caller) : write(closesthit, miss); + float32_t rayDistance : read(caller) : write(closesthit,miss); generator_t pcg : read(anyhit) : write(caller,anyhit); MaterialId materialId : read(caller) : write(closesthit); diff --git a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl index 3e2c45bfe..55b014d07 100644 --- a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl @@ -97,13 +97,15 @@ void main() rayDesc.TMin = 0.01; rayDesc.TMax = cLight.outLightDistance; - uint32_t shadowRayFlags = RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH; OcclusionPayload occlusionPayload; - occlusionPayload.attenuation = 1; + // negative means its a hit, the miss shader will flip it back around to positive + occlusionPayload.attenuation = -1.f; + // abuse of miss shader to mean "not hit shader" solves us having to call closest hit shaders + uint32_t shadowRayFlags = RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH | RAY_FLAG_SKIP_CLOSEST_HIT_SHADER; TraceRay(topLevelAS, shadowRayFlags, 0xFF, ERT_OCCLUSION, 0, EMT_OCCLUSION, rayDesc, occlusionPayload); attenuation = occlusionPayload.attenuation; - if (occlusionPayload.attenuation > 0.0001) + if (occlusionPayload.attenuation > 1.f/1024.f) { const float32_t3 diffuse = computeDiffuse(material, cLight.outLightDir, worldNormal); const float32_t3 specular = computeSpecular(material, camDirection, cLight.outLightDir, worldNormal); diff --git a/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl 
b/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl index 2357bb830..a3432b812 100644 --- a/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl +++ b/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl @@ -1,4 +1,5 @@ #include "common.hlsl" +#include "nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl" [[vk::push_constant]] SPushConstants pc; @@ -9,9 +10,11 @@ void main(inout OcclusionPayload payload, in BuiltInTriangleIntersectionAttribut const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo)); const Material material = nbl::hlsl::_static_cast(geom.material); - payload.attenuation = (1.f-material.alpha) * payload.attenuation; - // arbitrary constant -// if (payload.attenuation < 1.f/1024.f) -// TerminateRay(); + const float attenuation = (1.f-material.alpha) * payload.attenuation; + // DXC cogegens weird things in the presence of termination instructions + payload.attenuation = attenuation; + // arbitrary constant, whatever you want the smallest attenuation to be. 
Remember until miss, the attenuatio is negative + if (attenuation > -1.f/1024.f) + AcceptHitAndEndSearch(); IgnoreHit(); } diff --git a/71_RayTracingPipeline/app_resources/raytrace_shadow.rmiss.hlsl b/71_RayTracingPipeline/app_resources/raytrace_shadow.rmiss.hlsl new file mode 100644 index 000000000..441a1b42a --- /dev/null +++ b/71_RayTracingPipeline/app_resources/raytrace_shadow.rmiss.hlsl @@ -0,0 +1,8 @@ +#include "common.hlsl" + +[shader("miss")] +void main(inout OcclusionPayload payload) +{ + // make positive + payload.attenuation = -payload.attenuation; +} diff --git a/71_RayTracingPipeline/app_resources/raytrace_shadow_triangle.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace_shadow_triangle.rchit.hlsl deleted file mode 100644 index c85c7c32d..000000000 --- a/71_RayTracingPipeline/app_resources/raytrace_shadow_triangle.rchit.hlsl +++ /dev/null @@ -1,7 +0,0 @@ -#include "common.hlsl" - -[shader("closesthit")] -void main(inout OcclusionPayload payload, in BuiltInTriangleIntersectionAttributes attribs) -{ - payload.attenuation = 0; -} diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index 73225d083..35c750373 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -163,7 +163,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, const auto anyHitShaderColorPayload = loadCompileAndCreateShader("app_resources/raytrace.rahit.hlsl"); const auto anyHitShaderShadowPayload = loadCompileAndCreateShader("app_resources/raytrace_shadow.rahit.hlsl"); const auto missShader = loadCompileAndCreateShader("app_resources/raytrace.rmiss.hlsl"); - const auto shadowClosestHitShader = loadCompileAndCreateShader("app_resources/raytrace_shadow_triangle.rchit.hlsl"); + const auto missShadowShader = loadCompileAndCreateShader("app_resources/raytrace_shadow.rmiss.hlsl"); const auto directionalLightCallShader = loadCompileAndCreateShader("app_resources/light_directional.rcall.hlsl"); const auto 
pointLightCallShader = loadCompileAndCreateShader("app_resources/light_point.rcall.hlsl"); const auto spotLightCallShader = loadCompileAndCreateShader("app_resources/light_spot.rcall.hlsl"); @@ -323,7 +323,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, { RTDS_RAYGEN, RTDS_MISS, - RTDS_CLOSEST_HIT_SHADOW, + RTDS_MISS_SHADOW, RTDS_CLOSEST_HIT, RTDS_SPHERE_CLOSEST_HIT, RTDS_ANYHIT_PRIMARY, @@ -338,7 +338,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, IGPUShader::SSpecInfo shaders[RTDS_COUNT]; shaders[RTDS_RAYGEN] = {.shader = raygenShader.get()}; shaders[RTDS_MISS] = {.shader = missShader.get()}; - shaders[RTDS_CLOSEST_HIT_SHADOW] = { .shader = shadowClosestHitShader.get() }; + shaders[RTDS_MISS_SHADOW] = { .shader = missShadowShader.get() }; shaders[RTDS_CLOSEST_HIT] = {.shader = closestHitShader.get()}; shaders[RTDS_SPHERE_CLOSEST_HIT] = {.shader = proceduralClosestHitShader.get()}; shaders[RTDS_ANYHIT_PRIMARY] = {.shader = anyHitShaderColorPayload.get()}; @@ -351,9 +351,9 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, params.layout = pipelineLayout.get(); params.shaders = std::span(shaders); using RayTracingFlags = IGPURayTracingPipeline::SCreationParams::FLAGS; - params.flags = core::bitflag(RayTracingFlags::NO_NULL_INTERSECTION_SHADERS) | - RayTracingFlags::NO_NULL_ANY_HIT_SHADERS | - RayTracingFlags::NO_NULL_CLOSEST_HIT_SHADERS; + params.flags = core::bitflag(RayTracingFlags::NO_NULL_MISS_SHADERS) | + RayTracingFlags::NO_NULL_INTERSECTION_SHADERS | + RayTracingFlags::NO_NULL_ANY_HIT_SHADERS; auto& shaderGroups = params.shaderGroups; @@ -361,7 +361,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, IRayTracingPipelineBase::SGeneralShaderGroup missGroups[EMT_COUNT]; missGroups[EMT_PRIMARY] = { .index = RTDS_MISS }; - missGroups[EMT_OCCLUSION] = { .index = IGPURayTracingPipeline::SGeneralShaderGroup::Unused }; + 
missGroups[EMT_OCCLUSION] = { .index = RTDS_MISS_SHADOW }; shaderGroups.misses = missGroups; auto getHitGroupIndex = [](E_GEOM_TYPE geomType, E_RAY_TYPE rayType) @@ -374,7 +374,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, .anyHit = RTDS_ANYHIT_PRIMARY, }; hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_OCCLUSION)] = { - .closestHit = RTDS_CLOSEST_HIT_SHADOW, + .closestHit = IGPURayTracingPipeline::SGeneralShaderGroup::Unused, .anyHit = RTDS_ANYHIT_SHADOW, }; hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_PRIMARY)] = { @@ -383,7 +383,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, .intersection = RTDS_INTERSECTION, }; hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_OCCLUSION)] = { - .closestHit = RTDS_CLOSEST_HIT_SHADOW, + .closestHit = IGPURayTracingPipeline::SGeneralShaderGroup::Unused, .anyHit = RTDS_ANYHIT_SHADOW, .intersection = RTDS_INTERSECTION, }; From ca219416680386ae5cd8de42470960a7a7899c50 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 1 Apr 2025 16:24:35 +0700 Subject: [PATCH 108/296] benchmarking shader + pipeline working --- .../app_resources/benchmarkSubgroup.comp.hlsl | 73 ++++++++ 71_ArithmeticBench/app_resources/common.hlsl | 54 +++--- .../app_resources/shaderCommon.hlsl | 60 +++---- .../app_resources/testSubgroup.comp.hlsl | 4 +- 71_ArithmeticBench/main.cpp | 161 ++++++++++++++---- 5 files changed, 251 insertions(+), 101 deletions(-) create mode 100644 71_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl diff --git a/71_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl b/71_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl new file mode 100644 index 000000000..f3cc679ef --- /dev/null +++ b/71_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl @@ -0,0 +1,73 @@ +#pragma shader_stage(compute) + +#define operation_t nbl::hlsl::OPERATION + +#include "shaderCommon.hlsl" + +uint32_t globalIndex() +{ + return 
nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex(); +} + +bool canStore() {return true;} + +#ifndef NUM_LOOPS +#error "Define NUM_LOOPS!" +#endif + +// template class binop, typename T, uint32_t N> +// static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal) +// { +// using config_t = nbl::hlsl::subgroup::Configuration; +// using params_t = nbl::hlsl::subgroup2::ArithmeticParams::base_t, N, nbl::hlsl::jit::device_capabilities>; + +// const uint32_t storeAddr = sizeof(uint32_t) + sizeof(type_t) * globalIndex(); + +// operation_t func; +// [unroll] +// for (uint32_t i = 0; i < NUM_LOOPS; i++) +// { +// const uint32_t arrIndex = i & 7u; // i % 8 +// output[arrIndex].template Store(storeAddr, func(sourceVal)); +// } +// } + +template class binop, typename T, uint32_t N> +static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal) +{ + using config_t = nbl::hlsl::subgroup::Configuration; + using params_t = nbl::hlsl::subgroup2::ArithmeticParams::base_t, N, nbl::hlsl::jit::device_capabilities>; + type_t value = sourceVal; + + operation_t func; + [unroll] + for (uint32_t i = 0; i < NUM_LOOPS; i++) + value = func(value); + + output[binop::BindingIndex].template Store(sizeof(uint32_t) + sizeof(type_t) * globalIndex(), value); +} + +void benchmark() +{ + const uint32_t idx = globalIndex() * ITEMS_PER_INVOCATION; + type_t sourceVal; + [unroll] + for (uint32_t i = 0; i < ITEMS_PER_INVOCATION; i++) + { + sourceVal[i] = inputValue[idx + i]; + } + + subbench(sourceVal); + subbench(sourceVal); + subbench(sourceVal); + subbench(sourceVal); + subbench(sourceVal); + subbench(sourceVal); + subbench(sourceVal); +} + +[numthreads(WORKGROUP_SIZE,1,1)] +void main() +{ + benchmark(); +} diff --git a/71_ArithmeticBench/app_resources/common.hlsl b/71_ArithmeticBench/app_resources/common.hlsl index 8921659db..67d3f16ca 100644 --- a/71_ArithmeticBench/app_resources/common.hlsl +++ b/71_ArithmeticBench/app_resources/common.hlsl @@ -4,91 +4,91 @@ 
template struct Output { - NBL_CONSTEXPR_STATIC_INLINE uint32_t ScanElementCount = kScanElementCount; + NBL_CONSTEXPR_STATIC_INLINE uint32_t ScanElementCount = kScanElementCount; - uint32_t subgroupSize; - uint32_t data[ScanElementCount]; + uint32_t subgroupSize; + uint32_t data[ScanElementCount]; }; template struct bit_and : nbl::hlsl::bit_and { - using base_t = nbl::hlsl::bit_and; + using base_t = nbl::hlsl::bit_and; - NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 0; + NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 0; #ifndef __HLSL_VERSION - static inline constexpr const char* name = "bit_and"; + static inline constexpr const char* name = "bit_and"; #endif }; template struct bit_or : nbl::hlsl::bit_or { - using base_t = nbl::hlsl::bit_or; + using base_t = nbl::hlsl::bit_or; - NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 1; + NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 1; #ifndef __HLSL_VERSION - static inline constexpr const char* name = "bit_xor"; + static inline constexpr const char* name = "bit_xor"; #endif }; template struct bit_xor : nbl::hlsl::bit_xor { - using base_t = nbl::hlsl::bit_xor; + using base_t = nbl::hlsl::bit_xor; - NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 2; + NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 2; #ifndef __HLSL_VERSION - static inline constexpr const char* name = "bit_or"; + static inline constexpr const char* name = "bit_or"; #endif }; template struct plus : nbl::hlsl::plus { - using base_t = nbl::hlsl::plus; + using base_t = nbl::hlsl::plus; - NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 3; + NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 3; #ifndef __HLSL_VERSION - static inline constexpr const char* name = "plus"; + static inline constexpr const char* name = "plus"; #endif }; template struct multiplies : nbl::hlsl::multiplies { - using base_t = nbl::hlsl::multiplies; + using base_t = nbl::hlsl::multiplies; - NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 4; + 
NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 4; #ifndef __HLSL_VERSION - static inline constexpr const char* name = "multiplies"; + static inline constexpr const char* name = "multiplies"; #endif }; template struct minimum : nbl::hlsl::minimum { - using base_t = nbl::hlsl::minimum; + using base_t = nbl::hlsl::minimum; - NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 5; + NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 5; #ifndef __HLSL_VERSION - static inline constexpr const char* name = "minimum"; + static inline constexpr const char* name = "minimum"; #endif }; template struct maximum : nbl::hlsl::maximum { - using base_t = nbl::hlsl::maximum; + using base_t = nbl::hlsl::maximum; - NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 6; + NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 6; #ifndef __HLSL_VERSION - static inline constexpr const char* name = "maximum"; + static inline constexpr const char* name = "maximum"; #endif }; template struct ballot : nbl::hlsl::plus { - using base_t = nbl::hlsl::plus; + using base_t = nbl::hlsl::plus; - NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 7; + NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 7; #ifndef __HLSL_VERSION - static inline constexpr const char* name = "bitcount"; + static inline constexpr const char* name = "bitcount"; #endif }; diff --git a/71_ArithmeticBench/app_resources/shaderCommon.hlsl b/71_ArithmeticBench/app_resources/shaderCommon.hlsl index e7105da62..fa3713c44 100644 --- a/71_ArithmeticBench/app_resources/shaderCommon.hlsl +++ b/71_ArithmeticBench/app_resources/shaderCommon.hlsl @@ -31,16 +31,6 @@ typedef vector type_t; #ifndef OPERATION #error "Define OPERATION!" 
#endif -// template class binop> -// static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) -// { -// if (globalIndex()==0u) -// output[binop::BindingIndex].template Store(0,nbl::hlsl::glsl::gl_SubgroupSize()); - -// operation_t::base_t,nbl::hlsl::jit::device_capabilities> func; -// if (canStore()) -// output[binop::BindingIndex].template Store(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func(sourceVal)); -// } #ifndef SUBGROUP_SIZE_LOG2 #error "Define SUBGROUP_SIZE_LOG2!" @@ -48,38 +38,38 @@ typedef vector type_t; template class binop, typename T, uint32_t N> static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) { - // TODO static assert vector == type_t - //using type_t = vector; - using config_t = nbl::hlsl::subgroup::Configuration; - using params_t = nbl::hlsl::subgroup2::ArithmeticParams::base_t, N, nbl::hlsl::jit::device_capabilities>; + // TODO static assert vector == type_t + //using type_t = vector; + using config_t = nbl::hlsl::subgroup::Configuration; + using params_t = nbl::hlsl::subgroup2::ArithmeticParams::base_t, N, nbl::hlsl::jit::device_capabilities>; - if (globalIndex()==0u) - output[binop::BindingIndex].template Store(0,nbl::hlsl::glsl::gl_SubgroupSize()); - - operation_t func; - if (canStore()) - output[binop::BindingIndex].template Store(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func(sourceVal)); + if (globalIndex()==0u) + output[binop::BindingIndex].template Store(0,nbl::hlsl::glsl::gl_SubgroupSize()); + + operation_t func; + if (canStore()) + output[binop::BindingIndex].template Store(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func(sourceVal)); } type_t test() { - const uint32_t idx = globalIndex() * ITEMS_PER_INVOCATION; - type_t sourceVal; - [unroll] - for (uint32_t i = 0; i < ITEMS_PER_INVOCATION; i++) - { - sourceVal[i] = inputValue[idx + i]; - } + const uint32_t idx = globalIndex() * ITEMS_PER_INVOCATION; + type_t sourceVal; + [unroll] + for (uint32_t i = 0; i < ITEMS_PER_INVOCATION; i++) + { + sourceVal[i] = inputValue[idx 
+ i]; + } - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - return sourceVal; + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + return sourceVal; } #include "nbl/builtin/hlsl/workgroup/basic.hlsl" diff --git a/71_ArithmeticBench/app_resources/testSubgroup.comp.hlsl b/71_ArithmeticBench/app_resources/testSubgroup.comp.hlsl index 50173ce42..2cc1ccb60 100644 --- a/71_ArithmeticBench/app_resources/testSubgroup.comp.hlsl +++ b/71_ArithmeticBench/app_resources/testSubgroup.comp.hlsl @@ -6,7 +6,7 @@ uint32_t globalIndex() { - return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex(); + return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex(); } bool canStore() {return true;} @@ -14,5 +14,5 @@ bool canStore() {return true;} [numthreads(WORKGROUP_SIZE,1,1)] void main() { - test(); + test(); } diff --git a/71_ArithmeticBench/main.cpp b/71_ArithmeticBench/main.cpp index c03700e2a..29f9ede8a 100644 --- a/71_ArithmeticBench/main.cpp +++ b/71_ArithmeticBench/main.cpp @@ -2,6 +2,8 @@ #include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" #include "app_resources/common.hlsl" +#include + using namespace nbl; using namespace core; using namespace asset; @@ -188,7 +190,7 @@ class ArithmeticBenchApp final : public application_templates::BasicMultiQueueAp }; auto subgroupTestSource = getShaderSource("app_resources/testSubgroup.comp.hlsl"); - auto workgroupTestSource = getShaderSource("app_resources/testWorkgroup.comp.hlsl"); + //auto workgroupTestSource = getShaderSource("app_resources/testWorkgroup.comp.hlsl"); // now create or retrieve final resources to run our tests sema = m_device->createSemaphore(timelineValue); resultsBuffer = 
ICPUBuffer::create({ outputBuffers[0]->getSize() }); @@ -203,11 +205,75 @@ class ArithmeticBenchApp final : public application_templates::BasicMultiQueueAp // TODO variable items per invocation? const uint32_t ItemsPerInvocation = 4u; + const uint32_t NumLoops = 100000u; const std::array workgroupSizes = { 256, 512, 1024 }; // const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations; const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize; const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize; - for (auto subgroupSize=MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u) + + if (b_runTests) + runTests(subgroupTestSource, elementCount, ItemsPerInvocation, MinSubgroupSize, MaxSubgroupSize, workgroupSizes); + + double time = runBenchmark(subgroupTestSource, elementCount, 5, 256, ItemsPerInvocation, NumLoops); + m_logger->log("Ran for %.3fms (disregard these numbers, profile in Nsight)", ILogger::ELL_INFO, time * 1000.0); + + //for (auto subgroupSize = MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u) + //{ + // const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize); + // for (const auto& workgroupSize : workgroupSizes) + // { + // passed = runBenchmark(subgroupTestSource, queryPool, elementCount, subgroupSizeLog2, workgroupSize, ItemsPerInvocation, NumLoops) && passed; + // logTestOutcome(passed, workgroupSize); + // passed = runBenchmark(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ItemsPerInvocation, NumLoops) && passed; + // logTestOutcome(passed, workgroupSize); + // passed = runBenchmark(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ItemsPerInvocation, NumLoops) && passed; + // logTestOutcome(passed, workgroupSize); + + // // save cache every now and then + // { + // auto cpu = m_spirv_isa_cache->convertToCPUCache(); + // // Normally we'd beautifully JSON serialize the thing, allow multiple devices & 
drivers + metadata + // auto bin = cpu->getEntries().begin()->second.bin; + // IFile::success_t success; + // m_spirv_isa_cache_output->write(success, bin->data(), 0ull, bin->size()); + // if (!success) + // logFail("Could not write Create SPIR-V to ISA cache to disk!"); + // } + // } + //} + + return true; + } + + virtual bool onAppTerminated() override + { + m_logger->log("==========Result==========", ILogger::ELL_INFO); + m_logger->log("Fail Count: %u", ILogger::ELL_INFO, totalFailCount); + delete[] inputData; + return true; + } + + // the unit test is carried out on init + void workLoopBody() override {} + + // + bool keepRunning() override { return true; } + +private: + void logTestOutcome(bool passed, uint32_t workgroupSize) + { + if (passed) + m_logger->log("Passed test #%u", ILogger::ELL_INFO, workgroupSize); + else + { + totalFailCount++; + m_logger->log("Failed test #%u", ILogger::ELL_ERROR, workgroupSize); + } + } + + void runTests(smart_refctd_ptr subgroupTestSource, uint32_t elementCount, uint32_t ItemsPerInvocation, uint32_t MinSubgroupSize, uint32_t MaxSubgroupSize, const std::array& workgroupSizes) + { + for (auto subgroupSize = MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u) { const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize); for (const auto& workgroupSize : workgroupSizes) @@ -242,40 +308,12 @@ class ArithmeticBenchApp final : public application_templates::BasicMultiQueueAp // Normally we'd beautifully JSON serialize the thing, allow multiple devices & drivers + metadata auto bin = cpu->getEntries().begin()->second.bin; IFile::success_t success; - m_spirv_isa_cache_output->write(success,bin->data(),0ull,bin->size()); + m_spirv_isa_cache_output->write(success, bin->data(), 0ull, bin->size()); if (!success) logFail("Could not write Create SPIR-V to ISA cache to disk!"); } } } - - return true; - } - - virtual bool onAppTerminated() override - { - m_logger->log("==========Result==========", ILogger::ELL_INFO); - 
m_logger->log("Fail Count: %u", ILogger::ELL_INFO, totalFailCount); - delete[] inputData; - return true; - } - - // the unit test is carried out on init - void workLoopBody() override {} - - // - bool keepRunning() override { return false; } - -private: - void logTestOutcome(bool passed, uint32_t workgroupSize) - { - if (passed) - m_logger->log("Passed test #%u", ILogger::ELL_INFO, workgroupSize); - else - { - totalFailCount++; - m_logger->log("Failed test #%u", ILogger::ELL_ERROR, workgroupSize); - } } // create pipeline (specialized every test) [TODO: turn into a future/async] @@ -297,12 +335,6 @@ class ArithmeticBenchApp final : public application_templates::BasicMultiQueueAp return pipeline; } - /*template class Arithmetic, bool WorkgroupTest> - bool runTest(const smart_refctd_ptr& source, const uint32_t elementCount, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u) - { - return true; - }*/ - template class Arithmetic, bool WorkgroupTest> bool runTest(const smart_refctd_ptr& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u, uint32_t itemsPerInvoc = 1u) { @@ -448,11 +480,66 @@ class ArithmeticBenchApp final : public application_templates::BasicMultiQueueAp return success; } + + template class Arithmetic> + double runBenchmark(const smart_refctd_ptr& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerInvoc = 1u, uint32_t numLoops = 8u) + { + std::string arith_name = Arithmetic>::name; + + smart_refctd_ptr overridenUnspecialized = CHLSLCompiler::createOverridenCopy( + source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n#define ITEMS_PER_INVOCATION %d\n#define SUBGROUP_SIZE_LOG2 %d\n", + (("subgroup2::") + arith_name).c_str(), workgroupSize, itemsPerInvoc, subgroupSizeLog2 + ); + auto pipeline = createPipeline(overridenUnspecialized.get(), subgroupSizeLog2); + + const uint32_t workgroupCount = elementCount 
/ (workgroupSize * itemsPerInvoc); + cmdbuf->begin(IGPUCommandBuffer::USAGE::NONE); + + cmdbuf->bindComputePipeline(pipeline.get()); + cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &descriptorSet.get()); + cmdbuf->dispatch(workgroupCount, 1, 1); + { + IGPUCommandBuffer::SPipelineBarrierDependencyInfo::buffer_barrier_t memoryBarrier[OutputBufferCount]; + for (auto i = 0u; i < OutputBufferCount; i++) + { + memoryBarrier[i] = { + .barrier = { + .dep = { + .srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, + .srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS, + // in theory we don't need the HOST BITS cause we block on a semaphore but might as well add them + .dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT | PIPELINE_STAGE_FLAGS::HOST_BIT, + .dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS | ACCESS_FLAGS::HOST_READ_BIT + } + }, + .range = {0ull,outputBuffers[i]->getSize(),outputBuffers[i]} + }; + } + IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = { .memBarriers = {},.bufBarriers = memoryBarrier }; + cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE, info); + } + cmdbuf->end(); + + auto startTime = std::chrono::high_resolution_clock::now(); + + const IQueue::SSubmitInfo::SSemaphoreInfo signal[1] = { {.semaphore = sema.get(),.value = ++timelineValue} }; + const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[1] = { {.cmdbuf = cmdbuf.get()} }; + const IQueue::SSubmitInfo submits[1] = { {.commandBuffers = cmdbufs,.signalSemaphores = signal} }; + computeQueue->submit(submits); + const ISemaphore::SWaitInfo wait[1] = { {.semaphore = sema.get(),.value = timelineValue} }; + m_device->blockForSemaphores(wait); + + auto endTime = std::chrono::high_resolution_clock::now(); + + return std::chrono::duration(endTime - startTime).count(); + } + IQueue* transferDownQueue; IQueue* computeQueue; smart_refctd_ptr m_spirv_isa_cache; smart_refctd_ptr m_spirv_isa_cache_output; + bool b_runTests = false; uint32_t* inputData = 
nullptr; constexpr static inline uint32_t OutputBufferCount = 8u; smart_refctd_ptr outputBuffers[OutputBufferCount]; From 0bb41db1de63dcaa5e0c0efc95d3ac37a4210d6b Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 2 Apr 2025 10:47:36 +0700 Subject: [PATCH 109/296] begin adding fake frames for nsight profiler --- 71_ArithmeticBench/main.cpp | 48 ++++++++++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/71_ArithmeticBench/main.cpp b/71_ArithmeticBench/main.cpp index 29f9ede8a..beb243b97 100644 --- a/71_ArithmeticBench/main.cpp +++ b/71_ArithmeticBench/main.cpp @@ -1,13 +1,14 @@ -#include "nbl/application_templates/BasicMultiQueueApplication.hpp" +#include "SimpleWindowedApplication.hpp" +#include "CEventCallback.hpp" #include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" #include "app_resources/common.hlsl" -#include - using namespace nbl; using namespace core; -using namespace asset; +using namespace hlsl; using namespace system; +using namespace asset; +using namespace ui; using namespace video; // method emulations on the CPU, to verify the results of the GPU methods @@ -47,15 +48,46 @@ struct emulatedScanExclusive static inline constexpr const char* name = "exclusive_scan"; }; -class ArithmeticBenchApp final : public application_templates::BasicMultiQueueApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication +class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication { - using device_base_t = application_templates::BasicMultiQueueApplication; + using device_base_t = examples::SimpleWindowedApplication; using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; + constexpr static inline uint32_t2 WindowDimensions = { 1280, 720 }; + constexpr static inline uint32_t MaxFramesInFlight = 5; + public: ArithmeticBenchApp(const path& 
_localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} + inline core::vector getSurfaces() const override + { + if (!m_surface) + { + { + auto windowCallback = core::make_smart_refctd_ptr(smart_refctd_ptr(m_inputSystem), smart_refctd_ptr(m_logger)); + IWindow::SCreationParams params = {}; + params.callback = core::make_smart_refctd_ptr(); + params.width = WindowDimensions.x; + params.height = WindowDimensions.y; + params.x = 32; + params.y = 32; + params.flags = ui::IWindow::ECF_HIDDEN | IWindow::ECF_BORDERLESS | IWindow::ECF_RESIZABLE; + params.windowCaption = "ComputeShaderPathtracer"; + params.callback = windowCallback; + const_cast&>(m_window) = m_winMgr->createWindow(std::move(params)); + } + + auto surface = CSurfaceVulkanWin32::create(smart_refctd_ptr(m_api), smart_refctd_ptr_static_cast(m_window)); + const_cast&>(m_surface) = nbl::video::CSimpleResizeSurface::create(std::move(surface)); + } + + if (m_surface) + return { {m_surface->getSurface()/*,EQF_NONE*/} }; + + return {}; + } + bool onAppInitialized(smart_refctd_ptr&& system) override { if (!device_base_t::onAppInitialized(std::move(system))) @@ -539,6 +571,10 @@ class ArithmeticBenchApp final : public application_templates::BasicMultiQueueAp smart_refctd_ptr m_spirv_isa_cache; smart_refctd_ptr m_spirv_isa_cache_output; + smart_refctd_ptr m_window; + smart_refctd_ptr> m_surface; + smart_refctd_ptr m_inputSystem; + bool b_runTests = false; uint32_t* inputData = nullptr; constexpr static inline uint32_t OutputBufferCount = 8u; From 391c3aca99f39c41f5b63db43559b3be6482a727 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 3 Apr 2025 17:23:37 +0700 Subject: [PATCH 110/296] fix ray query geometry to use IShader --- 67_RayQueryGeometry/app_resources/render.comp.hlsl | 1 + 67_RayQueryGeometry/main.cpp | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) 
diff --git a/67_RayQueryGeometry/app_resources/render.comp.hlsl b/67_RayQueryGeometry/app_resources/render.comp.hlsl index e3d78f385..b9323ac74 100644 --- a/67_RayQueryGeometry/app_resources/render.comp.hlsl +++ b/67_RayQueryGeometry/app_resources/render.comp.hlsl @@ -95,6 +95,7 @@ float3 calculateSmoothNormals(int instID, int primID, SGeomInfo geom, float2 bar } [numthreads(WorkgroupSize, WorkgroupSize, 1)] +[shader("compute")] void main(uint32_t3 threadID : SV_DispatchThreadID) { uint2 coords = threadID.xy; diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp index dab137cbd..c4c483263 100644 --- a/67_RayQueryGeometry/main.cpp +++ b/67_RayQueryGeometry/main.cpp @@ -164,9 +164,8 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu const auto assets = bundle.getContents(); assert(assets.size() == 1); - smart_refctd_ptr shaderSrc = IAsset::castDown(assets[0]); - shaderSrc->setShaderStage(IShader::E_SHADER_STAGE::ESS_COMPUTE); - auto shader = m_device->createShader(shaderSrc.get()); + const auto sourceRaw = smart_refctd_ptr_static_cast(assets[0]); + smart_refctd_ptr shader = m_device->compileShader({sourceRaw.get(), nullptr, nullptr, nullptr}); if (!shader) return logFail("Failed to create shader!"); @@ -203,6 +202,8 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu IGPUComputePipeline::SCreationParams params = {}; params.layout = pipelineLayout.get(); params.shader.shader = shader.get(); + params.shader.entryPoint = "main"; + params.shader.stage = ESS_COMPUTE; if (!m_device->createComputePipelines(nullptr, { ¶ms, 1 }, &renderPipeline)) return logFail("Failed to create compute pipeline"); } From 17dda8e2b8d5d3c2d3a7a853a3662b1c695bb145 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 7 Apr 2025 09:48:05 +0700 Subject: [PATCH 111/296] re-numbered example to avoid duplicate --- {71_ArithmeticBench => 73_ArithmeticBench}/CMakeLists.txt | 0 
.../app_resources/benchmarkSubgroup.comp.hlsl | 0 .../app_resources/common.hlsl | 0 .../app_resources/shaderCommon.hlsl | 0 .../app_resources/testSubgroup.comp.hlsl | 0 .../app_resources/testWorkgroup.comp.hlsl | 0 {71_ArithmeticBench => 73_ArithmeticBench}/config.json.template | 0 {71_ArithmeticBench => 73_ArithmeticBench}/main.cpp | 0 {71_ArithmeticBench => 73_ArithmeticBench}/pipeline.groovy | 0 CMakeLists.txt | 2 +- 10 files changed, 1 insertion(+), 1 deletion(-) rename {71_ArithmeticBench => 73_ArithmeticBench}/CMakeLists.txt (100%) rename {71_ArithmeticBench => 73_ArithmeticBench}/app_resources/benchmarkSubgroup.comp.hlsl (100%) rename {71_ArithmeticBench => 73_ArithmeticBench}/app_resources/common.hlsl (100%) rename {71_ArithmeticBench => 73_ArithmeticBench}/app_resources/shaderCommon.hlsl (100%) rename {71_ArithmeticBench => 73_ArithmeticBench}/app_resources/testSubgroup.comp.hlsl (100%) rename {71_ArithmeticBench => 73_ArithmeticBench}/app_resources/testWorkgroup.comp.hlsl (100%) rename {71_ArithmeticBench => 73_ArithmeticBench}/config.json.template (100%) rename {71_ArithmeticBench => 73_ArithmeticBench}/main.cpp (100%) rename {71_ArithmeticBench => 73_ArithmeticBench}/pipeline.groovy (100%) diff --git a/71_ArithmeticBench/CMakeLists.txt b/73_ArithmeticBench/CMakeLists.txt similarity index 100% rename from 71_ArithmeticBench/CMakeLists.txt rename to 73_ArithmeticBench/CMakeLists.txt diff --git a/71_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl b/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl similarity index 100% rename from 71_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl rename to 73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl diff --git a/71_ArithmeticBench/app_resources/common.hlsl b/73_ArithmeticBench/app_resources/common.hlsl similarity index 100% rename from 71_ArithmeticBench/app_resources/common.hlsl rename to 73_ArithmeticBench/app_resources/common.hlsl diff --git 
a/71_ArithmeticBench/app_resources/shaderCommon.hlsl b/73_ArithmeticBench/app_resources/shaderCommon.hlsl similarity index 100% rename from 71_ArithmeticBench/app_resources/shaderCommon.hlsl rename to 73_ArithmeticBench/app_resources/shaderCommon.hlsl diff --git a/71_ArithmeticBench/app_resources/testSubgroup.comp.hlsl b/73_ArithmeticBench/app_resources/testSubgroup.comp.hlsl similarity index 100% rename from 71_ArithmeticBench/app_resources/testSubgroup.comp.hlsl rename to 73_ArithmeticBench/app_resources/testSubgroup.comp.hlsl diff --git a/71_ArithmeticBench/app_resources/testWorkgroup.comp.hlsl b/73_ArithmeticBench/app_resources/testWorkgroup.comp.hlsl similarity index 100% rename from 71_ArithmeticBench/app_resources/testWorkgroup.comp.hlsl rename to 73_ArithmeticBench/app_resources/testWorkgroup.comp.hlsl diff --git a/71_ArithmeticBench/config.json.template b/73_ArithmeticBench/config.json.template similarity index 100% rename from 71_ArithmeticBench/config.json.template rename to 73_ArithmeticBench/config.json.template diff --git a/71_ArithmeticBench/main.cpp b/73_ArithmeticBench/main.cpp similarity index 100% rename from 71_ArithmeticBench/main.cpp rename to 73_ArithmeticBench/main.cpp diff --git a/71_ArithmeticBench/pipeline.groovy b/73_ArithmeticBench/pipeline.groovy similarity index 100% rename from 71_ArithmeticBench/pipeline.groovy rename to 73_ArithmeticBench/pipeline.groovy diff --git a/CMakeLists.txt b/CMakeLists.txt index e073141c5..22033c682 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -98,7 +98,7 @@ if(NBL_BUILD_EXAMPLES) add_subdirectory(70_FLIPFluids EXCLUDE_FROM_ALL) add_subdirectory(71_RayTracingPipeline EXCLUDE_FROM_ALL) - add_subdirectory(71_ArithmeticBench EXCLUDE_FROM_ALL) + add_subdirectory(73_ArithmeticBench EXCLUDE_FROM_ALL) NBL_HOOK_COMMON_API("${NBL_COMMON_API_TARGETS}") endif() From 3d4e0f2372a799045422a6e71ef7f1bceeed0adc Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 8 Apr 2025 14:04:57 +0700 Subject: [PATCH 112/296] 
fake frames for nsight --- .../app_resources/benchmarkSubgroup.comp.hlsl | 2 + 73_ArithmeticBench/main.cpp | 452 +++++++++++++++--- 2 files changed, 398 insertions(+), 56 deletions(-) diff --git a/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl b/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl index f3cc679ef..2815d1e38 100644 --- a/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl +++ b/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl @@ -4,6 +4,8 @@ #include "shaderCommon.hlsl" +[[vk::binding(2, 0)]] RWTexture2D outImage; // dummy + uint32_t globalIndex() { return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex(); diff --git a/73_ArithmeticBench/main.cpp b/73_ArithmeticBench/main.cpp index beb243b97..8e067e6cc 100644 --- a/73_ArithmeticBench/main.cpp +++ b/73_ArithmeticBench/main.cpp @@ -5,7 +5,6 @@ using namespace nbl; using namespace core; -using namespace hlsl; using namespace system; using namespace asset; using namespace ui; @@ -53,7 +52,8 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub using device_base_t = examples::SimpleWindowedApplication; using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; - constexpr static inline uint32_t2 WindowDimensions = { 1280, 720 }; + constexpr static inline uint32_t WIN_W = 1280; + constexpr static inline uint32_t WIN_H = 720; constexpr static inline uint32_t MaxFramesInFlight = 5; public: @@ -67,19 +67,19 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub { auto windowCallback = core::make_smart_refctd_ptr(smart_refctd_ptr(m_inputSystem), smart_refctd_ptr(m_logger)); IWindow::SCreationParams params = {}; - params.callback = core::make_smart_refctd_ptr(); - params.width = WindowDimensions.x; - params.height = WindowDimensions.y; + params.callback = core::make_smart_refctd_ptr(); + params.width = WIN_W; + params.height = 
WIN_H; params.x = 32; params.y = 32; params.flags = ui::IWindow::ECF_HIDDEN | IWindow::ECF_BORDERLESS | IWindow::ECF_RESIZABLE; - params.windowCaption = "ComputeShaderPathtracer"; + params.windowCaption = "ArithmeticBenchApp"; params.callback = windowCallback; const_cast&>(m_window) = m_winMgr->createWindow(std::move(params)); } auto surface = CSurfaceVulkanWin32::create(smart_refctd_ptr(m_api), smart_refctd_ptr_static_cast(m_window)); - const_cast&>(m_surface) = nbl::video::CSimpleResizeSurface::create(std::move(surface)); + const_cast&>(m_surface) = CSimpleResizeSurface::create(std::move(surface)); } if (m_surface) @@ -90,11 +90,38 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub bool onAppInitialized(smart_refctd_ptr&& system) override { + m_inputSystem = make_smart_refctd_ptr(logger_opt_smart_ptr(smart_refctd_ptr(m_logger))); + if (!device_base_t::onAppInitialized(std::move(system))) return false; if (!asset_base_t::onAppInitialized(std::move(system))) return false; + m_semaphore = m_device->createSemaphore(m_realFrameIx); + if (!m_semaphore) + return logFail("Failed to Create a Semaphore!"); + + ISwapchain::SCreationParams swapchainParams = { .surface = m_surface->getSurface() }; + if (!swapchainParams.deduceFormat(m_physicalDevice)) + return logFail("Could not choose a Surface Format for the Swapchain!"); + + auto graphicsQueue = getGraphicsQueue(); + if (!m_surface || !m_surface->init(graphicsQueue, std::make_unique(), swapchainParams.sharedParams)) + return logFail("Could not create Window & Surface or initialize the Surface!"); + + auto pool = m_device->createCommandPool(graphicsQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + + for (auto i = 0u; i < MaxFramesInFlight; i++) + { + if (!pool) + return logFail("Couldn't create Command Pool!"); + if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { m_cmdBufs.data() + i, 1 })) + return logFail("Couldn't create Command 
Buffer!"); + } + + m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H); + m_surface->recreateSwapchain(); + transferDownQueue = getTransferDownQueue(); computeQueue = getComputeQueue(); @@ -134,7 +161,24 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub assert(bufferMem.isValid()); } - // create Descriptor Set and Pipeline Layout + // create dummy image + dummyImg = m_device->createImage({ + { + .type = IGPUImage::ET_2D, + .samples = asset::ICPUImage::ESCF_1_BIT, + .format = asset::EF_R16G16B16A16_SFLOAT, + .extent = {WIN_W, WIN_H, 1}, + .mipLevels = 1, + .arrayLayers = 1, + .flags = IImage::ECF_NONE, + .usage = core::bitflag(asset::IImage::EUF_STORAGE_BIT) | asset::IImage::EUF_TRANSFER_SRC_BIT + } + }); + if (!dummyImg || !m_device->allocate(dummyImg->getMemoryReqs(), dummyImg.get()).isValid()) + return logFail("Could not create HDR Image"); + + // create Descriptor Sets and Pipeline Layouts + smart_refctd_ptr benchPplnLayout; { // create Descriptor Set Layout smart_refctd_ptr dsLayout; @@ -148,7 +192,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub // set and transient pool auto descPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE,{&dsLayout.get(),1}); - descriptorSet = descPool->createDescriptorSet(smart_refctd_ptr(dsLayout)); + testDs = descPool->createDescriptorSet(smart_refctd_ptr(dsLayout)); { IGPUDescriptorSet::SDescriptorInfo infos[1+OutputBufferCount]; infos[0].desc = gpuinputDataBuffer; @@ -158,18 +202,49 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub auto buff = outputBuffers[i - 1]; infos[i].info.buffer = { 0u,buff->getSize() }; infos[i].desc = std::move(buff); // save an atomic in the refcount - } IGPUDescriptorSet::SWriteDescriptorSet writes[2]; for (uint32_t i=0u; i<2; i++) - writes[i] = {descriptorSet.get(),i,0u,1u,infos+i}; + writes[i] = {testDs.get(),i,0u,1u,infos+i}; writes[1].count = OutputBufferCount; 
m_device->updateDescriptorSets(2, writes, 0u, nullptr); } + testPplnLayout = m_device->createPipelineLayout({}, std::move(dsLayout)); - pipelineLayout = m_device->createPipelineLayout({},std::move(dsLayout)); + + { + IGPUDescriptorSetLayout::SBinding binding[3]; + for (uint32_t i = 0u; i < 2; i++) + binding[i] = { {},i,IDescriptor::E_TYPE::ET_STORAGE_BUFFER,IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,IShader::E_SHADER_STAGE::ESS_COMPUTE,1u,nullptr }; + binding[1].count = OutputBufferCount; + binding[2] = { {},2,IDescriptor::E_TYPE::ET_STORAGE_IMAGE,IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT,IShader::E_SHADER_STAGE::ESS_COMPUTE,1u,nullptr }; + dsLayout = m_device->createDescriptorSetLayout(binding); + } + + benchPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, { &dsLayout.get(),1 }); + benchDs = benchPool->createDescriptorSet(smart_refctd_ptr(dsLayout)); + { + IGPUDescriptorSet::SDescriptorInfo infos[1 + OutputBufferCount]; + infos[0].desc = gpuinputDataBuffer; + infos[0].info.buffer = { 0u,gpuinputDataBuffer->getSize() }; + for (uint32_t i = 1u; i <= OutputBufferCount; i++) + { + auto buff = outputBuffers[i - 1]; + infos[i].info.buffer = { 0u,buff->getSize() }; + infos[i].desc = std::move(buff); // save an atomic in the refcount + } + // write swapchain image descriptor in loop + + IGPUDescriptorSet::SWriteDescriptorSet writes[2]; + for (uint32_t i = 0u; i < 2; i++) + writes[i] = { testDs.get(),i,0u,1u,infos + i }; + writes[1].count = OutputBufferCount; + + m_device->updateDescriptorSets(2, writes, 0u, nullptr); + } + benchPplnLayout = m_device->createPipelineLayout({}, std::move(dsLayout)); } const auto spirv_isa_cache_path = localOutputCWD/"spirv_isa_cache.bin"; @@ -226,6 +301,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub // now create or retrieve final resources to run our tests sema = m_device->createSemaphore(timelineValue); 
resultsBuffer = ICPUBuffer::create({ outputBuffers[0]->getSize() }); + smart_refctd_ptr cmdbuf; { smart_refctd_ptr cmdpool = m_device->createCommandPool(computeQueue->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{&cmdbuf,1})) @@ -244,10 +320,15 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize; if (b_runTests) - runTests(subgroupTestSource, elementCount, ItemsPerInvocation, MinSubgroupSize, MaxSubgroupSize, workgroupSizes); + { + runTests(cmdbuf.get(), subgroupTestSource, elementCount, ItemsPerInvocation, MinSubgroupSize, MaxSubgroupSize, workgroupSizes); - double time = runBenchmark(subgroupTestSource, elementCount, 5, 256, ItemsPerInvocation, NumLoops); - m_logger->log("Ran for %.3fms (disregard these numbers, profile in Nsight)", ILogger::ELL_INFO, time * 1000.0); + m_logger->log("==========Result==========", ILogger::ELL_INFO); + m_logger->log("Fail Count: %u", ILogger::ELL_INFO, totalFailCount); + } + + // for each variant, workgroup size etc. 
+ benchPipeline = createBenchmarkPipelines(subgroupTestSource, elementCount, hlsl::findMSB(MinSubgroupSize), workgroupSizes[0], ItemsPerInvocation); //for (auto subgroupSize = MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u) //{ @@ -274,22 +355,276 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub // } //} + m_winMgr->show(m_window.get()); + return true; } virtual bool onAppTerminated() override { - m_logger->log("==========Result==========", ILogger::ELL_INFO); - m_logger->log("Fail Count: %u", ILogger::ELL_INFO, totalFailCount); delete[] inputData; return true; } // the unit test is carried out on init - void workLoopBody() override {} + void workLoopBody() override + { + const auto resourceIx = m_realFrameIx % MaxFramesInFlight; + + const uint32_t framesInFlight = core::min(MaxFramesInFlight, m_surface->getMaxAcquiresInFlight()); + + if (m_realFrameIx >= framesInFlight) + { + const ISemaphore::SWaitInfo cbDonePending[] = + { + { + .semaphore = m_semaphore.get(), + .value = m_realFrameIx + 1 - framesInFlight + } + }; + if (m_device->blockForSemaphores(cbDonePending) != ISemaphore::WAIT_RESULT::SUCCESS) + return; + } + + m_currentImageAcquire = m_surface->acquireNextImage(); + if (!m_currentImageAcquire) + return; + + auto* const cmdbuf = m_cmdBufs.data()[resourceIx].get(); + cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); + cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + + // barrier transition to GENERAL + { + IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1]; + imageBarriers[0].barrier = { + .dep = { + .srcStageMask = PIPELINE_STAGE_FLAGS::NONE, + .srcAccessMask = ACCESS_FLAGS::NONE, + .dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, + .dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS + } + }; + imageBarriers[0].image = dummyImg.get(); + imageBarriers[0].subresourceRange = { + .aspectMask = IImage::EAF_COLOR_BIT, + .baseMipLevel 
= 0u, + .levelCount = 1u, + .baseArrayLayer = 0u, + .layerCount = 1u + }; + imageBarriers[0].oldLayout = IImage::LAYOUT::UNDEFINED; + imageBarriers[0].newLayout = IImage::LAYOUT::GENERAL; + + cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers }); + } + + // bind dummy image + IGPUImageView::SCreationParams viewParams = { + .flags = IGPUImageView::ECF_NONE, + .subUsages = IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT, + .image = dummyImg, + .viewType = IGPUImageView::ET_2D, + .format = dummyImg->getCreationParameters().format + }; + auto dummyImgView = m_device->createImageView(std::move(viewParams)); + + video::IGPUDescriptorSet::SDescriptorInfo dsInfo; + dsInfo.info.image.imageLayout = IImage::LAYOUT::GENERAL; + dsInfo.desc = dummyImgView; + + IGPUDescriptorSet::SWriteDescriptorSet dsWrites[1u] = + { + { + .dstSet = benchDs.get(), + .binding = 2u, + .arrayElement = 0u, + .count = 1u, + .info = &dsInfo, + } + }; + m_device->updateDescriptorSets(1u, dsWrites, 0u, nullptr); + + const uint32_t elementCount = Output<>::ScanElementCount; + const uint32_t ItemsPerInvocation = 4u; + const uint32_t NumLoops = 100000u; + const std::array workgroupSizes = { 256, 512, 1024 }; + // const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations; + const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize; + const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize; + + //{ + // auto startTime = std::chrono::high_resolution_clock::now(); + + // const IQueue::SSubmitInfo::SSemaphoreInfo signal[1] = { {.semaphore = sema.get(),.value = ++timelineValue} }; + // const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[1] = { {.cmdbuf = cmdbuf.get()} }; + // const IQueue::SSubmitInfo submits[1] = { {.commandBuffers = cmdbufs,.signalSemaphores = signal} }; + // computeQueue->submit(submits); + // const ISemaphore::SWaitInfo wait[1] = { {.semaphore = sema.get(),.value = timelineValue} }; + // 
m_device->blockForSemaphores(wait); + + // auto endTime = std::chrono::high_resolution_clock::now(); + //} + + double time = runBenchmark(cmdbuf, benchPipeline, elementCount, 5, 256, ItemsPerInvocation, NumLoops); + m_logger->log("Ran for %.3fms (disregard these numbers, profile in Nsight)", ILogger::ELL_INFO, time * 1000.0); + + + // blit + { + IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[2]; + imageBarriers[0].barrier = { + .dep = { + .srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, + .srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS, + .dstStageMask = PIPELINE_STAGE_FLAGS::BLIT_BIT, + .dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT + } + }; + imageBarriers[0].image = dummyImg.get(); + imageBarriers[0].subresourceRange = { + .aspectMask = IImage::EAF_COLOR_BIT, + .baseMipLevel = 0u, + .levelCount = 1u, + .baseArrayLayer = 0u, + .layerCount = 1u + }; + imageBarriers[0].oldLayout = IImage::LAYOUT::UNDEFINED; + imageBarriers[0].newLayout = IImage::LAYOUT::TRANSFER_SRC_OPTIMAL; + + imageBarriers[1].barrier = { + .dep = { + .srcStageMask = PIPELINE_STAGE_FLAGS::NONE, + .srcAccessMask = ACCESS_FLAGS::NONE, + .dstStageMask = PIPELINE_STAGE_FLAGS::BLIT_BIT, + .dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT + } + }; + imageBarriers[1].image = m_surface->getSwapchainResources()->getImage(m_currentImageAcquire.imageIndex); + imageBarriers[1].subresourceRange = { + .aspectMask = IImage::EAF_COLOR_BIT, + .baseMipLevel = 0u, + .levelCount = 1u, + .baseArrayLayer = 0u, + .layerCount = 1u + }; + imageBarriers[1].oldLayout = IImage::LAYOUT::UNDEFINED; + imageBarriers[1].newLayout = IImage::LAYOUT::TRANSFER_DST_OPTIMAL; + + cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers }); + } + + { + IGPUCommandBuffer::SImageBlit regions[] = { { + .srcMinCoord = {0,0,0}, + .srcMaxCoord = {WIN_W,WIN_H,1}, + .dstMinCoord = {0,0,0}, + .dstMaxCoord = {WIN_W,WIN_H,1}, + .layerCount = 1, + .srcBaseLayer = 0, + 
.dstBaseLayer = 0, + .srcMipLevel = 0, + .dstMipLevel = 0, + .aspectMask = IGPUImage::E_ASPECT_FLAGS::EAF_COLOR_BIT + } }; + + auto srcImg = dummyImg.get(); + auto scRes = static_cast(m_surface->getSwapchainResources()); + auto dstImg = scRes->getImage(m_currentImageAcquire.imageIndex); + + cmdbuf->blitImage(srcImg, IImage::LAYOUT::TRANSFER_SRC_OPTIMAL, dstImg, IImage::LAYOUT::TRANSFER_DST_OPTIMAL, regions, ISampler::ETF_NEAREST); + } + + // barrier transition to PRESENT + { + IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1]; + imageBarriers[0].barrier = { + .dep = { + .srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, + .srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS, + .dstStageMask = PIPELINE_STAGE_FLAGS::NONE, + .dstAccessMask = ACCESS_FLAGS::NONE + } + }; + imageBarriers[0].image = m_surface->getSwapchainResources()->getImage(m_currentImageAcquire.imageIndex); + imageBarriers[0].subresourceRange = { + .aspectMask = IImage::EAF_COLOR_BIT, + .baseMipLevel = 0u, + .levelCount = 1u, + .baseArrayLayer = 0u, + .layerCount = 1u + }; + imageBarriers[0].oldLayout = IImage::LAYOUT::TRANSFER_DST_OPTIMAL; + imageBarriers[0].newLayout = IImage::LAYOUT::PRESENT_SRC; + + cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers }); + } + + cmdbuf->end(); + + // submit + { + auto* queue = getGraphicsQueue(); + const IQueue::SSubmitInfo::SSemaphoreInfo rendered[] = + { + { + .semaphore = m_semaphore.get(), + .value = ++m_realFrameIx, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS + } + }; + { + { + const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] = + { + {.cmdbuf = cmdbuf } + }; + + const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = + { + { + .semaphore = m_currentImageAcquire.semaphore, + .value = m_currentImageAcquire.acquireCount, + .stageMask = PIPELINE_STAGE_FLAGS::NONE + } + }; + const IQueue::SSubmitInfo infos[] = + { + { + .waitSemaphores = acquired, + .commandBuffers = 
commandBuffers, + .signalSemaphores = rendered + } + }; + + if (queue->submit(infos) == IQueue::RESULT::SUCCESS) + { + const nbl::video::ISemaphore::SWaitInfo waitInfos[] = + { { + .semaphore = m_semaphore.get(), + .value = m_realFrameIx + } }; + + m_device->blockForSemaphores(waitInfos); // this is not solution, quick wa to not throw validation errors + } + else + --m_realFrameIx; + } + } + + std::string caption = "[Nabla Engine] Geometry Creator"; + { + caption += ", displaying [all objects]"; + m_window->setCaption(caption); + } + m_surface->present(m_currentImageAcquire.imageIndex, rendered); + } + + numSubmits++; + } // - bool keepRunning() override { return true; } + bool keepRunning() override { return numSubmits < MaxNumSubmits; } private: void logTestOutcome(bool passed, uint32_t workgroupSize) @@ -303,7 +638,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub } } - void runTests(smart_refctd_ptr subgroupTestSource, uint32_t elementCount, uint32_t ItemsPerInvocation, uint32_t MinSubgroupSize, uint32_t MaxSubgroupSize, const std::array& workgroupSizes) + void runTests(IGPUCommandBuffer* cmdbuf, smart_refctd_ptr subgroupTestSource, uint32_t elementCount, uint32_t ItemsPerInvocation, uint32_t MinSubgroupSize, uint32_t MaxSubgroupSize, const std::array& workgroupSizes) { for (auto subgroupSize = MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u) { @@ -316,11 +651,11 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub bool passed = true; // TODO async the testing - passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, ItemsPerInvocation) && passed; + passed = runTest(cmdbuf, subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, ItemsPerInvocation) && passed; logTestOutcome(passed, workgroupSize); - passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, ItemsPerInvocation) && passed; + passed 
= runTest(cmdbuf, subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, ItemsPerInvocation) && passed; logTestOutcome(passed, workgroupSize); - passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, ItemsPerInvocation) && passed; + passed = runTest(cmdbuf, subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, ItemsPerInvocation) && passed; logTestOutcome(passed, workgroupSize); //for (uint32_t itemsPerWG = workgroupSize; itemsPerWG > workgroupSize - subgroupSize; itemsPerWG--) //{ @@ -353,7 +688,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub { auto shader = m_device->createShader(overridenUnspecialized); IGPUComputePipeline::SCreationParams params = {}; - params.layout = pipelineLayout.get(); + params.layout = testPplnLayout.get(); params.shader = { .entryPoint = "main", .shader = shader.get(), @@ -367,10 +702,22 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub return pipeline; } + template class Arithmetic> + smart_refctd_ptr createBenchmarkPipelines(const smart_refctd_ptr&source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerInvoc = 1u) + { + std::string arith_name = Arithmetic>::name; // TODO all operations + + smart_refctd_ptr overridenUnspecialized = CHLSLCompiler::createOverridenCopy( + source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n#define ITEMS_PER_INVOCATION %d\n#define SUBGROUP_SIZE_LOG2 %d\n", + (("subgroup2::") + arith_name).c_str(), workgroupSize, itemsPerInvoc, subgroupSizeLog2 + ); + return createPipeline(overridenUnspecialized.get(), subgroupSizeLog2); + }; + template class Arithmetic, bool WorkgroupTest> - bool runTest(const smart_refctd_ptr& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u, uint32_t itemsPerInvoc = 1u) + bool runTest(IGPUCommandBuffer* 
cmdbuf, const smart_refctd_ptr& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u, uint32_t itemsPerInvoc = 1u) { - std::string arith_name = Arithmetic>::name; + std::string arith_name = Arithmetic>::name; smart_refctd_ptr overridenUnspecialized; //if constexpr (WorkgroupTest) @@ -394,7 +741,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub const uint32_t workgroupCount = elementCount / (itemsPerWG * itemsPerInvoc); cmdbuf->begin(IGPUCommandBuffer::USAGE::NONE); cmdbuf->bindComputePipeline(pipeline.get()); - cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &descriptorSet.get()); + cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &testDs.get()); cmdbuf->dispatch(workgroupCount, 1, 1); { IGPUCommandBuffer::SPipelineBarrierDependencyInfo::buffer_barrier_t memoryBarrier[OutputBufferCount]; @@ -419,7 +766,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub cmdbuf->end(); const IQueue::SSubmitInfo::SSemaphoreInfo signal[1] = {{.semaphore=sema.get(),.value=++timelineValue}}; - const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[1] = {{.cmdbuf=cmdbuf.get()}}; + const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[1] = {{.cmdbuf=cmdbuf}}; const IQueue::SSubmitInfo submits[1] = {{.commandBuffers=cmdbufs,.signalSemaphores=signal}}; computeQueue->submit(submits); const ISemaphore::SWaitInfo wait[1] = {{.semaphore=sema.get(),.value=timelineValue}}; @@ -514,21 +861,12 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub template class Arithmetic> - double runBenchmark(const smart_refctd_ptr& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerInvoc = 1u, uint32_t numLoops = 8u) + bool runBenchmark(IGPUCommandBuffer* cmdbuf, const smart_refctd_ptr& pipeline, const uint32_t elementCount, const 
uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerInvoc = 1u, uint32_t numLoops = 8u) { - std::string arith_name = Arithmetic>::name; - - smart_refctd_ptr overridenUnspecialized = CHLSLCompiler::createOverridenCopy( - source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n#define ITEMS_PER_INVOCATION %d\n#define SUBGROUP_SIZE_LOG2 %d\n", - (("subgroup2::") + arith_name).c_str(), workgroupSize, itemsPerInvoc, subgroupSizeLog2 - ); - auto pipeline = createPipeline(overridenUnspecialized.get(), subgroupSizeLog2); - const uint32_t workgroupCount = elementCount / (workgroupSize * itemsPerInvoc); - cmdbuf->begin(IGPUCommandBuffer::USAGE::NONE); cmdbuf->bindComputePipeline(pipeline.get()); - cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &descriptorSet.get()); + cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &testDs.get()); cmdbuf->dispatch(workgroupCount, 1, 1); { IGPUCommandBuffer::SPipelineBarrierDependencyInfo::buffer_barrier_t memoryBarrier[OutputBufferCount]; @@ -550,20 +888,8 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = { .memBarriers = {},.bufBarriers = memoryBarrier }; cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE, info); } - cmdbuf->end(); - - auto startTime = std::chrono::high_resolution_clock::now(); - - const IQueue::SSubmitInfo::SSemaphoreInfo signal[1] = { {.semaphore = sema.get(),.value = ++timelineValue} }; - const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[1] = { {.cmdbuf = cmdbuf.get()} }; - const IQueue::SSubmitInfo submits[1] = { {.commandBuffers = cmdbufs,.signalSemaphores = signal} }; - computeQueue->submit(submits); - const ISemaphore::SWaitInfo wait[1] = { {.semaphore = sema.get(),.value = timelineValue} }; - m_device->blockForSemaphores(wait); - - auto endTime = std::chrono::high_resolution_clock::now(); - return std::chrono::duration(endTime - 
startTime).count(); + return true; } IQueue* transferDownQueue; @@ -572,19 +898,33 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub smart_refctd_ptr m_spirv_isa_cache_output; smart_refctd_ptr m_window; - smart_refctd_ptr> m_surface; + smart_refctd_ptr> m_surface; + smart_refctd_ptr m_semaphore; + uint64_t m_realFrameIx = 0; + std::array, MaxFramesInFlight> m_cmdBufs; + ISimpleManagedSurface::SAcquireResult m_currentImageAcquire = {}; + smart_refctd_ptr m_inputSystem; + smart_refctd_ptr dummyImg; + + smart_refctd_ptr benchPipeline; // TODO array + smart_refctd_ptr benchPool; + smart_refctd_ptr benchDs; + + smart_refctd_ptr testDs; + smart_refctd_ptr testPplnLayout; + + constexpr static inline uint32_t MaxNumSubmits = 30; + uint32_t numSubmits = 0; + bool b_runTests = false; uint32_t* inputData = nullptr; constexpr static inline uint32_t OutputBufferCount = 8u; smart_refctd_ptr outputBuffers[OutputBufferCount]; - smart_refctd_ptr descriptorSet; - smart_refctd_ptr pipelineLayout; smart_refctd_ptr sema; uint64_t timelineValue = 0; - smart_refctd_ptr cmdbuf; smart_refctd_ptr resultsBuffer; uint32_t totalFailCount = 0; From 019299994a9969f4d542a9769aa23b2bd5076318 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 8 Apr 2025 15:14:31 +0700 Subject: [PATCH 113/296] use correct shader, spirv line dbinfo for nsight --- 73_ArithmeticBench/main.cpp | 53 ++++++++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 7 deletions(-) diff --git a/73_ArithmeticBench/main.cpp b/73_ArithmeticBench/main.cpp index 8e067e6cc..29a38d2eb 100644 --- a/73_ArithmeticBench/main.cpp +++ b/73_ArithmeticBench/main.cpp @@ -297,6 +297,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub }; auto subgroupTestSource = getShaderSource("app_resources/testSubgroup.comp.hlsl"); + auto subgroupBenchSource = getShaderSource("app_resources/benchmarkSubgroup.comp.hlsl"); //auto workgroupTestSource = 
getShaderSource("app_resources/testWorkgroup.comp.hlsl"); // now create or retrieve final resources to run our tests sema = m_device->createSemaphore(timelineValue); @@ -313,7 +314,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub // TODO variable items per invocation? const uint32_t ItemsPerInvocation = 4u; - const uint32_t NumLoops = 100000u; + const uint32_t NumLoops = 1000u; const std::array workgroupSizes = { 256, 512, 1024 }; // const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations; const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize; @@ -328,7 +329,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub } // for each variant, workgroup size etc. - benchPipeline = createBenchmarkPipelines(subgroupTestSource, elementCount, hlsl::findMSB(MinSubgroupSize), workgroupSizes[0], ItemsPerInvocation); + benchPipeline = createBenchmarkPipelines(subgroupBenchSource, elementCount, hlsl::findMSB(MinSubgroupSize), workgroupSizes[0], ItemsPerInvocation, NumLoops); //for (auto subgroupSize = MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u) //{ @@ -703,14 +704,52 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub } template class Arithmetic> - smart_refctd_ptr createBenchmarkPipelines(const smart_refctd_ptr&source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerInvoc = 1u) + smart_refctd_ptr createBenchmarkPipelines(const smart_refctd_ptr&source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerInvoc = 1u, uint32_t numLoops = 8u) { std::string arith_name = Arithmetic>::name; // TODO all operations - smart_refctd_ptr overridenUnspecialized = CHLSLCompiler::createOverridenCopy( - source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n#define ITEMS_PER_INVOCATION %d\n#define 
SUBGROUP_SIZE_LOG2 %d\n", - (("subgroup2::") + arith_name).c_str(), workgroupSize, itemsPerInvoc, subgroupSizeLog2 - ); + //smart_refctd_ptr overridenUnspecialized = CHLSLCompiler::createOverridenCopy( + // source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n#define ITEMS_PER_INVOCATION %d\n#define SUBGROUP_SIZE_LOG2 %d\n", + // (("subgroup2::") + arith_name).c_str(), workgroupSize, itemsPerInvoc, subgroupSizeLog2 + //); + + auto compiler = make_smart_refctd_ptr(smart_refctd_ptr(m_system)); + CHLSLCompiler::SOptions options = {}; + options.stage = IShader::E_SHADER_STAGE::ESS_COMPUTE; + options.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion; + options.spirvOptimizer = nullptr; +//#ifndef _NBL_DEBUG +// ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO; +// auto opt = make_smart_refctd_ptr(std::span(&optPasses, 1)); +// options.spirvOptimizer = opt.get(); +//#endif + options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_LINE_BIT; + options.preprocessorOptions.sourceIdentifier = source->getFilepathHint(); + options.preprocessorOptions.logger = m_logger.get(); + + auto* includeFinder = compiler->getDefaultIncludeFinder(); + includeFinder->addSearchPath("nbl/builtin/hlsl/jit", core::make_smart_refctd_ptr(m_physicalDevice->getLimits(), m_device->getEnabledFeatures())); + options.preprocessorOptions.includeFinder = includeFinder; + + const std::string definitions[5] = { + "subgroup2::" + arith_name, + std::to_string(workgroupSize), + std::to_string(itemsPerInvoc), + std::to_string(subgroupSizeLog2), + std::to_string(numLoops) + }; + + const IShaderCompiler::SMacroDefinition defines[5] = { + { "OPERATION", definitions[0] }, + { "WORKGROUP_SIZE", definitions[1] }, + { "ITEMS_PER_INVOCATION", definitions[2] }, + { "SUBGROUP_SIZE_LOG2", definitions[3] }, + { "NUM_LOOPS", definitions[4] }, + }; + options.preprocessorOptions.extraDefines = { defines, defines + 5 }; + + smart_refctd_ptr 
overridenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options); + return createPipeline(overridenUnspecialized.get(), subgroupSizeLog2); }; From 8c9d55e6233d0f50f99403835e100b8aba799bca Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 8 Apr 2025 17:02:09 +0700 Subject: [PATCH 114/296] support for 1 item per invoc --- .../app_resources/shaderCommon.hlsl | 6 + 73_ArithmeticBench/main.cpp | 111 ++++++++---------- 2 files changed, 58 insertions(+), 59 deletions(-) diff --git a/73_ArithmeticBench/app_resources/shaderCommon.hlsl b/73_ArithmeticBench/app_resources/shaderCommon.hlsl index fa3713c44..7d25b98ee 100644 --- a/73_ArithmeticBench/app_resources/shaderCommon.hlsl +++ b/73_ArithmeticBench/app_resources/shaderCommon.hlsl @@ -2,6 +2,7 @@ #include "nbl/builtin/hlsl/glsl_compat/core.hlsl" #include "nbl/builtin/hlsl/subgroup/basic.hlsl" +#include "nbl/builtin/hlsl/subgroup/arithmetic_portability.hlsl" #include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" #include "nbl/builtin/hlsl/jit/device_capabilities.hlsl" @@ -25,7 +26,12 @@ bool canStore(); //typedef decltype(inputValue[0]) type_t; //typedef uint32_t type_t; //typedef uint32_t4 type_t; + +#if ITEMS_PER_INVOCATION > 1 typedef vector type_t; +#else +typedef uint32_t type_t; +#endif #ifndef OPERATION diff --git a/73_ArithmeticBench/main.cpp b/73_ArithmeticBench/main.cpp index 29a38d2eb..276efbd18 100644 --- a/73_ArithmeticBench/main.cpp +++ b/73_ArithmeticBench/main.cpp @@ -214,17 +214,18 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub testPplnLayout = m_device->createPipelineLayout({}, std::move(dsLayout)); + smart_refctd_ptr benchLayout; { IGPUDescriptorSetLayout::SBinding binding[3]; for (uint32_t i = 0u; i < 2; i++) binding[i] = { {},i,IDescriptor::E_TYPE::ET_STORAGE_BUFFER,IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,IShader::E_SHADER_STAGE::ESS_COMPUTE,1u,nullptr }; binding[1].count = 
OutputBufferCount; binding[2] = { {},2,IDescriptor::E_TYPE::ET_STORAGE_IMAGE,IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT,IShader::E_SHADER_STAGE::ESS_COMPUTE,1u,nullptr }; - dsLayout = m_device->createDescriptorSetLayout(binding); + benchLayout = m_device->createDescriptorSetLayout(binding); } - benchPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, { &dsLayout.get(),1 }); - benchDs = benchPool->createDescriptorSet(smart_refctd_ptr(dsLayout)); + benchPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, { &benchLayout.get(),1 }); + benchDs = benchPool->createDescriptorSet(smart_refctd_ptr(benchLayout)); { IGPUDescriptorSet::SDescriptorInfo infos[1 + OutputBufferCount]; infos[0].desc = gpuinputDataBuffer; @@ -239,12 +240,12 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub IGPUDescriptorSet::SWriteDescriptorSet writes[2]; for (uint32_t i = 0u; i < 2; i++) - writes[i] = { testDs.get(),i,0u,1u,infos + i }; + writes[i] = { benchDs.get(),i,0u,1u,infos + i }; writes[1].count = OutputBufferCount; m_device->updateDescriptorSets(2, writes, 0u, nullptr); } - benchPplnLayout = m_device->createPipelineLayout({}, std::move(dsLayout)); + benchPplnLayout = m_device->createPipelineLayout({}, std::move(benchLayout)); } const auto spirv_isa_cache_path = localOutputCWD/"spirv_isa_cache.bin"; @@ -313,7 +314,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub } // TODO variable items per invocation? 
- const uint32_t ItemsPerInvocation = 4u; const uint32_t NumLoops = 1000u; const std::array workgroupSizes = { 256, 512, 1024 }; // const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations; @@ -328,33 +328,9 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub m_logger->log("Fail Count: %u", ILogger::ELL_INFO, totalFailCount); } - // for each variant, workgroup size etc. - benchPipeline = createBenchmarkPipelines(subgroupBenchSource, elementCount, hlsl::findMSB(MinSubgroupSize), workgroupSizes[0], ItemsPerInvocation, NumLoops); - - //for (auto subgroupSize = MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u) - //{ - // const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize); - // for (const auto& workgroupSize : workgroupSizes) - // { - // passed = runBenchmark(subgroupTestSource, queryPool, elementCount, subgroupSizeLog2, workgroupSize, ItemsPerInvocation, NumLoops) && passed; - // logTestOutcome(passed, workgroupSize); - // passed = runBenchmark(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ItemsPerInvocation, NumLoops) && passed; - // logTestOutcome(passed, workgroupSize); - // passed = runBenchmark(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ItemsPerInvocation, NumLoops) && passed; - // logTestOutcome(passed, workgroupSize); - - // // save cache every now and then - // { - // auto cpu = m_spirv_isa_cache->convertToCPUCache(); - // // Normally we'd beautifully JSON serialize the thing, allow multiple devices & drivers + metadata - // auto bin = cpu->getEntries().begin()->second.bin; - // IFile::success_t success; - // m_spirv_isa_cache_output->write(success, bin->data(), 0ull, bin->size()); - // if (!success) - // logFail("Could not write Create SPIR-V to ISA cache to disk!"); - // } - // } - //} + // for each workgroup size (manually adjust items per invoc, operation else uses up a lot of ram) + for (uint32_t i = 0; i < 
workgroupSizes.size(); i++) + benchSets[i] = createBenchmarkPipelines(subgroupBenchSource, benchPplnLayout.get(), elementCount, hlsl::findMSB(MinSubgroupSize), workgroupSizes[i], ItemsPerInvocation, NumLoops); m_winMgr->show(m_window.get()); @@ -447,9 +423,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub m_device->updateDescriptorSets(1u, dsWrites, 0u, nullptr); const uint32_t elementCount = Output<>::ScanElementCount; - const uint32_t ItemsPerInvocation = 4u; - const uint32_t NumLoops = 100000u; - const std::array workgroupSizes = { 256, 512, 1024 }; // const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations; const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize; const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize; @@ -467,8 +440,10 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub // auto endTime = std::chrono::high_resolution_clock::now(); //} - double time = runBenchmark(cmdbuf, benchPipeline, elementCount, 5, 256, ItemsPerInvocation, NumLoops); - m_logger->log("Ran for %.3fms (disregard these numbers, profile in Nsight)", ILogger::ELL_INFO, time * 1000.0); + double t0 = runBenchmark(cmdbuf, benchSets[0], elementCount, 5); + double t1 = runBenchmark(cmdbuf, benchSets[1], elementCount, 5); + double t2 = runBenchmark(cmdbuf, benchSets[2], elementCount, 5); + m_logger->log("Ran for %.3fms (disregard these numbers, profile in Nsight)", ILogger::ELL_INFO, t0 * 1000.0); // blit @@ -639,7 +614,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub } } - void runTests(IGPUCommandBuffer* cmdbuf, smart_refctd_ptr subgroupTestSource, uint32_t elementCount, uint32_t ItemsPerInvocation, uint32_t MinSubgroupSize, uint32_t MaxSubgroupSize, const std::array& workgroupSizes) + void runTests(IGPUCommandBuffer* cmdbuf, smart_refctd_ptr subgroupTestSource, uint32_t elementCount, uint32_t 
itemsPerInvocation, uint32_t MinSubgroupSize, uint32_t MaxSubgroupSize, const std::array& workgroupSizes) { for (auto subgroupSize = MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u) { @@ -652,11 +627,11 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub bool passed = true; // TODO async the testing - passed = runTest(cmdbuf, subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, ItemsPerInvocation) && passed; + passed = runTest(cmdbuf, subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, itemsPerInvocation) && passed; logTestOutcome(passed, workgroupSize); - passed = runTest(cmdbuf, subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, ItemsPerInvocation) && passed; + passed = runTest(cmdbuf, subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, itemsPerInvocation) && passed; logTestOutcome(passed, workgroupSize); - passed = runTest(cmdbuf, subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, ItemsPerInvocation) && passed; + passed = runTest(cmdbuf, subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, itemsPerInvocation) && passed; logTestOutcome(passed, workgroupSize); //for (uint32_t itemsPerWG = workgroupSize; itemsPerWG > workgroupSize - subgroupSize; itemsPerWG--) //{ @@ -685,11 +660,11 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub } // create pipeline (specialized every test) [TODO: turn into a future/async] - smart_refctd_ptr createPipeline(const ICPUShader* overridenUnspecialized, const uint8_t subgroupSizeLog2) + smart_refctd_ptr createPipeline(const ICPUShader* overridenUnspecialized, const IGPUPipelineLayout* layout, const uint8_t subgroupSizeLog2) { auto shader = m_device->createShader(overridenUnspecialized); IGPUComputePipeline::SCreationParams params = {}; - params.layout = testPplnLayout.get(); + params.layout = layout; params.shader = { .entryPoint = 
"main", .shader = shader.get(), @@ -703,8 +678,15 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub return pipeline; } + struct BenchmarkSet + { + smart_refctd_ptr pipeline; + uint32_t workgroupSize; + uint32_t itemsPerInvocation; + }; + template class Arithmetic> - smart_refctd_ptr createBenchmarkPipelines(const smart_refctd_ptr&source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerInvoc = 1u, uint32_t numLoops = 8u) + BenchmarkSet createBenchmarkPipelines(const smart_refctd_ptr&source, const IGPUPipelineLayout* layout, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerInvoc = 1u, uint32_t numLoops = 8u) { std::string arith_name = Arithmetic>::name; // TODO all operations @@ -731,26 +713,35 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub includeFinder->addSearchPath("nbl/builtin/hlsl/jit", core::make_smart_refctd_ptr(m_physicalDevice->getLimits(), m_device->getEnabledFeatures())); options.preprocessorOptions.includeFinder = includeFinder; - const std::string definitions[5] = { + const std::string definitions[6] = { "subgroup2::" + arith_name, + "subgroup::" + arith_name, std::to_string(workgroupSize), std::to_string(itemsPerInvoc), std::to_string(subgroupSizeLog2), std::to_string(numLoops) }; - const IShaderCompiler::SMacroDefinition defines[5] = { - { "OPERATION", definitions[0] }, - { "WORKGROUP_SIZE", definitions[1] }, - { "ITEMS_PER_INVOCATION", definitions[2] }, - { "SUBGROUP_SIZE_LOG2", definitions[3] }, - { "NUM_LOOPS", definitions[4] }, + const IShaderCompiler::SMacroDefinition defines[6] = { + { "OPERATION", ItemsPerInvocation > 1 ? 
definitions[0] : definitions[1] }, + { "WORKGROUP_SIZE", definitions[2] }, + { "ITEMS_PER_INVOCATION", definitions[3] }, + { "SUBGROUP_SIZE_LOG2", definitions[4] }, + { "NUM_LOOPS", definitions[5] }, }; - options.preprocessorOptions.extraDefines = { defines, defines + 5 }; + //if (b_useOldSubgroups) + // options.preprocessorOptions.extraDefines = { defines, defines + 6 }; + //else + options.preprocessorOptions.extraDefines = { defines, defines + 5 }; smart_refctd_ptr overridenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options); - return createPipeline(overridenUnspecialized.get(), subgroupSizeLog2); + BenchmarkSet set; + set.pipeline = createPipeline(overridenUnspecialized.get(), layout, subgroupSizeLog2); + set.workgroupSize = workgroupSize; + set.itemsPerInvocation = itemsPerInvoc; + + return set; }; template class Arithmetic, bool WorkgroupTest> @@ -774,7 +765,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub (("subgroup2::") + arith_name).c_str(), workgroupSize, itemsPerInvoc, subgroupSizeLog2 ); //} - auto pipeline = createPipeline(overridenUnspecialized.get(),subgroupSizeLog2); + auto pipeline = createPipeline(overridenUnspecialized.get(),testPplnLayout.get(), subgroupSizeLog2); // TODO: overlap dispatches with memory readbacks (requires multiple copies of `buffers`) const uint32_t workgroupCount = elementCount / (itemsPerWG * itemsPerInvoc); @@ -900,12 +891,12 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub template class Arithmetic> - bool runBenchmark(IGPUCommandBuffer* cmdbuf, const smart_refctd_ptr& pipeline, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerInvoc = 1u, uint32_t numLoops = 8u) + bool runBenchmark(IGPUCommandBuffer* cmdbuf, const BenchmarkSet& set, const uint32_t elementCount, const uint8_t subgroupSizeLog2) { - const uint32_t workgroupCount = elementCount / 
(workgroupSize * itemsPerInvoc); + const uint32_t workgroupCount = elementCount / (set.workgroupSize * set.itemsPerInvocation); - cmdbuf->bindComputePipeline(pipeline.get()); - cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &testDs.get()); + cmdbuf->bindComputePipeline(set.pipeline.get()); + cmdbuf->bindDescriptorSets(EPBP_COMPUTE, set.pipeline->getLayout(), 0u, 1u, &benchDs.get()); cmdbuf->dispatch(workgroupCount, 1, 1); { IGPUCommandBuffer::SPipelineBarrierDependencyInfo::buffer_barrier_t memoryBarrier[OutputBufferCount]; @@ -947,6 +938,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub smart_refctd_ptr dummyImg; + std::array benchSets; smart_refctd_ptr benchPipeline; // TODO array smart_refctd_ptr benchPool; smart_refctd_ptr benchDs; @@ -959,6 +951,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub bool b_runTests = false; uint32_t* inputData = nullptr; + uint32_t ItemsPerInvocation = 4u; constexpr static inline uint32_t OutputBufferCount = 8u; smart_refctd_ptr outputBuffers[OutputBufferCount]; From 07d6980cacbc8e646de2e622405a365dc47dd961 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 9 Apr 2025 14:48:31 +0700 Subject: [PATCH 115/296] handle when items per invoc =1 --- .../app_resources/benchmarkSubgroup.comp.hlsl | 6 ++- .../app_resources/shaderCommon.hlsl | 4 ++ 73_ArithmeticBench/main.cpp | 44 ++++++------------- 3 files changed, 23 insertions(+), 31 deletions(-) diff --git a/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl b/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl index 2815d1e38..fb9f5e8c7 100644 --- a/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl +++ b/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl @@ -42,7 +42,7 @@ static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal) type_t value = sourceVal; operation_t func; - [unroll] + // [unroll] for (uint32_t i = 0; i < NUM_LOOPS; i++) value = 
func(value); @@ -53,11 +53,15 @@ void benchmark() { const uint32_t idx = globalIndex() * ITEMS_PER_INVOCATION; type_t sourceVal; +#if ITEMS_PER_INVOCATION > 1 [unroll] for (uint32_t i = 0; i < ITEMS_PER_INVOCATION; i++) { sourceVal[i] = inputValue[idx + i]; } +#else + sourceVal = inputValue[idx]; +#endif subbench(sourceVal); subbench(sourceVal); diff --git a/73_ArithmeticBench/app_resources/shaderCommon.hlsl b/73_ArithmeticBench/app_resources/shaderCommon.hlsl index 7d25b98ee..5cb1f3cf1 100644 --- a/73_ArithmeticBench/app_resources/shaderCommon.hlsl +++ b/73_ArithmeticBench/app_resources/shaderCommon.hlsl @@ -62,11 +62,15 @@ type_t test() { const uint32_t idx = globalIndex() * ITEMS_PER_INVOCATION; type_t sourceVal; +#if ITEMS_PER_INVOCATION > 1 [unroll] for (uint32_t i = 0; i < ITEMS_PER_INVOCATION; i++) { sourceVal[i] = inputValue[idx + i]; } +#else + sourceVal = inputValue[idx]; +#endif subtest(sourceVal); subtest(sourceVal); diff --git a/73_ArithmeticBench/main.cpp b/73_ArithmeticBench/main.cpp index 276efbd18..5ddd0cf6b 100644 --- a/73_ArithmeticBench/main.cpp +++ b/73_ArithmeticBench/main.cpp @@ -330,7 +330,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub // for each workgroup size (manually adjust items per invoc, operation else uses up a lot of ram) for (uint32_t i = 0; i < workgroupSizes.size(); i++) - benchSets[i] = createBenchmarkPipelines(subgroupBenchSource, benchPplnLayout.get(), elementCount, hlsl::findMSB(MinSubgroupSize), workgroupSizes[i], ItemsPerInvocation, NumLoops); + benchSets[i] = createBenchmarkPipelines(subgroupBenchSource, benchPplnLayout.get(), elementCount, hlsl::findMSB(MinSubgroupSize), workgroupSizes[i], ItemsPerInvocation, NumLoops); m_winMgr->show(m_window.get()); @@ -423,27 +423,15 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub m_device->updateDescriptorSets(1u, dsWrites, 0u, nullptr); const uint32_t elementCount = Output<>::ScanElementCount; - // const auto 
MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations; const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize; const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize; - //{ - // auto startTime = std::chrono::high_resolution_clock::now(); - - // const IQueue::SSubmitInfo::SSemaphoreInfo signal[1] = { {.semaphore = sema.get(),.value = ++timelineValue} }; - // const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[1] = { {.cmdbuf = cmdbuf.get()} }; - // const IQueue::SSubmitInfo submits[1] = { {.commandBuffers = cmdbufs,.signalSemaphores = signal} }; - // computeQueue->submit(submits); - // const ISemaphore::SWaitInfo wait[1] = { {.semaphore = sema.get(),.value = timelineValue} }; - // m_device->blockForSemaphores(wait); + const auto SubgroupSizeLog2 = hlsl::findMSB(MinSubgroupSize); - // auto endTime = std::chrono::high_resolution_clock::now(); - //} - - double t0 = runBenchmark(cmdbuf, benchSets[0], elementCount, 5); - double t1 = runBenchmark(cmdbuf, benchSets[1], elementCount, 5); - double t2 = runBenchmark(cmdbuf, benchSets[2], elementCount, 5); - m_logger->log("Ran for %.3fms (disregard these numbers, profile in Nsight)", ILogger::ELL_INFO, t0 * 1000.0); + bool passed = true; + passed = runBenchmark(cmdbuf, benchSets[0], elementCount, SubgroupSizeLog2); + passed = runBenchmark(cmdbuf, benchSets[1], elementCount, SubgroupSizeLog2); + passed = runBenchmark(cmdbuf, benchSets[2], elementCount, SubgroupSizeLog2); // blit @@ -713,26 +701,22 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub includeFinder->addSearchPath("nbl/builtin/hlsl/jit", core::make_smart_refctd_ptr(m_physicalDevice->getLimits(), m_device->getEnabledFeatures())); options.preprocessorOptions.includeFinder = includeFinder; - const std::string definitions[6] = { + const std::string definitions[5] = { "subgroup2::" + arith_name, - "subgroup::" + arith_name, std::to_string(workgroupSize), 
std::to_string(itemsPerInvoc), std::to_string(subgroupSizeLog2), std::to_string(numLoops) }; - const IShaderCompiler::SMacroDefinition defines[6] = { - { "OPERATION", ItemsPerInvocation > 1 ? definitions[0] : definitions[1] }, - { "WORKGROUP_SIZE", definitions[2] }, - { "ITEMS_PER_INVOCATION", definitions[3] }, - { "SUBGROUP_SIZE_LOG2", definitions[4] }, - { "NUM_LOOPS", definitions[5] }, + const IShaderCompiler::SMacroDefinition defines[5] = { + { "OPERATION", definitions[0] }, + { "WORKGROUP_SIZE", definitions[1] }, + { "ITEMS_PER_INVOCATION", definitions[2] }, + { "SUBGROUP_SIZE_LOG2", definitions[3] }, + { "NUM_LOOPS", definitions[4] }, }; - //if (b_useOldSubgroups) - // options.preprocessorOptions.extraDefines = { defines, defines + 6 }; - //else - options.preprocessorOptions.extraDefines = { defines, defines + 5 }; + options.preprocessorOptions.extraDefines = { defines, defines + 5 }; smart_refctd_ptr overridenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options); From be756d56f66a94e608f43f3ad98c43a6d8557f43 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 10 Apr 2025 11:49:19 +0700 Subject: [PATCH 116/296] minor fixes --- .../app_resources/benchmarkSubgroup.comp.hlsl | 1 + 73_ArithmeticBench/main.cpp | 17 ++++++++++------- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl b/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl index fb9f5e8c7..0b6a7e3c4 100644 --- a/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl +++ b/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl @@ -4,6 +4,7 @@ #include "shaderCommon.hlsl" +// NOTE added dummy output image to be able to profile with Nsight, which still doesn't support profiling headless compute shaders [[vk::binding(2, 0)]] RWTexture2D outImage; // dummy uint32_t globalIndex() diff --git a/73_ArithmeticBench/main.cpp b/73_ArithmeticBench/main.cpp index 
5ddd0cf6b..94983c03c 100644 --- a/73_ArithmeticBench/main.cpp +++ b/73_ArithmeticBench/main.cpp @@ -47,6 +47,7 @@ struct emulatedScanExclusive static inline constexpr const char* name = "exclusive_scan"; }; +// NOTE added swapchain + drawing frames to be able to profile with Nsight, which still doesn't support profiling headless compute shaders class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication { using device_base_t = examples::SimpleWindowedApplication; @@ -330,7 +331,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub // for each workgroup size (manually adjust items per invoc, operation else uses up a lot of ram) for (uint32_t i = 0; i < workgroupSizes.size(); i++) - benchSets[i] = createBenchmarkPipelines(subgroupBenchSource, benchPplnLayout.get(), elementCount, hlsl::findMSB(MinSubgroupSize), workgroupSizes[i], ItemsPerInvocation, NumLoops); + benchSets[i] = createBenchmarkPipelines(subgroupBenchSource, benchPplnLayout.get(), elementCount, hlsl::findMSB(MinSubgroupSize), workgroupSizes[i], ItemsPerInvocation, NumLoops); m_winMgr->show(m_window.get()); @@ -429,9 +430,9 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub const auto SubgroupSizeLog2 = hlsl::findMSB(MinSubgroupSize); bool passed = true; - passed = runBenchmark(cmdbuf, benchSets[0], elementCount, SubgroupSizeLog2); - passed = runBenchmark(cmdbuf, benchSets[1], elementCount, SubgroupSizeLog2); - passed = runBenchmark(cmdbuf, benchSets[2], elementCount, SubgroupSizeLog2); + passed = runBenchmark(cmdbuf, benchSets[0], elementCount, SubgroupSizeLog2); + passed = runBenchmark(cmdbuf, benchSets[1], elementCount, SubgroupSizeLog2); + passed = runBenchmark(cmdbuf, benchSets[2], elementCount, SubgroupSizeLog2); // blit @@ -676,7 +677,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub template class 
Arithmetic> BenchmarkSet createBenchmarkPipelines(const smart_refctd_ptr&source, const IGPUPipelineLayout* layout, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerInvoc = 1u, uint32_t numLoops = 8u) { - std::string arith_name = Arithmetic>::name; // TODO all operations + std::string arith_name = Arithmetic>::name; // TODO all operations //smart_refctd_ptr overridenUnspecialized = CHLSLCompiler::createOverridenCopy( // source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n#define ITEMS_PER_INVOCATION %d\n#define SUBGROUP_SIZE_LOG2 %d\n", @@ -874,7 +875,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub } - template class Arithmetic> bool runBenchmark(IGPUCommandBuffer* cmdbuf, const BenchmarkSet& set, const uint32_t elementCount, const uint8_t subgroupSizeLog2) { const uint32_t workgroupCount = elementCount / (set.workgroupSize * set.itemsPerInvocation); @@ -933,9 +933,12 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub constexpr static inline uint32_t MaxNumSubmits = 30; uint32_t numSubmits = 0; + template + using ArithmeticOp = emulatedReduction; // change this to test other arithmetic ops + bool b_runTests = false; uint32_t* inputData = nullptr; - uint32_t ItemsPerInvocation = 4u; + uint32_t ItemsPerInvocation = 1u; constexpr static inline uint32_t OutputBufferCount = 8u; smart_refctd_ptr outputBuffers[OutputBufferCount]; From 1963b51c27cf445b6515bbd16eb2bec3da9aa311 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 10 Apr 2025 16:00:08 +0700 Subject: [PATCH 117/296] changes in Param, Config usage --- .../app_resources/benchmarkSubgroup.comp.hlsl | 10 +-- .../app_resources/shaderCommon.hlsl | 18 ++--- 73_ArithmeticBench/main.cpp | 75 ++++--------------- 3 files changed, 27 insertions(+), 76 deletions(-) diff --git a/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl 
b/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl index 0b6a7e3c4..4715f0abf 100644 --- a/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl +++ b/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl @@ -38,7 +38,7 @@ bool canStore() {return true;} template class binop, typename T, uint32_t N> static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal) { - using config_t = nbl::hlsl::subgroup::Configuration; + using config_t = nbl::hlsl::subgroup2::Configuration; using params_t = nbl::hlsl::subgroup2::ArithmeticParams::base_t, N, nbl::hlsl::jit::device_capabilities>; type_t value = sourceVal; @@ -54,15 +54,15 @@ void benchmark() { const uint32_t idx = globalIndex() * ITEMS_PER_INVOCATION; type_t sourceVal; -#if ITEMS_PER_INVOCATION > 1 +// #if ITEMS_PER_INVOCATION > 1 [unroll] for (uint32_t i = 0; i < ITEMS_PER_INVOCATION; i++) { sourceVal[i] = inputValue[idx + i]; } -#else - sourceVal = inputValue[idx]; -#endif +// #else +// sourceVal = inputValue[idx]; +// #endif subbench(sourceVal); subbench(sourceVal); diff --git a/73_ArithmeticBench/app_resources/shaderCommon.hlsl b/73_ArithmeticBench/app_resources/shaderCommon.hlsl index 5cb1f3cf1..3fdd3c986 100644 --- a/73_ArithmeticBench/app_resources/shaderCommon.hlsl +++ b/73_ArithmeticBench/app_resources/shaderCommon.hlsl @@ -27,11 +27,11 @@ bool canStore(); //typedef uint32_t type_t; //typedef uint32_t4 type_t; -#if ITEMS_PER_INVOCATION > 1 +// #if ITEMS_PER_INVOCATION > 1 typedef vector type_t; -#else -typedef uint32_t type_t; -#endif +// #else +// typedef uint32_t type_t; +// #endif #ifndef OPERATION @@ -46,7 +46,7 @@ static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) { // TODO static assert vector == type_t //using type_t = vector; - using config_t = nbl::hlsl::subgroup::Configuration; + using config_t = nbl::hlsl::subgroup2::Configuration; using params_t = nbl::hlsl::subgroup2::ArithmeticParams::base_t, N, nbl::hlsl::jit::device_capabilities>; if (globalIndex()==0u) @@ -62,15 
+62,15 @@ type_t test() { const uint32_t idx = globalIndex() * ITEMS_PER_INVOCATION; type_t sourceVal; -#if ITEMS_PER_INVOCATION > 1 +// #if ITEMS_PER_INVOCATION > 1 [unroll] for (uint32_t i = 0; i < ITEMS_PER_INVOCATION; i++) { sourceVal[i] = inputValue[idx + i]; } -#else - sourceVal = inputValue[idx]; -#endif +// #else +// sourceVal = inputValue[idx]; +// #endif subtest(sourceVal); subtest(sourceVal); diff --git a/73_ArithmeticBench/main.cpp b/73_ArithmeticBench/main.cpp index 94983c03c..d129cfaf9 100644 --- a/73_ArithmeticBench/main.cpp +++ b/73_ArithmeticBench/main.cpp @@ -249,39 +249,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub benchPplnLayout = m_device->createPipelineLayout({}, std::move(benchLayout)); } - const auto spirv_isa_cache_path = localOutputCWD/"spirv_isa_cache.bin"; - // enclose to make sure file goes out of scope and we can reopen it - { - smart_refctd_ptr spirv_isa_cache_input; - // try to load SPIR-V to ISA cache - { - ISystem::future_t> fileCreate; - m_system->createFile(fileCreate,spirv_isa_cache_path,IFile::ECF_READ|IFile::ECF_MAPPABLE|IFile::ECF_COHERENT); - if (auto lock=fileCreate.acquire()) - spirv_isa_cache_input = *lock; - } - // create the cache - { - std::span spirv_isa_cache_data = {}; - if (spirv_isa_cache_input) - spirv_isa_cache_data = {reinterpret_cast(spirv_isa_cache_input->getMappedPointer()),spirv_isa_cache_input->getSize()}; - else - m_logger->log("Failed to load SPIR-V 2 ISA cache!",ILogger::ELL_PERFORMANCE); - // Normally we'd deserialize a `ICPUPipelineCache` properly and pass that instead - m_spirv_isa_cache = m_device->createPipelineCache(spirv_isa_cache_data); - } - } - { - // TODO: rename `deleteDirectory` to just `delete`? and a `IFile::setSize()` ? 
- m_system->deleteDirectory(spirv_isa_cache_path); - ISystem::future_t> fileCreate; - m_system->createFile(fileCreate,spirv_isa_cache_path,IFile::ECF_WRITE); - // I can be relatively sure I'll succeed to acquire the future, the pointer to created file might be null though. - m_spirv_isa_cache_output=*fileCreate.acquire(); - if (!m_spirv_isa_cache_output) - logFail("Failed to Create SPIR-V to ISA cache file."); - } - // load shader source from file auto getShaderSource = [&](const char* filePath) -> auto { @@ -429,10 +396,10 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub const auto SubgroupSizeLog2 = hlsl::findMSB(MinSubgroupSize); - bool passed = true; - passed = runBenchmark(cmdbuf, benchSets[0], elementCount, SubgroupSizeLog2); - passed = runBenchmark(cmdbuf, benchSets[1], elementCount, SubgroupSizeLog2); - passed = runBenchmark(cmdbuf, benchSets[2], elementCount, SubgroupSizeLog2); + cmdbuf->bindDescriptorSets(EPBP_COMPUTE, benchSets[0].pipeline->getLayout(), 0u, 1u, &benchDs.get()); + + for (uint32_t i = 0; i < benchSets.size(); i++) + runBenchmark(cmdbuf, benchSets[i], elementCount, SubgroupSizeLog2); // blit @@ -633,17 +600,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub // logTestOutcome(passed, itemsPerWG); //} m_api->endCapture(); - - // save cache every now and then - { - auto cpu = m_spirv_isa_cache->convertToCPUCache(); - // Normally we'd beautifully JSON serialize the thing, allow multiple devices & drivers + metadata - auto bin = cpu->getEntries().begin()->second.bin; - IFile::success_t success; - m_spirv_isa_cache_output->write(success, bin->data(), 0ull, bin->size()); - if (!success) - logFail("Could not write Create SPIR-V to ISA cache to disk!"); - } } } } @@ -662,7 +618,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub .requireFullSubgroups = true }; core::smart_refctd_ptr pipeline; - if 
(!m_device->createComputePipelines(m_spirv_isa_cache.get(),{¶ms,1},&pipeline)) + if (!m_device->createComputePipelines(nullptr,{¶ms,1},&pipeline)) return nullptr; return pipeline; } @@ -689,12 +645,13 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub options.stage = IShader::E_SHADER_STAGE::ESS_COMPUTE; options.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion; options.spirvOptimizer = nullptr; -//#ifndef _NBL_DEBUG -// ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO; -// auto opt = make_smart_refctd_ptr(std::span(&optPasses, 1)); -// options.spirvOptimizer = opt.get(); -//#endif +#ifndef _NBL_DEBUG + ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO; + auto opt = make_smart_refctd_ptr(std::span(&optPasses, 1)); + options.spirvOptimizer = opt.get(); +#else options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_LINE_BIT; +#endif options.preprocessorOptions.sourceIdentifier = source->getFilepathHint(); options.preprocessorOptions.logger = m_logger.get(); @@ -875,12 +832,11 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub } - bool runBenchmark(IGPUCommandBuffer* cmdbuf, const BenchmarkSet& set, const uint32_t elementCount, const uint8_t subgroupSizeLog2) + void runBenchmark(IGPUCommandBuffer* cmdbuf, const BenchmarkSet& set, const uint32_t elementCount, const uint8_t subgroupSizeLog2) { const uint32_t workgroupCount = elementCount / (set.workgroupSize * set.itemsPerInvocation); cmdbuf->bindComputePipeline(set.pipeline.get()); - cmdbuf->bindDescriptorSets(EPBP_COMPUTE, set.pipeline->getLayout(), 0u, 1u, &benchDs.get()); cmdbuf->dispatch(workgroupCount, 1, 1); { IGPUCommandBuffer::SPipelineBarrierDependencyInfo::buffer_barrier_t memoryBarrier[OutputBufferCount]; @@ -902,14 +858,10 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub 
IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = { .memBarriers = {},.bufBarriers = memoryBarrier }; cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE, info); } - - return true; } IQueue* transferDownQueue; IQueue* computeQueue; - smart_refctd_ptr m_spirv_isa_cache; - smart_refctd_ptr m_spirv_isa_cache_output; smart_refctd_ptr m_window; smart_refctd_ptr> m_surface; @@ -923,7 +875,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub smart_refctd_ptr dummyImg; std::array benchSets; - smart_refctd_ptr benchPipeline; // TODO array smart_refctd_ptr benchPool; smart_refctd_ptr benchDs; @@ -938,7 +889,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub bool b_runTests = false; uint32_t* inputData = nullptr; - uint32_t ItemsPerInvocation = 1u; + uint32_t ItemsPerInvocation = 4u; constexpr static inline uint32_t OutputBufferCount = 8u; smart_refctd_ptr outputBuffers[OutputBufferCount]; From 08b2442e2f01ff3928a601f02bc1b2189add6ef5 Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 18 Apr 2025 22:40:55 +0200 Subject: [PATCH 118/296] make example 07 run again after slight updates to API --- 07_StagingAndMultipleQueues/main.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/07_StagingAndMultipleQueues/main.cpp b/07_StagingAndMultipleQueues/main.cpp index 658a28a35..875053d60 100644 --- a/07_StagingAndMultipleQueues/main.cpp +++ b/07_StagingAndMultipleQueues/main.cpp @@ -432,15 +432,16 @@ class StagingAndMultipleQueuesApp final : public application_templates::BasicMul submitInfo[0].waitSemaphores = waitSemaphoreSubmitInfo; // there's no save to wait on, or need to prevent signal-after-submit because Renderdoc freezes because it // starts capturing immediately upon a submit and can't defer a capture till semaphores signal. 
- if (imageToProcessIdisRunningInRenderdoc()) + const bool isRunningInRenderdoc = m_api->runningInGraphicsDebugger()==IAPIConnection::EDebuggerType::Renderdoc; + if (imageToProcessIdisRunningInRenderdoc() && imageToProcessId>=SUBMITS_IN_FLIGHT) + if (isRunningInRenderdoc && imageToProcessId>=SUBMITS_IN_FLIGHT) for (auto old = histogramsSaved.load(); old < histogramSaveWaitSemaphoreValue; old = histogramsSaved.load()) histogramsSaved.wait(old); // Some Devices like all of the Intel GPUs do not have enough queues for us to allocate different queues to compute and transfers, // so our `BasicMultiQueueApplication` will "alias" a single queue to both usages. Normally you don't need to care, but here we're // attempting to do "out-of-order" "submit-before-signal" so we need to "hold back" submissions if the queues are aliased! - if (getTransferUpQueue()==computeQueue || m_api->isRunningInRenderdoc()) + if (getTransferUpQueue()==computeQueue || isRunningInRenderdoc) for (auto old = transfersSubmitted.load(); old <= imageToProcessId; old = transfersSubmitted.load()) transfersSubmitted.wait(old); computeQueue->submit(submitInfo); From 32ec0af7bd1a1e6be1020b5790ebba864b041a9a Mon Sep 17 00:00:00 2001 From: devsh Date: Sat, 19 Apr 2025 12:43:38 +0200 Subject: [PATCH 119/296] FFT examples now fully deprecate the old, and move Clustered Rendering to old --- CMakeLists.txt | 8 +- old_to_refactor/49_ComputeFFT/CMakeLists.txt | 11 - .../49_ComputeFFT/config.json.template | 28 - .../49_ComputeFFT/extra_parameters.glsl | 16 - .../49_ComputeFFT/fft_convolve_ifft.comp | 109 --- .../49_ComputeFFT/image_first_fft.comp | 56 -- old_to_refactor/49_ComputeFFT/last_fft.comp | 72 -- old_to_refactor/49_ComputeFFT/main.cpp | 753 ------------------ .../49_ComputeFFT/normalization.comp | 34 - old_to_refactor/49_ComputeFFT/pipeline.groovy | 50 -- .../60_ClusteredRendering}/CMakeLists.txt | 0 .../config.json.template | 0 .../60_ClusteredRendering}/main.cpp | 0 
.../60_ClusteredRendering}/pipeline.groovy | 0 14 files changed, 1 insertion(+), 1136 deletions(-) delete mode 100644 old_to_refactor/49_ComputeFFT/CMakeLists.txt delete mode 100644 old_to_refactor/49_ComputeFFT/config.json.template delete mode 100644 old_to_refactor/49_ComputeFFT/extra_parameters.glsl delete mode 100644 old_to_refactor/49_ComputeFFT/fft_convolve_ifft.comp delete mode 100644 old_to_refactor/49_ComputeFFT/image_first_fft.comp delete mode 100644 old_to_refactor/49_ComputeFFT/last_fft.comp delete mode 100644 old_to_refactor/49_ComputeFFT/main.cpp delete mode 100644 old_to_refactor/49_ComputeFFT/normalization.comp delete mode 100644 old_to_refactor/49_ComputeFFT/pipeline.groovy rename {60_ClusteredRendering => old_to_refactor/60_ClusteredRendering}/CMakeLists.txt (100%) rename {60_ClusteredRendering => old_to_refactor/60_ClusteredRendering}/config.json.template (100%) rename {60_ClusteredRendering => old_to_refactor/60_ClusteredRendering}/main.cpp (100%) rename {60_ClusteredRendering => old_to_refactor/60_ClusteredRendering}/pipeline.groovy (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index f358d962d..24fb7fad8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -52,7 +52,6 @@ if(NBL_BUILD_EXAMPLES) # Waiting for a refactor #add_subdirectory(27_PLYSTLDemo EXCLUDE_FROM_ALL) - #add_subdirectory(29_SpecializationConstants EXCLUDE_FROM_ALL) #add_subdirectory(33_Draw3DLine EXCLUDE_FROM_ALL) # Unit Test Examples @@ -75,16 +74,11 @@ if(NBL_BUILD_EXAMPLES) # add_subdirectory(39_DenoiserTonemapper EXCLUDE_FROM_ALL) # endif() - add_subdirectory(42_FragmentShaderPathTracer EXCLUDE_FROM_ALL) #add_subdirectory(43_SumAndCDFFilters EXCLUDE_FROM_ALL) - #add_subdirectory(45_BRDFEvalTest EXCLUDE_FROM_ALL) - #add_subdirectory(46_SamplingValidation EXCLUDE_FROM_ALL) add_subdirectory(47_DerivMapTest EXCLUDE_FROM_ALL) add_subdirectory(53_ComputeShaders EXCLUDE_FROM_ALL) add_subdirectory(54_Transformations EXCLUDE_FROM_ALL) add_subdirectory(55_RGB18E7S3 
EXCLUDE_FROM_ALL) - add_subdirectory(56_RayQuery EXCLUDE_FROM_ALL) - add_subdirectory(60_ClusteredRendering EXCLUDE_FROM_ALL) add_subdirectory(61_UI EXCLUDE_FROM_ALL) add_subdirectory(62_CAD EXCLUDE_FROM_ALL) add_subdirectory(62_SchusslerTest EXCLUDE_FROM_ALL) @@ -95,7 +89,7 @@ if(NBL_BUILD_EXAMPLES) add_subdirectory(67_RayQueryGeometry EXCLUDE_FROM_ALL) add_subdirectory(68_JpegLoading EXCLUDE_FROM_ALL) - add_subdirectory(70_FLIPFluids EXCLUDE_FROM_ALL) + add_subdirectory(70_FLIPFluids EXCLUDE_FROM_ALL) add_subdirectory(71_RayTracingPipeline EXCLUDE_FROM_ALL) NBL_HOOK_COMMON_API("${NBL_COMMON_API_TARGETS}") diff --git a/old_to_refactor/49_ComputeFFT/CMakeLists.txt b/old_to_refactor/49_ComputeFFT/CMakeLists.txt deleted file mode 100644 index b591db9e9..000000000 --- a/old_to_refactor/49_ComputeFFT/CMakeLists.txt +++ /dev/null @@ -1,11 +0,0 @@ - -include(common RESULT_VARIABLE RES) -if(NOT RES) - message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory") -endif() - -set(EXAMPLE_SOURCES - ../../src/nbl/ext/FFT/FFT.cpp -) - -nbl_create_executable_project("${EXAMPLE_SOURCES}" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") \ No newline at end of file diff --git a/old_to_refactor/49_ComputeFFT/config.json.template b/old_to_refactor/49_ComputeFFT/config.json.template deleted file mode 100644 index f961745c1..000000000 --- a/old_to_refactor/49_ComputeFFT/config.json.template +++ /dev/null @@ -1,28 +0,0 @@ -{ - "enableParallelBuild": true, - "threadsPerBuildProcess" : 2, - "isExecuted": false, - "scriptPath": "", - "cmake": { - "configurations": [ "Release", "Debug", "RelWithDebInfo" ], - "buildModes": [], - "requiredOptions": [] - }, - "profiles": [ - { - "backend": "vulkan", - "platform": "windows", - "buildModes": [], - "runConfiguration": "Release", - "gpuArchitectures": [] - } - ], - "dependencies": [], - "data": [ - { - "dependencies": [], - "command": [""], - "outputs": [] - } - ] -} \ No newline at end of file diff --git 
a/old_to_refactor/49_ComputeFFT/extra_parameters.glsl b/old_to_refactor/49_ComputeFFT/extra_parameters.glsl deleted file mode 100644 index 032f4c363..000000000 --- a/old_to_refactor/49_ComputeFFT/extra_parameters.glsl +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". -// For conditions of distribution and use, see copyright notice in nabla.h - -#include "nbl/builtin/glsl/ext/FFT/parameters_struct.glsl" -struct convolve_parameters_t -{ - nbl_glsl_ext_FFT_Parameters_t fft; - vec2 kernel_half_pixel_size; -}; - -struct image_store_parameters_t -{ - nbl_glsl_ext_FFT_Parameters_t fft; - ivec2 unpad_offset; -}; \ No newline at end of file diff --git a/old_to_refactor/49_ComputeFFT/fft_convolve_ifft.comp b/old_to_refactor/49_ComputeFFT/fft_convolve_ifft.comp deleted file mode 100644 index 18702fe81..000000000 --- a/old_to_refactor/49_ComputeFFT/fft_convolve_ifft.comp +++ /dev/null @@ -1,109 +0,0 @@ -layout(local_size_x=_NBL_GLSL_WORKGROUP_SIZE_, local_size_y=1, local_size_z=1) in; - -layout(set=0, binding=2) uniform sampler2D NormalizedKernel[3]; - -/* TODO: Hardcode the parameters for the frequent FFTs -uvec3 nbl_glsl_ext_FFT_Parameters_t_getDimensions() -{ - return uvec3(1280u,1024u,1u); -} -bool nbl_glsl_ext_FFT_Parameters_t_getIsInverse() -{ - return false; -} -uint nbl_glsl_ext_FFT_Parameters_t_getDirection() -{ - return 0u; -} -uint nbl_glsl_ext_FFT_Parameters_t_getMaxChannel() -{ - return 2u; -} -uint nbl_glsl_ext_FFT_Parameters_t_getLog2FFTSize() -{ - return 11u; -} -uint nbl_glsl_ext_FFT_Parameters_t_getPaddingType() -{ - return 3u; // _NBL_GLSL_EXT_FFT_PAD_MIRROR_; -} -uvec4 nbl_glsl_ext_FFT_Parameters_t_getInputStrides() -{ - return uvec4(1024u,1u,0u,1024u*1280u); -} -uvec4 nbl_glsl_ext_FFT_Parameters_t_getOutputStrides() -{ - return uvec4(1u,1280u,0u,1280u*1024u); -} -#define _NBL_GLSL_EXT_FFT_PARAMETERS_METHODS_DECLARED_ -*/ - -#include "extra_parameters.glsl" 
-layout(push_constant) uniform PushConstants -{ - convolve_parameters_t params; -} pc; -#define _NBL_GLSL_EXT_FFT_PUSH_CONSTANTS_DEFINED_ - -nbl_glsl_ext_FFT_Parameters_t nbl_glsl_ext_FFT_getParameters() -{ - return pc.params.fft; -} -#define _NBL_GLSL_EXT_FFT_GET_PARAMETERS_DEFINED_ - -#define _NBL_GLSL_EXT_FFT_MAIN_DEFINED_ -#include "nbl/builtin/glsl/ext/FFT/default_compute_fft.comp" - -void convolve(in uint item_per_thread_count, in uint ch) -{ - // TODO: decouple kernel size from image size (can't get the math to work in my head) - for(uint t=0u; t>1u; - const uint shifted = tid-padding; - if (tid>=padding && shifted -nbl_glsl_complex nbl_glsl_ext_FFT_getPaddedData(in ivec3 coordinate, in uint channel) -{ - ivec2 inputImageSize = textureSize(inputImage, 0); - vec2 normalizedCoords = (vec2(coordinate.xy)+vec2(0.5f))/(vec2(inputImageSize)*KERNEL_SCALE); - vec4 texelValue = textureLod(inputImage, normalizedCoords+vec2(0.5-0.5/KERNEL_SCALE), -log2(KERNEL_SCALE)); - return nbl_glsl_complex(texelValue[channel], 0.0f); -} -#define _NBL_GLSL_EXT_FFT_GET_PADDED_DATA_DEFINED_ - - -/* TODO: Hardcode the parameters for the frequent FFTs -#if _NBL_GLSL_EXT_FFT_MAX_DIM_SIZE_>512 -uvec3 nbl_glsl_ext_FFT_Parameters_t_getDimensions() -{ - return uvec3(1280u,720u,1u); -} -bool nbl_glsl_ext_FFT_Parameters_t_getIsInverse() -{ - return false; -} -uint nbl_glsl_ext_FFT_Parameters_t_getDirection() -{ - return 1u; -} -uint nbl_glsl_ext_FFT_Parameters_t_getMaxChannel() -{ - return 2u; -} -uint nbl_glsl_ext_FFT_Parameters_t_getLog2FFTSize() -{ - return 10u; -} -uint nbl_glsl_ext_FFT_Parameters_t_getPaddingType() -{ - return 3u; // _NBL_GLSL_EXT_FFT_PAD_MIRROR_; -} -uvec4 nbl_glsl_ext_FFT_Parameters_t_getInputStrides() -{ - return uvec4(0xdeadbeefu); -} -uvec4 nbl_glsl_ext_FFT_Parameters_t_getOutputStrides() -{ - return uvec4(1024u,1u,0u,1024u*1280u); -} -#define _NBL_GLSL_EXT_FFT_PARAMETERS_METHODS_DECLARED_ -#endif -*/ - -#include "nbl/builtin/glsl/ext/FFT/default_compute_fft.comp" \ 
No newline at end of file diff --git a/old_to_refactor/49_ComputeFFT/last_fft.comp b/old_to_refactor/49_ComputeFFT/last_fft.comp deleted file mode 100644 index 2183ef63c..000000000 --- a/old_to_refactor/49_ComputeFFT/last_fft.comp +++ /dev/null @@ -1,72 +0,0 @@ -layout(local_size_x=_NBL_GLSL_WORKGROUP_SIZE_, local_size_y=1, local_size_z=1) in; - -// Output Descriptor -layout(set=0, binding=1, rgba16f) uniform image2D outImage; -#define _NBL_GLSL_EXT_FFT_OUTPUT_DESCRIPTOR_DEFINED_ - -/* TODO: Hardcode the parameters for the frequent FFTs -uvec3 nbl_glsl_ext_FFT_Parameters_t_getDimensions() -{ - return uvec3(1280u,1024u,1u); -} -bool nbl_glsl_ext_FFT_Parameters_t_getIsInverse() -{ - return true; -} -uint nbl_glsl_ext_FFT_Parameters_t_getDirection() -{ - return 1u; -} -uint nbl_glsl_ext_FFT_Parameters_t_getMaxChannel() -{ - return 2u; -} -uint nbl_glsl_ext_FFT_Parameters_t_getLog2FFTSize() -{ - return 10u; -} -uint nbl_glsl_ext_FFT_Parameters_t_getPaddingType() -{ - return 3u; // _NBL_GLSL_EXT_FFT_PAD_MIRROR_; -} -uvec4 nbl_glsl_ext_FFT_Parameters_t_getInputStrides() -{ - return uvec4(1u,1280u,0u,1280u*1024u); -} -uvec4 nbl_glsl_ext_FFT_Parameters_t_getOutputStrides() -{ - return uvec4(0xdeadbeefu); -} -#define _NBL_GLSL_EXT_FFT_PARAMETERS_METHODS_DECLARED_ -*/ - -#include "extra_parameters.glsl" -layout(push_constant) uniform PushConstants -{ - image_store_parameters_t params; -} pc; -#define _NBL_GLSL_EXT_FFT_PUSH_CONSTANTS_DEFINED_ - -nbl_glsl_ext_FFT_Parameters_t nbl_glsl_ext_FFT_getParameters() -{ - return pc.params.fft; -} -#define _NBL_GLSL_EXT_FFT_GET_PARAMETERS_DEFINED_ - - -#include -void nbl_glsl_ext_FFT_setData(in uvec3 coordinate, in uint channel, in nbl_glsl_complex complex_value) -{ - const ivec2 coords = ivec2(coordinate.xy)-pc.params.unpad_offset; - - if (all(lessThanEqual(ivec2(0),coords)) && all(greaterThan(imageSize(outImage),coords))) - { - vec4 color_value = imageLoad(outImage, coords); - color_value[channel] = complex_value.x; - 
imageStore(outImage, coords, color_value); - } -} -#define _NBL_GLSL_EXT_FFT_SET_DATA_DEFINED_ - - -#include "nbl/builtin/glsl/ext/FFT/default_compute_fft.comp" \ No newline at end of file diff --git a/old_to_refactor/49_ComputeFFT/main.cpp b/old_to_refactor/49_ComputeFFT/main.cpp deleted file mode 100644 index ba2b7e33e..000000000 --- a/old_to_refactor/49_ComputeFFT/main.cpp +++ /dev/null @@ -1,753 +0,0 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". -// For conditions of distribution and use, see copyright notice in nabla.h - -#define _NBL_STATIC_LIB_ -#include -#include -#include - -#include "nbl/ext/FFT/FFT.h" -#include "../common/QToQuitEventReceiver.h" - -using namespace nbl; -using namespace nbl::core; -using namespace nbl::asset; -using namespace nbl::video; - -using FFTClass = ext::FFT::FFT; - -constexpr uint32_t channelCountOverride = 3u; - -inline core::smart_refctd_ptr createShader( - video::IVideoDriver* driver, - const uint32_t maxFFTlen, - const bool useHalfStorage, - const char* includeMainName, - float kernelScale = 1.f) -{ - const char* sourceFmt = -R"===(#version 430 core - -#define _NBL_GLSL_WORKGROUP_SIZE_ %u -#define _NBL_GLSL_EXT_FFT_MAX_DIM_SIZE_ %u -#define _NBL_GLSL_EXT_FFT_HALF_STORAGE_ %u - -#define KERNEL_SCALE %f - -#include "%s" - -)==="; - - const size_t extraSize = 4u+8u+8u+128u; - - constexpr uint32_t DEFAULT_WORK_GROUP_SIZE = FFTClass::DEFAULT_WORK_GROUP_SIZE; - auto shader = core::make_smart_refctd_ptr(strlen(sourceFmt)+extraSize+1u); - snprintf( - reinterpret_cast(shader->getPointer()),shader->getSize(), sourceFmt, - DEFAULT_WORK_GROUP_SIZE, - maxFFTlen, - useHalfStorage ? 
1u:0u, - kernelScale, - includeMainName - ); - - auto cpuSpecializedShader = core::make_smart_refctd_ptr( - core::make_smart_refctd_ptr(std::move(shader),ICPUShader::buffer_contains_glsl), - ISpecializedShader::SInfo{nullptr, nullptr, "main", asset::ISpecializedShader::ESS_COMPUTE} - ); - - auto gpuShader = driver->createShader(nbl::core::smart_refctd_ptr(cpuSpecializedShader->getUnspecialized())); - - auto gpuSpecializedShader = driver->createSpecializedShader(gpuShader.get(), cpuSpecializedShader->getSpecializationInfo()); - - return gpuSpecializedShader; -} - - - -inline void updateDescriptorSet_Convolution ( - video::IVideoDriver * driver, - video::IGPUDescriptorSet * set, - core::smart_refctd_ptr inputBufferDescriptor, - core::smart_refctd_ptr outputBufferDescriptor, - const core::smart_refctd_ptr* kernelNormalizedSpectrumImageDescriptors) -{ - constexpr uint32_t descCount = 3u; - video::IGPUDescriptorSet::SDescriptorInfo pInfos[descCount-1u+channelCountOverride]; - video::IGPUDescriptorSet::SWriteDescriptorSet pWrites[descCount]; - - for (auto i = 0; i < descCount; i++) - { - pWrites[i].binding = i; - pWrites[i].dstSet = set; - pWrites[i].arrayElement = 0u; - pWrites[i].info = pInfos+i; - } - - // Input Buffer - pWrites[0].descriptorType = asset::EDT_STORAGE_BUFFER; - pWrites[0].count = 1; - pInfos[0].desc = inputBufferDescriptor; - pInfos[0].buffer.size = inputBufferDescriptor->getSize(); - pInfos[0].buffer.offset = 0u; - - // - pWrites[1].descriptorType = asset::EDT_STORAGE_BUFFER; - pWrites[1].count = 1; - pInfos[1].desc = outputBufferDescriptor; - pInfos[1].buffer.size = outputBufferDescriptor->getSize(); - pInfos[1].buffer.offset = 0u; - - // Kernel Buffer - pWrites[2].descriptorType = asset::EDT_COMBINED_IMAGE_SAMPLER; - pWrites[2].count = channelCountOverride; - for (uint32_t i=0u; iupdateDescriptorSets(descCount, pWrites, 0u, nullptr); -} -inline void updateDescriptorSet_LastFFT ( - video::IVideoDriver * driver, - video::IGPUDescriptorSet * set, - 
core::smart_refctd_ptr inputBufferDescriptor, - core::smart_refctd_ptr outputImageDescriptor) -{ - video::IGPUDescriptorSet::SDescriptorInfo pInfos[2]; - video::IGPUDescriptorSet::SWriteDescriptorSet pWrites[2]; - - for (auto i = 0; i< 2; i++) - { - pWrites[i].dstSet = set; - pWrites[i].arrayElement = 0u; - pWrites[i].count = 1u; - pWrites[i].info = pInfos+i; - } - - // Input Buffer - pWrites[0].binding = 0; - pWrites[0].descriptorType = asset::EDT_STORAGE_BUFFER; - pWrites[0].count = 1; - pInfos[0].desc = inputBufferDescriptor; - pInfos[0].buffer.size = inputBufferDescriptor->getSize(); - pInfos[0].buffer.offset = 0u; - - // Output Buffer - pWrites[1].binding = 1; - pWrites[1].descriptorType = asset::EDT_STORAGE_IMAGE; - pWrites[1].count = 1; - pInfos[1].desc = outputImageDescriptor; - pInfos[1].image.sampler = nullptr; - pInfos[1].image.imageLayout = static_cast(0u);; - - driver->updateDescriptorSets(2u, pWrites, 0u, nullptr); -} - -using nbl_glsl_ext_FFT_Parameters_t = ext::FFT::FFT::Parameters_t; -struct vec2 -{ - float x,y; -}; -struct ivec2 -{ - int32_t x,y; -}; -#include "extra_parameters.glsl" - - -int main() -{ - nbl::SIrrlichtCreationParameters deviceParams; - deviceParams.Bits = 24; //may have to set to 32bit for some platforms - deviceParams.ZBufferBits = 24; //we'd like 32bit here - deviceParams.DriverType = EDT_OPENGL; //! Only Well functioning driver, software renderer left for sake of 2D image drawing - deviceParams.WindowSize = dimension2d(1280, 720); - deviceParams.Fullscreen = false; - deviceParams.Vsync = true; //! If supported by target platform - deviceParams.Doublebuffer = true; - deviceParams.Stencilbuffer = false; //! This will not even be a choice soon - - auto device = createDeviceEx(deviceParams); - if (!device) - return 1; // could not create selected driver. 
- - QToQuitEventReceiver receiver; - device->setEventReceiver(&receiver); - - IVideoDriver* driver = device->getVideoDriver(); - - nbl::io::IFileSystem* filesystem = device->getFileSystem(); - IAssetManager* am = device->getAssetManager(); - // Loading SrcImage and Kernel Image from File - - IAssetLoader::SAssetLoadParams lp; - auto srcImageBundle = am->getAsset("../../media/colorexr.exr", lp); - auto kerImageBundle = am->getAsset("../../media/kernels/physical_flare_256.exr", lp); - - // get GPU image views - smart_refctd_ptr srcImageView; - { - auto srcGpuImages = driver->getGPUObjectsFromAssets(srcImageBundle.getContents()); - - IGPUImageView::SCreationParams srcImgViewInfo; - srcImgViewInfo.flags = static_cast(0u); - srcImgViewInfo.image = srcGpuImages->operator[](0u); - srcImgViewInfo.viewType = IGPUImageView::ET_2D; - srcImgViewInfo.format = srcImgViewInfo.image->getCreationParameters().format; - srcImgViewInfo.subresourceRange.aspectMask = static_cast(0u); - srcImgViewInfo.subresourceRange.baseMipLevel = 0; - srcImgViewInfo.subresourceRange.levelCount = 1; - srcImgViewInfo.subresourceRange.baseArrayLayer = 0; - srcImgViewInfo.subresourceRange.layerCount = 1; - srcImageView = driver->createImageView(std::move(srcImgViewInfo)); - } - smart_refctd_ptr kerImageView; - { - auto kerGpuImages = driver->getGPUObjectsFromAssets(kerImageBundle.getContents()); - - IGPUImageView::SCreationParams kerImgViewInfo; - kerImgViewInfo.flags = static_cast(0u); - kerImgViewInfo.image = kerGpuImages->operator[](0u); - kerImgViewInfo.viewType = IGPUImageView::ET_2D; - kerImgViewInfo.format = kerImgViewInfo.image->getCreationParameters().format; - kerImgViewInfo.subresourceRange.aspectMask = static_cast(0u); - kerImgViewInfo.subresourceRange.baseMipLevel = 0; - kerImgViewInfo.subresourceRange.levelCount = kerImgViewInfo.image->getCreationParameters().mipLevels; - kerImgViewInfo.subresourceRange.baseArrayLayer = 0; - kerImgViewInfo.subresourceRange.layerCount = 1; - kerImageView = 
driver->createImageView(std::move(kerImgViewInfo)); - } - - // agree on formats - const E_FORMAT srcFormat = srcImageView->getCreationParameters().format; - uint32_t srcNumChannels = getFormatChannelCount(srcFormat); - uint32_t kerNumChannels = getFormatChannelCount(kerImageView->getCreationParameters().format); - //! OVERRIDE (we dont need alpha) - srcNumChannels = channelCountOverride; - kerNumChannels = channelCountOverride; - assert(srcNumChannels == kerNumChannels); // Just to make sure, because the other case is not handled in this example - - // Create Out Image - smart_refctd_ptr outImg; - smart_refctd_ptr outImgView; - { - auto dstImgViewInfo = srcImageView->getCreationParameters(); - - auto dstImgInfo = dstImgViewInfo.image->getCreationParameters(); - outImg = driver->createDeviceLocalGPUImageOnDedMem(std::move(dstImgInfo)); - - dstImgViewInfo.image = outImg; - outImgView = driver->createImageView(IGPUImageView::SCreationParams(dstImgViewInfo)); - } - - // input pipeline - auto imageFirstFFTPipelineLayout = [driver]() -> auto - { - IGPUDescriptorSetLayout::SBinding bnd[] = - { - { - 0u, - EDT_COMBINED_IMAGE_SAMPLER, - 1u, - ISpecializedShader::ESS_COMPUTE, - nullptr - }, - { - 1u, - EDT_STORAGE_BUFFER, - 1u, - ISpecializedShader::ESS_COMPUTE, - nullptr - } - }; - - core::SRange pcRange = FFTClass::getDefaultPushConstantRanges(); - core::SRange bindings = {bnd,bnd+sizeof(bnd)/sizeof(IGPUDescriptorSetLayout::SBinding)}; - - return driver->createPipelineLayout( - pcRange.begin(),pcRange.end(), - driver->createDescriptorSetLayout(bindings.begin(),bindings.end()),nullptr,nullptr,nullptr - ); - }(); - auto convolvePipelineLayout = [driver]() -> auto - { - IGPUSampler::SParams params = - { - { - ISampler::ETC_REPEAT, - ISampler::ETC_REPEAT, - ISampler::ETC_REPEAT, - ISampler::ETBC_FLOAT_OPAQUE_BLACK, - ISampler::ETF_LINEAR, // is it needed? 
- ISampler::ETF_LINEAR, - ISampler::ESMM_NEAREST, - 0u, - 0u, - ISampler::ECO_ALWAYS - } - }; - auto sampler = driver->createSampler(std::move(params)); - smart_refctd_ptr samplers[channelCountOverride]; - std::fill_n(samplers,channelCountOverride,sampler); - - IGPUDescriptorSetLayout::SBinding bnd[] = - { - { - 0u, - EDT_STORAGE_BUFFER, - 1u, - ISpecializedShader::ESS_COMPUTE, - nullptr - }, - { - 1u, - EDT_STORAGE_BUFFER, - 1u, - ISpecializedShader::ESS_COMPUTE, - nullptr - }, - { - 2u, - EDT_COMBINED_IMAGE_SAMPLER, - channelCountOverride, - ISpecializedShader::ESS_COMPUTE, - samplers - } - }; - - const asset::SPushConstantRange pcRange = {ISpecializedShader::ESS_COMPUTE,0u,sizeof(convolve_parameters_t)}; - core::SRange bindings = {bnd,bnd+sizeof(bnd)/sizeof(IGPUDescriptorSetLayout::SBinding)}; - - return driver->createPipelineLayout( - &pcRange,&pcRange+1, - driver->createDescriptorSetLayout(bindings.begin(),bindings.end()),nullptr,nullptr,nullptr - ); - }(); - auto lastFFTPipelineLayout = [driver]() -> auto - { - IGPUDescriptorSetLayout::SBinding bnd[] = - { - { - 0u, - EDT_STORAGE_BUFFER, - 1u, - ISpecializedShader::ESS_COMPUTE, - nullptr - }, - { - 1u, - EDT_STORAGE_IMAGE, - 1u, - ISpecializedShader::ESS_COMPUTE, - nullptr - }, - }; - - const asset::SPushConstantRange pcRange = {ISpecializedShader::ESS_COMPUTE,0u,sizeof(image_store_parameters_t)}; - core::SRange bindings = {bnd, bnd+sizeof(bnd)/sizeof(IGPUDescriptorSetLayout::SBinding)};; - - return driver->createPipelineLayout( - &pcRange,&pcRange+1, - driver->createDescriptorSetLayout(bindings.begin(),bindings.end()),nullptr,nullptr,nullptr - ); - }(); - - const float bloomRelativeScale = 0.25f; - const auto kerDim = kerImageView->getCreationParameters().image->getCreationParameters().extent; - const auto srcDim = srcImageView->getCreationParameters().image->getCreationParameters().extent; - const float bloomScale = 
core::min(float(srcDim.width)/float(kerDim.width),float(srcDim.height)/float(kerDim.height))*bloomRelativeScale; - if (bloomScale>1.f) - std::cout << "WARNING: Bloom Kernel will Clip and loose sharpness, increase resolution of bloom kernel!" << std::endl; - const auto marginSrcDim = [srcDim,kerDim,bloomScale]() -> auto - { - auto tmp = srcDim; - for (auto i=0u; i<3u; i++) - { - const auto coord = (&kerDim.width)[i]; - if (coord>1u) - (&tmp.width)[i] += core::max(coord*bloomScale,1u)-1u; - } - return tmp; - }(); - constexpr bool useHalfFloats = true; - // Allocate Output Buffer - auto fftOutputBuffer_0 = driver->createDeviceLocalGPUBufferOnDedMem(FFTClass::getOutputBufferSize(useHalfFloats,marginSrcDim,srcNumChannels)); - auto fftOutputBuffer_1 = driver->createDeviceLocalGPUBufferOnDedMem(FFTClass::getOutputBufferSize(useHalfFloats,marginSrcDim,srcNumChannels)); - core::smart_refctd_ptr kernelNormalizedSpectrums[channelCountOverride]; - - auto updateDescriptorSet = [driver](video::IGPUDescriptorSet* set, core::smart_refctd_ptr inputImageDescriptor, asset::ISampler::E_TEXTURE_CLAMP textureWrap, core::smart_refctd_ptr outputBufferDescriptor) -> void - { - IGPUSampler::SParams params = - { - { - textureWrap, - textureWrap, - textureWrap, - ISampler::ETBC_FLOAT_OPAQUE_BLACK, - ISampler::ETF_LINEAR, - ISampler::ETF_LINEAR, - ISampler::ESMM_LINEAR, - 8u, - 0u, - ISampler::ECO_ALWAYS - } - }; - auto sampler = driver->createSampler(std::move(params)); - - constexpr auto kDescriptorCount = 2u; - video::IGPUDescriptorSet::SDescriptorInfo pInfos[kDescriptorCount]; - video::IGPUDescriptorSet::SWriteDescriptorSet pWrites[kDescriptorCount]; - - for (auto i=0; i(0u); - - // Output Buffer - pWrites[1].binding = 1; - pWrites[1].descriptorType = asset::EDT_STORAGE_BUFFER; - pWrites[1].count = 1; - pInfos[1].desc = outputBufferDescriptor; - pInfos[1].buffer.size = outputBufferDescriptor->getSize(); - pInfos[1].buffer.offset = 0u; - - driver->updateDescriptorSets(2u, pWrites, 0u, 
nullptr); - }; - - // Precompute Kernel FFT - { - const VkExtent3D paddedKerDim = FFTClass::padDimensions(kerDim); - - // create kernel spectrums - auto createKernelSpectrum = [&]() -> auto - { - video::IGPUImage::SCreationParams imageParams; - imageParams.flags = static_cast(0u); - imageParams.type = asset::IImage::ET_2D; - imageParams.format = useHalfFloats ? EF_R16G16_SFLOAT:EF_R32G32_SFLOAT; - imageParams.extent = { paddedKerDim.width,paddedKerDim.height,1u}; - imageParams.mipLevels = 1u; - imageParams.arrayLayers = 1u; - imageParams.samples = asset::IImage::ESCF_1_BIT; - - video::IGPUImageView::SCreationParams viewParams; - viewParams.flags = static_cast(0u); - viewParams.image = driver->createGPUImageOnDedMem(std::move(imageParams),driver->getDeviceLocalGPUMemoryReqs()); - viewParams.viewType = video::IGPUImageView::ET_2D; - viewParams.format = useHalfFloats ? EF_R16G16_SFLOAT:EF_R32G32_SFLOAT; - viewParams.components = {}; - viewParams.subresourceRange = {}; - viewParams.subresourceRange.levelCount = 1u; - viewParams.subresourceRange.layerCount = 1u; - return driver->createImageView(std::move(viewParams)); - }; - for (uint32_t i=0u; i fftPipeline_SSBOInput(core::make_smart_refctd_ptr(driver,0x1u<getDefaultPipeline()); - - // descriptor sets - core::smart_refctd_ptr fftDescriptorSet_Ker_FFT[2] = - { - driver->createDescriptorSet(core::smart_refctd_ptr(imageFirstFFTPipelineLayout->getDescriptorSetLayout(0u))), - driver->createDescriptorSet(core::smart_refctd_ptr(fftPipeline_SSBOInput->getLayout()->getDescriptorSetLayout(0u))) - }; - updateDescriptorSet(fftDescriptorSet_Ker_FFT[0].get(), kerImageView, ISampler::ETC_CLAMP_TO_BORDER, fftOutputBuffer_0); - FFTClass::updateDescriptorSet(driver,fftDescriptorSet_Ker_FFT[1].get(), fftOutputBuffer_0, fftOutputBuffer_1); - - // Normalization of FFT spectrum - struct NormalizationPushConstants - { - ext::FFT::uvec4 stride; - ext::FFT::uvec4 bitreverse_shift; - }; - auto fftPipelineLayout_KernelNormalization = [&]() -> 
auto - { - IGPUDescriptorSetLayout::SBinding bnd[] = - { - { - 0u, - EDT_STORAGE_BUFFER, - 1u, - ISpecializedShader::ESS_COMPUTE, - nullptr - }, - { - 1u, - EDT_STORAGE_IMAGE, - channelCountOverride, - ISpecializedShader::ESS_COMPUTE, - nullptr - }, - }; - SPushConstantRange pc_rng; - pc_rng.offset = 0u; - pc_rng.size = sizeof(NormalizationPushConstants); - pc_rng.stageFlags = ISpecializedShader::ESS_COMPUTE; - return driver->createPipelineLayout( - &pc_rng,&pc_rng+1u, - driver->createDescriptorSetLayout(bnd,bnd+2),nullptr,nullptr,nullptr - ); - }(); - auto fftDescriptorSet_KernelNormalization = [&]() -> auto - { - auto dset = driver->createDescriptorSet(core::smart_refctd_ptr(fftPipelineLayout_KernelNormalization->getDescriptorSetLayout(0u))); - - video::IGPUDescriptorSet::SDescriptorInfo pInfos[1+channelCountOverride]; - video::IGPUDescriptorSet::SWriteDescriptorSet pWrites[2]; - - for (auto i = 0; i < 2; i++) - { - pWrites[i].dstSet = dset.get(); - pWrites[i].arrayElement = 0u; - pWrites[i].count = 1u; - pWrites[i].info = pInfos + i; - } - - // In Buffer - pWrites[0].binding = 0; - pWrites[0].descriptorType = asset::EDT_STORAGE_BUFFER; - pWrites[0].count = 1; - pInfos[0].desc = fftOutputBuffer_1; - pInfos[0].buffer.size = fftOutputBuffer_1->getSize(); - pInfos[0].buffer.offset = 0u; - - // Out Buffer - pWrites[1].binding = 1; - pWrites[1].descriptorType = asset::EDT_STORAGE_IMAGE; - pWrites[1].count = channelCountOverride; - for (uint32_t i=0u; iupdateDescriptorSets(2u, pWrites, 0u, nullptr); - return dset; - }(); - - // Ker Image First Axis FFT - { - auto fftPipeline_ImageInput = driver->createComputePipeline(nullptr,core::smart_refctd_ptr(imageFirstFFTPipelineLayout),createShader(driver,0x1u<bindComputePipeline(fftPipeline_ImageInput.get()); - driver->bindDescriptorSets(EPBP_COMPUTE, imageFirstFFTPipelineLayout.get(), 0u, 1u, &fftDescriptorSet_Ker_FFT[0].get(), nullptr); - FFTClass::dispatchHelper(driver, imageFirstFFTPipelineLayout.get(), fftPushConstants[0], 
fftDispatchInfo[0]); - } - - // Ker Image Last Axis FFT - driver->bindComputePipeline(fftPipeline_SSBOInput.get()); - driver->bindDescriptorSets(EPBP_COMPUTE, fftPipeline_SSBOInput->getLayout(), 0u, 1u, &fftDescriptorSet_Ker_FFT[1].get(), nullptr); - FFTClass::dispatchHelper(driver, fftPipeline_SSBOInput->getLayout(), fftPushConstants[1], fftDispatchInfo[1]); - - // Ker Normalization - auto fftPipeline_KernelNormalization = driver->createComputePipeline(nullptr,core::smart_refctd_ptr(fftPipelineLayout_KernelNormalization),createShader(driver,0xdeadbeefu,useHalfFloats,"../normalization.comp")); - driver->bindComputePipeline(fftPipeline_KernelNormalization.get()); - driver->bindDescriptorSets(EPBP_COMPUTE, fftPipelineLayout_KernelNormalization.get(), 0u, 1u, &fftDescriptorSet_KernelNormalization.get(), nullptr); - { - NormalizationPushConstants normalizationPC; - normalizationPC.stride = fftPushConstants[1].output_strides; - normalizationPC.bitreverse_shift.x = 32-core::findMSB(paddedKerDim.width); - normalizationPC.bitreverse_shift.y = 32-core::findMSB(paddedKerDim.height); - normalizationPC.bitreverse_shift.z = 0; - driver->pushConstants(fftPipelineLayout_KernelNormalization.get(),ICPUSpecializedShader::ESS_COMPUTE,0u,sizeof(normalizationPC),&normalizationPC); - } - { - const uint32_t dispatchSizeX = (paddedKerDim.width-1u)/16u+1u; - const uint32_t dispatchSizeY = (paddedKerDim.height-1u)/16u+1u; - driver->dispatch(dispatchSizeX,dispatchSizeY,kerNumChannels); - FFTClass::defaultBarrier(); - } - } - - FFTClass::Parameters_t fftPushConstants[3]; - FFTClass::DispatchInfo_t fftDispatchInfo[3]; - const ISampler::E_TEXTURE_CLAMP fftPadding[2] = {ISampler::ETC_MIRROR,ISampler::ETC_MIRROR}; - const auto passes = FFTClass::buildParameters(false,srcNumChannels,srcDim,fftPushConstants,fftDispatchInfo,fftPadding,marginSrcDim); - { - // override for less work and storage (dont need to store the extra padding of the last axis after iFFT) - fftPushConstants[1].output_strides.x = 
fftPushConstants[0].input_strides.x; - fftPushConstants[1].output_strides.y = fftPushConstants[0].input_strides.y; - fftPushConstants[1].output_strides.z = fftPushConstants[1].input_strides.z; - fftPushConstants[1].output_strides.w = fftPushConstants[1].input_strides.w; - // iFFT - fftPushConstants[2].input_dimensions = fftPushConstants[1].input_dimensions; - { - fftPushConstants[2].input_dimensions.w = fftPushConstants[0].input_dimensions.w^0x80000000u; - fftPushConstants[2].input_strides = fftPushConstants[1].output_strides; - fftPushConstants[2].output_strides = fftPushConstants[0].input_strides; - } - fftDispatchInfo[2] = fftDispatchInfo[0]; - } - assert(passes==2); - // pipelines - auto fftPipeline_ImageInput = driver->createComputePipeline(nullptr,core::smart_refctd_ptr(imageFirstFFTPipelineLayout),createShader(driver,0x1u<createComputePipeline(nullptr, std::move(convolvePipelineLayout), createShader(driver,0x1u<createComputePipeline(nullptr, std::move(lastFFTPipelineLayout), createShader(driver,0x1u<createDescriptorSet(core::smart_refctd_ptr(imageFirstFFTPipelineLayout->getDescriptorSetLayout(0u))); - updateDescriptorSet(fftDescriptorSet_Src_FirstFFT.get(), srcImageView, ISampler::ETC_MIRROR, fftOutputBuffer_0); - - // Convolution - auto convolveDescriptorSet = driver->createDescriptorSet(core::smart_refctd_ptr(convolvePipeline->getLayout()->getDescriptorSetLayout(0u))); - updateDescriptorSet_Convolution(driver, convolveDescriptorSet.get(), fftOutputBuffer_0, fftOutputBuffer_1, kernelNormalizedSpectrums); - - // Last Axis IFFT - auto lastFFTDescriptorSet = driver->createDescriptorSet(core::smart_refctd_ptr(lastFFTPipeline->getLayout()->getDescriptorSetLayout(0u))); - updateDescriptorSet_LastFFT(driver, lastFFTDescriptorSet.get(), fftOutputBuffer_1, outImgView); - - uint32_t outBufferIx = 0u; - auto lastPresentStamp = std::chrono::high_resolution_clock::now(); - bool savedToFile = false; - - auto downloadStagingArea = driver->getDefaultDownStreamingBuffer(); 
- - auto blitFBO = driver->addFrameBuffer(); - blitFBO->attach(video::EFAP_COLOR_ATTACHMENT0, std::move(outImgView)); - - while (device->run() && receiver.keepOpen()) - { - driver->beginScene(false, false); - - // Src Image First Axis FFT - driver->bindComputePipeline(fftPipeline_ImageInput.get()); - driver->bindDescriptorSets(EPBP_COMPUTE, imageFirstFFTPipelineLayout.get(), 0u, 1u, &fftDescriptorSet_Src_FirstFFT.get(), nullptr); - FFTClass::dispatchHelper(driver, imageFirstFFTPipelineLayout.get(), fftPushConstants[0], fftDispatchInfo[0]); - - // Src Image Last Axis FFT + Convolution + Convolved Last Axis IFFT Y - driver->bindComputePipeline(convolvePipeline.get()); - driver->bindDescriptorSets(EPBP_COMPUTE, convolvePipeline->getLayout(), 0u, 1u, &convolveDescriptorSet.get(), nullptr); - { - const auto& kernelImgExtent = kernelNormalizedSpectrums[0]->getCreationParameters().image->getCreationParameters().extent; - vec2 kernel_half_pixel_size{0.5f,0.5f}; - kernel_half_pixel_size.x /= kernelImgExtent.width; - kernel_half_pixel_size.y /= kernelImgExtent.height; - driver->pushConstants(convolvePipeline->getLayout(),ISpecializedShader::ESS_COMPUTE,offsetof(convolve_parameters_t,kernel_half_pixel_size),sizeof(convolve_parameters_t::kernel_half_pixel_size),&kernel_half_pixel_size); - } - FFTClass::dispatchHelper(driver, convolvePipeline->getLayout(), fftPushConstants[1], fftDispatchInfo[1]); - - // Last FFT Padding and Copy to GPU Image - driver->bindComputePipeline(lastFFTPipeline.get()); - driver->bindDescriptorSets(EPBP_COMPUTE, lastFFTPipeline->getLayout(), 0u, 1u, &lastFFTDescriptorSet.get(), nullptr); - { - const auto paddedSrcDim = FFTClass::padDimensions(marginSrcDim); - ivec2 unpad_offset = { 0,0 }; - for (auto i=0u; i<2u; i++) - if (fftDispatchInfo[2].workGroupCount[i]==1u) - (&unpad_offset.x)[i] = ((&paddedSrcDim.width)[i]-(&srcDim.width)[i])>>1u; - 
driver->pushConstants(lastFFTPipeline->getLayout(),ISpecializedShader::ESS_COMPUTE,offsetof(image_store_parameters_t,unpad_offset),sizeof(image_store_parameters_t::unpad_offset),&unpad_offset); - } - FFTClass::dispatchHelper(driver, lastFFTPipeline->getLayout(), fftPushConstants[2], fftDispatchInfo[2]); - - if(!savedToFile) - { - savedToFile = true; - - core::smart_refctd_ptr imageView; - const uint32_t colorBufferBytesize = srcDim.height * srcDim.width * asset::getTexelOrBlockBytesize(srcFormat); - - // create image - ICPUImage::SCreationParams imgParams; - imgParams.flags = static_cast(0u); // no flags - imgParams.type = ICPUImage::ET_2D; - imgParams.format = srcFormat; - imgParams.extent = srcDim; - imgParams.mipLevels = 1u; - imgParams.arrayLayers = 1u; - imgParams.samples = ICPUImage::ESCF_1_BIT; - auto image = ICPUImage::create(std::move(imgParams)); - - constexpr uint64_t timeoutInNanoSeconds = 300000000000u; - const auto waitPoint = std::chrono::high_resolution_clock::now()+std::chrono::nanoseconds(timeoutInNanoSeconds); - - uint32_t address = std::remove_pointer::type::invalid_address; // remember without initializing the address to be allocated to invalid_address you won't get an allocation! 
- const uint32_t alignment = 4096u; // common page size - auto unallocatedSize = downloadStagingArea->multi_alloc(waitPoint, 1u, &address, &colorBufferBytesize, &alignment); - if (unallocatedSize) - { - os::Printer::log("Could not download the buffer from the GPU!", ELL_ERROR); - } - - // set up regions - auto regions = core::make_refctd_dynamic_array >(1u); - { - auto& region = regions->front(); - - region.bufferOffset = 0u; - region.bufferRowLength = 0u; - region.bufferImageHeight = 0u; - //region.imageSubresource.aspectMask = wait for Vulkan; - region.imageSubresource.mipLevel = 0u; - region.imageSubresource.baseArrayLayer = 0u; - region.imageSubresource.layerCount = 1u; - region.imageOffset = { 0u,0u,0u }; - region.imageExtent = imgParams.extent; - } - - driver->copyImageToBuffer(outImg.get(), downloadStagingArea->getBuffer(), 1, ®ions->front()); - - auto downloadFence = driver->placeFence(true); - - auto* data = reinterpret_cast(downloadStagingArea->getBufferPointer()) + address; - auto cpubufferalias = core::make_smart_refctd_ptr > >(colorBufferBytesize, data, core::adopt_memory); - image->setBufferAndRegions(std::move(cpubufferalias),regions); - - // wait for download fence and then invalidate the CPU cache - { - auto result = downloadFence->waitCPU(timeoutInNanoSeconds,true); - if (result==E_DRIVER_FENCE_RETVAL::EDFR_TIMEOUT_EXPIRED||result==E_DRIVER_FENCE_RETVAL::EDFR_FAIL) - { - os::Printer::log("Could not download the buffer from the GPU, fence not signalled!", ELL_ERROR); - downloadStagingArea->multi_free(1u, &address, &colorBufferBytesize, nullptr); - continue; - } - if (downloadStagingArea->needsManualFlushOrInvalidate()) - driver->invalidateMappedMemoryRanges({{downloadStagingArea->getBuffer()->getBoundMemory(),address,colorBufferBytesize}}); - } - - // create image view - ICPUImageView::SCreationParams imgViewParams; - imgViewParams.flags = static_cast(0u); - imgViewParams.format = image->getCreationParameters().format; - imgViewParams.image = 
std::move(image); - imgViewParams.viewType = ICPUImageView::ET_2D; - imgViewParams.subresourceRange = {static_cast(0u),0u,1u,0u,1u}; - imageView = ICPUImageView::create(std::move(imgViewParams)); - - IAssetWriter::SAssetWriteParams wp(imageView.get()); - volatile bool success = am->writeAsset("convolved_exr.exr", wp); - assert(success); - } - - driver->blitRenderTargets(blitFBO, nullptr, false, false); - - driver->endScene(); - } - - return 0; -} \ No newline at end of file diff --git a/old_to_refactor/49_ComputeFFT/normalization.comp b/old_to_refactor/49_ComputeFFT/normalization.comp deleted file mode 100644 index b3926090d..000000000 --- a/old_to_refactor/49_ComputeFFT/normalization.comp +++ /dev/null @@ -1,34 +0,0 @@ -layout(local_size_x=16, local_size_y=16, local_size_z=1) in; - -#include - -layout(set=0, binding=0) restrict readonly buffer InBuffer -{ - nbl_glsl_ext_FFT_storage_t in_data[]; -}; - -layout(set=0, binding=1, rg16f) uniform image2D NormalizedKernel[3]; - -layout(push_constant) uniform PushConstants -{ - uvec4 strides; - uvec4 bitreverse_shift; -} pc; - -#include - -void main() -{ - nbl_glsl_complex value = nbl_glsl_ext_FFT_storage_t_get(in_data[nbl_glsl_dot(gl_GlobalInvocationID,pc.strides.xyz)]); - - // imaginary component will be 0, image shall be positive - vec3 avg; - for (uint i=0u; i<3u; i++) - avg[i] = nbl_glsl_ext_FFT_storage_t_get(in_data[pc.strides.z*i]).x; - const float power = (nbl_glsl_scRGBtoXYZ*avg).y; - - const uvec2 coord = bitfieldReverse(gl_GlobalInvocationID.xy)>>pc.bitreverse_shift.xy; - const nbl_glsl_complex shift = nbl_glsl_expImaginary(-nbl_glsl_PI*float(coord.x+coord.y)); - value = nbl_glsl_complex_mul(value,shift)/power; - imageStore(NormalizedKernel[gl_WorkGroupID.z],ivec2(coord),vec4(value,0.0,0.0)); -} \ No newline at end of file diff --git a/old_to_refactor/49_ComputeFFT/pipeline.groovy b/old_to_refactor/49_ComputeFFT/pipeline.groovy deleted file mode 100644 index 64874da2a..000000000 --- 
a/old_to_refactor/49_ComputeFFT/pipeline.groovy +++ /dev/null @@ -1,50 +0,0 @@ -import org.DevshGraphicsProgramming.Agent -import org.DevshGraphicsProgramming.BuilderInfo -import org.DevshGraphicsProgramming.IBuilder - -class CComputeFFTBuilder extends IBuilder -{ - public CComputeFFTBuilder(Agent _agent, _info) - { - super(_agent, _info) - } - - @Override - public boolean prepare(Map axisMapping) - { - return true - } - - @Override - public boolean build(Map axisMapping) - { - IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION") - IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE") - - def nameOfBuildDirectory = getNameOfBuildDirectory(buildType) - def nameOfConfig = getNameOfConfig(config) - - agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v") - - return true - } - - @Override - public boolean test(Map axisMapping) - { - return true - } - - @Override - public boolean install(Map axisMapping) - { - return true - } -} - -def create(Agent _agent, _info) -{ - return new CComputeFFTBuilder(_agent, _info) -} - -return this \ No newline at end of file diff --git a/60_ClusteredRendering/CMakeLists.txt b/old_to_refactor/60_ClusteredRendering/CMakeLists.txt similarity index 100% rename from 60_ClusteredRendering/CMakeLists.txt rename to old_to_refactor/60_ClusteredRendering/CMakeLists.txt diff --git a/60_ClusteredRendering/config.json.template b/old_to_refactor/60_ClusteredRendering/config.json.template similarity index 100% rename from 60_ClusteredRendering/config.json.template rename to old_to_refactor/60_ClusteredRendering/config.json.template diff --git a/60_ClusteredRendering/main.cpp b/old_to_refactor/60_ClusteredRendering/main.cpp similarity index 100% rename from 60_ClusteredRendering/main.cpp rename to old_to_refactor/60_ClusteredRendering/main.cpp diff --git a/60_ClusteredRendering/pipeline.groovy 
b/old_to_refactor/60_ClusteredRendering/pipeline.groovy similarity index 100% rename from 60_ClusteredRendering/pipeline.groovy rename to old_to_refactor/60_ClusteredRendering/pipeline.groovy From 24f952d2c5baa9bd890335591654b4c5e8b7669a Mon Sep 17 00:00:00 2001 From: devsh Date: Sat, 19 Apr 2025 13:10:10 +0200 Subject: [PATCH 120/296] count path depth properly, bug from unremoved testing code caused MIS=off at any bounce > 2 added a wishlist of TODOs --- .../app_resources/common.glsl | 16 +++++++++------- 30_ComputeShaderPathTracer/main.cpp | 5 +++-- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/30_ComputeShaderPathTracer/app_resources/common.glsl b/30_ComputeShaderPathTracer/app_resources/common.glsl index 2463f82cf..aaadae4a8 100644 --- a/30_ComputeShaderPathTracer/app_resources/common.glsl +++ b/30_ComputeShaderPathTracer/app_resources/common.glsl @@ -352,9 +352,9 @@ struct Payload_t vec3 accumulation; float otherTechniqueHeuristic; vec3 throughput; - #ifdef KILL_DIFFUSE_SPECULAR_PATHS +#ifdef KILL_DIFFUSE_SPECULAR_PATHS bool hasDiffuse; - #endif +#endif }; struct Ray_t @@ -491,6 +491,7 @@ layout (constant_id = 1) const int MAX_SAMPLES_LOG2 = 10; #include +// TODO: use PCG hash + XOROSHIRO and don't read any textures mat2x3 rand3d(in uint protoDimension, in uint _sample, inout nbl_glsl_xoroshiro64star_state_t scramble_state) { mat2x3 retval; @@ -552,6 +553,7 @@ nbl_glsl_LightSample nbl_glsl_light_generate_and_remainder_and_pdf(out vec3 rema } uint getBSDFLightIDAndDetermineNormal(out vec3 normal, in uint objectID, in vec3 intersection); +// returns whether to stop tracing bool closestHitProgram(in uint depth, in uint _sample, inout Ray_t ray, inout nbl_glsl_xoroshiro64star_state_t scramble_state) { const MutableRay_t _mutable = ray._mutable; @@ -602,7 +604,7 @@ bool closestHitProgram(in uint depth, in uint _sample, inout Ray_t ray, inout nb const bool isBSDF = BSDFNode_isBSDF(bsdf); //rand - mat2x3 epsilon = 
rand3d(depth,_sample,scramble_state); + mat2x3 epsilon = rand3d(depth*2,_sample,scramble_state); // thresholds const float bsdfPdfThreshold = 0.0001; @@ -613,7 +615,7 @@ bool closestHitProgram(in uint depth, in uint _sample, inout Ray_t ray, inout nb // do NEE const float neeProbability = 1.0;// BSDFNode_getNEEProb(bsdf); float rcpChoiceProb; - if (!nbl_glsl_partitionRandVariable(neeProbability,epsilon[0].z,rcpChoiceProb) && depth<2u) + if (!nbl_glsl_partitionRandVariable(neeProbability,epsilon[0].z,rcpChoiceProb)) { vec3 neeContrib; float lightPdf, t; nbl_glsl_LightSample nee_sample = nbl_glsl_light_generate_and_remainder_and_pdf( @@ -748,15 +750,15 @@ void main() ray._payload.accumulation = vec3(0.0); ray._payload.otherTechniqueHeuristic = 0.0; // needed for direct eye-light paths ray._payload.throughput = vec3(1.0); - #ifdef KILL_DIFFUSE_SPECULAR_PATHS +#ifdef KILL_DIFFUSE_SPECULAR_PATHS ray._payload.hasDiffuse = false; - #endif +#endif } // bounces { bool hit = true; bool rayAlive = true; - for (int d=1; d<=PTPushConstant.depth && hit && rayAlive; d+=2) + for (int d=1; d<=PTPushConstant.depth && hit && rayAlive; d++) { ray._mutable.intersectionT = nbl_glsl_FLT_MAX; ray._mutable.objectID = traceRay(ray._mutable.intersectionT,ray._immutable.origin,ray._immutable.direction); diff --git a/30_ComputeShaderPathTracer/main.cpp b/30_ComputeShaderPathTracer/main.cpp index 26d673002..ed93cf81f 100644 --- a/30_ComputeShaderPathTracer/main.cpp +++ b/30_ComputeShaderPathTracer/main.cpp @@ -15,13 +15,14 @@ using namespace asset; using namespace ui; using namespace video; +// TODO: share push constants struct PTPushConstant { matrix4SIMD invMVP; int sampleCount; int depth; }; -// TODO: Add a QueryPool for timestamping once its ready +// TODO: Add a QueryPool for timestamping once its ready (actually add IMGUI mspf plotter) // TODO: Do buffer creation using assConv class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication, public 
application_templates::MonoAssetManagerAndBuiltinResourceApplication { @@ -859,7 +860,7 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication ImGui::SliderFloat("zFar", &zFar, 110.f, 10000.f); ImGui::ListBox("Shader", &PTPipline, shaderNames, E_LIGHT_GEOMETRY::ELG_COUNT); ImGui::SliderInt("SPP", &spp, 1, MaxBufferSamples); - ImGui::SliderInt("Depth", &depth, 1, MaxBufferDimensions / 3); + ImGui::SliderInt("Depth", &depth, 1, MaxBufferDimensions / 6); ImGui::Text("X: %f Y: %f", io.MousePos.x, io.MousePos.y); From 3253b6f7c43042562158acde5924a7fdbef8f7cc Mon Sep 17 00:00:00 2001 From: devsh Date: Sat, 19 Apr 2025 13:39:06 +0200 Subject: [PATCH 121/296] fix NEE_ONLY setting --- .../app_resources/common.glsl | 55 +++++++++++-------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/30_ComputeShaderPathTracer/app_resources/common.glsl b/30_ComputeShaderPathTracer/app_resources/common.glsl index aaadae4a8..65ed0609e 100644 --- a/30_ComputeShaderPathTracer/app_resources/common.glsl +++ b/30_ComputeShaderPathTracer/app_resources/common.glsl @@ -596,7 +596,7 @@ bool closestHitProgram(in uint depth, in uint _sample, inout Ray_t ray, inout nb if (BSDFNode_isNotDiffuse(bsdf)) { if (ray._payload.hasDiffuse) - return true; + return false; } else ray._payload.hasDiffuse = true; @@ -613,47 +613,55 @@ bool closestHitProgram(in uint depth, in uint _sample, inout Ray_t ray, inout nb const float monochromeEta = dot(throughputCIE_Y,BSDFNode_getEta(bsdf)[0])/(throughputCIE_Y.r+throughputCIE_Y.g+throughputCIE_Y.b); // do NEE - const float neeProbability = 1.0;// BSDFNode_getNEEProb(bsdf); +#ifndef NEE_ONLY + // to turn off NEE, set this to 0 + const float neeProbability = BSDFNode_getNEEProb(bsdf); float rcpChoiceProb; if (!nbl_glsl_partitionRandVariable(neeProbability,epsilon[0].z,rcpChoiceProb)) { +#endif vec3 neeContrib; float lightPdf, t; nbl_glsl_LightSample nee_sample = nbl_glsl_light_generate_and_remainder_and_pdf( neeContrib, lightPdf, 
t, intersection, interaction, isBSDF, epsilon[0], depth ); - // We don't allow non watertight transmitters in this renderer + // We don't allow non watertight transmitters in this renderer & scene, one cannot reach a light from the backface (optimization) bool validPath = nee_sample.NdotL>nbl_glsl_FLT_MIN; // but if we allowed non-watertight transmitters (single water surface), it would make sense just to apply this line by itself nbl_glsl_AnisotropicMicrofacetCache _cache; validPath = validPath && nbl_glsl_calcAnisotropicMicrofacetCache(_cache, interaction, nee_sample, monochromeEta); + // infinite PDF would mean a point light or a thin line, but our lights have finite radiance per steradian (area lights) if (lightPdflumaContributionThreshold && traceRay(t,intersection+nee_sample.L*t*getStartTolerance(depth),nee_sample.L)==-1) - ray._payload.accumulation += neeContrib; - }} + if (bsdfPdflumaContributionThreshold && traceRay(t,intersection+nee_sample.L*t*getStartTolerance(depth),nee_sample.L)==-1) + ray._payload.accumulation += neeContrib; + } + } +#ifndef NEE_ONLY } -#if NEE_ONLY - return false; -#endif + // sample BSDF float bsdfPdf; vec3 bsdfSampleL; { @@ -682,6 +690,7 @@ bool closestHitProgram(in uint depth, in uint _sample, inout Ray_t ray, inout nb #endif return true; } +#endif } return false; } From c699bd086acd886745943cf478e9bf1160d36212 Mon Sep 17 00:00:00 2001 From: devsh Date: Sat, 19 Apr 2025 13:45:29 +0200 Subject: [PATCH 122/296] Remove legacy deprecated/reimplemented examples --- 42_FragmentShaderPathTracer/CMakeLists.txt | 7 - 42_FragmentShaderPathTracer/common.glsl | 812 ------------ .../config.json.template | 28 - .../litByRectangle.comp | 182 --- 42_FragmentShaderPathTracer/litBySphere.comp | 60 - .../litByTriangle.comp | 105 -- 42_FragmentShaderPathTracer/main.cpp | 693 ---------- 42_FragmentShaderPathTracer/pipeline.groovy | 50 - 53_ComputeShaders/CMakeLists.txt | 6 - 53_ComputeShaders/computeShader.comp | 95 -- 
53_ComputeShaders/config.json.template | 28 - 53_ComputeShaders/fragmentShader.frag | 12 - 53_ComputeShaders/geometryShader.geom | 27 - 53_ComputeShaders/main.cpp | 694 ---------- 53_ComputeShaders/pipeline.groovy | 50 - 53_ComputeShaders/shaderCommon.glsl | 6 - 53_ComputeShaders/vertexShader.vert | 23 - 56_RayQuery/CMakeLists.txt | 7 - 56_RayQuery/common.glsl | 793 ----------- 56_RayQuery/config.json.template | 28 - 56_RayQuery/litByRectangle.comp | 106 -- 56_RayQuery/litBySphere.comp | 61 - 56_RayQuery/litByTriangle.comp | 105 -- 56_RayQuery/main.cpp | 1156 ----------------- 56_RayQuery/pipeline.groovy | 50 - CMakeLists.txt | 1 - 26 files changed, 5185 deletions(-) delete mode 100644 42_FragmentShaderPathTracer/CMakeLists.txt delete mode 100644 42_FragmentShaderPathTracer/common.glsl delete mode 100644 42_FragmentShaderPathTracer/config.json.template delete mode 100644 42_FragmentShaderPathTracer/litByRectangle.comp delete mode 100644 42_FragmentShaderPathTracer/litBySphere.comp delete mode 100644 42_FragmentShaderPathTracer/litByTriangle.comp delete mode 100644 42_FragmentShaderPathTracer/main.cpp delete mode 100644 42_FragmentShaderPathTracer/pipeline.groovy delete mode 100644 53_ComputeShaders/CMakeLists.txt delete mode 100644 53_ComputeShaders/computeShader.comp delete mode 100644 53_ComputeShaders/config.json.template delete mode 100644 53_ComputeShaders/fragmentShader.frag delete mode 100644 53_ComputeShaders/geometryShader.geom delete mode 100644 53_ComputeShaders/main.cpp delete mode 100644 53_ComputeShaders/pipeline.groovy delete mode 100644 53_ComputeShaders/shaderCommon.glsl delete mode 100644 53_ComputeShaders/vertexShader.vert delete mode 100644 56_RayQuery/CMakeLists.txt delete mode 100644 56_RayQuery/common.glsl delete mode 100644 56_RayQuery/config.json.template delete mode 100644 56_RayQuery/litByRectangle.comp delete mode 100644 56_RayQuery/litBySphere.comp delete mode 100644 56_RayQuery/litByTriangle.comp delete mode 100644 56_RayQuery/main.cpp 
delete mode 100644 56_RayQuery/pipeline.groovy diff --git a/42_FragmentShaderPathTracer/CMakeLists.txt b/42_FragmentShaderPathTracer/CMakeLists.txt deleted file mode 100644 index a476b6203..000000000 --- a/42_FragmentShaderPathTracer/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ - -include(common RESULT_VARIABLE RES) -if(NOT RES) - message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory") -endif() - -nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") \ No newline at end of file diff --git a/42_FragmentShaderPathTracer/common.glsl b/42_FragmentShaderPathTracer/common.glsl deleted file mode 100644 index 20f7a7359..000000000 --- a/42_FragmentShaderPathTracer/common.glsl +++ /dev/null @@ -1,812 +0,0 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". -// For conditions of distribution and use, see copyright notice in nabla.h - -// basic settings -#define MAX_DEPTH 3 -#define SAMPLES 128 - -// firefly and variance reduction techniques -//#define KILL_DIFFUSE_SPECULAR_PATHS -//#define VISUALIZE_HIGH_VARIANCE - -layout(set = 2, binding = 0) uniform sampler2D envMap; -layout(set = 2, binding = 1) uniform usamplerBuffer sampleSequence; -layout(set = 2, binding = 2) uniform usampler2D scramblebuf; - -layout(set=0, binding=0, rgba16f) uniform image2D outImage; - -#ifndef _NBL_GLSL_WORKGROUP_SIZE_ -#define _NBL_GLSL_WORKGROUP_SIZE_ 16 -layout(local_size_x=_NBL_GLSL_WORKGROUP_SIZE_, local_size_y=_NBL_GLSL_WORKGROUP_SIZE_, local_size_z=1) in; -#endif - -ivec2 getCoordinates() { - return ivec2(gl_GlobalInvocationID.xy); -} - -vec2 getTexCoords() { - ivec2 imageSize = imageSize(outImage); - ivec2 iCoords = getCoordinates(); - return vec2(float(iCoords.x) / imageSize.x, 1.0 - float(iCoords.y) / imageSize.y); -} - - -#include -#include -#include -#include - -#include - -layout(set = 1, binding = 0, row_major, std140) uniform UBO -{ - 
nbl_glsl_SBasicViewParameters params; -} cameraData; - - -#define INVALID_ID_16BIT 0xffffu -struct Sphere -{ - vec3 position; - float radius2; - uint bsdfLightIDs; -}; - -Sphere Sphere_Sphere(in vec3 position, in float radius, in uint bsdfID, in uint lightID) -{ - Sphere sphere; - sphere.position = position; - sphere.radius2 = radius*radius; - sphere.bsdfLightIDs = bitfieldInsert(bsdfID,lightID,16,16); - return sphere; -} - -// return intersection distance if found, nbl_glsl_FLT_NAN otherwise -float Sphere_intersect(in Sphere sphere, in vec3 origin, in vec3 direction) -{ - vec3 relOrigin = origin-sphere.position; - float relOriginLen2 = dot(relOrigin,relOrigin); - const float radius2 = sphere.radius2; - - float dirDotRelOrigin = dot(direction,relOrigin); - float det = radius2-relOriginLen2+dirDotRelOrigin*dirDotRelOrigin; - - // do some speculative math here - float detsqrt = sqrt(det); - return -dirDotRelOrigin+(relOriginLen2>radius2 ? (-detsqrt):detsqrt); -} - -vec3 Sphere_getNormal(in Sphere sphere, in vec3 position) -{ - const float radiusRcp = inversesqrt(sphere.radius2); - return (position-sphere.position)*radiusRcp; -} - -float Sphere_getSolidAngle_impl(in float cosThetaMax) -{ - return 2.0*nbl_glsl_PI*(1.0-cosThetaMax); -} -float Sphere_getSolidAngle(in Sphere sphere, in vec3 origin) -{ - float cosThetaMax = sqrt(1.0-sphere.radius2/nbl_glsl_lengthSq(sphere.position-origin)); - return Sphere_getSolidAngle_impl(cosThetaMax); -} - - -Sphere spheres[SPHERE_COUNT] = { - Sphere_Sphere(vec3(0.0,-100.5,-1.0),100.0,0u,INVALID_ID_16BIT), - Sphere_Sphere(vec3(2.0,0.0,-1.0),0.5,1u,INVALID_ID_16BIT), - Sphere_Sphere(vec3(0.0,0.0,-1.0),0.5,2u,INVALID_ID_16BIT), - Sphere_Sphere(vec3(-2.0,0.0,-1.0),0.5,3u,INVALID_ID_16BIT), - Sphere_Sphere(vec3(2.0,0.0,1.0),0.5,4u,INVALID_ID_16BIT), - Sphere_Sphere(vec3(0.0,0.0,1.0),0.5,4u,INVALID_ID_16BIT), - Sphere_Sphere(vec3(-2.0,0.0,1.0),0.5,5u,INVALID_ID_16BIT), - Sphere_Sphere(vec3(0.5,1.0,0.5),0.5,6u,INVALID_ID_16BIT) -#if 
SPHERE_COUNT>8 - ,Sphere_Sphere(vec3(-1.5,1.5,0.0),0.3,INVALID_ID_16BIT,0u) -#endif -}; - - -struct Triangle -{ - vec3 vertex0; - uint bsdfLightIDs; - vec3 vertex1; - uint padding0; - vec3 vertex2; - uint padding1; -}; - -Triangle Triangle_Triangle(in mat3 vertices, in uint bsdfID, in uint lightID) -{ - Triangle tri; - tri.vertex0 = vertices[0]; - tri.vertex1 = vertices[1]; - tri.vertex2 = vertices[2]; - // - tri.bsdfLightIDs = bitfieldInsert(bsdfID, lightID, 16, 16); - return tri; -} - -// return intersection distance if found, nbl_glsl_FLT_NAN otherwise -float Triangle_intersect(in Triangle tri, in vec3 origin, in vec3 direction) -{ - const vec3 edges[2] = vec3[2](tri.vertex1-tri.vertex0,tri.vertex2-tri.vertex0); - - const vec3 h = cross(direction,edges[1]); - const float a = dot(edges[0],h); - - const vec3 relOrigin = origin-tri.vertex0; - - const float u = dot(relOrigin,h)/a; - - const vec3 q = cross(relOrigin,edges[0]); - const float v = dot(direction,q)/a; - - const float t = dot(edges[1],q)/a; - - return t>0.f&&u>=0.f&&v>=0.f&&(u+v)<=1.f ? 
t:nbl_glsl_FLT_NAN; -} - -vec3 Triangle_getNormalTimesArea_impl(in mat2x3 edges) -{ - return cross(edges[0],edges[1])*0.5; -} -vec3 Triangle_getNormalTimesArea(in Triangle tri) -{ - return Triangle_getNormalTimesArea_impl(mat2x3(tri.vertex1-tri.vertex0,tri.vertex2-tri.vertex0)); -} - - - -struct Rectangle -{ - vec3 offset; - uint bsdfLightIDs; - vec3 edge0; - uint padding0; - vec3 edge1; - uint padding1; -}; - -Rectangle Rectangle_Rectangle(in vec3 offset, in vec3 edge0, in vec3 edge1, in uint bsdfID, in uint lightID) -{ - Rectangle rect; - rect.offset = offset; - rect.edge0 = edge0; - rect.edge1 = edge1; - // - rect.bsdfLightIDs = bitfieldInsert(bsdfID, lightID, 16, 16); - return rect; -} - -void Rectangle_getNormalBasis(in Rectangle rect, out mat3 basis, out vec2 extents) -{ - extents = vec2(length(rect.edge0), length(rect.edge1)); - basis[0] = rect.edge0/extents[0]; - basis[1] = rect.edge1/extents[1]; - basis[2] = normalize(cross(basis[0],basis[1])); -} - -// return intersection distance if found, nbl_glsl_FLT_NAN otherwise -float Rectangle_intersect(in Rectangle rect, in vec3 origin, in vec3 direction) -{ - const vec3 h = cross(direction,rect.edge1); - const float a = dot(rect.edge0,h); - - const vec3 relOrigin = origin-rect.offset; - - const float u = dot(relOrigin,h)/a; - - const vec3 q = cross(relOrigin,rect.edge0); - const float v = dot(direction,q)/a; - - const float t = dot(rect.edge1,q)/a; - - const bool intersection = t>0.f&&u>=0.f&&v>=0.f&&u<=1.f&&v<=1.f; - return intersection ? 
t:nbl_glsl_FLT_NAN; -} - -vec3 Rectangle_getNormalTimesArea(in Rectangle rect) -{ - return cross(rect.edge0,rect.edge1); -} - - - -#define DIFFUSE_OP 0u -#define CONDUCTOR_OP 1u -#define DIELECTRIC_OP 2u -#define OP_BITS_OFFSET 0 -#define OP_BITS_SIZE 2 -struct BSDFNode -{ - uvec4 data[2]; -}; - -uint BSDFNode_getType(in BSDFNode node) -{ - return bitfieldExtract(node.data[0].w,OP_BITS_OFFSET,OP_BITS_SIZE); -} -bool BSDFNode_isBSDF(in BSDFNode node) -{ - return BSDFNode_getType(node)==DIELECTRIC_OP; -} -bool BSDFNode_isNotDiffuse(in BSDFNode node) -{ - return BSDFNode_getType(node)!=DIFFUSE_OP; -} -float BSDFNode_getRoughness(in BSDFNode node) -{ - return uintBitsToFloat(node.data[1].w); -} -vec3 BSDFNode_getRealEta(in BSDFNode node) -{ - return uintBitsToFloat(node.data[0].rgb); -} -vec3 BSDFNode_getImaginaryEta(in BSDFNode node) -{ - return uintBitsToFloat(node.data[1].rgb); -} -mat2x3 BSDFNode_getEta(in BSDFNode node) -{ - return mat2x3(BSDFNode_getRealEta(node),BSDFNode_getImaginaryEta(node)); -} -#include -vec3 BSDFNode_getReflectance(in BSDFNode node, in float VdotH) -{ - const vec3 albedoOrRealIoR = uintBitsToFloat(node.data[0].rgb); - if (BSDFNode_isNotDiffuse(node)) - return nbl_glsl_fresnel_conductor(albedoOrRealIoR, BSDFNode_getImaginaryEta(node), VdotH); - else - return albedoOrRealIoR; -} - -float BSDFNode_getNEEProb(in BSDFNode bsdf) -{ - const float alpha = BSDFNode_isNotDiffuse(bsdf) ? 
BSDFNode_getRoughness(bsdf):1.0; - return min(8.0*alpha,1.0); -} - -#include -#include -float getLuma(in vec3 col) -{ - return dot(transpose(nbl_glsl_scRGBtoXYZ)[1],col); -} - -#define BSDF_COUNT 7 -BSDFNode bsdfs[BSDF_COUNT] = { - {{uvec4(floatBitsToUint(vec3(0.8,0.8,0.8)),DIFFUSE_OP),floatBitsToUint(vec4(0.0,0.0,0.0,0.0))}}, - {{uvec4(floatBitsToUint(vec3(0.8,0.4,0.4)),DIFFUSE_OP),floatBitsToUint(vec4(0.0,0.0,0.0,0.0))}}, - {{uvec4(floatBitsToUint(vec3(0.4,0.8,0.4)),DIFFUSE_OP),floatBitsToUint(vec4(0.0,0.0,0.0,0.0))}}, - {{uvec4(floatBitsToUint(vec3(1.02,1.02,1.3)),CONDUCTOR_OP),floatBitsToUint(vec4(1.0,1.0,2.0,0.0))}}, - {{uvec4(floatBitsToUint(vec3(1.02,1.3,1.02)),CONDUCTOR_OP),floatBitsToUint(vec4(1.0,2.0,1.0,0.0))}}, - {{uvec4(floatBitsToUint(vec3(1.02,1.3,1.02)),CONDUCTOR_OP),floatBitsToUint(vec4(1.0,2.0,1.0,0.15))}}, - {{uvec4(floatBitsToUint(vec3(1.4,1.45,1.5)),DIELECTRIC_OP),floatBitsToUint(vec4(0.0,0.0,0.0,0.0625))}} -}; - - -struct Light -{ - vec3 radiance; - uint objectID; -}; - -vec3 Light_getRadiance(in Light light) -{ - return light.radiance; -} -uint Light_getObjectID(in Light light) -{ - return light.objectID; -} - - -#define LIGHT_COUNT 1 -float scene_getLightChoicePdf(in Light light) -{ - return 1.0/float(LIGHT_COUNT); -} - - -#define LIGHT_COUNT 1 -Light lights[LIGHT_COUNT] = -{ - { - vec3(30.0,25.0,15.0), -#ifdef POLYGON_METHOD - 0u -#else - 8u -#endif - } -}; - - - -#define ANY_HIT_FLAG (-2147483648) -#define DEPTH_BITS_COUNT 8 -#define DEPTH_BITS_OFFSET (31-DEPTH_BITS_COUNT) -struct ImmutableRay_t -{ - vec3 origin; - vec3 direction; -#if POLYGON_METHOD==2 - vec3 normalAtOrigin; - bool wasBSDFAtOrigin; -#endif -}; -struct MutableRay_t -{ - float intersectionT; - uint objectID; - /* irrelevant here - uint triangleID; - vec2 barycentrics; - */ -}; -struct Payload_t -{ - vec3 accumulation; - float otherTechniqueHeuristic; - vec3 throughput; - #ifdef KILL_DIFFUSE_SPECULAR_PATHS - bool hasDiffuse; - #endif -}; - -struct Ray_t -{ - ImmutableRay_t 
_immutable; - MutableRay_t _mutable; - Payload_t _payload; -}; - - -#define INTERSECTION_ERROR_BOUND_LOG2 (-8.0) -float getTolerance_common(in uint depth) -{ - float depthRcp = 1.0/float(depth); - return INTERSECTION_ERROR_BOUND_LOG2;// *depthRcp*depthRcp; -} -float getStartTolerance(in uint depth) -{ - return exp2(getTolerance_common(depth)); -} -float getEndTolerance(in uint depth) -{ - return 1.0-exp2(getTolerance_common(depth)+1.0); -} - - -vec2 SampleSphericalMap(vec3 v) -{ - vec2 uv = vec2(atan(v.z, v.x), asin(v.y)); - uv *= nbl_glsl_RECIPROCAL_PI*0.5; - uv += 0.5; - return uv; -} - -void missProgram(in ImmutableRay_t _immutable, inout Payload_t _payload) -{ - vec3 finalContribution = _payload.throughput; - // #define USE_ENVMAP -#ifdef USE_ENVMAP - vec2 uv = SampleSphericalMap(_immutable.direction); - finalContribution *= textureLod(envMap, uv, 0.0).rgb; -#else - const vec3 kConstantEnvLightRadiance = vec3(0.15, 0.21, 0.3); - finalContribution *= kConstantEnvLightRadiance; - _payload.accumulation += finalContribution; -#endif -} - -#include -#include -#include -#include -#include -#include -#include -nbl_glsl_LightSample nbl_glsl_bsdf_cos_generate(in nbl_glsl_AnisotropicViewSurfaceInteraction interaction, in vec3 u, in BSDFNode bsdf, in float monochromeEta, out nbl_glsl_AnisotropicMicrofacetCache _cache) -{ - const float a = BSDFNode_getRoughness(bsdf); - const mat2x3 ior = BSDFNode_getEta(bsdf); - - // fresnel stuff for dielectrics - float orientedEta, rcpOrientedEta; - const bool viewerInsideMedium = nbl_glsl_getOrientedEtas(orientedEta,rcpOrientedEta,interaction.isotropic.NdotV,monochromeEta); - - nbl_glsl_LightSample smpl; - nbl_glsl_AnisotropicMicrofacetCache dummy; - switch (BSDFNode_getType(bsdf)) - { - case DIFFUSE_OP: - smpl = nbl_glsl_oren_nayar_cos_generate(interaction,u.xy,a*a); - break; - case CONDUCTOR_OP: - smpl = nbl_glsl_ggx_cos_generate(interaction,u.xy,a,a,_cache); - break; - default: - smpl = 
nbl_glsl_ggx_dielectric_cos_generate(interaction,u,a,a,monochromeEta,_cache); - break; - } - return smpl; -} - -vec3 nbl_glsl_bsdf_cos_remainder_and_pdf(out float pdf, in nbl_glsl_LightSample _sample, in nbl_glsl_AnisotropicViewSurfaceInteraction interaction, in BSDFNode bsdf, in float monochromeEta, in nbl_glsl_AnisotropicMicrofacetCache _cache) -{ - // are V and L on opposite sides of the surface? - const bool transmitted = nbl_glsl_isTransmissionPath(interaction.isotropic.NdotV,_sample.NdotL); - - // is the BSDF or BRDF, if it is then we make the dot products `abs` before `max(,0.0)` - const bool transmissive = BSDFNode_isBSDF(bsdf); - const float clampedNdotL = nbl_glsl_conditionalAbsOrMax(transmissive,_sample.NdotL,0.0); - const float clampedNdotV = nbl_glsl_conditionalAbsOrMax(transmissive,interaction.isotropic.NdotV,0.0); - - vec3 remainder; - - const float minimumProjVectorLen = 0.00000001; - if (clampedNdotV>minimumProjVectorLen && clampedNdotL>minimumProjVectorLen) - { - // fresnel stuff for conductors (but reflectance also doubles as albedo) - const mat2x3 ior = BSDFNode_getEta(bsdf); - const vec3 reflectance = BSDFNode_getReflectance(bsdf,_cache.isotropic.VdotH); - - // fresnel stuff for dielectrics - float orientedEta, rcpOrientedEta; - const bool viewerInsideMedium = nbl_glsl_getOrientedEtas(orientedEta,rcpOrientedEta,interaction.isotropic.NdotV,monochromeEta); - - // - const float VdotL = dot(interaction.isotropic.V.dir,_sample.L); - - // - const float a = max(BSDFNode_getRoughness(bsdf),0.0001); // TODO: @Crisspl 0-roughness still doesn't work! Also Beckmann has a weird dark rim instead as fresnel!? 
- const float a2 = a*a; - - // TODO: refactor into Material Compiler-esque thing - switch (BSDFNode_getType(bsdf)) - { - case DIFFUSE_OP: - remainder = reflectance*nbl_glsl_oren_nayar_cos_remainder_and_pdf_wo_clamps(pdf,a*a,VdotL,clampedNdotL,clampedNdotV); - break; - case CONDUCTOR_OP: - remainder = nbl_glsl_ggx_cos_remainder_and_pdf_wo_clamps(pdf,nbl_glsl_ggx_trowbridge_reitz(a2,_cache.isotropic.NdotH2),clampedNdotL,_sample.NdotL2,clampedNdotV,interaction.isotropic.NdotV_squared,reflectance,a2); - break; - default: - remainder = vec3(nbl_glsl_ggx_dielectric_cos_remainder_and_pdf(pdf, _sample, interaction.isotropic, _cache.isotropic, monochromeEta, a*a)); - break; - } - } - else - remainder = vec3(0.0); - return remainder; -} - -layout (constant_id = 0) const int MAX_DEPTH_LOG2 = 4; -layout (constant_id = 1) const int MAX_SAMPLES_LOG2 = 10; - - -#include - -mat2x3 rand3d(in uint protoDimension, in uint _sample, inout nbl_glsl_xoroshiro64star_state_t scramble_state) -{ - mat2x3 retval; - uint address = bitfieldInsert(protoDimension,_sample,MAX_DEPTH_LOG2,MAX_SAMPLES_LOG2); - for (int i=0; i<2u; i++) - { - uvec3 seqVal = texelFetch(sampleSequence,int(address)+i).xyz; - seqVal ^= uvec3(nbl_glsl_xoroshiro64star(scramble_state),nbl_glsl_xoroshiro64star(scramble_state),nbl_glsl_xoroshiro64star(scramble_state)); - retval[i] = vec3(seqVal)*uintBitsToFloat(0x2f800004u); - } - return retval; -} - - -void traceRay_extraShape(inout int objectID, inout float intersectionT, in vec3 origin, in vec3 direction); -int traceRay(inout float intersectionT, in vec3 origin, in vec3 direction) -{ - const bool anyHit = intersectionT!=nbl_glsl_FLT_MAX; - - int objectID = -1; - for (int i=0; i0.0 && tnbl_glsl_FLT_MIN; - // but if we allowed non-watertight transmitters (single water surface), it would make sense just to apply this line by itself - nbl_glsl_AnisotropicMicrofacetCache _cache; - validPath = validPath && nbl_glsl_calcAnisotropicMicrofacetCache(_cache, interaction, nee_sample, 
monochromeEta); - if (lightPdflumaContributionThreshold && traceRay(t,intersection+nee_sample.L*t*getStartTolerance(depth),nee_sample.L)==-1) - ray._payload.accumulation += neeContrib; - }} - } -#if 1 - return false; -#endif - // sample BSDF - float bsdfPdf; vec3 bsdfSampleL; - { - nbl_glsl_AnisotropicMicrofacetCache _cache; - nbl_glsl_LightSample bsdf_sample = nbl_glsl_bsdf_cos_generate(interaction,epsilon[1],bsdf,monochromeEta,_cache); - // the value of the bsdf divided by the probability of the sample being generated - throughput *= nbl_glsl_bsdf_cos_remainder_and_pdf(bsdfPdf,bsdf_sample,interaction,bsdf,monochromeEta,_cache); - // - bsdfSampleL = bsdf_sample.L; - } - - // additional threshold - const float lumaThroughputThreshold = lumaContributionThreshold; - if (bsdfPdf>bsdfPdfThreshold && getLuma(throughput)>lumaThroughputThreshold) - { - ray._payload.throughput = throughput; - ray._payload.otherTechniqueHeuristic = neeProbability/bsdfPdf; // numerically stable, don't touch - ray._payload.otherTechniqueHeuristic *= ray._payload.otherTechniqueHeuristic; - - // trace new ray - ray._immutable.origin = intersection+bsdfSampleL*(1.0/*kSceneSize*/)*getStartTolerance(depth); - ray._immutable.direction = bsdfSampleL; - #if POLYGON_METHOD==2 - ray._immutable.normalAtOrigin = interaction.isotropic.N; - ray._immutable.wasBSDFAtOrigin = isBSDF; - #endif - return true; - } - } - return false; -} - -void main() -{ - const ivec2 imageExtents = imageSize(outImage); - const ivec2 coords = getCoordinates(); - vec2 texCoord = vec2(coords) / vec2(imageExtents); - texCoord.y = 1.0 - texCoord.y; - - if (false == (all(lessThanEqual(ivec2(0),coords)) && all(greaterThan(imageExtents,coords)))) { - return; - } - - if (((MAX_DEPTH-1)>>MAX_DEPTH_LOG2)>0 || ((SAMPLES-1)>>MAX_SAMPLES_LOG2)>0) - { - vec4 pixelCol = vec4(1.0,0.0,0.0,1.0); - imageStore(outImage, coords, pixelCol); - return; - } - - nbl_glsl_xoroshiro64star_state_t scramble_start_state = texelFetch(scramblebuf,coords,0).rg; 
- const vec2 pixOffsetParam = vec2(1.0)/vec2(textureSize(scramblebuf,0)); - - - const mat4 invMVP = inverse(cameraData.params.MVP); - - vec4 NDC = vec4(texCoord*vec2(2.0,-2.0)+vec2(-1.0,1.0),0.0,1.0); - vec3 camPos; - { - vec4 tmp = invMVP*NDC; - camPos = tmp.xyz/tmp.w; - NDC.z = 1.0; - } - - vec3 color = vec3(0.0); - float meanLumaSquared = 0.0; - // TODO: if we collapse the nested for loop, then all GPUs will get `MAX_DEPTH` factor speedup, not just NV with separate PC - for (int i=0; i5.0) - color = vec3(1.0,0.0,0.0); - #endif - - vec4 pixelCol = vec4(color, 1.0); - imageStore(outImage, coords, pixelCol); -} -/** TODO: Improving Rendering - -Now: -- Always MIS (path correlated reuse) -- Test MIS alpha (roughness) scheme - -Many Lights: -- Path Guiding -- Light Importance Lists/Classification -- Spatio-Temporal Reservoir Sampling - -Indirect Light: -- Bidirectional Path Tracing -- Uniform Path Sampling / Vertex Connection and Merging / Path Space Regularization - -Animations: -- A-SVGF / BMFR -**/ \ No newline at end of file diff --git a/42_FragmentShaderPathTracer/config.json.template b/42_FragmentShaderPathTracer/config.json.template deleted file mode 100644 index f961745c1..000000000 --- a/42_FragmentShaderPathTracer/config.json.template +++ /dev/null @@ -1,28 +0,0 @@ -{ - "enableParallelBuild": true, - "threadsPerBuildProcess" : 2, - "isExecuted": false, - "scriptPath": "", - "cmake": { - "configurations": [ "Release", "Debug", "RelWithDebInfo" ], - "buildModes": [], - "requiredOptions": [] - }, - "profiles": [ - { - "backend": "vulkan", - "platform": "windows", - "buildModes": [], - "runConfiguration": "Release", - "gpuArchitectures": [] - } - ], - "dependencies": [], - "data": [ - { - "dependencies": [], - "command": [""], - "outputs": [] - } - ] -} \ No newline at end of file diff --git a/42_FragmentShaderPathTracer/litByRectangle.comp b/42_FragmentShaderPathTracer/litByRectangle.comp deleted file mode 100644 index 300cef559..000000000 --- 
a/42_FragmentShaderPathTracer/litByRectangle.comp +++ /dev/null @@ -1,182 +0,0 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". -// For conditions of distribution and use, see copyright notice in nabla.h - -#version 430 core -#extension GL_GOOGLE_include_directive : require - -#define SPHERE_COUNT 8 -#define POLYGON_METHOD 1 // 0 area sampling, 1 solid angle sampling, 2 approximate projected solid angle sampling -#include "common.glsl" - -#define RECTANGLE_COUNT 1 -const vec3 edge0 = normalize(vec3(2,0,-1)); -const vec3 edge1 = normalize(vec3(2,-5,4)); -Rectangle rectangles[RECTANGLE_COUNT] = { - Rectangle_Rectangle(vec3(-3.8,0.35,1.3),edge0*7.0,edge1*0.1,INVALID_ID_16BIT,0u) -}; - - -void traceRay_extraShape(inout int objectID, inout float intersectionT, in vec3 origin, in vec3 direction) -{ - for (int i=0; i0.0 && t -#include -#include - -float nbl_glsl_light_deferred_pdf(in Light light, in Ray_t ray) -{ - const Rectangle rect = rectangles[Light_getObjectID(light)]; - - const ImmutableRay_t _immutable = ray._immutable; - const vec3 L = _immutable.direction; -#if POLYGON_METHOD==0 - const float dist = ray._mutable.intersectionT; - return dist*dist/abs(dot(Rectangle_getNormalTimesArea(rect),L)); -#else - #ifdef TRIANGLE_REFERENCE - const mat3 sphericalVertices[2] = - { - nbl_glsl_shapes_getSphericalTriangle(mat3(rect.offset,rect.offset+rect.edge0,rect.offset+rect.edge1),_immutable.origin), - nbl_glsl_shapes_getSphericalTriangle(mat3(rect.offset+rect.edge1,rect.offset+rect.edge0,rect.offset+rect.edge0+rect.edge1),_immutable.origin) - }; - float solidAngle[2]; - vec3 cos_vertices[2],sin_vertices[2]; - float cos_a[2],cos_c[2],csc_b[2],csc_c[2]; - for (uint i=0u; i<2u; i++) - solidAngle[i] = nbl_glsl_shapes_SolidAngleOfTriangle(sphericalVertices[i],cos_vertices[i],sin_vertices[i],cos_a[i],cos_c[i],csc_b[i],csc_c[i]); - const float rectSolidAngle = solidAngle[0]+solidAngle[1]; - #if POLYGON_METHOD==1 - 
return 1.f/rectSolidAngle; - #elif POLYGON_METHOD==2 - // TODO: figure out what breaks for a directly visible light under MIS - if (rectSolidAngle > nbl_glsl_FLT_MIN) - { - const vec2 bary = nbl_glsl_barycentric_reconstructBarycentrics(L*ray._mutable.intersectionT+_immutable.origin-rect.offset,mat2x3(rect.edge0,rect.edge1)); - const uint i = bary.x>=0.f&&bary.y>=0.f&&(bary.x+bary.y)<=1.f ? 0u:1u; - - float pdf = nbl_glsl_sampling_probProjectedSphericalTriangleSample(solidAngle[i],cos_vertices[i],sin_vertices[i],cos_a[i],cos_c[i],csc_b[i],csc_c[i],sphericalVertices[i],_immutable.normalAtOrigin,_immutable.wasBSDFAtOrigin,L); - pdf *= solidAngle[i]/rectSolidAngle; - return pdf; - } - else - return nbl_glsl_FLT_INF; - #endif - #else - float pdf; - mat3 rectNormalBasis; - vec2 rectExtents; - Rectangle_getNormalBasis(rect, rectNormalBasis, rectExtents); - vec3 sphR0 = nbl_glsl_shapes_getSphericalRectangle(_immutable.origin, rect.offset, rectNormalBasis); - float solidAngle = nbl_glsl_shapes_SolidAngleOfRectangle(sphR0, rectExtents); - if (solidAngle > nbl_glsl_FLT_MIN) - { - #if POLYGON_METHOD==1 - pdf = 1.f/solidAngle; - #else - #error - #endif - } - else - pdf = nbl_glsl_FLT_INF; - return pdf; - #endif -#endif -} - -vec3 nbl_glsl_light_generate_and_pdf(out float pdf, out float newRayMaxT, in vec3 origin, in nbl_glsl_AnisotropicViewSurfaceInteraction interaction, in bool isBSDF, in vec3 xi, in uint objectID) -{ - const Rectangle rect = rectangles[objectID]; - const vec3 N = Rectangle_getNormalTimesArea(rect); - - const vec3 origin2origin = rect.offset-origin; -#if POLYGON_METHOD==0 - vec3 L = origin2origin+rect.edge0*xi.x+rect.edge1*xi.y; // TODO: refactor - - const float distanceSq = dot(L,L); - const float rcpDistance = inversesqrt(distanceSq); - L *= rcpDistance; - - pdf = distanceSq/abs(dot(N,L)); - newRayMaxT = 1.0/rcpDistance; - return L; -#else - #ifdef TRIANGLE_REFERENCE - const mat3 sphericalVertices[2] = - { - 
nbl_glsl_shapes_getSphericalTriangle(mat3(rect.offset,rect.offset+rect.edge0,rect.offset+rect.edge1),origin), - nbl_glsl_shapes_getSphericalTriangle(mat3(rect.offset+rect.edge1,rect.offset+rect.edge0,rect.offset+rect.edge0+rect.edge1),origin) - }; - float solidAngle[2]; - vec3 cos_vertices[2],sin_vertices[2]; - float cos_a[2],cos_c[2],csc_b[2],csc_c[2]; - for (uint i=0u; i<2u; i++) - solidAngle[i] = nbl_glsl_shapes_SolidAngleOfTriangle(sphericalVertices[i],cos_vertices[i],sin_vertices[i],cos_a[i],cos_c[i],csc_b[i],csc_c[i]); - vec3 L = vec3(0.f,0.f,0.f); - const float rectangleSolidAngle = solidAngle[0]+solidAngle[1]; - if (rectangleSolidAngle > nbl_glsl_FLT_MIN) - { - float rcpTriangleChoiceProb; - const uint i = nbl_glsl_partitionRandVariable(solidAngle[0]/rectangleSolidAngle,xi.z,rcpTriangleChoiceProb) ? 1u:0u; - #if POLYGON_METHOD==1 - L = nbl_glsl_sampling_generateSphericalTriangleSample(solidAngle[i],cos_vertices[i],sin_vertices[i],cos_a[i],cos_c[i],csc_b[i],csc_c[i],sphericalVertices[i],xi.xy); - pdf = 1.f/rectangleSolidAngle; - #elif POLYGON_METHOD==2 - float rcpPdf; - L = nbl_glsl_sampling_generateProjectedSphericalTriangleSample(rcpPdf,solidAngle[i],cos_vertices[i],sin_vertices[i],cos_a[i],cos_c[i],csc_b[i],csc_c[i],sphericalVertices[i],interaction.isotropic.N,isBSDF,xi.xy); - pdf = 1.f/(rcpPdf*rcpTriangleChoiceProb); - #endif - } - else - pdf = nbl_glsl_FLT_INF; - #else - mat3 rectNormalBasis; - vec2 rectExtents; - Rectangle_getNormalBasis(rect, rectNormalBasis, rectExtents); - vec3 sphR0 = nbl_glsl_shapes_getSphericalRectangle(origin, rect.offset, rectNormalBasis); - vec3 L = vec3(0.f,0.f,0.f); - float solidAngle; - vec2 sphUv = nbl_glsl_sampling_generateSphericalRectangleSample(sphR0, rectExtents, xi.xy, solidAngle); - if (solidAngle > nbl_glsl_FLT_MIN) - { - #if POLYGON_METHOD==1 - vec3 sph_sample = sphUv[0] * rect.edge0 + sphUv[1] * rect.edge1 + rect.offset; - L = normalize(sph_sample - origin); - pdf = 1.f/solidAngle; - #else - #error - #endif - } - 
else - pdf = nbl_glsl_FLT_INF; - #endif - newRayMaxT = dot(N,origin2origin)/dot(N,L); - return L; -#endif -} - - -uint getBSDFLightIDAndDetermineNormal(out vec3 normal, in uint objectID, in vec3 intersection) -{ - if (objectID0.0) - { - const float rcpDistance = inversesqrt(distanceSQ); - Z *= rcpDistance; - - const float cosThetaMax = sqrt(cosThetaMax2); - const float cosTheta = mix(1.0,cosThetaMax,xi.x); - - vec3 L = Z*cosTheta; - - const float cosTheta2 = cosTheta*cosTheta; - const float sinTheta = sqrt(1.0-cosTheta2); - float sinPhi,cosPhi; - nbl_glsl_sincos(2.0*nbl_glsl_PI*xi.y-nbl_glsl_PI,sinPhi,cosPhi); - mat2x3 XY = nbl_glsl_frisvad(Z); - - L += (XY[0]*cosPhi+XY[1]*sinPhi)*sinTheta; - - newRayMaxT = (cosTheta-sqrt(cosTheta2-cosThetaMax2))/rcpDistance; - pdf = 1.0/Sphere_getSolidAngle_impl(cosThetaMax); - return L; - } - pdf = 0.0; - return vec3(0.0,0.0,0.0); -} - -uint getBSDFLightIDAndDetermineNormal(out vec3 normal, in uint objectID, in vec3 intersection) -{ - Sphere sphere = spheres[objectID]; - normal = Sphere_getNormal(sphere,intersection); - return sphere.bsdfLightIDs; -} \ No newline at end of file diff --git a/42_FragmentShaderPathTracer/litByTriangle.comp b/42_FragmentShaderPathTracer/litByTriangle.comp deleted file mode 100644 index ba23c82e5..000000000 --- a/42_FragmentShaderPathTracer/litByTriangle.comp +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". 
-// For conditions of distribution and use, see copyright notice in nabla.h - -#version 430 core -#extension GL_GOOGLE_include_directive : require - -#define SPHERE_COUNT 8 -#define POLYGON_METHOD 1 // 0 area sampling, 1 solid angle sampling, 2 approximate projected solid angle sampling -#include "common.glsl" - -#define TRIANGLE_COUNT 1 -Triangle triangles[TRIANGLE_COUNT] = { - Triangle_Triangle(mat3(vec3(-1.8,0.35,0.3),vec3(-1.2,0.35,0.0),vec3(-1.5,0.8,-0.3))*10.0,INVALID_ID_16BIT,0u) -}; - -void traceRay_extraShape(inout int objectID, inout float intersectionT, in vec3 origin, in vec3 direction) -{ - for (int i=0; i0.0 && t -float nbl_glsl_light_deferred_pdf(in Light light, in Ray_t ray) -{ - const Triangle tri = triangles[Light_getObjectID(light)]; - - const vec3 L = ray._immutable.direction; -#if POLYGON_METHOD==0 - const float dist = ray._mutable.intersectionT; - return dist*dist/abs(dot(Triangle_getNormalTimesArea(tri),L)); -#else - const ImmutableRay_t _immutable = ray._immutable; - const mat3 sphericalVertices = nbl_glsl_shapes_getSphericalTriangle(mat3(tri.vertex0,tri.vertex1,tri.vertex2),_immutable.origin); - #if POLYGON_METHOD==1 - const float rcpProb = nbl_glsl_shapes_SolidAngleOfTriangle(sphericalVertices); - // if `rcpProb` is NAN then the triangle's solid angle was close to 0.0 - return rcpProb>nbl_glsl_FLT_MIN ? (1.0/rcpProb):nbl_glsl_FLT_MAX; - #elif POLYGON_METHOD==2 - const float pdf = nbl_glsl_sampling_probProjectedSphericalTriangleSample(sphericalVertices,_immutable.normalAtOrigin,_immutable.wasBSDFAtOrigin,L); - // if `pdf` is NAN then the triangle's projected solid angle was close to 0.0, if its close to INF then the triangle was very small - return pdfnbl_glsl_FLT_MIN ? 
(1.0/rcpPdf):0.0; - - const vec3 N = Triangle_getNormalTimesArea(tri); - newRayMaxT = dot(N,tri.vertex0-origin)/dot(N,L); - return L; -#endif -} - - -uint getBSDFLightIDAndDetermineNormal(out vec3 normal, in uint objectID, in vec3 intersection) -{ - if (objectID - -#include "../common/CommonAPI.h" -#include "CCamera.hpp" -#include "nbl/ext/ScreenShot/ScreenShot.h" -#include "nbl/video/utilities/CDumbPresentationOracle.h" - -using namespace nbl; -using namespace core; -using namespace ui; - - -using namespace nbl; -using namespace core; -using namespace asset; -using namespace video; - -smart_refctd_ptr createHDRImageView(nbl::core::smart_refctd_ptr device, asset::E_FORMAT colorFormat, uint32_t width, uint32_t height) -{ - smart_refctd_ptr gpuImageViewColorBuffer; - { - IGPUImage::SCreationParams imgInfo; - imgInfo.format = colorFormat; - imgInfo.type = IGPUImage::ET_2D; - imgInfo.extent.width = width; - imgInfo.extent.height = height; - imgInfo.extent.depth = 1u; - imgInfo.mipLevels = 1u; - imgInfo.arrayLayers = 1u; - imgInfo.samples = asset::ICPUImage::ESCF_1_BIT; - imgInfo.flags = static_cast(0u); - imgInfo.usage = core::bitflag(asset::IImage::EUF_STORAGE_BIT) | asset::IImage::EUF_TRANSFER_SRC_BIT; - - auto image = device->createImage(std::move(imgInfo)); - auto imageMemReqs = image->getMemoryReqs(); - imageMemReqs.memoryTypeBits &= device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); - device->allocate(imageMemReqs, image.get()); - - IGPUImageView::SCreationParams imgViewInfo; - imgViewInfo.image = std::move(image); - imgViewInfo.format = colorFormat; - imgViewInfo.viewType = IGPUImageView::ET_2D; - imgViewInfo.flags = static_cast(0u); - imgViewInfo.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; - imgViewInfo.subresourceRange.baseArrayLayer = 0u; - imgViewInfo.subresourceRange.baseMipLevel = 0u; - imgViewInfo.subresourceRange.layerCount = 1u; - imgViewInfo.subresourceRange.levelCount = 1u; - - gpuImageViewColorBuffer = 
device->createImageView(std::move(imgViewInfo)); - } - - return gpuImageViewColorBuffer; -} - -struct ShaderParameters -{ - const uint32_t MaxDepthLog2 = 4; //5 - const uint32_t MaxSamplesLog2 = 10; //18 -} kShaderParameters; - -enum E_LIGHT_GEOMETRY -{ - ELG_SPHERE, - ELG_TRIANGLE, - ELG_RECTANGLE -}; - -struct DispatchInfo_t -{ - uint32_t workGroupCount[3]; -}; - -_NBL_STATIC_INLINE_CONSTEXPR uint32_t DEFAULT_WORK_GROUP_SIZE = 16u; - -DispatchInfo_t getDispatchInfo(uint32_t imgWidth, uint32_t imgHeight) { - DispatchInfo_t ret = {}; - ret.workGroupCount[0] = (uint32_t)core::ceil((float)imgWidth / (float)DEFAULT_WORK_GROUP_SIZE); - ret.workGroupCount[1] = (uint32_t)core::ceil((float)imgHeight / (float)DEFAULT_WORK_GROUP_SIZE); - ret.workGroupCount[2] = 1; - return ret; -} - -int main() -{ - system::IApplicationFramework::GlobalsInit(); - - constexpr uint32_t WIN_W = 1280; - constexpr uint32_t WIN_H = 720; - constexpr uint32_t FBO_COUNT = 2u; - constexpr uint32_t FRAMES_IN_FLIGHT = 5u; - constexpr bool LOG_TIMESTAMP = false; - static_assert(FRAMES_IN_FLIGHT>FBO_COUNT); - - const auto swapchainImageUsage = static_cast(asset::IImage::EUF_COLOR_ATTACHMENT_BIT | asset::IImage::EUF_TRANSFER_DST_BIT); - CommonAPI::InitParams initParams; - initParams.apiType = video::EAT_VULKAN; - initParams.appName = { "Compute Shader PathTracer" }; - initParams.framesInFlight = FRAMES_IN_FLIGHT; - initParams.windowWidth = WIN_W; - initParams.windowHeight = WIN_H; - initParams.swapchainImageCount = FBO_COUNT; - initParams.swapchainImageUsage = swapchainImageUsage; - initParams.depthFormat = asset::EF_D32_SFLOAT; - auto initOutput = CommonAPI::InitWithDefaultExt(std::move(initParams)); - - auto system = std::move(initOutput.system); - auto window = std::move(initParams.window); - auto windowCb = std::move(initParams.windowCb); - auto gl = std::move(initOutput.apiConnection); - auto surface = std::move(initOutput.surface); - auto gpuPhysicalDevice = std::move(initOutput.physicalDevice); - 
auto device = std::move(initOutput.logicalDevice); - auto queues = std::move(initOutput.queues); - auto graphicsQueue = queues[CommonAPI::InitOutput::EQT_GRAPHICS]; - auto transferUpQueue = queues[CommonAPI::InitOutput::EQT_TRANSFER_UP]; - auto computeQueue = queues[CommonAPI::InitOutput::EQT_COMPUTE]; - auto renderpass = std::move(initOutput.renderToSwapchainRenderpass); - auto assetManager = std::move(initOutput.assetManager); - auto cpu2gpuParams = std::move(initOutput.cpu2gpuParams); - auto logger = std::move(initOutput.logger); - auto inputSystem = std::move(initOutput.inputSystem); - auto utilities = std::move(initOutput.utilities); - auto graphicsCommandPools = std::move(initOutput.commandPools[CommonAPI::InitOutput::EQT_GRAPHICS]); - auto computeCommandPools = std::move(initOutput.commandPools[CommonAPI::InitOutput::EQT_COMPUTE]); - auto swapchainCreationParams = std::move(initOutput.swapchainCreationParams); - - core::smart_refctd_ptr swapchain = nullptr; - CommonAPI::createSwapchain(std::move(device), swapchainCreationParams, WIN_W, WIN_H, swapchain); - assert(swapchain); - auto fbo = CommonAPI::createFBOWithSwapchainImages( - swapchain->getImageCount(), WIN_W, WIN_H, - device, swapchain, renderpass, - asset::EF_D32_SFLOAT - ); - - auto graphicsCmdPoolQueueFamIdx = graphicsQueue->getFamilyIndex(); - - nbl::video::IGPUObjectFromAssetConverter CPU2GPU; - - core::smart_refctd_ptr cmdbuf[FRAMES_IN_FLIGHT]; - for (uint32_t i = 0u; i < FRAMES_IN_FLIGHT; i++) - device->createCommandBuffers(graphicsCommandPools[i].get(), video::IGPUCommandBuffer::EL_PRIMARY, 1, cmdbuf+i); - - constexpr uint32_t maxDescriptorCount = 256u; - constexpr uint32_t PoolSizesCount = 5u; - - nbl::video::IDescriptorPool::SCreateInfo createInfo; - createInfo.maxDescriptorCount[static_cast(nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER)] = maxDescriptorCount * 1; - createInfo.maxDescriptorCount[static_cast(nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE)] = maxDescriptorCount * 8; - 
createInfo.maxDescriptorCount[static_cast(nbl::asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER)] = maxDescriptorCount * 2; - createInfo.maxDescriptorCount[static_cast(nbl::asset::IDescriptor::E_TYPE::ET_UNIFORM_TEXEL_BUFFER)] = maxDescriptorCount * 1; - createInfo.maxDescriptorCount[static_cast(nbl::asset::IDescriptor::E_TYPE::ET_UNIFORM_BUFFER)] = maxDescriptorCount * 1; - createInfo.maxSets = maxDescriptorCount; - - auto descriptorPool = device->createDescriptorPool(std::move(createInfo)); - - const auto timestampQueryPool = device->createQueryPool({ - .queryType = video::IQueryPool::EQT_TIMESTAMP, - .queryCount = 2u - }); - - // Camera - core::vectorSIMDf cameraPosition(0, 5, -10); - matrix4SIMD proj = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(core::radians(60.0f), video::ISurface::getTransformedAspectRatio(swapchain->getPreTransform(), WIN_W, WIN_H), 0.01f, 500.0f); - Camera cam = Camera(cameraPosition, core::vectorSIMDf(0, 0, 0), proj); - - IGPUDescriptorSetLayout::SBinding descriptorSet0Bindings[] = { - { 0u, nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE, IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, IShader::ESS_COMPUTE, 1u, nullptr }, - }; - IGPUDescriptorSetLayout::SBinding uboBinding - { 0u, nbl::asset::IDescriptor::E_TYPE::ET_UNIFORM_BUFFER, IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, IShader::ESS_COMPUTE, 1u, nullptr }; - IGPUDescriptorSetLayout::SBinding descriptorSet3Bindings[] = { - { 0u, nbl::asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER, IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, IShader::ESS_COMPUTE, 1u, nullptr }, - { 1u, nbl::asset::IDescriptor::E_TYPE::ET_UNIFORM_TEXEL_BUFFER, IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, IShader::ESS_COMPUTE, 1u, nullptr }, - { 2u, nbl::asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER, IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, IShader::ESS_COMPUTE, 1u, nullptr }, - }; - - auto 
gpuDescriptorSetLayout0 = device->createDescriptorSetLayout(descriptorSet0Bindings, descriptorSet0Bindings + 1u); - auto gpuDescriptorSetLayout1 = device->createDescriptorSetLayout(&uboBinding, &uboBinding + 1u); - auto gpuDescriptorSetLayout2 = device->createDescriptorSetLayout(descriptorSet3Bindings, descriptorSet3Bindings+3u); - - auto createGpuResources = [&](std::string pathToShader) -> core::smart_refctd_ptr - { - asset::IAssetLoader::SAssetLoadParams params{}; - params.logger = logger.get(); - //params.relativeDir = tmp.c_str(); - auto spec = assetManager->getAsset(pathToShader,params).getContents(); - - if (spec.empty()) - assert(false); - - auto cpuComputeSpecializedShader = core::smart_refctd_ptr_static_cast(*spec.begin()); - - ISpecializedShader::SInfo info = cpuComputeSpecializedShader->getSpecializationInfo(); - info.m_backingBuffer = ICPUBuffer::create({ sizeof(ShaderParameters) }); - memcpy(info.m_backingBuffer->getPointer(),&kShaderParameters,sizeof(ShaderParameters)); - info.m_entries = core::make_refctd_dynamic_array>(2u); - for (uint32_t i=0; i<2; i++) - info.m_entries->operator[](i) = {i,(uint32_t)(i*sizeof(uint32_t)),sizeof(uint32_t)}; - - - cpuComputeSpecializedShader->setSpecializationInfo(std::move(info)); - - auto gpuComputeSpecializedShader = CPU2GPU.getGPUObjectsFromAssets(&cpuComputeSpecializedShader, &cpuComputeSpecializedShader + 1, cpu2gpuParams)->front(); - - auto gpuPipelineLayout = device->createPipelineLayout(nullptr, nullptr, core::smart_refctd_ptr(gpuDescriptorSetLayout0), core::smart_refctd_ptr(gpuDescriptorSetLayout1), core::smart_refctd_ptr(gpuDescriptorSetLayout2), nullptr); - - auto gpuPipeline = device->createComputePipeline(nullptr, std::move(gpuPipelineLayout), std::move(gpuComputeSpecializedShader)); - - return gpuPipeline; - }; - - E_LIGHT_GEOMETRY lightGeom = ELG_SPHERE; - constexpr const char* shaderPaths[] = {"../litBySphere.comp","../litByTriangle.comp","../litByRectangle.comp"}; - auto gpuComputePipeline = 
createGpuResources(shaderPaths[lightGeom]); - - DispatchInfo_t dispatchInfo = getDispatchInfo(WIN_W, WIN_H); - - auto createImageView = [&](std::string pathToOpenEXRHDRIImage) - { -#ifndef _NBL_COMPILE_WITH_OPENEXR_LOADER_ - assert(false); -#endif - - auto pathToTexture = pathToOpenEXRHDRIImage; - IAssetLoader::SAssetLoadParams lp(0ull, nullptr, IAssetLoader::ECF_DONT_CACHE_REFERENCES); - auto cpuTexture = assetManager->getAsset(pathToTexture, lp); - auto cpuTextureContents = cpuTexture.getContents(); - assert(!cpuTextureContents.empty()); - auto cpuImage = core::smart_refctd_ptr_static_cast(*cpuTextureContents.begin()); - cpuImage->setImageUsageFlags(IImage::E_USAGE_FLAGS::EUF_SAMPLED_BIT); - - ICPUImageView::SCreationParams viewParams; - viewParams.flags = static_cast(0u); - viewParams.image = cpuImage; - viewParams.format = viewParams.image->getCreationParameters().format; - viewParams.viewType = IImageView::ET_2D; - viewParams.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; - viewParams.subresourceRange.baseArrayLayer = 0u; - viewParams.subresourceRange.layerCount = 1u; - viewParams.subresourceRange.baseMipLevel = 0u; - viewParams.subresourceRange.levelCount = 1u; - - auto cpuImageView = ICPUImageView::create(std::move(viewParams)); - - cpu2gpuParams.beginCommandBuffers(); - auto gpuImageView = CPU2GPU.getGPUObjectsFromAssets(&cpuImageView, &cpuImageView + 1u, cpu2gpuParams)->front(); - cpu2gpuParams.waitForCreationToComplete(false); - - return gpuImageView; - }; - - auto gpuEnvmapImageView = createImageView("../../media/envmap/envmap_0.exr"); - - smart_refctd_ptr gpuSequenceBufferView; - { - const uint32_t MaxDimensions = 3u<(sampleSequence->getPointer()); - for (auto dim=0u; dimcreateFilledDeviceLocalBufferOnDedMem(graphicsQueue, sampleSequence->getSize(), sampleSequence->getPointer()); - core::smart_refctd_ptr gpuSequenceBuffer; - { - IGPUBuffer::SCreationParams params = {}; - const size_t size = sampleSequence->getSize(); - params.usage 
= core::bitflag(asset::IBuffer::EUF_TRANSFER_DST_BIT) | asset::IBuffer::EUF_UNIFORM_TEXEL_BUFFER_BIT; - params.size = size; - gpuSequenceBuffer = device->createBuffer(std::move(params)); - auto gpuSequenceBufferMemReqs = gpuSequenceBuffer->getMemoryReqs(); - gpuSequenceBufferMemReqs.memoryTypeBits &= device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); - device->allocate(gpuSequenceBufferMemReqs, gpuSequenceBuffer.get()); - utilities->updateBufferRangeViaStagingBufferAutoSubmit(asset::SBufferRange{0u,size,gpuSequenceBuffer},sampleSequence->getPointer(), graphicsQueue); - } - gpuSequenceBufferView = device->createBufferView(gpuSequenceBuffer.get(), asset::EF_R32G32B32_UINT); - } - - smart_refctd_ptr gpuScrambleImageView; - { - IGPUImage::SCreationParams imgParams; - imgParams.flags = static_cast(0u); - imgParams.type = IImage::ET_2D; - imgParams.format = EF_R32G32_UINT; - imgParams.extent = {WIN_W, WIN_H,1u}; - imgParams.mipLevels = 1u; - imgParams.arrayLayers = 1u; - imgParams.samples = IImage::ESCF_1_BIT; - imgParams.usage = core::bitflag(IImage::EUF_SAMPLED_BIT) | IImage::EUF_TRANSFER_DST_BIT; - imgParams.initialLayout = asset::IImage::EL_UNDEFINED; - - IGPUImage::SBufferCopy region = {}; - region.bufferOffset = 0u; - region.bufferRowLength = 0u; - region.bufferImageHeight = 0u; - region.imageExtent = imgParams.extent; - region.imageOffset = {0u,0u,0u}; - region.imageSubresource.layerCount = 1u; - region.imageSubresource.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; - - constexpr auto ScrambleStateChannels = 2u; - const auto renderPixelCount = imgParams.extent.width*imgParams.extent.height; - core::vector random(renderPixelCount*ScrambleStateChannels); - { - core::RandomSampler rng(0xbadc0ffeu); - for (auto& pixel : random) - pixel = rng.nextSample(); - } - - // TODO: Temp Fix because createFilledDeviceLocalBufferOnDedMem doesn't take in params - // auto buffer = utilities->createFilledDeviceLocalBufferOnDedMem(graphicsQueue, 
random.size()*sizeof(uint32_t), random.data()); - core::smart_refctd_ptr buffer; - { - IGPUBuffer::SCreationParams params = {}; - const size_t size = random.size() * sizeof(uint32_t); - params.usage = core::bitflag(asset::IBuffer::EUF_TRANSFER_DST_BIT) | asset::IBuffer::EUF_TRANSFER_SRC_BIT; - params.size = size; - buffer = device->createBuffer(std::move(params)); - auto bufferMemReqs = buffer->getMemoryReqs(); - bufferMemReqs.memoryTypeBits &= device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); - device->allocate(bufferMemReqs, buffer.get()); - utilities->updateBufferRangeViaStagingBufferAutoSubmit(asset::SBufferRange{0u,size,buffer},random.data(),graphicsQueue); - } - - IGPUImageView::SCreationParams viewParams; - viewParams.flags = static_cast(0u); - // TODO: Replace this IGPUBuffer -> IGPUImage to using image upload utility - viewParams.image = utilities->createFilledDeviceLocalImageOnDedMem(std::move(imgParams), buffer.get(), 1u, ®ion, graphicsQueue); - viewParams.viewType = IGPUImageView::ET_2D; - viewParams.format = EF_R32G32_UINT; - viewParams.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; - viewParams.subresourceRange.levelCount = 1u; - viewParams.subresourceRange.layerCount = 1u; - gpuScrambleImageView = device->createImageView(std::move(viewParams)); - } - - // Create Out Image TODO - constexpr uint32_t MAX_FBO_COUNT = 4u; - smart_refctd_ptr outHDRImageViews[MAX_FBO_COUNT] = {}; - assert(MAX_FBO_COUNT >= swapchain->getImageCount()); - for(uint32_t i = 0; i < swapchain->getImageCount(); ++i) { - outHDRImageViews[i] = createHDRImageView(device, asset::EF_R16G16B16A16_SFLOAT, WIN_W, WIN_H); - } - - core::smart_refctd_ptr descriptorSets0[FBO_COUNT] = {}; - for(uint32_t i = 0; i < FBO_COUNT; ++i) - { - auto & descSet = descriptorSets0[i]; - descSet = descriptorPool->createDescriptorSet(core::smart_refctd_ptr(gpuDescriptorSetLayout0)); - video::IGPUDescriptorSet::SWriteDescriptorSet writeDescriptorSet; - writeDescriptorSet.dstSet 
= descSet.get(); - writeDescriptorSet.binding = 0; - writeDescriptorSet.count = 1u; - writeDescriptorSet.arrayElement = 0u; - writeDescriptorSet.descriptorType = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE; - video::IGPUDescriptorSet::SDescriptorInfo info; - { - info.desc = outHDRImageViews[i]; - info.info.image.sampler = nullptr; - info.info.image.imageLayout = asset::IImage::EL_GENERAL; - } - writeDescriptorSet.info = &info; - device->updateDescriptorSets(1u, &writeDescriptorSet, 0u, nullptr); - } - - struct SBasicViewParametersAligned - { - SBasicViewParameters uboData; - }; - - IGPUBuffer::SCreationParams gpuuboParams = {}; - gpuuboParams.usage = core::bitflag(IGPUBuffer::EUF_UNIFORM_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT; - gpuuboParams.size = sizeof(SBasicViewParametersAligned); - auto gpuubo = device->createBuffer(std::move(gpuuboParams)); - auto gpuuboMemReqs = gpuubo->getMemoryReqs(); - gpuuboMemReqs.memoryTypeBits &= device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); - device->allocate(gpuuboMemReqs, gpuubo.get()); - - auto uboDescriptorSet1 = descriptorPool->createDescriptorSet(core::smart_refctd_ptr(gpuDescriptorSetLayout1)); - { - video::IGPUDescriptorSet::SWriteDescriptorSet uboWriteDescriptorSet; - uboWriteDescriptorSet.dstSet = uboDescriptorSet1.get(); - uboWriteDescriptorSet.binding = 0; - uboWriteDescriptorSet.count = 1u; - uboWriteDescriptorSet.arrayElement = 0u; - uboWriteDescriptorSet.descriptorType = asset::IDescriptor::E_TYPE::ET_UNIFORM_BUFFER; - video::IGPUDescriptorSet::SDescriptorInfo info; - { - info.desc = gpuubo; - info.info.buffer.offset = 0ull; - info.info.buffer.size = sizeof(SBasicViewParametersAligned); - } - uboWriteDescriptorSet.info = &info; - device->updateDescriptorSets(1u, &uboWriteDescriptorSet, 0u, nullptr); - } - - ISampler::SParams samplerParams0 = { ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETBC_FLOAT_OPAQUE_BLACK, ISampler::ETF_LINEAR, 
ISampler::ETF_LINEAR, ISampler::ESMM_LINEAR, 0u, false, ECO_ALWAYS }; - auto sampler0 = device->createSampler(samplerParams0); - ISampler::SParams samplerParams1 = { ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETBC_INT_OPAQUE_BLACK, ISampler::ETF_NEAREST, ISampler::ETF_NEAREST, ISampler::ESMM_NEAREST, 0u, false, ECO_ALWAYS }; - auto sampler1 = device->createSampler(samplerParams1); - - auto descriptorSet2 = descriptorPool->createDescriptorSet(core::smart_refctd_ptr(gpuDescriptorSetLayout2)); - { - constexpr auto kDescriptorCount = 3; - IGPUDescriptorSet::SWriteDescriptorSet samplerWriteDescriptorSet[kDescriptorCount]; - IGPUDescriptorSet::SDescriptorInfo samplerDescriptorInfo[kDescriptorCount]; - for (auto i=0; iupdateDescriptorSets(kDescriptorCount, samplerWriteDescriptorSet, 0u, nullptr); - } - - constexpr uint32_t FRAME_COUNT = 500000u; - - core::smart_refctd_ptr frameComplete[FRAMES_IN_FLIGHT] = { nullptr }; - core::smart_refctd_ptr imageAcquire[FRAMES_IN_FLIGHT] = { nullptr }; - core::smart_refctd_ptr renderFinished[FRAMES_IN_FLIGHT] = { nullptr }; - for (uint32_t i=0u; icreateSemaphore(); - renderFinished[i] = device->createSemaphore(); - } - - CDumbPresentationOracle oracle; - oracle.reportBeginFrameRecord(); - constexpr uint64_t MAX_TIMEOUT = 99999999999999ull; - - // polling for events! 
- CommonAPI::InputSystem::ChannelReader mouse; - CommonAPI::InputSystem::ChannelReader keyboard; - - uint32_t resourceIx = 0; - while(windowCb->isWindowOpen()) - { - resourceIx++; - if(resourceIx >= FRAMES_IN_FLIGHT) { - resourceIx = 0; - } - - oracle.reportEndFrameRecord(); - double dt = oracle.getDeltaTimeInMicroSeconds() / 1000.0; - auto nextPresentationTimeStamp = oracle.getNextPresentationTimeStamp(); - oracle.reportBeginFrameRecord(); - - // Input - inputSystem->getDefaultMouse(&mouse); - inputSystem->getDefaultKeyboard(&keyboard); - - cam.beginInputProcessing(nextPresentationTimeStamp); - mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void { cam.mouseProcess(events); }, logger.get()); - keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void { cam.keyboardProcess(events); }, logger.get()); - cam.endInputProcessing(nextPresentationTimeStamp); - - auto& cb = cmdbuf[resourceIx]; - auto& fence = frameComplete[resourceIx]; - if (fence) - while (device->waitForFences(1u,&fence.get(),false,MAX_TIMEOUT)==video::IGPUFence::ES_TIMEOUT) - { - } - else - fence = device->createFence(static_cast(0)); - - const auto viewMatrix = cam.getViewMatrix(); - const auto viewProjectionMatrix = matrix4SIMD::concatenateBFollowedByAPrecisely( - video::ISurface::getSurfaceTransformationMatrix(swapchain->getPreTransform()), - cam.getConcatenatedMatrix() - ); - - // safe to proceed - cb->begin(IGPUCommandBuffer::EU_NONE); - cb->resetQueryPool(timestampQueryPool.get(), 0u, 2u); - - // renderpass - uint32_t imgnum = 0u; - swapchain->acquireNextImage(MAX_TIMEOUT,imageAcquire[resourceIx].get(),nullptr,&imgnum); - { - auto mv = viewMatrix; - auto mvp = viewProjectionMatrix; - core::matrix3x4SIMD normalMat; - mv.getSub3x3InverseTranspose(normalMat); - - SBasicViewParametersAligned viewParams; - memcpy(viewParams.uboData.MV, mv.pointer(), sizeof(mv)); - memcpy(viewParams.uboData.MVP, mvp.pointer(), sizeof(mvp)); - 
memcpy(viewParams.uboData.NormalMat, normalMat.pointer(), sizeof(normalMat)); - - asset::SBufferRange range; - range.buffer = gpuubo; - range.offset = 0ull; - range.size = sizeof(viewParams); - utilities->updateBufferRangeViaStagingBufferAutoSubmit(range, &viewParams, graphicsQueue); - } - - // TRANSITION outHDRImageViews[imgnum] to EIL_GENERAL (because of descriptorSets0 -> ComputeShader Writes into the image) - { - IGPUCommandBuffer::SImageMemoryBarrier imageBarriers[3u] = {}; - imageBarriers[0].barrier.srcAccessMask = asset::EAF_NONE; - imageBarriers[0].barrier.dstAccessMask = static_cast(asset::EAF_SHADER_WRITE_BIT); - imageBarriers[0].oldLayout = asset::IImage::EL_UNDEFINED; - imageBarriers[0].newLayout = asset::IImage::EL_GENERAL; - imageBarriers[0].srcQueueFamilyIndex = graphicsCmdPoolQueueFamIdx; - imageBarriers[0].dstQueueFamilyIndex = graphicsCmdPoolQueueFamIdx; - imageBarriers[0].image = outHDRImageViews[imgnum]->getCreationParameters().image; - imageBarriers[0].subresourceRange.aspectMask = asset::IImage::EAF_COLOR_BIT; - imageBarriers[0].subresourceRange.baseMipLevel = 0u; - imageBarriers[0].subresourceRange.levelCount = 1; - imageBarriers[0].subresourceRange.baseArrayLayer = 0u; - imageBarriers[0].subresourceRange.layerCount = 1; - - imageBarriers[1].barrier.srcAccessMask = asset::EAF_NONE; - imageBarriers[1].barrier.dstAccessMask = static_cast(asset::EAF_SHADER_READ_BIT); - imageBarriers[1].oldLayout = asset::IImage::EL_UNDEFINED; - imageBarriers[1].newLayout = asset::IImage::EL_SHADER_READ_ONLY_OPTIMAL; - imageBarriers[1].srcQueueFamilyIndex = graphicsCmdPoolQueueFamIdx; - imageBarriers[1].dstQueueFamilyIndex = graphicsCmdPoolQueueFamIdx; - imageBarriers[1].image = gpuScrambleImageView->getCreationParameters().image; - imageBarriers[1].subresourceRange.aspectMask = asset::IImage::EAF_COLOR_BIT; - imageBarriers[1].subresourceRange.baseMipLevel = 0u; - imageBarriers[1].subresourceRange.levelCount = 1; - imageBarriers[1].subresourceRange.baseArrayLayer 
= 0u; - imageBarriers[1].subresourceRange.layerCount = 1; - - imageBarriers[2].barrier.srcAccessMask = asset::EAF_NONE; - imageBarriers[2].barrier.dstAccessMask = static_cast(asset::EAF_SHADER_READ_BIT); - imageBarriers[2].oldLayout = asset::IImage::EL_UNDEFINED; - imageBarriers[2].newLayout = asset::IImage::EL_SHADER_READ_ONLY_OPTIMAL; - imageBarriers[2].srcQueueFamilyIndex = graphicsCmdPoolQueueFamIdx; - imageBarriers[2].dstQueueFamilyIndex = graphicsCmdPoolQueueFamIdx; - imageBarriers[2].image = gpuEnvmapImageView->getCreationParameters().image; - imageBarriers[2].subresourceRange.aspectMask = asset::IImage::EAF_COLOR_BIT; - imageBarriers[2].subresourceRange.baseMipLevel = 0u; - imageBarriers[2].subresourceRange.levelCount = gpuEnvmapImageView->getCreationParameters().subresourceRange.levelCount; - imageBarriers[2].subresourceRange.baseArrayLayer = 0u; - imageBarriers[2].subresourceRange.layerCount = gpuEnvmapImageView->getCreationParameters().subresourceRange.layerCount; - - cb->pipelineBarrier(asset::EPSF_TOP_OF_PIPE_BIT, asset::EPSF_COMPUTE_SHADER_BIT, asset::EDF_NONE, 0u, nullptr, 0u, nullptr, 3u, imageBarriers); - } - - // cube envmap handle - { - cb->writeTimestamp(asset::E_PIPELINE_STAGE_FLAGS::EPSF_TOP_OF_PIPE_BIT, timestampQueryPool.get(), 0u); - cb->bindComputePipeline(gpuComputePipeline.get()); - cb->bindDescriptorSets(EPBP_COMPUTE, gpuComputePipeline->getLayout(), 0u, 1u, &descriptorSets0[imgnum].get()); - cb->bindDescriptorSets(EPBP_COMPUTE, gpuComputePipeline->getLayout(), 1u, 1u, &uboDescriptorSet1.get()); - cb->bindDescriptorSets(EPBP_COMPUTE, gpuComputePipeline->getLayout(), 2u, 1u, &descriptorSet2.get()); - cb->dispatch(dispatchInfo.workGroupCount[0], dispatchInfo.workGroupCount[1], dispatchInfo.workGroupCount[2]); - cb->writeTimestamp(asset::E_PIPELINE_STAGE_FLAGS::EPSF_BOTTOM_OF_PIPE_BIT, timestampQueryPool.get(), 1u); - } - // TODO: tone mapping and stuff - - // Copy HDR Image to SwapChain - auto srcImgViewCreationParams = 
outHDRImageViews[imgnum]->getCreationParameters(); - auto dstImgViewCreationParams = fbo->begin()[imgnum]->getCreationParameters().attachments[0]->getCreationParameters(); - - // Getting Ready for Blit - // TRANSITION outHDRImageViews[imgnum] to EIL_TRANSFER_SRC_OPTIMAL - // TRANSITION `fbo[imgnum]->getCreationParameters().attachments[0]` to EIL_TRANSFER_DST_OPTIMAL - { - IGPUCommandBuffer::SImageMemoryBarrier imageBarriers[2u] = {}; - imageBarriers[0].barrier.srcAccessMask = asset::EAF_NONE; - imageBarriers[0].barrier.dstAccessMask = asset::EAF_TRANSFER_WRITE_BIT; - imageBarriers[0].oldLayout = asset::IImage::EL_UNDEFINED; - imageBarriers[0].newLayout = asset::IImage::EL_TRANSFER_SRC_OPTIMAL; - imageBarriers[0].srcQueueFamilyIndex = graphicsCmdPoolQueueFamIdx; - imageBarriers[0].dstQueueFamilyIndex = graphicsCmdPoolQueueFamIdx; - imageBarriers[0].image = srcImgViewCreationParams.image; - imageBarriers[0].subresourceRange.aspectMask = asset::IImage::EAF_COLOR_BIT; - imageBarriers[0].subresourceRange.baseMipLevel = 0u; - imageBarriers[0].subresourceRange.levelCount = 1; - imageBarriers[0].subresourceRange.baseArrayLayer = 0u; - imageBarriers[0].subresourceRange.layerCount = 1; - - imageBarriers[1].barrier.srcAccessMask = asset::EAF_NONE; - imageBarriers[1].barrier.dstAccessMask = asset::EAF_TRANSFER_WRITE_BIT; - imageBarriers[1].oldLayout = asset::IImage::EL_UNDEFINED; - imageBarriers[1].newLayout = asset::IImage::EL_TRANSFER_DST_OPTIMAL; - imageBarriers[1].srcQueueFamilyIndex = graphicsCmdPoolQueueFamIdx; - imageBarriers[1].dstQueueFamilyIndex = graphicsCmdPoolQueueFamIdx; - imageBarriers[1].image = dstImgViewCreationParams.image; - imageBarriers[1].subresourceRange.aspectMask = asset::IImage::EAF_COLOR_BIT; - imageBarriers[1].subresourceRange.baseMipLevel = 0u; - imageBarriers[1].subresourceRange.levelCount = 1; - imageBarriers[1].subresourceRange.baseArrayLayer = 0u; - imageBarriers[1].subresourceRange.layerCount = 1; - 
cb->pipelineBarrier(asset::EPSF_TRANSFER_BIT, asset::EPSF_TRANSFER_BIT, asset::EDF_NONE, 0u, nullptr, 0u, nullptr, 2u, imageBarriers); - } - - // Blit Image - { - SImageBlit blit = {}; - blit.srcOffsets[0] = {0, 0, 0}; - blit.srcOffsets[1] = {WIN_W, WIN_H, 1}; - - blit.srcSubresource.aspectMask = srcImgViewCreationParams.subresourceRange.aspectMask; - blit.srcSubresource.mipLevel = srcImgViewCreationParams.subresourceRange.baseMipLevel; - blit.srcSubresource.baseArrayLayer = srcImgViewCreationParams.subresourceRange.baseArrayLayer; - blit.srcSubresource.layerCount = srcImgViewCreationParams.subresourceRange.layerCount; - blit.dstOffsets[0] = {0, 0, 0}; - blit.dstOffsets[1] = {WIN_W, WIN_H, 1}; - blit.dstSubresource.aspectMask = dstImgViewCreationParams.subresourceRange.aspectMask; - blit.dstSubresource.mipLevel = dstImgViewCreationParams.subresourceRange.baseMipLevel; - blit.dstSubresource.baseArrayLayer = dstImgViewCreationParams.subresourceRange.baseArrayLayer; - blit.dstSubresource.layerCount = dstImgViewCreationParams.subresourceRange.layerCount; - - auto srcImg = srcImgViewCreationParams.image; - auto dstImg = dstImgViewCreationParams.image; - - cb->blitImage(srcImg.get(), asset::IImage::EL_TRANSFER_SRC_OPTIMAL, dstImg.get(), asset::IImage::EL_TRANSFER_DST_OPTIMAL, 1u, &blit , ISampler::ETF_NEAREST); - } - - // TRANSITION `fbo[imgnum]->getCreationParameters().attachments[0]` to EIL_PRESENT - { - IGPUCommandBuffer::SImageMemoryBarrier imageBarriers[1u] = {}; - imageBarriers[0].barrier.srcAccessMask = asset::EAF_TRANSFER_WRITE_BIT; - imageBarriers[0].barrier.dstAccessMask = asset::EAF_NONE; - imageBarriers[0].oldLayout = asset::IImage::EL_TRANSFER_DST_OPTIMAL; - imageBarriers[0].newLayout = asset::IImage::EL_PRESENT_SRC; - imageBarriers[0].srcQueueFamilyIndex = graphicsCmdPoolQueueFamIdx; - imageBarriers[0].dstQueueFamilyIndex = graphicsCmdPoolQueueFamIdx; - imageBarriers[0].image = dstImgViewCreationParams.image; - imageBarriers[0].subresourceRange.aspectMask = 
asset::IImage::EAF_COLOR_BIT; - imageBarriers[0].subresourceRange.baseMipLevel = 0u; - imageBarriers[0].subresourceRange.levelCount = 1; - imageBarriers[0].subresourceRange.baseArrayLayer = 0u; - imageBarriers[0].subresourceRange.layerCount = 1; - cb->pipelineBarrier(asset::EPSF_TRANSFER_BIT, asset::EPSF_TOP_OF_PIPE_BIT, asset::EDF_NONE, 0u, nullptr, 0u, nullptr, 1u, imageBarriers); - } - - cb->end(); - device->resetFences(1, &fence.get()); - CommonAPI::Submit(device.get(), cb.get(), graphicsQueue, imageAcquire[resourceIx].get(), renderFinished[resourceIx].get(), fence.get()); - CommonAPI::Present(device.get(), swapchain.get(), graphicsQueue, renderFinished[resourceIx].get(), imgnum); - - if (LOG_TIMESTAMP) - { - std::array timestamps{}; - auto queryResultFlags = core::bitflag(video::IQueryPool::EQRF_WAIT_BIT) | video::IQueryPool::EQRF_WITH_AVAILABILITY_BIT | video::IQueryPool::EQRF_64_BIT; - device->getQueryPoolResults(timestampQueryPool.get(), 0u, 2u, sizeof(timestamps), timestamps.data(), sizeof(uint64_t) * 2ull, queryResultFlags); - const float timePassed = (timestamps[2] - timestamps[0]) * device->getPhysicalDevice()->getLimits().timestampPeriodInNanoSeconds; - logger->log("Time Passed (Seconds) = %f", system::ILogger::ELL_INFO, (timePassed * 1e-9)); - logger->log("Timestamps availablity: %d, %d", system::ILogger::ELL_INFO, timestamps[1], timestamps[3]); - } - } - - const auto& fboCreationParams = fbo->begin()[0]->getCreationParameters(); - auto gpuSourceImageView = fboCreationParams.attachments[0]; - - device->waitIdle(); - - // bool status = ext::ScreenShot::createScreenShot(device.get(), queues[decltype(initOutput)::EQT_TRANSFER_UP], renderFinished[0].get(), gpuSourceImageView.get(), assetManager.get(), "ScreenShot.png"); - // assert(status); - - return 0; -} diff --git a/42_FragmentShaderPathTracer/pipeline.groovy b/42_FragmentShaderPathTracer/pipeline.groovy deleted file mode 100644 index 9e3a71cf3..000000000 --- 
a/42_FragmentShaderPathTracer/pipeline.groovy +++ /dev/null @@ -1,50 +0,0 @@ -import org.DevshGraphicsProgramming.Agent -import org.DevshGraphicsProgramming.BuilderInfo -import org.DevshGraphicsProgramming.IBuilder - -class CFragmentShaderPathTracerBuilder extends IBuilder -{ - public CFragmentShaderPathTracerBuilder(Agent _agent, _info) - { - super(_agent, _info) - } - - @Override - public boolean prepare(Map axisMapping) - { - return true - } - - @Override - public boolean build(Map axisMapping) - { - IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION") - IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE") - - def nameOfBuildDirectory = getNameOfBuildDirectory(buildType) - def nameOfConfig = getNameOfConfig(config) - - agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v") - - return true - } - - @Override - public boolean test(Map axisMapping) - { - return true - } - - @Override - public boolean install(Map axisMapping) - { - return true - } -} - -def create(Agent _agent, _info) -{ - return new CFragmentShaderPathTracerBuilder(_agent, _info) -} - -return this \ No newline at end of file diff --git a/53_ComputeShaders/CMakeLists.txt b/53_ComputeShaders/CMakeLists.txt deleted file mode 100644 index 2f9218f93..000000000 --- a/53_ComputeShaders/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -include(common RESULT_VARIABLE RES) -if(NOT RES) - message(FATAL_ERROR "common.cmake not found. 
Should be in {repo_root}/cmake directory") -endif() - -nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") \ No newline at end of file diff --git a/53_ComputeShaders/computeShader.comp b/53_ComputeShaders/computeShader.comp deleted file mode 100644 index 033a6aabb..000000000 --- a/53_ComputeShaders/computeShader.comp +++ /dev/null @@ -1,95 +0,0 @@ -#version 450 core -#extension GL_EXT_shader_16bit_storage : require - -#include "shaderCommon.glsl" - -layout(set = 0, binding = 0, std430) buffer Position -{ - vec4 positions[]; -}; - -layout(set = 0, binding = 1, std430) buffer Velocity -{ - vec4 velocities[]; -}; - -layout(set = 0, binding = 2, std430) buffer Color -{ - vec4 colors[]; -}; - -layout(set = 0, binding = 3, std430) buffer ColorRisingFlag -{ - bvec4 colorsRisingFlag[]; -}; - -layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in; - -void manageColorAxieState(float colorAxie, inout bool colorIntensityRisingAxieFlag) -{ - if(colorAxie <= 0) - colorIntensityRisingAxieFlag = true; - else if(colorAxie >= 1) - colorIntensityRisingAxieFlag = false; -} - -void manageColorState(vec3 color) -{ - uint globalInvocationID = gl_GlobalInvocationID.x; // the .y and .z are both 1 in this case - bvec4 isColorIntensityRising = colorsRisingFlag[globalInvocationID]; - - manageColorAxieState(color.x, isColorIntensityRising.x); - manageColorAxieState(color.y, isColorIntensityRising.y); - manageColorAxieState(color.z, isColorIntensityRising.z); - - colorsRisingFlag[globalInvocationID] = isColorIntensityRising; -} - -float getNewAxieColor(float colorAxie, bool colorIntensityRisingAxieFlag) -{ - const float colorDelta = 0.04; - - if(colorIntensityRisingAxieFlag) - colorAxie += colorDelta; - else - colorAxie -= colorDelta; - - return colorAxie; -} - -vec3 getNewColor(vec3 color) -{ - uint globalInvocationID = gl_GlobalInvocationID.x; // the .y and .z are both 1 in this case - bvec4 isColorIntensityRising = 
colorsRisingFlag[globalInvocationID]; - - return vec3(getNewAxieColor(color.x, isColorIntensityRising.x), getNewAxieColor(color.y, isColorIntensityRising.y), getNewAxieColor(color.z, isColorIntensityRising.z)); -} - -void main() -{ - const float deltaTime = 0.004; - - uint globalInvocationID = gl_GlobalInvocationID.x; // the .y and .z are both 1 in this case - - vec3 position = positions[globalInvocationID].xyz; - vec3 velocity = velocities[globalInvocationID].xyz; - vec3 color = colors[globalInvocationID].xyz; - - if(!pushConstants.isXPressed) - { - /* - if(pushConstants.isZPressed) - { - // TODO gravity to force a particle's velocity towards the user - } - */ - position += velocity * deltaTime; - } - - vec3 newComputedColor = getNewColor(color); - manageColorState(newComputedColor); - - positions[globalInvocationID].xyz = position; - velocities[globalInvocationID].xyz = velocity; - colors[globalInvocationID].xyz = newComputedColor; -} \ No newline at end of file diff --git a/53_ComputeShaders/config.json.template b/53_ComputeShaders/config.json.template deleted file mode 100644 index f961745c1..000000000 --- a/53_ComputeShaders/config.json.template +++ /dev/null @@ -1,28 +0,0 @@ -{ - "enableParallelBuild": true, - "threadsPerBuildProcess" : 2, - "isExecuted": false, - "scriptPath": "", - "cmake": { - "configurations": [ "Release", "Debug", "RelWithDebInfo" ], - "buildModes": [], - "requiredOptions": [] - }, - "profiles": [ - { - "backend": "vulkan", - "platform": "windows", - "buildModes": [], - "runConfiguration": "Release", - "gpuArchitectures": [] - } - ], - "dependencies": [], - "data": [ - { - "dependencies": [], - "command": [""], - "outputs": [] - } - ] -} \ No newline at end of file diff --git a/53_ComputeShaders/fragmentShader.frag b/53_ComputeShaders/fragmentShader.frag deleted file mode 100644 index 9fe445b2b..000000000 --- a/53_ComputeShaders/fragmentShader.frag +++ /dev/null @@ -1,12 +0,0 @@ -#version 430 core - -layout(location = 0) in vec4 
inFFullyProjectedVelocity; -layout(location = 1) in vec4 inFColor; - -layout(location = 0) out vec4 outColor; - -void main() -{ - outColor = inFColor; -} - \ No newline at end of file diff --git a/53_ComputeShaders/geometryShader.geom b/53_ComputeShaders/geometryShader.geom deleted file mode 100644 index 4a8bf36f0..000000000 --- a/53_ComputeShaders/geometryShader.geom +++ /dev/null @@ -1,27 +0,0 @@ -#version 450 core - -#include "shaderCommon.glsl" - -layout(location = 0) in vec4 gFullyProjectedVelocity[]; -layout(location = 1) in vec4 gColor[]; - -layout(location = 0) out vec4 outFVelocity; -layout(location = 1) out vec4 outFColor; - -layout (points) in; -layout (line_strip, max_vertices = 2) out; - -void main() -{ - if(pushConstants.isCPressed) - { - outFColor = vec4(0.0, 1.0, 0.0, 0.0); - gl_Position = gl_in[0].gl_Position; - EmitVertex(); - gl_Position = gl_in[0].gl_Position + gFullyProjectedVelocity[0]; - EmitVertex(); - - EndPrimitive(); - } -} - \ No newline at end of file diff --git a/53_ComputeShaders/main.cpp b/53_ComputeShaders/main.cpp deleted file mode 100644 index b8fb14017..000000000 --- a/53_ComputeShaders/main.cpp +++ /dev/null @@ -1,694 +0,0 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". 
-// For conditions of distribution and use, see copyright notice in nabla.h - -#define _NBL_STATIC_LIB_ -#include -#include -#include - -#include "CCamera.hpp" -#include "../common/CommonAPI.h" -#include "nbl/ext/ScreenShot/ScreenShot.h" - -using namespace nbl; -using namespace asset; -using namespace core; - -/* - Uncomment for more detailed logging -*/ - -// #define NBL_MORE_LOGS - -class CEventReceiver -{ -public: - CEventReceiver() : particlesVectorChangeFlag(false), forceChangeVelocityFlag(false), visualizeVelocityVectorsFlag(false) {} - - void process(const ui::IKeyboardEventChannel::range_t& events) - { - particlesVectorChangeFlag = false; - forceChangeVelocityFlag = false; - visualizeVelocityVectorsFlag = false; - - for (auto eventIterator = events.begin(); eventIterator != events.end(); eventIterator++) - { - auto event = *eventIterator; - - if (event.keyCode == nbl::ui::EKC_X) - particlesVectorChangeFlag = true; - - if (event.keyCode == nbl::ui::EKC_Z) - forceChangeVelocityFlag = true; - - if (event.keyCode == nbl::ui::EKC_C) - visualizeVelocityVectorsFlag = true; - - if (event.keyCode == nbl::ui::EKC_V) - visualizeVelocityVectorsFlag = false; - } - } - - inline bool isXPressed() const { return particlesVectorChangeFlag; } - inline bool isZPressed() const { return forceChangeVelocityFlag; } - inline bool isCPressed() const { return visualizeVelocityVectorsFlag; } - -private: - bool particlesVectorChangeFlag; - bool forceChangeVelocityFlag; - bool visualizeVelocityVectorsFlag; -}; - -_NBL_STATIC_INLINE_CONSTEXPR size_t NUMBER_OF_PARTICLES = 1024 * 1024; // total number of particles to move -_NBL_STATIC_INLINE_CONSTEXPR size_t WORK_GROUP_SIZE = 128; // work-items per work-group - -enum E_ENTRIES -{ - EE_POSITIONS, - EE_VELOCITIES, - EE_COLORS, - EE_COLORS_RISING_FLAG, - EE_COUNT -}; - -#include "nbl/nblpack.h" -struct alignas(16) SShaderStorageBufferObject -{ - core::vector4df_SIMD positions[NUMBER_OF_PARTICLES]; - core::vector4df_SIMD 
velocities[NUMBER_OF_PARTICLES]; - core::vector4df_SIMD colors[NUMBER_OF_PARTICLES]; - bool isColorIntensityRising[NUMBER_OF_PARTICLES][4]; -} PACK_STRUCT; -#include "nbl/nblunpack.h" - -static_assert(sizeof(SShaderStorageBufferObject) == sizeof(SShaderStorageBufferObject::positions) + sizeof(SShaderStorageBufferObject::velocities) + sizeof(SShaderStorageBufferObject::colors) + sizeof(SShaderStorageBufferObject::isColorIntensityRising), "There will be inproper alignment!"); - -#include "nbl/nblpack.h" -struct alignas(32) SPushConstants -{ - uint32_t isXPressed = false; - uint32_t isZPressed = false; - uint32_t isCPressed = false; - core::vector3df currentUserAbsolutePosition; -} PACK_STRUCT; -#include "nbl/nblunpack.h" - -void triggerRandomSetup(SShaderStorageBufferObject* ssbo) -{ - _NBL_STATIC_INLINE_CONSTEXPR float POSITION_EACH_AXIE_MIN = -10.f; - _NBL_STATIC_INLINE_CONSTEXPR float POSITION_EACH_AXIE_MAX = 10.f; - - _NBL_STATIC_INLINE_CONSTEXPR float VELOCITY_EACH_AXIE_MIN = 0.f; - _NBL_STATIC_INLINE_CONSTEXPR float VELOCITY_EACH_AXIE_MAX = 0.001f; - - _NBL_STATIC_INLINE_CONSTEXPR float COLOR_EACH_AXIE_MIN = 0.f; - _NBL_STATIC_INLINE_CONSTEXPR float COLOR_EACH_AXIE_MAX = 1.f; - - auto get_random = [&](const float& min, const float& max) - { - static std::default_random_engine engine; - static std::uniform_real_distribution<> distribution(min, max); - return distribution(engine); - }; - - for (size_t i = 0; i < NUMBER_OF_PARTICLES; ++i) - { - ssbo->positions[i] = core::vector4df_SIMD(get_random(POSITION_EACH_AXIE_MIN, POSITION_EACH_AXIE_MAX), get_random(POSITION_EACH_AXIE_MIN, POSITION_EACH_AXIE_MAX), get_random(POSITION_EACH_AXIE_MIN, POSITION_EACH_AXIE_MAX), get_random(POSITION_EACH_AXIE_MIN, POSITION_EACH_AXIE_MAX)); - ssbo->velocities[i] = core::vector4df_SIMD(get_random(VELOCITY_EACH_AXIE_MIN, VELOCITY_EACH_AXIE_MAX), get_random(VELOCITY_EACH_AXIE_MIN, VELOCITY_EACH_AXIE_MAX), get_random(VELOCITY_EACH_AXIE_MIN, VELOCITY_EACH_AXIE_MAX), 
get_random(VELOCITY_EACH_AXIE_MIN, VELOCITY_EACH_AXIE_MAX)); - ssbo->colors[i] = core::vector4df_SIMD(get_random(COLOR_EACH_AXIE_MIN, COLOR_EACH_AXIE_MAX), get_random(COLOR_EACH_AXIE_MIN, COLOR_EACH_AXIE_MAX), get_random(COLOR_EACH_AXIE_MIN, COLOR_EACH_AXIE_MAX), get_random(COLOR_EACH_AXIE_MIN, COLOR_EACH_AXIE_MAX)); - - for (uint8_t b = 0; b < 4; ++b) - ssbo->isColorIntensityRising[i][b] = true; - } -} - -class MeshLoadersApp : public ApplicationBase -{ - static constexpr uint32_t WIN_W = 1280; - static constexpr uint32_t WIN_H = 720; - static constexpr uint32_t FBO_COUNT = 2u; - static constexpr uint32_t FRAMES_IN_FLIGHT = 1u; - static constexpr size_t NBL_FRAMES_TO_AVERAGE = 100ull; - -public: - nbl::core::smart_refctd_ptr windowManager; - nbl::core::smart_refctd_ptr window; - nbl::core::smart_refctd_ptr windowCallback; - nbl::core::smart_refctd_ptr gl; - nbl::core::smart_refctd_ptr surface; - nbl::core::smart_refctd_ptr utilities; - nbl::core::smart_refctd_ptr logicalDevice; - nbl::video::IPhysicalDevice* gpuPhysicalDevice; - std::array queues = { nullptr, nullptr, nullptr, nullptr }; - nbl::core::smart_refctd_ptr swapchain; - nbl::core::smart_refctd_ptr renderpass; - nbl::core::smart_refctd_dynamic_array> fbo; - std::array, CommonAPI::InitOutput::MaxFramesInFlight>, CommonAPI::InitOutput::MaxQueuesCount> commandPools; - nbl::core::smart_refctd_ptr system; - nbl::core::smart_refctd_ptr assetManager; - nbl::video::IGPUObjectFromAssetConverter::SParams cpu2gpuParams; - nbl::core::smart_refctd_ptr logger; - nbl::core::smart_refctd_ptr inputSystem; - - nbl::core::smart_refctd_ptr gpuTransferFence; - nbl::core::smart_refctd_ptr gpuComputeFence; - nbl::video::IGPUObjectFromAssetConverter cpu2gpu; - - core::smart_refctd_ptr commandBuffers[1]; - - CEventReceiver eventReceiver; - CommonAPI::InputSystem::ChannelReader mouse; - CommonAPI::InputSystem::ChannelReader keyboard; - - Camera camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), 
core::matrix4SIMD()); - std::chrono::system_clock::time_point lastTime; - size_t frame_count = 0ull; - double time_sum = 0; - double dtList[NBL_FRAMES_TO_AVERAGE] = {}; - - SPushConstants pushConstants; - nbl::core::smart_refctd_ptr gpuComputePipeline; - nbl::core::smart_refctd_ptr gpuCDescriptorSet; - nbl::core::smart_refctd_ptr gpuUBO; - nbl::core::smart_refctd_ptr gpuGraphicsPipeline; - nbl::core::smart_refctd_ptr gpuGraphicsPipeline2; - nbl::core::smart_refctd_ptr gpuMeshBuffer; - nbl::core::smart_refctd_ptr gpuMeshBuffer2; - core::smart_refctd_ptr gpuGDescriptorSet1; - nbl::core::smart_refctd_ptr render_finished_sem; - nbl::video::ISwapchain::SCreationParams m_swapchainCreationParams; - - void setWindow(core::smart_refctd_ptr&& wnd) override - { - window = std::move(wnd); - } - void setSystem(core::smart_refctd_ptr&& s) override - { - system = std::move(s); - } - nbl::ui::IWindow* getWindow() override - { - return window.get(); - } - video::IAPIConnection* getAPIConnection() override - { - return gl.get(); - } - video::ILogicalDevice* getLogicalDevice() override - { - return logicalDevice.get(); - } - video::IGPURenderpass* getRenderpass() override - { - return renderpass.get(); - } - void setSurface(core::smart_refctd_ptr&& s) override - { - surface = std::move(s); - } - void setFBOs(std::vector>& f) override - { - for (int i = 0; i < f.size(); i++) - { - fbo->begin()[i] = core::smart_refctd_ptr(f[i]); - } - } - void setSwapchain(core::smart_refctd_ptr&& s) override - { - swapchain = std::move(s); - } - uint32_t getSwapchainImageCount() override - { - return swapchain->getImageCount(); - } - virtual nbl::asset::E_FORMAT getDepthFormat() override - { - return nbl::asset::EF_D32_SFLOAT; - } - -APP_CONSTRUCTOR(MeshLoadersApp) - - void onAppInitialized_impl() override - { - const auto swapchainImageUsage = static_cast(asset::IImage::EUF_COLOR_ATTACHMENT_BIT); - CommonAPI::InitParams initParams; - initParams.window = core::smart_refctd_ptr(window); - 
initParams.apiType = video::EAT_VULKAN; - initParams.appName = { _NBL_APP_NAME_ }; - initParams.framesInFlight = FRAMES_IN_FLIGHT; - initParams.windowWidth = WIN_W; - initParams.windowHeight = WIN_H; - initParams.swapchainImageCount = FBO_COUNT; - initParams.swapchainImageUsage = swapchainImageUsage; - initParams.depthFormat = nbl::asset::EF_D32_SFLOAT; - auto initOutput = CommonAPI::InitWithDefaultExt(std::move(initParams)); - - window = std::move(initParams.window); - gl = std::move(initOutput.apiConnection); - surface = std::move(initOutput.surface); - gpuPhysicalDevice = std::move(initOutput.physicalDevice); - logicalDevice = std::move(initOutput.logicalDevice); - queues = std::move(initOutput.queues); - renderpass = std::move(initOutput.renderToSwapchainRenderpass); - commandPools = std::move(initOutput.commandPools); - assetManager = std::move(initOutput.assetManager); - logger = std::move(initOutput.logger); - inputSystem = std::move(initOutput.inputSystem); - windowCallback = std::move(initParams.windowCb); - cpu2gpuParams = std::move(initOutput.cpu2gpuParams); - m_swapchainCreationParams = std::move(initOutput.swapchainCreationParams); - auto defaultGraphicsCommandPool = commandPools[CommonAPI::InitOutput::EQT_GRAPHICS][0]; - - CommonAPI::createSwapchain(std::move(logicalDevice), m_swapchainCreationParams, WIN_W, WIN_H, swapchain); - assert(swapchain); - fbo = CommonAPI::createFBOWithSwapchainImages( - swapchain->getImageCount(), WIN_W, WIN_H, - logicalDevice, swapchain, renderpass, - nbl::asset::EF_D32_SFLOAT - ); - - logicalDevice->createCommandBuffers(defaultGraphicsCommandPool.get(), nbl::video::IGPUCommandBuffer::EL_PRIMARY, 1, commandBuffers); - auto commandBuffer = commandBuffers[0]; - - auto createDescriptorPool = [&](const uint32_t itemCount, E_DESCRIPTOR_TYPE descriptorType) - { - constexpr uint32_t maxItemCount = 256u; - { - nbl::video::IDescriptorPool::SDescriptorPoolSize poolSize; - poolSize.count = itemCount; - poolSize.type = descriptorType; 
- return logicalDevice->createDescriptorPool(static_cast(0), maxItemCount, 1u, &poolSize); - } - }; - - /* - Compute pipeline - */ - - auto computeShaderBundle = assetManager->getAsset("../computeShader.comp", {}); - { - bool status = !computeShaderBundle.getContents().empty(); - assert(status); - } - - auto cpuComputeShader = core::smart_refctd_ptr_static_cast(computeShaderBundle.getContents().begin()[0]); - smart_refctd_ptr gpuComputeShader; - { - auto gpu_array = cpu2gpu.getGPUObjectsFromAssets(&cpuComputeShader, &cpuComputeShader + 1, cpu2gpuParams); - if (!gpu_array || gpu_array->size() < 1u || !(*gpu_array)[0]) - assert(false); - - gpuComputeShader = (*gpu_array)[0]; - } - - auto cpuSSBOBuffer = ICPUBuffer::create({ sizeof(SShaderStorageBufferObject) }); - cpuSSBOBuffer->addUsageFlags(asset::IBuffer::EUF_STORAGE_BUFFER_BIT); - triggerRandomSetup(reinterpret_cast(cpuSSBOBuffer->getPointer())); - core::smart_refctd_ptr gpuSSBOBuffer; - { - cpu2gpuParams.beginCommandBuffers(); - - auto gpu_array = cpu2gpu.getGPUObjectsFromAssets(&cpuSSBOBuffer, &cpuSSBOBuffer + 1, cpu2gpuParams); - if (!gpu_array || gpu_array->size() < 1u || !(*gpu_array)[0]) - assert(false); - - cpu2gpuParams.waitForCreationToComplete(false); - - auto gpuSSBOOffsetBufferPair = (*gpu_array)[0]; - gpuSSBOBuffer = core::smart_refctd_ptr(gpuSSBOOffsetBufferPair->getBuffer()); - } - - video::IGPUDescriptorSetLayout::SBinding gpuBindingsLayout[EE_COUNT] = - { - {EE_POSITIONS, EDT_STORAGE_BUFFER, 1u, video::IGPUShader::ESS_COMPUTE, nullptr}, - {EE_VELOCITIES, EDT_STORAGE_BUFFER, 1u, video::IGPUShader::ESS_COMPUTE, nullptr}, - {EE_COLORS, EDT_STORAGE_BUFFER, 1u, video::IGPUShader::ESS_COMPUTE, nullptr}, - {EE_COLORS_RISING_FLAG, EDT_STORAGE_BUFFER, 1u, video::IGPUShader::ESS_COMPUTE, nullptr} - }; - - auto gpuCDescriptorPool = createDescriptorPool(EE_COUNT, EDT_STORAGE_BUFFER); - auto gpuCDescriptorSetLayout = logicalDevice->createDescriptorSetLayout(gpuBindingsLayout, gpuBindingsLayout + EE_COUNT); - 
gpuCDescriptorSet = logicalDevice->createDescriptorSet(gpuCDescriptorPool.get(), core::smart_refctd_ptr(gpuCDescriptorSetLayout)); - { - video::IGPUDescriptorSet::SDescriptorInfo gpuDescriptorSetInfos[EE_COUNT]; - - gpuDescriptorSetInfos[EE_POSITIONS].desc = gpuSSBOBuffer; - gpuDescriptorSetInfos[EE_POSITIONS].buffer.size = sizeof(SShaderStorageBufferObject::positions); - gpuDescriptorSetInfos[EE_POSITIONS].buffer.offset = 0; - - gpuDescriptorSetInfos[EE_VELOCITIES].desc = gpuSSBOBuffer; - gpuDescriptorSetInfos[EE_VELOCITIES].buffer.size = sizeof(SShaderStorageBufferObject::velocities); - gpuDescriptorSetInfos[EE_VELOCITIES].buffer.offset = sizeof(SShaderStorageBufferObject::positions); - - gpuDescriptorSetInfos[EE_COLORS].desc = gpuSSBOBuffer; - gpuDescriptorSetInfos[EE_COLORS].buffer.size = sizeof(SShaderStorageBufferObject::colors); - gpuDescriptorSetInfos[EE_COLORS].buffer.offset = gpuDescriptorSetInfos[EE_VELOCITIES].buffer.offset + sizeof(SShaderStorageBufferObject::velocities); - - gpuDescriptorSetInfos[EE_COLORS_RISING_FLAG].desc = gpuSSBOBuffer; - gpuDescriptorSetInfos[EE_COLORS_RISING_FLAG].buffer.size = sizeof(SShaderStorageBufferObject::isColorIntensityRising); - gpuDescriptorSetInfos[EE_COLORS_RISING_FLAG].buffer.offset = gpuDescriptorSetInfos[EE_COLORS].buffer.offset + sizeof(SShaderStorageBufferObject::colors); - - video::IGPUDescriptorSet::SWriteDescriptorSet gpuWrites[EE_COUNT]; - { - for (uint32_t binding = 0u; binding < EE_COUNT; binding++) - gpuWrites[binding] = { gpuCDescriptorSet.get(), binding, 0u, 1u, EDT_STORAGE_BUFFER, gpuDescriptorSetInfos + binding }; - logicalDevice->updateDescriptorSets(EE_COUNT, gpuWrites, 0u, nullptr); - } - } - - asset::SPushConstantRange pushConstantRange; - { - pushConstantRange.stageFlags = (asset::IShader::E_SHADER_STAGE)(asset::IShader::ESS_COMPUTE | asset::IShader::ESS_GEOMETRY); - pushConstantRange.offset = 0; - pushConstantRange.size = sizeof(SPushConstants); - } - - auto gpuCPipelineLayout = 
logicalDevice->createPipelineLayout(&pushConstantRange, &pushConstantRange + 1, std::move(gpuCDescriptorSetLayout), nullptr, nullptr, nullptr); - gpuComputePipeline = logicalDevice->createComputePipeline(nullptr, std::move(gpuCPipelineLayout), std::move(gpuComputeShader)); - - /* - Graphics Pipeline - */ - - asset::SVertexInputParams inputVertexParams; - inputVertexParams.enabledAttribFlags = core::createBitmask({ EE_POSITIONS, EE_VELOCITIES, EE_COLORS, EE_COLORS_RISING_FLAG }); - inputVertexParams.enabledBindingFlags = core::createBitmask({ EE_POSITIONS, EE_VELOCITIES, EE_COLORS, EE_COLORS_RISING_FLAG }); - - for (uint8_t i = 0; i < EE_COUNT; ++i) - { - inputVertexParams.bindings[i].stride = (i == EE_COLORS_RISING_FLAG ? getTexelOrBlockBytesize(EF_R8G8B8A8_UINT) : getTexelOrBlockBytesize(EF_R32G32B32A32_SFLOAT)); - inputVertexParams.bindings[i].inputRate = asset::EVIR_PER_VERTEX; - - inputVertexParams.attributes[i].binding = i; - inputVertexParams.attributes[i].format = (i == EE_COLORS_RISING_FLAG ? 
EF_R8G8B8A8_UINT : asset::EF_R32G32B32A32_SFLOAT); - inputVertexParams.attributes[i].relativeOffset = 0; - } - - asset::SBlendParams blendParams; - asset::SPrimitiveAssemblyParams primitiveAssemblyParams; - primitiveAssemblyParams.primitiveType = EPT_POINT_LIST; - asset::SRasterizationParams rasterizationParams; - - video::IGPUDescriptorSetLayout::SBinding gpuUboBinding = {}; - gpuUboBinding.count = 1u; - gpuUboBinding.binding = 0; - gpuUboBinding.stageFlags = static_cast(asset::ICPUShader::ESS_VERTEX | asset::ICPUShader::ESS_FRAGMENT); - gpuUboBinding.type = asset::EDT_UNIFORM_BUFFER; - - auto gpuGDescriptorPool = createDescriptorPool(1, EDT_UNIFORM_BUFFER); - auto gpuGDs1Layout = logicalDevice->createDescriptorSetLayout(&gpuUboBinding, &gpuUboBinding + 1); - - video::IGPUBuffer::SCreationParams gpuUBOCreationParams; - //gpuUBOCreationParams.size = sizeof(SBasicViewParameters); - gpuUBOCreationParams.usage = asset::IBuffer::E_USAGE_FLAGS(asset::IBuffer::EUF_UNIFORM_BUFFER_BIT | asset::IBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF); - gpuUBOCreationParams.queueFamilyIndexCount = 0u; - gpuUBOCreationParams.queueFamilyIndices = nullptr; - gpuUBOCreationParams.size = sizeof(SBasicViewParameters); - - gpuUBO = logicalDevice->createBuffer(std::move(gpuUBOCreationParams)); - auto gpuUBOmemreqs = gpuUBO->getMemoryReqs(); - gpuUBOmemreqs.memoryTypeBits &= gpuPhysicalDevice->getDeviceLocalMemoryTypeBits(); - logicalDevice->allocate(gpuUBOmemreqs, gpuUBO.get()); - - gpuGDescriptorSet1 = logicalDevice->createDescriptorSet(gpuGDescriptorPool.get(), gpuGDs1Layout); - { - video::IGPUDescriptorSet::SWriteDescriptorSet write; - write.dstSet = gpuGDescriptorSet1.get(); - write.binding = 0; - write.count = 1u; - write.arrayElement = 0u; - write.descriptorType = asset::EDT_UNIFORM_BUFFER; - video::IGPUDescriptorSet::SDescriptorInfo info; - { - info.desc = gpuUBO; - info.buffer.offset = 0ull; - info.buffer.size = sizeof(SBasicViewParameters); - } - write.info = &info; - 
logicalDevice->updateDescriptorSets(1u, &write, 0u, nullptr); - } - - auto vertexShaderBundle = assetManager->getAsset("../vertexShader.vert", {}); - { - bool status = !vertexShaderBundle.getContents().empty(); - assert(status); - } - - auto cpuVertexShader = core::smart_refctd_ptr_static_cast(vertexShaderBundle.getContents().begin()[0]); - smart_refctd_ptr gpuVertexShader; - { - auto gpu_array = cpu2gpu.getGPUObjectsFromAssets(&cpuVertexShader, &cpuVertexShader + 1, cpu2gpuParams); - if (!gpu_array || gpu_array->size() < 1u || !(*gpu_array)[0]) - assert(false); - - gpuVertexShader = (*gpu_array)[0]; - } - - auto fragmentShaderBundle = assetManager->getAsset("../fragmentShader.frag", {}); - { - bool status = !fragmentShaderBundle.getContents().empty(); - assert(status); - } - - auto cpuFragmentShader = core::smart_refctd_ptr_static_cast(fragmentShaderBundle.getContents().begin()[0]); - smart_refctd_ptr gpuFragmentShader; - { - auto gpu_array = cpu2gpu.getGPUObjectsFromAssets(&cpuFragmentShader, &cpuFragmentShader + 1, cpu2gpuParams); - if (!gpu_array || gpu_array->size() < 1u || !(*gpu_array)[0]) - assert(false); - - gpuFragmentShader = (*gpu_array)[0]; - } - - auto geometryShaderBundle = assetManager->getAsset("../geometryShader.geom", {}); - { - bool status = !geometryShaderBundle.getContents().empty(); - assert(status); - } - - auto cpuGeometryShader = core::smart_refctd_ptr_static_cast(geometryShaderBundle.getContents().begin()[0]); - smart_refctd_ptr gpuGeometryShader; - { - auto gpu_array = cpu2gpu.getGPUObjectsFromAssets(&cpuGeometryShader, &cpuGeometryShader + 1, cpu2gpuParams); - if (!gpu_array || gpu_array->size() < 1u || !(*gpu_array)[0]) - assert(false); - - gpuGeometryShader = (*gpu_array)[0]; - } - - core::smart_refctd_ptr gpuGShaders[] = { gpuVertexShader, gpuFragmentShader, gpuGeometryShader }; - auto gpuGShadersPointer = reinterpret_cast(gpuGShaders); - - auto gpuGPipelineLayout = logicalDevice->createPipelineLayout(&pushConstantRange, 
&pushConstantRange + 1, nullptr, std::move(gpuGDs1Layout), nullptr, nullptr); - auto gpuRenderpassIndependentPipeline = logicalDevice->createRenderpassIndependentPipeline(nullptr, core::smart_refctd_ptr(gpuGPipelineLayout), gpuGShadersPointer, gpuGShadersPointer + 2 /* discard geometry shader*/, inputVertexParams, blendParams, primitiveAssemblyParams, rasterizationParams); - auto gpuRenderpassIndependentPipeline2 = logicalDevice->createRenderpassIndependentPipeline(nullptr, core::smart_refctd_ptr(gpuGPipelineLayout), gpuGShadersPointer, gpuGShadersPointer + 3, inputVertexParams, blendParams, primitiveAssemblyParams, rasterizationParams); - - asset::SBufferBinding gpuGbindings[video::IGPUMeshBuffer::MAX_ATTR_BUF_BINDING_COUNT]; - - gpuGbindings[EE_POSITIONS].buffer = gpuSSBOBuffer; - gpuGbindings[EE_POSITIONS].offset = 0; - - gpuGbindings[EE_VELOCITIES].buffer = gpuSSBOBuffer; - gpuGbindings[EE_VELOCITIES].offset = sizeof(SShaderStorageBufferObject::positions); - - gpuGbindings[EE_COLORS].buffer = gpuSSBOBuffer; - gpuGbindings[EE_COLORS].offset = gpuGbindings[EE_VELOCITIES].offset + sizeof(SShaderStorageBufferObject::velocities); - - gpuGbindings[EE_COLORS_RISING_FLAG].buffer = gpuSSBOBuffer; - gpuGbindings[EE_COLORS_RISING_FLAG].offset = gpuGbindings[EE_COLORS].offset + sizeof(SShaderStorageBufferObject::colors); - - gpuMeshBuffer = core::make_smart_refctd_ptr(std::move(gpuRenderpassIndependentPipeline), nullptr, gpuGbindings, asset::SBufferBinding()); - { - gpuMeshBuffer->setIndexType(asset::EIT_UNKNOWN); - gpuMeshBuffer->setIndexCount(NUMBER_OF_PARTICLES); - } - - { - nbl::video::IGPUGraphicsPipeline::SCreationParams graphicsPipelineParams; - graphicsPipelineParams.renderpassIndependent = core::smart_refctd_ptr(const_cast(gpuMeshBuffer->getPipeline())); - graphicsPipelineParams.renderpass = core::smart_refctd_ptr(renderpass); - gpuGraphicsPipeline = logicalDevice->createGraphicsPipeline(nullptr, std::move(graphicsPipelineParams)); - } - - gpuMeshBuffer2 = 
core::make_smart_refctd_ptr(std::move(gpuRenderpassIndependentPipeline2), nullptr, gpuGbindings, asset::SBufferBinding()); - { - gpuMeshBuffer2->setIndexType(asset::EIT_UNKNOWN); - gpuMeshBuffer2->setIndexCount(NUMBER_OF_PARTICLES); - } - - { - nbl::video::IGPUGraphicsPipeline::SCreationParams graphicsPipelineParams; - graphicsPipelineParams.renderpassIndependent = core::smart_refctd_ptr(const_cast(gpuMeshBuffer2->getPipeline())); - graphicsPipelineParams.renderpass = core::smart_refctd_ptr(renderpass); - gpuGraphicsPipeline2 = logicalDevice->createGraphicsPipeline(nullptr, std::move(graphicsPipelineParams)); - } - - const std::string captionData = "[Nabla Engine] Compute Shaders"; - window->setCaption(captionData); - - core::vectorSIMDf cameraPosition(0, 0, 0); - matrix4SIMD projectionMatrix = matrix4SIMD::buildProjectionMatrixPerspectiveFovLH(core::radians(60.0f), video::ISurface::getTransformedAspectRatio(swapchain->getPreTransform(), WIN_W, WIN_H), 0.001, 1000); - camera = Camera(cameraPosition, core::vectorSIMDf(0, 0, -1), projectionMatrix, 10.f, 1.f); - lastTime = std::chrono::system_clock::now(); - for (size_t i = 0ull; i < NBL_FRAMES_TO_AVERAGE; ++i) - dtList[i] = 0.0; - } - - void onAppTerminated_impl() override - { - const auto& fboCreationParams = fbo->begin()[0]->getCreationParameters(); - auto gpuSourceImageView = fboCreationParams.attachments[0]; - - bool status = ext::ScreenShot::createScreenShot(logicalDevice.get(), - queues[CommonAPI::InitOutput::EQT_TRANSFER_UP], - render_finished_sem.get(), - gpuSourceImageView.get(), - assetManager.get(), - "ScreenShot.png", - asset::IImage::EL_PRESENT_SRC, - asset::EAF_NONE); - - assert(status); - } - - void workLoopBody() override - { - auto renderStart = std::chrono::system_clock::now(); - const auto renderDt = std::chrono::duration_cast(renderStart - lastTime).count(); - lastTime = renderStart; - { // Calculate Simple Moving Average for FrameTime - time_sum -= dtList[frame_count]; - time_sum += renderDt; - 
dtList[frame_count] = renderDt; - frame_count++; - if (frame_count >= NBL_FRAMES_TO_AVERAGE) - frame_count = 0; - } - const double averageFrameTime = time_sum / (double)NBL_FRAMES_TO_AVERAGE; - -#ifdef NBL_MORE_LOGS - logger->log("renderDt = %f ------ averageFrameTime = %f", system::ILogger::ELL_INFO, renderDt, averageFrameTime); -#endif // NBL_MORE_LOGS - - auto averageFrameTimeDuration = std::chrono::duration(averageFrameTime); - auto nextPresentationTime = renderStart + averageFrameTimeDuration; - auto nextPresentationTimeStamp = std::chrono::duration_cast(nextPresentationTime.time_since_epoch()); - - inputSystem->getDefaultMouse(&mouse); - inputSystem->getDefaultKeyboard(&keyboard); - - camera.beginInputProcessing(nextPresentationTimeStamp); - mouse.consumeEvents([&](const ui::IMouseEventChannel::range_t& events) -> void { camera.mouseProcess(events); }, logger.get()); - keyboard.consumeEvents([&](const ui::IKeyboardEventChannel::range_t& events) -> void { camera.keyboardProcess(events); eventReceiver.process(events); }, logger.get()); - camera.endInputProcessing(nextPresentationTimeStamp); - - const auto& viewMatrix = camera.getViewMatrix(); - const auto& viewProjectionMatrix = matrix4SIMD::concatenateBFollowedByAPrecisely( - video::ISurface::getSurfaceTransformationMatrix(swapchain->getPreTransform()), - camera.getConcatenatedMatrix() - ); - - auto& commandBuffer = commandBuffers[0]; - commandBuffer->reset(nbl::video::IGPUCommandBuffer::ERF_RELEASE_RESOURCES_BIT); - commandBuffer->begin(video::IGPUCommandBuffer::EU_ONE_TIME_SUBMIT_BIT); // TODO: Reset Frame's CommandPool - - asset::SViewport viewport; - viewport.minDepth = 1.f; - viewport.maxDepth = 0.f; - viewport.x = 0u; - viewport.y = 0u; - viewport.width = WIN_W; - viewport.height = WIN_H; - commandBuffer->setViewport(0u, 1u, &viewport); - - nbl::video::IGPUCommandBuffer::SRenderpassBeginInfo beginInfo; - VkRect2D area; - area.offset = { 0,0 }; - area.extent = { WIN_W, WIN_H }; - nbl::asset::SClearValue 
clear[2]; - clear[0].color.float32[0] = 0.f; - clear[0].color.float32[1] = 0.f; - clear[0].color.float32[2] = 0.f; - clear[0].color.float32[3] = 0.f; - clear[1].depthStencil.depth = 0.f; - - beginInfo.clearValueCount = 2u; - beginInfo.framebuffer = fbo->begin()[0]; - beginInfo.renderpass = renderpass; - beginInfo.renderArea = area; - beginInfo.clearValues = clear; - - commandBuffer->beginRenderPass(&beginInfo, nbl::asset::ESC_INLINE); - - pushConstants.isXPressed = eventReceiver.isXPressed(); - pushConstants.isZPressed = eventReceiver.isZPressed(); - pushConstants.isCPressed = eventReceiver.isCPressed(); - pushConstants.currentUserAbsolutePosition = camera.getPosition().getAsVector3df(); - - /* - Calculation of particle postitions takes place here - */ - - commandBuffer->bindComputePipeline(gpuComputePipeline.get()); - commandBuffer->pushConstants(gpuComputePipeline->getLayout(), asset::IShader::ESS_COMPUTE, 0, sizeof(SPushConstants), &pushConstants); - commandBuffer->bindDescriptorSets(EPBP_COMPUTE, gpuComputePipeline->getLayout(), 0, 1, &gpuCDescriptorSet.get(), 0u); - - static_assert(NUMBER_OF_PARTICLES % WORK_GROUP_SIZE == 0, "Inccorect amount!"); - _NBL_STATIC_INLINE_CONSTEXPR size_t groupCountX = NUMBER_OF_PARTICLES / WORK_GROUP_SIZE; - - commandBuffer->dispatch(groupCountX, 1, 1); - - /* - After calculation of positions each particle gets displayed - */ - - core::matrix3x4SIMD modelMatrix; - modelMatrix.setTranslation(nbl::core::vectorSIMDf(0, 0, 0, 0)); - - core::matrix4SIMD mvp = core::concatenateBFollowedByA(viewProjectionMatrix, modelMatrix); - - SBasicViewParameters uboData; - memcpy(uboData.MV, viewMatrix.pointer(), sizeof(uboData.MV)); - memcpy(uboData.MVP, mvp.pointer(), sizeof(uboData.MVP)); - memcpy(uboData.NormalMat, viewMatrix.pointer(), sizeof(uboData.NormalMat)); - commandBuffer->updateBuffer(gpuUBO.get(), 0ull, sizeof(uboData), &uboData); - - /* - Draw particles - */ - - commandBuffer->bindGraphicsPipeline(gpuGraphicsPipeline.get()); - 
commandBuffer->bindDescriptorSets(asset::EPBP_GRAPHICS, gpuMeshBuffer->getPipeline()->getLayout(), 1u, 1u, &gpuGDescriptorSet1.get(), 0u); - commandBuffer->drawMeshBuffer(gpuMeshBuffer.get()); - - /* - Draw extras with geometry usage under key c and v conditions - */ - - commandBuffer->bindGraphicsPipeline(gpuGraphicsPipeline2.get()); - commandBuffer->pushConstants(gpuMeshBuffer2->getPipeline()->getLayout(), asset::IShader::ESS_GEOMETRY, 0, sizeof(SPushConstants), &pushConstants); - commandBuffer->bindDescriptorSets(asset::EPBP_GRAPHICS, gpuMeshBuffer2->getPipeline()->getLayout(), 1u, 1u, &gpuGDescriptorSet1.get(), 0u); - commandBuffer->drawMeshBuffer(gpuMeshBuffer2.get()); - - commandBuffer->endRenderPass(); - commandBuffer->end(); - - auto img_acq_sem = logicalDevice->createSemaphore(); - render_finished_sem = logicalDevice->createSemaphore(); - - uint32_t imgnum = 0u; - constexpr uint64_t MAX_TIMEOUT = 99999999999999ull; // ns - swapchain->acquireNextImage(MAX_TIMEOUT, img_acq_sem.get(), nullptr, &imgnum); - - CommonAPI::Submit(logicalDevice.get(), commandBuffer.get(), queues[CommonAPI::InitOutput::EQT_GRAPHICS], img_acq_sem.get(), render_finished_sem.get()); - CommonAPI::Present(logicalDevice.get(), swapchain.get(), queues[CommonAPI::InitOutput::EQT_GRAPHICS], render_finished_sem.get(), imgnum); - } - - bool keepRunning() override - { - return windowCallback->isWindowOpen(); - } -}; - -NBL_COMMON_API_MAIN(MeshLoadersApp, MeshLoadersApp::Nabla) diff --git a/53_ComputeShaders/pipeline.groovy b/53_ComputeShaders/pipeline.groovy deleted file mode 100644 index e8eb74b5b..000000000 --- a/53_ComputeShaders/pipeline.groovy +++ /dev/null @@ -1,50 +0,0 @@ -import org.DevshGraphicsProgramming.Agent -import org.DevshGraphicsProgramming.BuilderInfo -import org.DevshGraphicsProgramming.IBuilder - -class CComputeShadersBuilder extends IBuilder -{ - public CComputeShadersBuilder(Agent _agent, _info) - { - super(_agent, _info) - } - - @Override - public boolean prepare(Map 
axisMapping) - { - return true - } - - @Override - public boolean build(Map axisMapping) - { - IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION") - IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE") - - def nameOfBuildDirectory = getNameOfBuildDirectory(buildType) - def nameOfConfig = getNameOfConfig(config) - - agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v") - - return true - } - - @Override - public boolean test(Map axisMapping) - { - return true - } - - @Override - public boolean install(Map axisMapping) - { - return true - } -} - -def create(Agent _agent, _info) -{ - return new CComputeShadersBuilder(_agent, _info) -} - -return this \ No newline at end of file diff --git a/53_ComputeShaders/shaderCommon.glsl b/53_ComputeShaders/shaderCommon.glsl deleted file mode 100644 index 972a8789a..000000000 --- a/53_ComputeShaders/shaderCommon.glsl +++ /dev/null @@ -1,6 +0,0 @@ -layout(push_constant, row_major) uniform Block{ - bool isXPressed; - bool isZPressed; - bool isCPressed; - vec3 currentUserAbsolutePostion; -} pushConstants; \ No newline at end of file diff --git a/53_ComputeShaders/vertexShader.vert b/53_ComputeShaders/vertexShader.vert deleted file mode 100644 index 6b14d97c8..000000000 --- a/53_ComputeShaders/vertexShader.vert +++ /dev/null @@ -1,23 +0,0 @@ -#version 430 core - -layout(location = 0) in vec4 vPosition; -layout(location = 1) in vec4 vVelocity; -layout(location = 2) in vec4 vColor; - -#include -#include - -layout (set = 1, binding = 0, row_major, std140) uniform UBO -{ - nbl_glsl_SBasicViewParameters params; -} cameraData; - -layout(location = 0) flat out vec4 outGOrFFullyProjectedVelocity; -layout(location = 1) flat out vec4 outGorFColor; - -void main() -{ - gl_Position = (cameraData.params.MVP) * vPosition; - outGOrFFullyProjectedVelocity = (cameraData.params.MVP) * vVelocity * 0.0001; 
- outGorFColor = vColor; -} \ No newline at end of file diff --git a/56_RayQuery/CMakeLists.txt b/56_RayQuery/CMakeLists.txt deleted file mode 100644 index a476b6203..000000000 --- a/56_RayQuery/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ - -include(common RESULT_VARIABLE RES) -if(NOT RES) - message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory") -endif() - -nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") \ No newline at end of file diff --git a/56_RayQuery/common.glsl b/56_RayQuery/common.glsl deleted file mode 100644 index ad88789f8..000000000 --- a/56_RayQuery/common.glsl +++ /dev/null @@ -1,793 +0,0 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". -// For conditions of distribution and use, see copyright notice in nabla.h - -// basic settings -#define MAX_DEPTH 15 -#define SAMPLES 32 - -// firefly and variance reduction techniques -//#define KILL_DIFFUSE_SPECULAR_PATHS -//#define VISUALIZE_HIGH_VARIANCE - -#define INVALID_ID_16BIT 0xffffu -struct Sphere -{ - vec3 position; - float radius2; - uint bsdfLightIDs; -}; - -layout(set=0, binding=0, rgba16f) uniform image2D outImage; - -layout(set = 2, binding = 0) uniform sampler2D envMap; -layout(set = 2, binding = 1) uniform usamplerBuffer sampleSequence; -layout(set = 2, binding = 2) uniform usampler2D scramblebuf; -layout(set = 2, binding = 3) uniform accelerationStructureEXT topLevelAS; -layout(set = 2, binding = 4) readonly restrict buffer InputBuffer -{ - Sphere spheres[]; -}; - -#ifndef _NBL_GLSL_WORKGROUP_SIZE_ -#define _NBL_GLSL_WORKGROUP_SIZE_ 16 -layout(local_size_x=_NBL_GLSL_WORKGROUP_SIZE_, local_size_y=_NBL_GLSL_WORKGROUP_SIZE_, local_size_z=1) in; -#endif - -ivec2 getCoordinates() { - return ivec2(gl_GlobalInvocationID.xy); -} - -vec2 getTexCoords() { - ivec2 imageSize = imageSize(outImage); - ivec2 iCoords = getCoordinates(); - return vec2(float(iCoords.x) / 
imageSize.x, 1.0 - float(iCoords.y) / imageSize.y); -} - - -#include -#include -#include - -#include - -layout(set = 1, binding = 0, row_major, std140) uniform UBO -{ - nbl_glsl_SBasicViewParameters params; -} cameraData; - -Sphere Sphere_Sphere(in vec3 position, in float radius, in uint bsdfID, in uint lightID) -{ - Sphere sphere; - sphere.position = position; - sphere.radius2 = radius*radius; - sphere.bsdfLightIDs = bitfieldInsert(bsdfID,lightID,16,16); - return sphere; -} - -// return intersection distance if found, FLT_NAN otherwise -float Sphere_intersect(in Sphere sphere, in vec3 origin, in vec3 direction) -{ - vec3 relOrigin = origin-sphere.position; - float relOriginLen2 = dot(relOrigin,relOrigin); - const float radius2 = sphere.radius2; - - float dirDotRelOrigin = dot(direction,relOrigin); - float det = radius2-relOriginLen2+dirDotRelOrigin*dirDotRelOrigin; - - // do some speculative math here - float detsqrt = sqrt(det); - return -dirDotRelOrigin+(relOriginLen2>radius2 ? (-detsqrt):detsqrt); -} - -vec3 Sphere_getNormal(in Sphere sphere, in vec3 position) -{ - const float radiusRcp = inversesqrt(sphere.radius2); - return (position-sphere.position)*radiusRcp; -} - -float Sphere_getSolidAngle_impl(in float cosThetaMax) -{ - return 2.0*nbl_glsl_PI*(1.0-cosThetaMax); -} -float Sphere_getSolidAngle(in Sphere sphere, in vec3 origin) -{ - float cosThetaMax = sqrt(1.0-sphere.radius2/nbl_glsl_lengthSq(sphere.position-origin)); - return Sphere_getSolidAngle_impl(cosThetaMax); -} - -struct Triangle -{ - vec3 vertex0; - uint bsdfLightIDs; - vec3 vertex1; - uint padding0; - vec3 vertex2; - uint padding1; -}; - -Triangle Triangle_Triangle(in mat3 vertices, in uint bsdfID, in uint lightID) -{ - Triangle tri; - tri.vertex0 = vertices[0]; - tri.vertex1 = vertices[1]; - tri.vertex2 = vertices[2]; - // - tri.bsdfLightIDs = bitfieldInsert(bsdfID, lightID, 16, 16); - return tri; -} - -// return intersection distance if found, FLT_NAN otherwise -float Triangle_intersect(in 
Triangle tri, in vec3 origin, in vec3 direction) -{ - const vec3 edges[2] = vec3[2](tri.vertex1-tri.vertex0,tri.vertex2-tri.vertex0); - - const vec3 h = cross(direction,edges[1]); - const float a = dot(edges[0],h); - - const vec3 relOrigin = origin-tri.vertex0; - - const float u = dot(relOrigin,h)/a; - - const vec3 q = cross(relOrigin,edges[0]); - const float v = dot(direction,q)/a; - - const float t = dot(edges[1],q)/a; - - return t>0.f&&u>=0.f&&v>=0.f&&(u+v)<=1.f ? t:nbl_glsl_FLT_NAN; -} - -vec3 Triangle_getNormalTimesArea_impl(in mat2x3 edges) -{ - return cross(edges[0],edges[1])*0.5; -} -vec3 Triangle_getNormalTimesArea(in Triangle tri) -{ - return Triangle_getNormalTimesArea_impl(mat2x3(tri.vertex1-tri.vertex0,tri.vertex2-tri.vertex0)); -} - - - -struct Rectangle -{ - vec3 offset; - uint bsdfLightIDs; - vec3 edge0; - uint padding0; - vec3 edge1; - uint padding1; -}; - -Rectangle Rectangle_Rectangle(in vec3 offset, in vec3 edge0, in vec3 edge1, in uint bsdfID, in uint lightID) -{ - Rectangle rect; - rect.offset = offset; - rect.edge0 = edge0; - rect.edge1 = edge1; - // - rect.bsdfLightIDs = bitfieldInsert(bsdfID, lightID, 16, 16); - return rect; -} - -// return intersection distance if found, FLT_NAN otherwise -float Rectangle_intersect(in Rectangle rect, in vec3 origin, in vec3 direction) -{ - const vec3 h = cross(direction,rect.edge1); - const float a = dot(rect.edge0,h); - - const vec3 relOrigin = origin-rect.offset; - - const float u = dot(relOrigin,h)/a; - - const vec3 q = cross(relOrigin,rect.edge0); - const float v = dot(direction,q)/a; - - const float t = dot(rect.edge1,q)/a; - - const bool intersection = t>0.f&&u>=0.f&&v>=0.f&&u<=1.f&&v<=1.f; - return intersection ? 
t:nbl_glsl_FLT_NAN; -} - -vec3 Rectangle_getNormalTimesArea(in Rectangle rect) -{ - return cross(rect.edge0,rect.edge1); -} - - - -#define DIFFUSE_OP 0u -#define CONDUCTOR_OP 1u -#define DIELECTRIC_OP 2u -#define OP_BITS_OFFSET 0 -#define OP_BITS_SIZE 2 -struct BSDFNode -{ - uvec4 data[2]; -}; - -uint BSDFNode_getType(in BSDFNode node) -{ - return bitfieldExtract(node.data[0].w,OP_BITS_OFFSET,OP_BITS_SIZE); -} -bool BSDFNode_isBSDF(in BSDFNode node) -{ - return BSDFNode_getType(node)==DIELECTRIC_OP; -} -bool BSDFNode_isNotDiffuse(in BSDFNode node) -{ - return BSDFNode_getType(node)!=DIFFUSE_OP; -} -float BSDFNode_getRoughness(in BSDFNode node) -{ - return uintBitsToFloat(node.data[1].w); -} -vec3 BSDFNode_getRealEta(in BSDFNode node) -{ - return uintBitsToFloat(node.data[0].rgb); -} -vec3 BSDFNode_getImaginaryEta(in BSDFNode node) -{ - return uintBitsToFloat(node.data[1].rgb); -} -mat2x3 BSDFNode_getEta(in BSDFNode node) -{ - return mat2x3(BSDFNode_getRealEta(node),BSDFNode_getImaginaryEta(node)); -} -#include -vec3 BSDFNode_getReflectance(in BSDFNode node, in float VdotH) -{ - const vec3 albedoOrRealIoR = uintBitsToFloat(node.data[0].rgb); - if (BSDFNode_isNotDiffuse(node)) - return nbl_glsl_fresnel_conductor(albedoOrRealIoR, BSDFNode_getImaginaryEta(node), VdotH); - else - return albedoOrRealIoR; -} - -float BSDFNode_getNEEProb(in BSDFNode bsdf) -{ - const float alpha = BSDFNode_isNotDiffuse(bsdf) ? 
BSDFNode_getRoughness(bsdf):1.0; - return min(8.0*alpha,1.0); -} - -#include -#include -float getLuma(in vec3 col) -{ - return dot(transpose(nbl_glsl_scRGBtoXYZ)[1],col); -} - -#define BSDF_COUNT 7 -BSDFNode bsdfs[BSDF_COUNT] = { - {{uvec4(floatBitsToUint(vec3(0.8,0.8,0.8)),DIFFUSE_OP),floatBitsToUint(vec4(0.0,0.0,0.0,0.0))}}, - {{uvec4(floatBitsToUint(vec3(0.8,0.4,0.4)),DIFFUSE_OP),floatBitsToUint(vec4(0.0,0.0,0.0,0.0))}}, - {{uvec4(floatBitsToUint(vec3(0.4,0.8,0.4)),DIFFUSE_OP),floatBitsToUint(vec4(0.0,0.0,0.0,0.0))}}, - {{uvec4(floatBitsToUint(vec3(1.02,1.02,1.3)),CONDUCTOR_OP),floatBitsToUint(vec4(1.0,1.0,2.0,0.0))}}, - {{uvec4(floatBitsToUint(vec3(1.02,1.3,1.02)),CONDUCTOR_OP),floatBitsToUint(vec4(1.0,2.0,1.0,0.0))}}, - {{uvec4(floatBitsToUint(vec3(1.02,1.3,1.02)),CONDUCTOR_OP),floatBitsToUint(vec4(1.0,2.0,1.0,0.15))}}, - {{uvec4(floatBitsToUint(vec3(1.4,1.45,1.5)),DIELECTRIC_OP),floatBitsToUint(vec4(0.0,0.0,0.0,0.0625))}} -}; - - -struct Light -{ - vec3 radiance; - uint objectID; -}; - -vec3 Light_getRadiance(in Light light) -{ - return light.radiance; -} -uint Light_getObjectID(in Light light) -{ - return light.objectID; -} - - -#define LIGHT_COUNT 1 -float scene_getLightChoicePdf(in Light light) -{ - return 1.0/float(LIGHT_COUNT); -} - - -#define LIGHT_COUNT 1 -Light lights[LIGHT_COUNT] = -{ - { - vec3(30.0,25.0,15.0), -#ifdef POLYGON_METHOD - 0u -#else - 8u -#endif - } -}; - - - -#define ANY_HIT_FLAG (-2147483648) -#define DEPTH_BITS_COUNT 8 -#define DEPTH_BITS_OFFSET (31-DEPTH_BITS_COUNT) -struct ImmutableRay_t -{ - vec3 origin; - vec3 direction; -#if POLYGON_METHOD==2 - vec3 normalAtOrigin; - bool wasBSDFAtOrigin; -#endif -}; -struct MutableRay_t -{ - float intersectionT; - uint objectID; - /* irrelevant here - uint triangleID; - vec2 barycentrics; - */ -}; -struct Payload_t -{ - vec3 accumulation; - float otherTechniqueHeuristic; - vec3 throughput; - #ifdef KILL_DIFFUSE_SPECULAR_PATHS - bool hasDiffuse; - #endif -}; - -struct Ray_t -{ - ImmutableRay_t 
_immutable; - MutableRay_t _mutable; - Payload_t _payload; -}; - - -#define INTERSECTION_ERROR_BOUND_LOG2 (-8.0) -float getTolerance_common(in uint depth) -{ - float depthRcp = 1.0/float(depth); - return INTERSECTION_ERROR_BOUND_LOG2;// *depthRcp*depthRcp; -} -float getStartTolerance(in uint depth) -{ - return exp2(getTolerance_common(depth)); -} -float getEndTolerance(in uint depth) -{ - return 1.0-exp2(getTolerance_common(depth)+1.0); -} - - -vec2 SampleSphericalMap(vec3 v) -{ - vec2 uv = vec2(atan(v.z, v.x), asin(v.y)); - uv *= nbl_glsl_RECIPROCAL_PI*0.5; - uv += 0.5; - return uv; -} - -void missProgram(in ImmutableRay_t _immutable, inout Payload_t _payload) -{ - vec3 finalContribution = _payload.throughput; - // #define USE_ENVMAP -#ifdef USE_ENVMAP - vec2 uv = SampleSphericalMap(_immutable.direction); - finalContribution *= textureLod(envMap, uv, 0.0).rgb; -#else - const vec3 kConstantEnvLightRadiance = vec3(0.15, 0.21, 0.3); - finalContribution *= kConstantEnvLightRadiance; -#endif - _payload.accumulation += finalContribution; -} - -#include -#include -#include -#include -#include -#include -#include -nbl_glsl_LightSample nbl_glsl_bsdf_cos_generate(in nbl_glsl_AnisotropicViewSurfaceInteraction interaction, in vec3 u, in BSDFNode bsdf, in float monochromeEta, out nbl_glsl_AnisotropicMicrofacetCache _cache) -{ - const float a = BSDFNode_getRoughness(bsdf); - const mat2x3 ior = BSDFNode_getEta(bsdf); - - // fresnel stuff for dielectrics - float orientedEta, rcpOrientedEta; - const bool viewerInsideMedium = nbl_glsl_getOrientedEtas(orientedEta,rcpOrientedEta,interaction.isotropic.NdotV,monochromeEta); - - nbl_glsl_LightSample smpl; - nbl_glsl_AnisotropicMicrofacetCache dummy; - switch (BSDFNode_getType(bsdf)) - { - case DIFFUSE_OP: - smpl = nbl_glsl_oren_nayar_cos_generate(interaction,u.xy,a*a); - break; - case CONDUCTOR_OP: - smpl = nbl_glsl_ggx_cos_generate(interaction,u.xy,a,a,_cache); - break; - default: - smpl = 
nbl_glsl_ggx_dielectric_cos_generate(interaction,u,a,a,monochromeEta,_cache); - break; - } - return smpl; -} - -vec3 nbl_glsl_bsdf_cos_remainder_and_pdf(out float pdf, in nbl_glsl_LightSample _sample, in nbl_glsl_AnisotropicViewSurfaceInteraction interaction, in BSDFNode bsdf, in float monochromeEta, in nbl_glsl_AnisotropicMicrofacetCache _cache) -{ - // are V and L on opposite sides of the surface? - const bool transmitted = nbl_glsl_isTransmissionPath(interaction.isotropic.NdotV,_sample.NdotL); - - // is the BSDF or BRDF, if it is then we make the dot products `abs` before `max(,0.0)` - const bool transmissive = BSDFNode_isBSDF(bsdf); - const float clampedNdotL = nbl_glsl_conditionalAbsOrMax(transmissive,_sample.NdotL,0.0); - const float clampedNdotV = nbl_glsl_conditionalAbsOrMax(transmissive,interaction.isotropic.NdotV,0.0); - - vec3 remainder; - - const float minimumProjVectorLen = 0.00000001; - if (clampedNdotV>minimumProjVectorLen && clampedNdotL>minimumProjVectorLen) - { - // fresnel stuff for conductors (but reflectance also doubles as albedo) - const mat2x3 ior = BSDFNode_getEta(bsdf); - const vec3 reflectance = BSDFNode_getReflectance(bsdf,_cache.isotropic.VdotH); - - // fresnel stuff for dielectrics - float orientedEta, rcpOrientedEta; - const bool viewerInsideMedium = nbl_glsl_getOrientedEtas(orientedEta,rcpOrientedEta,interaction.isotropic.NdotV,monochromeEta); - - // - const float VdotL = dot(interaction.isotropic.V.dir,_sample.L); - - // - const float a = max(BSDFNode_getRoughness(bsdf),0.0001); // TODO: @Crisspl 0-roughness still doesn't work! Also Beckmann has a weird dark rim instead as fresnel!? 
- const float a2 = a*a; - - // TODO: refactor into Material Compiler-esque thing - switch (BSDFNode_getType(bsdf)) - { - case DIFFUSE_OP: - remainder = reflectance*nbl_glsl_oren_nayar_cos_remainder_and_pdf_wo_clamps(pdf,a*a,VdotL,clampedNdotL,clampedNdotV); - break; - case CONDUCTOR_OP: - remainder = nbl_glsl_ggx_cos_remainder_and_pdf_wo_clamps(pdf,nbl_glsl_ggx_trowbridge_reitz(a2,_cache.isotropic.NdotH2),clampedNdotL,_sample.NdotL2,clampedNdotV,interaction.isotropic.NdotV_squared,reflectance,a2); - break; - default: - remainder = vec3(nbl_glsl_ggx_dielectric_cos_remainder_and_pdf(pdf, _sample, interaction.isotropic, _cache.isotropic, monochromeEta, a*a)); - break; - } - } - else - remainder = vec3(0.0); - return remainder; -} - -layout (constant_id = 0) const int MAX_DEPTH_LOG2 = 4; -layout (constant_id = 1) const int MAX_SAMPLES_LOG2 = 10; - - -#include - -mat2x3 rand3d(in uint protoDimension, in uint _sample, inout nbl_glsl_xoroshiro64star_state_t scramble_state) -{ - mat2x3 retval; - uint address = bitfieldInsert(protoDimension,_sample,MAX_DEPTH_LOG2,MAX_SAMPLES_LOG2); - for (int i=0; i<2u; i++) - { - uvec3 seqVal = texelFetch(sampleSequence,int(address)+i).xyz; - seqVal ^= uvec3(nbl_glsl_xoroshiro64star(scramble_state),nbl_glsl_xoroshiro64star(scramble_state),nbl_glsl_xoroshiro64star(scramble_state)); - retval[i] = vec3(seqVal)*uintBitsToFloat(0x2f800004u); - } - return retval; -} - - -void traceRay_extraShape(inout int objectID, inout float intersectionT, in vec3 origin, in vec3 direction); -int traceRay(inout float intersectionT, in vec3 origin, in vec3 direction) -{ - int objectID = -1; - -#define USE_RAY_QUERY -#ifdef USE_RAY_QUERY - rayQueryEXT rayQuery; - rayQueryInitializeEXT(rayQuery, topLevelAS, gl_RayFlagsNoneEXT, 0xFF, origin, 0.0, direction, 1000.0); - - // Start traversal: return false if traversal is complete - while(rayQueryProceedEXT(rayQuery)) - { - if(rayQueryGetIntersectionTypeEXT(rayQuery, false) == gl_RayQueryCandidateIntersectionAABBEXT) 
- { - int id = rayQueryGetIntersectionPrimitiveIndexEXT(rayQuery, false); - float t = Sphere_intersect(spheres[id],origin,direction); - bool reportIntersection = (t != nbl_glsl_FLT_NAN && t > 0 && t < intersectionT); - if(reportIntersection) - { - intersectionT = t; - objectID = id; - rayQueryGenerateIntersectionEXT(rayQuery, t); - } - } - } -#else - for (int i=0; i0.0 && t0.0; - // but if we allowed non-watertight transmitters (single water surface), it would make sense just to apply this line by itself - nbl_glsl_AnisotropicMicrofacetCache _cache; - validPath = validPath && nbl_glsl_calcAnisotropicMicrofacetCache(_cache, interaction, nee_sample, monochromeEta); - if (validPath) - { - float bsdfPdf; - neeContrib *= nbl_glsl_bsdf_cos_remainder_and_pdf(bsdfPdf,nee_sample,interaction,bsdf,monochromeEta,_cache)*throughput; - const float oc = bsdfPdf*rcpChoiceProb; - neeContrib /= 1.0/oc+oc/(lightPdf*lightPdf); // MIS weight - if (bsdfPdflumaContributionThreshold && traceRay(t,intersection+nee_sample.L*t*getStartTolerance(depth),nee_sample.L)==-1) - ray._payload.accumulation += neeContrib; - } - } - - // sample BSDF - float bsdfPdf; vec3 bsdfSampleL; - { - nbl_glsl_AnisotropicMicrofacetCache _cache; - nbl_glsl_LightSample bsdf_sample = nbl_glsl_bsdf_cos_generate(interaction,epsilon[1],bsdf,monochromeEta,_cache); - // the value of the bsdf divided by the probability of the sample being generated - throughput *= nbl_glsl_bsdf_cos_remainder_and_pdf(bsdfPdf,bsdf_sample,interaction,bsdf,monochromeEta,_cache); - // - bsdfSampleL = bsdf_sample.L; - } - - // additional threshold - const float lumaThroughputThreshold = lumaContributionThreshold; - if (bsdfPdf>bsdfPdfThreshold && getLuma(throughput)>lumaThroughputThreshold) - { - ray._payload.throughput = throughput; - ray._payload.otherTechniqueHeuristic = neeProbability/bsdfPdf; // numerically stable, don't touch - ray._payload.otherTechniqueHeuristic *= ray._payload.otherTechniqueHeuristic; - - // trace new ray - 
ray._immutable.origin = intersection+bsdfSampleL*(1.0/*kSceneSize*/)*getStartTolerance(depth); - ray._immutable.direction = bsdfSampleL; - #if POLYGON_METHOD==2 - ray._immutable.normalAtOrigin = interaction.isotropic.N; - ray._immutable.wasBSDFAtOrigin = isBSDF; - #endif - return true; - } - } - return false; -} - -void main() -{ - const ivec2 coords = getCoordinates(); - const vec2 texCoord = getTexCoords(); - - if (false == (all(lessThanEqual(ivec2(0),coords)) && all(greaterThan(imageSize(outImage),coords)))) { - return; - } - - if (((MAX_DEPTH-1)>>MAX_DEPTH_LOG2)>0 || ((SAMPLES-1)>>MAX_SAMPLES_LOG2)>0) - { - vec4 pixelCol = vec4(1.0,0.0,0.0,1.0); - imageStore(outImage, coords, pixelCol); - return; - } - - nbl_glsl_xoroshiro64star_state_t scramble_start_state = texelFetch(scramblebuf,coords,0).rg; - const vec2 pixOffsetParam = vec2(1.0)/vec2(textureSize(scramblebuf,0)); - - - const mat4 invMVP = inverse(cameraData.params.MVP); - - vec4 NDC = vec4(texCoord*vec2(2.0,-2.0)+vec2(-1.0,1.0),0.0,1.0); - vec3 camPos; - { - vec4 tmp = invMVP*NDC; - camPos = tmp.xyz/tmp.w; - NDC.z = 1.0; - } - - vec3 color = vec3(0.0); - float meanLumaSquared = 0.0; - // TODO: if we collapse the nested for loop, then all GPUs will get `MAX_DEPTH` factor speedup, not just NV with separate PC - for (int i=0; i5.0) - color = vec3(1.0,0.0,0.0); - #endif - - vec4 pixelCol = vec4(color, 1.0); - imageStore(outImage, coords, pixelCol); -} -/** TODO: Improving Rendering - -Now: -- Always MIS (path correlated reuse) -- Test MIS alpha (roughness) scheme - -Many Lights: -- Path Guiding -- Light Importance Lists/Classification -- Spatio-Temporal Reservoir Sampling - -Indirect Light: -- Bidirectional Path Tracing -- Uniform Path Sampling / Vertex Connection and Merging / Path Space Regularization - -Animations: -- A-SVGF / BMFR -**/ \ No newline at end of file diff --git a/56_RayQuery/config.json.template b/56_RayQuery/config.json.template deleted file mode 100644 index f961745c1..000000000 --- 
a/56_RayQuery/config.json.template +++ /dev/null @@ -1,28 +0,0 @@ -{ - "enableParallelBuild": true, - "threadsPerBuildProcess" : 2, - "isExecuted": false, - "scriptPath": "", - "cmake": { - "configurations": [ "Release", "Debug", "RelWithDebInfo" ], - "buildModes": [], - "requiredOptions": [] - }, - "profiles": [ - { - "backend": "vulkan", - "platform": "windows", - "buildModes": [], - "runConfiguration": "Release", - "gpuArchitectures": [] - } - ], - "dependencies": [], - "data": [ - { - "dependencies": [], - "command": [""], - "outputs": [] - } - ] -} \ No newline at end of file diff --git a/56_RayQuery/litByRectangle.comp b/56_RayQuery/litByRectangle.comp deleted file mode 100644 index 829d03398..000000000 --- a/56_RayQuery/litByRectangle.comp +++ /dev/null @@ -1,106 +0,0 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". -// For conditions of distribution and use, see copyright notice in nabla.h - -#version 460 core -#extension GL_GOOGLE_include_directive : require -#extension GL_EXT_ray_query : enable - -#define SPHERE_COUNT 8 -#define POLYGON_METHOD 0 // 0 area sampling, 1 solid angle sampling, 2 approximate projected solid angle sampling -#include "common.glsl" - - -#define RECTANGLE_COUNT 1 -const vec3 edge0 = normalize(vec3(2,0,-1)); -const vec3 edge1 = normalize(vec3(2,-5,4)); -Rectangle rectangles[RECTANGLE_COUNT] = { - Rectangle_Rectangle(vec3(-3.8,0.35,1.3),edge0*7.0,edge1*0.1,INVALID_ID_16BIT,0u) -}; - - -void traceRay_extraShape(inout int objectID, inout float intersectionT, in vec3 origin, in vec3 direction) -{ - for (int i=0; i0.0 && t -float nbl_glsl_light_deferred_pdf(in Light light, in Ray_t ray) -{ - const Rectangle rect = rectangles[Light_getObjectID(light)]; - - const vec3 L = ray._immutable.direction; -#if POLYGON_METHOD==0 - const float dist = ray._mutable.intersectionT; - return dist*dist/abs(dot(Rectangle_getNormalTimesArea(rect),L)); -#else - const ImmutableRay_t _immutable 
= ray._immutable; - const mat3 sphericalVertices = nbl_glsl_shapes_getSphericalTriangle(mat3(tri.vertex0,tri.vertex1,tri.vertex2),_immutable.origin); - #if POLYGON_METHOD==1 - const float rcpProb = nbl_glsl_shapes_SolidAngleOfTriangle(sphericalVertices); - // if `rcpProb` is NAN then the triangle's solid angle was close to 0.0 - return rcpProb>FLT_MIN ? (1.0/rcpProb):nbl_glsl_FLT_MAX; - #elif POLYGON_METHOD==2 - const float pdf = nbl_glsl_sampling_probProjectedSphericalTriangleSample(sphericalVertices,_immutable.normalAtOrigin,_immutable.wasBSDFAtOrigin,L); - // if `pdf` is NAN then the triangle's projected solid angle was close to 0.0, if its close to INF then the triangle was very small - return pdfFLT_MIN ? (1.0/rcpPdf):0.0; - - const vec3 N = Triangle_getNormalTimesArea(tri); - newRayMaxT = dot(N,tri.vertex0-origin)/dot(N,L); - return L; -#endif -} - - -uint getBSDFLightIDAndDetermineNormal(out vec3 normal, in uint objectID, in vec3 intersection) -{ - if (objectID0.0) - { - const float rcpDistance = inversesqrt(distanceSQ); - Z *= rcpDistance; - - const float cosThetaMax = sqrt(cosThetaMax2); - const float cosTheta = mix(1.0,cosThetaMax,xi.x); - - vec3 L = Z*cosTheta; - - const float cosTheta2 = cosTheta*cosTheta; - const float sinTheta = sqrt(1.0-cosTheta2); - float sinPhi,cosPhi; - nbl_glsl_sincos(2.0*nbl_glsl_PI*xi.y-nbl_glsl_PI,sinPhi,cosPhi); - mat2x3 XY = nbl_glsl_frisvad(Z); - - L += (XY[0]*cosPhi+XY[1]*sinPhi)*sinTheta; - - newRayMaxT = (cosTheta-sqrt(cosTheta2-cosThetaMax2))/rcpDistance; - pdf = 1.0/Sphere_getSolidAngle_impl(cosThetaMax); - return L; - } - pdf = 0.0; - return vec3(0.0,0.0,0.0); -} - -uint getBSDFLightIDAndDetermineNormal(out vec3 normal, in uint objectID, in vec3 intersection) -{ - Sphere sphere = spheres[objectID]; - normal = Sphere_getNormal(sphere,intersection); - return sphere.bsdfLightIDs; -} \ No newline at end of file diff --git a/56_RayQuery/litByTriangle.comp b/56_RayQuery/litByTriangle.comp deleted file mode 100644 index 
1cd1d3ee3..000000000 --- a/56_RayQuery/litByTriangle.comp +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". -// For conditions of distribution and use, see copyright notice in nabla.h - -#version 460 core -#extension GL_GOOGLE_include_directive : require -#extension GL_EXT_ray_query : enable - -#define SPHERE_COUNT 8 -#define POLYGON_METHOD 0 // 0 area sampling, 1 solid angle sampling, 2 approximate projected solid angle sampling -#include "common.glsl" - -#define TRIANGLE_COUNT 1 -Triangle triangles[TRIANGLE_COUNT] = { - Triangle_Triangle(mat3(vec3(-1.8,0.35,0.3),vec3(-1.2,0.35,0.0),vec3(-1.5,0.8,-0.3)),INVALID_ID_16BIT,0u) -}; - -void traceRay_extraShape(inout int objectID, inout float intersectionT, in vec3 origin, in vec3 direction) -{ - for (int i=0; i0.0 && t -float nbl_glsl_light_deferred_pdf(in Light light, in Ray_t ray) -{ - const Triangle tri = triangles[Light_getObjectID(light)]; - - const vec3 L = ray._immutable.direction; -#if POLYGON_METHOD==0 - const float dist = ray._mutable.intersectionT; - return dist*dist/abs(dot(Triangle_getNormalTimesArea(tri),L)); -#else - const ImmutableRay_t _immutable = ray._immutable; - const mat3 sphericalVertices = nbl_glsl_shapes_getSphericalTriangle(mat3(tri.vertex0,tri.vertex1,tri.vertex2),_immutable.origin); - #if POLYGON_METHOD==1 - const float rcpProb = nbl_glsl_shapes_SolidAngleOfTriangle(sphericalVertices); - // if `rcpProb` is NAN then the triangle's solid angle was close to 0.0 - return rcpProb>FLT_MIN ? (1.0/rcpProb):nbl_glsl_FLT_MAX; - #elif POLYGON_METHOD==2 - const float pdf = nbl_glsl_sampling_probProjectedSphericalTriangleSample(sphericalVertices,_immutable.normalAtOrigin,_immutable.wasBSDFAtOrigin,L); - // if `pdf` is NAN then the triangle's projected solid angle was close to 0.0, if its close to INF then the triangle was very small - return pdfFLT_MIN ? 
(1.0/rcpPdf):0.0; - - const vec3 N = Triangle_getNormalTimesArea(tri); - newRayMaxT = dot(N,tri.vertex0-origin)/dot(N,L); - return L; -#endif -} - - -uint getBSDFLightIDAndDetermineNormal(out vec3 normal, in uint objectID, in vec3 intersection) -{ - if (objectID - -#include "../common/CommonAPI.h" -#include "CCamera.hpp" -#include "nbl/ext/ScreenShot/ScreenShot.h" -#include "nbl/video/utilities/CDumbPresentationOracle.h" - -using namespace nbl; -using namespace core; -using namespace ui; - - -using namespace nbl; -using namespace core; -using namespace asset; -using namespace video; - -smart_refctd_ptr createHDRImageView(nbl::core::smart_refctd_ptr device, asset::E_FORMAT colorFormat, uint32_t width, uint32_t height) -{ - smart_refctd_ptr gpuImageViewColorBuffer; - { - IGPUImage::SCreationParams imgInfo; - imgInfo.format = colorFormat; - imgInfo.type = IGPUImage::ET_2D; - imgInfo.extent.width = width; - imgInfo.extent.height = height; - imgInfo.extent.depth = 1u; - imgInfo.mipLevels = 1u; - imgInfo.arrayLayers = 1u; - imgInfo.samples = asset::ICPUImage::ESCF_1_BIT; - imgInfo.flags = static_cast(0u); - imgInfo.usage = core::bitflag(asset::IImage::EUF_STORAGE_BIT) | asset::IImage::EUF_TRANSFER_SRC_BIT; - - // (Erfan -> Cyprian) - // auto image = device->createGPUImageOnDedMem(std::move(imgInfo),device->getDeviceLocalGPUMemoryReqs()); - auto image = device->createImage(std::move(imgInfo)); - auto imageMemoryReqs = image->getMemoryReqs(); - imageMemoryReqs.memoryTypeBits &= device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); // getDeviceLocalMemoryTypeBits because of previous code getDeviceLocalGPUMemoryReqs - auto imageMem = device->allocate(imageMemoryReqs, image.get()); - - IGPUImageView::SCreationParams imgViewInfo; - imgViewInfo.image = std::move(image); - imgViewInfo.format = colorFormat; - imgViewInfo.viewType = IGPUImageView::ET_2D; - imgViewInfo.flags = static_cast(0u); - imgViewInfo.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; 
- imgViewInfo.subresourceRange.baseArrayLayer = 0u; - imgViewInfo.subresourceRange.baseMipLevel = 0u; - imgViewInfo.subresourceRange.layerCount = 1u; - imgViewInfo.subresourceRange.levelCount = 1u; - - gpuImageViewColorBuffer = device->createImageView(std::move(imgViewInfo)); - } - - return gpuImageViewColorBuffer; -} - -struct ShaderParameters -{ - const uint32_t MaxDepthLog2 = 4; //5 - const uint32_t MaxSamplesLog2 = 10; //18 -} kShaderParameters; - -enum E_LIGHT_GEOMETRY -{ - ELG_SPHERE, - ELG_TRIANGLE, - ELG_RECTANGLE -}; - -struct DispatchInfo_t -{ - uint32_t workGroupCount[3]; -}; - -_NBL_STATIC_INLINE_CONSTEXPR uint32_t DEFAULT_WORK_GROUP_SIZE = 16u; - -DispatchInfo_t getDispatchInfo(uint32_t imgWidth, uint32_t imgHeight) { - DispatchInfo_t ret = {}; - ret.workGroupCount[0] = (uint32_t)core::ceil((float)imgWidth / (float)DEFAULT_WORK_GROUP_SIZE); - ret.workGroupCount[1] = (uint32_t)core::ceil((float)imgHeight / (float)DEFAULT_WORK_GROUP_SIZE); - ret.workGroupCount[2] = 1; - return ret; -} - -class RayQuerySampleApp : public ApplicationBase -{ - constexpr static uint32_t WIN_W = 1280u; - constexpr static uint32_t WIN_H = 720u; - constexpr static uint32_t FRAMES_IN_FLIGHT = 5u; - static constexpr uint64_t MAX_TIMEOUT = 99999999999999ull; - - core::smart_refctd_ptr windowManager; - core::smart_refctd_ptr window; - core::smart_refctd_ptr windowCb; - core::smart_refctd_ptr apiConnection; - core::smart_refctd_ptr surface; - core::smart_refctd_ptr utilities; - core::smart_refctd_ptr logicalDevice; - video::IPhysicalDevice* physicalDevice; - std::array queues; - core::smart_refctd_ptr swapchain; - core::smart_refctd_ptr renderpass; - core::smart_refctd_dynamic_array> fbos; - std::array, CommonAPI::InitOutput::MaxFramesInFlight>, CommonAPI::InitOutput::MaxQueuesCount> commandPools; - core::smart_refctd_ptr system; - core::smart_refctd_ptr assetManager; - video::IGPUObjectFromAssetConverter::SParams cpu2gpuParams; - core::smart_refctd_ptr logger; - 
core::smart_refctd_ptr inputSystem; - video::IGPUObjectFromAssetConverter cpu2gpu; - - int32_t m_resourceIx = -1; - uint32_t m_acquiredNextFBO = {}; - - CDumbPresentationOracle oracle; - - // polling for events! - CommonAPI::InputSystem::ChannelReader mouse; - CommonAPI::InputSystem::ChannelReader keyboard; - - core::smart_refctd_ptr frameUploadDataCompleteFence[FRAMES_IN_FLIGHT] = { nullptr }; - core::smart_refctd_ptr frameComplete[FRAMES_IN_FLIGHT] = { nullptr }; - core::smart_refctd_ptr imageAcquire[FRAMES_IN_FLIGHT] = { nullptr }; - core::smart_refctd_ptr renderFinished[FRAMES_IN_FLIGHT] = { nullptr }; - core::smart_refctd_ptr frameUploadDataCompleteSemaphore[FRAMES_IN_FLIGHT] = { nullptr }; - - core::smart_refctd_ptr cmdbuf[FRAMES_IN_FLIGHT]; // from graphics - - Camera cam; - - core::smart_refctd_ptr gpuubo = nullptr; - core::smart_refctd_ptr gpuEnvmapImageView = nullptr; - core::smart_refctd_ptr gpuScrambleImageView; - - core::smart_refctd_ptr gpuComputePipeline = nullptr; - DispatchInfo_t dispatchInfo = {}; - - core::smart_refctd_ptr outHDRImageViews[CommonAPI::InitOutput::MaxSwapChainImageCount] = {}; - - core::smart_refctd_ptr descriptorSets0[CommonAPI::InitOutput::MaxSwapChainImageCount] = {}; - core::smart_refctd_ptr descriptorSet2 = nullptr; - core::smart_refctd_ptr uboDescriptorSet1 = nullptr; - - core::smart_refctd_ptr aabbsBuffer = nullptr; - core::smart_refctd_ptr gpuBlas = nullptr; - core::smart_refctd_ptr gpuBlas2 = nullptr; // Built via CPUObject To GPUObject operations and utility - core::smart_refctd_ptr gpuTlas = nullptr; - core::smart_refctd_ptr instancesBuffer = nullptr; - - core::smart_refctd_ptr gpuSequenceBufferView = nullptr; - - core::smart_refctd_ptr sampler0 = nullptr; - core::smart_refctd_ptr sampler1 = nullptr; - - core::smart_refctd_ptr gpuSequenceBuffer = nullptr; - - core::smart_refctd_ptr spheresBuffer = nullptr; - - struct SBasicViewParametersAligned - { - SBasicViewParameters uboData; - }; - -public: - void 
setWindow(core::smart_refctd_ptr&& wnd) override - { - window = std::move(wnd); - } - nbl::ui::IWindow* getWindow() override - { - return window.get(); - } - void setSystem(core::smart_refctd_ptr&& system) override - { - system = std::move(system); - } - - APP_CONSTRUCTOR(RayQuerySampleApp); - - void onAppInitialized_impl() override - { - const auto swapchainImageUsage = static_cast(asset::IImage::EUF_COLOR_ATTACHMENT_BIT | asset::IImage::EUF_TRANSFER_DST_BIT | asset::IImage::EUF_TRANSFER_SRC_BIT); - - CommonAPI::InitParams initParams; - initParams.window = core::smart_refctd_ptr(window); - initParams.apiType = video::EAT_VULKAN; - initParams.appName = { _NBL_APP_NAME_ }; - initParams.framesInFlight = FRAMES_IN_FLIGHT; - initParams.windowWidth = WIN_W; - initParams.windowHeight = WIN_H; - initParams.swapchainImageCount = 2u; - initParams.swapchainImageUsage = swapchainImageUsage; - initParams.depthFormat = asset::EF_D32_SFLOAT; - auto initOutput = CommonAPI::InitWithRaytracingExt(std::move(initParams)); - - system = std::move(initOutput.system); - window = std::move(initParams.window); - windowCb = std::move(initParams.windowCb); - apiConnection = std::move(initOutput.apiConnection); - surface = std::move(initOutput.surface); - physicalDevice = std::move(initOutput.physicalDevice); - logicalDevice = std::move(initOutput.logicalDevice); - utilities = std::move(initOutput.utilities); - queues = std::move(initOutput.queues); - renderpass = std::move(initOutput.renderToSwapchainRenderpass); - commandPools = std::move(initOutput.commandPools); - assetManager = std::move(initOutput.assetManager); - cpu2gpuParams = std::move(initOutput.cpu2gpuParams); - logger = std::move(initOutput.logger); - inputSystem = std::move(initOutput.inputSystem); - - CommonAPI::createSwapchain(std::move(logicalDevice), initOutput.swapchainCreationParams, WIN_W, WIN_H, swapchain); - assert(swapchain); - fbos = CommonAPI::createFBOWithSwapchainImages( - swapchain->getImageCount(), WIN_W, WIN_H, 
- logicalDevice, swapchain, renderpass, - asset::EF_D32_SFLOAT - ); - auto graphicsQueue = queues[CommonAPI::InitOutput::EQT_GRAPHICS]; - auto computeQueue = queues[CommonAPI::InitOutput::EQT_GRAPHICS]; - auto graphicsCommandPools = commandPools[CommonAPI::InitOutput::EQT_GRAPHICS]; - auto computeCommandPools = commandPools[CommonAPI::InitOutput::EQT_COMPUTE]; - - video::IGPUObjectFromAssetConverter cpu2gpu; - for (uint32_t i = 0u; i < FRAMES_IN_FLIGHT; i++) - logicalDevice->createCommandBuffers(graphicsCommandPools[i].get(), video::IGPUCommandBuffer::EL_PRIMARY, 1, cmdbuf+i); - - core::smart_refctd_ptr descriptorPool = nullptr; - { - video::IDescriptorPool::SCreateInfo createInfo = {}; - createInfo.maxSets = CommonAPI::InitOutput::MaxSwapChainImageCount+2; - createInfo.maxDescriptorCount[static_cast(asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER)] = 1; - createInfo.maxDescriptorCount[static_cast(asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE)] = CommonAPI::InitOutput::MaxSwapChainImageCount; - createInfo.maxDescriptorCount[static_cast(asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER)] = 2; - createInfo.maxDescriptorCount[static_cast(asset::IDescriptor::E_TYPE::ET_UNIFORM_TEXEL_BUFFER)] = 1; - createInfo.maxDescriptorCount[static_cast(asset::IDescriptor::E_TYPE::ET_UNIFORM_BUFFER)] = 1; - createInfo.maxDescriptorCount[static_cast(asset::IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE)] = 1; - - descriptorPool = logicalDevice->createDescriptorPool(std::move(createInfo)); - } - - // Initialize Spheres - constexpr uint32_t SphereCount = 9u; - constexpr uint32_t INVALID_ID_16BIT = 0xffffu; - - struct alignas(16) Sphere - { - Sphere() - : position(0.0f, 0.0f, 0.0f) - , radius2(0.0f) - { - bsdfLightIDs = core::bitfieldInsert(0u,INVALID_ID_16BIT,16,16); - } - - Sphere(core::vector3df _position, float _radius, uint32_t _bsdfID, uint32_t _lightID) - { - position = _position; - radius2 = _radius*_radius; - bsdfLightIDs = core::bitfieldInsert(_bsdfID,_lightID,16,16); - } - - 
IGPUAccelerationStructure::AABB_Position getAABB() const - { - float radius = core::sqrt(radius2); - return IGPUAccelerationStructure::AABB_Position(position-core::vector3df(radius, radius, radius), position+core::vector3df(radius, radius, radius)); - } - - core::vector3df position; - float radius2; - uint32_t bsdfLightIDs; - }; - - Sphere spheres[SphereCount] = {}; - spheres[0] = Sphere(core::vector3df(0.0,-100.5,-1.0), 100.0, 0u, INVALID_ID_16BIT); - spheres[1] = Sphere(core::vector3df(3.0,0.0,-1.0), 0.5, 1u, INVALID_ID_16BIT); - spheres[2] = Sphere(core::vector3df(0.0,0.0,-1.0), 0.5, 2u, INVALID_ID_16BIT); - spheres[3] = Sphere(core::vector3df(-3.0,0.0,-1.0), 0.5, 3u, INVALID_ID_16BIT); - spheres[4] = Sphere(core::vector3df(3.0,0.0,1.0), 0.5, 4u, INVALID_ID_16BIT); - spheres[5] = Sphere(core::vector3df(0.0,0.0,1.0), 0.5, 4u, INVALID_ID_16BIT); - spheres[6] = Sphere(core::vector3df(-3.0,0.0,1.0), 0.5, 5u, INVALID_ID_16BIT); - spheres[7] = Sphere(core::vector3df(0.5,1.0,0.5), 0.5, 6u, INVALID_ID_16BIT); - spheres[8] = Sphere(core::vector3df(-1.5,1.5,0.0), 0.3, INVALID_ID_16BIT, 0u); - - // Create Spheres Buffer - uint32_t spheresBufferSize = sizeof(Sphere) * SphereCount; - - { - IGPUBuffer::SCreationParams params = {}; - params.size = spheresBufferSize; // (Erfan->Cyprian) See How I moved "createDeviceLocalGPUBufferOnDedMem" second parameter to params.size? 
IGPUBuffer::SCreationParams::size is very important to be filled unlike before - params.usage = core::bitflag(asset::IBuffer::EUF_STORAGE_BUFFER_BIT) | asset::IBuffer::EUF_TRANSFER_DST_BIT; - spheresBuffer = logicalDevice->createBuffer(std::move(params)); - auto bufferReqs = spheresBuffer->getMemoryReqs(); - bufferReqs.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); // (Erfan->Cyprian) I used `getDeviceLocalMemoryTypeBits` because of previous createDeviceLocalGPUBufferOnDedMem (Focus on DeviceLocal Part) - auto spheresBufferMem = logicalDevice->allocate(bufferReqs, spheresBuffer.get()); - utilities->updateBufferRangeViaStagingBufferAutoSubmit(asset::SBufferRange{0u,spheresBufferSize,spheresBuffer}, spheres, graphicsQueue); - } - -#define TEST_CPU_2_GPU_BLAS -#ifdef TEST_CPU_2_GPU_BLAS - // Acceleration Structure Test - // Create + Build BLAS (CPU2GPU Version) - { - struct AABB { - IGPUAccelerationStructure::AABB_Position aabb; - }; - const uint32_t aabbsCount = SphereCount / 2u; - uint32_t aabbsBufferSize = sizeof(AABB) * aabbsCount; - - AABB aabbs[aabbsCount] = {}; - for(uint32_t i = 0; i < aabbsCount; ++i) - { - aabbs[i].aabb = spheres[i].getAABB(); - } - - // auto raytracingFlags = core::bitflag(asset::IBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT) | asset::IBuffer::EUF_STORAGE_BUFFER_BIT; - // | asset::IBuffer::EUF_TRANSFER_DST_BIT | asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT - core::smart_refctd_ptr aabbsBuffer = ICPUBuffer::create({ aabbsBufferSize }); - memcpy(aabbsBuffer->getPointer(), aabbs, aabbsBufferSize); - - ICPUAccelerationStructure::SCreationParams asCreateParams; - asCreateParams.type = ICPUAccelerationStructure::ET_BOTTOM_LEVEL; - asCreateParams.flags = ICPUAccelerationStructure::ECF_NONE; - core::smart_refctd_ptr cpuBlas = ICPUAccelerationStructure::create(std::move(asCreateParams)); - - using HostGeom = ICPUAccelerationStructure::HostBuildGeometryInfo::Geom; - 
core::smart_refctd_dynamic_array geometries = core::make_refctd_dynamic_array>(1u); - - HostGeom & simpleGeom = geometries->operator[](0u); - simpleGeom.type = IAccelerationStructure::EGT_AABBS; - simpleGeom.flags = IAccelerationStructure::EGF_OPAQUE_BIT; - simpleGeom.data.aabbs.data.offset = 0u; - simpleGeom.data.aabbs.data.buffer = aabbsBuffer; - simpleGeom.data.aabbs.stride = sizeof(AABB); - - ICPUAccelerationStructure::HostBuildGeometryInfo buildInfo; - buildInfo.type = asCreateParams.type; - buildInfo.buildFlags = ICPUAccelerationStructure::EBF_PREFER_FAST_TRACE_BIT; - buildInfo.buildMode = ICPUAccelerationStructure::EBM_BUILD; - buildInfo.geometries = geometries; - - core::smart_refctd_dynamic_array buildRangeInfos = core::make_refctd_dynamic_array>(1u); - ICPUAccelerationStructure::BuildRangeInfo & firstBuildRangeInfo = buildRangeInfos->operator[](0u); - firstBuildRangeInfo.primitiveCount = aabbsCount; - firstBuildRangeInfo.primitiveOffset = 0u; - firstBuildRangeInfo.firstVertex = 0u; - firstBuildRangeInfo.transformOffset = 0u; - - cpuBlas->setBuildInfoAndRanges(std::move(buildInfo), buildRangeInfos); - - // Build BLAS - { - cpu2gpuParams.beginCommandBuffers(); - gpuBlas2 = cpu2gpu.getGPUObjectsFromAssets(&cpuBlas, &cpuBlas + 1u, cpu2gpuParams)->front(); - cpu2gpuParams.waitForCreationToComplete(); - } - } -#endif - - // Create + Build BLAS - { - // Build BLAS with AABBS - const uint32_t aabbsCount = SphereCount; - - struct AABB { - IGPUAccelerationStructure::AABB_Position aabb; - }; - - AABB aabbs[aabbsCount] = {}; - for(uint32_t i = 0; i < aabbsCount; ++i) - { - aabbs[i].aabb = spheres[i].getAABB(); - } - auto raytracingFlags = core::bitflag(asset::IBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT) | asset::IBuffer::EUF_STORAGE_BUFFER_BIT; - uint32_t aabbsBufferSize = sizeof(AABB) * aabbsCount; - - { - IGPUBuffer::SCreationParams params = {}; - params.size = aabbsBufferSize; - params.usage = raytracingFlags | 
asset::IBuffer::EUF_TRANSFER_DST_BIT | asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - aabbsBuffer = logicalDevice->createBuffer(std::move(params)); - auto bufferReqs = aabbsBuffer->getMemoryReqs(); - bufferReqs.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); - auto aabbBufferMem = logicalDevice->allocate(bufferReqs, aabbsBuffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); - // (Erfan->Cyprian) -> I passed `IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT` as a third parameter to the allocate function because the buffer needs the usage `EUF_SHADER_DEVICE_ADDRESS_BIT` - // You don't have to worry about it, it's only used in this example - utilities->updateBufferRangeViaStagingBufferAutoSubmit(asset::SBufferRange{0u,aabbsBufferSize,aabbsBuffer}, aabbs, graphicsQueue); - } - - using DeviceGeom = IGPUAccelerationStructure::DeviceBuildGeometryInfo::Geometry; - - DeviceGeom simpleGeom = {}; - simpleGeom.type = IAccelerationStructure::EGT_AABBS; - simpleGeom.flags = IAccelerationStructure::EGF_OPAQUE_BIT; - simpleGeom.data.aabbs.data.offset = 0u; - simpleGeom.data.aabbs.data.buffer = aabbsBuffer; - simpleGeom.data.aabbs.stride = sizeof(AABB); - - IGPUAccelerationStructure::DeviceBuildGeometryInfo blasBuildInfo = {}; - blasBuildInfo.type = IGPUAccelerationStructure::ET_BOTTOM_LEVEL; - blasBuildInfo.buildFlags = IGPUAccelerationStructure::EBF_PREFER_FAST_TRACE_BIT; - blasBuildInfo.buildMode = IGPUAccelerationStructure::EBM_BUILD; - blasBuildInfo.srcAS = nullptr; - blasBuildInfo.dstAS = nullptr; - blasBuildInfo.geometries = core::SRange(&simpleGeom, &simpleGeom + 1u); - blasBuildInfo.scratchAddr = {}; - - // Get BuildSizes - IGPUAccelerationStructure::BuildSizes buildSizes = {}; - { - std::vector maxPrimCount(1u); - maxPrimCount[0] = aabbsCount; - buildSizes = logicalDevice->getAccelerationStructureBuildSizes(blasBuildInfo, maxPrimCount.data()); - } - - { - core::smart_refctd_ptr asBuffer; - IGPUBuffer::SCreationParams 
params = {}; - params.size = buildSizes.accelerationStructureSize; - params.usage = core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT; - asBuffer = logicalDevice->createBuffer(std::move(params)); - auto bufferReqs = asBuffer->getMemoryReqs(); - bufferReqs.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); - auto asBufferMem = logicalDevice->allocate(bufferReqs, asBuffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); - - IGPUAccelerationStructure::SCreationParams blasParams = {}; - blasParams.type = IGPUAccelerationStructure::ET_BOTTOM_LEVEL; - blasParams.flags = IGPUAccelerationStructure::ECF_NONE; - blasParams.bufferRange.buffer = asBuffer; - blasParams.bufferRange.offset = 0u; - blasParams.bufferRange.size = buildSizes.accelerationStructureSize; - gpuBlas = logicalDevice->createAccelerationStructure(std::move(blasParams)); - } - - // Allocate ScratchBuffer - core::smart_refctd_ptr scratchBuffer; - { - IGPUBuffer::SCreationParams params = {}; - params.size = buildSizes.buildScratchSize; - params.usage = core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_STORAGE_BUFFER_BIT; - scratchBuffer = logicalDevice->createBuffer(std::move(params)); - auto bufferReqs = scratchBuffer->getMemoryReqs(); - bufferReqs.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); - auto scratchBufferMem = logicalDevice->allocate(bufferReqs, scratchBuffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); - } - - // Complete BLAS Build Info - { - blasBuildInfo.dstAS = gpuBlas.get(); - blasBuildInfo.scratchAddr.buffer = scratchBuffer; - blasBuildInfo.scratchAddr.offset = 0u; - } - - IGPUAccelerationStructure::BuildRangeInfo firstBuildRangeInfos[1u]; - firstBuildRangeInfos[0].primitiveCount = aabbsCount; - firstBuildRangeInfos[0].primitiveOffset = 0u; - firstBuildRangeInfos[0].firstVertex = 0u; - 
firstBuildRangeInfos[0].transformOffset = 0u; - IGPUAccelerationStructure::BuildRangeInfo* pRangeInfos[1u]; - pRangeInfos[0] = firstBuildRangeInfos; - // pRangeInfos[1] = &secondBuildRangeInfos; - - // Build BLAS - { - utilities->buildAccelerationStructures(computeQueue, core::SRange(&blasBuildInfo, &blasBuildInfo + 1u), pRangeInfos); - } - } - - // Create + Build TLAS - { - struct Instance { - IGPUAccelerationStructure::Instance instance; - }; - - const uint32_t instancesCount = 1u; - Instance instances[instancesCount] = {}; - core::matrix3x4SIMD identity; - instances[0].instance.mat = identity; - instances[0].instance.instanceCustomIndex = 0u; - instances[0].instance.mask = 0xFF; - instances[0].instance.instanceShaderBindingTableRecordOffset = 0u; - instances[0].instance.flags = IAccelerationStructure::EIF_TRIANGLE_FACING_CULL_DISABLE_BIT; -#ifdef TEST_CPU_2_GPU_BLAS - instances[0].instance.accelerationStructureReference = gpuBlas2->getReferenceForDeviceOperations(); -#else - instances[0].instance.accelerationStructureReference = gpuBlas->getReferenceForDeviceOperations(); -#endif - auto raytracingFlags = core::bitflag(asset::IBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT) | asset::IBuffer::EUF_STORAGE_BUFFER_BIT; - - uint32_t instancesBufferSize = sizeof(Instance); - { - IGPUBuffer::SCreationParams params = {}; - params.size = instancesBufferSize; - params.usage = raytracingFlags | asset::IBuffer::EUF_TRANSFER_DST_BIT | asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - instancesBuffer = logicalDevice->createBuffer(std::move(params)); - auto bufferReqs = instancesBuffer->getMemoryReqs(); - bufferReqs.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); - auto instancesBufferMem = logicalDevice->allocate(bufferReqs, instancesBuffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); - utilities->updateBufferRangeViaStagingBufferAutoSubmit(asset::SBufferRange{0u,instancesBufferSize,instancesBuffer}, instances, 
graphicsQueue); - } - - using DeviceGeom = IGPUAccelerationStructure::DeviceBuildGeometryInfo::Geometry; - - DeviceGeom blasInstancesGeom = {}; - blasInstancesGeom.type = IAccelerationStructure::EGT_INSTANCES; - blasInstancesGeom.flags = IAccelerationStructure::EGF_NONE; - blasInstancesGeom.data.instances.data.offset = 0u; - blasInstancesGeom.data.instances.data.buffer = instancesBuffer; - - IGPUAccelerationStructure::DeviceBuildGeometryInfo tlasBuildInfo = {}; - tlasBuildInfo.type = IGPUAccelerationStructure::ET_TOP_LEVEL; - tlasBuildInfo.buildFlags = IGPUAccelerationStructure::EBF_PREFER_FAST_TRACE_BIT; - tlasBuildInfo.buildMode = IGPUAccelerationStructure::EBM_BUILD; - tlasBuildInfo.srcAS = nullptr; - tlasBuildInfo.dstAS = nullptr; - tlasBuildInfo.geometries = core::SRange(&blasInstancesGeom, &blasInstancesGeom + 1u); - tlasBuildInfo.scratchAddr = {}; - - // Get BuildSizes - IGPUAccelerationStructure::BuildSizes buildSizes = {}; - { - std::vector maxPrimCount(1u); - maxPrimCount[0] = instancesCount; - buildSizes = logicalDevice->getAccelerationStructureBuildSizes(tlasBuildInfo, maxPrimCount.data()); - } - - { - core::smart_refctd_ptr asBuffer; - IGPUBuffer::SCreationParams params = {}; - params.size = buildSizes.accelerationStructureSize; - params.usage = core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT; - asBuffer = logicalDevice->createBuffer(std::move(params)); - auto bufferReqs = asBuffer->getMemoryReqs(); - bufferReqs.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); - auto asBufferMem = logicalDevice->allocate(bufferReqs, asBuffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); - - IGPUAccelerationStructure::SCreationParams tlasParams = {}; - tlasParams.type = IGPUAccelerationStructure::ET_TOP_LEVEL; - tlasParams.flags = IGPUAccelerationStructure::ECF_NONE; - tlasParams.bufferRange.buffer = asBuffer; - tlasParams.bufferRange.offset = 0u; - 
tlasParams.bufferRange.size = buildSizes.accelerationStructureSize; - gpuTlas = logicalDevice->createAccelerationStructure(std::move(tlasParams)); - } - - // Allocate ScratchBuffer - core::smart_refctd_ptr scratchBuffer; - { - IGPUBuffer::SCreationParams params = {}; - params.size = buildSizes.buildScratchSize; - params.usage = core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_STORAGE_BUFFER_BIT; - scratchBuffer = logicalDevice->createBuffer(std::move(params)); - auto bufferReqs = scratchBuffer->getMemoryReqs(); - bufferReqs.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); - auto scratchBufferMem = logicalDevice->allocate(bufferReqs, scratchBuffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); - } - - // Complete BLAS Build Info - { - tlasBuildInfo.dstAS = gpuTlas.get(); - tlasBuildInfo.scratchAddr.buffer = scratchBuffer; - tlasBuildInfo.scratchAddr.offset = 0u; - } - - IGPUAccelerationStructure::BuildRangeInfo firstBuildRangeInfos[1u]; - firstBuildRangeInfos[0].primitiveCount = instancesCount; - firstBuildRangeInfos[0].primitiveOffset = 0u; - firstBuildRangeInfos[0].firstVertex = 0u; - firstBuildRangeInfos[0].transformOffset = 0u; - IGPUAccelerationStructure::BuildRangeInfo* pRangeInfos[1u]; - pRangeInfos[0] = firstBuildRangeInfos; - - // Build TLAS - { - utilities->buildAccelerationStructures(computeQueue, core::SRange(&tlasBuildInfo, &tlasBuildInfo + 1u), pRangeInfos); - } - } - - - // Camera - core::vectorSIMDf cameraPosition(0, 5, -10); - matrix4SIMD proj = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(core::radians(60.0f), video::ISurface::getTransformedAspectRatio(swapchain->getPreTransform(), WIN_W, WIN_H), 0.01f, 500.0f); - cam = Camera(cameraPosition, core::vectorSIMDf(0, 0, 0), proj); - - IGPUDescriptorSetLayout::SBinding descriptorSet0Bindings[] = - { - { 0u, asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE, 
video::IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, IShader::ESS_COMPUTE, 1u, nullptr }, - }; - IGPUDescriptorSetLayout::SBinding uboBinding {0, asset::IDescriptor::E_TYPE::ET_UNIFORM_BUFFER, video::IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, IShader::ESS_COMPUTE, 1u, nullptr}; - IGPUDescriptorSetLayout::SBinding descriptorSet3Bindings[] = { - { 0u, asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER, video::IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, IShader::ESS_COMPUTE, 1u, nullptr }, - { 1u, asset::IDescriptor::E_TYPE::ET_UNIFORM_TEXEL_BUFFER, video::IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, IShader::ESS_COMPUTE, 1u, nullptr }, - { 2u, asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER, video::IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, IShader::ESS_COMPUTE, 1u, nullptr }, - { 3u, asset::IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE, video::IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, IShader::ESS_COMPUTE, 1u, nullptr }, - { 4u, asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, video::IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, IShader::ESS_COMPUTE, 1u, nullptr } - }; - - auto gpuDescriptorSetLayout0 = logicalDevice->createDescriptorSetLayout(descriptorSet0Bindings, descriptorSet0Bindings + 1u); - auto gpuDescriptorSetLayout1 = logicalDevice->createDescriptorSetLayout(&uboBinding, &uboBinding + 1u); - auto gpuDescriptorSetLayout2 = logicalDevice->createDescriptorSetLayout(descriptorSet3Bindings, descriptorSet3Bindings+5u); - - auto createGpuResources = [&](std::string pathToShader) -> core::smart_refctd_ptr - { - asset::IAssetLoader::SAssetLoadParams params{}; - params.logger = logger.get(); - //params.relativeDir = tmp.c_str(); - auto spec = assetManager->getAsset(pathToShader,params).getContents(); - - if (spec.empty()) - assert(false); - - auto cpuComputeSpecializedShader = core::smart_refctd_ptr_static_cast(*spec.begin()); - - 
ISpecializedShader::SInfo info = cpuComputeSpecializedShader->getSpecializationInfo(); - info.m_backingBuffer = ICPUBuffer::create({ sizeof(ShaderParameters) }); - memcpy(info.m_backingBuffer->getPointer(),&kShaderParameters,sizeof(ShaderParameters)); - info.m_entries = core::make_refctd_dynamic_array>(2u); - for (uint32_t i=0; i<2; i++) - info.m_entries->operator[](i) = {i,i*sizeof(uint32_t),sizeof(uint32_t)}; - - - cpuComputeSpecializedShader->setSpecializationInfo(std::move(info)); - - auto gpuComputeSpecializedShader = cpu2gpu.getGPUObjectsFromAssets(&cpuComputeSpecializedShader, &cpuComputeSpecializedShader + 1, cpu2gpuParams)->front(); - - auto gpuPipelineLayout = logicalDevice->createPipelineLayout(nullptr, nullptr, core::smart_refctd_ptr(gpuDescriptorSetLayout0), core::smart_refctd_ptr(gpuDescriptorSetLayout1), core::smart_refctd_ptr(gpuDescriptorSetLayout2), nullptr); - - auto gpuPipeline = logicalDevice->createComputePipeline(nullptr, std::move(gpuPipelineLayout), std::move(gpuComputeSpecializedShader)); - - return gpuPipeline; - }; - - E_LIGHT_GEOMETRY lightGeom = ELG_SPHERE; - constexpr const char* shaderPaths[] = {"../litBySphere.comp","../litByTriangle.comp","../litByRectangle.comp"}; - gpuComputePipeline = createGpuResources(shaderPaths[lightGeom]); - - dispatchInfo = getDispatchInfo(WIN_W, WIN_H); - - auto createImageView = [&](std::string pathToOpenEXRHDRIImage) - { - auto pathToTexture = pathToOpenEXRHDRIImage; - IAssetLoader::SAssetLoadParams lp(0ull, nullptr, IAssetLoader::ECF_DONT_CACHE_REFERENCES); - auto cpuTexture = assetManager->getAsset(pathToTexture, lp); - auto cpuTextureContents = cpuTexture.getContents(); - assert(!cpuTextureContents.empty()); - auto cpuImage = core::smart_refctd_ptr_static_cast(*cpuTextureContents.begin()); - cpuImage->setImageUsageFlags(IImage::E_USAGE_FLAGS::EUF_SAMPLED_BIT); - - ICPUImageView::SCreationParams viewParams; - viewParams.flags = static_cast(0u); - viewParams.image = cpuImage; - viewParams.format = 
viewParams.image->getCreationParameters().format; - viewParams.viewType = IImageView::ET_2D; - viewParams.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; - viewParams.subresourceRange.baseArrayLayer = 0u; - viewParams.subresourceRange.layerCount = 1u; - viewParams.subresourceRange.baseMipLevel = 0u; - viewParams.subresourceRange.levelCount = 1u; - - auto cpuImageView = ICPUImageView::create(std::move(viewParams)); - - cpu2gpuParams.beginCommandBuffers(); - auto gpuImageView = cpu2gpu.getGPUObjectsFromAssets(&cpuImageView, &cpuImageView + 1u, cpu2gpuParams)->front(); - cpu2gpuParams.waitForCreationToComplete(); - - return gpuImageView; - }; - - gpuEnvmapImageView = createImageView("../../media/envmap/envmap_0.exr"); - - { - const uint32_t MaxDimensions = 3u<(sampleSequence->getPointer()); - for (auto dim=0u; dimgetSize(); - IGPUBuffer::SCreationParams params = {}; - params.size = bufferSize; - params.usage = core::bitflag(asset::IBuffer::EUF_TRANSFER_DST_BIT) | asset::IBuffer::EUF_UNIFORM_TEXEL_BUFFER_BIT; - gpuSequenceBuffer = logicalDevice->createBuffer(std::move(params)); - auto bufferReqs = gpuSequenceBuffer->getMemoryReqs(); - bufferReqs.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); - auto gpuSequenceBufferMem = logicalDevice->allocate(bufferReqs, gpuSequenceBuffer.get()); - utilities->updateBufferRangeViaStagingBufferAutoSubmit(asset::SBufferRange{0u,bufferSize,gpuSequenceBuffer},sampleSequence->getPointer(), graphicsQueue); - } - gpuSequenceBufferView = logicalDevice->createBufferView(gpuSequenceBuffer.get(), asset::EF_R32G32B32_UINT); - } - - { - IGPUImage::SCreationParams imgParams; - imgParams.flags = static_cast(0u); - imgParams.type = IImage::ET_2D; - imgParams.format = EF_R32G32_UINT; - imgParams.extent = {WIN_W, WIN_H,1u}; - imgParams.mipLevels = 1u; - imgParams.arrayLayers = 1u; - imgParams.samples = IImage::ESCF_1_BIT; - imgParams.usage = core::bitflag(IImage::EUF_SAMPLED_BIT) | 
IImage::EUF_TRANSFER_DST_BIT; - imgParams.initialLayout = asset::IImage::EL_UNDEFINED; - - IGPUImage::SBufferCopy region = {}; - region.bufferOffset = 0u; - region.bufferRowLength = 0u; - region.bufferImageHeight = 0u; - region.imageExtent = imgParams.extent; - region.imageOffset = {0u,0u,0u}; - region.imageSubresource.layerCount = 1u; - region.imageSubresource.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; - - constexpr auto ScrambleStateChannels = 2u; - const auto renderPixelCount = imgParams.extent.width*imgParams.extent.height; - core::vector random(renderPixelCount*ScrambleStateChannels); - { - core::RandomSampler rng(0xbadc0ffeu); - for (auto& pixel : random) - pixel = rng.nextSample(); - } - - core::smart_refctd_ptr scrambleImageBuffer; - { - const auto bufferSize = random.size() * sizeof(uint32_t); - IGPUBuffer::SCreationParams params = {}; - params.size = bufferSize; - params.usage = core::bitflag(asset::IBuffer::EUF_TRANSFER_DST_BIT) | asset::IBuffer::EUF_TRANSFER_SRC_BIT; - scrambleImageBuffer = logicalDevice->createBuffer(std::move(params)); - auto bufferReqs = scrambleImageBuffer->getMemoryReqs(); - bufferReqs.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); - auto bufferMem = logicalDevice->allocate(bufferReqs, scrambleImageBuffer.get()); - utilities->updateBufferRangeViaStagingBufferAutoSubmit(asset::SBufferRange{0u,bufferSize,scrambleImageBuffer},random.data(),graphicsQueue); - } - - IGPUImageView::SCreationParams viewParams; - viewParams.flags = static_cast(0u); - // TODO: Replace this IGPUBuffer -> IGPUImage to using image upload utility - viewParams.image = utilities->createFilledDeviceLocalImageOnDedMem(std::move(imgParams), scrambleImageBuffer.get(), 1u, ®ion, graphicsQueue); - viewParams.viewType = IGPUImageView::ET_2D; - viewParams.format = EF_R32G32_UINT; - viewParams.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; - viewParams.subresourceRange.levelCount = 1u; - 
viewParams.subresourceRange.layerCount = 1u; - gpuScrambleImageView = logicalDevice->createImageView(std::move(viewParams)); - } - - // Create Out Image - for(uint32_t i = 0; i < swapchain->getImageCount(); ++i) { - outHDRImageViews[i] = createHDRImageView(logicalDevice, asset::EF_R16G16B16A16_SFLOAT, WIN_W, WIN_H); - } - - for(uint32_t i = 0; i < swapchain->getImageCount(); ++i) - { - auto & descSet = descriptorSets0[i]; - descSet = descriptorPool->createDescriptorSet(core::smart_refctd_ptr(gpuDescriptorSetLayout0)); - video::IGPUDescriptorSet::SWriteDescriptorSet writeDescriptorSet; - writeDescriptorSet.dstSet = descSet.get(); - writeDescriptorSet.binding = 0; - writeDescriptorSet.count = 1u; - writeDescriptorSet.arrayElement = 0u; - writeDescriptorSet.descriptorType = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE; - video::IGPUDescriptorSet::SDescriptorInfo info; - { - info.desc = outHDRImageViews[i]; - info.info.image.sampler = nullptr; - info.info.image.imageLayout = asset::IImage::EL_GENERAL; - } - writeDescriptorSet.info = &info; - logicalDevice->updateDescriptorSets(1u, &writeDescriptorSet, 0u, nullptr); - } - - IGPUBuffer::SCreationParams gpuuboParams = {}; - gpuuboParams.size = sizeof(SBasicViewParametersAligned); - gpuuboParams.usage = core::bitflag(IGPUBuffer::EUF_UNIFORM_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT; - gpuubo = logicalDevice->createBuffer(std::move(gpuuboParams)); - auto gpuUboMemReqs = gpuubo->getMemoryReqs(); - gpuUboMemReqs.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); - auto gpuUboMem = logicalDevice->allocate(gpuUboMemReqs, gpuubo.get()); - - uboDescriptorSet1 = descriptorPool->createDescriptorSet(core::smart_refctd_ptr(gpuDescriptorSetLayout1)); - { - video::IGPUDescriptorSet::SWriteDescriptorSet uboWriteDescriptorSet; - uboWriteDescriptorSet.dstSet = uboDescriptorSet1.get(); - uboWriteDescriptorSet.binding = 0; - uboWriteDescriptorSet.count = 1u; - uboWriteDescriptorSet.arrayElement = 0u; 
- uboWriteDescriptorSet.descriptorType = asset::IDescriptor::E_TYPE::ET_UNIFORM_BUFFER; - video::IGPUDescriptorSet::SDescriptorInfo info; - { - info.desc = gpuubo; - info.info.buffer.offset = 0ull; - info.info.buffer.size = sizeof(SBasicViewParametersAligned); - } - uboWriteDescriptorSet.info = &info; - logicalDevice->updateDescriptorSets(1u, &uboWriteDescriptorSet, 0u, nullptr); - } - - ISampler::SParams samplerParams0 = { ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETBC_FLOAT_OPAQUE_BLACK, ISampler::ETF_LINEAR, ISampler::ETF_LINEAR, ISampler::ESMM_LINEAR, 0u, false, ECO_ALWAYS }; - sampler0 = logicalDevice->createSampler(samplerParams0); - ISampler::SParams samplerParams1 = { ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETC_CLAMP_TO_EDGE, ISampler::ETBC_INT_OPAQUE_BLACK, ISampler::ETF_NEAREST, ISampler::ETF_NEAREST, ISampler::ESMM_NEAREST, 0u, false, ECO_ALWAYS }; - sampler1 = logicalDevice->createSampler(samplerParams1); - - descriptorSet2 = descriptorPool->createDescriptorSet(core::smart_refctd_ptr(gpuDescriptorSetLayout2)); - { - constexpr auto kDescriptorCount = 5; - IGPUDescriptorSet::SWriteDescriptorSet writeDescriptorSet2[kDescriptorCount]; - IGPUDescriptorSet::SDescriptorInfo writeDescriptorInfo[kDescriptorCount]; - for (auto i=0; iupdateDescriptorSets(kDescriptorCount, writeDescriptorSet2, 0u, nullptr); - } - - constexpr uint32_t FRAME_COUNT = 500000u; - - for (uint32_t i=0u; icreateSemaphore(); - renderFinished[i] = logicalDevice->createSemaphore(); - frameComplete[i] = logicalDevice->createFence(video::IGPUFence::ECF_SIGNALED_BIT); - frameUploadDataCompleteSemaphore[i] = logicalDevice->createSemaphore(); - frameUploadDataCompleteFence[i] = logicalDevice->createFence(video::IGPUFence::ECF_UNSIGNALED); - } - - oracle.reportBeginFrameRecord(); - } - - void onAppTerminated_impl() override - { - const auto& fboCreationParams = 
fbos->begin()[m_acquiredNextFBO]->getCreationParameters(); - auto gpuSourceImageView = fboCreationParams.attachments[0]; - logicalDevice->waitIdle(); - - bool status = ext::ScreenShot::createScreenShot( - logicalDevice.get(), - queues[CommonAPI::InitOutput::EQT_TRANSFER_UP], - renderFinished[m_resourceIx].get(), - gpuSourceImageView.get(), - assetManager.get(), - "ScreenShot.png", - asset::IImage::EL_PRESENT_SRC, - asset::EAF_NONE); - - assert(status); - } - - void workLoopBody() override - { - auto& graphicsQueue = queues[CommonAPI::InitOutput::EQT_GRAPHICS]; - - m_resourceIx++; - if(m_resourceIx >= FRAMES_IN_FLIGHT) { - m_resourceIx = 0; - } - - oracle.reportEndFrameRecord(); - double dt = oracle.getDeltaTimeInMicroSeconds() / 1000.0; - auto nextPresentationTimeStamp = oracle.getNextPresentationTimeStamp(); - oracle.reportBeginFrameRecord(); - - // Input - inputSystem->getDefaultMouse(&mouse); - inputSystem->getDefaultKeyboard(&keyboard); - - cam.beginInputProcessing(nextPresentationTimeStamp); - mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void { cam.mouseProcess(events); }, logger.get()); - keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void { cam.keyboardProcess(events); }, logger.get()); - cam.endInputProcessing(nextPresentationTimeStamp); - - auto& cb = cmdbuf[m_resourceIx]; - auto& fence = frameComplete[m_resourceIx]; - while (logicalDevice->waitForFences(1u,&fence.get(),false,MAX_TIMEOUT)==video::IGPUFence::ES_TIMEOUT) - { - } - - const auto viewMatrix = cam.getViewMatrix(); - const auto viewProjectionMatrix = matrix4SIMD::concatenateBFollowedByAPrecisely( - video::ISurface::getSurfaceTransformationMatrix(swapchain->getPreTransform()), - cam.getConcatenatedMatrix() - ); - - // safe to proceed - cb->begin(IGPUCommandBuffer::EU_NONE); - - // renderpass - swapchain->acquireNextImage(MAX_TIMEOUT,imageAcquire[m_resourceIx].get(),nullptr,&m_acquiredNextFBO); - { - auto mv = viewMatrix; - auto mvp = 
viewProjectionMatrix; - core::matrix3x4SIMD normalMat; - mv.getSub3x3InverseTranspose(normalMat); - - SBasicViewParametersAligned viewParams; - memcpy(viewParams.uboData.MV, mv.pointer(), sizeof(mv)); - memcpy(viewParams.uboData.MVP, mvp.pointer(), sizeof(mvp)); - memcpy(viewParams.uboData.NormalMat, normalMat.pointer(), sizeof(normalMat)); - - asset::SBufferRange range; - range.buffer = gpuubo; - range.offset = 0ull; - range.size = sizeof(viewParams); - - video::IGPUQueue::SSubmitInfo uploadImageSubmit; - uploadImageSubmit.pSignalSemaphores = &frameUploadDataCompleteSemaphore[m_resourceIx].get(); - uploadImageSubmit.signalSemaphoreCount = 1u; - - // We know the fence is already signal because of how we structured our execution -> frameUploadDataCompleteSemaphore -> signals to Render Frame -> wait for frameComplete fence to finish -> then we know frameUploadCompleteFence is signalled - utilities->getDefaultUpStreamingBuffer()->cull_frees(); // need to cull_frees after fence signalled and before fence is reset again - logicalDevice->resetFences(1, &frameUploadDataCompleteFence[m_resourceIx].get()); - - utilities->updateBufferRangeViaStagingBufferAutoSubmit(range, &viewParams, graphicsQueue, frameUploadDataCompleteFence[m_resourceIx].get(), uploadImageSubmit); - // No need to wait for frameUploadDataCompleteFence in CPU, we'll use semaphores to singal the next stage the upload is complete. 
- } - - auto graphicsCmdQueueFamIdx = queues[CommonAPI::InitOutput::EQT_GRAPHICS]->getFamilyIndex(); - // TRANSITION outHDRImageViews[m_acquiredNextFBO] to EIL_GENERAL (because of descriptorSets0 -> ComputeShader Writes into the image) - { - IGPUCommandBuffer::SImageMemoryBarrier imageBarriers[3u] = {}; - imageBarriers[0].barrier.srcAccessMask = asset::EAF_NONE; - imageBarriers[0].barrier.dstAccessMask = static_cast(asset::EAF_SHADER_WRITE_BIT); - imageBarriers[0].oldLayout = asset::IImage::EL_UNDEFINED; - imageBarriers[0].newLayout = asset::IImage::EL_GENERAL; - imageBarriers[0].srcQueueFamilyIndex = graphicsCmdQueueFamIdx; - imageBarriers[0].dstQueueFamilyIndex = graphicsCmdQueueFamIdx; - imageBarriers[0].image = outHDRImageViews[m_acquiredNextFBO]->getCreationParameters().image; - imageBarriers[0].subresourceRange.aspectMask = asset::IImage::EAF_COLOR_BIT; - imageBarriers[0].subresourceRange.baseMipLevel = 0u; - imageBarriers[0].subresourceRange.levelCount = 1; - imageBarriers[0].subresourceRange.baseArrayLayer = 0u; - imageBarriers[0].subresourceRange.layerCount = 1; - - imageBarriers[1].barrier.srcAccessMask = asset::EAF_NONE; - imageBarriers[1].barrier.dstAccessMask = static_cast(asset::EAF_SHADER_READ_BIT); - imageBarriers[1].oldLayout = asset::IImage::EL_UNDEFINED; - imageBarriers[1].newLayout = asset::IImage::EL_SHADER_READ_ONLY_OPTIMAL; - imageBarriers[1].srcQueueFamilyIndex = graphicsCmdQueueFamIdx; - imageBarriers[1].dstQueueFamilyIndex = graphicsCmdQueueFamIdx; - imageBarriers[1].image = gpuScrambleImageView->getCreationParameters().image; - imageBarriers[1].subresourceRange.aspectMask = asset::IImage::EAF_COLOR_BIT; - imageBarriers[1].subresourceRange.baseMipLevel = 0u; - imageBarriers[1].subresourceRange.levelCount = 1; - imageBarriers[1].subresourceRange.baseArrayLayer = 0u; - imageBarriers[1].subresourceRange.layerCount = 1; - - imageBarriers[2].barrier.srcAccessMask = asset::EAF_NONE; - imageBarriers[2].barrier.dstAccessMask = 
static_cast(asset::EAF_SHADER_READ_BIT); - imageBarriers[2].oldLayout = asset::IImage::EL_UNDEFINED; - imageBarriers[2].newLayout = asset::IImage::EL_SHADER_READ_ONLY_OPTIMAL; - imageBarriers[2].srcQueueFamilyIndex = graphicsCmdQueueFamIdx; - imageBarriers[2].dstQueueFamilyIndex = graphicsCmdQueueFamIdx; - imageBarriers[2].image = gpuEnvmapImageView->getCreationParameters().image; - imageBarriers[2].subresourceRange.aspectMask = asset::IImage::EAF_COLOR_BIT; - imageBarriers[2].subresourceRange.baseMipLevel = 0u; - imageBarriers[2].subresourceRange.levelCount = gpuEnvmapImageView->getCreationParameters().subresourceRange.levelCount; - imageBarriers[2].subresourceRange.baseArrayLayer = 0u; - imageBarriers[2].subresourceRange.layerCount = gpuEnvmapImageView->getCreationParameters().subresourceRange.layerCount; - - cb->pipelineBarrier(asset::EPSF_TOP_OF_PIPE_BIT, asset::EPSF_COMPUTE_SHADER_BIT, asset::EDF_NONE, 0u, nullptr, 0u, nullptr, 3u, imageBarriers); - } - - // cube envmap handle - { - cb->bindComputePipeline(gpuComputePipeline.get()); - cb->bindDescriptorSets(EPBP_COMPUTE, gpuComputePipeline->getLayout(), 0u, 1u, &descriptorSets0[m_acquiredNextFBO].get()); - cb->bindDescriptorSets(EPBP_COMPUTE, gpuComputePipeline->getLayout(), 1u, 1u, &uboDescriptorSet1.get()); - cb->bindDescriptorSets(EPBP_COMPUTE, gpuComputePipeline->getLayout(), 2u, 1u, &descriptorSet2.get()); - cb->dispatch(dispatchInfo.workGroupCount[0], dispatchInfo.workGroupCount[1], dispatchInfo.workGroupCount[2]); - } - // TODO: tone mapping and stuff - - // Copy HDR Image to SwapChain - auto srcImgViewCreationParams = outHDRImageViews[m_acquiredNextFBO]->getCreationParameters(); - auto dstImgViewCreationParams = fbos->begin()[m_acquiredNextFBO]->getCreationParameters().attachments[0]->getCreationParameters(); - - // Getting Ready for Blit - // TRANSITION outHDRImageViews[m_acquiredNextFBO] to EIL_TRANSFER_SRC_OPTIMAL - // TRANSITION `fbos[m_acquiredNextFBO]->getCreationParameters().attachments[0]` to 
EIL_TRANSFER_DST_OPTIMAL - { - IGPUCommandBuffer::SImageMemoryBarrier imageBarriers[2u] = {}; - imageBarriers[0].barrier.srcAccessMask = asset::EAF_NONE; - imageBarriers[0].barrier.dstAccessMask = asset::EAF_TRANSFER_WRITE_BIT; - imageBarriers[0].oldLayout = asset::IImage::EL_UNDEFINED; - imageBarriers[0].newLayout = asset::IImage::EL_TRANSFER_SRC_OPTIMAL; - imageBarriers[0].srcQueueFamilyIndex = graphicsCmdQueueFamIdx; - imageBarriers[0].dstQueueFamilyIndex = graphicsCmdQueueFamIdx; - imageBarriers[0].image = srcImgViewCreationParams.image; - imageBarriers[0].subresourceRange.aspectMask = asset::IImage::EAF_COLOR_BIT; - imageBarriers[0].subresourceRange.baseMipLevel = 0u; - imageBarriers[0].subresourceRange.levelCount = 1; - imageBarriers[0].subresourceRange.baseArrayLayer = 0u; - imageBarriers[0].subresourceRange.layerCount = 1; - - imageBarriers[1].barrier.srcAccessMask = asset::EAF_NONE; - imageBarriers[1].barrier.dstAccessMask = asset::EAF_TRANSFER_WRITE_BIT; - imageBarriers[1].oldLayout = asset::IImage::EL_UNDEFINED; - imageBarriers[1].newLayout = asset::IImage::EL_TRANSFER_DST_OPTIMAL; - imageBarriers[1].srcQueueFamilyIndex = graphicsCmdQueueFamIdx; - imageBarriers[1].dstQueueFamilyIndex = graphicsCmdQueueFamIdx; - imageBarriers[1].image = dstImgViewCreationParams.image; - imageBarriers[1].subresourceRange.aspectMask = asset::IImage::EAF_COLOR_BIT; - imageBarriers[1].subresourceRange.baseMipLevel = 0u; - imageBarriers[1].subresourceRange.levelCount = 1; - imageBarriers[1].subresourceRange.baseArrayLayer = 0u; - imageBarriers[1].subresourceRange.layerCount = 1; - cb->pipelineBarrier(asset::EPSF_TRANSFER_BIT, asset::EPSF_TRANSFER_BIT, asset::EDF_NONE, 0u, nullptr, 0u, nullptr, 2u, imageBarriers); - } - - // Blit Image - { - SImageBlit blit = {}; - blit.srcOffsets[0] = {0, 0, 0}; - blit.srcOffsets[1] = {WIN_W, WIN_H, 1}; - - blit.srcSubresource.aspectMask = srcImgViewCreationParams.subresourceRange.aspectMask; - blit.srcSubresource.mipLevel = 
srcImgViewCreationParams.subresourceRange.baseMipLevel; - blit.srcSubresource.baseArrayLayer = srcImgViewCreationParams.subresourceRange.baseArrayLayer; - blit.srcSubresource.layerCount = srcImgViewCreationParams.subresourceRange.layerCount; - blit.dstOffsets[0] = {0, 0, 0}; - blit.dstOffsets[1] = {WIN_W, WIN_H, 1}; - blit.dstSubresource.aspectMask = dstImgViewCreationParams.subresourceRange.aspectMask; - blit.dstSubresource.mipLevel = dstImgViewCreationParams.subresourceRange.baseMipLevel; - blit.dstSubresource.baseArrayLayer = dstImgViewCreationParams.subresourceRange.baseArrayLayer; - blit.dstSubresource.layerCount = dstImgViewCreationParams.subresourceRange.layerCount; - - auto srcImg = srcImgViewCreationParams.image; - auto dstImg = dstImgViewCreationParams.image; - - cb->blitImage(srcImg.get(), asset::IImage::EL_TRANSFER_SRC_OPTIMAL, dstImg.get(), asset::IImage::EL_TRANSFER_DST_OPTIMAL, 1u, &blit , ISampler::ETF_NEAREST); - } - - // TRANSITION `fbos[m_acquiredNextFBO]->getCreationParameters().attachments[0]` to EIL_PRESENT - { - IGPUCommandBuffer::SImageMemoryBarrier imageBarriers[1u] = {}; - imageBarriers[0].barrier.srcAccessMask = asset::EAF_TRANSFER_WRITE_BIT; - imageBarriers[0].barrier.dstAccessMask = asset::EAF_NONE; - imageBarriers[0].oldLayout = asset::IImage::EL_TRANSFER_DST_OPTIMAL; - imageBarriers[0].newLayout = asset::IImage::EL_PRESENT_SRC; - imageBarriers[0].srcQueueFamilyIndex = graphicsCmdQueueFamIdx; - imageBarriers[0].dstQueueFamilyIndex = graphicsCmdQueueFamIdx; - imageBarriers[0].image = dstImgViewCreationParams.image; - imageBarriers[0].subresourceRange.aspectMask = asset::IImage::EAF_COLOR_BIT; - imageBarriers[0].subresourceRange.baseMipLevel = 0u; - imageBarriers[0].subresourceRange.levelCount = 1; - imageBarriers[0].subresourceRange.baseArrayLayer = 0u; - imageBarriers[0].subresourceRange.layerCount = 1; - cb->pipelineBarrier(asset::EPSF_TRANSFER_BIT, asset::EPSF_TOP_OF_PIPE_BIT, asset::EDF_NONE, 0u, nullptr, 0u, nullptr, 1u, 
imageBarriers); - } - - cb->end(); - logicalDevice->resetFences(1, &fence.get()); - - nbl::video::IGPUQueue::SSubmitInfo submit; - submit.commandBufferCount = 1u; - submit.commandBuffers = &cb.get(); - submit.signalSemaphoreCount = 1u; - submit.pSignalSemaphores = &renderFinished[m_resourceIx].get(); - nbl::video::IGPUSemaphore* waitSemaphores[2u] = { imageAcquire[m_resourceIx].get(), frameUploadDataCompleteSemaphore[m_resourceIx].get() }; - asset::E_PIPELINE_STAGE_FLAGS waitStages[2u] = { nbl::asset::EPSF_COLOR_ATTACHMENT_OUTPUT_BIT, nbl::asset::EPSF_RAY_TRACING_SHADER_BIT_KHR} ; - submit.waitSemaphoreCount = 2u; - submit.pWaitSemaphores = waitSemaphores; - submit.pWaitDstStageMask = waitStages; - - graphicsQueue->submit(1u,&submit,fence.get()); - - CommonAPI::Present(logicalDevice.get(), swapchain.get(), queues[CommonAPI::InitOutput::EQT_GRAPHICS], renderFinished[m_resourceIx].get(), m_acquiredNextFBO); - } - - bool keepRunning() override - { - return windowCb->isWindowOpen(); - } - - video::IAPIConnection* getAPIConnection() override - { - return apiConnection.get(); - } - video::ILogicalDevice* getLogicalDevice() override - { - return logicalDevice.get(); - } - video::IGPURenderpass* getRenderpass() override - { - return renderpass.get(); - } - void setSurface(core::smart_refctd_ptr&& s) override - { - surface = std::move(s); - } - void setFBOs(std::vector>& f) override - { - for (int i = 0; i < f.size(); i++) - { - fbos->begin()[i] = core::smart_refctd_ptr(f[i]); - } - } - void setSwapchain(core::smart_refctd_ptr&& s) override - { - swapchain = std::move(s); - } - uint32_t getSwapchainImageCount() override - { - return swapchain->getImageCount(); - } - virtual nbl::asset::E_FORMAT getDepthFormat() override - { - return nbl::asset::EF_D32_SFLOAT; - } -}; - -NBL_COMMON_API_MAIN(RayQuerySampleApp) diff --git a/56_RayQuery/pipeline.groovy b/56_RayQuery/pipeline.groovy deleted file mode 100644 index beba797c3..000000000 --- a/56_RayQuery/pipeline.groovy +++ 
/dev/null @@ -1,50 +0,0 @@ -import org.DevshGraphicsProgramming.Agent -import org.DevshGraphicsProgramming.BuilderInfo -import org.DevshGraphicsProgramming.IBuilder - -class CRayQueryBuilder extends IBuilder -{ - public CRayQueryBuilder(Agent _agent, _info) - { - super(_agent, _info) - } - - @Override - public boolean prepare(Map axisMapping) - { - return true - } - - @Override - public boolean build(Map axisMapping) - { - IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION") - IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE") - - def nameOfBuildDirectory = getNameOfBuildDirectory(buildType) - def nameOfConfig = getNameOfConfig(config) - - agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v") - - return true - } - - @Override - public boolean test(Map axisMapping) - { - return true - } - - @Override - public boolean install(Map axisMapping) - { - return true - } -} - -def create(Agent _agent, _info) -{ - return new CRayQueryBuilder(_agent, _info) -} - -return this \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 24fb7fad8..0b3279a48 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -76,7 +76,6 @@ if(NBL_BUILD_EXAMPLES) #add_subdirectory(43_SumAndCDFFilters EXCLUDE_FROM_ALL) add_subdirectory(47_DerivMapTest EXCLUDE_FROM_ALL) - add_subdirectory(53_ComputeShaders EXCLUDE_FROM_ALL) add_subdirectory(54_Transformations EXCLUDE_FROM_ALL) add_subdirectory(55_RGB18E7S3 EXCLUDE_FROM_ALL) add_subdirectory(61_UI EXCLUDE_FROM_ALL) From f4cc4cd22ee4bd5506d794e63caafddf974ed7a4 Mon Sep 17 00:00:00 2001 From: devsh Date: Sat, 19 Apr 2025 16:04:49 +0200 Subject: [PATCH 123/296] const correctness of BLAS geometry spans --- 67_RayQueryGeometry/main.cpp | 3 ++- 71_RayTracingPipeline/main.cpp | 9 ++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/67_RayQueryGeometry/main.cpp 
b/67_RayQueryGeometry/main.cpp index dab137cbd..b34c474a0 100644 --- a/67_RayQueryGeometry/main.cpp +++ b/67_RayQueryGeometry/main.cpp @@ -817,8 +817,9 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu ILogicalDevice::AccelerationStructureBuildSizes buildSizes; { + const auto* trianglesData = triangles; const uint32_t maxPrimCount[1] = { primitiveCounts[i] }; - buildSizes = m_device->getAccelerationStructureBuildSizes(blasFlags, false, std::span{&triangles[i], 1}, maxPrimCount); + buildSizes = m_device->getAccelerationStructureBuildSizes(blasFlags, false, std::span{trianglesData,1}, maxPrimCount); if (!buildSizes) return logFail("Failed to get BLAS build sizes"); } diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index 35c750373..219a7aacb 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -1537,10 +1537,13 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, const uint32_t maxPrimCount[1] = { primitiveCounts[i] }; if (isProcedural) { - buildSizes = m_device->getAccelerationStructureBuildSizes(blasBuildInfos[i].buildFlags, false, std::span{&aabbs, 1}, maxPrimCount); - } else + const auto* aabbData = &aabbs; + buildSizes = m_device->getAccelerationStructureBuildSizes(blasBuildInfos[i].buildFlags, false, std::span{ aabbData, 1}, maxPrimCount); + } + else { - buildSizes = m_device->getAccelerationStructureBuildSizes(blasBuildInfos[i].buildFlags, false, std::span{&triangles[i], 1}, maxPrimCount); + const auto* trianglesData = triangles.data(); + buildSizes = m_device->getAccelerationStructureBuildSizes(blasBuildInfos[i].buildFlags, false, std::span{trianglesData,1}, maxPrimCount); } if (!buildSizes) return logFail("Failed to get BLAS build sizes"); From ff1d0928f4bd85f7d5b259809bc6b20a6f4a3eba Mon Sep 17 00:00:00 2001 From: devsh Date: Sun, 20 Apr 2025 03:38:22 +0200 Subject: [PATCH 124/296] make example use new enum without KHR suffix --- 
67_RayQueryGeometry/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp index b34c474a0..aff687742 100644 --- a/67_RayQueryGeometry/main.cpp +++ b/67_RayQueryGeometry/main.cpp @@ -806,7 +806,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT; if (m_physicalDevice->getProperties().limits.rayTracingPositionFetch) - blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_DATA_ACCESS_KHR; + blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_DATA_ACCESS; blasBuildInfos[i].buildFlags = blasFlags; blasBuildInfos[i].geometryCount = 1; // only 1 geometry object per blas From 99cf5d862560a752f6491192a136154c5868fd84 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 21 Apr 2025 14:50:17 +0700 Subject: [PATCH 125/296] coalesced load/store data --- .../app_resources/benchmarkSubgroup.comp.hlsl | 32 ++++--------------- .../app_resources/shaderCommon.hlsl | 15 +++++---- .../app_resources/testSubgroup.comp.hlsl | 4 +-- 73_ArithmeticBench/main.cpp | 2 +- 4 files changed, 18 insertions(+), 35 deletions(-) diff --git a/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl b/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl index 4715f0abf..3dd24e432 100644 --- a/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl +++ b/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl @@ -7,9 +7,9 @@ // NOTE added dummy output image to be able to profile with Nsight, which still doesn't support profiling headless compute shaders [[vk::binding(2, 0)]] RWTexture2D outImage; // dummy -uint32_t globalIndex() +uint32_t globalFirstItemIndex(uint32_t itemIdx) { - return 
nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex(); + return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE*ITEMS_PER_INVOCATION+((nbl::hlsl::glsl::gl_SubgroupID()*ITEMS_PER_INVOCATION+itemIdx)< class binop, typename T, uint32_t N> -// static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal) -// { -// using config_t = nbl::hlsl::subgroup::Configuration; -// using params_t = nbl::hlsl::subgroup2::ArithmeticParams::base_t, N, nbl::hlsl::jit::device_capabilities>; - -// const uint32_t storeAddr = sizeof(uint32_t) + sizeof(type_t) * globalIndex(); - -// operation_t func; -// [unroll] -// for (uint32_t i = 0; i < NUM_LOOPS; i++) -// { -// const uint32_t arrIndex = i & 7u; // i % 8 -// output[arrIndex].template Store(storeAddr, func(sourceVal)); -// } -// } template class binop, typename T, uint32_t N> static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal) @@ -47,22 +31,20 @@ static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal) for (uint32_t i = 0; i < NUM_LOOPS; i++) value = func(value); - output[binop::BindingIndex].template Store(sizeof(uint32_t) + sizeof(type_t) * globalIndex(), value); + [unroll] + for (uint32_t i = 0; i < ITEMS_PER_INVOCATION; i++) + output[binop::BindingIndex].template Store(sizeof(uint32_t) + sizeof(uint32_t) * (globalFirstItemIndex(i) + nbl::hlsl::glsl::gl_SubgroupInvocationID()), value[i]); } void benchmark() { - const uint32_t idx = globalIndex() * ITEMS_PER_INVOCATION; + const uint32_t idx = nbl::hlsl::glsl::gl_SubgroupInvocationID(); type_t sourceVal; -// #if ITEMS_PER_INVOCATION > 1 [unroll] for (uint32_t i = 0; i < ITEMS_PER_INVOCATION; i++) { - sourceVal[i] = inputValue[idx + i]; + sourceVal[i] = inputValue[globalFirstItemIndex(i) + idx]; } -// #else -// sourceVal = inputValue[idx]; -// #endif subbench(sourceVal); subbench(sourceVal); diff --git a/73_ArithmeticBench/app_resources/shaderCommon.hlsl b/73_ArithmeticBench/app_resources/shaderCommon.hlsl index 3fdd3c986..f7ee1892c 
100644 --- a/73_ArithmeticBench/app_resources/shaderCommon.hlsl +++ b/73_ArithmeticBench/app_resources/shaderCommon.hlsl @@ -15,8 +15,8 @@ uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1 [[vk::binding(0, 0)]] StructuredBuffer inputValue; [[vk::binding(1, 0)]] RWByteAddressBuffer output[8]; -// because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way -uint32_t globalIndex(); +// to get next item, move by subgroupSize +uint32_t globalFirstItemIndex(uint32_t itemIdx); // since we test ITEMS_PER_WG::BindingIndex].template Store(0,nbl::hlsl::glsl::gl_SubgroupSize()); operation_t func; + type_t value = func(sourceVal); if (canStore()) - output[binop::BindingIndex].template Store(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func(sourceVal)); + { + [unroll] + for (uint32_t i = 0; i < ITEMS_PER_INVOCATION; i++) + output[binop::BindingIndex].template Store(sizeof(uint32_t) + sizeof(uint32_t) * (globalFirstItemIndex(i) + nbl::hlsl::glsl::gl_SubgroupInvocationID()), value[i]); + } } @@ -62,15 +67,11 @@ type_t test() { const uint32_t idx = globalIndex() * ITEMS_PER_INVOCATION; type_t sourceVal; -// #if ITEMS_PER_INVOCATION > 1 [unroll] for (uint32_t i = 0; i < ITEMS_PER_INVOCATION; i++) { sourceVal[i] = inputValue[idx + i]; } -// #else -// sourceVal = inputValue[idx]; -// #endif subtest(sourceVal); subtest(sourceVal); diff --git a/73_ArithmeticBench/app_resources/testSubgroup.comp.hlsl b/73_ArithmeticBench/app_resources/testSubgroup.comp.hlsl index 2cc1ccb60..0001d39e0 100644 --- a/73_ArithmeticBench/app_resources/testSubgroup.comp.hlsl +++ b/73_ArithmeticBench/app_resources/testSubgroup.comp.hlsl @@ -4,9 +4,9 @@ #include "shaderCommon.hlsl" -uint32_t globalIndex() +uint32_t globalFirstItemIndex(uint32_t itemIdx) { - return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex(); + return 
nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE*ITEMS_PER_INVOCATION+((nbl::hlsl::glsl::gl_SubgroupID()*ITEMS_PER_INVOCATION+itemIdx)< using ArithmeticOp = emulatedReduction; // change this to test other arithmetic ops - bool b_runTests = false; + bool b_runTests = true; uint32_t* inputData = nullptr; uint32_t ItemsPerInvocation = 4u; constexpr static inline uint32_t OutputBufferCount = 8u; From a3bb526405ce95bafadd34e9307ec526ad6854b4 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 21 Apr 2025 15:49:58 +0700 Subject: [PATCH 126/296] fixed some bugs --- 73_ArithmeticBench/app_resources/shaderCommon.hlsl | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/73_ArithmeticBench/app_resources/shaderCommon.hlsl b/73_ArithmeticBench/app_resources/shaderCommon.hlsl index f7ee1892c..f4fc9d23a 100644 --- a/73_ArithmeticBench/app_resources/shaderCommon.hlsl +++ b/73_ArithmeticBench/app_resources/shaderCommon.hlsl @@ -1,6 +1,7 @@ #include "common.hlsl" #include "nbl/builtin/hlsl/glsl_compat/core.hlsl" +#include "nbl/builtin/hlsl/workgroup/basic.hlsl" #include "nbl/builtin/hlsl/subgroup/basic.hlsl" #include "nbl/builtin/hlsl/subgroup/arithmetic_portability.hlsl" #include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" @@ -49,7 +50,7 @@ static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) using config_t = nbl::hlsl::subgroup2::Configuration; using params_t = nbl::hlsl::subgroup2::ArithmeticParams::base_t, N, nbl::hlsl::jit::device_capabilities>; - if (globalIndex()==0u) + if (nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex()==0u) output[binop::BindingIndex].template Store(0,nbl::hlsl::glsl::gl_SubgroupSize()); operation_t func; @@ -65,12 +66,12 @@ static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) type_t test() { - const uint32_t idx = globalIndex() * ITEMS_PER_INVOCATION; + const uint32_t idx = nbl::hlsl::glsl::gl_SubgroupInvocationID(); type_t sourceVal; [unroll] for (uint32_t i = 
0; i < ITEMS_PER_INVOCATION; i++) { - sourceVal[i] = inputValue[idx + i]; + sourceVal[i] = inputValue[globalFirstItemIndex(i) + idx]; } subtest(sourceVal); From 355c605d211400626b947a4d38f439d8c944e539 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 21 Apr 2025 16:58:37 +0700 Subject: [PATCH 127/296] disable test by default --- 73_ArithmeticBench/imgui.ini | 5 +++++ 73_ArithmeticBench/main.cpp | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) create mode 100644 73_ArithmeticBench/imgui.ini diff --git a/73_ArithmeticBench/imgui.ini b/73_ArithmeticBench/imgui.ini new file mode 100644 index 000000000..4a5c20148 --- /dev/null +++ b/73_ArithmeticBench/imgui.ini @@ -0,0 +1,5 @@ +[Window][Debug##Default] +Pos=60,60 +Size=400,400 +Collapsed=0 + diff --git a/73_ArithmeticBench/main.cpp b/73_ArithmeticBench/main.cpp index 2d57c131c..d129cfaf9 100644 --- a/73_ArithmeticBench/main.cpp +++ b/73_ArithmeticBench/main.cpp @@ -887,7 +887,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub template using ArithmeticOp = emulatedReduction; // change this to test other arithmetic ops - bool b_runTests = true; + bool b_runTests = false; uint32_t* inputData = nullptr; uint32_t ItemsPerInvocation = 4u; constexpr static inline uint32_t OutputBufferCount = 8u; From 21f424fbaf2234a6892a684ebc18333ef88b36be Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 22 Apr 2025 08:13:40 +0700 Subject: [PATCH 128/296] Fix merge conflict --- .../app_resources/render.comp.hlsl | 149 ++++++++++++++++-- 67_RayQueryGeometry/main.cpp | 9 +- 2 files changed, 140 insertions(+), 18 deletions(-) diff --git a/67_RayQueryGeometry/app_resources/render.comp.hlsl b/67_RayQueryGeometry/app_resources/render.comp.hlsl index b9323ac74..0279978ad 100644 --- a/67_RayQueryGeometry/app_resources/render.comp.hlsl +++ b/67_RayQueryGeometry/app_resources/render.comp.hlsl @@ -6,6 +6,7 @@ #include "nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl" #include 
"nbl/builtin/hlsl/bda/__ptr.hlsl" + using namespace nbl::hlsl; [[vk::push_constant]] SPushConstants pc; @@ -13,6 +14,17 @@ using namespace nbl::hlsl; [[vk::binding(0, 0)]] RaytracingAccelerationStructure topLevelAS; [[vk::binding(1, 0)]] RWTexture2D outImage; +[[vk::constant_id(0)]] const float shader_variant = 1.0; + +struct SGeomInfo2 +{ + uint64_t vertexBufferAddress; + uint64_t indexBufferAddress; + + uint32_t vertexStride : 29; + uint32_t indexType : 2; // 16 bit, 32 bit or none + uint32_t smoothNormals : 1; // flat for cube, rectangle, disk +}; float3 unpackNormals3x10(uint32_t v) { @@ -23,6 +35,77 @@ float3 unpackNormals3x10(uint32_t v) return clamp(float3(pn) / 511.0, -1.0, 1.0); } +float3 calculateSmoothNormals2(int instID, int primID, SGeomInfo2 geom, float2 bary) +{ + const uint indexType = geom.indexType; + const uint vertexStride = geom.vertexStride; + + const uint64_t vertexBufferAddress = geom.vertexBufferAddress; + const uint64_t indexBufferAddress = geom.indexBufferAddress; + + uint32_t3 indices; + switch (indexType) + { + case 0: // EIT_16BIT + indices = uint32_t3((nbl::hlsl::bda::__ptr::create(indexBufferAddress)+primID).deref().load()); + break; + case 1: // EIT_32BIT + indices = uint32_t3((nbl::hlsl::bda::__ptr::create(indexBufferAddress)+primID).deref().load()); + break; + default: // EIT_NONE + { + indices[0] = primID * 3; + indices[1] = indices[0] + 1; + indices[2] = indices[0] + 2; + } + } + + float3 n0, n1, n2; + switch (instID) + { + case OT_CUBE: + { + // TODO: document why the alignment is 2 here and nowhere else? isnt the `vertexStride` aligned to more than 2 anyway? 
+ uint32_t v0 = vk::RawBufferLoad(vertexBufferAddress + indices[0] * vertexStride, 2u); + uint32_t v1 = vk::RawBufferLoad(vertexBufferAddress + indices[1] * vertexStride, 2u); + uint32_t v2 = vk::RawBufferLoad(vertexBufferAddress + indices[2] * vertexStride, 2u); + + n0 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v0).xyz); + n1 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v1).xyz); + n2 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v2).xyz); + } + break; + case OT_SPHERE: + case OT_CYLINDER: + case OT_ARROW: + case OT_CONE: + { + uint32_t v0 = vk::RawBufferLoad(vertexBufferAddress + indices[0] * vertexStride); + uint32_t v1 = vk::RawBufferLoad(vertexBufferAddress + indices[1] * vertexStride); + uint32_t v2 = vk::RawBufferLoad(vertexBufferAddress + indices[2] * vertexStride); + + n0 = normalize(unpackNormals3x10(v0)); + n1 = normalize(unpackNormals3x10(v1)); + n2 = normalize(unpackNormals3x10(v2)); + } + break; + case OT_RECTANGLE: + case OT_DISK: + case OT_ICOSPHERE: + default: + { + n0 = normalize(vk::RawBufferLoad(vertexBufferAddress + indices[0] * vertexStride)); + n1 = normalize(vk::RawBufferLoad(vertexBufferAddress + indices[1] * vertexStride)); + n2 = normalize(vk::RawBufferLoad(vertexBufferAddress + indices[2] * vertexStride)); + } + } + + float3 barycentrics = float3(0.0, bary); + barycentrics.x = 1.0 - barycentrics.y - barycentrics.z; + + return barycentrics.x * n0 + barycentrics.y * n1 + barycentrics.z * n2; +} + float3 calculateSmoothNormals(int instID, int primID, SGeomInfo geom, float2 bary) { const uint indexType = geom.indexType; @@ -129,27 +212,65 @@ void main(uint32_t3 threadID : SV_DispatchThreadID) const SGeomInfo geom = vk::RawBufferLoad(pc.geometryInfoBuffer + instID * sizeof(SGeomInfo)); float3 normals; - if (jit::device_capabilities::rayTracingPositionFetch) + float2 barycentrics = spirv::rayQueryGetIntersectionBarycentricsKHR(query, true); + normals = calculateSmoothNormals(instID, primID, geom, barycentrics); + + normals = 
normalize(normals) * 0.5 + 0.5; + color = float4(normals, 1.0); + } + + outImage[threadID.xy] = color; +} + +[numthreads(WorkgroupSize, WorkgroupSize, 1)] +[shader("compute")] +void main2(uint32_t3 threadID : SV_DispatchThreadID) +{ + uint2 coords = threadID.xy; + coords.y = nbl::hlsl::glsl::gl_NumWorkGroups().y * WorkgroupSize - coords.y; // need to invert it + + + float4 NDC; + NDC.xy = float2(coords) * pc.scaleNDC; + NDC.xy += pc.offsetNDC; + NDC.zw = float2(0, 1.0); + float3 targetPos; + { + float4 tmp = mul(pc.invMVP, NDC); + targetPos = tmp.xyz / tmp.w; + } + + float3 direction = normalize(targetPos - pc.camPos); + + spirv::RayQueryKHR query; + spirv::rayQueryInitializeKHR(query, topLevelAS, spv::RayFlagsOpaqueKHRMask, 0xFF, pc.camPos, 0.01, direction, 1000.0); + + while (spirv::rayQueryProceedKHR(query)) {} + + float4 color = float4(0, 0, 0, 1); + + if (spirv::rayQueryGetIntersectionTypeKHR(query, true) == spv::RayQueryCommittedIntersectionTypeRayQueryCommittedIntersectionTriangleKHR) + { + const int instID = spirv::rayQueryGetIntersectionInstanceIdKHR(query, true); + const int primID = spirv::rayQueryGetIntersectionPrimitiveIndexKHR(query, true); + + // TODO: candidate for `bda::__ptr` + const SGeomInfo2 geom = vk::RawBufferLoad(pc.geometryInfoBuffer + instID * sizeof(SGeomInfo2)); + + float3 normals; + if (geom.smoothNormals) { - if (geom.smoothNormals) - { - float2 barycentrics = spirv::rayQueryGetIntersectionBarycentricsKHR(query, true); - normals = calculateSmoothNormals(instID, primID, geom, barycentrics); - } - else - { - float3 pos[3] = spirv::rayQueryGetIntersectionTriangleVertexPositionsKHR(query, true); - normals = cross(pos[1] - pos[0], pos[2] - pos[0]); - } + float2 barycentrics = spirv::rayQueryGetIntersectionBarycentricsKHR(query, true); + normals = calculateSmoothNormals2(instID, primID, geom, barycentrics); } else { - float2 barycentrics = spirv::rayQueryGetIntersectionBarycentricsKHR(query, true); - normals = calculateSmoothNormals(instID, 
primID, geom, barycentrics); + float3 pos[3] = spirv::rayQueryGetIntersectionTriangleVertexPositionsKHR(query, true); + normals = cross(pos[1] - pos[0], pos[2] - pos[0]); } normals = normalize(normals) * 0.5 + 0.5; - color = float4(normals, 1.0); + color = float4(normals, shader_variant); } outImage[threadID.xy] = color; diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp index c4c483263..4c09da5da 100644 --- a/67_RayQueryGeometry/main.cpp +++ b/67_RayQueryGeometry/main.cpp @@ -164,8 +164,8 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu const auto assets = bundle.getContents(); assert(assets.size() == 1); - const auto sourceRaw = smart_refctd_ptr_static_cast(assets[0]); - smart_refctd_ptr shader = m_device->compileShader({sourceRaw.get(), nullptr, nullptr, nullptr}); + smart_refctd_ptr shaderSrc = IAsset::castDown(assets[0]); + auto shader = m_device->compileShader({ shaderSrc.get() }); if (!shader) return logFail("Failed to create shader!"); @@ -783,7 +783,8 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu { IGPUBottomLevelAccelerationStructure::DeviceBuildInfo blasBuildInfos[OT_COUNT]; uint32_t primitiveCounts[OT_COUNT]; - IGPUBottomLevelAccelerationStructure::Triangles triangles[OT_COUNT]; + using Geometry = IGPUBottomLevelAccelerationStructure::Triangles; + Geometry triangles[OT_COUNT]; uint32_t scratchSizes[OT_COUNT]; for (uint32_t i = 0; i < objectsGpu.size(); i++) @@ -819,7 +820,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu ILogicalDevice::AccelerationStructureBuildSizes buildSizes; { const uint32_t maxPrimCount[1] = { primitiveCounts[i] }; - buildSizes = m_device->getAccelerationStructureBuildSizes(blasFlags, false, std::span{&triangles[i], 1}, maxPrimCount); + buildSizes = m_device->getAccelerationStructureBuildSizes(blasFlags, false, std::span{&triangles[i], 1}, maxPrimCount); if (!buildSizes) return logFail("Failed to get 
BLAS build sizes"); } From 2878d038539324760a2a9f450744b0e7086ca0e8 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 22 Apr 2025 08:13:58 +0700 Subject: [PATCH 129/296] Fix Hello Compute to use IShader --- 02_HelloCompute/main.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/02_HelloCompute/main.cpp b/02_HelloCompute/main.cpp index 124cd7dc5..63a9f8832 100644 --- a/02_HelloCompute/main.cpp +++ b/02_HelloCompute/main.cpp @@ -94,9 +94,9 @@ class HelloComputeApp final : public nbl::application_templates::MonoSystemMonoL // The convention is that an `ICPU` object represents a potentially Mutable (and in the past, Serializable) recipe for creating an `IGPU` object, and later examples will show automated systems for doing that. // The Assets always form a Directed Acyclic Graph and our type system enforces that property at compile time (i.e. an `IBuffer` cannot reference an `IImageView` even indirectly). // Another reason for the 1:1 pairing of types is that one can use a CPU-to-GPU associative cache (asset manager has a default one) and use the pointers to the CPU objects as UUIDs. - // The ICPUShader is just a mutable container for source code (can be high level like HLSL needing compilation to SPIR-V or SPIR-V itself) held in an `nbl::asset::ICPUBuffer`. + // The IShader is just a mutable container for source code (can be high level like HLSL needing compilation to SPIR-V or SPIR-V itself) held in an `nbl::asset::ICPUBuffer`. // They can be created: from buffers of code, by compilation from some other source code, or loaded from files (next example will do that). - smart_refctd_ptr cpuShader; + smart_refctd_ptr cpuShader; { // Normally we'd use the ISystem and the IAssetManager to load shaders flexibly from (virtual) files for ease of development (syntax highlighting and Intellisense), // but I want to show the full process of assembling a shader from raw source code at least once. 
@@ -138,7 +138,7 @@ class HelloComputeApp final : public nbl::application_templates::MonoSystemMonoL } // Note how each ILogicalDevice method takes a smart-pointer r-value, so that the GPU objects refcount their dependencies - smart_refctd_ptr shader = device->createShader(cpuShader.get()); + smart_refctd_ptr shader = device->compileShader({.source = cpuShader.get()}); if (!shader) return logFail("Failed to create a GPU Shader, seems the Driver doesn't like the SPIR-V we're feeding it!\n"); @@ -169,6 +169,7 @@ class HelloComputeApp final : public nbl::application_templates::MonoSystemMonoL // Theoretically a blob of SPIR-V can contain multiple named entry points and one has to be chosen, in practice most compilers only support outputting one (and glslang used to require it be called "main") params.shader.entryPoint = "main"; params.shader.shader = shader.get(); + params.shader.stage = hlsl::ESS_COMPUTE; // we'll cover the specialization constant API in another example if (!device->createComputePipelines(nullptr,{¶ms,1},&pipeline)) return logFail("Failed to create pipelines (compile & link shaders)!\n"); From 2ea63044df93b213ba428c80c3948e80cec57c95 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 22 Apr 2025 08:14:13 +0700 Subject: [PATCH 130/296] Fix example 03 to use IShader --- 03_DeviceSelectionAndSharedSources/Testers.h | 22 ++++++++++++-------- 03_DeviceSelectionAndSharedSources/main.cpp | 18 ++++++++++------ 2 files changed, 25 insertions(+), 15 deletions(-) diff --git a/03_DeviceSelectionAndSharedSources/Testers.h b/03_DeviceSelectionAndSharedSources/Testers.h index a76d4b668..b21da71c4 100644 --- a/03_DeviceSelectionAndSharedSources/Testers.h +++ b/03_DeviceSelectionAndSharedSources/Testers.h @@ -24,7 +24,7 @@ class IntrospectionTesterBase const std::string m_functionToTestName = ""; protected: - static std::pair, smart_refctd_ptr> compileHLSLShaderAndTestIntrospection( + static std::pair, smart_refctd_ptr> compileHLSLShaderAndTestIntrospection( 
video::IPhysicalDevice* physicalDevice, video::ILogicalDevice* device, system::ILogger* logger, asset::IAssetManager* assetMgr, const std::string& shaderPath, CSPIRVIntrospector& introspector) { IAssetLoader::SAssetLoadParams lp = {}; @@ -33,15 +33,18 @@ class IntrospectionTesterBase // this time we load a shader directly from a file auto assetBundle = assetMgr->getAsset(shaderPath, lp); const auto assets = assetBundle.getContents(); - if (assets.empty()) + const auto* metadata = assetBundle.getMetadata(); + if (assets.empty() || assetBundle.getAssetType() != IAsset::ET_SHADER) { logFail(logger, "Could not load shader!"); assert(0); } + const auto hlslMetadata = static_cast(metadata); + const auto shaderStage = hlslMetadata->shaderStages->front(); // It would be super weird if loading a shader from a file produced more than 1 asset assert(assets.size() == 1); - smart_refctd_ptr source = IAsset::castDown(assets[0]); + smart_refctd_ptr source = IAsset::castDown(assets[0]); smart_refctd_ptr introspection; { @@ -53,7 +56,7 @@ class IntrospectionTesterBase // The Shader Asset Loaders deduce the stage from the file extension, // if the extension is generic (.glsl or .hlsl) the stage is unknown. // But it can still be overriden from within the source with a `#pragma shader_stage` - options.stage = source->getStage() == IShader::E_SHADER_STAGE::ESS_COMPUTE ? source->getStage() : IShader::E_SHADER_STAGE::ESS_VERTEX; // TODO: do smth with it + options.stage = shaderStage == IShader::E_SHADER_STAGE::ESS_COMPUTE ? 
shaderStage : IShader::E_SHADER_STAGE::ESS_VERTEX; // TODO: do smth with it options.targetSpirvVersion = device->getPhysicalDevice()->getLimits().spirvVersion; // we need to perform an unoptimized compilation with source debug info or we'll lose names of variable sin the introspection options.spirvOptimizer = nullptr; @@ -186,7 +189,7 @@ class PredefinedLayoutTester final : public IntrospectionTesterBase constexpr uint32_t MERGE_TEST_SHADERS_CNT = mergeTestShadersPaths.size(); CSPIRVIntrospector introspector[MERGE_TEST_SHADERS_CNT]; - smart_refctd_ptr sources[MERGE_TEST_SHADERS_CNT]; + smart_refctd_ptr sources[MERGE_TEST_SHADERS_CNT]; for (uint32_t i = 0u; i < MERGE_TEST_SHADERS_CNT; ++i) { @@ -201,7 +204,7 @@ class PredefinedLayoutTester final : public IntrospectionTesterBase .binding = 0, .type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, .createFlags = ICPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, - .stageFlags = ICPUShader::E_SHADER_STAGE::ESS_COMPUTE, + .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, .count = 1, .immutableSamplers = nullptr } @@ -213,7 +216,7 @@ class PredefinedLayoutTester final : public IntrospectionTesterBase .binding = 0, .type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, .createFlags = ICPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, - .stageFlags = ICPUShader::E_SHADER_STAGE::ESS_COMPUTE, + .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, .count = 1, .immutableSamplers = nullptr }, @@ -221,7 +224,7 @@ class PredefinedLayoutTester final : public IntrospectionTesterBase .binding = 1, .type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, .createFlags = ICPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, - .stageFlags = ICPUShader::E_SHADER_STAGE::ESS_COMPUTE, + .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, .count = 2, .immutableSamplers = nullptr } @@ -251,9 +254,10 @@ class PredefinedLayoutTester final : public IntrospectionTesterBase bool 
pplnCreationSuccess[MERGE_TEST_SHADERS_CNT]; for (uint32_t i = 0u; i < MERGE_TEST_SHADERS_CNT; ++i) { - ICPUShader::SSpecInfo specInfo; + IPipelineBase::SShaderSpecInfo specInfo; specInfo.entryPoint = "main"; specInfo.shader = sources[i].get(); + specInfo.stage = hlsl::ShaderStage::ESS_COMPUTE; pplnCreationSuccess[i] = static_cast(introspector[i].createApproximateComputePipelineFromIntrospection(specInfo, core::smart_refctd_ptr(predefinedPplnLayout))); } diff --git a/03_DeviceSelectionAndSharedSources/main.cpp b/03_DeviceSelectionAndSharedSources/main.cpp index be56791a1..3712b5719 100644 --- a/03_DeviceSelectionAndSharedSources/main.cpp +++ b/03_DeviceSelectionAndSharedSources/main.cpp @@ -4,6 +4,7 @@ #include "nbl/application_templates/MonoDeviceApplication.hpp" #include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" +#include "nbl/asset/metadata/CHLSLMetadata.h" #include "CommonPCH/PCH.hpp" using namespace nbl; @@ -60,9 +61,10 @@ class DeviceSelectionAndSharedSourcesApp final : public application_templates::M //shaderIntrospection->debugPrint(m_logger.get()); // We've now skipped the manual creation of a descriptor set layout, pipeline layout - ICPUShader::SSpecInfo specInfo; + IPipelineBase::SShaderSpecInfo specInfo; specInfo.entryPoint = "main"; specInfo.shader = source.get(); + specInfo.stage = hlsl::ShaderStage::ESS_COMPUTE; smart_refctd_ptr cpuPipeline = introspector.createApproximateComputePipelineFromIntrospection(specInfo); @@ -236,7 +238,7 @@ class DeviceSelectionAndSharedSourcesApp final : public application_templates::M // Whether to keep invoking the above. In this example because its headless GPU compute, we do all the work in the app initialization. 
bool keepRunning() override { return false; } - std::pair, smart_refctd_ptr> compileShaderAndTestIntrospection( + std::pair, smart_refctd_ptr> compileShaderAndTestIntrospection( const std::string& shaderPath, CSPIRVIntrospector& introspector) { IAssetLoader::SAssetLoadParams lp = {}; @@ -245,15 +247,19 @@ class DeviceSelectionAndSharedSourcesApp final : public application_templates::M // this time we load a shader directly from a file auto assetBundle = m_assetMgr->getAsset(shaderPath, lp); const auto assets = assetBundle.getContents(); - if (assets.empty()) + if (assets.empty() || assetBundle.getAssetType() != IAsset::ET_SHADER) { logFail("Could not load shader!"); assert(0); } + const auto* metadata = assetBundle.getMetadata(); + const auto hlslMetadata = static_cast(metadata); + const auto shaderStage = hlslMetadata->shaderStages->front(); + // It would be super weird if loading a shader from a file produced more than 1 asset assert(assets.size() == 1); - smart_refctd_ptr source = IAsset::castDown(assets[0]); + smart_refctd_ptr source = IAsset::castDown(assets[0]); smart_refctd_ptr introspection; { @@ -265,7 +271,7 @@ class DeviceSelectionAndSharedSourcesApp final : public application_templates::M // The Shader Asset Loaders deduce the stage from the file extension, // if the extension is generic (.glsl or .hlsl) the stage is unknown. // But it can still be overriden from within the source with a `#pragma shader_stage` - options.stage = source->getStage() == IShader::E_SHADER_STAGE::ESS_COMPUTE ? source->getStage() : IShader::E_SHADER_STAGE::ESS_VERTEX; // TODO: do smth with it + options.stage = shaderStage == IShader::E_SHADER_STAGE::ESS_COMPUTE ? 
shaderStage : IShader::E_SHADER_STAGE::ESS_VERTEX; // TODO: do smth with it options.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion; // we need to perform an unoptimized compilation with source debug info or we'll lose names of variable sin the introspection options.spirvOptimizer = nullptr; @@ -277,7 +283,7 @@ class DeviceSelectionAndSharedSourcesApp final : public application_templates::M options.preprocessorOptions.includeFinder = compilerSet->getShaderCompiler(source->getContentType())->getDefaultIncludeFinder(); auto spirvUnspecialized = compilerSet->compileToSPIRV(source.get(), options); - const CSPIRVIntrospector::CStageIntrospectionData::SParams inspctParams = { .entryPoint = "main", .shader = spirvUnspecialized }; + const CSPIRVIntrospector::CStageIntrospectionData::SParams inspctParams = { .entryPoint = "main", .shader = spirvUnspecialized, .stage = shaderStage }; introspection = introspector.introspect(inspctParams); introspection->debugPrint(m_logger.get()); From 78990f8cf9d4d2b8c7f66b1adf3a29c4a05823a5 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 22 Apr 2025 08:14:27 +0700 Subject: [PATCH 131/296] Fix example 05 to use IShader --- 05_StreamingAndBufferDeviceAddressApp/main.cpp | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/05_StreamingAndBufferDeviceAddressApp/main.cpp b/05_StreamingAndBufferDeviceAddressApp/main.cpp index e8f7dbd33..96ccce9f5 100644 --- a/05_StreamingAndBufferDeviceAddressApp/main.cpp +++ b/05_StreamingAndBufferDeviceAddressApp/main.cpp @@ -91,7 +91,7 @@ class StreamingAndBufferDeviceAddressApp final : public application_templates::M return false; // this time we load a shader directly from a file - smart_refctd_ptr shader; + smart_refctd_ptr shader; { IAssetLoader::SAssetLoadParams lp = {}; lp.logger = m_logger.get(); @@ -102,14 +102,9 @@ class StreamingAndBufferDeviceAddressApp final : public application_templates::M return logFail("Could not load shader!"); // lets 
go straight from ICPUSpecializedShader to IGPUSpecializedShader - auto source = IAsset::castDown(assets[0]); + shader = IAsset::castDown(assets[0]); // The down-cast should not fail! - assert(source); - - // this time we skip the use of the asset converter since the ICPUShader->IGPUShader path is quick and simple - shader = m_device->createShader(source.get()); - if (!shader) - return logFail("Creation of a GPU Shader to from CPU Shader source failed!"); + assert(shader); } // The StreamingTransientDataBuffers are actually composed on top of another useful utility called `CAsyncSingleBufferSubAllocator` @@ -139,6 +134,8 @@ class StreamingAndBufferDeviceAddressApp final : public application_templates::M IGPUComputePipeline::SCreationParams params = {}; params.layout = layout.get(); params.shader.shader = shader.get(); + params.shader.entryPoint = "main"; + params.shader.stage = hlsl::ShaderStage::ESS_COMPUTE; if (!m_device->createComputePipelines(nullptr,{¶ms,1},&m_pipeline)) return logFail("Failed to create compute pipeline!\n"); } From 01df790d87005ac4e4ecb8bebdd7534ad7d8f7d7 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 22 Apr 2025 08:14:41 +0700 Subject: [PATCH 132/296] Fix example 07 to use IShader --- 07_StagingAndMultipleQueues/main.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/07_StagingAndMultipleQueues/main.cpp b/07_StagingAndMultipleQueues/main.cpp index 658a28a35..23f2246bc 100644 --- a/07_StagingAndMultipleQueues/main.cpp +++ b/07_StagingAndMultipleQueues/main.cpp @@ -246,7 +246,7 @@ class StagingAndMultipleQueuesApp final : public application_templates::BasicMul .binding = 0, .type = nbl::asset::IDescriptor::E_TYPE::ET_SAMPLED_IMAGE, .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, - .stageFlags = IGPUShader::E_SHADER_STAGE::ESS_COMPUTE, + .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, .count = 1, .immutableSamplers = nullptr }, @@ -254,7 +254,7 @@ class 
StagingAndMultipleQueuesApp final : public application_templates::BasicMul .binding = 1, .type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, - .stageFlags = IGPUShader::E_SHADER_STAGE::ESS_COMPUTE, + .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, .count = 1, .immutableSamplers = nullptr } @@ -281,18 +281,17 @@ class StagingAndMultipleQueuesApp final : public application_templates::BasicMul } // LOAD SHADER FROM FILE - smart_refctd_ptr source; + smart_refctd_ptr source; { - source = loadFistAssetInBundle("../app_resources/comp_shader.hlsl"); - source->setShaderStage(IShader::E_SHADER_STAGE::ESS_COMPUTE); // can also be done via a #pragma in the shader + source = loadFistAssetInBundle("../app_resources/comp_shader.hlsl"); } if (!source) logFailAndTerminate("Could not create a CPU shader!"); - core::smart_refctd_ptr shader = m_device->createShader(source.get()); + core::smart_refctd_ptr shader = m_device->compileShader({ source.get() }); if(!shader) - logFailAndTerminate("Could not create a GPU shader!"); + logFailAndTerminate("Could not compile shader to spirv!"); // CREATE COMPUTE PIPELINE SPushConstantRange pc[1]; @@ -312,6 +311,7 @@ class StagingAndMultipleQueuesApp final : public application_templates::BasicMul // Theoretically a blob of SPIR-V can contain multiple named entry points and one has to be chosen, in practice most compilers only support outputting one (and glslang used to require it be called "main") params.shader.entryPoint = "main"; params.shader.shader = shader.get(); + params.shader.stage = hlsl::ShaderStage::ESS_COMPUTE; // we'll cover the specialization constant API in another example if (!m_device->createComputePipelines(nullptr,{¶ms,1},&pipeline)) logFailAndTerminate("Failed to create pipelines (compile & link shaders)!\n"); @@ -432,15 +432,15 @@ class StagingAndMultipleQueuesApp final : public application_templates::BasicMul submitInfo[0].waitSemaphores = 
waitSemaphoreSubmitInfo; // there's no save to wait on, or need to prevent signal-after-submit because Renderdoc freezes because it // starts capturing immediately upon a submit and can't defer a capture till semaphores signal. - if (imageToProcessIdisRunningInRenderdoc()) + if (imageToProcessIdisRunningInGraphicsDebugger()) submitInfo[0].waitSemaphores = {waitSemaphoreSubmitInfo,1}; - if (m_api->isRunningInRenderdoc() && imageToProcessId>=SUBMITS_IN_FLIGHT) + if (m_api->isRunningInGraphicsDebugger() && imageToProcessId>=SUBMITS_IN_FLIGHT) for (auto old = histogramsSaved.load(); old < histogramSaveWaitSemaphoreValue; old = histogramsSaved.load()) histogramsSaved.wait(old); // Some Devices like all of the Intel GPUs do not have enough queues for us to allocate different queues to compute and transfers, // so our `BasicMultiQueueApplication` will "alias" a single queue to both usages. Normally you don't need to care, but here we're // attempting to do "out-of-order" "submit-before-signal" so we need to "hold back" submissions if the queues are aliased! 
- if (getTransferUpQueue()==computeQueue || m_api->isRunningInRenderdoc()) + if (getTransferUpQueue()==computeQueue || m_api->isRunningInGraphicsDebugger()) for (auto old = transfersSubmitted.load(); old <= imageToProcessId; old = transfersSubmitted.load()) transfersSubmitted.wait(old); computeQueue->submit(submitInfo); From f46522c3043b132b4aa4a7765ac41f1c9173ae66 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 22 Apr 2025 08:14:53 +0700 Subject: [PATCH 133/296] Fix example 70 to use IShader --- .../compute/advectParticles.comp.hlsl | 1 + .../compute/applyBodyForces.comp.hlsl | 1 + .../app_resources/compute/diffusion.comp.hlsl | 1 + .../compute/genParticleVertices.comp.hlsl | 1 + .../compute/particlesInit.comp.hlsl | 1 + .../compute/prepareCellUpdate.comp.hlsl | 1 + .../compute/pressureSolver.comp.hlsl | 1 + .../compute/updateFluidCells.comp.hlsl | 1 + .../fluidParticles.fragment.hlsl | 1 + .../app_resources/fluidParticles.vertex.hlsl | 1 + 70_FLIPFluids/main.cpp | 34 +++++++++++-------- 11 files changed, 30 insertions(+), 14 deletions(-) diff --git a/70_FLIPFluids/app_resources/compute/advectParticles.comp.hlsl b/70_FLIPFluids/app_resources/compute/advectParticles.comp.hlsl index 2d329ac85..64e94f262 100644 --- a/70_FLIPFluids/app_resources/compute/advectParticles.comp.hlsl +++ b/70_FLIPFluids/app_resources/compute/advectParticles.comp.hlsl @@ -26,6 +26,7 @@ using namespace nbl::hlsl; // TODO: delta time push constant? 
(but then for CI need a commandline `-fixed-timestep=MS` and `-frames=N` option too) [numthreads(WorkgroupSize, 1, 1)] +[shader("compute")] void main(uint32_t3 ID : SV_DispatchThreadID) { uint32_t pid = ID.x; diff --git a/70_FLIPFluids/app_resources/compute/applyBodyForces.comp.hlsl b/70_FLIPFluids/app_resources/compute/applyBodyForces.comp.hlsl index 8ffc5e821..b2c1e0b3f 100644 --- a/70_FLIPFluids/app_resources/compute/applyBodyForces.comp.hlsl +++ b/70_FLIPFluids/app_resources/compute/applyBodyForces.comp.hlsl @@ -14,6 +14,7 @@ cbuffer GridData // TODO: can this kernel be fused with any preceeding/succeeding it? [numthreads(WorkgroupGridDim, WorkgroupGridDim, WorkgroupGridDim)] +[shader("compute")] void main(uint32_t3 ID : SV_DispatchThreadID) { // only gravity for now diff --git a/70_FLIPFluids/app_resources/compute/diffusion.comp.hlsl b/70_FLIPFluids/app_resources/compute/diffusion.comp.hlsl index 43a57ed38..e53c91d2d 100644 --- a/70_FLIPFluids/app_resources/compute/diffusion.comp.hlsl +++ b/70_FLIPFluids/app_resources/compute/diffusion.comp.hlsl @@ -34,6 +34,7 @@ groupshared uint16_t3 sAxisCellMat[14][14][14]; // TODO: `uint16_t` per axis is groupshared float16_t3 sDiffusion[14][14][14]; [numthreads(WorkgroupGridDim, WorkgroupGridDim, WorkgroupGridDim)] +[shader("compute")] void setAxisCellMaterial(uint32_t3 ID : SV_DispatchThreadID) { int3 cellIdx = ID; diff --git a/70_FLIPFluids/app_resources/compute/genParticleVertices.comp.hlsl b/70_FLIPFluids/app_resources/compute/genParticleVertices.comp.hlsl index b66db1ca2..4c4a76690 100644 --- a/70_FLIPFluids/app_resources/compute/genParticleVertices.comp.hlsl +++ b/70_FLIPFluids/app_resources/compute/genParticleVertices.comp.hlsl @@ -57,6 +57,7 @@ static const float2 quadUVs[4] = { using namespace nbl::hlsl; [numthreads(WorkgroupSize, 1, 1)] +[shader("compute")] void main(uint32_t3 ID : SV_DispatchThreadID) { uint32_t pid = ID.x; diff --git a/70_FLIPFluids/app_resources/compute/particlesInit.comp.hlsl 
b/70_FLIPFluids/app_resources/compute/particlesInit.comp.hlsl index 173929b10..27bf4366f 100644 --- a/70_FLIPFluids/app_resources/compute/particlesInit.comp.hlsl +++ b/70_FLIPFluids/app_resources/compute/particlesInit.comp.hlsl @@ -17,6 +17,7 @@ cbuffer GridData }; [numthreads(WorkgroupSize, 1, 1)] +[shader("compute")] void main(uint32_t3 ID : SV_DispatchThreadID) { uint32_t pid = ID.x; diff --git a/70_FLIPFluids/app_resources/compute/prepareCellUpdate.comp.hlsl b/70_FLIPFluids/app_resources/compute/prepareCellUpdate.comp.hlsl index fe82fe946..157da5bb8 100644 --- a/70_FLIPFluids/app_resources/compute/prepareCellUpdate.comp.hlsl +++ b/70_FLIPFluids/app_resources/compute/prepareCellUpdate.comp.hlsl @@ -42,6 +42,7 @@ float getWeight(float3 pPos, float3 cPos, float invSpacing) } [numthreads(WorkgroupSize, 1, 1)] +[shader("compute")] void main(uint32_t3 ID : SV_DispatchThreadID) { uint pid = ID.x; diff --git a/70_FLIPFluids/app_resources/compute/pressureSolver.comp.hlsl b/70_FLIPFluids/app_resources/compute/pressureSolver.comp.hlsl index 668b15c31..b5db995c5 100644 --- a/70_FLIPFluids/app_resources/compute/pressureSolver.comp.hlsl +++ b/70_FLIPFluids/app_resources/compute/pressureSolver.comp.hlsl @@ -36,6 +36,7 @@ groupshared float sDivergence[14][14][14]; groupshared float sPressure[14][14][14]; [numthreads(WorkgroupGridDim, WorkgroupGridDim, WorkgroupGridDim)] +[shader("compute")] void calculateNegativeDivergence(uint32_t3 ID : SV_DispatchThreadID) { int3 cellIdx = ID; diff --git a/70_FLIPFluids/app_resources/compute/updateFluidCells.comp.hlsl b/70_FLIPFluids/app_resources/compute/updateFluidCells.comp.hlsl index 9d7fabd52..62ddfd822 100644 --- a/70_FLIPFluids/app_resources/compute/updateFluidCells.comp.hlsl +++ b/70_FLIPFluids/app_resources/compute/updateFluidCells.comp.hlsl @@ -40,6 +40,7 @@ void updateFluidCells(uint32_t3 ID : SV_DispatchThreadID) } [numthreads(WorkgroupGridDim, WorkgroupGridDim, WorkgroupGridDim)] +[shader("compute")] void 
updateNeighborFluidCells(uint32_t3 ID : SV_DispatchThreadID) { int3 cIdx = ID; diff --git a/70_FLIPFluids/app_resources/fluidParticles.fragment.hlsl b/70_FLIPFluids/app_resources/fluidParticles.fragment.hlsl index e556ce8ed..cac1bfa4a 100644 --- a/70_FLIPFluids/app_resources/fluidParticles.fragment.hlsl +++ b/70_FLIPFluids/app_resources/fluidParticles.fragment.hlsl @@ -9,6 +9,7 @@ cbuffer CameraData // TODO: BDA instead of UBO, one less thing in DSLayout SMVPParams camParams; }; +[shader("pixel")] float4 main(PSInput input, out float depthTest : SV_DEPTHGREATEREQUAL) : SV_TARGET { float3 N; diff --git a/70_FLIPFluids/app_resources/fluidParticles.vertex.hlsl b/70_FLIPFluids/app_resources/fluidParticles.vertex.hlsl index 4708083c6..89d37eb6f 100644 --- a/70_FLIPFluids/app_resources/fluidParticles.vertex.hlsl +++ b/70_FLIPFluids/app_resources/fluidParticles.vertex.hlsl @@ -14,6 +14,7 @@ struct SPushConstants #include "nbl/builtin/hlsl/bda/__ptr.hlsl" using namespace nbl::hlsl; +[shader("vertex")] PSInput main(uint vertexID : SV_VertexID) { PSInput output; diff --git a/70_FLIPFluids/main.cpp b/70_FLIPFluids/main.cpp index 93e753b68..a0d2ad95d 100644 --- a/70_FLIPFluids/main.cpp +++ b/70_FLIPFluids/main.cpp @@ -9,6 +9,8 @@ #include #include +#include "nbl/asset/metadata/CHLSLMetadata.h" + using namespace nbl::hlsl; using namespace nbl; using namespace core; @@ -372,6 +374,7 @@ class FLIPFluidsApp final : public examples::SimpleWindowedApplication, public a params.layout = pipelineLayout.get(); params.shader.entryPoint = entryPoint; params.shader.shader = shader.get(); + params.shader.stage = ESS_COMPUTE; m_device->createComputePipelines(nullptr, { ¶ms,1 }, &pipeline); }; @@ -628,6 +631,7 @@ class FLIPFluidsApp final : public examples::SimpleWindowedApplication, public a params.layout = pipelineLayout.get(); params.shader.entryPoint = iterateKernel; params.shader.shader = iterateShader.get(); + params.shader.stage = ESS_COMPUTE; m_device->createComputePipelines(nullptr, 
{ &params,1 }, &m_iterateDiffusionPipeline); } @@ -636,6 +640,7 @@ class FLIPFluidsApp final : public examples::SimpleWindowedApplication, public a params.layout = pipelineLayout.get(); params.shader.entryPoint = applyKernel; params.shader.shader = applyShader.get(); + params.shader.stage = ESS_COMPUTE; m_device->createComputePipelines(nullptr, { &params,1 }, &m_diffusionPipeline); } @@ -1401,7 +1406,7 @@ class FLIPFluidsApp final : public examples::SimpleWindowedApplication, public a numParticles = m_gridData.particleInitSize.x * m_gridData.particleInitSize.y * m_gridData.particleInitSize.z * particlesPerCell; } - smart_refctd_ptr compileShader(const std::string& filePath, const std::string& entryPoint = "main") + smart_refctd_ptr compileShader(const std::string& filePath, const std::string& entryPoint = "main") { IAssetLoader::SAssetLoadParams lparams = {}; lparams.logger = m_logger.get(); @@ -1415,14 +1420,16 @@ class FLIPFluidsApp final : public examples::SimpleWindowedApplication, public a const auto assets = bundle.getContents(); assert(assets.size() == 1); - smart_refctd_ptr shaderSrc = IAsset::castDown(assets[0]); + smart_refctd_ptr shaderSrc = IAsset::castDown(assets[0]); + const auto hlslMetadata = static_cast(bundle.getMetadata()); + const auto shaderStage = hlslMetadata->shaderStages->front(); - smart_refctd_ptr shader = shaderSrc; + smart_refctd_ptr shader = shaderSrc; if (entryPoint != "main") { auto compiler = make_smart_refctd_ptr(smart_refctd_ptr(m_system)); CHLSLCompiler::SOptions options = {}; - options.stage = shaderSrc->getStage(); + options.stage = shaderStage; if (!(options.stage == IShader::E_SHADER_STAGE::ESS_COMPUTE || options.stage == IShader::E_SHADER_STAGE::ESS_FRAGMENT)) options.stage = IShader::E_SHADER_STAGE::ESS_VERTEX; options.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion; @@ -1443,7 +1450,7 @@ class FLIPFluidsApp final : public examples::SimpleWindowedApplication, public a shader = 
compiler->compileToSPIRV((const char*)shaderSrc->getContent()->getPointer(), options); } - return m_device->createShader(shader.get()); + return m_device->compileShader({ shader.get() }); } // TODO: there's a method in IUtilities for this @@ -1562,7 +1569,7 @@ class FLIPFluidsApp final : public examples::SimpleWindowedApplication, public a // init shaders and pipeline - auto compileShader = [&](const std::string& filePath, IShader::E_SHADER_STAGE stage) -> smart_refctd_ptr + auto compileShader = [&](const std::string& filePath) -> smart_refctd_ptr { IAssetLoader::SAssetLoadParams lparams = {}; lparams.logger = m_logger.get(); @@ -1576,15 +1583,14 @@ class FLIPFluidsApp final : public examples::SimpleWindowedApplication, public a const auto assets = bundle.getContents(); assert(assets.size() == 1); - smart_refctd_ptr shaderSrc = IAsset::castDown(assets[0]); - shaderSrc->setShaderStage(stage); + smart_refctd_ptr shaderSrc = IAsset::castDown(assets[0]); if (!shaderSrc) return nullptr; - return m_device->createShader(shaderSrc.get()); + return m_device->compileShader({ shaderSrc.get() }); }; - auto vs = compileShader("app_resources/fluidParticles.vertex.hlsl", IShader::E_SHADER_STAGE::ESS_VERTEX); - auto fs = compileShader("app_resources/fluidParticles.fragment.hlsl", IShader::E_SHADER_STAGE::ESS_FRAGMENT); + auto vs = compileShader("app_resources/fluidParticles.vertex.hlsl"); + auto fs = compileShader("app_resources/fluidParticles.fragment.hlsl"); smart_refctd_ptr descriptorSetLayout1; { @@ -1629,9 +1635,9 @@ class FLIPFluidsApp final : public examples::SimpleWindowedApplication, public a blendParams.blendParams[0u].colorWriteMask = (1u << 0u) | (1u << 1u) | (1u << 2u) | (1u << 3u); { - IGPUShader::SSpecInfo specInfo[3] = { - {.shader = vs.get()}, - {.shader = fs.get()}, + IPipelineBase::SShaderSpecInfo specInfo[] = { + {.shader = vs.get(), .entryPoint = "main", .stage = ESS_VERTEX, }, + {.shader = fs.get(), .entryPoint = "main", .stage = ESS_FRAGMENT, }, }; const 
asset::SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_VERTEX, .offset = 0, .size = sizeof(uint64_t) }; From b0b6f648d62ee274bd3b6b34c30f992183402126 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 22 Apr 2025 08:15:02 +0700 Subject: [PATCH 134/296] Fix example 30 to use IShader --- 30_ComputeShaderPathTracer/main.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/30_ComputeShaderPathTracer/main.cpp b/30_ComputeShaderPathTracer/main.cpp index 26d673002..44a4dd6ef 100644 --- a/30_ComputeShaderPathTracer/main.cpp +++ b/30_ComputeShaderPathTracer/main.cpp @@ -313,12 +313,11 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication std::exit(-1); } - auto source = IAsset::castDown(assets[0]); + auto source = IAsset::castDown(assets[0]); // The down-cast should not fail! assert(source); - // this time we skip the use of the asset converter since the ICPUShader->IGPUShader path is quick and simple - auto shader = m_device->createShader(source.get()); + auto shader = m_device->compileShader({ .source = source.get(), .stage = ESS_COMPUTE }); if (!shader) { m_logger->log("Shader creationed failed: %s!", ILogger::ELL_ERROR, pathToShader); @@ -352,9 +351,10 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication params.layout = ptPipelineLayout.get(); params.shader.shader = ptShader.get(); params.shader.entryPoint = "main"; + params.shader.stage = ESS_COMPUTE; params.shader.entries = nullptr; params.shader.requireFullSubgroups = true; - params.shader.requiredSubgroupSize = static_cast(5); + params.shader.requiredSubgroupSize = static_cast(5); if (!m_device->createComputePipelines(nullptr, { &params, 1 }, m_PTPipelines.data() + index)) { return logFail("Failed to create compute pipeline!\n"); } @@ -373,9 +373,10 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication if (!fragmentShader) return logFail("Failed to Load and Compile Fragment 
Shader: lumaMeterShader!"); - const IGPUShader::SSpecInfo fragSpec = { + const IPipelineBase::SShaderSpecInfo fragSpec = { + .shader = fragmentShader.get(), .entryPoint = "main", - .shader = fragmentShader.get() + .stage = ESS_FRAGMENT, }; auto presentLayout = m_device->createPipelineLayout( From a675cdb16ea67d88f5c730758f4c2cdbfa22d8a6 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 22 Apr 2025 08:15:22 +0700 Subject: [PATCH 135/296] Fix example 10 to use IShader --- .../app_resources/prefix_sum_shader.comp.hlsl | 1 + .../app_resources/scatter_shader.comp.hlsl | 1 + 10_CountingSort/main.cpp | 11 ++++++----- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/10_CountingSort/app_resources/prefix_sum_shader.comp.hlsl b/10_CountingSort/app_resources/prefix_sum_shader.comp.hlsl index 1e5d2510e..b0301fc3f 100644 --- a/10_CountingSort/app_resources/prefix_sum_shader.comp.hlsl +++ b/10_CountingSort/app_resources/prefix_sum_shader.comp.hlsl @@ -4,6 +4,7 @@ [[vk::push_constant]] CountingPushData pushData; [numthreads(WorkgroupSize,1,1)] +[shader("compute")] void main(uint32_t3 ID : SV_GroupThreadID, uint32_t3 GroupID : SV_GroupID) { sort::CountingParameters < uint32_t > params; diff --git a/10_CountingSort/app_resources/scatter_shader.comp.hlsl b/10_CountingSort/app_resources/scatter_shader.comp.hlsl index fa502726f..ddecfca2b 100644 --- a/10_CountingSort/app_resources/scatter_shader.comp.hlsl +++ b/10_CountingSort/app_resources/scatter_shader.comp.hlsl @@ -6,6 +6,7 @@ using DoublePtrAccessor = DoubleBdaAccessor; [numthreads(WorkgroupSize, 1, 1)] +[shader("compute")] void main(uint32_t3 ID : SV_GroupThreadID, uint32_t3 GroupID : SV_GroupID) { sort::CountingParameters params; diff --git a/10_CountingSort/main.cpp b/10_CountingSort/main.cpp index 4d0c93516..1fd789ad1 100644 --- a/10_CountingSort/main.cpp +++ b/10_CountingSort/main.cpp @@ -37,7 +37,7 @@ class CountingSortApp final : public application_templates::MonoDeviceApplicatio const uint32_t bucket_count = 
std::min((uint32_t)3000, MaxBucketCount); const uint32_t elements_per_thread = ceil((float)ceil((float)element_count / limits.computeUnits) / WorkgroupSize); - auto prepShader = [&](const core::string& path) -> smart_refctd_ptr + auto prepShader = [&](const core::string& path) -> smart_refctd_ptr { // this time we load a shader directly from a file IAssetLoader::SAssetLoadParams lp = {}; @@ -51,7 +51,7 @@ class CountingSortApp final : public application_templates::MonoDeviceApplicatio return nullptr; } - auto source = IAsset::castDown(assets[0]); + auto source = IAsset::castDown(assets[0]); // The down-cast should not fail! assert(source); @@ -63,8 +63,8 @@ class CountingSortApp final : public application_templates::MonoDeviceApplicatio WorkgroupSize, bucket_count ); - // this time we skip the use of the asset converter since the ICPUShader->IGPUShader path is quick and simple - auto shader = m_device->createShader(overrideSource.get()); + // this time we skip the use of the asset converter since the IShader->IGPUShader path is quick and simple + auto shader = m_device->compileShader({ overrideSource.get() }); if (!shader) { logFail("Creation of Prefix Sum Shader from CPU Shader source failed!"); @@ -92,9 +92,10 @@ class CountingSortApp final : public application_templates::MonoDeviceApplicatio params.layout = layout.get(); params.shader.shader = prefixSumShader.get(); params.shader.entryPoint = "main"; + params.shader.stage = hlsl::ShaderStage::ESS_COMPUTE; params.shader.entries = nullptr; params.shader.requireFullSubgroups = true; - params.shader.requiredSubgroupSize = static_cast(5); + params.shader.requiredSubgroupSize = static_cast(5); if (!m_device->createComputePipelines(nullptr, { &params,1 }, &prefixSumPipeline)) return logFail("Failed to create compute pipeline!\n"); params.shader.shader = scatterShader.get(); From 4104de5f21d801c25a652649af271f7fe69560c1 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 22 Apr 2025 08:15:32 +0700 Subject: [PATCH 136/296] Fix 
example 11 to use IShader --- 11_FFT/app_resources/shader.comp.hlsl | 1 + 11_FFT/main.cpp | 22 ++++++++++++---------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/11_FFT/app_resources/shader.comp.hlsl b/11_FFT/app_resources/shader.comp.hlsl index ecbf4f092..da3de00cd 100644 --- a/11_FFT/app_resources/shader.comp.hlsl +++ b/11_FFT/app_resources/shader.comp.hlsl @@ -60,6 +60,7 @@ struct Accessor }; [numthreads(ConstevalParameters::WorkgroupSize,1,1)] +[shader("compute")] void main(uint32_t3 ID : SV_DispatchThreadID) { Accessor accessor = Accessor::create(pushConstants.deviceBufferAddress); diff --git a/11_FFT/main.cpp b/11_FFT/main.cpp index 80f5f856c..1cac98b1f 100644 --- a/11_FFT/main.cpp +++ b/11_FFT/main.cpp @@ -46,13 +46,13 @@ class FFT_Test final : public application_templates::MonoDeviceApplication, publ smart_refctd_ptr m_timeline; uint64_t semaphorValue = 0; - inline core::smart_refctd_ptr createShader( + inline core::smart_refctd_ptr createShader( const char* includeMainName) { std::string prelude = "#include \""; - auto CPUShader = core::make_smart_refctd_ptr((prelude + includeMainName + "\"\n").c_str(), IShader::E_SHADER_STAGE::ESS_COMPUTE, IShader::E_CONTENT_TYPE::ECT_HLSL, includeMainName); - assert(CPUShader); - return m_device->createShader(CPUShader.get()); + auto hlslShader = core::make_smart_refctd_ptr((prelude + includeMainName + "\"\n").c_str(), IShader::E_CONTENT_TYPE::ECT_HLSL, includeMainName); + assert(hlslShader); + return m_device->compileShader({ hlslShader.get() }); } public: @@ -70,7 +70,7 @@ class FFT_Test final : public application_templates::MonoDeviceApplication, publ return false; // this time we load a shader directly from a file - smart_refctd_ptr shader; + smart_refctd_ptr shader; /* { IAssetLoader::SAssetLoadParams lp = {}; lp.logger = m_logger.get(); @@ -81,14 +81,14 @@ class FFT_Test final : public application_templates::MonoDeviceApplication, publ return logFail("Could not load shader!"); // Cast down the 
asset to its proper type - auto source = IAsset::castDown(assets[0]); + auto source = IAsset::castDown(assets[0]); // The down-cast should not fail! assert(source); - // Compile directly to IGPUShader - shader = m_device->createShader(source.get()); + // Compile directly to SPIR-V Shader + shader = m_device->compileShader({ source.get() }); if (!shader) - return logFail("Creation of a GPU Shader to from CPU Shader source failed!"); + return logFail("Creation of a SPIR-V Shader from HLSL Shader source failed!"); }*/ shader = createShader("app_resources/shader.comp.hlsl"); @@ -132,7 +132,9 @@ class FFT_Test final : public application_templates::MonoDeviceApplication, publ IGPUComputePipeline::SCreationParams params = {}; params.layout = layout.get(); params.shader.shader = shader.get(); - params.shader.requiredSubgroupSize = static_cast(hlsl::findMSB(m_physicalDevice->getLimits().maxSubgroupSize)); + params.shader.entryPoint = "main"; + params.shader.stage = hlsl::ESS_COMPUTE; + params.shader.requiredSubgroupSize = static_cast(hlsl::findMSB(m_physicalDevice->getLimits().maxSubgroupSize)); params.shader.requireFullSubgroups = true; if (!m_device->createComputePipelines(nullptr, { &params,1 }, &m_pipeline)) return logFail("Failed to create compute pipeline!\n"); From 1d6fde6c0ff7496be33a1ff66b50a6adbc3a678f Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 22 Apr 2025 08:15:41 +0700 Subject: [PATCH 137/296] Fix example 22 to use IShader --- 22_CppCompat/ITester.h | 19 ++++++++++--------- 22_CppCompat/main.cpp | 10 +++++----- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/22_CppCompat/ITester.h b/22_CppCompat/ITester.h index a216fbf40..273f51663 100644 --- a/22_CppCompat/ITester.h +++ b/22_CppCompat/ITester.h @@ -5,6 +5,7 @@ #include "app_resources/common.hlsl" #include "nbl/application_templates/MonoDeviceApplication.hpp" #include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" +#include "nbl/asset/metadata/CHLSLMetadata.h" 
using namespace nbl; @@ -45,14 +46,15 @@ class ITester logFail("Failed to create Command Buffers!\n"); // Load shaders, set up pipeline - core::smart_refctd_ptr shader; + core::smart_refctd_ptr shader; + auto shaderStage = ESS_UNKNOWN; { asset::IAssetLoader::SAssetLoadParams lp = {}; lp.logger = m_logger.get(); lp.workingDirectory = ""; // virtual root auto assetBundle = m_assetMgr->getAsset(pipleineSetupData.testShaderPath, lp); const auto assets = assetBundle.getContents(); - if (assets.empty()) + if (assets.empty() || assetBundle.getAssetType() != asset::IAsset::ET_SHADER) { logFail("Could not load shader!"); assert(0); @@ -60,12 +62,14 @@ class ITester // It would be super weird if loading a shader from a file produced more than 1 asset assert(assets.size() == 1); - core::smart_refctd_ptr source = asset::IAsset::castDown(assets[0]); + core::smart_refctd_ptr source = asset::IAsset::castDown(assets[0]); + const auto hlslMetadata = static_cast(assetBundle.getMetadata()); + shaderStage = hlslMetadata->shaderStages->front(); auto* compilerSet = m_assetMgr->getCompilerSet(); asset::IShaderCompiler::SCompilerOptions options = {}; - options.stage = source->getStage(); + options.stage = shaderStage; options.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion; options.spirvOptimizer = nullptr; options.debugInfoFlags |= asset::IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_SOURCE_BIT; @@ -73,11 +77,7 @@ class ITester options.preprocessorOptions.logger = m_logger.get(); options.preprocessorOptions.includeFinder = compilerSet->getShaderCompiler(source->getContentType())->getDefaultIncludeFinder(); - auto spirv = compilerSet->compileToSPIRV(source.get(), options); - - video::ILogicalDevice::SShaderCreationParameters params{}; - params.cpushader = spirv.get(); - shader = m_device->createShader(params); + shader = compilerSet->compileToSPIRV(source.get(), options); } if (!shader) @@ -113,6 +113,7 @@ class ITester params.layout = m_pplnLayout.get(); 
params.shader.entryPoint = "main"; params.shader.shader = shader.get(); + params.shader.stage = shaderStage; if (!m_device->createComputePipelines(nullptr, { &params,1 }, &m_pipeline)) logFail("Failed to create pipelines (compile & link shaders)!\n"); } diff --git a/22_CppCompat/main.cpp b/22_CppCompat/main.cpp index 7fa2556c4..877831c55 100644 --- a/22_CppCompat/main.cpp +++ b/22_CppCompat/main.cpp @@ -84,7 +84,7 @@ class CompatibilityTest final : public MonoDeviceApplication, public MonoAssetMa m_commandPool = m_device->createCommandPool(m_queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); m_commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { &m_cmdbuf,1 }, smart_refctd_ptr(m_logger)); - smart_refctd_ptr shader; + smart_refctd_ptr shader; { IAssetLoader::SAssetLoadParams lp = {}; lp.logger = m_logger.get(); @@ -94,14 +94,12 @@ class CompatibilityTest final : public MonoDeviceApplication, public MonoAssetMa if (assets.empty()) return logFail("Could not load shader!"); - // lets go straight from ICPUSpecializedShader to IGPUSpecializedShader - auto source = IAsset::castDown(assets[0]); + auto source = IAsset::castDown(assets[0]); // The down-cast should not fail! 
assert(source); - assert(source->getStage() == IShader::E_SHADER_STAGE::ESS_COMPUTE); // this time we skip the use of the asset converter since the ICPUShader->IGPUShader path is quick and simple - shader = m_device->createShader(source.get()); + shader = m_device->compileShader({ source.get() }); if (!shader) return logFail("Creation of a GPU Shader to from CPU Shader source failed!"); } @@ -129,6 +127,8 @@ class CompatibilityTest final : public MonoDeviceApplication, public MonoAssetMa IGPUComputePipeline::SCreationParams params = {}; params.layout = layout.get(); params.shader.shader = shader.get(); + params.shader.entryPoint = "main"; + params.shader.stage = hlsl::ShaderStage::ESS_COMPUTE; if (!m_device->createComputePipelines(nullptr, { ¶ms,1 }, &m_pipeline)) return logFail("Failed to create compute pipeline!\n"); } From 32c49741134c5498fdc47b5b29ff00313fbcec96 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 22 Apr 2025 08:15:55 +0700 Subject: [PATCH 138/296] Fix example 23 to use IShader --- .../app_resources/testSubgroup.comp.hlsl | 1 + .../app_resources/testWorkgroup.comp.hlsl | 1 + 23_ArithmeticUnitTest/main.cpp | 19 ++++++++++--------- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/23_ArithmeticUnitTest/app_resources/testSubgroup.comp.hlsl b/23_ArithmeticUnitTest/app_resources/testSubgroup.comp.hlsl index 479265d73..29114756d 100644 --- a/23_ArithmeticUnitTest/app_resources/testSubgroup.comp.hlsl +++ b/23_ArithmeticUnitTest/app_resources/testSubgroup.comp.hlsl @@ -12,6 +12,7 @@ uint32_t globalIndex() bool canStore() {return true;} [numthreads(WORKGROUP_SIZE,1,1)] +[shader("compute")] void main() { test(); diff --git a/23_ArithmeticUnitTest/app_resources/testWorkgroup.comp.hlsl b/23_ArithmeticUnitTest/app_resources/testWorkgroup.comp.hlsl index 9bafae47f..d47dea29e 100644 --- a/23_ArithmeticUnitTest/app_resources/testWorkgroup.comp.hlsl +++ b/23_ArithmeticUnitTest/app_resources/testWorkgroup.comp.hlsl @@ -75,6 +75,7 @@ bool canStore() } 
[numthreads(WORKGROUP_SIZE,1,1)] +[shader("compute")] void main() { const type_t sourceVal = test(); diff --git a/23_ArithmeticUnitTest/main.cpp b/23_ArithmeticUnitTest/main.cpp index 147d231e2..e2d7d3cfe 100644 --- a/23_ArithmeticUnitTest/main.cpp +++ b/23_ArithmeticUnitTest/main.cpp @@ -184,7 +184,7 @@ class ArithmeticUnitTestApp final : public application_templates::BasicMultiQueu exit(-1); } auto firstAssetInBundle = bundle.getContents()[0]; - return smart_refctd_ptr_static_cast(firstAssetInBundle); + return smart_refctd_ptr_static_cast(firstAssetInBundle); }; auto subgroupTestSource = getShaderSource("app_resources/testSubgroup.comp.hlsl"); @@ -276,17 +276,18 @@ class ArithmeticUnitTestApp final : public application_templates::BasicMultiQueu } // create pipeline (specialized every test) [TODO: turn into a future/async] - smart_refctd_ptr createPipeline(const ICPUShader* overridenUnspecialized, const uint8_t subgroupSizeLog2) + smart_refctd_ptr createPipeline(const IShader* overridenUnspecialized, const uint8_t subgroupSizeLog2) { - auto shader = m_device->createShader(overridenUnspecialized); + auto shader = m_device->compileShader({ overridenUnspecialized }); IGPUComputePipeline::SCreationParams params = {}; params.layout = pipelineLayout.get(); params.shader = { - .entryPoint = "main", .shader = shader.get(), + .entryPoint = "main", + .stage = hlsl::ESS_COMPUTE, + .requiredSubgroupSize = static_cast(subgroupSizeLog2), + .requireFullSubgroups = true, .entries = nullptr, - .requiredSubgroupSize = static_cast(subgroupSizeLog2), - .requireFullSubgroups = true }; core::smart_refctd_ptr pipeline; if (!m_device->createComputePipelines(m_spirv_isa_cache.get(),{&params,1},&pipeline)) @@ -295,17 +296,17 @@ class ArithmeticUnitTestApp final : public application_templates::BasicMultiQueu } /*template class Arithmetic, bool WorkgroupTest> - bool runTest(const smart_refctd_ptr& source, const uint32_t elementCount, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u) + 
bool runTest(const smart_refctd_ptr& source, const uint32_t elementCount, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u) { return true; }*/ template class Arithmetic, bool WorkgroupTest> - bool runTest(const smart_refctd_ptr& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u) + bool runTest(const smart_refctd_ptr& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u) { std::string arith_name = Arithmetic>::name; - smart_refctd_ptr overridenUnspecialized; + smart_refctd_ptr overridenUnspecialized; if constexpr (WorkgroupTest) { overridenUnspecialized = CHLSLCompiler::createOverridenCopy( From 8255a3e97851dd7dd500ed26c12b64a58c1e4f63 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 22 Apr 2025 08:16:04 +0700 Subject: [PATCH 139/296] Fix example 24 to use IShader --- 24_ColorSpaceTest/main.cpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/24_ColorSpaceTest/main.cpp b/24_ColorSpaceTest/main.cpp index 844f058fe..1c23a3f2f 100644 --- a/24_ColorSpaceTest/main.cpp +++ b/24_ColorSpaceTest/main.cpp @@ -161,7 +161,7 @@ class ColorSpaceTestSampleApp final : public examples::SimpleWindowedApplication return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!"); // Load Custom Shader - auto loadCompileAndCreateShader = [&](const std::string& relPath) -> smart_refctd_ptr + auto loadCompileAndCreateShader = [&](const std::string& relPath) -> smart_refctd_ptr { IAssetLoader::SAssetLoadParams lp = {}; lp.logger = m_logger.get(); @@ -172,11 +172,11 @@ class ColorSpaceTestSampleApp final : public examples::SimpleWindowedApplication return nullptr; // lets go straight from ICPUSpecializedShader to IGPUSpecializedShader - auto source = IAsset::castDown(assets[0]); + auto source = IAsset::castDown(assets[0]); if (!source) return nullptr; - return 
m_device->createShader(source.get()); + return m_device->compileShader({ source.get() }); }; auto fragmentShader = loadCompileAndCreateShader("app_resources/present.frag.hlsl"); if (!fragmentShader) @@ -255,14 +255,15 @@ class ColorSpaceTestSampleApp final : public examples::SimpleWindowedApplication // Now create the pipeline { const asset::SPushConstantRange range = { - .stageFlags = IShader::E_SHADER_STAGE::ESS_FRAGMENT, + .stageFlags = ESS_FRAGMENT, .offset = 0, .size = sizeof(push_constants_t) }; auto layout = m_device->createPipelineLayout({ &range,1 }, nullptr, nullptr, nullptr, core::smart_refctd_ptr(dsLayout)); - const IGPUShader::SSpecInfo fragSpec = { + const IPipelineBase::SShaderSpecInfo fragSpec = { + .shader = fragmentShader.get(), .entryPoint = "main", - .shader = fragmentShader.get() + .stage = ESS_FRAGMENT, }; m_pipeline = fsTriProtoPPln.createPipeline(fragSpec, layout.get(), scResources->getRenderpass()/*,default is subpass 0*/); if (!m_pipeline) @@ -796,7 +797,7 @@ class ColorSpaceTestSampleApp final : public examples::SimpleWindowedApplication cmdbuf->beginRenderPass(info,IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE); } cmdbuf->bindGraphicsPipeline(m_pipeline.get()); - cmdbuf->pushConstants(m_pipeline->getLayout(),IGPUShader::E_SHADER_STAGE::ESS_FRAGMENT,0,sizeof(push_constants_t),&pc); + cmdbuf->pushConstants(m_pipeline->getLayout(),hlsl::ShaderStage::ESS_FRAGMENT,0,sizeof(push_constants_t),&pc); cmdbuf->bindDescriptorSets(nbl::asset::EPBP_GRAPHICS,m_pipeline->getLayout(),3,1,&ds); ext::FullScreenTriangle::recordDrawCall(cmdbuf); cmdbuf->endRenderPass(); From 847927c291742bc2d95edd2312c75e2a9b835794 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 22 Apr 2025 08:16:13 +0700 Subject: [PATCH 140/296] Fix example 25 to use IShader --- 25_FilterTest/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/25_FilterTest/main.cpp b/25_FilterTest/main.cpp index a66227225..4ce68d66c 100644 --- a/25_FilterTest/main.cpp +++ 
b/25_FilterTest/main.cpp @@ -868,7 +868,7 @@ class BlitFilterTestApp final : public virtual application_templates::BasicMulti logger->log("Failed to fit the preload region in shared memory even for 1x1x1 workgroup!",ILogger::ELL_ERROR); return false; } - cmdbuf->pushConstants(layout,IGPUShader::E_SHADER_STAGE::ESS_COMPUTE,0,sizeof(params),&params); + cmdbuf->pushConstants(layout,hlsl::ShaderStage::ESS_COMPUTE,0,sizeof(params),&params); cmdbuf->dispatch(params.perWG.getWorkgroupCount(outExtent16)); if (m_alphaSemantic==IBlitUtilities::EAS_REFERENCE_OR_COVERAGE) { From 20cd09a25ec3236188fce0c2933d999fcc1b8f99 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 22 Apr 2025 08:16:22 +0700 Subject: [PATCH 141/296] Fix example 26 to use IShader --- 26_Blur/app_resources/shader.comp.hlsl | 1 + 26_Blur/main.cpp | 37 +++++++++++--------------- 2 files changed, 16 insertions(+), 22 deletions(-) diff --git a/26_Blur/app_resources/shader.comp.hlsl b/26_Blur/app_resources/shader.comp.hlsl index 94baa8d2a..99e876ccc 100644 --- a/26_Blur/app_resources/shader.comp.hlsl +++ b/26_Blur/app_resources/shader.comp.hlsl @@ -131,6 +131,7 @@ struct ScanSharedMemoryProxy }; [numthreads(WORKGROUP_SIZE, 1, 1)] +[shader("compute")] void main() { ScanSharedMemoryProxy scanSmemAccessor; diff --git a/26_Blur/main.cpp b/26_Blur/main.cpp index 8217c4e51..4910ba5f0 100644 --- a/26_Blur/main.cpp +++ b/26_Blur/main.cpp @@ -225,7 +225,7 @@ class BlurApp final : public examples::SimpleWindowedApplication, public applica if (!m_vertImg || !m_device->allocate(reqs, m_vertImg.get()).isValid()) return logFail("Could not create HDR Image"); - smart_refctd_ptr shader; + smart_refctd_ptr shader; { IAssetLoader::SAssetLoadParams lp = {}; lp.logger = m_logger.get(); @@ -236,10 +236,10 @@ class BlurApp final : public examples::SimpleWindowedApplication, public applica return logFail("Failed to load shader from disk"); // lets go straight from ICPUSpecializedShader to IGPUSpecializedShader - auto sourceRaw = 
IAsset::castDown(assets[0]); + auto sourceRaw = IAsset::castDown(assets[0]); if (!sourceRaw) return logFail("Failed to load shader from disk"); - smart_refctd_ptr source = CHLSLCompiler::createOverridenCopy( + smart_refctd_ptr source = CHLSLCompiler::createOverridenCopy( sourceRaw.get(), "static const uint16_t WORKGROUP_SIZE = %d;\n" "static const uint16_t MAX_SCANLINE_SIZE = %d;\n" @@ -264,7 +264,7 @@ class BlurApp final : public examples::SimpleWindowedApplication, public applica auto opt = make_smart_refctd_ptr(optPasses); shader = m_device->createShader(source.get(), opt.get()); #else - shader = m_device->createShader(source.get()); + shader = m_device->compileShader({ source.get() }); #endif if (!shader) return false; @@ -272,26 +272,19 @@ class BlurApp final : public examples::SimpleWindowedApplication, public applica { const asset::SPushConstantRange ranges[] = { { - .stageFlags = IGPUShader::E_SHADER_STAGE::ESS_COMPUTE, + .stageFlags = hlsl::ShaderStage::ESS_COMPUTE, .offset = 0, .size = sizeof(PushConstants) } }; auto layout = m_device->createPipelineLayout(ranges, smart_refctd_ptr(dsLayout)); - const IGPUComputePipeline::SCreationParams params[] = { { - { - .layout = layout.get() - }, - {}, - IGPUComputePipeline::SCreationParams::FLAGS::NONE, - { - .entryPoint = "main", - .shader = shader.get(), - .entries = nullptr, - .requiredSubgroupSize = static_cast(hlsl::findMSB(m_physicalDevice->getLimits().maxSubgroupSize)), - .requireFullSubgroups = true - } - }}; - if (!m_device->createComputePipelines(nullptr, params, &m_ppln)) + + IGPUComputePipeline::SCreationParams params = {}; + params.layout = layout.get(); + params.shader.shader = shader.get(); + params.shader.entryPoint = "main"; + params.shader.stage = hlsl::ShaderStage::ESS_COMPUTE; + params.shader.requireFullSubgroups = true; + if (!m_device->createComputePipelines(nullptr, { &params, 1 }, &m_ppln)) return logFail("Failed to create Pipeline"); } @@ -626,7 +619,7 @@ class BlurApp final : public 
examples::SimpleWindowedApplication, public applica cb->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {}, .bufBarriers = {},.imgBarriers = {&vertImgBarrier,1} }); cb->bindDescriptorSets(E_PIPELINE_BIND_POINT::EPBP_COMPUTE, layout, 0, 1, &m_ds0.get()); PushConstants pc = { .radius = blurRadius, .activeAxis = 0, .edgeWrapMode = blurEdgeWrapMode }; - cb->pushConstants(layout, IGPUShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(pc), &pc); + cb->pushConstants(layout, hlsl::ShaderStage::ESS_COMPUTE, 0, sizeof(pc), &pc); cb->dispatch(image_params.extent.height, 1, 1); image_memory_barrier_t horzImgBarrier = { @@ -646,7 +639,7 @@ class BlurApp final : public examples::SimpleWindowedApplication, public applica cb->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {}, .bufBarriers = {},.imgBarriers = {&horzImgBarrier,1} }); cb->bindDescriptorSets(E_PIPELINE_BIND_POINT::EPBP_COMPUTE, layout, 0, 1, &m_ds1.get()); pc.activeAxis = 1; - cb->pushConstants(layout, IGPUShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(pc), &pc); + cb->pushConstants(layout, hlsl::ShaderStage::ESS_COMPUTE, 0, sizeof(pc), &pc); cb->dispatch(image_params.extent.width, 1, 1); } From fabe1dbdaec23694b2dc5d0b2ad5d6bea75eed9f Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 22 Apr 2025 08:16:31 +0700 Subject: [PATCH 142/296] Fix example 27 to use IShader --- .../app_resources/shader.comp.hlsl | 1 + 27_MPMCScheduler/main.cpp | 32 +++++++------------ 2 files changed, 13 insertions(+), 20 deletions(-) diff --git a/27_MPMCScheduler/app_resources/shader.comp.hlsl b/27_MPMCScheduler/app_resources/shader.comp.hlsl index c49ad018c..966963761 100644 --- a/27_MPMCScheduler/app_resources/shader.comp.hlsl +++ b/27_MPMCScheduler/app_resources/shader.comp.hlsl @@ -305,6 +305,7 @@ uint32_t3 gl_WorkGroupSize() {return uint32_t3(WorkgroupSizeX*WorkgroupSizeY,1,1 } } [numthreads(WorkgroupSizeX*WorkgroupSizeY,1,1)] +[shader("compute")] void main() { // manually push an explicit workload diff --git 
a/27_MPMCScheduler/main.cpp b/27_MPMCScheduler/main.cpp index c380bf3c6..03275d114 100644 --- a/27_MPMCScheduler/main.cpp +++ b/27_MPMCScheduler/main.cpp @@ -69,7 +69,7 @@ class MPMCSchedulerApp final : public examples::SimpleWindowedApplication, publi if (!asset_base_t::onAppInitialized(std::move(system))) return false; - smart_refctd_ptr shader; + smart_refctd_ptr shader; { IAssetLoader::SAssetLoadParams lp = {}; lp.logger = m_logger.get(); @@ -80,11 +80,11 @@ class MPMCSchedulerApp final : public examples::SimpleWindowedApplication, publi return logFail("Failed to load shader from disk"); // lets go straight from ICPUSpecializedShader to IGPUSpecializedShader - auto source = IAsset::castDown(assets[0]); + auto source = IAsset::castDown(assets[0]); if (!source) return logFail("Failed to load shader from disk"); - shader = m_device->createShader(source.get()); + shader = m_device->compileShader({ source.get() }); if (!shader) return false; } @@ -106,26 +106,18 @@ class MPMCSchedulerApp final : public examples::SimpleWindowedApplication, publi { const asset::SPushConstantRange ranges[] = {{ - .stageFlags = IGPUShader::E_SHADER_STAGE::ESS_COMPUTE, + .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, .offset = 0, .size = sizeof(PushConstants) }}; auto layout = m_device->createPipelineLayout(ranges,smart_refctd_ptr(dsLayout)); - const IGPUComputePipeline::SCreationParams params[] = { { - { - .layout = layout.get() - }, - {}, - IGPUComputePipeline::SCreationParams::FLAGS::NONE, - { - .entryPoint = "main", - .shader = shader.get(), - .entries = nullptr, - .requiredSubgroupSize = IGPUShader::SSpecInfo::SUBGROUP_SIZE::UNKNOWN, - .requireFullSubgroups = true - } - }}; - if (!m_device->createComputePipelines(nullptr,params,&m_ppln)) + IGPUComputePipeline::SCreationParams params; + params.layout = layout.get(); + params.shader.shader = shader.get(); + params.shader.entryPoint = "main"; + params.shader.stage = hlsl::ShaderStage::ESS_COMPUTE; + 
params.shader.requireFullSubgroups = true; + if (!m_device->createComputePipelines(nullptr, { ¶ms, 1 }, &m_ppln)) return logFail("Failed to create Pipeline"); } @@ -306,7 +298,7 @@ class MPMCSchedulerApp final : public examples::SimpleWindowedApplication, publi .sharedAcceptableIdleCount = 0, .globalAcceptableIdleCount = 0 }; - cb->pushConstants(layout,IGPUShader::E_SHADER_STAGE::ESS_COMPUTE,0,sizeof(pc),&pc); + cb->pushConstants(layout,hlsl::ShaderStage::ESS_COMPUTE,0,sizeof(pc),&pc); cb->dispatch(WIN_W/WorkgroupSizeX,WIN_H/WorkgroupSizeY,1); } From d7f9f18171a0b02424386799652af4ef459e73c6 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 22 Apr 2025 08:16:41 +0700 Subject: [PATCH 143/296] Fix example 28 to use IShader --- .../app_resources/fft_convolve_ifft.hlsl | 1 + .../app_resources/image_fft_first_axis.hlsl | 1 + .../app_resources/image_ifft_first_axis.hlsl | 1 + .../app_resources/kernel_fft_first_axis.hlsl | 1 + .../app_resources/kernel_fft_second_axis.hlsl | 1 + .../kernel_spectrum_normalize.hlsl | 1 + 28_FFTBloom/main.cpp | 25 ++++++++++--------- 7 files changed, 19 insertions(+), 12 deletions(-) diff --git a/28_FFTBloom/app_resources/fft_convolve_ifft.hlsl b/28_FFTBloom/app_resources/fft_convolve_ifft.hlsl index 73d9d7850..07c2ec8cf 100644 --- a/28_FFTBloom/app_resources/fft_convolve_ifft.hlsl +++ b/28_FFTBloom/app_resources/fft_convolve_ifft.hlsl @@ -223,6 +223,7 @@ NBL_CONSTEXPR_STATIC_INLINE float32_t2 PreloadedSecondAxisAccessor::KernelHalfPi NBL_CONSTEXPR_STATIC_INLINE vector PreloadedSecondAxisAccessor::One = {1.0f, 0.f}; [numthreads(FFTParameters::WorkgroupSize, 1, 1)] +[shader("compute")] void main(uint32_t3 ID : SV_DispatchThreadID) { SharedMemoryAccessor sharedmemAccessor; diff --git a/28_FFTBloom/app_resources/image_fft_first_axis.hlsl b/28_FFTBloom/app_resources/image_fft_first_axis.hlsl index 864c64b1e..f1478a8d6 100644 --- a/28_FFTBloom/app_resources/image_fft_first_axis.hlsl +++ b/28_FFTBloom/app_resources/image_fft_first_axis.hlsl @@ -76,6 
+76,7 @@ struct PreloadedFirstAxisAccessor : MultiChannelPreloadedAccessorBase }; [numthreads(FFTParameters::WorkgroupSize, 1, 1)] +[shader("compute")] void main(uint32_t3 ID : SV_DispatchThreadID) { SharedMemoryAccessor sharedmemAccessor; diff --git a/28_FFTBloom/app_resources/image_ifft_first_axis.hlsl b/28_FFTBloom/app_resources/image_ifft_first_axis.hlsl index 9146073dd..b3bef3510 100644 --- a/28_FFTBloom/app_resources/image_ifft_first_axis.hlsl +++ b/28_FFTBloom/app_resources/image_ifft_first_axis.hlsl @@ -136,6 +136,7 @@ struct PreloadedFirstAxisAccessor : MultiChannelPreloadedAccessorMirrorTradeBase }; [numthreads(FFTParameters::WorkgroupSize, 1, 1)] +[shader("compute")] void main(uint32_t3 ID : SV_DispatchThreadID) { SharedMemoryAccessor sharedmemAccessor; diff --git a/28_FFTBloom/app_resources/kernel_fft_first_axis.hlsl b/28_FFTBloom/app_resources/kernel_fft_first_axis.hlsl index 51f514c4a..741bac7db 100644 --- a/28_FFTBloom/app_resources/kernel_fft_first_axis.hlsl +++ b/28_FFTBloom/app_resources/kernel_fft_first_axis.hlsl @@ -68,6 +68,7 @@ struct PreloadedFirstAxisAccessor : MultiChannelPreloadedAccessorBase }; [numthreads(FFTParameters::WorkgroupSize, 1, 1)] +[shader("compute")] void main(uint32_t3 ID : SV_DispatchThreadID) { SharedMemoryAccessor sharedmemAccessor; diff --git a/28_FFTBloom/app_resources/kernel_fft_second_axis.hlsl b/28_FFTBloom/app_resources/kernel_fft_second_axis.hlsl index ab7216da2..eaecb5d0f 100644 --- a/28_FFTBloom/app_resources/kernel_fft_second_axis.hlsl +++ b/28_FFTBloom/app_resources/kernel_fft_second_axis.hlsl @@ -200,6 +200,7 @@ struct PreloadedSecondAxisAccessor : MultiChannelPreloadedAccessorMirrorTradeBas }; [numthreads(FFTParameters::WorkgroupSize, 1, 1)] +[shader("compute")] void main(uint32_t3 ID : SV_DispatchThreadID) { SharedMemoryAccessor sharedmemAccessor; diff --git a/28_FFTBloom/app_resources/kernel_spectrum_normalize.hlsl b/28_FFTBloom/app_resources/kernel_spectrum_normalize.hlsl index f2ef207d3..efe406301 100644 
--- a/28_FFTBloom/app_resources/kernel_spectrum_normalize.hlsl +++ b/28_FFTBloom/app_resources/kernel_spectrum_normalize.hlsl @@ -2,6 +2,7 @@ [[vk::binding(2, 0)]] RWTexture2DArray kernelChannels; [numthreads(8, 8, 1)] +[shader("compute")] void main(uint32_t3 ID : SV_DispatchThreadID) { const scalar_t powerReciprocal = vk::RawBufferLoad(pushConstants.rowMajorBufferAddress); diff --git a/28_FFTBloom/main.cpp b/28_FFTBloom/main.cpp index cc312c3be..4718a4090 100644 --- a/28_FFTBloom/main.cpp +++ b/28_FFTBloom/main.cpp @@ -169,7 +169,7 @@ class FFTBloomApp final : public examples::SimpleWindowedApplication, public app float32_t totalSizeReciprocal; }; - inline core::smart_refctd_ptr createShader(const char* includeMainName, const SShaderConstevalParameters& shaderConstants) + inline core::smart_refctd_ptr createShader(const char* includeMainName, const SShaderConstevalParameters& shaderConstants) { // The annoying "const static member field must be initialized outside of struct" bug strikes again std::ostringstream kernelHalfPixelSizeStream; @@ -204,18 +204,17 @@ class FFTBloomApp final : public examples::SimpleWindowedApplication, public app - auto CPUShader = core::make_smart_refctd_ptr((prelude+"\n#include \"" + includeMainName + "\"\n").c_str(), - IShader::E_SHADER_STAGE::ESS_COMPUTE, + auto HLSLShader = core::make_smart_refctd_ptr((prelude+"\n#include \"" + includeMainName + "\"\n").c_str(), IShader::E_CONTENT_TYPE::ECT_HLSL, includeMainName); - assert(CPUShader); + assert(HLSLShader); #ifndef _NBL_DEBUG ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO; auto opt = make_smart_refctd_ptr(std::span(&optPasses, 1)); - return m_device->createShader({ CPUShader.get(), opt.get(), m_readCache.get(), m_writeCache.get()}); + return m_device->createShader({ HLSLShader.get(), opt.get(), m_readCache.get(), m_writeCache.get()}); #else - return m_device->createShader({ CPUShader.get(), nullptr, m_readCache.get(), m_writeCache.get() }); + 
return m_device->compileShader({ HLSLShader.get(), nullptr, m_readCache.get(), m_writeCache.get() }); #endif } @@ -709,7 +708,7 @@ class FFTBloomApp final : public examples::SimpleWindowedApplication, public app // Normalization shader needs this info uint16_t secondAxisFFTHalfLengthLog2 = elementsPerInvocationLog2 + workgroupSizeLog2 - 1; // Create shaders - smart_refctd_ptr shaders[3]; + smart_refctd_ptr shaders[3]; uint16_t2 kernelDimensions = { kerDim.width, kerDim.height }; SShaderConstevalParameters::SShaderConstevalParametersCreateInfo shaderConstevalInfo = { .useHalfFloats = m_useHalfFloats, .elementsPerInvocationLog2 = elementsPerInvocationLog2, .workgroupSizeLog2 = workgroupSizeLog2, .numWorkgroupsLog2 = secondAxisFFTHalfLengthLog2, .previousWorkgroupSizeLog2 = workgroupSizeLog2 }; SShaderConstevalParameters shaderConstevalParameters(shaderConstevalInfo); @@ -722,11 +721,12 @@ class FFTBloomApp final : public examples::SimpleWindowedApplication, public app for (auto i = 0u; i < 3; i++) { params[i].layout = pipelineLayout.get(); - params[i].shader.entryPoint = "main"; params[i].shader.shader = shaders[i].get(); + params[i].shader.entryPoint = "main"; + params[i].shader.stage = hlsl::ShaderStage::ESS_COMPUTE; // Normalization doesn't require full subgroups params[i].shader.requireFullSubgroups = bool(2-i); - params[i].shader.requiredSubgroupSize = static_cast(hlsl::findMSB(deviceLimits.maxSubgroupSize)); + params[i].shader.requiredSubgroupSize = static_cast(hlsl::findMSB(deviceLimits.maxSubgroupSize)); } smart_refctd_ptr pipelines[3]; @@ -884,7 +884,7 @@ class FFTBloomApp final : public examples::SimpleWindowedApplication, public app uint16_t firstAxisFFTHalfLengthLog2; uint16_t firstAxisFFTElementsPerInvocationLog2; uint16_t firstAxisFFTWorkgroupSizeLog2; - smart_refctd_ptr shaders[3]; + smart_refctd_ptr shaders[3]; { auto [elementsPerInvocationLog2, workgroupSizeLog2] = 
workgroup::fft::optimalFFTParameters(deviceLimits.maxOptimallyResidentWorkgroupInvocations, m_marginSrcDim.height, deviceLimits.maxSubgroupSize); SShaderConstevalParameters::SShaderConstevalParametersCreateInfo shaderConstevalInfo = { .useHalfFloats = m_useHalfFloats, .elementsPerInvocationLog2 = elementsPerInvocationLog2, .workgroupSizeLog2 = workgroupSizeLog2 }; @@ -926,9 +926,10 @@ class FFTBloomApp final : public examples::SimpleWindowedApplication, public app IGPUComputePipeline::SCreationParams params[3] = {}; for (auto i = 0u; i < 3; i++) { params[i].layout = pipelineLayout.get(); - params[i].shader.entryPoint = "main"; params[i].shader.shader = shaders[i].get(); - params[i].shader.requiredSubgroupSize = static_cast(hlsl::findMSB(deviceLimits.maxSubgroupSize)); + params[i].shader.entryPoint = "main"; + params[i].shader.stage = hlsl::ShaderStage::ESS_COMPUTE; + params[i].shader.requiredSubgroupSize = static_cast(hlsl::findMSB(deviceLimits.maxSubgroupSize)); params[i].shader.requireFullSubgroups = true; } From ad5054db7c46c50805bd697e3f8b28937c1310ba Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 22 Apr 2025 08:16:55 +0700 Subject: [PATCH 144/296] Fix example 71 to use IShader --- 71_RayTracingPipeline/main.cpp | 41 ++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index 35c750373..c9ee0eafb 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -137,7 +137,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, } // Load Custom Shader - auto loadCompileAndCreateShader = [&](const std::string& relPath) -> smart_refctd_ptr + auto loadCompileAndCreateShader = [&](const std::string& relPath) -> smart_refctd_ptr { IAssetLoader::SAssetLoadParams lp = {}; lp.logger = m_logger.get(); @@ -145,14 +145,20 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, auto 
assetBundle = m_assetMgr->getAsset(relPath, lp); const auto assets = assetBundle.getContents(); if (assets.empty()) + { + assert(false); return nullptr; + } // lets go straight from ICPUSpecializedShader to IGPUSpecializedShader - auto sourceRaw = IAsset::castDown(assets[0]); + auto sourceRaw = IAsset::castDown(assets[0]); if (!sourceRaw) + { + assert(false); return nullptr; + } - return m_device->createShader({ sourceRaw.get(), nullptr, shaderReadCache.get(), shaderWriteCache.get() }); + return m_device->compileShader({ sourceRaw.get(), nullptr, shaderReadCache.get(), shaderWriteCache.get() }); }; // load shaders @@ -335,18 +341,18 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, RTDS_COUNT }; - IGPUShader::SSpecInfo shaders[RTDS_COUNT]; - shaders[RTDS_RAYGEN] = {.shader = raygenShader.get()}; - shaders[RTDS_MISS] = {.shader = missShader.get()}; - shaders[RTDS_MISS_SHADOW] = { .shader = missShadowShader.get() }; - shaders[RTDS_CLOSEST_HIT] = {.shader = closestHitShader.get()}; - shaders[RTDS_SPHERE_CLOSEST_HIT] = {.shader = proceduralClosestHitShader.get()}; - shaders[RTDS_ANYHIT_PRIMARY] = {.shader = anyHitShaderColorPayload.get()}; - shaders[RTDS_ANYHIT_SHADOW] = {.shader = anyHitShaderShadowPayload.get()}; - shaders[RTDS_INTERSECTION] = {.shader = intersectionHitShader.get() }; - shaders[RTDS_DIRECTIONAL_CALL] = {.shader = directionalLightCallShader.get()}; - shaders[RTDS_POINT_CALL] = {.shader = pointLightCallShader.get()}; - shaders[RTDS_SPOT_CALL] = {.shader = spotLightCallShader.get()}; + IPipelineBase::SShaderSpecInfo shaders[RTDS_COUNT]; + shaders[RTDS_RAYGEN] = {.shader = raygenShader.get(), .entryPoint = "main", .stage = ESS_RAYGEN}; + shaders[RTDS_MISS] = {.shader = missShader.get(), .entryPoint = "main", .stage = ESS_MISS}; + shaders[RTDS_MISS_SHADOW] = { .shader = missShadowShader.get(), .entryPoint = "main", .stage = ESS_MISS}; + shaders[RTDS_CLOSEST_HIT] = {.shader = closestHitShader.get(), .entryPoint = "main", 
.stage = ESS_CLOSEST_HIT}; + shaders[RTDS_SPHERE_CLOSEST_HIT] = {.shader = proceduralClosestHitShader.get(), .entryPoint = "main", .stage = ESS_CLOSEST_HIT}; + shaders[RTDS_ANYHIT_PRIMARY] = {.shader = anyHitShaderColorPayload.get(), .entryPoint = "main", .stage = ESS_ANY_HIT}; + shaders[RTDS_ANYHIT_SHADOW] = {.shader = anyHitShaderShadowPayload.get(), .entryPoint = "main", .stage = ESS_ANY_HIT}; + shaders[RTDS_INTERSECTION] = {.shader = intersectionHitShader.get(), .entryPoint = "main", .stage = ESS_INTERSECTION }; + shaders[RTDS_DIRECTIONAL_CALL] = {.shader = directionalLightCallShader.get(), .entryPoint = "main", .stage = ESS_CALLABLE}; + shaders[RTDS_POINT_CALL] = {.shader = pointLightCallShader.get(), .entryPoint = "main", .stage = ESS_CALLABLE}; + shaders[RTDS_SPOT_CALL] = {.shader = spotLightCallShader.get(), .entryPoint = "main", .stage = ESS_CALLABLE}; params.layout = pipelineLayout.get(); params.shaders = std::span(shaders); @@ -448,9 +454,10 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, if (!fsTriProtoPPln) return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!"); - const IGPUShader::SSpecInfo fragSpec = { + const IPipelineBase::SShaderSpecInfo fragSpec = { + .shader = fragmentShader.get(), .entryPoint = "main", - .shader = fragmentShader.get() + .stage = ESS_FRAGMENT, }; auto presentLayout = m_device->createPipelineLayout( From 3698fb4f2acbaad19ead55ef68a98f76bf5f5f4d Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 22 Apr 2025 08:17:11 +0700 Subject: [PATCH 145/296] Fix geometry scene creator to use IShader --- common/include/CGeomtryCreatorScene.hpp | 29 ++++++++++++------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/common/include/CGeomtryCreatorScene.hpp b/common/include/CGeomtryCreatorScene.hpp index 0d9bc6edd..6ffad2c73 100644 --- a/common/include/CGeomtryCreatorScene.hpp +++ b/common/include/CGeomtryCreatorScene.hpp @@ -46,7 +46,7 @@ constexpr 
static inline struct ClearValues using image_view_t = std::conditional_t; \ using image_t = std::conditional_t; \ using buffer_t = std::conditional_t; \ - using shader_t = std::conditional_t; \ + using shader_t = nbl::asset::IShader; \ using graphics_pipeline_t = std::conditional_t; \ using descriptor_set = std::conditional_t; \ } @@ -764,36 +764,35 @@ class ResourceBuilder { EXPOSE_NABLA_NAMESPACES(); - auto createShader = [&](IShader::E_SHADER_STAGE stage, smart_refctd_ptr& outShader) -> smart_refctd_ptr + auto createShader = [&](smart_refctd_ptr& outShader) -> smart_refctd_ptr { // TODO: use SPIRV loader & our ::system ns to get those cpu shaders, do not create myself (shit I forgot it exists) const SBuiltinFile& in = ::geometry::creator::spirv::builtin::get_resource(); const auto buffer = ICPUBuffer::create({ { in.size }, (void*)in.contents, core::getNullMemoryResource() }, adopt_memory); - auto shader = make_smart_refctd_ptr(smart_refctd_ptr(buffer), stage, IShader::E_CONTENT_TYPE::ECT_SPIRV, ""); // must create cpu instance regardless underlying type + auto shader = make_smart_refctd_ptr(smart_refctd_ptr(buffer), IShader::E_CONTENT_TYPE::ECT_SPIRV, ""); // must create cpu instance regardless underlying type if constexpr (withAssetConverter) { buffer->setContentHash(buffer->computeContentHash()); - outShader = std::move(shader); } - else - outShader = utilities->getLogicalDevice()->createShader(shader.get()); + + outShader = std::move(shader); return outShader; }; typename ResourcesBundleScratch::Shaders& basic = scratch.shaders[GeometriesCpu::GP_BASIC]; - createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, basic.vertex); - createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, basic.fragment); + createShader.template operator() < 
NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.vertex.spv") > (basic.vertex); + createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (basic.fragment); typename ResourcesBundleScratch::Shaders& cone = scratch.shaders[GeometriesCpu::GP_CONE]; - createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.cone.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, cone.vertex); - createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, cone.fragment); // note we reuse fragment from basic! + createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.cone.vertex.spv") > (cone.vertex); + createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (cone.fragment); // note we reuse fragment from basic! typename ResourcesBundleScratch::Shaders& ico = scratch.shaders[GeometriesCpu::GP_ICO]; - createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.ico.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, ico.vertex); - createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, ico.fragment); // note we reuse fragment from basic! + createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.ico.vertex.spv") > (ico.vertex); + createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (ico.fragment); // note we reuse fragment from basic! 
for (const auto& it : scratch.shaders) { @@ -843,10 +842,10 @@ class ResourceBuilder params.rasterization.faceCullingMode = EFCM_NONE; { - const typename Types::shader_t::SSpecInfo info [] = + const IPipelineBase::SShaderSpecInfo info [] = { - {.entryPoint = "VSMain", .shader = scratch.shaders[inGeometry.shadersType].vertex.get() }, - {.entryPoint = "PSMain", .shader = scratch.shaders[inGeometry.shadersType].fragment.get() } + {.shader = scratch.shaders[inGeometry.shadersType].vertex.get(), .entryPoint = "VSMain", .stage = hlsl::ShaderStage::ESS_VERTEX}, + {.shader = scratch.shaders[inGeometry.shadersType].fragment.get(), .entryPoint = "PSMain", .stage = hlsl::ShaderStage::ESS_FRAGMENT}, }; params.pipeline.layout = scratch.pipelineLayout.get(); From 6b57674651f5eb057d1c632d45122d455a7a48c1 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 25 Apr 2025 16:53:14 +0700 Subject: [PATCH 146/296] refactor to load data as vectors, consecutive uints --- .../app_resources/benchmarkSubgroup.comp.hlsl | 18 +++----- .../app_resources/shaderCommon.hlsl | 45 ++++++------------- .../app_resources/testSubgroup.comp.hlsl | 4 +- 3 files changed, 20 insertions(+), 47 deletions(-) diff --git a/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl b/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl index 3dd24e432..2f575d39a 100644 --- a/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl +++ b/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl @@ -7,9 +7,9 @@ // NOTE added dummy output image to be able to profile with Nsight, which still doesn't support profiling headless compute shaders [[vk::binding(2, 0)]] RWTexture2D outImage; // dummy -uint32_t globalFirstItemIndex(uint32_t itemIdx) +uint32_t globalIndex() { - return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE*ITEMS_PER_INVOCATION+((nbl::hlsl::glsl::gl_SubgroupID()*ITEMS_PER_INVOCATION+itemIdx)< class binop, typename T, uint32_t N> static void subbench(NBL_CONST_REF_ARG(type_t) 
sourceVal) { @@ -31,20 +30,13 @@ static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal) for (uint32_t i = 0; i < NUM_LOOPS; i++) value = func(value); - [unroll] - for (uint32_t i = 0; i < ITEMS_PER_INVOCATION; i++) - output[binop::BindingIndex].template Store(sizeof(uint32_t) + sizeof(uint32_t) * (globalFirstItemIndex(i) + nbl::hlsl::glsl::gl_SubgroupInvocationID()), value[i]); + output[binop::BindingIndex].template Store(sizeof(uint32_t) + sizeof(type_t) * globalIndex(), value); } void benchmark() { - const uint32_t idx = nbl::hlsl::glsl::gl_SubgroupInvocationID(); - type_t sourceVal; - [unroll] - for (uint32_t i = 0; i < ITEMS_PER_INVOCATION; i++) - { - sourceVal[i] = inputValue[globalFirstItemIndex(i) + idx]; - } + const uint32_t idx = globalIndex(); + type_t sourceVal = inputValue[idx]; subbench(sourceVal); subbench(sourceVal); diff --git a/73_ArithmeticBench/app_resources/shaderCommon.hlsl b/73_ArithmeticBench/app_resources/shaderCommon.hlsl index f4fc9d23a..376f69579 100644 --- a/73_ArithmeticBench/app_resources/shaderCommon.hlsl +++ b/73_ArithmeticBench/app_resources/shaderCommon.hlsl @@ -1,7 +1,6 @@ #include "common.hlsl" #include "nbl/builtin/hlsl/glsl_compat/core.hlsl" -#include "nbl/builtin/hlsl/workgroup/basic.hlsl" #include "nbl/builtin/hlsl/subgroup/basic.hlsl" #include "nbl/builtin/hlsl/subgroup/arithmetic_portability.hlsl" #include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" @@ -11,29 +10,21 @@ // https://github.com/microsoft/DirectXShaderCompiler/issues/6144 uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);} -// unfortunately DXC chokes on descriptors as static members -// https://github.com/microsoft/DirectXShaderCompiler/issues/5940 -[[vk::binding(0, 0)]] StructuredBuffer inputValue; -[[vk::binding(1, 0)]] RWByteAddressBuffer output[8]; - -// to get next item, move by subgroupSize -uint32_t globalFirstItemIndex(uint32_t itemIdx); -// since we test ITEMS_PER_WG 1 typedef vector type_t; -// #else 
-// typedef uint32_t type_t; -// #endif +// unfortunately DXC chokes on descriptors as static members +// https://github.com/microsoft/DirectXShaderCompiler/issues/5940 +[[vk::binding(0, 0)]] StructuredBuffer inputValue; +[[vk::binding(1, 0)]] RWByteAddressBuffer output[8]; + +// because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way +uint32_t globalIndex(); +// since we test ITEMS_PER_WG; using params_t = nbl::hlsl::subgroup2::ArithmeticParams::base_t, N, nbl::hlsl::jit::device_capabilities>; - if (nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex()==0u) + if (globalIndex()==0u) output[binop::BindingIndex].template Store(0,nbl::hlsl::glsl::gl_SubgroupSize()); operation_t func; - type_t value = func(sourceVal); if (canStore()) - { - [unroll] - for (uint32_t i = 0; i < ITEMS_PER_INVOCATION; i++) - output[binop::BindingIndex].template Store(sizeof(uint32_t) + sizeof(uint32_t) * (globalFirstItemIndex(i) + nbl::hlsl::glsl::gl_SubgroupInvocationID()), value[i]); - } + output[binop::BindingIndex].template Store(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func(sourceVal)); } type_t test() { - const uint32_t idx = nbl::hlsl::glsl::gl_SubgroupInvocationID(); - type_t sourceVal; - [unroll] - for (uint32_t i = 0; i < ITEMS_PER_INVOCATION; i++) - { - sourceVal[i] = inputValue[globalFirstItemIndex(i) + idx]; - } + const uint32_t idx = globalIndex(); + type_t sourceVal = inputValue[idx]; subtest(sourceVal); subtest(sourceVal); diff --git a/73_ArithmeticBench/app_resources/testSubgroup.comp.hlsl b/73_ArithmeticBench/app_resources/testSubgroup.comp.hlsl index 0001d39e0..2cc1ccb60 100644 --- a/73_ArithmeticBench/app_resources/testSubgroup.comp.hlsl +++ b/73_ArithmeticBench/app_resources/testSubgroup.comp.hlsl @@ -4,9 +4,9 @@ #include "shaderCommon.hlsl" -uint32_t globalFirstItemIndex(uint32_t itemIdx) +uint32_t globalIndex() { - return 
nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE*ITEMS_PER_INVOCATION+((nbl::hlsl::glsl::gl_SubgroupID()*ITEMS_PER_INVOCATION+itemIdx)< Date: Mon, 28 Apr 2025 10:54:19 +0700 Subject: [PATCH 147/296] initial wg scan test --- 74a_Workgroup2ScanTest/CMakeLists.txt | 25 + .../app_resources/common.hlsl | 96 ++++ .../app_resources/shaderCommon.hlsl | 55 +++ .../app_resources/testSubgroup.comp.hlsl | 18 + .../app_resources/testWorkgroup.comp.hlsl | 107 ++++ 74a_Workgroup2ScanTest/config.json.template | 28 ++ 74a_Workgroup2ScanTest/main.cpp | 462 ++++++++++++++++++ 74a_Workgroup2ScanTest/pipeline.groovy | 50 ++ CMakeLists.txt | 1 + 9 files changed, 842 insertions(+) create mode 100644 74a_Workgroup2ScanTest/CMakeLists.txt create mode 100644 74a_Workgroup2ScanTest/app_resources/common.hlsl create mode 100644 74a_Workgroup2ScanTest/app_resources/shaderCommon.hlsl create mode 100644 74a_Workgroup2ScanTest/app_resources/testSubgroup.comp.hlsl create mode 100644 74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl create mode 100644 74a_Workgroup2ScanTest/config.json.template create mode 100644 74a_Workgroup2ScanTest/main.cpp create mode 100644 74a_Workgroup2ScanTest/pipeline.groovy diff --git a/74a_Workgroup2ScanTest/CMakeLists.txt b/74a_Workgroup2ScanTest/CMakeLists.txt new file mode 100644 index 000000000..0724366c9 --- /dev/null +++ b/74a_Workgroup2ScanTest/CMakeLists.txt @@ -0,0 +1,25 @@ + +include(common RESULT_VARIABLE RES) +if(NOT RES) + message(FATAL_ERROR "common.cmake not found. 
Should be in {repo_root}/cmake directory") +endif() + +nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") + +if(NBL_EMBED_BUILTIN_RESOURCES) + set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData) + set(RESOURCE_DIR "app_resources") + + get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE) + + file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*") + foreach(RES_FILE ${BUILTIN_RESOURCE_FILES}) + LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}") + endforeach() + + ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}") + + LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_}) +endif() \ No newline at end of file diff --git a/74a_Workgroup2ScanTest/app_resources/common.hlsl b/74a_Workgroup2ScanTest/app_resources/common.hlsl new file mode 100644 index 000000000..10892a2b9 --- /dev/null +++ b/74a_Workgroup2ScanTest/app_resources/common.hlsl @@ -0,0 +1,96 @@ +#include "nbl/builtin/hlsl/cpp_compat.hlsl" +#include "nbl/builtin/hlsl/functional.hlsl" + +template +struct Output +{ + NBL_CONSTEXPR_STATIC_INLINE uint32_t ScanElementCount = kScanElementCount; + + uint32_t subgroupSize; + uint32_t data[ScanElementCount]; +}; + +// Thanks to our unified HLSL/C++ STD lib we're able to remove a whole load of code +template +struct bit_and : nbl::hlsl::bit_and +{ + using base_t = nbl::hlsl::bit_and; + + NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 0; +#ifndef __HLSL_VERSION + static inline constexpr const char* name = "bit_and"; +#endif +}; +template 
+struct bit_or : nbl::hlsl::bit_or +{ + using base_t = nbl::hlsl::bit_or; + + NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 1; +#ifndef __HLSL_VERSION + static inline constexpr const char* name = "bit_xor"; +#endif +}; +template +struct bit_xor : nbl::hlsl::bit_xor +{ + using base_t = nbl::hlsl::bit_xor; + + NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 2; +#ifndef __HLSL_VERSION + static inline constexpr const char* name = "bit_or"; +#endif +}; +template +struct plus : nbl::hlsl::plus +{ + using base_t = nbl::hlsl::plus; + + NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 3; +#ifndef __HLSL_VERSION + static inline constexpr const char* name = "plus"; +#endif +}; +template +struct multiplies : nbl::hlsl::multiplies +{ + using base_t = nbl::hlsl::multiplies; + + NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 4; +#ifndef __HLSL_VERSION + static inline constexpr const char* name = "multiplies"; +#endif +}; +template +struct minimum : nbl::hlsl::minimum +{ + using base_t = nbl::hlsl::minimum; + + NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 5; +#ifndef __HLSL_VERSION + static inline constexpr const char* name = "minimum"; +#endif +}; +template +struct maximum : nbl::hlsl::maximum +{ + using base_t = nbl::hlsl::maximum; + + NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 6; +#ifndef __HLSL_VERSION + static inline constexpr const char* name = "maximum"; +#endif +}; + +template +struct ballot : nbl::hlsl::plus +{ + using base_t = nbl::hlsl::plus; + + NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 7; +#ifndef __HLSL_VERSION + static inline constexpr const char* name = "bitcount"; +#endif +}; + +#include "nbl/builtin/hlsl/subgroup/basic.hlsl" \ No newline at end of file diff --git a/74a_Workgroup2ScanTest/app_resources/shaderCommon.hlsl b/74a_Workgroup2ScanTest/app_resources/shaderCommon.hlsl new file mode 100644 index 000000000..13ee8d21e --- /dev/null +++ b/74a_Workgroup2ScanTest/app_resources/shaderCommon.hlsl @@ -0,0 +1,55 @@ 
+#include "common.hlsl" + +#include "nbl/builtin/hlsl/glsl_compat/core.hlsl" +#include "nbl/builtin/hlsl/subgroup/basic.hlsl" +#include "nbl/builtin/hlsl/subgroup/arithmetic_portability.hlsl" + +#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl" + +// https://github.com/microsoft/DirectXShaderCompiler/issues/6144 +uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);} + +// unfortunately DXC chokes on descriptors as static members +// https://github.com/microsoft/DirectXShaderCompiler/issues/5940 +[[vk::binding(0, 0)]] StructuredBuffer inputValue; +[[vk::binding(1, 0)]] RWByteAddressBuffer output[8]; + +// because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way +uint32_t globalIndex(); +// since we test ITEMS_PER_WG class binop> +static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) +{ + if (globalIndex()==0u) + output[binop::BindingIndex].template Store(0,nbl::hlsl::glsl::gl_SubgroupSize()); + + operation_t::base_t,nbl::hlsl::jit::device_capabilities> func; + if (canStore()) + output[binop::BindingIndex].template Store(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func(sourceVal)); +} + + +type_t test() +{ + const type_t sourceVal = inputValue[globalIndex()]; + + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + return sourceVal; +} + +#include "nbl/builtin/hlsl/workgroup/basic.hlsl" \ No newline at end of file diff --git a/74a_Workgroup2ScanTest/app_resources/testSubgroup.comp.hlsl b/74a_Workgroup2ScanTest/app_resources/testSubgroup.comp.hlsl new file mode 100644 index 000000000..479265d73 --- /dev/null +++ b/74a_Workgroup2ScanTest/app_resources/testSubgroup.comp.hlsl @@ -0,0 +1,18 @@ +#pragma shader_stage(compute) + +#define operation_t nbl::hlsl::OPERATION + +#include "shaderCommon.hlsl" + +uint32_t globalIndex() +{ + return 
nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex(); +} + +bool canStore() {return true;} + +[numthreads(WORKGROUP_SIZE,1,1)] +void main() +{ + test(); +} \ No newline at end of file diff --git a/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl b/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl new file mode 100644 index 000000000..9bafae47f --- /dev/null +++ b/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl @@ -0,0 +1,107 @@ +#pragma shader_stage(compute) + + +#include "nbl/builtin/hlsl/workgroup/scratch_size.hlsl" + +static const uint32_t ArithmeticSz = nbl::hlsl::workgroup::scratch_size_arithmetic::value; +static const uint32_t BallotSz = nbl::hlsl::workgroup::scratch_size_ballot::value; +static const uint32_t ScratchSz = ArithmeticSz+BallotSz; + +// TODO: Can we make it a static variable in the ScratchProxy struct? +groupshared uint32_t scratch[ScratchSz]; + + +#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl" + + +template +struct ScratchProxy +{ + void get(const uint32_t ix, NBL_REF_ARG(uint32_t) value) + { + value = scratch[ix+offset]; + } + void set(const uint32_t ix, const uint32_t value) + { + scratch[ix+offset] = value; + } + + uint32_t atomicOr(const uint32_t ix, const uint32_t value) + { + return nbl::hlsl::glsl::atomicOr(scratch[ix],value); + } + + void workgroupExecutionAndMemoryBarrier() + { + nbl::hlsl::glsl::barrier(); + //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above + } +}; + +static ScratchProxy<0> arithmeticAccessor; + + +#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl" + + +template +struct operation_t +{ + using type_t = typename Binop::type_t; + + type_t operator()(type_t value) + { + type_t retval = nbl::hlsl::OPERATION::template __call >(value,arithmeticAccessor); + // we barrier before because we alias the accessors for Binop + arithmeticAccessor.workgroupExecutionAndMemoryBarrier(); + return retval; + } +}; + + +#include 
"shaderCommon.hlsl" + +static ScratchProxy ballotAccessor; + + +uint32_t globalIndex() +{ + return nbl::hlsl::glsl::gl_WorkGroupID().x*ITEMS_PER_WG+nbl::hlsl::workgroup::SubgroupContiguousIndex(); +} + +bool canStore() +{ + return nbl::hlsl::workgroup::SubgroupContiguousIndex()::BindingIndex].template Store(0,nbl::hlsl::glsl::gl_SubgroupSize()); + + // we can only ballot booleans, so low bit + nbl::hlsl::workgroup::ballot >(bool(sourceVal & 0x1u), ballotAccessor); + // need to barrier between ballot and usages of a ballot by myself + ballotAccessor.workgroupExecutionAndMemoryBarrier(); + + uint32_t destVal = 0xdeadbeefu; +#define CONSTEXPR_OP_TYPE_TEST(IS_OP) nbl::hlsl::is_same,0x45>,nbl::hlsl::workgroup::IS_OP,0x45> >::value +#define BALLOT_TEMPLATE_ARGS ITEMS_PER_WG,decltype(ballotAccessor),decltype(arithmeticAccessor),nbl::hlsl::jit::device_capabilities + if (CONSTEXPR_OP_TYPE_TEST(reduction)) + destVal = nbl::hlsl::workgroup::ballotBitCount(ballotAccessor,arithmeticAccessor); + else if (CONSTEXPR_OP_TYPE_TEST(inclusive_scan)) + destVal = nbl::hlsl::workgroup::ballotInclusiveBitCount(ballotAccessor,arithmeticAccessor); + else if (CONSTEXPR_OP_TYPE_TEST(exclusive_scan)) + destVal = nbl::hlsl::workgroup::ballotExclusiveBitCount(ballotAccessor,arithmeticAccessor); + else + { + assert(false); + } +#undef BALLOT_TEMPLATE_ARGS +#undef CONSTEXPR_OP_TYPE_TEST + + if (canStore()) + output[ballot::BindingIndex].template Store(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),destVal); +} \ No newline at end of file diff --git a/74a_Workgroup2ScanTest/config.json.template b/74a_Workgroup2ScanTest/config.json.template new file mode 100644 index 000000000..f961745c1 --- /dev/null +++ b/74a_Workgroup2ScanTest/config.json.template @@ -0,0 +1,28 @@ +{ + "enableParallelBuild": true, + "threadsPerBuildProcess" : 2, + "isExecuted": false, + "scriptPath": "", + "cmake": { + "configurations": [ "Release", "Debug", "RelWithDebInfo" ], + "buildModes": [], + "requiredOptions": [] + }, + 
"profiles": [ + { + "backend": "vulkan", + "platform": "windows", + "buildModes": [], + "runConfiguration": "Release", + "gpuArchitectures": [] + } + ], + "dependencies": [], + "data": [ + { + "dependencies": [], + "command": [""], + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/74a_Workgroup2ScanTest/main.cpp b/74a_Workgroup2ScanTest/main.cpp new file mode 100644 index 000000000..147d231e2 --- /dev/null +++ b/74a_Workgroup2ScanTest/main.cpp @@ -0,0 +1,462 @@ +#include "nbl/application_templates/BasicMultiQueueApplication.hpp" +#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" +#include "app_resources/common.hlsl" + +using namespace nbl; +using namespace core; +using namespace asset; +using namespace system; +using namespace video; + +// method emulations on the CPU, to verify the results of the GPU methods +template +struct emulatedReduction +{ + using type_t = typename Binop::type_t; + + static inline void impl(type_t* out, const type_t* in, const uint32_t itemCount) + { + const type_t red = std::reduce(in,in+itemCount,Binop::identity,Binop()); + std::fill(out,out+itemCount,red); + } + + static inline constexpr const char* name = "reduction"; +}; +template +struct emulatedScanInclusive +{ + using type_t = typename Binop::type_t; + + static inline void impl(type_t* out, const type_t* in, const uint32_t itemCount) + { + std::inclusive_scan(in,in+itemCount,out,Binop()); + } + static inline constexpr const char* name = "inclusive_scan"; +}; +template +struct emulatedScanExclusive +{ + using type_t = typename Binop::type_t; + + static inline void impl(type_t* out, const type_t* in, const uint32_t itemCount) + { + std::exclusive_scan(in,in+itemCount,out,Binop::identity,Binop()); + } + static inline constexpr const char* name = "exclusive_scan"; +}; + +class ArithmeticUnitTestApp final : public application_templates::BasicMultiQueueApplication, public 
application_templates::MonoAssetManagerAndBuiltinResourceApplication +{ + using device_base_t = application_templates::BasicMultiQueueApplication; + using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; + +public: + ArithmeticUnitTestApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : + system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} + + bool onAppInitialized(smart_refctd_ptr&& system) override + { + if (!device_base_t::onAppInitialized(std::move(system))) + return false; + if (!asset_base_t::onAppInitialized(std::move(system))) + return false; + + transferDownQueue = getTransferDownQueue(); + computeQueue = getComputeQueue(); + + // TODO: get the element count from argv + const uint32_t elementCount = Output<>::ScanElementCount; + // populate our random data buffer on the CPU and create a GPU copy + inputData = new uint32_t[elementCount]; + smart_refctd_ptr gpuinputDataBuffer; + { + std::mt19937 randGenerator(0xdeadbeefu); + for (uint32_t i = 0u; i < elementCount; i++) + inputData[i] = randGenerator(); // TODO: change to using xoroshiro, then we can skip having the input buffer at all + + IGPUBuffer::SCreationParams inputDataBufferCreationParams = {}; + inputDataBufferCreationParams.size = sizeof(Output<>::data[0]) * elementCount; + inputDataBufferCreationParams.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT; + m_utils->createFilledDeviceLocalBufferOnDedMem( + SIntendedSubmitInfo{.queue=getTransferUpQueue()}, + std::move(inputDataBufferCreationParams), + inputData + ).move_into(gpuinputDataBuffer); + } + + // create 8 buffers for 8 operations + for (auto i=0u; igetSize(); + params.usage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_SRC_BIT; + + outputBuffers[i] = m_device->createBuffer(std::move(params)); + auto mreq = 
outputBuffers[i]->getMemoryReqs(); + mreq.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits(); + assert(mreq.memoryTypeBits); + + auto bufferMem = m_device->allocate(mreq, outputBuffers[i].get()); + assert(bufferMem.isValid()); + } + + // create Descriptor Set and Pipeline Layout + { + // create Descriptor Set Layout + smart_refctd_ptr dsLayout; + { + IGPUDescriptorSetLayout::SBinding binding[2]; + for (uint32_t i = 0u; i < 2; i++) + binding[i] = {{},i,IDescriptor::E_TYPE::ET_STORAGE_BUFFER,IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,IShader::E_SHADER_STAGE::ESS_COMPUTE,1u,nullptr }; + binding[1].count = OutputBufferCount; + dsLayout = m_device->createDescriptorSetLayout(binding); + } + + // set and transient pool + auto descPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE,{&dsLayout.get(),1}); + descriptorSet = descPool->createDescriptorSet(smart_refctd_ptr(dsLayout)); + { + IGPUDescriptorSet::SDescriptorInfo infos[1+OutputBufferCount]; + infos[0].desc = gpuinputDataBuffer; + infos[0].info.buffer = { 0u,gpuinputDataBuffer->getSize() }; + for (uint32_t i = 1u; i <= OutputBufferCount; i++) + { + auto buff = outputBuffers[i - 1]; + infos[i].info.buffer = { 0u,buff->getSize() }; + infos[i].desc = std::move(buff); // save an atomic in the refcount + + } + + IGPUDescriptorSet::SWriteDescriptorSet writes[2]; + for (uint32_t i=0u; i<2; i++) + writes[i] = {descriptorSet.get(),i,0u,1u,infos+i}; + writes[1].count = OutputBufferCount; + + m_device->updateDescriptorSets(2, writes, 0u, nullptr); + } + + pipelineLayout = m_device->createPipelineLayout({},std::move(dsLayout)); + } + + const auto spirv_isa_cache_path = localOutputCWD/"spirv_isa_cache.bin"; + // enclose to make sure file goes out of scope and we can reopen it + { + smart_refctd_ptr spirv_isa_cache_input; + // try to load SPIR-V to ISA cache + { + ISystem::future_t> fileCreate; + 
m_system->createFile(fileCreate,spirv_isa_cache_path,IFile::ECF_READ|IFile::ECF_MAPPABLE|IFile::ECF_COHERENT); + if (auto lock=fileCreate.acquire()) + spirv_isa_cache_input = *lock; + } + // create the cache + { + std::span spirv_isa_cache_data = {}; + if (spirv_isa_cache_input) + spirv_isa_cache_data = {reinterpret_cast(spirv_isa_cache_input->getMappedPointer()),spirv_isa_cache_input->getSize()}; + else + m_logger->log("Failed to load SPIR-V 2 ISA cache!",ILogger::ELL_PERFORMANCE); + // Normally we'd deserialize a `ICPUPipelineCache` properly and pass that instead + m_spirv_isa_cache = m_device->createPipelineCache(spirv_isa_cache_data); + } + } + { + // TODO: rename `deleteDirectory` to just `delete`? and a `IFile::setSize()` ? + m_system->deleteDirectory(spirv_isa_cache_path); + ISystem::future_t> fileCreate; + m_system->createFile(fileCreate,spirv_isa_cache_path,IFile::ECF_WRITE); + // I can be relatively sure I'll succeed to acquire the future, the pointer to created file might be null though. 
+ m_spirv_isa_cache_output=*fileCreate.acquire(); + if (!m_spirv_isa_cache_output) + logFail("Failed to Create SPIR-V to ISA cache file."); + } + + // load shader source from file + auto getShaderSource = [&](const char* filePath) -> auto + { + IAssetLoader::SAssetLoadParams lparams = {}; + lparams.logger = m_logger.get(); + lparams.workingDirectory = ""; + auto bundle = m_assetMgr->getAsset(filePath, lparams); + if (bundle.getContents().empty() || bundle.getAssetType()!=IAsset::ET_SHADER) + { + m_logger->log("Shader %s not found!", ILogger::ELL_ERROR, filePath); + exit(-1); + } + auto firstAssetInBundle = bundle.getContents()[0]; + return smart_refctd_ptr_static_cast(firstAssetInBundle); + }; + + auto subgroupTestSource = getShaderSource("app_resources/testSubgroup.comp.hlsl"); + auto workgroupTestSource = getShaderSource("app_resources/testWorkgroup.comp.hlsl"); + // now create or retrieve final resources to run our tests + sema = m_device->createSemaphore(timelineValue); + resultsBuffer = ICPUBuffer::create({ outputBuffers[0]->getSize() }); + { + smart_refctd_ptr cmdpool = m_device->createCommandPool(computeQueue->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{&cmdbuf,1})) + { + logFail("Failed to create Command Buffers!\n"); + return false; + } + } + + const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations; + const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize; + const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize; + for (auto subgroupSize=MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u) + { + const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize); + for (uint32_t workgroupSize = subgroupSize; workgroupSize <= MaxWorkgroupSize; workgroupSize += subgroupSize) + { + // make sure renderdoc captures everything for debugging + m_api->startCapture(); + 
m_logger->log("Testing Workgroup Size %u with Subgroup Size %u", ILogger::ELL_INFO, workgroupSize, subgroupSize); + + bool passed = true; + // TODO async the testing + passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed; + logTestOutcome(passed, workgroupSize); + passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed; + logTestOutcome(passed, workgroupSize); + passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed; + logTestOutcome(passed, workgroupSize); + for (uint32_t itemsPerWG = workgroupSize; itemsPerWG > workgroupSize - subgroupSize; itemsPerWG--) + { + m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG); + passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; + logTestOutcome(passed, itemsPerWG); + passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; + logTestOutcome(passed, itemsPerWG); + passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; + logTestOutcome(passed, itemsPerWG); + } + m_api->endCapture(); + + // save cache every now and then + { + auto cpu = m_spirv_isa_cache->convertToCPUCache(); + // Normally we'd beautifully JSON serialize the thing, allow multiple devices & drivers + metadata + auto bin = cpu->getEntries().begin()->second.bin; + IFile::success_t success; + m_spirv_isa_cache_output->write(success,bin->data(),0ull,bin->size()); + if (!success) + logFail("Could not write Create SPIR-V to ISA cache to disk!"); + } + } + } + + return true; + } + + virtual bool onAppTerminated() override + { + m_logger->log("==========Result==========", ILogger::ELL_INFO); + m_logger->log("Fail Count: %u", ILogger::ELL_INFO, totalFailCount); + delete[] inputData; + return true; + } + + // the unit test is carried out on init + void workLoopBody() override {} + + 
// + bool keepRunning() override { return false; } + +private: + void logTestOutcome(bool passed, uint32_t workgroupSize) + { + if (passed) + m_logger->log("Passed test #%u", ILogger::ELL_INFO, workgroupSize); + else + { + totalFailCount++; + m_logger->log("Failed test #%u", ILogger::ELL_ERROR, workgroupSize); + } + } + + // create pipeline (specialized every test) [TODO: turn into a future/async] + smart_refctd_ptr createPipeline(const ICPUShader* overridenUnspecialized, const uint8_t subgroupSizeLog2) + { + auto shader = m_device->createShader(overridenUnspecialized); + IGPUComputePipeline::SCreationParams params = {}; + params.layout = pipelineLayout.get(); + params.shader = { + .entryPoint = "main", + .shader = shader.get(), + .entries = nullptr, + .requiredSubgroupSize = static_cast(subgroupSizeLog2), + .requireFullSubgroups = true + }; + core::smart_refctd_ptr pipeline; + if (!m_device->createComputePipelines(m_spirv_isa_cache.get(),{¶ms,1},&pipeline)) + return nullptr; + return pipeline; + } + + /*template class Arithmetic, bool WorkgroupTest> + bool runTest(const smart_refctd_ptr& source, const uint32_t elementCount, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u) + { + return true; + }*/ + + template class Arithmetic, bool WorkgroupTest> + bool runTest(const smart_refctd_ptr& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u) + { + std::string arith_name = Arithmetic>::name; + + smart_refctd_ptr overridenUnspecialized; + if constexpr (WorkgroupTest) + { + overridenUnspecialized = CHLSLCompiler::createOverridenCopy( + source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n#define ITEMS_PER_WG %d\n", + (("workgroup::") + arith_name).c_str(), workgroupSize, itemsPerWG + ); + } + else + { + itemsPerWG = workgroupSize; + overridenUnspecialized = CHLSLCompiler::createOverridenCopy( + source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n", + (("subgroup::") + 
arith_name).c_str(), workgroupSize + ); + } + auto pipeline = createPipeline(overridenUnspecialized.get(),subgroupSizeLog2); + + // TODO: overlap dispatches with memory readbacks (requires multiple copies of `buffers`) + const uint32_t workgroupCount = elementCount / itemsPerWG; + cmdbuf->begin(IGPUCommandBuffer::USAGE::NONE); + cmdbuf->bindComputePipeline(pipeline.get()); + cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &descriptorSet.get()); + cmdbuf->dispatch(workgroupCount, 1, 1); + { + IGPUCommandBuffer::SPipelineBarrierDependencyInfo::buffer_barrier_t memoryBarrier[OutputBufferCount]; + for (auto i=0u; igetSize(),outputBuffers[i]} + }; + } + IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = {.memBarriers={},.bufBarriers=memoryBarrier}; + cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,info); + } + cmdbuf->end(); + + const IQueue::SSubmitInfo::SSemaphoreInfo signal[1] = {{.semaphore=sema.get(),.value=++timelineValue}}; + const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[1] = {{.cmdbuf=cmdbuf.get()}}; + const IQueue::SSubmitInfo submits[1] = {{.commandBuffers=cmdbufs,.signalSemaphores=signal}}; + computeQueue->submit(submits); + const ISemaphore::SWaitInfo wait[1] = {{.semaphore=sema.get(),.value=timelineValue}}; + m_device->blockForSemaphores(wait); + + // check results + bool passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount); + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; + if constexpr (WorkgroupTest) + passed = validateResults, WorkgroupTest>(itemsPerWG, 
workgroupCount) && passed; + + return passed; + } + + //returns true if result matches + template class Arithmetic, class Binop, bool WorkgroupTest> + bool validateResults(const uint32_t itemsPerWG, const uint32_t workgroupCount) + { + bool success = true; + + // download data + const SBufferRange bufferRange = {0u, resultsBuffer->getSize(), outputBuffers[Binop::BindingIndex]}; + m_utils->downloadBufferRangeViaStagingBufferAutoSubmit(SIntendedSubmitInfo{.queue=transferDownQueue},bufferRange,resultsBuffer->getPointer()); + + using type_t = typename Binop::type_t; + const auto dataFromBuffer = reinterpret_cast(resultsBuffer->getPointer()); + const auto subgroupSize = dataFromBuffer[0]; + if (subgroupSizenbl::hlsl::subgroup::MaxSubgroupSize) + { + m_logger->log("Unexpected Subgroup Size %u", ILogger::ELL_ERROR, subgroupSize); + return false; + } + + const auto testData = reinterpret_cast(dataFromBuffer + 1); + // TODO: parallel for (the temporary values need to be threadlocal or what?) + // now check if the data obtained has valid values + type_t* tmp = new type_t[itemsPerWG]; + type_t* ballotInput = new type_t[itemsPerWG]; + for (uint32_t workgroupID = 0u; success && workgroupID < workgroupCount; workgroupID++) + { + const auto workgroupOffset = workgroupID * itemsPerWG; + + if constexpr (WorkgroupTest) + { + if constexpr (std::is_same_v, Binop>) + { + for (auto i = 0u; i < itemsPerWG; i++) + ballotInput[i] = inputData[i + workgroupOffset] & 0x1u; + Arithmetic::impl(tmp, ballotInput, itemsPerWG); + } + else + Arithmetic::impl(tmp, inputData + workgroupOffset, itemsPerWG); + } + else + { + for (uint32_t pseudoSubgroupID = 0u; pseudoSubgroupID < itemsPerWG; pseudoSubgroupID += subgroupSize) + Arithmetic::impl(tmp + pseudoSubgroupID, inputData + workgroupOffset + pseudoSubgroupID, subgroupSize); + } + + for (uint32_t localInvocationIndex = 0u; localInvocationIndex < itemsPerWG; localInvocationIndex++) + { + const auto globalInvocationIndex = workgroupOffset + 
localInvocationIndex; + const auto cpuVal = tmp[localInvocationIndex]; + const auto gpuVal = testData[globalInvocationIndex]; + if (cpuVal != gpuVal) + { + m_logger->log( + "Failed test #%d (%s) (%s) Expected %u got %u for workgroup %d and localinvoc %d", + ILogger::ELL_ERROR, itemsPerWG, WorkgroupTest ? "workgroup" : "subgroup", Binop::name, + cpuVal, gpuVal, workgroupID, localInvocationIndex + ); + success = false; + break; + } + } + } + delete[] ballotInput; + delete[] tmp; + + return success; + } + + IQueue* transferDownQueue; + IQueue* computeQueue; + smart_refctd_ptr m_spirv_isa_cache; + smart_refctd_ptr m_spirv_isa_cache_output; + + uint32_t* inputData = nullptr; + constexpr static inline uint32_t OutputBufferCount = 8u; + smart_refctd_ptr outputBuffers[OutputBufferCount]; + smart_refctd_ptr descriptorSet; + smart_refctd_ptr pipelineLayout; + + smart_refctd_ptr sema; + uint64_t timelineValue = 0; + smart_refctd_ptr cmdbuf; + smart_refctd_ptr resultsBuffer; + + uint32_t totalFailCount = 0; +}; + +NBL_MAIN_FUNC(ArithmeticUnitTestApp) \ No newline at end of file diff --git a/74a_Workgroup2ScanTest/pipeline.groovy b/74a_Workgroup2ScanTest/pipeline.groovy new file mode 100644 index 000000000..7ea9947e0 --- /dev/null +++ b/74a_Workgroup2ScanTest/pipeline.groovy @@ -0,0 +1,50 @@ +import org.DevshGraphicsProgramming.Agent +import org.DevshGraphicsProgramming.BuilderInfo +import org.DevshGraphicsProgramming.IBuilder + +class CArithemticUnitTestBuilder extends IBuilder +{ + public CArithemticUnitTestBuilder(Agent _agent, _info) + { + super(_agent, _info) + } + + @Override + public boolean prepare(Map axisMapping) + { + return true + } + + @Override + public boolean build(Map axisMapping) + { + IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION") + IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE") + + def nameOfBuildDirectory = getNameOfBuildDirectory(buildType) + def nameOfConfig = getNameOfConfig(config) + + agent.execute("cmake --build 
${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v") + + return true + } + + @Override + public boolean test(Map axisMapping) + { + return true + } + + @Override + public boolean install(Map axisMapping) + { + return true + } +} + +def create(Agent _agent, _info) +{ + return new CArithemticUnitTestBuilder(_agent, _info) +} + +return this \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 59fe4ea46..5d7369560 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -92,6 +92,7 @@ if(NBL_BUILD_EXAMPLES) add_subdirectory(71_RayTracingPipeline EXCLUDE_FROM_ALL) add_subdirectory(73_ArithmeticBench EXCLUDE_FROM_ALL) + add_subdirectory(74a_Workgroup2ScanTest EXCLUDE_FROM_ALL) NBL_HOOK_COMMON_API("${NBL_COMMON_API_TARGETS}") endif() From 750b3d2094484bca2e0c92f03277e0861b1adb77 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 28 Apr 2025 17:03:04 +0700 Subject: [PATCH 148/296] working? 
test for workgroup2 reduce --- .../app_resources/shaderCommon.hlsl | 49 ++-- .../app_resources/testWorkgroup.comp.hlsl | 255 +++++++++++++----- 74a_Workgroup2ScanTest/main.cpp | 192 +++++++------ 3 files changed, 306 insertions(+), 190 deletions(-) diff --git a/74a_Workgroup2ScanTest/app_resources/shaderCommon.hlsl b/74a_Workgroup2ScanTest/app_resources/shaderCommon.hlsl index 13ee8d21e..79bf74e71 100644 --- a/74a_Workgroup2ScanTest/app_resources/shaderCommon.hlsl +++ b/74a_Workgroup2ScanTest/app_resources/shaderCommon.hlsl @@ -2,16 +2,22 @@ #include "nbl/builtin/hlsl/glsl_compat/core.hlsl" #include "nbl/builtin/hlsl/subgroup/basic.hlsl" -#include "nbl/builtin/hlsl/subgroup/arithmetic_portability.hlsl" +#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" #include "nbl/builtin/hlsl/jit/device_capabilities.hlsl" // https://github.com/microsoft/DirectXShaderCompiler/issues/6144 uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);} +#ifndef ITEMS_PER_INVOCATION +#error "Define ITEMS_PER_INVOCATION!" 
+#endif + +typedef vector type_t; + // unfortunately DXC chokes on descriptors as static members // https://github.com/microsoft/DirectXShaderCompiler/issues/5940 -[[vk::binding(0, 0)]] StructuredBuffer inputValue; +[[vk::binding(0, 0)]] StructuredBuffer inputValue; [[vk::binding(1, 0)]] RWByteAddressBuffer output[8]; // because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way @@ -19,37 +25,36 @@ uint32_t globalIndex(); // since we test ITEMS_PER_WG class binop> static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) { - if (globalIndex()==0u) - output[binop::BindingIndex].template Store(0,nbl::hlsl::glsl::gl_SubgroupSize()); - - operation_t::base_t,nbl::hlsl::jit::device_capabilities> func; - if (canStore()) - output[binop::BindingIndex].template Store(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func(sourceVal)); + if (globalIndex()==0u) + output[binop::BindingIndex].template Store(0,nbl::hlsl::glsl::gl_SubgroupSize()); + + operation_t::base_t,nbl::hlsl::jit::device_capabilities> func; + if (canStore()) + output[binop::BindingIndex].template Store(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func(sourceVal)); } type_t test() { - const type_t sourceVal = inputValue[globalIndex()]; - - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - return sourceVal; + const type_t sourceVal = inputValue[globalIndex()]; + + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + return sourceVal; } #include "nbl/builtin/hlsl/workgroup/basic.hlsl" \ No newline at end of file diff --git a/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl b/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl index 9bafae47f..315550da0 100644 --- a/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl +++ 
b/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl @@ -3,105 +3,222 @@ #include "nbl/builtin/hlsl/workgroup/scratch_size.hlsl" -static const uint32_t ArithmeticSz = nbl::hlsl::workgroup::scratch_size_arithmetic::value; -static const uint32_t BallotSz = nbl::hlsl::workgroup::scratch_size_ballot::value; -static const uint32_t ScratchSz = ArithmeticSz+BallotSz; +// static const uint32_t ArithmeticSz = nbl::hlsl::workgroup::scratch_size_arithmetic::value; +// static const uint32_t BallotSz = nbl::hlsl::workgroup::scratch_size_ballot::value; +// static const uint32_t ScratchSz = ArithmeticSz+BallotSz; // TODO: Can we make it a static variable in the ScratchProxy struct? -groupshared uint32_t scratch[ScratchSz]; +// groupshared uint32_t ballotScratch[ScratchSz]; // TODO probably remove, not balloting -#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl" +#include "nbl/builtin/hlsl/workgroup2/arithmetic.hlsl" +#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl" + +#include "nbl/builtin/hlsl/glsl_compat/core.hlsl" +#include "nbl/builtin/hlsl/subgroup/basic.hlsl" +#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" + +#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl" + +#include "common.hlsl" + +// https://github.com/microsoft/DirectXShaderCompiler/issues/6144 +uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);} + +#define ITEMS_PER_INVOCATION 1 + +#ifndef ITEMS_PER_INVOCATION +#error "Define ITEMS_PER_INVOCATION!" 
+#endif + +typedef vector type_t; -template +// unfortunately DXC chokes on descriptors as static members +// https://github.com/microsoft/DirectXShaderCompiler/issues/5940 +[[vk::binding(0, 0)]] StructuredBuffer inputValue; +[[vk::binding(1, 0)]] RWByteAddressBuffer output[8]; + +// because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way +uint32_t globalIndex(); +// since we test ITEMS_PER_WG; + +groupshared vector scratch[config_t::SubgroupSize]; // final (level 1) scan needs to fit in one subgroup exactly + +template struct ScratchProxy { - void get(const uint32_t ix, NBL_REF_ARG(uint32_t) value) - { - value = scratch[ix+offset]; - } - void set(const uint32_t ix, const uint32_t value) - { - scratch[ix+offset] = value; - } - - uint32_t atomicOr(const uint32_t ix, const uint32_t value) - { - return nbl::hlsl::glsl::atomicOr(scratch[ix],value); - } - - void workgroupExecutionAndMemoryBarrier() - { - nbl::hlsl::glsl::barrier(); - //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above - } + using stype_t = vector; + + stype_t get(const uint32_t ix) + { + return scratch[ix]; + } + void set(const uint32_t ix, const stype_t value) + { + scratch[ix] = value; + } + + stype_t atomicOr(const uint32_t ix, const stype_t value) + { + return nbl::hlsl::glsl::atomicOr(scratch[ix],value); + } + + void workgroupExecutionAndMemoryBarrier() + { + nbl::hlsl::glsl::barrier(); + //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above + } }; -static ScratchProxy<0> arithmeticAccessor; - - -#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl" +template +struct DataProxy +{ + using dtype_t = vector; + static_assert(nbl::hlsl::is_same_v); + + dtype_t get(const uint32_t ix) + { + return inputValue[ix]; + } + void set(const uint32_t ix, const dtype_t value) + { + // inputValue[ix] = value; + output[Binop::BindingIndex].template Store(sizeof(uint32_t) + sizeof(type_t) * globalIndex(), value); + 
output[Binop::BindingIndex].template Store(sizeof(uint32_t) + sizeof(type_t) * ix, value); + } + + void workgroupExecutionAndMemoryBarrier() + { + nbl::hlsl::glsl::barrier(); + //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above + } +}; +static ScratchProxy arithmeticAccessor; template struct operation_t { - using type_t = typename Binop::type_t; - - type_t operator()(type_t value) - { - type_t retval = nbl::hlsl::OPERATION::template __call >(value,arithmeticAccessor); - // we barrier before because we alias the accessors for Binop - arithmeticAccessor.workgroupExecutionAndMemoryBarrier(); - return retval; - } + using binop_base_t = typename Binop::base_t; + using otype_t = typename Binop::type_t; + + void operator()() + { + DataProxy dataAccessor; + nbl::hlsl::OPERATION::template __call, ScratchProxy >(dataAccessor,arithmeticAccessor); + // we barrier before because we alias the accessors for Binop + arithmeticAccessor.workgroupExecutionAndMemoryBarrier(); + // return retval; + } }; -#include "shaderCommon.hlsl" +template class binop, typename T, uint32_t N> +static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) +{ + if (globalIndex()==0u) + output[binop::BindingIndex].template Store(0,nbl::hlsl::glsl::gl_SubgroupSize()); + + operation_t,nbl::hlsl::jit::device_capabilities> func; + // if (canStore()) + // output[binop::BindingIndex].template Store(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func()); + func(); // store is done with data accessor now +} + + +type_t test() +{ + const type_t sourceVal = inputValue[globalIndex()]; + + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + return sourceVal; +} + + +// template +// struct BallotProxy +// { +// void get(const uint32_t ix, NBL_REF_ARG(uint32_t) value) +// { +// value = ballotScratch[ix+offset]; +// } +// void set(const uint32_t ix, const uint32_t value) +// { +// ballotScratch[ix+offset] = 
value; +// } + +// uint32_t atomicOr(const uint32_t ix, const uint32_t value) +// { +// return nbl::hlsl::glsl::atomicOr(ballotScratch[ix],value); +// } + +// void workgroupExecutionAndMemoryBarrier() +// { +// nbl::hlsl::glsl::barrier(); +// //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above +// } +// }; -static ScratchProxy ballotAccessor; +// static BallotProxy ballotAccessor; uint32_t globalIndex() { - return nbl::hlsl::glsl::gl_WorkGroupID().x*ITEMS_PER_WG+nbl::hlsl::workgroup::SubgroupContiguousIndex(); + return nbl::hlsl::glsl::gl_WorkGroupID().x*ITEMS_PER_WG+nbl::hlsl::workgroup::SubgroupContiguousIndex(); } bool canStore() { - return nbl::hlsl::workgroup::SubgroupContiguousIndex()::BindingIndex].template Store(0,nbl::hlsl::glsl::gl_SubgroupSize()); - - // we can only ballot booleans, so low bit - nbl::hlsl::workgroup::ballot >(bool(sourceVal & 0x1u), ballotAccessor); - // need to barrier between ballot and usages of a ballot by myself - ballotAccessor.workgroupExecutionAndMemoryBarrier(); - - uint32_t destVal = 0xdeadbeefu; -#define CONSTEXPR_OP_TYPE_TEST(IS_OP) nbl::hlsl::is_same,0x45>,nbl::hlsl::workgroup::IS_OP,0x45> >::value -#define BALLOT_TEMPLATE_ARGS ITEMS_PER_WG,decltype(ballotAccessor),decltype(arithmeticAccessor),nbl::hlsl::jit::device_capabilities - if (CONSTEXPR_OP_TYPE_TEST(reduction)) - destVal = nbl::hlsl::workgroup::ballotBitCount(ballotAccessor,arithmeticAccessor); - else if (CONSTEXPR_OP_TYPE_TEST(inclusive_scan)) - destVal = nbl::hlsl::workgroup::ballotInclusiveBitCount(ballotAccessor,arithmeticAccessor); - else if (CONSTEXPR_OP_TYPE_TEST(exclusive_scan)) - destVal = nbl::hlsl::workgroup::ballotExclusiveBitCount(ballotAccessor,arithmeticAccessor); - else - { - assert(false); - } -#undef BALLOT_TEMPLATE_ARGS -#undef CONSTEXPR_OP_TYPE_TEST - - if (canStore()) - output[ballot::BindingIndex].template Store(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),destVal); + const type_t sourceVal = test(); +// if (globalIndex()==0u) +// 
output[ballot::BindingIndex].template Store(0,nbl::hlsl::glsl::gl_SubgroupSize()); + +// // we can only ballot booleans, so low bit +// nbl::hlsl::workgroup::ballot >(bool(sourceVal & 0x1u), ballotAccessor); +// // need to barrier between ballot and usages of a ballot by myself +// ballotAccessor.workgroupExecutionAndMemoryBarrier(); + +// uint32_t destVal = 0xdeadbeefu; +// #define CONSTEXPR_OP_TYPE_TEST(IS_OP) nbl::hlsl::is_same,0x45>,nbl::hlsl::workgroup::IS_OP,0x45> >::value +// #define BALLOT_TEMPLATE_ARGS ITEMS_PER_WG,decltype(ballotAccessor),decltype(arithmeticAccessor),nbl::hlsl::jit::device_capabilities +// if (CONSTEXPR_OP_TYPE_TEST(reduction)) +// destVal = nbl::hlsl::workgroup::ballotBitCount(ballotAccessor,arithmeticAccessor); +// else if (CONSTEXPR_OP_TYPE_TEST(inclusive_scan)) +// destVal = nbl::hlsl::workgroup::ballotInclusiveBitCount(ballotAccessor,arithmeticAccessor); +// else if (CONSTEXPR_OP_TYPE_TEST(exclusive_scan)) +// destVal = nbl::hlsl::workgroup::ballotExclusiveBitCount(ballotAccessor,arithmeticAccessor); +// else +// { +// assert(false); +// } +// #undef BALLOT_TEMPLATE_ARGS +// #undef CONSTEXPR_OP_TYPE_TEST + +// if (canStore()) +// output[ballot::BindingIndex].template Store(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),destVal); } \ No newline at end of file diff --git a/74a_Workgroup2ScanTest/main.cpp b/74a_Workgroup2ScanTest/main.cpp index 147d231e2..7e11726d6 100644 --- a/74a_Workgroup2ScanTest/main.cpp +++ b/74a_Workgroup2ScanTest/main.cpp @@ -45,13 +45,13 @@ struct emulatedScanExclusive static inline constexpr const char* name = "exclusive_scan"; }; -class ArithmeticUnitTestApp final : public application_templates::BasicMultiQueueApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication +class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueueApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication { using device_base_t = 
application_templates::BasicMultiQueueApplication; using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; public: - ArithmeticUnitTestApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : + Workgroup2ScanTestApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} bool onAppInitialized(smart_refctd_ptr&& system) override @@ -138,38 +138,38 @@ class ArithmeticUnitTestApp final : public application_templates::BasicMultiQueu pipelineLayout = m_device->createPipelineLayout({},std::move(dsLayout)); } - const auto spirv_isa_cache_path = localOutputCWD/"spirv_isa_cache.bin"; - // enclose to make sure file goes out of scope and we can reopen it - { - smart_refctd_ptr spirv_isa_cache_input; - // try to load SPIR-V to ISA cache - { - ISystem::future_t> fileCreate; - m_system->createFile(fileCreate,spirv_isa_cache_path,IFile::ECF_READ|IFile::ECF_MAPPABLE|IFile::ECF_COHERENT); - if (auto lock=fileCreate.acquire()) - spirv_isa_cache_input = *lock; - } - // create the cache - { - std::span spirv_isa_cache_data = {}; - if (spirv_isa_cache_input) - spirv_isa_cache_data = {reinterpret_cast(spirv_isa_cache_input->getMappedPointer()),spirv_isa_cache_input->getSize()}; - else - m_logger->log("Failed to load SPIR-V 2 ISA cache!",ILogger::ELL_PERFORMANCE); - // Normally we'd deserialize a `ICPUPipelineCache` properly and pass that instead - m_spirv_isa_cache = m_device->createPipelineCache(spirv_isa_cache_data); - } - } - { - // TODO: rename `deleteDirectory` to just `delete`? and a `IFile::setSize()` ? 
- m_system->deleteDirectory(spirv_isa_cache_path); - ISystem::future_t> fileCreate; - m_system->createFile(fileCreate,spirv_isa_cache_path,IFile::ECF_WRITE); - // I can be relatively sure I'll succeed to acquire the future, the pointer to created file might be null though. - m_spirv_isa_cache_output=*fileCreate.acquire(); - if (!m_spirv_isa_cache_output) - logFail("Failed to Create SPIR-V to ISA cache file."); - } + //const auto spirv_isa_cache_path = localOutputCWD/"spirv_isa_cache.bin"; + //// enclose to make sure file goes out of scope and we can reopen it + //{ + // smart_refctd_ptr spirv_isa_cache_input; + // // try to load SPIR-V to ISA cache + // { + // ISystem::future_t> fileCreate; + // m_system->createFile(fileCreate,spirv_isa_cache_path,IFile::ECF_READ|IFile::ECF_MAPPABLE|IFile::ECF_COHERENT); + // if (auto lock=fileCreate.acquire()) + // spirv_isa_cache_input = *lock; + // } + // // create the cache + // { + // std::span spirv_isa_cache_data = {}; + // if (spirv_isa_cache_input) + // spirv_isa_cache_data = {reinterpret_cast(spirv_isa_cache_input->getMappedPointer()),spirv_isa_cache_input->getSize()}; + // else + // m_logger->log("Failed to load SPIR-V 2 ISA cache!",ILogger::ELL_PERFORMANCE); + // // Normally we'd deserialize a `ICPUPipelineCache` properly and pass that instead + // m_spirv_isa_cache = m_device->createPipelineCache(spirv_isa_cache_data); + // } + //} + //{ + // // TODO: rename `deleteDirectory` to just `delete`? and a `IFile::setSize()` ? + // m_system->deleteDirectory(spirv_isa_cache_path); + // ISystem::future_t> fileCreate; + // m_system->createFile(fileCreate,spirv_isa_cache_path,IFile::ECF_WRITE); + // // I can be relatively sure I'll succeed to acquire the future, the pointer to created file might be null though. 
+ // m_spirv_isa_cache_output=*fileCreate.acquire(); + // if (!m_spirv_isa_cache_output) + // logFail("Failed to Create SPIR-V to ISA cache file."); + //} // load shader source from file auto getShaderSource = [&](const char* filePath) -> auto @@ -187,7 +187,7 @@ class ArithmeticUnitTestApp final : public application_templates::BasicMultiQueu return smart_refctd_ptr_static_cast(firstAssetInBundle); }; - auto subgroupTestSource = getShaderSource("app_resources/testSubgroup.comp.hlsl"); + //auto subgroupTestSource = getShaderSource("app_resources/testSubgroup.comp.hlsl"); auto workgroupTestSource = getShaderSource("app_resources/testWorkgroup.comp.hlsl"); // now create or retrieve final resources to run our tests sema = m_device->createSemaphore(timelineValue); @@ -202,47 +202,47 @@ class ArithmeticUnitTestApp final : public application_templates::BasicMultiQueu } const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations; + const std::array WorkgroupSizes = { 512, 1024 }; const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize; const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize; for (auto subgroupSize=MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u) { const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize); - for (uint32_t workgroupSize = subgroupSize; workgroupSize <= MaxWorkgroupSize; workgroupSize += subgroupSize) + for (uint32_t i = 0; i < WorkgroupSizes.size(); i++) { + const uint32_t workgroupSize = WorkgroupSizes[i]; // make sure renderdoc captures everything for debugging m_api->startCapture(); m_logger->log("Testing Workgroup Size %u with Subgroup Size %u", ILogger::ELL_INFO, workgroupSize, subgroupSize); bool passed = true; // TODO async the testing - passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed; - logTestOutcome(passed, workgroupSize); - passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, 
workgroupSize) && passed; - logTestOutcome(passed, workgroupSize); - passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed; - logTestOutcome(passed, workgroupSize); - for (uint32_t itemsPerWG = workgroupSize; itemsPerWG > workgroupSize - subgroupSize; itemsPerWG--) - { - m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG); - passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; - logTestOutcome(passed, itemsPerWG); - passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; - logTestOutcome(passed, itemsPerWG); - passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; - logTestOutcome(passed, itemsPerWG); - } + //passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed; + //logTestOutcome(passed, workgroupSize); + //passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed; + //logTestOutcome(passed, workgroupSize); + //passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed; + //logTestOutcome(passed, workgroupSize); + const uint32_t itemsPerWG = workgroupSize; + m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG); + passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; + logTestOutcome(passed, itemsPerWG); + //passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; + //logTestOutcome(passed, itemsPerWG); + //passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; + //logTestOutcome(passed, itemsPerWG); m_api->endCapture(); // save cache every now and then - { - auto cpu = m_spirv_isa_cache->convertToCPUCache(); - // Normally we'd beautifully JSON serialize the thing, 
allow multiple devices & drivers + metadata - auto bin = cpu->getEntries().begin()->second.bin; - IFile::success_t success; - m_spirv_isa_cache_output->write(success,bin->data(),0ull,bin->size()); - if (!success) - logFail("Could not write Create SPIR-V to ISA cache to disk!"); - } + //{ + // auto cpu = m_spirv_isa_cache->convertToCPUCache(); + // // Normally we'd beautifully JSON serialize the thing, allow multiple devices & drivers + metadata + // auto bin = cpu->getEntries().begin()->second.bin; + // IFile::success_t success; + // m_spirv_isa_cache_output->write(success,bin->data(),0ull,bin->size()); + // if (!success) + // logFail("Could not write Create SPIR-V to ISA cache to disk!"); + //} } } @@ -294,33 +294,27 @@ class ArithmeticUnitTestApp final : public application_templates::BasicMultiQueu return pipeline; } - /*template class Arithmetic, bool WorkgroupTest> - bool runTest(const smart_refctd_ptr& source, const uint32_t elementCount, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u) - { - return true; - }*/ - template class Arithmetic, bool WorkgroupTest> bool runTest(const smart_refctd_ptr& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u) { std::string arith_name = Arithmetic>::name; smart_refctd_ptr overridenUnspecialized; - if constexpr (WorkgroupTest) - { + //if constexpr (WorkgroupTest) + //{ overridenUnspecialized = CHLSLCompiler::createOverridenCopy( source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n#define ITEMS_PER_WG %d\n", - (("workgroup::") + arith_name).c_str(), workgroupSize, itemsPerWG - ); - } - else - { - itemsPerWG = workgroupSize; - overridenUnspecialized = CHLSLCompiler::createOverridenCopy( - source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n", - (("subgroup::") + arith_name).c_str(), workgroupSize + (("workgroup2::") + arith_name).c_str(), workgroupSize, itemsPerWG ); - } + //} + //else + //{ + // itemsPerWG = 
workgroupSize; + // overridenUnspecialized = CHLSLCompiler::createOverridenCopy( + // source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n", + // (("subgroup::") + arith_name).c_str(), workgroupSize + // ); + //} auto pipeline = createPipeline(overridenUnspecialized.get(),subgroupSizeLog2); // TODO: overlap dispatches with memory readbacks (requires multiple copies of `buffers`) @@ -366,8 +360,8 @@ class ArithmeticUnitTestApp final : public application_templates::BasicMultiQueu passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; - if constexpr (WorkgroupTest) - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; + //if constexpr (WorkgroupTest) + // passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; return passed; } @@ -395,27 +389,27 @@ class ArithmeticUnitTestApp final : public application_templates::BasicMultiQueu // TODO: parallel for (the temporary values need to be threadlocal or what?) 
// now check if the data obtained has valid values type_t* tmp = new type_t[itemsPerWG]; - type_t* ballotInput = new type_t[itemsPerWG]; + //type_t* ballotInput = new type_t[itemsPerWG]; for (uint32_t workgroupID = 0u; success && workgroupID < workgroupCount; workgroupID++) { const auto workgroupOffset = workgroupID * itemsPerWG; - if constexpr (WorkgroupTest) - { - if constexpr (std::is_same_v, Binop>) - { - for (auto i = 0u; i < itemsPerWG; i++) - ballotInput[i] = inputData[i + workgroupOffset] & 0x1u; - Arithmetic::impl(tmp, ballotInput, itemsPerWG); - } - else + //if constexpr (WorkgroupTest) + //{ + // if constexpr (std::is_same_v, Binop>) + // { + // for (auto i = 0u; i < itemsPerWG; i++) + // ballotInput[i] = inputData[i + workgroupOffset] & 0x1u; + // Arithmetic::impl(tmp, ballotInput, itemsPerWG); + // } + // else Arithmetic::impl(tmp, inputData + workgroupOffset, itemsPerWG); - } - else - { - for (uint32_t pseudoSubgroupID = 0u; pseudoSubgroupID < itemsPerWG; pseudoSubgroupID += subgroupSize) - Arithmetic::impl(tmp + pseudoSubgroupID, inputData + workgroupOffset + pseudoSubgroupID, subgroupSize); - } + //} + //else + //{ + // for (uint32_t pseudoSubgroupID = 0u; pseudoSubgroupID < itemsPerWG; pseudoSubgroupID += subgroupSize) + // Arithmetic::impl(tmp + pseudoSubgroupID, inputData + workgroupOffset + pseudoSubgroupID, subgroupSize); + //} for (uint32_t localInvocationIndex = 0u; localInvocationIndex < itemsPerWG; localInvocationIndex++) { @@ -434,7 +428,7 @@ class ArithmeticUnitTestApp final : public application_templates::BasicMultiQueu } } } - delete[] ballotInput; + //delete[] ballotInput; delete[] tmp; return success; @@ -459,4 +453,4 @@ class ArithmeticUnitTestApp final : public application_templates::BasicMultiQueu uint32_t totalFailCount = 0; }; -NBL_MAIN_FUNC(ArithmeticUnitTestApp) \ No newline at end of file +NBL_MAIN_FUNC(Workgroup2ScanTestApp) \ No newline at end of file From f11b3df746c4c69721daeb264a925c9e8dd86d1d Mon Sep 17 00:00:00 2001 
From: keptsecret Date: Tue, 29 Apr 2025 12:04:14 +0700 Subject: [PATCH 149/296] fixes to test --- 74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl | 2 +- 74a_Workgroup2ScanTest/main.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl b/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl index 315550da0..1f313e6f8 100644 --- a/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl +++ b/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl @@ -96,7 +96,7 @@ struct DataProxy void set(const uint32_t ix, const dtype_t value) { // inputValue[ix] = value; - output[Binop::BindingIndex].template Store(sizeof(uint32_t) + sizeof(type_t) * globalIndex(), value); + // output[Binop::BindingIndex].template Store(sizeof(uint32_t) + sizeof(type_t) * globalIndex(), value); output[Binop::BindingIndex].template Store(sizeof(uint32_t) + sizeof(type_t) * ix, value); } diff --git a/74a_Workgroup2ScanTest/main.cpp b/74a_Workgroup2ScanTest/main.cpp index 7e11726d6..4dc337e20 100644 --- a/74a_Workgroup2ScanTest/main.cpp +++ b/74a_Workgroup2ScanTest/main.cpp @@ -223,7 +223,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu //logTestOutcome(passed, workgroupSize); //passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed; //logTestOutcome(passed, workgroupSize); - const uint32_t itemsPerWG = workgroupSize; + const uint32_t itemsPerWG = 1024; // TODO use Config::VirtualWorkgroupSize somehow m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG); passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; logTestOutcome(passed, itemsPerWG); @@ -318,7 +318,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu auto pipeline = createPipeline(overridenUnspecialized.get(),subgroupSizeLog2); // TODO: overlap dispatches 
with memory readbacks (requires multiple copies of `buffers`) - const uint32_t workgroupCount = elementCount / itemsPerWG; + const uint32_t workgroupCount = elementCount / itemsPerWG; // TODO use Config::VirtualWorkgroupSize somehow cmdbuf->begin(IGPUCommandBuffer::USAGE::NONE); cmdbuf->bindComputePipeline(pipeline.get()); cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &descriptorSet.get()); From 9f690ee8077344754aa4045f7cbce99b2c16abee Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 29 Apr 2025 15:53:42 +0700 Subject: [PATCH 150/296] tests with multiple items per invoc --- .../app_resources/testWorkgroup.comp.hlsl | 4 +- 74a_Workgroup2ScanTest/main.cpp | 59 +++++++++++++++---- 2 files changed, 51 insertions(+), 12 deletions(-) diff --git a/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl b/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl index 1f313e6f8..67bb9c5f2 100644 --- a/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl +++ b/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl @@ -26,7 +26,7 @@ // https://github.com/microsoft/DirectXShaderCompiler/issues/6144 uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);} -#define ITEMS_PER_INVOCATION 1 +// #define ITEMS_PER_INVOCATION 1 #ifndef ITEMS_PER_INVOCATION #error "Define ITEMS_PER_INVOCATION!" 
@@ -44,7 +44,7 @@ uint32_t globalIndex(); // since we test ITEMS_PER_WG(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed; //logTestOutcome(passed, workgroupSize); - const uint32_t itemsPerWG = 1024; // TODO use Config::VirtualWorkgroupSize somehow + const uint32_t itemsPerWG = ItemsPerInvocation * max(workgroupSize >> subgroupSizeLog2, subgroupSize) << subgroupSizeLog2; // TODO use Config::VirtualWorkgroupSize somehow m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG); passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; logTestOutcome(passed, itemsPerWG); @@ -289,7 +289,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu .requireFullSubgroups = true }; core::smart_refctd_ptr pipeline; - if (!m_device->createComputePipelines(m_spirv_isa_cache.get(),{¶ms,1},&pipeline)) + if (!m_device->createComputePipelines(nullptr,{¶ms,1},&pipeline)) return nullptr; return pipeline; } @@ -299,13 +299,13 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu { std::string arith_name = Arithmetic>::name; - smart_refctd_ptr overridenUnspecialized; + //smart_refctd_ptr overridenUnspecialized; //if constexpr (WorkgroupTest) //{ - overridenUnspecialized = CHLSLCompiler::createOverridenCopy( - source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n#define ITEMS_PER_WG %d\n", - (("workgroup2::") + arith_name).c_str(), workgroupSize, itemsPerWG - ); + //overridenUnspecialized = CHLSLCompiler::createOverridenCopy( + // source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n#define ITEMS_PER_WG %d\n", + // (("workgroup2::") + arith_name).c_str(), workgroupSize, itemsPerWG + //); //} //else //{ @@ -315,7 +315,46 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu // (("subgroup::") + arith_name).c_str(), workgroupSize // ); //} - auto pipeline = 
createPipeline(overridenUnspecialized.get(),subgroupSizeLog2); + + auto compiler = make_smart_refctd_ptr(smart_refctd_ptr(m_system)); + CHLSLCompiler::SOptions options = {}; + options.stage = IShader::E_SHADER_STAGE::ESS_COMPUTE; + options.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion; + options.spirvOptimizer = nullptr; +#ifndef _NBL_DEBUG + ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO; + auto opt = make_smart_refctd_ptr(std::span(&optPasses, 1)); + options.spirvOptimizer = opt.get(); +#else + options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_LINE_BIT; +#endif + options.preprocessorOptions.sourceIdentifier = source->getFilepathHint(); + options.preprocessorOptions.logger = m_logger.get(); + + auto* includeFinder = compiler->getDefaultIncludeFinder(); + includeFinder->addSearchPath("nbl/builtin/hlsl/jit", core::make_smart_refctd_ptr(m_physicalDevice->getLimits(), m_device->getEnabledFeatures())); + options.preprocessorOptions.includeFinder = includeFinder; + + const std::string definitions[5] = { + "workgroup2::" + arith_name, + std::to_string(workgroupSize), + std::to_string(itemsPerWG), + std::to_string(ItemsPerInvocation), + std::to_string(subgroupSizeLog2) + }; + + const IShaderCompiler::SMacroDefinition defines[5] = { + { "OPERATION", definitions[0] }, + { "WORKGROUP_SIZE", definitions[1] }, + { "ITEMS_PER_WG", definitions[2] }, + { "ITEMS_PER_INVOCATION", definitions[3] }, + { "SUBGROUP_SIZE_LOG2", definitions[4] } + }; + options.preprocessorOptions.extraDefines = { defines, defines + 5 }; + + smart_refctd_ptr overriddenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options); + + auto pipeline = createPipeline(overriddenUnspecialized.get(),subgroupSizeLog2); // TODO: overlap dispatches with memory readbacks (requires multiple copies of `buffers`) const uint32_t workgroupCount = elementCount / itemsPerWG; // TODO use 
Config::VirtualWorkgroupSize somehow @@ -436,8 +475,6 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu IQueue* transferDownQueue; IQueue* computeQueue; - smart_refctd_ptr m_spirv_isa_cache; - smart_refctd_ptr m_spirv_isa_cache_output; uint32_t* inputData = nullptr; constexpr static inline uint32_t OutputBufferCount = 8u; @@ -451,6 +488,8 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu smart_refctd_ptr resultsBuffer; uint32_t totalFailCount = 0; + + uint32_t ItemsPerInvocation = 4u; }; NBL_MAIN_FUNC(Workgroup2ScanTestApp) \ No newline at end of file From 755f89ac0b833fe22c2f832c171e465b1ecbd31b Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 29 Apr 2025 17:02:35 +0700 Subject: [PATCH 151/296] inclusive scan test --- 74a_Workgroup2ScanTest/main.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/74a_Workgroup2ScanTest/main.cpp b/74a_Workgroup2ScanTest/main.cpp index 123dda5a4..9c695b280 100644 --- a/74a_Workgroup2ScanTest/main.cpp +++ b/74a_Workgroup2ScanTest/main.cpp @@ -227,8 +227,8 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG); passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; logTestOutcome(passed, itemsPerWG); - //passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; - //logTestOutcome(passed, itemsPerWG); + passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; + logTestOutcome(passed, itemsPerWG); //passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; //logTestOutcome(passed, itemsPerWG); m_api->endCapture(); @@ -489,7 +489,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu uint32_t totalFailCount = 0; - 
uint32_t ItemsPerInvocation = 4u; + uint32_t ItemsPerInvocation = 1u; }; NBL_MAIN_FUNC(Workgroup2ScanTestApp) \ No newline at end of file From b8415ad608844d9cfaa8ffc9fe9d15e2c31db71b Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 30 Apr 2025 14:08:05 +0700 Subject: [PATCH 152/296] exclusive scan test, remove comments --- 74a_Workgroup2ScanTest/main.cpp | 99 ++------------------------------- 1 file changed, 5 insertions(+), 94 deletions(-) diff --git a/74a_Workgroup2ScanTest/main.cpp b/74a_Workgroup2ScanTest/main.cpp index 9c695b280..f0064a4c0 100644 --- a/74a_Workgroup2ScanTest/main.cpp +++ b/74a_Workgroup2ScanTest/main.cpp @@ -138,39 +138,6 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu pipelineLayout = m_device->createPipelineLayout({},std::move(dsLayout)); } - //const auto spirv_isa_cache_path = localOutputCWD/"spirv_isa_cache.bin"; - //// enclose to make sure file goes out of scope and we can reopen it - //{ - // smart_refctd_ptr spirv_isa_cache_input; - // // try to load SPIR-V to ISA cache - // { - // ISystem::future_t> fileCreate; - // m_system->createFile(fileCreate,spirv_isa_cache_path,IFile::ECF_READ|IFile::ECF_MAPPABLE|IFile::ECF_COHERENT); - // if (auto lock=fileCreate.acquire()) - // spirv_isa_cache_input = *lock; - // } - // // create the cache - // { - // std::span spirv_isa_cache_data = {}; - // if (spirv_isa_cache_input) - // spirv_isa_cache_data = {reinterpret_cast(spirv_isa_cache_input->getMappedPointer()),spirv_isa_cache_input->getSize()}; - // else - // m_logger->log("Failed to load SPIR-V 2 ISA cache!",ILogger::ELL_PERFORMANCE); - // // Normally we'd deserialize a `ICPUPipelineCache` properly and pass that instead - // m_spirv_isa_cache = m_device->createPipelineCache(spirv_isa_cache_data); - // } - //} - //{ - // // TODO: rename `deleteDirectory` to just `delete`? and a `IFile::setSize()` ? 
- // m_system->deleteDirectory(spirv_isa_cache_path); - // ISystem::future_t> fileCreate; - // m_system->createFile(fileCreate,spirv_isa_cache_path,IFile::ECF_WRITE); - // // I can be relatively sure I'll succeed to acquire the future, the pointer to created file might be null though. - // m_spirv_isa_cache_output=*fileCreate.acquire(); - // if (!m_spirv_isa_cache_output) - // logFail("Failed to Create SPIR-V to ISA cache file."); - //} - // load shader source from file auto getShaderSource = [&](const char* filePath) -> auto { @@ -187,7 +154,6 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu return smart_refctd_ptr_static_cast(firstAssetInBundle); }; - //auto subgroupTestSource = getShaderSource("app_resources/testSubgroup.comp.hlsl"); auto workgroupTestSource = getShaderSource("app_resources/testWorkgroup.comp.hlsl"); // now create or retrieve final resources to run our tests sema = m_device->createSemaphore(timelineValue); @@ -216,33 +182,15 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu m_logger->log("Testing Workgroup Size %u with Subgroup Size %u", ILogger::ELL_INFO, workgroupSize, subgroupSize); bool passed = true; - // TODO async the testing - //passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed; - //logTestOutcome(passed, workgroupSize); - //passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed; - //logTestOutcome(passed, workgroupSize); - //passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed; - //logTestOutcome(passed, workgroupSize); const uint32_t itemsPerWG = ItemsPerInvocation * max(workgroupSize >> subgroupSizeLog2, subgroupSize) << subgroupSizeLog2; // TODO use Config::VirtualWorkgroupSize somehow m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG); passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, 
itemsPerWG) && passed; logTestOutcome(passed, itemsPerWG); passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; logTestOutcome(passed, itemsPerWG); - //passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; - //logTestOutcome(passed, itemsPerWG); + passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; + logTestOutcome(passed, itemsPerWG); m_api->endCapture(); - - // save cache every now and then - //{ - // auto cpu = m_spirv_isa_cache->convertToCPUCache(); - // // Normally we'd beautifully JSON serialize the thing, allow multiple devices & drivers + metadata - // auto bin = cpu->getEntries().begin()->second.bin; - // IFile::success_t success; - // m_spirv_isa_cache_output->write(success,bin->data(),0ull,bin->size()); - // if (!success) - // logFail("Could not write Create SPIR-V to ISA cache to disk!"); - //} } } @@ -299,23 +247,6 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu { std::string arith_name = Arithmetic>::name; - //smart_refctd_ptr overridenUnspecialized; - //if constexpr (WorkgroupTest) - //{ - //overridenUnspecialized = CHLSLCompiler::createOverridenCopy( - // source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n#define ITEMS_PER_WG %d\n", - // (("workgroup2::") + arith_name).c_str(), workgroupSize, itemsPerWG - //); - //} - //else - //{ - // itemsPerWG = workgroupSize; - // overridenUnspecialized = CHLSLCompiler::createOverridenCopy( - // source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n", - // (("subgroup::") + arith_name).c_str(), workgroupSize - // ); - //} - auto compiler = make_smart_refctd_ptr(smart_refctd_ptr(m_system)); CHLSLCompiler::SOptions options = {}; options.stage = IShader::E_SHADER_STAGE::ESS_COMPUTE; @@ -357,7 +288,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu auto pipeline 
= createPipeline(overriddenUnspecialized.get(),subgroupSizeLog2); // TODO: overlap dispatches with memory readbacks (requires multiple copies of `buffers`) - const uint32_t workgroupCount = elementCount / itemsPerWG; // TODO use Config::VirtualWorkgroupSize somehow + const uint32_t workgroupCount = elementCount / itemsPerWG; cmdbuf->begin(IGPUCommandBuffer::USAGE::NONE); cmdbuf->bindComputePipeline(pipeline.get()); cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &descriptorSet.get()); @@ -399,8 +330,6 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; - //if constexpr (WorkgroupTest) - // passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; return passed; } @@ -428,27 +357,10 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu // TODO: parallel for (the temporary values need to be threadlocal or what?) 
// now check if the data obtained has valid values type_t* tmp = new type_t[itemsPerWG]; - //type_t* ballotInput = new type_t[itemsPerWG]; for (uint32_t workgroupID = 0u; success && workgroupID < workgroupCount; workgroupID++) { const auto workgroupOffset = workgroupID * itemsPerWG; - - //if constexpr (WorkgroupTest) - //{ - // if constexpr (std::is_same_v, Binop>) - // { - // for (auto i = 0u; i < itemsPerWG; i++) - // ballotInput[i] = inputData[i + workgroupOffset] & 0x1u; - // Arithmetic::impl(tmp, ballotInput, itemsPerWG); - // } - // else - Arithmetic::impl(tmp, inputData + workgroupOffset, itemsPerWG); - //} - //else - //{ - // for (uint32_t pseudoSubgroupID = 0u; pseudoSubgroupID < itemsPerWG; pseudoSubgroupID += subgroupSize) - // Arithmetic::impl(tmp + pseudoSubgroupID, inputData + workgroupOffset + pseudoSubgroupID, subgroupSize); - //} + Arithmetic::impl(tmp, inputData + workgroupOffset, itemsPerWG); for (uint32_t localInvocationIndex = 0u; localInvocationIndex < itemsPerWG; localInvocationIndex++) { @@ -467,7 +379,6 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu } } } - //delete[] ballotInput; delete[] tmp; return success; @@ -489,7 +400,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu uint32_t totalFailCount = 0; - uint32_t ItemsPerInvocation = 1u; + uint32_t ItemsPerInvocation = 4u; }; NBL_MAIN_FUNC(Workgroup2ScanTestApp) \ No newline at end of file From 474281d8d81cddc30aa289355889ead235014e98 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 1 May 2025 12:18:08 +0700 Subject: [PATCH 153/296] benchmark shader, new common header --- .../benchmarkWorkgroup.comp.hlsl | 97 ++++++++++++ .../app_resources/testWorkgroup.comp.hlsl | 145 +----------------- .../app_resources/workgroupCommon.hlsl | 71 +++++++++ 74a_Workgroup2ScanTest/main.cpp | 5 +- 4 files changed, 174 insertions(+), 144 deletions(-) create mode 100644 74a_Workgroup2ScanTest/app_resources/benchmarkWorkgroup.comp.hlsl 
create mode 100644 74a_Workgroup2ScanTest/app_resources/workgroupCommon.hlsl diff --git a/74a_Workgroup2ScanTest/app_resources/benchmarkWorkgroup.comp.hlsl b/74a_Workgroup2ScanTest/app_resources/benchmarkWorkgroup.comp.hlsl new file mode 100644 index 000000000..f758f6ac8 --- /dev/null +++ b/74a_Workgroup2ScanTest/app_resources/benchmarkWorkgroup.comp.hlsl @@ -0,0 +1,97 @@ +#pragma shader_stage(compute) + +#include "workgroupCommon.hlsl" + +template +struct DataProxy +{ + using dtype_t = vector; + static_assert(nbl::hlsl::is_same_v); + + dtype_t get(const uint32_t ix) + { + // return inputValue[ix]; + return inputVal; + } + void set(const uint32_t ix, const dtype_t value) + { + // output[Binop::BindingIndex].template Store(sizeof(uint32_t) + sizeof(type_t) * ix, value); + outputVal = value; + } + + void workgroupExecutionAndMemoryBarrier() + { + nbl::hlsl::glsl::barrier(); + //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above + } + + // to avoid multiple load/store in benchmark, also values not that important? 
+ dtype_t inputVal; + dtype_t outputVal; +}; + +static ScratchProxy arithmeticAccessor; + +template +struct operation_t +{ + using binop_base_t = typename Binop::base_t; + using otype_t = typename Binop::type_t; + + otype_t operator()() + { + DataProxy dataAccessor; + dataAccessor.inputVal = inputValue[globalIndex()]; + nbl::hlsl::OPERATION::template __call, ScratchProxy >(dataAccessor,arithmeticAccessor); + // we barrier before because we alias the accessors for Binop + arithmeticAccessor.workgroupExecutionAndMemoryBarrier(); + return dataAccessor.outputVal; + } +}; + + +template class binop, typename T, uint32_t N> +static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) +{ + if (globalIndex()==0u) + output[binop::BindingIndex].template Store(0,nbl::hlsl::glsl::gl_SubgroupSize()); + + type_t value; + operation_t,nbl::hlsl::jit::device_capabilities> func; + for (uint32_t i = 0; i < NUM_LOOPS; i++) + value = func(); // store is done with data accessor now + + output[binop::BindingIndex].template Store(sizeof(uint32_t) + sizeof(type_t) * globalIndex(), value); +} + + +type_t test() +{ + const type_t sourceVal = inputValue[globalIndex()]; + + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + return sourceVal; +} + + +uint32_t globalIndex() +{ + return nbl::hlsl::glsl::gl_WorkGroupID().x*ITEMS_PER_WG+nbl::hlsl::workgroup::SubgroupContiguousIndex(); +} + +bool canStore() +{ + return nbl::hlsl::workgroup::SubgroupContiguousIndex()::value; -// static const uint32_t BallotSz = nbl::hlsl::workgroup::scratch_size_ballot::value; -// static const uint32_t ScratchSz = ArithmeticSz+BallotSz; - -// TODO: Can we make it a static variable in the ScratchProxy struct? 
-// groupshared uint32_t ballotScratch[ScratchSz]; // TODO probably remove, not balloting - - -#include "nbl/builtin/hlsl/workgroup2/arithmetic.hlsl" - -#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl" - -#include "nbl/builtin/hlsl/glsl_compat/core.hlsl" -#include "nbl/builtin/hlsl/subgroup/basic.hlsl" -#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" - -#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl" - -#include "common.hlsl" - -// https://github.com/microsoft/DirectXShaderCompiler/issues/6144 -uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);} - -// #define ITEMS_PER_INVOCATION 1 - -#ifndef ITEMS_PER_INVOCATION -#error "Define ITEMS_PER_INVOCATION!" -#endif - -typedef vector type_t; - -// unfortunately DXC chokes on descriptors as static members -// https://github.com/microsoft/DirectXShaderCompiler/issues/5940 -[[vk::binding(0, 0)]] StructuredBuffer inputValue; -[[vk::binding(1, 0)]] RWByteAddressBuffer output[8]; - -// because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way -uint32_t globalIndex(); -// since we test ITEMS_PER_WG; - -groupshared vector scratch[config_t::SubgroupSize]; // final (level 1) scan needs to fit in one subgroup exactly - -template -struct ScratchProxy -{ - using stype_t = vector; - - stype_t get(const uint32_t ix) - { - return scratch[ix]; - } - void set(const uint32_t ix, const stype_t value) - { - scratch[ix] = value; - } - - stype_t atomicOr(const uint32_t ix, const stype_t value) - { - return nbl::hlsl::glsl::atomicOr(scratch[ix],value); - } - - void workgroupExecutionAndMemoryBarrier() - { - nbl::hlsl::glsl::barrier(); - //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above - } -}; +#include "workgroupCommon.hlsl" template struct DataProxy @@ -95,8 +14,6 @@ struct DataProxy } void set(const uint32_t ix, const dtype_t value) { - // inputValue[ix] = value; - // output[Binop::BindingIndex].template 
Store(sizeof(uint32_t) + sizeof(type_t) * globalIndex(), value); output[Binop::BindingIndex].template Store(sizeof(uint32_t) + sizeof(type_t) * ix, value); } @@ -121,7 +38,6 @@ struct operation_t nbl::hlsl::OPERATION::template __call, ScratchProxy >(dataAccessor,arithmeticAccessor); // we barrier before because we alias the accessors for Binop arithmeticAccessor.workgroupExecutionAndMemoryBarrier(); - // return retval; } }; @@ -131,11 +47,9 @@ static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) { if (globalIndex()==0u) output[binop::BindingIndex].template Store(0,nbl::hlsl::glsl::gl_SubgroupSize()); - + operation_t,nbl::hlsl::jit::device_capabilities> func; - // if (canStore()) - // output[binop::BindingIndex].template Store(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func()); - func(); // store is done with data accessor now + func(); // store is done with data accessor now } @@ -154,33 +68,6 @@ type_t test() } -// template -// struct BallotProxy -// { -// void get(const uint32_t ix, NBL_REF_ARG(uint32_t) value) -// { -// value = ballotScratch[ix+offset]; -// } -// void set(const uint32_t ix, const uint32_t value) -// { -// ballotScratch[ix+offset] = value; -// } - -// uint32_t atomicOr(const uint32_t ix, const uint32_t value) -// { -// return nbl::hlsl::glsl::atomicOr(ballotScratch[ix],value); -// } - -// void workgroupExecutionAndMemoryBarrier() -// { -// nbl::hlsl::glsl::barrier(); -// //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above -// } -// }; - -// static BallotProxy ballotAccessor; - - uint32_t globalIndex() { return nbl::hlsl::glsl::gl_WorkGroupID().x*ITEMS_PER_WG+nbl::hlsl::workgroup::SubgroupContiguousIndex(); @@ -195,30 +82,4 @@ bool canStore() void main() { const type_t sourceVal = test(); -// if (globalIndex()==0u) -// output[ballot::BindingIndex].template Store(0,nbl::hlsl::glsl::gl_SubgroupSize()); - -// // we can only ballot booleans, so low bit -// nbl::hlsl::workgroup::ballot >(bool(sourceVal & 0x1u), ballotAccessor); -// // 
need to barrier between ballot and usages of a ballot by myself -// ballotAccessor.workgroupExecutionAndMemoryBarrier(); - -// uint32_t destVal = 0xdeadbeefu; -// #define CONSTEXPR_OP_TYPE_TEST(IS_OP) nbl::hlsl::is_same,0x45>,nbl::hlsl::workgroup::IS_OP,0x45> >::value -// #define BALLOT_TEMPLATE_ARGS ITEMS_PER_WG,decltype(ballotAccessor),decltype(arithmeticAccessor),nbl::hlsl::jit::device_capabilities -// if (CONSTEXPR_OP_TYPE_TEST(reduction)) -// destVal = nbl::hlsl::workgroup::ballotBitCount(ballotAccessor,arithmeticAccessor); -// else if (CONSTEXPR_OP_TYPE_TEST(inclusive_scan)) -// destVal = nbl::hlsl::workgroup::ballotInclusiveBitCount(ballotAccessor,arithmeticAccessor); -// else if (CONSTEXPR_OP_TYPE_TEST(exclusive_scan)) -// destVal = nbl::hlsl::workgroup::ballotExclusiveBitCount(ballotAccessor,arithmeticAccessor); -// else -// { -// assert(false); -// } -// #undef BALLOT_TEMPLATE_ARGS -// #undef CONSTEXPR_OP_TYPE_TEST - -// if (canStore()) -// output[ballot::BindingIndex].template Store(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),destVal); } \ No newline at end of file diff --git a/74a_Workgroup2ScanTest/app_resources/workgroupCommon.hlsl b/74a_Workgroup2ScanTest/app_resources/workgroupCommon.hlsl new file mode 100644 index 000000000..362b48253 --- /dev/null +++ b/74a_Workgroup2ScanTest/app_resources/workgroupCommon.hlsl @@ -0,0 +1,71 @@ +#include "nbl/builtin/hlsl/workgroup/scratch_size.hlsl" + +#include "nbl/builtin/hlsl/workgroup2/arithmetic.hlsl" + +#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl" + +#include "nbl/builtin/hlsl/glsl_compat/core.hlsl" +#include "nbl/builtin/hlsl/subgroup/basic.hlsl" +#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" + +#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl" + +#include "common.hlsl" + +static const uint32_t WORKGROUP_SIZE = 1u << WORKGROUP_SIZE_LOG2; + +// https://github.com/microsoft/DirectXShaderCompiler/issues/6144 +uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return 
uint32_t3(WORKGROUP_SIZE,1,1);} + +#ifndef ITEMS_PER_INVOCATION +#error "Define ITEMS_PER_INVOCATION!" +#endif + +typedef vector type_t; + +// unfortunately DXC chokes on descriptors as static members +// https://github.com/microsoft/DirectXShaderCompiler/issues/5940 +[[vk::binding(0, 0)]] StructuredBuffer inputValue; +[[vk::binding(1, 0)]] RWByteAddressBuffer output[8]; + +// because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way +uint32_t globalIndex(); +// since we test ITEMS_PER_WG; + +groupshared vector scratch[config_t::SubgroupSize]; // final (level 1) scan needs to fit in one subgroup exactly + +template +struct ScratchProxy +{ + using stype_t = vector; + + stype_t get(const uint32_t ix) + { + return scratch[ix]; + } + void set(const uint32_t ix, const stype_t value) + { + scratch[ix] = value; + } + + stype_t atomicOr(const uint32_t ix, const stype_t value) + { + return nbl::hlsl::glsl::atomicOr(scratch[ix],value); + } + + void workgroupExecutionAndMemoryBarrier() + { + nbl::hlsl::glsl::barrier(); + //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above + } +}; diff --git a/74a_Workgroup2ScanTest/main.cpp b/74a_Workgroup2ScanTest/main.cpp index f0064a4c0..c5e8370be 100644 --- a/74a_Workgroup2ScanTest/main.cpp +++ b/74a_Workgroup2ScanTest/main.cpp @@ -246,6 +246,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu bool runTest(const smart_refctd_ptr& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u) { std::string arith_name = Arithmetic>::name; + const uint32_t workgroupSizeLog2 = hlsl::findMSB(workgroupSize); auto compiler = make_smart_refctd_ptr(smart_refctd_ptr(m_system)); CHLSLCompiler::SOptions options = {}; @@ -268,7 +269,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu const std::string definitions[5] = { "workgroup2::" + arith_name, - 
std::to_string(workgroupSize), + std::to_string(workgroupSizeLog2), std::to_string(itemsPerWG), std::to_string(ItemsPerInvocation), std::to_string(subgroupSizeLog2) @@ -276,7 +277,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu const IShaderCompiler::SMacroDefinition defines[5] = { { "OPERATION", definitions[0] }, - { "WORKGROUP_SIZE", definitions[1] }, + { "WORKGROUP_SIZE_LOG2", definitions[1] }, { "ITEMS_PER_WG", definitions[2] }, { "ITEMS_PER_INVOCATION", definitions[3] }, { "SUBGROUP_SIZE_LOG2", definitions[4] } From 7d063322b5994d66e871367bddaf9a5631f39bc3 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 2 May 2025 09:41:18 +0700 Subject: [PATCH 154/296] test smaller workgroup sizes --- 74a_Workgroup2ScanTest/app_resources/workgroupCommon.hlsl | 6 +++--- 74a_Workgroup2ScanTest/main.cpp | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/74a_Workgroup2ScanTest/app_resources/workgroupCommon.hlsl b/74a_Workgroup2ScanTest/app_resources/workgroupCommon.hlsl index 362b48253..5de666c4b 100644 --- a/74a_Workgroup2ScanTest/app_resources/workgroupCommon.hlsl +++ b/74a_Workgroup2ScanTest/app_resources/workgroupCommon.hlsl @@ -21,7 +21,9 @@ uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1 #error "Define ITEMS_PER_INVOCATION!" #endif -typedef vector type_t; +using config_t = nbl::hlsl::workgroup2::Configuration; + +typedef vector type_t; // unfortunately DXC chokes on descriptors as static members // https://github.com/microsoft/DirectXShaderCompiler/issues/5940 @@ -40,8 +42,6 @@ bool canStore(); #error "Define SUBGROUP_SIZE_LOG2!" 
#endif -using config_t = nbl::hlsl::workgroup2::Configuration; - groupshared vector scratch[config_t::SubgroupSize]; // final (level 1) scan needs to fit in one subgroup exactly template diff --git a/74a_Workgroup2ScanTest/main.cpp b/74a_Workgroup2ScanTest/main.cpp index c5e8370be..e40b87100 100644 --- a/74a_Workgroup2ScanTest/main.cpp +++ b/74a_Workgroup2ScanTest/main.cpp @@ -168,7 +168,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu } const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations; - const std::array WorkgroupSizes = { 512, 1024 }; + const std::array WorkgroupSizes = { 32, 64, 512, 1024 }; const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize; const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize; for (auto subgroupSize=MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u) @@ -182,7 +182,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu m_logger->log("Testing Workgroup Size %u with Subgroup Size %u", ILogger::ELL_INFO, workgroupSize, subgroupSize); bool passed = true; - const uint32_t itemsPerWG = ItemsPerInvocation * max(workgroupSize >> subgroupSizeLog2, subgroupSize) << subgroupSizeLog2; // TODO use Config::VirtualWorkgroupSize somehow + const uint32_t itemsPerWG = workgroupSize <= 4 * subgroupSize ? 
workgroupSize : ItemsPerInvocation * max(workgroupSize >> subgroupSizeLog2, subgroupSize) << subgroupSizeLog2; // TODO use Config somehow m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG); passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; logTestOutcome(passed, itemsPerWG); From 874557c1e091634c945b643e85bf215c2e304d87 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 2 May 2025 10:52:11 +0700 Subject: [PATCH 155/296] expanded scratch proxy funcs --- .../app_resources/workgroupCommon.hlsl | 10 ++++++++++ 74a_Workgroup2ScanTest/main.cpp | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/74a_Workgroup2ScanTest/app_resources/workgroupCommon.hlsl b/74a_Workgroup2ScanTest/app_resources/workgroupCommon.hlsl index 5de666c4b..e60856bf8 100644 --- a/74a_Workgroup2ScanTest/app_resources/workgroupCommon.hlsl +++ b/74a_Workgroup2ScanTest/app_resources/workgroupCommon.hlsl @@ -47,6 +47,7 @@ groupshared vector scratch[config_t::S template struct ScratchProxy { + using scalar_t = uint32_t; using stype_t = vector; stype_t get(const uint32_t ix) @@ -58,6 +59,15 @@ struct ScratchProxy scratch[ix] = value; } + scalar_t getByComponent(const uint32_t ix) + { + return scratch[ix/Config::ItemsPerInvocation_1][ix&(Config::ItemsPerInvocation_1-1)]; + } + void setByComponent(const uint32_t ix, const scalar_t value) + { + scratch[ix/Config::ItemsPerInvocation_1][ix&(Config::ItemsPerInvocation_1-1)] = value; + } + stype_t atomicOr(const uint32_t ix, const stype_t value) { return nbl::hlsl::glsl::atomicOr(scratch[ix],value); diff --git a/74a_Workgroup2ScanTest/main.cpp b/74a_Workgroup2ScanTest/main.cpp index e40b87100..57e70bf68 100644 --- a/74a_Workgroup2ScanTest/main.cpp +++ b/74a_Workgroup2ScanTest/main.cpp @@ -168,7 +168,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu } const auto MaxWorkgroupSize = 
m_physicalDevice->getLimits().maxComputeWorkGroupInvocations; - const std::array WorkgroupSizes = { 32, 64, 512, 1024 }; + const std::array WorkgroupSizes = { 64, 512, 1024 }; const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize; const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize; for (auto subgroupSize=MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u) From 28ea75fbc40b4687d92bbb1ac9b67c6430e90b86 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 5 May 2025 17:14:37 +0700 Subject: [PATCH 156/296] simplify scratch,proxy to just scalar types --- .../benchmarkWorkgroup.comp.hlsl | 2 +- .../app_resources/testWorkgroup.comp.hlsl | 4 ++-- .../app_resources/workgroupCommon.hlsl | 24 +++++-------------- 3 files changed, 9 insertions(+), 21 deletions(-) diff --git a/74a_Workgroup2ScanTest/app_resources/benchmarkWorkgroup.comp.hlsl b/74a_Workgroup2ScanTest/app_resources/benchmarkWorkgroup.comp.hlsl index f758f6ac8..e20e528d7 100644 --- a/74a_Workgroup2ScanTest/app_resources/benchmarkWorkgroup.comp.hlsl +++ b/74a_Workgroup2ScanTest/app_resources/benchmarkWorkgroup.comp.hlsl @@ -30,7 +30,7 @@ struct DataProxy dtype_t outputVal; }; -static ScratchProxy arithmeticAccessor; +static ScratchProxy arithmeticAccessor; template struct operation_t diff --git a/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl b/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl index ac4104279..f9453a165 100644 --- a/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl +++ b/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl @@ -24,7 +24,7 @@ struct DataProxy } }; -static ScratchProxy arithmeticAccessor; +static ScratchProxy arithmeticAccessor; template struct operation_t @@ -35,7 +35,7 @@ struct operation_t void operator()() { DataProxy dataAccessor; - nbl::hlsl::OPERATION::template __call, ScratchProxy >(dataAccessor,arithmeticAccessor); + nbl::hlsl::OPERATION::template __call, 
ScratchProxy>(dataAccessor,arithmeticAccessor); // we barrier before because we alias the accessors for Binop arithmeticAccessor.workgroupExecutionAndMemoryBarrier(); } diff --git a/74a_Workgroup2ScanTest/app_resources/workgroupCommon.hlsl b/74a_Workgroup2ScanTest/app_resources/workgroupCommon.hlsl index e60856bf8..7e8512e72 100644 --- a/74a_Workgroup2ScanTest/app_resources/workgroupCommon.hlsl +++ b/74a_Workgroup2ScanTest/app_resources/workgroupCommon.hlsl @@ -42,33 +42,21 @@ bool canStore(); #error "Define SUBGROUP_SIZE_LOG2!" #endif -groupshared vector scratch[config_t::SubgroupSize]; // final (level 1) scan needs to fit in one subgroup exactly +// final (level 1/2) scan needs to fit in one subgroup exactly +groupshared uint32_t scratch[config_t::SubgroupsPerVirtualWorkgroupLog2*config_t::ItemsPerInvocation_1]; -template struct ScratchProxy { - using scalar_t = uint32_t; - using stype_t = vector; - - stype_t get(const uint32_t ix) + void get(const uint32_t ix, NBL_REF_ARG(uint32_t) value) { - return scratch[ix]; + value = scratch[ix]; } - void set(const uint32_t ix, const stype_t value) + void set(const uint32_t ix, const uint32_t value) { scratch[ix] = value; } - scalar_t getByComponent(const uint32_t ix) - { - return scratch[ix/Config::ItemsPerInvocation_1][ix&(Config::ItemsPerInvocation_1-1)]; - } - void setByComponent(const uint32_t ix, const scalar_t value) - { - scratch[ix/Config::ItemsPerInvocation_1][ix&(Config::ItemsPerInvocation_1-1)] = value; - } - - stype_t atomicOr(const uint32_t ix, const stype_t value) + uint32_t atomicOr(const uint32_t ix, const uint32_t value) { return nbl::hlsl::glsl::atomicOr(scratch[ix],value); } From 8c76367c1c226cce3d66f1c60f540e29a501a1cb Mon Sep 17 00:00:00 2001 From: devsh Date: Tue, 6 May 2025 15:05:59 +0200 Subject: [PATCH 157/296] update the Acceleration Structure Position fetch code in one example after AS-refactor --- 71_RayTracingPipeline/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index 219a7aacb..e31f5c280 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -1483,7 +1483,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT; if (m_physicalDevice->getProperties().limits.rayTracingPositionFetch) - blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_DATA_ACCESS_KHR; + blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_DATA_ACCESS; IGPUBottomLevelAccelerationStructure::DeviceBuildInfo initBuildInfo; initBuildInfo.buildFlags = blasFlags; From e8c2831c4117b2daaa0d1d61654c271496705f80 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 7 May 2025 09:51:43 +0700 Subject: [PATCH 158/296] move all tests into new example --- .../app_resources/shaderCommon.hlsl | 36 ++-- .../app_resources/testSubgroup.comp.hlsl | 6 +- 74a_Workgroup2ScanTest/main.cpp | 179 +++++++++++++----- 3 files changed, 152 insertions(+), 69 deletions(-) diff --git a/74a_Workgroup2ScanTest/app_resources/shaderCommon.hlsl b/74a_Workgroup2ScanTest/app_resources/shaderCommon.hlsl index 79bf74e71..376f69579 100644 --- a/74a_Workgroup2ScanTest/app_resources/shaderCommon.hlsl +++ b/74a_Workgroup2ScanTest/app_resources/shaderCommon.hlsl @@ -2,6 +2,7 @@ #include "nbl/builtin/hlsl/glsl_compat/core.hlsl" #include "nbl/builtin/hlsl/subgroup/basic.hlsl" +#include "nbl/builtin/hlsl/subgroup/arithmetic_portability.hlsl" #include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" #include "nbl/builtin/hlsl/jit/device_capabilities.hlsl" @@ -28,33 +29,40 @@ bool canStore(); #ifndef OPERATION #error "Define OPERATION!" #endif + #ifndef SUBGROUP_SIZE_LOG2 #error "Define SUBGROUP_SIZE_LOG2!" 
#endif -template class binop> +template class binop, typename T, uint32_t N> static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) { + // TODO static assert vector == type_t + //using type_t = vector; + using config_t = nbl::hlsl::subgroup2::Configuration; + using params_t = nbl::hlsl::subgroup2::ArithmeticParams::base_t, N, nbl::hlsl::jit::device_capabilities>; + if (globalIndex()==0u) - output[binop::BindingIndex].template Store(0,nbl::hlsl::glsl::gl_SubgroupSize()); + output[binop::BindingIndex].template Store(0,nbl::hlsl::glsl::gl_SubgroupSize()); - operation_t::base_t,nbl::hlsl::jit::device_capabilities> func; + operation_t func; if (canStore()) - output[binop::BindingIndex].template Store(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func(sourceVal)); + output[binop::BindingIndex].template Store(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func(sourceVal)); } type_t test() { - const type_t sourceVal = inputValue[globalIndex()]; - - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); + const uint32_t idx = globalIndex(); + type_t sourceVal = inputValue[idx]; + + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); return sourceVal; } -#include "nbl/builtin/hlsl/workgroup/basic.hlsl" \ No newline at end of file +#include "nbl/builtin/hlsl/workgroup/basic.hlsl" diff --git a/74a_Workgroup2ScanTest/app_resources/testSubgroup.comp.hlsl b/74a_Workgroup2ScanTest/app_resources/testSubgroup.comp.hlsl index 479265d73..2cc1ccb60 100644 --- a/74a_Workgroup2ScanTest/app_resources/testSubgroup.comp.hlsl +++ b/74a_Workgroup2ScanTest/app_resources/testSubgroup.comp.hlsl @@ -6,7 +6,7 @@ uint32_t globalIndex() { - return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex(); + return 
nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex(); } bool canStore() {return true;} @@ -14,5 +14,5 @@ bool canStore() {return true;} [numthreads(WORKGROUP_SIZE,1,1)] void main() { - test(); -} \ No newline at end of file + test(); +} diff --git a/74a_Workgroup2ScanTest/main.cpp b/74a_Workgroup2ScanTest/main.cpp index 57e70bf68..bde717d7b 100644 --- a/74a_Workgroup2ScanTest/main.cpp +++ b/74a_Workgroup2ScanTest/main.cpp @@ -154,6 +154,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu return smart_refctd_ptr_static_cast(firstAssetInBundle); }; + auto subgroupTestSource = getShaderSource("app_resources/testSubgroup.comp.hlsl"); auto workgroupTestSource = getShaderSource("app_resources/testWorkgroup.comp.hlsl"); // now create or retrieve final resources to run our tests sema = m_device->createSemaphore(timelineValue); @@ -168,7 +169,8 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu } const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations; - const std::array WorkgroupSizes = { 64, 512, 1024 }; + const std::array WorkgroupSizes = { 128, 256, 512, 1024 }; + const std::array ItemsPerInvocations = { 1, 2, 4 }; const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize; const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize; for (auto subgroupSize=MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u) @@ -181,15 +183,27 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu m_api->startCapture(); m_logger->log("Testing Workgroup Size %u with Subgroup Size %u", ILogger::ELL_INFO, workgroupSize, subgroupSize); - bool passed = true; - const uint32_t itemsPerWG = workgroupSize <= 4 * subgroupSize ? 
workgroupSize : ItemsPerInvocation * max(workgroupSize >> subgroupSizeLog2, subgroupSize) << subgroupSizeLog2; // TODO use Config somehow - m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG); - passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; - logTestOutcome(passed, itemsPerWG); - passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; - logTestOutcome(passed, itemsPerWG); - passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; - logTestOutcome(passed, itemsPerWG); + for (uint32_t j = 0; j < ItemsPerInvocations.size(); j++) + { + const uint32_t itemsPerInvocation = ItemsPerInvocations[j]; + m_logger->log("Testing Items per Invocation %u", ILogger::ELL_INFO, itemsPerInvocation); + bool passed = true; + passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, itemsPerInvocation) && passed; + logTestOutcome(passed, workgroupSize); + passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, itemsPerInvocation) && passed; + logTestOutcome(passed, workgroupSize); + passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, itemsPerInvocation) && passed; + logTestOutcome(passed, workgroupSize); + + const uint32_t itemsPerWG = workgroupSize <= 4 * subgroupSize ? 
workgroupSize : itemsPerInvocation * max(workgroupSize >> subgroupSizeLog2, subgroupSize) << subgroupSizeLog2; // TODO use Config somehow + m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG); + passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG, itemsPerInvocation) && passed; + logTestOutcome(passed, itemsPerWG); + passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG, itemsPerInvocation) && passed; + logTestOutcome(passed, itemsPerWG); + passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG, itemsPerInvocation) && passed; + logTestOutcome(passed, itemsPerWG); + } m_api->endCapture(); } } @@ -243,7 +257,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu } template class Arithmetic, bool WorkgroupTest> - bool runTest(const smart_refctd_ptr& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u) + bool runTest(const smart_refctd_ptr& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u, uint32_t itemsPerInvoc = 1u) { std::string arith_name = Arithmetic>::name; const uint32_t workgroupSizeLog2 = hlsl::findMSB(workgroupSize); @@ -267,29 +281,59 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu includeFinder->addSearchPath("nbl/builtin/hlsl/jit", core::make_smart_refctd_ptr(m_physicalDevice->getLimits(), m_device->getEnabledFeatures())); options.preprocessorOptions.includeFinder = includeFinder; - const std::string definitions[5] = { - "workgroup2::" + arith_name, - std::to_string(workgroupSizeLog2), - std::to_string(itemsPerWG), - std::to_string(ItemsPerInvocation), - std::to_string(subgroupSizeLog2) - }; - - const IShaderCompiler::SMacroDefinition defines[5] = { - { "OPERATION", definitions[0] }, - { 
"WORKGROUP_SIZE_LOG2", definitions[1] }, - { "ITEMS_PER_WG", definitions[2] }, - { "ITEMS_PER_INVOCATION", definitions[3] }, - { "SUBGROUP_SIZE_LOG2", definitions[4] } - }; - options.preprocessorOptions.extraDefines = { defines, defines + 5 }; - - smart_refctd_ptr overriddenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options); + smart_refctd_ptr overriddenUnspecialized; + if constexpr (WorkgroupTest) + { + const std::string definitions[5] = { + "workgroup2::" + arith_name, + std::to_string(workgroupSizeLog2), + std::to_string(itemsPerWG), + std::to_string(itemsPerInvoc), + std::to_string(subgroupSizeLog2) + }; + + const IShaderCompiler::SMacroDefinition defines[5] = { + { "OPERATION", definitions[0] }, + { "WORKGROUP_SIZE_LOG2", definitions[1] }, + { "ITEMS_PER_WG", definitions[2] }, + { "ITEMS_PER_INVOCATION", definitions[3] }, + { "SUBGROUP_SIZE_LOG2", definitions[4] } + }; + options.preprocessorOptions.extraDefines = { defines, defines + 5 }; + + overriddenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options); + } + else + { + const std::string definitions[4] = { + "subgroup2::" + arith_name, + std::to_string(workgroupSize), + std::to_string(itemsPerInvoc), + std::to_string(subgroupSizeLog2) + }; + + const IShaderCompiler::SMacroDefinition defines[4] = { + { "OPERATION", definitions[0] }, + { "WORKGROUP_SIZE", definitions[1] }, + { "ITEMS_PER_INVOCATION", definitions[2] }, + { "SUBGROUP_SIZE_LOG2", definitions[3] } + }; + options.preprocessorOptions.extraDefines = { defines, defines + 4 }; + + overriddenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options); + } auto pipeline = createPipeline(overriddenUnspecialized.get(),subgroupSizeLog2); // TODO: overlap dispatches with memory readbacks (requires multiple copies of `buffers`) - const uint32_t workgroupCount = elementCount / itemsPerWG; + uint32_t workgroupCount; + if constexpr 
(WorkgroupTest) + workgroupCount = elementCount / itemsPerWG; + else + { + itemsPerWG = workgroupSize; + workgroupCount = elementCount / (itemsPerWG * itemsPerInvoc); + } cmdbuf->begin(IGPUCommandBuffer::USAGE::NONE); cmdbuf->bindComputePipeline(pipeline.get()); cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &descriptorSet.get()); @@ -324,20 +368,20 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu m_device->blockForSemaphores(wait); // check results - bool passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount); - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; + bool passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc); + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed; return passed; } //returns true if result matches template class Arithmetic, class Binop, bool WorkgroupTest> - bool validateResults(const uint32_t itemsPerWG, const uint32_t workgroupCount) + bool validateResults(const uint32_t itemsPerWG, 
const uint32_t workgroupCount, const uint32_t itemsPerInvoc) { bool success = true; @@ -361,22 +405,53 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu for (uint32_t workgroupID = 0u; success && workgroupID < workgroupCount; workgroupID++) { const auto workgroupOffset = workgroupID * itemsPerWG; - Arithmetic::impl(tmp, inputData + workgroupOffset, itemsPerWG); - for (uint32_t localInvocationIndex = 0u; localInvocationIndex < itemsPerWG; localInvocationIndex++) + if constexpr (WorkgroupTest) + { + Arithmetic::impl(tmp, inputData + workgroupOffset, itemsPerWG); + + for (uint32_t localInvocationIndex = 0u; localInvocationIndex < itemsPerWG; localInvocationIndex++) + { + const auto globalInvocationIndex = workgroupOffset + localInvocationIndex; + const auto cpuVal = tmp[localInvocationIndex]; + const auto gpuVal = testData[globalInvocationIndex]; + if (cpuVal != gpuVal) + { + m_logger->log( + "Failed test #%d (%s) (%s) Expected %u got %u for workgroup %d and localinvoc %d", + ILogger::ELL_ERROR, itemsPerWG, WorkgroupTest ? "workgroup" : "subgroup", Binop::name, + cpuVal, gpuVal, workgroupID, localInvocationIndex + ); + success = false; + break; + } + } + } + else { - const auto globalInvocationIndex = workgroupOffset + localInvocationIndex; - const auto cpuVal = tmp[localInvocationIndex]; - const auto gpuVal = testData[globalInvocationIndex]; - if (cpuVal != gpuVal) + for (uint32_t pseudoSubgroupID = 0u; pseudoSubgroupID < itemsPerWG; pseudoSubgroupID += subgroupSize) + Arithmetic::impl(tmp + pseudoSubgroupID * itemsPerInvoc, inputData + workgroupOffset + pseudoSubgroupID * itemsPerInvoc, subgroupSize * itemsPerInvoc); + + for (uint32_t localInvocationIndex = 0u; localInvocationIndex < itemsPerWG; localInvocationIndex++) { - m_logger->log( - "Failed test #%d (%s) (%s) Expected %u got %u for workgroup %d and localinvoc %d", - ILogger::ELL_ERROR, itemsPerWG, WorkgroupTest ? 
"workgroup" : "subgroup", Binop::name, - cpuVal, gpuVal, workgroupID, localInvocationIndex - ); - success = false; - break; + const auto localOffset = localInvocationIndex * itemsPerInvoc; + const auto globalInvocationIndex = workgroupOffset + localOffset; + + for (uint32_t itemInvocationIndex = 0u; itemInvocationIndex < itemsPerInvoc; itemInvocationIndex++) + { + const auto cpuVal = tmp[localOffset + itemInvocationIndex]; + const auto gpuVal = testData[globalInvocationIndex + itemInvocationIndex]; + if (cpuVal != gpuVal) + { + m_logger->log( + "Failed test #%d (%s) (%s) Expected %u got %u for workgroup %d and localinvoc %d and iteminvoc %d", + ILogger::ELL_ERROR, itemsPerWG, WorkgroupTest ? "workgroup" : "subgroup", Binop::name, + cpuVal, gpuVal, workgroupID, localInvocationIndex, itemInvocationIndex + ); + success = false; + break; + } + } } } } @@ -401,7 +476,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu uint32_t totalFailCount = 0; - uint32_t ItemsPerInvocation = 4u; + //uint32_t ItemsPerInvocation = 4u; }; NBL_MAIN_FUNC(Workgroup2ScanTestApp) \ No newline at end of file From 2ba2b824213e0730813bf61f55680226b52c2479 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 7 May 2025 15:10:56 +0700 Subject: [PATCH 159/296] workgroup scan benchmark, renamed examples --- .../CMakeLists.txt | 0 .../benchmarkWorkgroup.comp.hlsl | 0 .../app_resources/common.hlsl | 0 .../app_resources/shaderCommon.hlsl | 0 .../app_resources/testSubgroup.comp.hlsl | 0 .../app_resources/testWorkgroup.comp.hlsl | 0 .../app_resources/workgroupCommon.hlsl | 0 .../config.json.template | 0 .../main.cpp | 2 - .../pipeline.groovy | 0 .../CMakeLists.txt | 0 .../app_resources/benchmarkSubgroup.comp.hlsl | 0 .../benchmarkWorkgroup.comp.hlsl | 93 ++++++ .../app_resources/common.hlsl | 0 .../app_resources/shaderCommon.hlsl | 0 .../app_resources/testSubgroup.comp.hlsl | 0 .../app_resources/testWorkgroup.comp.hlsl | 0 .../app_resources/workgroupCommon.hlsl | 69 
++++ .../config.json.template | 0 .../imgui.ini | 0 .../main.cpp | 303 +++++------------- .../pipeline.groovy | 0 CMakeLists.txt | 4 +- 23 files changed, 244 insertions(+), 227 deletions(-) rename {73_ArithmeticBench => 73_Arithmetic2UnitTest}/CMakeLists.txt (100%) rename {74a_Workgroup2ScanTest => 73_Arithmetic2UnitTest}/app_resources/benchmarkWorkgroup.comp.hlsl (100%) rename {74a_Workgroup2ScanTest => 73_Arithmetic2UnitTest}/app_resources/common.hlsl (100%) rename {73_ArithmeticBench => 73_Arithmetic2UnitTest}/app_resources/shaderCommon.hlsl (100%) rename {73_ArithmeticBench => 73_Arithmetic2UnitTest}/app_resources/testSubgroup.comp.hlsl (100%) rename {74a_Workgroup2ScanTest => 73_Arithmetic2UnitTest}/app_resources/testWorkgroup.comp.hlsl (100%) rename {74a_Workgroup2ScanTest => 73_Arithmetic2UnitTest}/app_resources/workgroupCommon.hlsl (100%) rename {73_ArithmeticBench => 73_Arithmetic2UnitTest}/config.json.template (100%) rename {74a_Workgroup2ScanTest => 73_Arithmetic2UnitTest}/main.cpp (99%) rename {73_ArithmeticBench => 73_Arithmetic2UnitTest}/pipeline.groovy (100%) rename {74a_Workgroup2ScanTest => 74_Arithmetic2Bench}/CMakeLists.txt (100%) rename {73_ArithmeticBench => 74_Arithmetic2Bench}/app_resources/benchmarkSubgroup.comp.hlsl (100%) create mode 100644 74_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl rename {73_ArithmeticBench => 74_Arithmetic2Bench}/app_resources/common.hlsl (100%) rename {74a_Workgroup2ScanTest => 74_Arithmetic2Bench}/app_resources/shaderCommon.hlsl (100%) rename {74a_Workgroup2ScanTest => 74_Arithmetic2Bench}/app_resources/testSubgroup.comp.hlsl (100%) rename {73_ArithmeticBench => 74_Arithmetic2Bench}/app_resources/testWorkgroup.comp.hlsl (100%) create mode 100644 74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl rename {74a_Workgroup2ScanTest => 74_Arithmetic2Bench}/config.json.template (100%) rename {73_ArithmeticBench => 74_Arithmetic2Bench}/imgui.ini (100%) rename {73_ArithmeticBench => 
74_Arithmetic2Bench}/main.cpp (68%) rename {74a_Workgroup2ScanTest => 74_Arithmetic2Bench}/pipeline.groovy (100%) diff --git a/73_ArithmeticBench/CMakeLists.txt b/73_Arithmetic2UnitTest/CMakeLists.txt similarity index 100% rename from 73_ArithmeticBench/CMakeLists.txt rename to 73_Arithmetic2UnitTest/CMakeLists.txt diff --git a/74a_Workgroup2ScanTest/app_resources/benchmarkWorkgroup.comp.hlsl b/73_Arithmetic2UnitTest/app_resources/benchmarkWorkgroup.comp.hlsl similarity index 100% rename from 74a_Workgroup2ScanTest/app_resources/benchmarkWorkgroup.comp.hlsl rename to 73_Arithmetic2UnitTest/app_resources/benchmarkWorkgroup.comp.hlsl diff --git a/74a_Workgroup2ScanTest/app_resources/common.hlsl b/73_Arithmetic2UnitTest/app_resources/common.hlsl similarity index 100% rename from 74a_Workgroup2ScanTest/app_resources/common.hlsl rename to 73_Arithmetic2UnitTest/app_resources/common.hlsl diff --git a/73_ArithmeticBench/app_resources/shaderCommon.hlsl b/73_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl similarity index 100% rename from 73_ArithmeticBench/app_resources/shaderCommon.hlsl rename to 73_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl diff --git a/73_ArithmeticBench/app_resources/testSubgroup.comp.hlsl b/73_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl similarity index 100% rename from 73_ArithmeticBench/app_resources/testSubgroup.comp.hlsl rename to 73_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl diff --git a/74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl b/73_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl similarity index 100% rename from 74a_Workgroup2ScanTest/app_resources/testWorkgroup.comp.hlsl rename to 73_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl diff --git a/74a_Workgroup2ScanTest/app_resources/workgroupCommon.hlsl b/73_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl similarity index 100% rename from 74a_Workgroup2ScanTest/app_resources/workgroupCommon.hlsl rename to 
73_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl diff --git a/73_ArithmeticBench/config.json.template b/73_Arithmetic2UnitTest/config.json.template similarity index 100% rename from 73_ArithmeticBench/config.json.template rename to 73_Arithmetic2UnitTest/config.json.template diff --git a/74a_Workgroup2ScanTest/main.cpp b/73_Arithmetic2UnitTest/main.cpp similarity index 99% rename from 74a_Workgroup2ScanTest/main.cpp rename to 73_Arithmetic2UnitTest/main.cpp index bde717d7b..31eb4ab8f 100644 --- a/74a_Workgroup2ScanTest/main.cpp +++ b/73_Arithmetic2UnitTest/main.cpp @@ -475,8 +475,6 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu smart_refctd_ptr resultsBuffer; uint32_t totalFailCount = 0; - - //uint32_t ItemsPerInvocation = 4u; }; NBL_MAIN_FUNC(Workgroup2ScanTestApp) \ No newline at end of file diff --git a/73_ArithmeticBench/pipeline.groovy b/73_Arithmetic2UnitTest/pipeline.groovy similarity index 100% rename from 73_ArithmeticBench/pipeline.groovy rename to 73_Arithmetic2UnitTest/pipeline.groovy diff --git a/74a_Workgroup2ScanTest/CMakeLists.txt b/74_Arithmetic2Bench/CMakeLists.txt similarity index 100% rename from 74a_Workgroup2ScanTest/CMakeLists.txt rename to 74_Arithmetic2Bench/CMakeLists.txt diff --git a/73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl b/74_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl similarity index 100% rename from 73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl rename to 74_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl diff --git a/74_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/74_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl new file mode 100644 index 000000000..ed56dd766 --- /dev/null +++ b/74_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl @@ -0,0 +1,93 @@ +#pragma shader_stage(compute) + +#include "workgroupCommon.hlsl" + +// NOTE added dummy output image to be able to profile with Nsight, which 
still doesn't support profiling headless compute shaders +[[vk::binding(2, 0)]] RWTexture2D outImage; // dummy + +template +struct DataProxy +{ + using dtype_t = vector; + static_assert(nbl::hlsl::is_same_v); + + dtype_t get(const uint32_t ix) + { + return inputValue[ix]; + } + void set(const uint32_t ix, const dtype_t value) + { + output[Binop::BindingIndex].template Store(sizeof(uint32_t) + sizeof(type_t) * ix, value); + } + + void workgroupExecutionAndMemoryBarrier() + { + nbl::hlsl::glsl::barrier(); + //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above + } +}; + +static ScratchProxy arithmeticAccessor; + +template +struct operation_t +{ + using binop_base_t = typename Binop::base_t; + using otype_t = typename Binop::type_t; + + void operator()() + { + DataProxy dataAccessor; + nbl::hlsl::OPERATION::template __call, ScratchProxy>(dataAccessor,arithmeticAccessor); + // we barrier before because we alias the accessors for Binop + arithmeticAccessor.workgroupExecutionAndMemoryBarrier(); + } +}; + +#ifndef NUM_LOOPS +#error "Define NUM_LOOPS!" +#endif + +template class binop, typename T, uint32_t N> +static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal) +{ + if (globalIndex()==0u) + output[binop::BindingIndex].template Store(0,nbl::hlsl::glsl::gl_SubgroupSize()); + + operation_t,nbl::hlsl::jit::device_capabilities> func; + // TODO separate out store/load from DataProxy? 
so we don't do too many RW in benchmark + for (uint32_t i = 0; i < NUM_LOOPS; i++) + func(); // store is done with data accessor now +} + + +type_t benchmark() +{ + const type_t sourceVal = inputValue[globalIndex()]; + + subbench(sourceVal); + subbench(sourceVal); + subbench(sourceVal); + subbench(sourceVal); + subbench(sourceVal); + subbench(sourceVal); + subbench(sourceVal); + return sourceVal; +} + + +uint32_t globalIndex() +{ + return nbl::hlsl::glsl::gl_WorkGroupID().x*ITEMS_PER_WG+nbl::hlsl::workgroup::SubgroupContiguousIndex(); +} + +bool canStore() +{ + return nbl::hlsl::workgroup::SubgroupContiguousIndex(); + +typedef vector type_t; + +// unfortunately DXC chokes on descriptors as static members +// https://github.com/microsoft/DirectXShaderCompiler/issues/5940 +[[vk::binding(0, 0)]] StructuredBuffer inputValue; +[[vk::binding(1, 0)]] RWByteAddressBuffer output[8]; + +// because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way +uint32_t globalIndex(); +// since we test ITEMS_PER_WG(firstAssetInBundle); }; - auto subgroupTestSource = getShaderSource("app_resources/testSubgroup.comp.hlsl"); auto subgroupBenchSource = getShaderSource("app_resources/benchmarkSubgroup.comp.hlsl"); - //auto workgroupTestSource = getShaderSource("app_resources/testWorkgroup.comp.hlsl"); + auto workgroupBenchSource = getShaderSource("app_resources/benchmarkWorkgroup.comp.hlsl"); // now create or retrieve final resources to run our tests sema = m_device->createSemaphore(timelineValue); resultsBuffer = ICPUBuffer::create({ outputBuffers[0]->getSize() }); @@ -280,25 +279,22 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub return false; } } - - // TODO variable items per invocation? 
- const uint32_t NumLoops = 1000u; - const std::array workgroupSizes = { 256, 512, 1024 }; + // const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations; const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize; const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize; - - if (b_runTests) - { - runTests(cmdbuf.get(), subgroupTestSource, elementCount, ItemsPerInvocation, MinSubgroupSize, MaxSubgroupSize, workgroupSizes); - - m_logger->log("==========Result==========", ILogger::ELL_INFO); - m_logger->log("Fail Count: %u", ILogger::ELL_INFO, totalFailCount); - } // for each workgroup size (manually adjust items per invoc, operation else uses up a lot of ram) - for (uint32_t i = 0; i < workgroupSizes.size(); i++) - benchSets[i] = createBenchmarkPipelines(subgroupBenchSource, benchPplnLayout.get(), elementCount, hlsl::findMSB(MinSubgroupSize), workgroupSizes[i], ItemsPerInvocation, NumLoops); + if constexpr (DoWorkgroupBenchmarks) + { + for (uint32_t i = 0; i < workgroupSizes.size(); i++) + benchSets[i] = createBenchmarkPipelines(workgroupBenchSource, benchPplnLayout.get(), elementCount, hlsl::findMSB(MinSubgroupSize), workgroupSizes[i], ItemsPerInvocation, NumLoops); + } + else + { + for (uint32_t i = 0; i < workgroupSizes.size(); i++) + benchSets[i] = createBenchmarkPipelines(subgroupBenchSource, benchPplnLayout.get(), elementCount, hlsl::findMSB(MinSubgroupSize), workgroupSizes[i], ItemsPerInvocation, NumLoops); + } m_winMgr->show(m_window.get()); @@ -399,7 +395,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub cmdbuf->bindDescriptorSets(EPBP_COMPUTE, benchSets[0].pipeline->getLayout(), 0u, 1u, &benchDs.get()); for (uint32_t i = 0; i < benchSets.size(); i++) - runBenchmark(cmdbuf, benchSets[i], elementCount, SubgroupSizeLog2); + runBenchmark(cmdbuf, benchSets[0], elementCount, SubgroupSizeLog2); // blit @@ -570,40 +566,6 @@ class ArithmeticBenchApp final : public 
examples::SimpleWindowedApplication, pub } } - void runTests(IGPUCommandBuffer* cmdbuf, smart_refctd_ptr subgroupTestSource, uint32_t elementCount, uint32_t itemsPerInvocation, uint32_t MinSubgroupSize, uint32_t MaxSubgroupSize, const std::array& workgroupSizes) - { - for (auto subgroupSize = MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u) - { - const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize); - for (const auto& workgroupSize : workgroupSizes) - { - // make sure renderdoc captures everything for debugging - m_api->startCapture(); - m_logger->log("Testing Workgroup Size %u with Subgroup Size %u", ILogger::ELL_INFO, workgroupSize, subgroupSize); - - bool passed = true; - // TODO async the testing - passed = runTest(cmdbuf, subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, itemsPerInvocation) && passed; - logTestOutcome(passed, workgroupSize); - passed = runTest(cmdbuf, subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, itemsPerInvocation) && passed; - logTestOutcome(passed, workgroupSize); - passed = runTest(cmdbuf, subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, itemsPerInvocation) && passed; - logTestOutcome(passed, workgroupSize); - //for (uint32_t itemsPerWG = workgroupSize; itemsPerWG > workgroupSize - subgroupSize; itemsPerWG--) - //{ - // m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG); - // passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; - // logTestOutcome(passed, itemsPerWG); - // passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; - // logTestOutcome(passed, itemsPerWG); - // passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; - // logTestOutcome(passed, itemsPerWG); - //} - m_api->endCapture(); - } - } - } - // create pipeline (specialized every test) [TODO: turn 
into a future/async] smart_refctd_ptr createPipeline(const ICPUShader* overridenUnspecialized, const IGPUPipelineLayout* layout, const uint8_t subgroupSizeLog2) { @@ -630,15 +592,10 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub uint32_t itemsPerInvocation; }; - template class Arithmetic> + template class Arithmetic, bool WorkgroupBench> BenchmarkSet createBenchmarkPipelines(const smart_refctd_ptr&source, const IGPUPipelineLayout* layout, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerInvoc = 1u, uint32_t numLoops = 8u) { - std::string arith_name = Arithmetic>::name; // TODO all operations - - //smart_refctd_ptr overridenUnspecialized = CHLSLCompiler::createOverridenCopy( - // source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n#define ITEMS_PER_INVOCATION %d\n#define SUBGROUP_SIZE_LOG2 %d\n", - // (("subgroup2::") + arith_name).c_str(), workgroupSize, itemsPerInvoc, subgroupSizeLog2 - //); + std::string arith_name = Arithmetic>::name; auto compiler = make_smart_refctd_ptr(smart_refctd_ptr(m_system)); CHLSLCompiler::SOptions options = {}; @@ -659,182 +616,78 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub includeFinder->addSearchPath("nbl/builtin/hlsl/jit", core::make_smart_refctd_ptr(m_physicalDevice->getLimits(), m_device->getEnabledFeatures())); options.preprocessorOptions.includeFinder = includeFinder; - const std::string definitions[5] = { - "subgroup2::" + arith_name, - std::to_string(workgroupSize), - std::to_string(itemsPerInvoc), - std::to_string(subgroupSizeLog2), - std::to_string(numLoops) - }; - - const IShaderCompiler::SMacroDefinition defines[5] = { - { "OPERATION", definitions[0] }, - { "WORKGROUP_SIZE", definitions[1] }, - { "ITEMS_PER_INVOCATION", definitions[2] }, - { "SUBGROUP_SIZE_LOG2", definitions[3] }, - { "NUM_LOOPS", definitions[4] }, - }; - options.preprocessorOptions.extraDefines = { defines, 
defines + 5 }; - - smart_refctd_ptr overridenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options); - - BenchmarkSet set; - set.pipeline = createPipeline(overridenUnspecialized.get(), layout, subgroupSizeLog2); - set.workgroupSize = workgroupSize; - set.itemsPerInvocation = itemsPerInvoc; - - return set; - }; - - template class Arithmetic, bool WorkgroupTest> - bool runTest(IGPUCommandBuffer* cmdbuf, const smart_refctd_ptr& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u, uint32_t itemsPerInvoc = 1u) - { - std::string arith_name = Arithmetic>::name; - - smart_refctd_ptr overridenUnspecialized; - //if constexpr (WorkgroupTest) - //{ - // overridenUnspecialized = CHLSLCompiler::createOverridenCopy( - // source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n#define ITEMS_PER_WG %d\n", - // (("workgroup::") + arith_name).c_str(), workgroupSize, itemsPerWG - // ); - //} - //else - //{ - itemsPerWG = workgroupSize; - overridenUnspecialized = CHLSLCompiler::createOverridenCopy( - source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n#define ITEMS_PER_INVOCATION %d\n#define SUBGROUP_SIZE_LOG2 %d\n", - (("subgroup2::") + arith_name).c_str(), workgroupSize, itemsPerInvoc, subgroupSizeLog2 - ); - //} - auto pipeline = createPipeline(overridenUnspecialized.get(),testPplnLayout.get(), subgroupSizeLog2); - - // TODO: overlap dispatches with memory readbacks (requires multiple copies of `buffers`) - const uint32_t workgroupCount = elementCount / (itemsPerWG * itemsPerInvoc); - cmdbuf->begin(IGPUCommandBuffer::USAGE::NONE); - cmdbuf->bindComputePipeline(pipeline.get()); - cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &testDs.get()); - cmdbuf->dispatch(workgroupCount, 1, 1); + const uint32_t subgroupSize = 0x1u << subgroupSizeLog2; + const uint32_t itemsPerWG = workgroupSize <= 4 * subgroupSize ? 
workgroupSize : itemsPerInvoc * max(workgroupSize >> subgroupSizeLog2, subgroupSize) << subgroupSizeLog2; // TODO use Config somehow + smart_refctd_ptr overriddenUnspecialized; + if constexpr (WorkgroupBench) { - IGPUCommandBuffer::SPipelineBarrierDependencyInfo::buffer_barrier_t memoryBarrier[OutputBufferCount]; - for (auto i=0u; igetSize(),outputBuffers[i]} - }; - } - IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = {.memBarriers={},.bufBarriers=memoryBarrier}; - cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,info); - } - cmdbuf->end(); + const uint32_t workgroupSizeLog2 = hlsl::findMSB(workgroupSize); + const std::string definitions[6] = { + "workgroup2::" + arith_name, + std::to_string(workgroupSizeLog2), + std::to_string(itemsPerWG), + std::to_string(itemsPerInvoc), + std::to_string(subgroupSizeLog2), + std::to_string(numLoops) + }; - const IQueue::SSubmitInfo::SSemaphoreInfo signal[1] = {{.semaphore=sema.get(),.value=++timelineValue}}; - const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[1] = {{.cmdbuf=cmdbuf}}; - const IQueue::SSubmitInfo submits[1] = {{.commandBuffers=cmdbufs,.signalSemaphores=signal}}; - computeQueue->submit(submits); - const ISemaphore::SWaitInfo wait[1] = {{.semaphore=sema.get(),.value=timelineValue}}; - m_device->blockForSemaphores(wait); - - // check results - bool passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc); - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed; - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed; - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed; - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed; - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed; - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && 
passed; - //if constexpr (WorkgroupTest) - // passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; - - return passed; - } + const IShaderCompiler::SMacroDefinition defines[6] = { + { "OPERATION", definitions[0] }, + { "WORKGROUP_SIZE_LOG2", definitions[1] }, + { "ITEMS_PER_WG", definitions[2] }, + { "ITEMS_PER_INVOCATION", definitions[3] }, + { "SUBGROUP_SIZE_LOG2", definitions[4] }, + { "NUM_LOOPS", definitions[5] } + }; + options.preprocessorOptions.extraDefines = { defines, defines + 6 }; - //returns true if result matches - template class Arithmetic, class Binop, bool WorkgroupTest> - bool validateResults(const uint32_t itemsPerWG, const uint32_t workgroupCount, uint32_t itemsPerInvoc = 1u) - { - bool success = true; + overriddenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options); + } + else + { + const std::string definitions[5] = { + "subgroup2::" + arith_name, + std::to_string(workgroupSize), + std::to_string(itemsPerInvoc), + std::to_string(subgroupSizeLog2), + std::to_string(numLoops) + }; - // download data - const SBufferRange bufferRange = {0u, resultsBuffer->getSize(), outputBuffers[Binop::BindingIndex]}; - m_utils->downloadBufferRangeViaStagingBufferAutoSubmit(SIntendedSubmitInfo{.queue=transferDownQueue},bufferRange,resultsBuffer->getPointer()); + const IShaderCompiler::SMacroDefinition defines[5] = { + { "OPERATION", definitions[0] }, + { "WORKGROUP_SIZE", definitions[1] }, + { "ITEMS_PER_INVOCATION", definitions[2] }, + { "SUBGROUP_SIZE_LOG2", definitions[3] }, + { "NUM_LOOPS", definitions[4] } + }; + options.preprocessorOptions.extraDefines = { defines, defines + 5 }; - using type_t = typename Binop::type_t; - const auto dataFromBuffer = reinterpret_cast(resultsBuffer->getPointer()); - const auto subgroupSize = dataFromBuffer[0]; - if (subgroupSizenbl::hlsl::subgroup::MaxSubgroupSize) + overriddenUnspecialized = compiler->compileToSPIRV((const 
char*)source->getContent()->getPointer(), options); + } + + BenchmarkSet set; + set.pipeline = createPipeline(overriddenUnspecialized.get(), layout, subgroupSizeLog2); + if constexpr (WorkgroupBench) { - m_logger->log("Unexpected Subgroup Size %u", ILogger::ELL_ERROR, subgroupSize); - return false; + set.workgroupSize = itemsPerWG; } - - const auto testData = reinterpret_cast(dataFromBuffer + 1); - // TODO: parallel for (the temporary values need to be threadlocal or what?) - // now check if the data obtained has valid values - type_t* tmp = new type_t[itemsPerWG * itemsPerInvoc]; - //type_t* ballotInput = new type_t[itemsPerWG]; - for (uint32_t workgroupID = 0u; success && workgroupID < workgroupCount; workgroupID++) + else { - const auto workgroupOffset = workgroupID * itemsPerWG * itemsPerInvoc; - - //if constexpr (WorkgroupTest) - //{ - // if constexpr (std::is_same_v, Binop>) - // { - // for (auto i = 0u; i < itemsPerWG; i++) - // ballotInput[i] = inputData[i + workgroupOffset] & 0x1u; - // Arithmetic::impl(tmp, ballotInput, itemsPerWG); - // } - // else - // Arithmetic::impl(tmp, inputData + workgroupOffset, itemsPerWG); - //} - //else - //{ - for (uint32_t pseudoSubgroupID = 0u; pseudoSubgroupID < itemsPerWG; pseudoSubgroupID += subgroupSize) - Arithmetic::impl(tmp + pseudoSubgroupID * itemsPerInvoc, inputData + workgroupOffset + pseudoSubgroupID * itemsPerInvoc, subgroupSize * itemsPerInvoc); - //} - - for (uint32_t localInvocationIndex = 0u; localInvocationIndex < itemsPerWG; localInvocationIndex++) - { - const auto localOffset = localInvocationIndex * itemsPerInvoc; - const auto globalInvocationIndex = workgroupOffset + localOffset; - - for (uint32_t itemInvocationIndex = 0u; itemInvocationIndex < itemsPerInvoc; itemInvocationIndex++) - { - const auto cpuVal = tmp[localOffset + itemInvocationIndex]; - const auto gpuVal = testData[globalInvocationIndex + itemInvocationIndex]; - if (cpuVal != gpuVal) - { - m_logger->log( - "Failed test #%d (%s) (%s) 
Expected %u got %u for workgroup %d and localinvoc %d and iteminvoc %d", - ILogger::ELL_ERROR, itemsPerWG, WorkgroupTest ? "workgroup" : "subgroup", Binop::name, - cpuVal, gpuVal, workgroupID, localInvocationIndex, itemInvocationIndex - ); - success = false; - break; - } - } - } + set.workgroupSize = workgroupSize; } - //delete[] ballotInput; - delete[] tmp; - - return success; - } + set.itemsPerInvocation = itemsPerInvoc; + return set; + }; + template void runBenchmark(IGPUCommandBuffer* cmdbuf, const BenchmarkSet& set, const uint32_t elementCount, const uint8_t subgroupSizeLog2) { - const uint32_t workgroupCount = elementCount / (set.workgroupSize * set.itemsPerInvocation); + uint32_t workgroupCount; + if constexpr (WorkgroupBench) + workgroupCount = elementCount / set.workgroupSize; + else + workgroupCount = elementCount / (set.workgroupSize * set.itemsPerInvocation); cmdbuf->bindComputePipeline(set.pipeline.get()); cmdbuf->dispatch(workgroupCount, 1, 1); @@ -884,12 +737,16 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub constexpr static inline uint32_t MaxNumSubmits = 30; uint32_t numSubmits = 0; + /* PARAMETERS TO CHANGE FOR DIFFERENT BENCHMARKS */ + constexpr static inline bool DoWorkgroupBenchmarks = true; + uint32_t ItemsPerInvocation = 4u; + constexpr static inline uint32_t NumLoops = 1000u; + constexpr static inline std::array workgroupSizes = { 256, 512, 1024 }; template using ArithmeticOp = emulatedReduction; // change this to test other arithmetic ops - bool b_runTests = false; + uint32_t* inputData = nullptr; - uint32_t ItemsPerInvocation = 4u; constexpr static inline uint32_t OutputBufferCount = 8u; smart_refctd_ptr outputBuffers[OutputBufferCount]; diff --git a/74a_Workgroup2ScanTest/pipeline.groovy b/74_Arithmetic2Bench/pipeline.groovy similarity index 100% rename from 74a_Workgroup2ScanTest/pipeline.groovy rename to 74_Arithmetic2Bench/pipeline.groovy diff --git a/CMakeLists.txt b/CMakeLists.txt index 
5d7369560..dc6b74de1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -91,8 +91,8 @@ if(NBL_BUILD_EXAMPLES) add_subdirectory(70_FLIPFluids EXCLUDE_FROM_ALL) add_subdirectory(71_RayTracingPipeline EXCLUDE_FROM_ALL) - add_subdirectory(73_ArithmeticBench EXCLUDE_FROM_ALL) - add_subdirectory(74a_Workgroup2ScanTest EXCLUDE_FROM_ALL) + add_subdirectory(73_Arithmetic2UnitTest EXCLUDE_FROM_ALL) + add_subdirectory(74_Arithmetic2Bench EXCLUDE_FROM_ALL) NBL_HOOK_COMMON_API("${NBL_COMMON_API_TARGETS}") endif() From d567e716682695ec5ebcdff17e144e25576cd1f0 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 7 May 2025 15:16:57 +0700 Subject: [PATCH 160/296] removed obsolete files --- .../benchmarkWorkgroup.comp.hlsl | 97 ---------------- .../app_resources/testSubgroup.comp.hlsl | 18 --- .../app_resources/testWorkgroup.comp.hlsl | 107 ------------------ 74_Arithmetic2Bench/imgui.ini | 5 - 4 files changed, 227 deletions(-) delete mode 100644 73_Arithmetic2UnitTest/app_resources/benchmarkWorkgroup.comp.hlsl delete mode 100644 74_Arithmetic2Bench/app_resources/testSubgroup.comp.hlsl delete mode 100644 74_Arithmetic2Bench/app_resources/testWorkgroup.comp.hlsl delete mode 100644 74_Arithmetic2Bench/imgui.ini diff --git a/73_Arithmetic2UnitTest/app_resources/benchmarkWorkgroup.comp.hlsl b/73_Arithmetic2UnitTest/app_resources/benchmarkWorkgroup.comp.hlsl deleted file mode 100644 index e20e528d7..000000000 --- a/73_Arithmetic2UnitTest/app_resources/benchmarkWorkgroup.comp.hlsl +++ /dev/null @@ -1,97 +0,0 @@ -#pragma shader_stage(compute) - -#include "workgroupCommon.hlsl" - -template -struct DataProxy -{ - using dtype_t = vector; - static_assert(nbl::hlsl::is_same_v); - - dtype_t get(const uint32_t ix) - { - // return inputValue[ix]; - return inputVal; - } - void set(const uint32_t ix, const dtype_t value) - { - // output[Binop::BindingIndex].template Store(sizeof(uint32_t) + sizeof(type_t) * ix, value); - outputVal = value; - } - - void workgroupExecutionAndMemoryBarrier() - { - 
nbl::hlsl::glsl::barrier(); - //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above - } - - // to avoid multiple load/store in benchmark, also values not that important? - dtype_t inputVal; - dtype_t outputVal; -}; - -static ScratchProxy arithmeticAccessor; - -template -struct operation_t -{ - using binop_base_t = typename Binop::base_t; - using otype_t = typename Binop::type_t; - - otype_t operator()() - { - DataProxy dataAccessor; - dataAccessor.inputVal = inputValue[globalIndex()]; - nbl::hlsl::OPERATION::template __call, ScratchProxy >(dataAccessor,arithmeticAccessor); - // we barrier before because we alias the accessors for Binop - arithmeticAccessor.workgroupExecutionAndMemoryBarrier(); - return dataAccessor.outputVal; - } -}; - - -template class binop, typename T, uint32_t N> -static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) -{ - if (globalIndex()==0u) - output[binop::BindingIndex].template Store(0,nbl::hlsl::glsl::gl_SubgroupSize()); - - type_t value; - operation_t,nbl::hlsl::jit::device_capabilities> func; - for (uint32_t i = 0; i < NUM_LOOPS; i++) - value = func(); // store is done with data accessor now - - output[binop::BindingIndex].template Store(sizeof(uint32_t) + sizeof(type_t) * globalIndex(), value); -} - - -type_t test() -{ - const type_t sourceVal = inputValue[globalIndex()]; - - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - return sourceVal; -} - - -uint32_t globalIndex() -{ - return nbl::hlsl::glsl::gl_WorkGroupID().x*ITEMS_PER_WG+nbl::hlsl::workgroup::SubgroupContiguousIndex(); -} - -bool canStore() -{ - return nbl::hlsl::workgroup::SubgroupContiguousIndex()::value; -static const uint32_t BallotSz = nbl::hlsl::workgroup::scratch_size_ballot::value; -static const uint32_t ScratchSz = ArithmeticSz+BallotSz; - -// TODO: Can we make it a static variable in the ScratchProxy struct? 
-groupshared uint32_t scratch[ScratchSz]; - - -#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl" - - -template -struct ScratchProxy -{ - void get(const uint32_t ix, NBL_REF_ARG(uint32_t) value) - { - value = scratch[ix+offset]; - } - void set(const uint32_t ix, const uint32_t value) - { - scratch[ix+offset] = value; - } - - uint32_t atomicOr(const uint32_t ix, const uint32_t value) - { - return nbl::hlsl::glsl::atomicOr(scratch[ix],value); - } - - void workgroupExecutionAndMemoryBarrier() - { - nbl::hlsl::glsl::barrier(); - //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above - } -}; - -static ScratchProxy<0> arithmeticAccessor; - - -#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl" - - -template -struct operation_t -{ - using type_t = typename Binop::type_t; - - type_t operator()(type_t value) - { - type_t retval = nbl::hlsl::OPERATION::template __call >(value,arithmeticAccessor); - // we barrier before because we alias the accessors for Binop - arithmeticAccessor.workgroupExecutionAndMemoryBarrier(); - return retval; - } -}; - - -#include "shaderCommon.hlsl" - -static ScratchProxy ballotAccessor; - - -uint32_t globalIndex() -{ - return nbl::hlsl::glsl::gl_WorkGroupID().x*ITEMS_PER_WG+nbl::hlsl::workgroup::SubgroupContiguousIndex(); -} - -bool canStore() -{ - return nbl::hlsl::workgroup::SubgroupContiguousIndex()::BindingIndex].template Store(0,nbl::hlsl::glsl::gl_SubgroupSize()); - - // we can only ballot booleans, so low bit - nbl::hlsl::workgroup::ballot >(bool(sourceVal & 0x1u), ballotAccessor); - // need to barrier between ballot and usages of a ballot by myself - ballotAccessor.workgroupExecutionAndMemoryBarrier(); - - uint32_t destVal = 0xdeadbeefu; -#define CONSTEXPR_OP_TYPE_TEST(IS_OP) nbl::hlsl::is_same,0x45>,nbl::hlsl::workgroup::IS_OP,0x45> >::value -#define BALLOT_TEMPLATE_ARGS ITEMS_PER_WG,decltype(ballotAccessor),decltype(arithmeticAccessor),nbl::hlsl::jit::device_capabilities - if (CONSTEXPR_OP_TYPE_TEST(reduction)) - destVal = 
nbl::hlsl::workgroup::ballotBitCount(ballotAccessor,arithmeticAccessor); - else if (CONSTEXPR_OP_TYPE_TEST(inclusive_scan)) - destVal = nbl::hlsl::workgroup::ballotInclusiveBitCount(ballotAccessor,arithmeticAccessor); - else if (CONSTEXPR_OP_TYPE_TEST(exclusive_scan)) - destVal = nbl::hlsl::workgroup::ballotExclusiveBitCount(ballotAccessor,arithmeticAccessor); - else - { - assert(false); - } -#undef BALLOT_TEMPLATE_ARGS -#undef CONSTEXPR_OP_TYPE_TEST - - if (canStore()) - output[ballot::BindingIndex].template Store(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),destVal); -} \ No newline at end of file diff --git a/74_Arithmetic2Bench/imgui.ini b/74_Arithmetic2Bench/imgui.ini deleted file mode 100644 index 4a5c20148..000000000 --- a/74_Arithmetic2Bench/imgui.ini +++ /dev/null @@ -1,5 +0,0 @@ -[Window][Debug##Default] -Pos=60,60 -Size=400,400 -Collapsed=0 - From 54acf2a433f3d5d4abaedfc8b0a33a435b45977e Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 7 May 2025 15:26:03 +0700 Subject: [PATCH 161/296] replaced old ex 23 unit test with new tests --- .../CMakeLists.txt | 0 .../app_resources/common.hlsl | 0 .../app_resources/shaderCommon.hlsl | 0 .../app_resources/testSubgroup.comp.hlsl | 0 .../app_resources/testWorkgroup.comp.hlsl | 0 .../app_resources/workgroupCommon.hlsl | 0 .../config.json.template | 0 .../main.cpp | 0 .../pipeline.groovy | 0 .../app_resources/shaderCommon.hlsl | 55 --- .../app_resources/testSubgroup.comp.hlsl | 18 - .../app_resources/testWorkgroup.comp.hlsl | 107 ---- 23_ArithmeticUnitTest/main.cpp | 462 ------------------ 73_Arithmetic2UnitTest/CMakeLists.txt | 25 - .../app_resources/common.hlsl | 96 ---- 73_Arithmetic2UnitTest/config.json.template | 28 -- 73_Arithmetic2UnitTest/pipeline.groovy | 50 -- 17 files changed, 841 deletions(-) rename {23_ArithmeticUnitTest => 23_Arithmetic2UnitTest}/CMakeLists.txt (100%) rename {23_ArithmeticUnitTest => 23_Arithmetic2UnitTest}/app_resources/common.hlsl (100%) rename {73_Arithmetic2UnitTest => 
23_Arithmetic2UnitTest}/app_resources/shaderCommon.hlsl (100%) rename {73_Arithmetic2UnitTest => 23_Arithmetic2UnitTest}/app_resources/testSubgroup.comp.hlsl (100%) rename {73_Arithmetic2UnitTest => 23_Arithmetic2UnitTest}/app_resources/testWorkgroup.comp.hlsl (100%) rename {73_Arithmetic2UnitTest => 23_Arithmetic2UnitTest}/app_resources/workgroupCommon.hlsl (100%) rename {23_ArithmeticUnitTest => 23_Arithmetic2UnitTest}/config.json.template (100%) rename {73_Arithmetic2UnitTest => 23_Arithmetic2UnitTest}/main.cpp (100%) rename {23_ArithmeticUnitTest => 23_Arithmetic2UnitTest}/pipeline.groovy (100%) delete mode 100644 23_ArithmeticUnitTest/app_resources/shaderCommon.hlsl delete mode 100644 23_ArithmeticUnitTest/app_resources/testSubgroup.comp.hlsl delete mode 100644 23_ArithmeticUnitTest/app_resources/testWorkgroup.comp.hlsl delete mode 100644 23_ArithmeticUnitTest/main.cpp delete mode 100644 73_Arithmetic2UnitTest/CMakeLists.txt delete mode 100644 73_Arithmetic2UnitTest/app_resources/common.hlsl delete mode 100644 73_Arithmetic2UnitTest/config.json.template delete mode 100644 73_Arithmetic2UnitTest/pipeline.groovy diff --git a/23_ArithmeticUnitTest/CMakeLists.txt b/23_Arithmetic2UnitTest/CMakeLists.txt similarity index 100% rename from 23_ArithmeticUnitTest/CMakeLists.txt rename to 23_Arithmetic2UnitTest/CMakeLists.txt diff --git a/23_ArithmeticUnitTest/app_resources/common.hlsl b/23_Arithmetic2UnitTest/app_resources/common.hlsl similarity index 100% rename from 23_ArithmeticUnitTest/app_resources/common.hlsl rename to 23_Arithmetic2UnitTest/app_resources/common.hlsl diff --git a/73_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl b/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl similarity index 100% rename from 73_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl rename to 23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl diff --git a/73_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl 
b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl similarity index 100% rename from 73_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl rename to 23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl diff --git a/73_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl similarity index 100% rename from 73_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl rename to 23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl diff --git a/73_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl b/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl similarity index 100% rename from 73_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl rename to 23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl diff --git a/23_ArithmeticUnitTest/config.json.template b/23_Arithmetic2UnitTest/config.json.template similarity index 100% rename from 23_ArithmeticUnitTest/config.json.template rename to 23_Arithmetic2UnitTest/config.json.template diff --git a/73_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp similarity index 100% rename from 73_Arithmetic2UnitTest/main.cpp rename to 23_Arithmetic2UnitTest/main.cpp diff --git a/23_ArithmeticUnitTest/pipeline.groovy b/23_Arithmetic2UnitTest/pipeline.groovy similarity index 100% rename from 23_ArithmeticUnitTest/pipeline.groovy rename to 23_Arithmetic2UnitTest/pipeline.groovy diff --git a/23_ArithmeticUnitTest/app_resources/shaderCommon.hlsl b/23_ArithmeticUnitTest/app_resources/shaderCommon.hlsl deleted file mode 100644 index 13ee8d21e..000000000 --- a/23_ArithmeticUnitTest/app_resources/shaderCommon.hlsl +++ /dev/null @@ -1,55 +0,0 @@ -#include "common.hlsl" - -#include "nbl/builtin/hlsl/glsl_compat/core.hlsl" -#include "nbl/builtin/hlsl/subgroup/basic.hlsl" -#include "nbl/builtin/hlsl/subgroup/arithmetic_portability.hlsl" - -#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl" - -// 
https://github.com/microsoft/DirectXShaderCompiler/issues/6144 -uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);} - -// unfortunately DXC chokes on descriptors as static members -// https://github.com/microsoft/DirectXShaderCompiler/issues/5940 -[[vk::binding(0, 0)]] StructuredBuffer inputValue; -[[vk::binding(1, 0)]] RWByteAddressBuffer output[8]; - -// because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way -uint32_t globalIndex(); -// since we test ITEMS_PER_WG class binop> -static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) -{ - if (globalIndex()==0u) - output[binop::BindingIndex].template Store(0,nbl::hlsl::glsl::gl_SubgroupSize()); - - operation_t::base_t,nbl::hlsl::jit::device_capabilities> func; - if (canStore()) - output[binop::BindingIndex].template Store(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func(sourceVal)); -} - - -type_t test() -{ - const type_t sourceVal = inputValue[globalIndex()]; - - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - return sourceVal; -} - -#include "nbl/builtin/hlsl/workgroup/basic.hlsl" \ No newline at end of file diff --git a/23_ArithmeticUnitTest/app_resources/testSubgroup.comp.hlsl b/23_ArithmeticUnitTest/app_resources/testSubgroup.comp.hlsl deleted file mode 100644 index 479265d73..000000000 --- a/23_ArithmeticUnitTest/app_resources/testSubgroup.comp.hlsl +++ /dev/null @@ -1,18 +0,0 @@ -#pragma shader_stage(compute) - -#define operation_t nbl::hlsl::OPERATION - -#include "shaderCommon.hlsl" - -uint32_t globalIndex() -{ - return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex(); -} - -bool canStore() {return true;} - -[numthreads(WORKGROUP_SIZE,1,1)] -void main() -{ - test(); -} \ No newline at end of file diff --git 
a/23_ArithmeticUnitTest/app_resources/testWorkgroup.comp.hlsl b/23_ArithmeticUnitTest/app_resources/testWorkgroup.comp.hlsl deleted file mode 100644 index 9bafae47f..000000000 --- a/23_ArithmeticUnitTest/app_resources/testWorkgroup.comp.hlsl +++ /dev/null @@ -1,107 +0,0 @@ -#pragma shader_stage(compute) - - -#include "nbl/builtin/hlsl/workgroup/scratch_size.hlsl" - -static const uint32_t ArithmeticSz = nbl::hlsl::workgroup::scratch_size_arithmetic::value; -static const uint32_t BallotSz = nbl::hlsl::workgroup::scratch_size_ballot::value; -static const uint32_t ScratchSz = ArithmeticSz+BallotSz; - -// TODO: Can we make it a static variable in the ScratchProxy struct? -groupshared uint32_t scratch[ScratchSz]; - - -#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl" - - -template -struct ScratchProxy -{ - void get(const uint32_t ix, NBL_REF_ARG(uint32_t) value) - { - value = scratch[ix+offset]; - } - void set(const uint32_t ix, const uint32_t value) - { - scratch[ix+offset] = value; - } - - uint32_t atomicOr(const uint32_t ix, const uint32_t value) - { - return nbl::hlsl::glsl::atomicOr(scratch[ix],value); - } - - void workgroupExecutionAndMemoryBarrier() - { - nbl::hlsl::glsl::barrier(); - //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above - } -}; - -static ScratchProxy<0> arithmeticAccessor; - - -#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl" - - -template -struct operation_t -{ - using type_t = typename Binop::type_t; - - type_t operator()(type_t value) - { - type_t retval = nbl::hlsl::OPERATION::template __call >(value,arithmeticAccessor); - // we barrier before because we alias the accessors for Binop - arithmeticAccessor.workgroupExecutionAndMemoryBarrier(); - return retval; - } -}; - - -#include "shaderCommon.hlsl" - -static ScratchProxy ballotAccessor; - - -uint32_t globalIndex() -{ - return nbl::hlsl::glsl::gl_WorkGroupID().x*ITEMS_PER_WG+nbl::hlsl::workgroup::SubgroupContiguousIndex(); -} - -bool canStore() -{ - return 
nbl::hlsl::workgroup::SubgroupContiguousIndex()::BindingIndex].template Store(0,nbl::hlsl::glsl::gl_SubgroupSize()); - - // we can only ballot booleans, so low bit - nbl::hlsl::workgroup::ballot >(bool(sourceVal & 0x1u), ballotAccessor); - // need to barrier between ballot and usages of a ballot by myself - ballotAccessor.workgroupExecutionAndMemoryBarrier(); - - uint32_t destVal = 0xdeadbeefu; -#define CONSTEXPR_OP_TYPE_TEST(IS_OP) nbl::hlsl::is_same,0x45>,nbl::hlsl::workgroup::IS_OP,0x45> >::value -#define BALLOT_TEMPLATE_ARGS ITEMS_PER_WG,decltype(ballotAccessor),decltype(arithmeticAccessor),nbl::hlsl::jit::device_capabilities - if (CONSTEXPR_OP_TYPE_TEST(reduction)) - destVal = nbl::hlsl::workgroup::ballotBitCount(ballotAccessor,arithmeticAccessor); - else if (CONSTEXPR_OP_TYPE_TEST(inclusive_scan)) - destVal = nbl::hlsl::workgroup::ballotInclusiveBitCount(ballotAccessor,arithmeticAccessor); - else if (CONSTEXPR_OP_TYPE_TEST(exclusive_scan)) - destVal = nbl::hlsl::workgroup::ballotExclusiveBitCount(ballotAccessor,arithmeticAccessor); - else - { - assert(false); - } -#undef BALLOT_TEMPLATE_ARGS -#undef CONSTEXPR_OP_TYPE_TEST - - if (canStore()) - output[ballot::BindingIndex].template Store(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),destVal); -} \ No newline at end of file diff --git a/23_ArithmeticUnitTest/main.cpp b/23_ArithmeticUnitTest/main.cpp deleted file mode 100644 index 147d231e2..000000000 --- a/23_ArithmeticUnitTest/main.cpp +++ /dev/null @@ -1,462 +0,0 @@ -#include "nbl/application_templates/BasicMultiQueueApplication.hpp" -#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" -#include "app_resources/common.hlsl" - -using namespace nbl; -using namespace core; -using namespace asset; -using namespace system; -using namespace video; - -// method emulations on the CPU, to verify the results of the GPU methods -template -struct emulatedReduction -{ - using type_t = typename Binop::type_t; - - static inline void 
impl(type_t* out, const type_t* in, const uint32_t itemCount) - { - const type_t red = std::reduce(in,in+itemCount,Binop::identity,Binop()); - std::fill(out,out+itemCount,red); - } - - static inline constexpr const char* name = "reduction"; -}; -template -struct emulatedScanInclusive -{ - using type_t = typename Binop::type_t; - - static inline void impl(type_t* out, const type_t* in, const uint32_t itemCount) - { - std::inclusive_scan(in,in+itemCount,out,Binop()); - } - static inline constexpr const char* name = "inclusive_scan"; -}; -template -struct emulatedScanExclusive -{ - using type_t = typename Binop::type_t; - - static inline void impl(type_t* out, const type_t* in, const uint32_t itemCount) - { - std::exclusive_scan(in,in+itemCount,out,Binop::identity,Binop()); - } - static inline constexpr const char* name = "exclusive_scan"; -}; - -class ArithmeticUnitTestApp final : public application_templates::BasicMultiQueueApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication -{ - using device_base_t = application_templates::BasicMultiQueueApplication; - using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; - -public: - ArithmeticUnitTestApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : - system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} - - bool onAppInitialized(smart_refctd_ptr&& system) override - { - if (!device_base_t::onAppInitialized(std::move(system))) - return false; - if (!asset_base_t::onAppInitialized(std::move(system))) - return false; - - transferDownQueue = getTransferDownQueue(); - computeQueue = getComputeQueue(); - - // TODO: get the element count from argv - const uint32_t elementCount = Output<>::ScanElementCount; - // populate our random data buffer on the CPU and create a GPU copy - inputData = new uint32_t[elementCount]; - smart_refctd_ptr 
gpuinputDataBuffer; - { - std::mt19937 randGenerator(0xdeadbeefu); - for (uint32_t i = 0u; i < elementCount; i++) - inputData[i] = randGenerator(); // TODO: change to using xoroshiro, then we can skip having the input buffer at all - - IGPUBuffer::SCreationParams inputDataBufferCreationParams = {}; - inputDataBufferCreationParams.size = sizeof(Output<>::data[0]) * elementCount; - inputDataBufferCreationParams.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT; - m_utils->createFilledDeviceLocalBufferOnDedMem( - SIntendedSubmitInfo{.queue=getTransferUpQueue()}, - std::move(inputDataBufferCreationParams), - inputData - ).move_into(gpuinputDataBuffer); - } - - // create 8 buffers for 8 operations - for (auto i=0u; igetSize(); - params.usage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_SRC_BIT; - - outputBuffers[i] = m_device->createBuffer(std::move(params)); - auto mreq = outputBuffers[i]->getMemoryReqs(); - mreq.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits(); - assert(mreq.memoryTypeBits); - - auto bufferMem = m_device->allocate(mreq, outputBuffers[i].get()); - assert(bufferMem.isValid()); - } - - // create Descriptor Set and Pipeline Layout - { - // create Descriptor Set Layout - smart_refctd_ptr dsLayout; - { - IGPUDescriptorSetLayout::SBinding binding[2]; - for (uint32_t i = 0u; i < 2; i++) - binding[i] = {{},i,IDescriptor::E_TYPE::ET_STORAGE_BUFFER,IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,IShader::E_SHADER_STAGE::ESS_COMPUTE,1u,nullptr }; - binding[1].count = OutputBufferCount; - dsLayout = m_device->createDescriptorSetLayout(binding); - } - - // set and transient pool - auto descPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE,{&dsLayout.get(),1}); - descriptorSet = descPool->createDescriptorSet(smart_refctd_ptr(dsLayout)); - { - IGPUDescriptorSet::SDescriptorInfo infos[1+OutputBufferCount]; - infos[0].desc = gpuinputDataBuffer; - 
infos[0].info.buffer = { 0u,gpuinputDataBuffer->getSize() }; - for (uint32_t i = 1u; i <= OutputBufferCount; i++) - { - auto buff = outputBuffers[i - 1]; - infos[i].info.buffer = { 0u,buff->getSize() }; - infos[i].desc = std::move(buff); // save an atomic in the refcount - - } - - IGPUDescriptorSet::SWriteDescriptorSet writes[2]; - for (uint32_t i=0u; i<2; i++) - writes[i] = {descriptorSet.get(),i,0u,1u,infos+i}; - writes[1].count = OutputBufferCount; - - m_device->updateDescriptorSets(2, writes, 0u, nullptr); - } - - pipelineLayout = m_device->createPipelineLayout({},std::move(dsLayout)); - } - - const auto spirv_isa_cache_path = localOutputCWD/"spirv_isa_cache.bin"; - // enclose to make sure file goes out of scope and we can reopen it - { - smart_refctd_ptr spirv_isa_cache_input; - // try to load SPIR-V to ISA cache - { - ISystem::future_t> fileCreate; - m_system->createFile(fileCreate,spirv_isa_cache_path,IFile::ECF_READ|IFile::ECF_MAPPABLE|IFile::ECF_COHERENT); - if (auto lock=fileCreate.acquire()) - spirv_isa_cache_input = *lock; - } - // create the cache - { - std::span spirv_isa_cache_data = {}; - if (spirv_isa_cache_input) - spirv_isa_cache_data = {reinterpret_cast(spirv_isa_cache_input->getMappedPointer()),spirv_isa_cache_input->getSize()}; - else - m_logger->log("Failed to load SPIR-V 2 ISA cache!",ILogger::ELL_PERFORMANCE); - // Normally we'd deserialize a `ICPUPipelineCache` properly and pass that instead - m_spirv_isa_cache = m_device->createPipelineCache(spirv_isa_cache_data); - } - } - { - // TODO: rename `deleteDirectory` to just `delete`? and a `IFile::setSize()` ? - m_system->deleteDirectory(spirv_isa_cache_path); - ISystem::future_t> fileCreate; - m_system->createFile(fileCreate,spirv_isa_cache_path,IFile::ECF_WRITE); - // I can be relatively sure I'll succeed to acquire the future, the pointer to created file might be null though. 
- m_spirv_isa_cache_output=*fileCreate.acquire(); - if (!m_spirv_isa_cache_output) - logFail("Failed to Create SPIR-V to ISA cache file."); - } - - // load shader source from file - auto getShaderSource = [&](const char* filePath) -> auto - { - IAssetLoader::SAssetLoadParams lparams = {}; - lparams.logger = m_logger.get(); - lparams.workingDirectory = ""; - auto bundle = m_assetMgr->getAsset(filePath, lparams); - if (bundle.getContents().empty() || bundle.getAssetType()!=IAsset::ET_SHADER) - { - m_logger->log("Shader %s not found!", ILogger::ELL_ERROR, filePath); - exit(-1); - } - auto firstAssetInBundle = bundle.getContents()[0]; - return smart_refctd_ptr_static_cast(firstAssetInBundle); - }; - - auto subgroupTestSource = getShaderSource("app_resources/testSubgroup.comp.hlsl"); - auto workgroupTestSource = getShaderSource("app_resources/testWorkgroup.comp.hlsl"); - // now create or retrieve final resources to run our tests - sema = m_device->createSemaphore(timelineValue); - resultsBuffer = ICPUBuffer::create({ outputBuffers[0]->getSize() }); - { - smart_refctd_ptr cmdpool = m_device->createCommandPool(computeQueue->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{&cmdbuf,1})) - { - logFail("Failed to create Command Buffers!\n"); - return false; - } - } - - const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations; - const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize; - const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize; - for (auto subgroupSize=MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u) - { - const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize); - for (uint32_t workgroupSize = subgroupSize; workgroupSize <= MaxWorkgroupSize; workgroupSize += subgroupSize) - { - // make sure renderdoc captures everything for debugging - m_api->startCapture(); - 
m_logger->log("Testing Workgroup Size %u with Subgroup Size %u", ILogger::ELL_INFO, workgroupSize, subgroupSize); - - bool passed = true; - // TODO async the testing - passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed; - logTestOutcome(passed, workgroupSize); - passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed; - logTestOutcome(passed, workgroupSize); - passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed; - logTestOutcome(passed, workgroupSize); - for (uint32_t itemsPerWG = workgroupSize; itemsPerWG > workgroupSize - subgroupSize; itemsPerWG--) - { - m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG); - passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; - logTestOutcome(passed, itemsPerWG); - passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; - logTestOutcome(passed, itemsPerWG); - passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; - logTestOutcome(passed, itemsPerWG); - } - m_api->endCapture(); - - // save cache every now and then - { - auto cpu = m_spirv_isa_cache->convertToCPUCache(); - // Normally we'd beautifully JSON serialize the thing, allow multiple devices & drivers + metadata - auto bin = cpu->getEntries().begin()->second.bin; - IFile::success_t success; - m_spirv_isa_cache_output->write(success,bin->data(),0ull,bin->size()); - if (!success) - logFail("Could not write Create SPIR-V to ISA cache to disk!"); - } - } - } - - return true; - } - - virtual bool onAppTerminated() override - { - m_logger->log("==========Result==========", ILogger::ELL_INFO); - m_logger->log("Fail Count: %u", ILogger::ELL_INFO, totalFailCount); - delete[] inputData; - return true; - } - - // the unit test is carried out on init - void workLoopBody() override {} - - 
// - bool keepRunning() override { return false; } - -private: - void logTestOutcome(bool passed, uint32_t workgroupSize) - { - if (passed) - m_logger->log("Passed test #%u", ILogger::ELL_INFO, workgroupSize); - else - { - totalFailCount++; - m_logger->log("Failed test #%u", ILogger::ELL_ERROR, workgroupSize); - } - } - - // create pipeline (specialized every test) [TODO: turn into a future/async] - smart_refctd_ptr createPipeline(const ICPUShader* overridenUnspecialized, const uint8_t subgroupSizeLog2) - { - auto shader = m_device->createShader(overridenUnspecialized); - IGPUComputePipeline::SCreationParams params = {}; - params.layout = pipelineLayout.get(); - params.shader = { - .entryPoint = "main", - .shader = shader.get(), - .entries = nullptr, - .requiredSubgroupSize = static_cast(subgroupSizeLog2), - .requireFullSubgroups = true - }; - core::smart_refctd_ptr pipeline; - if (!m_device->createComputePipelines(m_spirv_isa_cache.get(),{¶ms,1},&pipeline)) - return nullptr; - return pipeline; - } - - /*template class Arithmetic, bool WorkgroupTest> - bool runTest(const smart_refctd_ptr& source, const uint32_t elementCount, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u) - { - return true; - }*/ - - template class Arithmetic, bool WorkgroupTest> - bool runTest(const smart_refctd_ptr& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u) - { - std::string arith_name = Arithmetic>::name; - - smart_refctd_ptr overridenUnspecialized; - if constexpr (WorkgroupTest) - { - overridenUnspecialized = CHLSLCompiler::createOverridenCopy( - source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n#define ITEMS_PER_WG %d\n", - (("workgroup::") + arith_name).c_str(), workgroupSize, itemsPerWG - ); - } - else - { - itemsPerWG = workgroupSize; - overridenUnspecialized = CHLSLCompiler::createOverridenCopy( - source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n", - (("subgroup::") + 
arith_name).c_str(), workgroupSize - ); - } - auto pipeline = createPipeline(overridenUnspecialized.get(),subgroupSizeLog2); - - // TODO: overlap dispatches with memory readbacks (requires multiple copies of `buffers`) - const uint32_t workgroupCount = elementCount / itemsPerWG; - cmdbuf->begin(IGPUCommandBuffer::USAGE::NONE); - cmdbuf->bindComputePipeline(pipeline.get()); - cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &descriptorSet.get()); - cmdbuf->dispatch(workgroupCount, 1, 1); - { - IGPUCommandBuffer::SPipelineBarrierDependencyInfo::buffer_barrier_t memoryBarrier[OutputBufferCount]; - for (auto i=0u; igetSize(),outputBuffers[i]} - }; - } - IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = {.memBarriers={},.bufBarriers=memoryBarrier}; - cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,info); - } - cmdbuf->end(); - - const IQueue::SSubmitInfo::SSemaphoreInfo signal[1] = {{.semaphore=sema.get(),.value=++timelineValue}}; - const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[1] = {{.cmdbuf=cmdbuf.get()}}; - const IQueue::SSubmitInfo submits[1] = {{.commandBuffers=cmdbufs,.signalSemaphores=signal}}; - computeQueue->submit(submits); - const ISemaphore::SWaitInfo wait[1] = {{.semaphore=sema.get(),.value=timelineValue}}; - m_device->blockForSemaphores(wait); - - // check results - bool passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount); - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; - if constexpr (WorkgroupTest) - passed = validateResults, WorkgroupTest>(itemsPerWG, 
workgroupCount) && passed; - - return passed; - } - - //returns true if result matches - template class Arithmetic, class Binop, bool WorkgroupTest> - bool validateResults(const uint32_t itemsPerWG, const uint32_t workgroupCount) - { - bool success = true; - - // download data - const SBufferRange bufferRange = {0u, resultsBuffer->getSize(), outputBuffers[Binop::BindingIndex]}; - m_utils->downloadBufferRangeViaStagingBufferAutoSubmit(SIntendedSubmitInfo{.queue=transferDownQueue},bufferRange,resultsBuffer->getPointer()); - - using type_t = typename Binop::type_t; - const auto dataFromBuffer = reinterpret_cast(resultsBuffer->getPointer()); - const auto subgroupSize = dataFromBuffer[0]; - if (subgroupSizenbl::hlsl::subgroup::MaxSubgroupSize) - { - m_logger->log("Unexpected Subgroup Size %u", ILogger::ELL_ERROR, subgroupSize); - return false; - } - - const auto testData = reinterpret_cast(dataFromBuffer + 1); - // TODO: parallel for (the temporary values need to be threadlocal or what?) - // now check if the data obtained has valid values - type_t* tmp = new type_t[itemsPerWG]; - type_t* ballotInput = new type_t[itemsPerWG]; - for (uint32_t workgroupID = 0u; success && workgroupID < workgroupCount; workgroupID++) - { - const auto workgroupOffset = workgroupID * itemsPerWG; - - if constexpr (WorkgroupTest) - { - if constexpr (std::is_same_v, Binop>) - { - for (auto i = 0u; i < itemsPerWG; i++) - ballotInput[i] = inputData[i + workgroupOffset] & 0x1u; - Arithmetic::impl(tmp, ballotInput, itemsPerWG); - } - else - Arithmetic::impl(tmp, inputData + workgroupOffset, itemsPerWG); - } - else - { - for (uint32_t pseudoSubgroupID = 0u; pseudoSubgroupID < itemsPerWG; pseudoSubgroupID += subgroupSize) - Arithmetic::impl(tmp + pseudoSubgroupID, inputData + workgroupOffset + pseudoSubgroupID, subgroupSize); - } - - for (uint32_t localInvocationIndex = 0u; localInvocationIndex < itemsPerWG; localInvocationIndex++) - { - const auto globalInvocationIndex = workgroupOffset + 
localInvocationIndex; - const auto cpuVal = tmp[localInvocationIndex]; - const auto gpuVal = testData[globalInvocationIndex]; - if (cpuVal != gpuVal) - { - m_logger->log( - "Failed test #%d (%s) (%s) Expected %u got %u for workgroup %d and localinvoc %d", - ILogger::ELL_ERROR, itemsPerWG, WorkgroupTest ? "workgroup" : "subgroup", Binop::name, - cpuVal, gpuVal, workgroupID, localInvocationIndex - ); - success = false; - break; - } - } - } - delete[] ballotInput; - delete[] tmp; - - return success; - } - - IQueue* transferDownQueue; - IQueue* computeQueue; - smart_refctd_ptr m_spirv_isa_cache; - smart_refctd_ptr m_spirv_isa_cache_output; - - uint32_t* inputData = nullptr; - constexpr static inline uint32_t OutputBufferCount = 8u; - smart_refctd_ptr outputBuffers[OutputBufferCount]; - smart_refctd_ptr descriptorSet; - smart_refctd_ptr pipelineLayout; - - smart_refctd_ptr sema; - uint64_t timelineValue = 0; - smart_refctd_ptr cmdbuf; - smart_refctd_ptr resultsBuffer; - - uint32_t totalFailCount = 0; -}; - -NBL_MAIN_FUNC(ArithmeticUnitTestApp) \ No newline at end of file diff --git a/73_Arithmetic2UnitTest/CMakeLists.txt b/73_Arithmetic2UnitTest/CMakeLists.txt deleted file mode 100644 index 0724366c9..000000000 --- a/73_Arithmetic2UnitTest/CMakeLists.txt +++ /dev/null @@ -1,25 +0,0 @@ - -include(common RESULT_VARIABLE RES) -if(NOT RES) - message(FATAL_ERROR "common.cmake not found. 
Should be in {repo_root}/cmake directory") -endif() - -nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") - -if(NBL_EMBED_BUILTIN_RESOURCES) - set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData) - set(RESOURCE_DIR "app_resources") - - get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) - get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE) - get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE) - - file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*") - foreach(RES_FILE ${BUILTIN_RESOURCE_FILES}) - LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}") - endforeach() - - ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}") - - LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_}) -endif() \ No newline at end of file diff --git a/73_Arithmetic2UnitTest/app_resources/common.hlsl b/73_Arithmetic2UnitTest/app_resources/common.hlsl deleted file mode 100644 index 10892a2b9..000000000 --- a/73_Arithmetic2UnitTest/app_resources/common.hlsl +++ /dev/null @@ -1,96 +0,0 @@ -#include "nbl/builtin/hlsl/cpp_compat.hlsl" -#include "nbl/builtin/hlsl/functional.hlsl" - -template -struct Output -{ - NBL_CONSTEXPR_STATIC_INLINE uint32_t ScanElementCount = kScanElementCount; - - uint32_t subgroupSize; - uint32_t data[ScanElementCount]; -}; - -// Thanks to our unified HLSL/C++ STD lib we're able to remove a whole load of code -template -struct bit_and : nbl::hlsl::bit_and -{ - using base_t = nbl::hlsl::bit_and; - - NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 0; -#ifndef __HLSL_VERSION - static inline constexpr const char* name = "bit_and"; -#endif -}; 
-template -struct bit_or : nbl::hlsl::bit_or -{ - using base_t = nbl::hlsl::bit_or; - - NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 1; -#ifndef __HLSL_VERSION - static inline constexpr const char* name = "bit_xor"; -#endif -}; -template -struct bit_xor : nbl::hlsl::bit_xor -{ - using base_t = nbl::hlsl::bit_xor; - - NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 2; -#ifndef __HLSL_VERSION - static inline constexpr const char* name = "bit_or"; -#endif -}; -template -struct plus : nbl::hlsl::plus -{ - using base_t = nbl::hlsl::plus; - - NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 3; -#ifndef __HLSL_VERSION - static inline constexpr const char* name = "plus"; -#endif -}; -template -struct multiplies : nbl::hlsl::multiplies -{ - using base_t = nbl::hlsl::multiplies; - - NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 4; -#ifndef __HLSL_VERSION - static inline constexpr const char* name = "multiplies"; -#endif -}; -template -struct minimum : nbl::hlsl::minimum -{ - using base_t = nbl::hlsl::minimum; - - NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 5; -#ifndef __HLSL_VERSION - static inline constexpr const char* name = "minimum"; -#endif -}; -template -struct maximum : nbl::hlsl::maximum -{ - using base_t = nbl::hlsl::maximum; - - NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 6; -#ifndef __HLSL_VERSION - static inline constexpr const char* name = "maximum"; -#endif -}; - -template -struct ballot : nbl::hlsl::plus -{ - using base_t = nbl::hlsl::plus; - - NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 7; -#ifndef __HLSL_VERSION - static inline constexpr const char* name = "bitcount"; -#endif -}; - -#include "nbl/builtin/hlsl/subgroup/basic.hlsl" \ No newline at end of file diff --git a/73_Arithmetic2UnitTest/config.json.template b/73_Arithmetic2UnitTest/config.json.template deleted file mode 100644 index f961745c1..000000000 --- a/73_Arithmetic2UnitTest/config.json.template +++ /dev/null @@ -1,28 +0,0 @@ -{ - 
"enableParallelBuild": true, - "threadsPerBuildProcess" : 2, - "isExecuted": false, - "scriptPath": "", - "cmake": { - "configurations": [ "Release", "Debug", "RelWithDebInfo" ], - "buildModes": [], - "requiredOptions": [] - }, - "profiles": [ - { - "backend": "vulkan", - "platform": "windows", - "buildModes": [], - "runConfiguration": "Release", - "gpuArchitectures": [] - } - ], - "dependencies": [], - "data": [ - { - "dependencies": [], - "command": [""], - "outputs": [] - } - ] -} \ No newline at end of file diff --git a/73_Arithmetic2UnitTest/pipeline.groovy b/73_Arithmetic2UnitTest/pipeline.groovy deleted file mode 100644 index 7ea9947e0..000000000 --- a/73_Arithmetic2UnitTest/pipeline.groovy +++ /dev/null @@ -1,50 +0,0 @@ -import org.DevshGraphicsProgramming.Agent -import org.DevshGraphicsProgramming.BuilderInfo -import org.DevshGraphicsProgramming.IBuilder - -class CArithemticUnitTestBuilder extends IBuilder -{ - public CArithemticUnitTestBuilder(Agent _agent, _info) - { - super(_agent, _info) - } - - @Override - public boolean prepare(Map axisMapping) - { - return true - } - - @Override - public boolean build(Map axisMapping) - { - IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION") - IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE") - - def nameOfBuildDirectory = getNameOfBuildDirectory(buildType) - def nameOfConfig = getNameOfConfig(config) - - agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v") - - return true - } - - @Override - public boolean test(Map axisMapping) - { - return true - } - - @Override - public boolean install(Map axisMapping) - { - return true - } -} - -def create(Agent _agent, _info) -{ - return new CArithemticUnitTestBuilder(_agent, _info) -} - -return this \ No newline at end of file From 030d6227ff20939ea838f51ce82969f96cbd12ca Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 
7 May 2025 16:54:59 +0700 Subject: [PATCH 162/296] minor fixes --- 23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl | 4 ++-- .../app_resources/benchmarkWorkgroup.comp.hlsl | 4 ++-- CMakeLists.txt | 3 +-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl index f9453a165..7f1b5dcbe 100644 --- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl @@ -8,9 +8,9 @@ struct DataProxy using dtype_t = vector; static_assert(nbl::hlsl::is_same_v); - dtype_t get(const uint32_t ix) + void get(const uint32_t ix, NBL_REF_ARG(dtype_t) value) { - return inputValue[ix]; + value = inputValue[ix]; } void set(const uint32_t ix, const dtype_t value) { diff --git a/74_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/74_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl index ed56dd766..aa0717112 100644 --- a/74_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl +++ b/74_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl @@ -11,9 +11,9 @@ struct DataProxy using dtype_t = vector; static_assert(nbl::hlsl::is_same_v); - dtype_t get(const uint32_t ix) + void get(const uint32_t ix, NBL_REF_ARG(dtype_t) value) { - return inputValue[ix]; + value = inputValue[ix]; } void set(const uint32_t ix, const dtype_t value) { diff --git a/CMakeLists.txt b/CMakeLists.txt index dc6b74de1..ed3992203 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -58,7 +58,7 @@ if(NBL_BUILD_EXAMPLES) add_subdirectory(20_AllocatorTest EXCLUDE_FROM_ALL) add_subdirectory(21_LRUCacheUnitTest EXCLUDE_FROM_ALL) add_subdirectory(22_CppCompat EXCLUDE_FROM_ALL) - add_subdirectory(23_ArithmeticUnitTest EXCLUDE_FROM_ALL) + add_subdirectory(23_Arithmetic2UnitTest EXCLUDE_FROM_ALL) add_subdirectory(24_ColorSpaceTest EXCLUDE_FROM_ALL) add_subdirectory(25_FilterTest 
EXCLUDE_FROM_ALL) add_subdirectory(26_Blur EXCLUDE_FROM_ALL) @@ -91,7 +91,6 @@ if(NBL_BUILD_EXAMPLES) add_subdirectory(70_FLIPFluids EXCLUDE_FROM_ALL) add_subdirectory(71_RayTracingPipeline EXCLUDE_FROM_ALL) - add_subdirectory(73_Arithmetic2UnitTest EXCLUDE_FROM_ALL) add_subdirectory(74_Arithmetic2Bench EXCLUDE_FROM_ALL) NBL_HOOK_COMMON_API("${NBL_COMMON_API_TARGETS}") From ca71a39db753938a7ae90a8445cb4186efe7fa56 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 8 May 2025 14:14:59 +0700 Subject: [PATCH 163/296] minor fixes to workgroup benchmark --- .../app_resources/benchmarkWorkgroup.comp.hlsl | 6 ++++-- 74_Arithmetic2Bench/main.cpp | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/74_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/74_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl index aa0717112..ec3f9b295 100644 --- a/74_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl +++ b/74_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl @@ -11,13 +11,15 @@ struct DataProxy using dtype_t = vector; static_assert(nbl::hlsl::is_same_v); + // we don't want to write/read storage multiple times in loop; doesn't seem optimized out in generated spirv void get(const uint32_t ix, NBL_REF_ARG(dtype_t) value) { - value = inputValue[ix]; + // value = inputValue[ix]; + value = globalIndex(); } void set(const uint32_t ix, const dtype_t value) { - output[Binop::BindingIndex].template Store(sizeof(uint32_t) + sizeof(type_t) * ix, value); + // output[Binop::BindingIndex].template Store(sizeof(uint32_t) + sizeof(type_t) * ix, value); } void workgroupExecutionAndMemoryBarrier() diff --git a/74_Arithmetic2Bench/main.cpp b/74_Arithmetic2Bench/main.cpp index abbae38fb..b6bffb2b4 100644 --- a/74_Arithmetic2Bench/main.cpp +++ b/74_Arithmetic2Bench/main.cpp @@ -395,7 +395,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub cmdbuf->bindDescriptorSets(EPBP_COMPUTE, 
benchSets[0].pipeline->getLayout(), 0u, 1u, &benchDs.get()); for (uint32_t i = 0; i < benchSets.size(); i++) - runBenchmark(cmdbuf, benchSets[0], elementCount, SubgroupSizeLog2); + runBenchmark(cmdbuf, benchSets[i], elementCount, SubgroupSizeLog2); // blit @@ -741,7 +741,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub constexpr static inline bool DoWorkgroupBenchmarks = true; uint32_t ItemsPerInvocation = 4u; constexpr static inline uint32_t NumLoops = 1000u; - constexpr static inline std::array workgroupSizes = { 256, 512, 1024 }; + constexpr static inline std::array workgroupSizes = { 128, 512, 1024 }; template using ArithmeticOp = emulatedReduction; // change this to test other arithmetic ops From 6018e9a7e0bd5cb4eeaa47571610ff7dbb0ce054 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 8 May 2025 17:03:09 +0700 Subject: [PATCH 164/296] more minor fixes --- .../benchmarkWorkgroup.comp.hlsl | 2 +- 74_Arithmetic2Bench/main.cpp | 37 +++---------------- 2 files changed, 7 insertions(+), 32 deletions(-) diff --git a/74_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/74_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl index ec3f9b295..ac6ea7fd8 100644 --- a/74_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl +++ b/74_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl @@ -15,7 +15,7 @@ struct DataProxy void get(const uint32_t ix, NBL_REF_ARG(dtype_t) value) { // value = inputValue[ix]; - value = globalIndex(); + value = nbl::hlsl::promote(globalIndex()); } void set(const uint32_t ix, const dtype_t value) { diff --git a/74_Arithmetic2Bench/main.cpp b/74_Arithmetic2Bench/main.cpp index b6bffb2b4..1d8e41a24 100644 --- a/74_Arithmetic2Bench/main.cpp +++ b/74_Arithmetic2Bench/main.cpp @@ -192,29 +192,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub } // set and transient pool - auto descPool = 
m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE,{&dsLayout.get(),1}); - testDs = descPool->createDescriptorSet(smart_refctd_ptr(dsLayout)); - { - IGPUDescriptorSet::SDescriptorInfo infos[1+OutputBufferCount]; - infos[0].desc = gpuinputDataBuffer; - infos[0].info.buffer = { 0u,gpuinputDataBuffer->getSize() }; - for (uint32_t i = 1u; i <= OutputBufferCount; i++) - { - auto buff = outputBuffers[i - 1]; - infos[i].info.buffer = { 0u,buff->getSize() }; - infos[i].desc = std::move(buff); // save an atomic in the refcount - } - - IGPUDescriptorSet::SWriteDescriptorSet writes[2]; - for (uint32_t i=0u; i<2; i++) - writes[i] = {testDs.get(),i,0u,1u,infos+i}; - writes[1].count = OutputBufferCount; - - m_device->updateDescriptorSets(2, writes, 0u, nullptr); - } - testPplnLayout = m_device->createPipelineLayout({}, std::move(dsLayout)); - - smart_refctd_ptr benchLayout; { IGPUDescriptorSetLayout::SBinding binding[3]; @@ -727,24 +704,22 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub smart_refctd_ptr dummyImg; - std::array benchSets; - smart_refctd_ptr benchPool; - smart_refctd_ptr benchDs; - - smart_refctd_ptr testDs; - smart_refctd_ptr testPplnLayout; - constexpr static inline uint32_t MaxNumSubmits = 30; uint32_t numSubmits = 0; /* PARAMETERS TO CHANGE FOR DIFFERENT BENCHMARKS */ + constexpr static inline bool DoWorkgroupBenchmarks = true; uint32_t ItemsPerInvocation = 4u; constexpr static inline uint32_t NumLoops = 1000u; - constexpr static inline std::array workgroupSizes = { 128, 512, 1024 }; + constexpr static inline uint32_t NumBenchmarks = 6u; + constexpr static inline std::array workgroupSizes = { 32, 64, 128, 256, 512, 1024 }; template using ArithmeticOp = emulatedReduction; // change this to test other arithmetic ops + std::array benchSets; + smart_refctd_ptr benchPool; + smart_refctd_ptr benchDs; uint32_t* inputData = nullptr; constexpr static inline uint32_t OutputBufferCount = 8u; From 
3a9758c176c55652831ced820904282a76be03db Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 9 May 2025 14:03:07 +0700 Subject: [PATCH 165/296] some fixes to using config vars --- 23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl | 2 +- 23_Arithmetic2UnitTest/main.cpp | 2 +- 74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl | 2 +- 74_Arithmetic2Bench/main.cpp | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl b/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl index 7e8512e72..702fcbe25 100644 --- a/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl @@ -43,7 +43,7 @@ bool canStore(); #endif // final (level 1/2) scan needs to fit in one subgroup exactly -groupshared uint32_t scratch[config_t::SubgroupsPerVirtualWorkgroupLog2*config_t::ItemsPerInvocation_1]; +groupshared uint32_t scratch[config_t::SubgroupsPerVirtualWorkgroup*config_t::ItemsPerInvocation_1]; struct ScratchProxy { diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp index 31eb4ab8f..d5a251f39 100644 --- a/23_Arithmetic2UnitTest/main.cpp +++ b/23_Arithmetic2UnitTest/main.cpp @@ -195,7 +195,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, itemsPerInvocation) && passed; logTestOutcome(passed, workgroupSize); - const uint32_t itemsPerWG = workgroupSize <= 4 * subgroupSize ? workgroupSize : itemsPerInvocation * max(workgroupSize >> subgroupSizeLog2, subgroupSize) << subgroupSizeLog2; // TODO use Config somehow + const uint32_t itemsPerWG = workgroupSize <= subgroupSize ? 
workgroupSize * itemsPerInvocation : itemsPerInvocation * max(workgroupSize >> subgroupSizeLog2, subgroupSize) << subgroupSizeLog2; // TODO use Config somehow m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG); passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG, itemsPerInvocation) && passed; logTestOutcome(passed, itemsPerWG); diff --git a/74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl b/74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl index 7e8512e72..702fcbe25 100644 --- a/74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl +++ b/74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl @@ -43,7 +43,7 @@ bool canStore(); #endif // final (level 1/2) scan needs to fit in one subgroup exactly -groupshared uint32_t scratch[config_t::SubgroupsPerVirtualWorkgroupLog2*config_t::ItemsPerInvocation_1]; +groupshared uint32_t scratch[config_t::SubgroupsPerVirtualWorkgroup*config_t::ItemsPerInvocation_1]; struct ScratchProxy { diff --git a/74_Arithmetic2Bench/main.cpp b/74_Arithmetic2Bench/main.cpp index 1d8e41a24..bf20d5faa 100644 --- a/74_Arithmetic2Bench/main.cpp +++ b/74_Arithmetic2Bench/main.cpp @@ -594,7 +594,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub options.preprocessorOptions.includeFinder = includeFinder; const uint32_t subgroupSize = 0x1u << subgroupSizeLog2; - const uint32_t itemsPerWG = workgroupSize <= 4 * subgroupSize ? workgroupSize : itemsPerInvoc * max(workgroupSize >> subgroupSizeLog2, subgroupSize) << subgroupSizeLog2; // TODO use Config somehow + const uint32_t itemsPerWG = workgroupSize <= subgroupSize ? 
workgroupSize * itemsPerInvoc : itemsPerInvoc * max(workgroupSize >> subgroupSizeLog2, subgroupSize) << subgroupSizeLog2; // TODO use Config somehow smart_refctd_ptr overriddenUnspecialized; if constexpr (WorkgroupBench) { From e496e987296338a3ba7492b18f0249c7cca56d68 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 12 May 2025 11:00:12 +0700 Subject: [PATCH 166/296] fixes to test mem errors --- 23_Arithmetic2UnitTest/main.cpp | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp index d5a251f39..49cba28d1 100644 --- a/23_Arithmetic2UnitTest/main.cpp +++ b/23_Arithmetic2UnitTest/main.cpp @@ -169,8 +169,6 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu } const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations; - const std::array WorkgroupSizes = { 128, 256, 512, 1024 }; - const std::array ItemsPerInvocations = { 1, 2, 4 }; const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize; const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize; for (auto subgroupSize=MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u) @@ -401,13 +399,16 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu const auto testData = reinterpret_cast(dataFromBuffer + 1); // TODO: parallel for (the temporary values need to be threadlocal or what?) 
// now check if the data obtained has valid values - type_t* tmp = new type_t[itemsPerWG]; + type_t* tmp; + if constexpr (WorkgroupTest) + tmp = new type_t[itemsPerWG]; + else + tmp = new type_t[itemsPerWG * itemsPerInvoc]; for (uint32_t workgroupID = 0u; success && workgroupID < workgroupCount; workgroupID++) { - const auto workgroupOffset = workgroupID * itemsPerWG; - if constexpr (WorkgroupTest) { + const auto workgroupOffset = workgroupID * itemsPerWG; Arithmetic::impl(tmp, inputData + workgroupOffset, itemsPerWG); for (uint32_t localInvocationIndex = 0u; localInvocationIndex < itemsPerWG; localInvocationIndex++) @@ -429,6 +430,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu } else { + const auto workgroupOffset = workgroupID * itemsPerWG * itemsPerInvoc; for (uint32_t pseudoSubgroupID = 0u; pseudoSubgroupID < itemsPerWG; pseudoSubgroupID += subgroupSize) Arithmetic::impl(tmp + pseudoSubgroupID * itemsPerInvoc, inputData + workgroupOffset + pseudoSubgroupID * itemsPerInvoc, subgroupSize * itemsPerInvoc); @@ -475,6 +477,9 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu smart_refctd_ptr resultsBuffer; uint32_t totalFailCount = 0; + + constexpr static inline std::array WorkgroupSizes = { 32, 256, 512, 1024 }; + constexpr static inline std::array ItemsPerInvocations = { 1, 2, 4 }; }; NBL_MAIN_FUNC(Workgroup2ScanTestApp) \ No newline at end of file From 20011f5fdd3e8454bb830ded6f4221ec75036809 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 12 May 2025 16:17:01 +0700 Subject: [PATCH 167/296] config struct changes --- 74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl b/74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl index 702fcbe25..026687cfa 100644 --- a/74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl +++ 
b/74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl @@ -43,7 +43,7 @@ bool canStore(); #endif // final (level 1/2) scan needs to fit in one subgroup exactly -groupshared uint32_t scratch[config_t::SubgroupsPerVirtualWorkgroup*config_t::ItemsPerInvocation_1]; +groupshared uint32_t scratch[config_t::SharedMemSize]; struct ScratchProxy { From 55a9e135ed1cf7e66a4d3b5bc0a161408949008f Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 14 May 2025 13:36:08 +0700 Subject: [PATCH 168/296] block to test tlas/blas asset converter --- 67_RayQueryGeometry/main.cpp | 237 +++++++++++++++++++++++++++++++++++ 1 file changed, 237 insertions(+) diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp index aff687742..a6f6dfcc0 100644 --- a/67_RayQueryGeometry/main.cpp +++ b/67_RayQueryGeometry/main.cpp @@ -4,6 +4,8 @@ #include "common.hpp" +#define TEST_ASSET_CONV_AS + class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication { using device_base_t = examples::SimpleWindowedApplication; @@ -126,6 +128,10 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu auto cQueue = getComputeQueue(); +#ifdef TEST_ASSET_CONV_AS + if (!createAccelerationStructuresFromGeometry(cQueue, geometryCreator)) + return logFail("Could not create acceleration structures from provided geometry creator"); +#else // create geometry objects if (!createGeometries(gQueue, geometryCreator)) return logFail("Could not create geometries from geometry creator"); @@ -147,6 +153,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu if (!createAccelerationStructures(cQueue)) #endif return logFail("Could not create acceleration structures"); +#endif // TEST_ASSET_CONV_AS // create pipelines { @@ -590,6 +597,235 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu } } +#ifdef TEST_ASSET_CONV_AS + bool 
createAccelerationStructuresFromGeometry(video::CThreadSafeQueueAdapter* queue, const IGeometryCreator* gc) + { + // get geometries in ICPUBuffers + std::array objectsCpu; + objectsCpu[OT_CUBE] = ReferenceObjectCpu{ .meta = {.type = OT_CUBE, .name = "Cube Mesh" }, .shadersType = GP_BASIC, .data = gc->createCubeMesh(nbl::core::vector3df(1.f, 1.f, 1.f)) }; + objectsCpu[OT_SPHERE] = ReferenceObjectCpu{ .meta = {.type = OT_SPHERE, .name = "Sphere Mesh" }, .shadersType = GP_BASIC, .data = gc->createSphereMesh(2, 16, 16) }; + objectsCpu[OT_CYLINDER] = ReferenceObjectCpu{ .meta = {.type = OT_CYLINDER, .name = "Cylinder Mesh" }, .shadersType = GP_BASIC, .data = gc->createCylinderMesh(2, 2, 20) }; + objectsCpu[OT_RECTANGLE] = ReferenceObjectCpu{ .meta = {.type = OT_RECTANGLE, .name = "Rectangle Mesh" }, .shadersType = GP_BASIC, .data = gc->createRectangleMesh(nbl::core::vector2df_SIMD(1.5, 3)) }; + objectsCpu[OT_DISK] = ReferenceObjectCpu{ .meta = {.type = OT_DISK, .name = "Disk Mesh" }, .shadersType = GP_BASIC, .data = gc->createDiskMesh(2, 30) }; + objectsCpu[OT_ARROW] = ReferenceObjectCpu{ .meta = {.type = OT_ARROW, .name = "Arrow Mesh" }, .shadersType = GP_BASIC, .data = gc->createArrowMesh() }; + objectsCpu[OT_CONE] = ReferenceObjectCpu{ .meta = {.type = OT_CONE, .name = "Cone Mesh" }, .shadersType = GP_CONE, .data = gc->createConeMesh(2, 3, 10) }; + objectsCpu[OT_ICOSPHERE] = ReferenceObjectCpu{ .meta = {.type = OT_ICOSPHERE, .name = "Icosphere Mesh" }, .shadersType = GP_ICO, .data = gc->createIcoSphere(1, 3, true) }; + + auto geomInfoBuffer = ICPUBuffer::create({ OT_COUNT * sizeof(SGeomInfo) }); + + SGeomInfo* geomInfos = reinterpret_cast(geomInfoBuffer->getPointer()); + const uint32_t byteOffsets[OT_COUNT] = { 18, 24, 24, 20, 20, 24, 16, 12 }; // based on normals data position + const uint32_t smoothNormals[OT_COUNT] = { 0, 1, 1, 0, 0, 1, 1, 1 }; + + struct ScratchVIBindings + { + nbl::asset::SBufferBinding vertex, index; + }; + std::array scratchBuffers; + + for 
(uint32_t i = 0; i < scratchBuffers.size(); i++) + { + const auto& geom = objectsCpu[i]; + auto& scratchObj = scratchBuffers[i]; + const bool useIndex = geom.data.indexType != EIT_UNKNOWN; + + auto vBuffer = smart_refctd_ptr(geom.data.bindings[0].buffer); // no offset + auto vUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | + IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + + auto iBuffer = smart_refctd_ptr(geom.data.indexBuffer.buffer); // no offset + auto iUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | + IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + + vBuffer->addUsageFlags(vUsage); + vBuffer->setContentHash(vBuffer->computeContentHash()); + scratchObj.vertex = { .offset = 0, .buffer = vBuffer }; + + if (useIndex) + if (iBuffer) + { + iBuffer->addUsageFlags(iUsage); + iBuffer->setContentHash(iBuffer->computeContentHash()); + } + scratchObj.index = { .offset = 0, .buffer = iBuffer }; + } + + // get ICPUBuffers into ICPUBottomLevelAccelerationStructures + std::array, OT_COUNT> cpuBlas; + for (uint32_t i = 0; i < cpuBlas.size(); i++) + { + auto triangles = make_refctd_dynamic_array>>(1u); + auto primitiveCounts = make_refctd_dynamic_array>(1u); + + auto& tri = triangles->front(); + auto& primCount = primitiveCounts->front(); + const auto& geom = objectsCpu[i]; + const auto& scratchObj = scratchBuffers[i]; + + const bool useIndex = geom.data.indexType != EIT_UNKNOWN; + const uint32_t vertexStride = geom.data.inputParams.bindings[0].stride; + const uint32_t numVertices = scratchObj.vertex.buffer->getSize() / vertexStride; + + if (useIndex) + primCount = geom.data.indexCount / 3; + else + primCount = numVertices / 3; + + geomInfos[i].indexType = geom.data.indexType; + 
geomInfos[i].vertexStride = vertexStride; + geomInfos[i].smoothNormals = smoothNormals[i]; + + tri.vertexData[0] = scratchObj.vertex; + tri.indexData = useIndex ? scratchObj.index : scratchObj.vertex; + tri.maxVertex = numVertices - 1; + tri.vertexStride = vertexStride; + tri.vertexFormat = EF_R32G32B32_SFLOAT; + tri.indexType = geom.data.indexType; + tri.geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; + + auto& blas = cpuBlas[i]; + blas->setGeometries(std::move(triangles), std::move(primitiveCounts)); + + auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT; + if (m_physicalDevice->getProperties().limits.rayTracingPositionFetch) + blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_DATA_ACCESS; + + blas->setBuildFlags(blasFlags); + blas->setContentHash(blas->computeContentHash()); + } + + // TODO: when does compact blas happen? 
+ + // get ICPUBottomLevelAccelerationStructure into ICPUTopLevelAccelerationStructure + auto geomInstances = make_refctd_dynamic_array>(OT_COUNT); + { + uint32_t i = 0; + for (auto instance = geomInstances->begin(); instance != geomInstances->end(); instance++, i++) + { + ICPUTopLevelAccelerationStructure::StaticInstance inst; + inst.base.blas = cpuBlas[i]; + inst.base.flags = static_cast(IGPUTopLevelAccelerationStructure::INSTANCE_FLAGS::TRIANGLE_FACING_CULL_DISABLE_BIT); + inst.base.instanceCustomIndex = i; + inst.base.instanceShaderBindingTableRecordOffset = 0; + inst.base.mask = 0xFF; + + core::matrix3x4SIMD transform; + transform.setTranslation(nbl::core::vectorSIMDf(5.f * i, 0, 0, 0)); + inst.transform = transform; + + instance->instance = inst; + } + } + + smart_refctd_ptr cpuTlas; + cpuTlas->setInstances(std::move(geomInstances)); + cpuTlas->setBuildFlags(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT); + + // convert with asset converter + auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + if (!pool) + return logFail("Couldn't create Command Pool for geometry creation!"); + auto cmdbuf = getSingleUseCommandBufferAndBegin(pool); + cmdbuf->beginDebugMarker("Build geometry vertex and index buffers"); + + smart_refctd_ptr converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} }); + CAssetConverter::SInputs inputs = {}; + inputs.logger = m_logger.get(); + + std::array tmpTlas; + std::array tmpBuffers; + { + tmpTlas[0] = cpuTlas.get(); + for (uint32_t i = 0; i < objectsCpu.size(); i++) + { + tmpBuffers[2 * i + 0] = scratchBuffers[i].vertex.buffer.get(); + tmpBuffers[2 * i + 1] = scratchBuffers[i].index.buffer.get(); + } + + std::get>(inputs.assets) = tmpTlas; + std::get>(inputs.assets) = tmpBuffers; + } + + auto reservation = converter->reserve(inputs); + { + auto prepass = [&](const auto & references) -> bool + { + auto objects = 
reservation.getGPUObjects(); + uint32_t counter = {}; + for (auto& object : objects) + { + auto gpu = object.value; + auto* reference = references[counter]; + + if (reference) + { + if (!gpu) + { + m_logger->log("Failed to convert a CPU object to GPU!", ILogger::ELL_ERROR); + return false; + } + } + counter++; + } + return true; + }; + + prepass.template operator() < ICPUTopLevelAccelerationStructure > (tmpTlas); + prepass.template operator() < ICPUBuffer > (tmpBuffers); + } + + auto semaphore = m_device->createSemaphore(0u); + + std::array cmdbufs = {}; + cmdbufs.front().cmdbuf = cmdbuf.get(); + + SIntendedSubmitInfo transfer = {}; + transfer.queue = queue; + transfer.scratchCommandBuffers = cmdbufs; + transfer.scratchSemaphore = { + .semaphore = semaphore.get(), + .value = 0u, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS // TODO mask for AS? + }; + // convert + { + CAssetConverter::SConvertParams params = {}; + params.utilities = m_utils.get(); + params.transfer = &transfer; + + auto future = reservation.convert(params); + if (future.copy() != IQueue::RESULT::SUCCESS) + { + m_logger->log("Failed to await submission feature!", ILogger::ELL_ERROR); + return false; + } + + // assign gpu objects to output + auto&& tlases = reservation.getGPUObjects(); + gpuTlas = tlases[0].value; + auto&& buffers = reservation.getGPUObjects(); + for (uint32_t i = 0; i < objectsCpu.size(); i++) + { + auto vBuffer = buffers[2 * i + 0].value; + auto iBuffer = buffers[2 * i + 1].value; + const auto& geom = objectsCpu[i]; + const bool useIndex = geom.data.indexType != EIT_UNKNOWN; + + geomInfos[i].vertexBufferAddress = vBuffer->getDeviceAddress() + byteOffsets[i]; + geomInfos[i].indexBufferAddress = useIndex ? 
iBuffer->getDeviceAddress() : geomInfos[i].vertexBufferAddress; + } + } + + { + IGPUBuffer::SCreationParams params; + params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + params.size = OT_COUNT * sizeof(SGeomInfo); + m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), geomInfos).move_into(geometryInfoBuffer); + } + + return true; + } +#else bool createGeometries(video::CThreadSafeQueueAdapter* queue, const IGeometryCreator* gc) { auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); @@ -1057,6 +1293,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu return true; } +#endif // TEST_ASSET_CONV_AS smart_refctd_ptr m_window; From 4a951b307b09ecf4a054f7ac27d4dac01f5e8fb9 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 14 May 2025 15:27:54 +0700 Subject: [PATCH 169/296] more test case coverage --- 23_Arithmetic2UnitTest/main.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp index 49cba28d1..a3c274160 100644 --- a/23_Arithmetic2UnitTest/main.cpp +++ b/23_Arithmetic2UnitTest/main.cpp @@ -174,9 +174,8 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu for (auto subgroupSize=MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u) { const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize); - for (uint32_t i = 0; i < WorkgroupSizes.size(); i++) + for (uint32_t workgroupSize = subgroupSize; workgroupSize < MaxWorkgroupSize; workgroupSize *= 2) { - const uint32_t workgroupSize = WorkgroupSizes[i]; // make sure renderdoc captures everything for debugging m_api->startCapture(); m_logger->log("Testing Workgroup Size %u with Subgroup Size %u", ILogger::ELL_INFO, 
workgroupSize, subgroupSize); @@ -478,8 +477,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu uint32_t totalFailCount = 0; - constexpr static inline std::array WorkgroupSizes = { 32, 256, 512, 1024 }; - constexpr static inline std::array ItemsPerInvocations = { 1, 2, 4 }; + constexpr static inline std::array ItemsPerInvocations = { 1, 2, 3, 4 }; }; NBL_MAIN_FUNC(Workgroup2ScanTestApp) \ No newline at end of file From 16b7349f55344cafc8ec9ab28ce72e129fe938bd Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 14 May 2025 16:41:59 +0700 Subject: [PATCH 170/296] some fixes + log debug --- 67_RayQueryGeometry/main.cpp | 52 +++++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 19 deletions(-) diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp index a6f6dfcc0..cec4e5270 100644 --- a/67_RayQueryGeometry/main.cpp +++ b/67_RayQueryGeometry/main.cpp @@ -617,29 +617,27 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu const uint32_t byteOffsets[OT_COUNT] = { 18, 24, 24, 20, 20, 24, 16, 12 }; // based on normals data position const uint32_t smoothNormals[OT_COUNT] = { 0, 1, 1, 0, 0, 1, 1, 1 }; - struct ScratchVIBindings + struct CPUBufferBindings { nbl::asset::SBufferBinding vertex, index; }; - std::array scratchBuffers; + std::array cpuBuffers; - for (uint32_t i = 0; i < scratchBuffers.size(); i++) + for (uint32_t i = 0; i < cpuBuffers.size(); i++) { const auto& geom = objectsCpu[i]; - auto& scratchObj = scratchBuffers[i]; + auto& cpuObj = cpuBuffers[i]; const bool useIndex = geom.data.indexType != EIT_UNKNOWN; auto vBuffer = smart_refctd_ptr(geom.data.bindings[0].buffer); // no offset - auto vUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | - IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + auto vUsage = 
bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; auto iBuffer = smart_refctd_ptr(geom.data.indexBuffer.buffer); // no offset - auto iUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | - IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + auto iUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; vBuffer->addUsageFlags(vUsage); vBuffer->setContentHash(vBuffer->computeContentHash()); - scratchObj.vertex = { .offset = 0, .buffer = vBuffer }; + cpuObj.vertex = { .offset = 0, .buffer = vBuffer }; if (useIndex) if (iBuffer) @@ -647,7 +645,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu iBuffer->addUsageFlags(iUsage); iBuffer->setContentHash(iBuffer->computeContentHash()); } - scratchObj.index = { .offset = 0, .buffer = iBuffer }; + cpuObj.index = { .offset = 0, .buffer = iBuffer }; } // get ICPUBuffers into ICPUBottomLevelAccelerationStructures @@ -660,11 +658,11 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu auto& tri = triangles->front(); auto& primCount = primitiveCounts->front(); const auto& geom = objectsCpu[i]; - const auto& scratchObj = scratchBuffers[i]; + const auto& cpuBuf = cpuBuffers[i]; const bool useIndex = geom.data.indexType != EIT_UNKNOWN; const uint32_t vertexStride = geom.data.inputParams.bindings[0].stride; - const uint32_t numVertices = scratchObj.vertex.buffer->getSize() / vertexStride; + const uint32_t numVertices = cpuBuf.vertex.buffer->getSize() / vertexStride; if (useIndex) primCount = geom.data.indexCount / 3; @@ -675,8 +673,8 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu geomInfos[i].vertexStride = vertexStride; geomInfos[i].smoothNormals = smoothNormals[i]; - tri.vertexData[0] = scratchObj.vertex; - 
tri.indexData = useIndex ? scratchObj.index : scratchObj.vertex; + tri.vertexData[0] = cpuBuf.vertex; + tri.indexData = useIndex ? cpuBuf.index : cpuBuf.vertex; tri.maxVertex = numVertices - 1; tri.vertexStride = vertexStride; tri.vertexFormat = EF_R32G32B32_SFLOAT; @@ -684,6 +682,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu tri.geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; auto& blas = cpuBlas[i]; + blas = make_smart_refctd_ptr(); blas->setGeometries(std::move(triangles), std::move(primitiveCounts)); auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT; @@ -717,7 +716,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu } } - smart_refctd_ptr cpuTlas; + auto cpuTlas = make_smart_refctd_ptr(); cpuTlas->setInstances(std::move(geomInstances)); cpuTlas->setBuildFlags(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT); @@ -726,7 +725,6 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu if (!pool) return logFail("Couldn't create Command Pool for geometry creation!"); auto cmdbuf = getSingleUseCommandBufferAndBegin(pool); - cmdbuf->beginDebugMarker("Build geometry vertex and index buffers"); smart_refctd_ptr converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} }); CAssetConverter::SInputs inputs = {}; @@ -738,8 +736,8 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu tmpTlas[0] = cpuTlas.get(); for (uint32_t i = 0; i < objectsCpu.size(); i++) { - tmpBuffers[2 * i + 0] = scratchBuffers[i].vertex.buffer.get(); - tmpBuffers[2 * i + 1] = scratchBuffers[i].index.buffer.get(); + tmpBuffers[2 * i + 0] = cpuBuffers[i].vertex.buffer.get(); + tmpBuffers[2 * i + 1] = cpuBuffers[i].index.buffer.get(); } std::get>(inputs.assets) = tmpTlas; @@ 
-774,6 +772,13 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu prepass.template operator() < ICPUBuffer > (tmpBuffers); } + // TODO wait for convert + m_logger->log("willDeviceASBuild: %d, willHostASBuild: %d\nminASBuildScratchSize: %d, maxASBuildScratchSize: %d\nminCompactedASAllocatorSpace: %d, requiredQueueFlags: %d\n", ILogger::ELL_INFO, + reservation.willDeviceASBuild(), reservation.willHostASBuild(), + reservation.getMinASBuildScratchSize(false), reservation.getMaxASBuildScratchSize(false), + reservation.getMinCompactedASAllocatorSpace(), reservation.getRequiredQueueFlags(false)); + return false; + auto semaphore = m_device->createSemaphore(0u); std::array cmdbufs = {}; @@ -785,13 +790,22 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu transfer.scratchSemaphore = { .semaphore = semaphore.get(), .value = 0u, - .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS // TODO mask for AS? + .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS + }; + SIntendedSubmitInfo compute = {}; + compute.queue = queue; + compute.scratchCommandBuffers = cmdbufs; + compute.scratchSemaphore = { + .semaphore = semaphore.get(), + .value = 0u, + .stageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT // TODO correct mask? 
}; // convert { CAssetConverter::SConvertParams params = {}; params.utilities = m_utils.get(); params.transfer = &transfer; + params.compute = &compute; auto future = reservation.convert(params); if (future.copy() != IQueue::RESULT::SUCCESS) From 825c73d5d8307efef2488f0b6ce82b69c32855ea Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 14 May 2025 11:56:11 +0200 Subject: [PATCH 171/296] update media submodule --- media | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/media b/media index a98646358..68dbe85b9 160000 --- a/media +++ b/media @@ -1 +1 @@ -Subproject commit a9864635879e5a616ac400eecd8b6451b498fbf1 +Subproject commit 68dbe85b9849c9b094760428a3639f5c8917d85e From 052148f0d1611df0ae2e9cb4d9ee4edc08e3f351 Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 14 May 2025 16:55:37 +0200 Subject: [PATCH 172/296] disable more old code, use two queues for BLAS & TLAS convert and multiple command buffers add code to test ReBAR uploads --- 67_RayQueryGeometry/main.cpp | 140 ++++++++++++++++++++++++++++------- 1 file changed, 115 insertions(+), 25 deletions(-) diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp index cec4e5270..0d7494e9c 100644 --- a/67_RayQueryGeometry/main.cpp +++ b/67_RayQueryGeometry/main.cpp @@ -538,6 +538,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu return buffer; } +#ifndef TEST_ASSET_CONV_AS smart_refctd_ptr getSingleUseCommandBufferAndBegin(smart_refctd_ptr pool) { smart_refctd_ptr cmdbuf; @@ -596,6 +597,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu m_device->blockForSemaphores(info); } } +#endif #ifdef TEST_ASSET_CONV_AS bool createAccelerationStructuresFromGeometry(video::CThreadSafeQueueAdapter* queue, const IGeometryCreator* gc) @@ -720,15 +722,41 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu cpuTlas->setInstances(std::move(geomInstances)); 
cpuTlas->setBuildFlags(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT); +//#define TEST_REBAR_FALLBACK // convert with asset converter - auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - if (!pool) - return logFail("Couldn't create Command Pool for geometry creation!"); - auto cmdbuf = getSingleUseCommandBufferAndBegin(pool); - smart_refctd_ptr converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} }); - CAssetConverter::SInputs inputs = {}; + struct MyInputs : CAssetConverter::SInputs + { +#ifndef TEST_REBAR_FALLBACK + inline uint32_t constrainMemoryTypeBits(const size_t groupCopyID, const IAsset* canonicalAsset, const blake3_hash_t& contentHash, const IDeviceMemoryBacked* memoryBacked) const override + { + assert(memoryBacked); + return memoryBacked->getObjectType()!=IDeviceMemoryBacked::EOT_BUFFER ? (~0u):rebarMemoryTypes; + } +#endif + uint32_t rebarMemoryTypes; + } inputs = {}; inputs.logger = m_logger.get(); + inputs.rebarMemoryTypes = m_physicalDevice->getDirectVRAMAccessMemoryTypeBits(); +#ifndef TEST_REBAR_FALLBACK + struct MyAllocator final : public IDeviceMemoryAllocator + { + ILogicalDevice* getDeviceForAllocations() const override {return device;} + + SAllocation allocate(const SAllocateInfo& info) override + { + auto retval = device->allocate(info); + // map what is mappable by default so ReBAR checks succeed + if (retval.isValid() && retval.memory->isMappable()) + retval.memory->map({.offset=0,.length=info.size}); + return retval; + } + + ILogicalDevice* device; + } myalloc; + myalloc.device = m_device.get(); + inputs.allocator = &myalloc; +#endif std::array tmpTlas; std::array tmpBuffers; @@ -772,40 +800,102 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu prepass.template operator() < ICPUBuffer > (tmpBuffers); } - // TODO wait for convert - m_logger->log("willDeviceASBuild: %d, 
willHostASBuild: %d\nminASBuildScratchSize: %d, maxASBuildScratchSize: %d\nminCompactedASAllocatorSpace: %d, requiredQueueFlags: %d\n", ILogger::ELL_INFO, - reservation.willDeviceASBuild(), reservation.willHostASBuild(), - reservation.getMinASBuildScratchSize(false), reservation.getMaxASBuildScratchSize(false), - reservation.getMinCompactedASAllocatorSpace(), reservation.getRequiredQueueFlags(false)); - return false; - - auto semaphore = m_device->createSemaphore(0u); - - std::array cmdbufs = {}; - cmdbufs.front().cmdbuf = cmdbuf.get(); + constexpr auto XferBufferCount = 2; + std::array,XferBufferCount> xferBufs = {}; + std::array xferBufInfos = {}; + { + auto pool = m_device->createCommandPool(getTransferUpQueue()->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT); + pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,xferBufs); + xferBufs.front()->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + for (auto i=0; icreateSemaphore(0u); SIntendedSubmitInfo transfer = {}; - transfer.queue = queue; - transfer.scratchCommandBuffers = cmdbufs; + transfer.queue = getTransferUpQueue(); + transfer.scratchCommandBuffers = xferBufInfos; transfer.scratchSemaphore = { - .semaphore = semaphore.get(), + .semaphore = xferSema.get(), .value = 0u, .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS }; + + constexpr auto CompBufferCount = 2; + std::array,CompBufferCount> compBufs = {}; + std::array compBufInfos = {}; + { + auto pool = m_device->createCommandPool(getComputeQueue()->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT|IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT); + pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,compBufs); + compBufs.front()->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + for (auto i=0; icreateSemaphore(0u); SIntendedSubmitInfo compute = {}; - compute.queue = queue; - compute.scratchCommandBuffers = cmdbufs; + 
compute.queue = getComputeQueue(); + compute.scratchCommandBuffers = compBufInfos; compute.scratchSemaphore = { - .semaphore = semaphore.get(), + .semaphore = compSema.get(), .value = 0u, - .stageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT // TODO correct mask? + .stageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT|PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_COPY_BIT }; // convert { - CAssetConverter::SConvertParams params = {}; + smart_refctd_ptr scratchAlloc; + { + constexpr auto MaxAlignment = 256; + constexpr auto MinAllocationSize = 1024; + const auto scratchSize = core::alignUp(reservation.getMinASBuildScratchSize(false),MaxAlignment); + + + IGPUBuffer::SCreationParams creationParams = {}; + creationParams.size = scratchSize; + creationParams.usage = IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT|IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT|IGPUBuffer::EUF_STORAGE_BUFFER_BIT; +#ifdef TEST_REBAR_FALLBACK + creationParams.usage |= IGPUBuffer::EUF_TRANSFER_DST_BIT; + core::unordered_set sharingSet = {compute.queue->getFamilyIndex(),transfer.queue->getFamilyIndex()}; + core::vector sharingIndices(sharingSet.begin(),sharingSet.end()); + if (sharingIndices.size()>1) + creationParams.queueFamilyIndexCount = sharingIndices.size(); + creationParams.queueFamilyIndices = sharingIndices.data(); +#endif + auto scratchBuffer = m_device->createBuffer(std::move(creationParams)); + + auto reqs = scratchBuffer->getMemoryReqs(); +#ifndef TEST_REBAR_FALLBACK + reqs.memoryTypeBits &= m_physicalDevice->getDirectVRAMAccessMemoryTypeBits(); +#endif + auto allocation = m_device->allocate(reqs,scratchBuffer.get(),IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); +#ifndef TEST_REBAR_FALLBACK + allocation.memory->map({.offset=0,.length=reqs.size}); +#endif + + scratchAlloc = make_smart_refctd_ptr( + SBufferRange{0ull,scratchSize,std::move(scratchBuffer)}, + core::allocator(),MaxAlignment,MinAllocationSize + ); + } + + struct MyParams 
final : CAssetConverter::SConvertParams + { + inline uint32_t getFinalOwnerQueueFamily(const IGPUBuffer* buffer, const core::blake3_hash_t& createdFrom) override + { + return finalUser; + } + inline uint32_t getFinalOwnerQueueFamily(const IGPUAccelerationStructure* image, const core::blake3_hash_t& createdFrom) override + { + return finalUser; + } + + uint8_t finalUser; + } params = {}; +#undef TEST_REBAR_FALLBACK params.utilities = m_utils.get(); params.transfer = &transfer; params.compute = &compute; + params.scratchForDeviceASBuild = scratchAlloc.get(); + params.finalUser = queue->getFamilyIndex(); auto future = reservation.convert(params); if (future.copy() != IQueue::RESULT::SUCCESS) From 908abd110c387d48110ce8aeb67f0e0f2dd68943 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 15 May 2025 10:37:32 +0700 Subject: [PATCH 173/296] refactor name changes --- 23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl | 2 +- 74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl b/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl index 702fcbe25..031a34d08 100644 --- a/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl @@ -21,7 +21,7 @@ uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1 #error "Define ITEMS_PER_INVOCATION!" 
#endif -using config_t = nbl::hlsl::workgroup2::Configuration; +using config_t = nbl::hlsl::workgroup2::ArithmeticConfiguration; typedef vector type_t; diff --git a/74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl b/74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl index 026687cfa..19e5893f0 100644 --- a/74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl +++ b/74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl @@ -21,7 +21,7 @@ uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1 #error "Define ITEMS_PER_INVOCATION!" #endif -using config_t = nbl::hlsl::workgroup2::Configuration; +using config_t = nbl::hlsl::workgroup2::ArithmeticConfiguration; typedef vector type_t; From 81238adaecbd8d717bdab0dd73e08e2938a794c6 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 15 May 2025 14:41:16 +0700 Subject: [PATCH 174/296] minor refactor --- 23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl | 2 +- 23_Arithmetic2UnitTest/main.cpp | 2 +- 74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl b/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl index 031a34d08..69aa11ecc 100644 --- a/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl @@ -43,7 +43,7 @@ bool canStore(); #endif // final (level 1/2) scan needs to fit in one subgroup exactly -groupshared uint32_t scratch[config_t::SubgroupsPerVirtualWorkgroup*config_t::ItemsPerInvocation_1]; +groupshared uint32_t scratch[config_t::ElementCount]; struct ScratchProxy { diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp index a3c274160..e7dfcefa1 100644 --- a/23_Arithmetic2UnitTest/main.cpp +++ b/23_Arithmetic2UnitTest/main.cpp @@ -174,7 +174,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu for (auto 
subgroupSize=MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u) { const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize); - for (uint32_t workgroupSize = subgroupSize; workgroupSize < MaxWorkgroupSize; workgroupSize *= 2) + for (uint32_t workgroupSize = subgroupSize; workgroupSize <= MaxWorkgroupSize; workgroupSize *= 2u) { // make sure renderdoc captures everything for debugging m_api->startCapture(); diff --git a/74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl b/74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl index 19e5893f0..69aa11ecc 100644 --- a/74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl +++ b/74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl @@ -43,7 +43,7 @@ bool canStore(); #endif // final (level 1/2) scan needs to fit in one subgroup exactly -groupshared uint32_t scratch[config_t::SharedMemSize]; +groupshared uint32_t scratch[config_t::ElementCount]; struct ScratchProxy { From 749658f2027632a73ce1ee9a07f6abe51ae1c0f0 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 15 May 2025 15:55:34 +0700 Subject: [PATCH 175/296] manage workgroup in example --- .../app_resources/testWorkgroup.comp.hlsl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl index 7f1b5dcbe..eb7d8e936 100644 --- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl @@ -10,11 +10,13 @@ struct DataProxy void get(const uint32_t ix, NBL_REF_ARG(dtype_t) value) { - value = inputValue[ix]; + const uint32_t workgroupOffset = nbl::hlsl::glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize; + value = inputValue[workgroupOffset + ix]; } void set(const uint32_t ix, const dtype_t value) { - output[Binop::BindingIndex].template Store(sizeof(uint32_t) + sizeof(type_t) * ix, value); + const uint32_t workgroupOffset = 
nbl::hlsl::glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize; + output[Binop::BindingIndex].template Store(sizeof(uint32_t) + sizeof(type_t) * (workgroupOffset+ix), value); } void workgroupExecutionAndMemoryBarrier() From 1de31ddfd725009bd650f1fe80f1c4a8c2e6a14a Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 15 May 2025 15:57:04 +0700 Subject: [PATCH 176/296] moved benchmark to ex 29 --- {74_Arithmetic2Bench => 29_Arithmetic2Bench}/CMakeLists.txt | 0 .../app_resources/benchmarkSubgroup.comp.hlsl | 0 .../app_resources/benchmarkWorkgroup.comp.hlsl | 0 .../app_resources/common.hlsl | 0 .../app_resources/shaderCommon.hlsl | 0 .../app_resources/workgroupCommon.hlsl | 0 .../config.json.template | 0 {74_Arithmetic2Bench => 29_Arithmetic2Bench}/main.cpp | 0 {74_Arithmetic2Bench => 29_Arithmetic2Bench}/pipeline.groovy | 0 CMakeLists.txt | 3 +-- 10 files changed, 1 insertion(+), 2 deletions(-) rename {74_Arithmetic2Bench => 29_Arithmetic2Bench}/CMakeLists.txt (100%) rename {74_Arithmetic2Bench => 29_Arithmetic2Bench}/app_resources/benchmarkSubgroup.comp.hlsl (100%) rename {74_Arithmetic2Bench => 29_Arithmetic2Bench}/app_resources/benchmarkWorkgroup.comp.hlsl (100%) rename {74_Arithmetic2Bench => 29_Arithmetic2Bench}/app_resources/common.hlsl (100%) rename {74_Arithmetic2Bench => 29_Arithmetic2Bench}/app_resources/shaderCommon.hlsl (100%) rename {74_Arithmetic2Bench => 29_Arithmetic2Bench}/app_resources/workgroupCommon.hlsl (100%) rename {74_Arithmetic2Bench => 29_Arithmetic2Bench}/config.json.template (100%) rename {74_Arithmetic2Bench => 29_Arithmetic2Bench}/main.cpp (100%) rename {74_Arithmetic2Bench => 29_Arithmetic2Bench}/pipeline.groovy (100%) diff --git a/74_Arithmetic2Bench/CMakeLists.txt b/29_Arithmetic2Bench/CMakeLists.txt similarity index 100% rename from 74_Arithmetic2Bench/CMakeLists.txt rename to 29_Arithmetic2Bench/CMakeLists.txt diff --git a/74_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl 
b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl similarity index 100% rename from 74_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl rename to 29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl diff --git a/74_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl similarity index 100% rename from 74_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl rename to 29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl diff --git a/74_Arithmetic2Bench/app_resources/common.hlsl b/29_Arithmetic2Bench/app_resources/common.hlsl similarity index 100% rename from 74_Arithmetic2Bench/app_resources/common.hlsl rename to 29_Arithmetic2Bench/app_resources/common.hlsl diff --git a/74_Arithmetic2Bench/app_resources/shaderCommon.hlsl b/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl similarity index 100% rename from 74_Arithmetic2Bench/app_resources/shaderCommon.hlsl rename to 29_Arithmetic2Bench/app_resources/shaderCommon.hlsl diff --git a/74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl b/29_Arithmetic2Bench/app_resources/workgroupCommon.hlsl similarity index 100% rename from 74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl rename to 29_Arithmetic2Bench/app_resources/workgroupCommon.hlsl diff --git a/74_Arithmetic2Bench/config.json.template b/29_Arithmetic2Bench/config.json.template similarity index 100% rename from 74_Arithmetic2Bench/config.json.template rename to 29_Arithmetic2Bench/config.json.template diff --git a/74_Arithmetic2Bench/main.cpp b/29_Arithmetic2Bench/main.cpp similarity index 100% rename from 74_Arithmetic2Bench/main.cpp rename to 29_Arithmetic2Bench/main.cpp diff --git a/74_Arithmetic2Bench/pipeline.groovy b/29_Arithmetic2Bench/pipeline.groovy similarity index 100% rename from 74_Arithmetic2Bench/pipeline.groovy rename to 29_Arithmetic2Bench/pipeline.groovy diff --git a/CMakeLists.txt b/CMakeLists.txt index 
ed3992203..31ebaddf9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -64,6 +64,7 @@ if(NBL_BUILD_EXAMPLES) add_subdirectory(26_Blur EXCLUDE_FROM_ALL) add_subdirectory(27_MPMCScheduler EXCLUDE_FROM_ALL) add_subdirectory(28_FFTBloom EXCLUDE_FROM_ALL) + add_subdirectory(29_Arithmetic2Bench EXCLUDE_FROM_ALL) # add_subdirectory(36_CUDAInterop EXCLUDE_FROM_ALL) # Showcase compute pathtracing @@ -91,7 +92,5 @@ if(NBL_BUILD_EXAMPLES) add_subdirectory(70_FLIPFluids EXCLUDE_FROM_ALL) add_subdirectory(71_RayTracingPipeline EXCLUDE_FROM_ALL) - add_subdirectory(74_Arithmetic2Bench EXCLUDE_FROM_ALL) - NBL_HOOK_COMMON_API("${NBL_COMMON_API_TARGETS}") endif() From e828dc49ef0a223dcbb8b4af8d722974747f29ee Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 16 May 2025 11:18:11 +0700 Subject: [PATCH 177/296] fit accessors to concept --- .../app_resources/testWorkgroup.comp.hlsl | 2 ++ 23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl | 6 ++++-- .../app_resources/benchmarkWorkgroup.comp.hlsl | 2 ++ 29_Arithmetic2Bench/app_resources/workgroupCommon.hlsl | 6 ++++-- 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl index eb7d8e936..3aafc0aa7 100644 --- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl @@ -8,11 +8,13 @@ struct DataProxy using dtype_t = vector; static_assert(nbl::hlsl::is_same_v); + template void get(const uint32_t ix, NBL_REF_ARG(dtype_t) value) { const uint32_t workgroupOffset = nbl::hlsl::glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize; value = inputValue[workgroupOffset + ix]; } + template void set(const uint32_t ix, const dtype_t value) { const uint32_t workgroupOffset = nbl::hlsl::glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize; diff --git a/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl 
b/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl index 69aa11ecc..b0ccbf295 100644 --- a/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl @@ -47,11 +47,13 @@ groupshared uint32_t scratch[config_t::ElementCount]; struct ScratchProxy { - void get(const uint32_t ix, NBL_REF_ARG(uint32_t) value) + template + void get(const uint32_t ix, NBL_REF_ARG(AccessType) value) { value = scratch[ix]; } - void set(const uint32_t ix, const uint32_t value) + template + void set(const uint32_t ix, const AccessType value) { scratch[ix] = value; } diff --git a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl index ac6ea7fd8..6e32bedbd 100644 --- a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl +++ b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl @@ -12,11 +12,13 @@ struct DataProxy static_assert(nbl::hlsl::is_same_v); // we don't want to write/read storage multiple times in loop; doesn't seem optimized out in generated spirv + template void get(const uint32_t ix, NBL_REF_ARG(dtype_t) value) { // value = inputValue[ix]; value = nbl::hlsl::promote(globalIndex()); } + template void set(const uint32_t ix, const dtype_t value) { // output[Binop::BindingIndex].template Store(sizeof(uint32_t) + sizeof(type_t) * ix, value); diff --git a/29_Arithmetic2Bench/app_resources/workgroupCommon.hlsl b/29_Arithmetic2Bench/app_resources/workgroupCommon.hlsl index 69aa11ecc..b0ccbf295 100644 --- a/29_Arithmetic2Bench/app_resources/workgroupCommon.hlsl +++ b/29_Arithmetic2Bench/app_resources/workgroupCommon.hlsl @@ -47,11 +47,13 @@ groupshared uint32_t scratch[config_t::ElementCount]; struct ScratchProxy { - void get(const uint32_t ix, NBL_REF_ARG(uint32_t) value) + template + void get(const uint32_t ix, NBL_REF_ARG(AccessType) value) { value = scratch[ix]; } - void set(const uint32_t ix, const uint32_t value) + 
template + void set(const uint32_t ix, const AccessType value) { scratch[ix] = value; } From c41617b0506e4c1830c5d9f90b4827df1a807d33 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 16 May 2025 15:45:25 +0700 Subject: [PATCH 178/296] moved stuff around, check inputs in imgui --- 71_RayTracingPipeline/main.cpp | 3342 ++++++++++++++++---------------- 1 file changed, 1675 insertions(+), 1667 deletions(-) diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index e31f5c280..ad13b4a5d 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -6,6 +6,8 @@ #include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h" #include "nbl/builtin/hlsl/indirect_commands.hlsl" +#define TEST_ASSET_CONV_AS + class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication { using device_base_t = examples::SimpleWindowedApplication; @@ -18,768 +20,768 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, constexpr static inline uint32_t NumberOfProceduralGeometries = 5; static constexpr const char* s_lightTypeNames[E_LIGHT_TYPE::ELT_COUNT] = { - "Directional", - "Point", - "Spot" + "Directional", + "Point", + "Spot" }; constexpr static inline clock_t::duration DisplayImageDuration = std::chrono::milliseconds(900); struct ShaderBindingTable { - SBufferRange raygenGroupRange; - SBufferRange hitGroupsRange; - uint32_t hitGroupsStride; - SBufferRange missGroupsRange; - uint32_t missGroupsStride; - SBufferRange callableGroupsRange; - uint32_t callableGroupsStride; + SBufferRange raygenGroupRange; + SBufferRange hitGroupsRange; + uint32_t hitGroupsStride; + SBufferRange missGroupsRange; + uint32_t missGroupsStride; + SBufferRange callableGroupsRange; + uint32_t callableGroupsStride; }; public: inline RaytracingPipelineApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& 
_sharedOutputCWD) - : IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) + : IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) { } inline SPhysicalDeviceFeatures getRequiredDeviceFeatures() const override { - auto retval = device_base_t::getRequiredDeviceFeatures(); - retval.rayTracingPipeline = true; - retval.accelerationStructure = true; - retval.rayQuery = true; - return retval; + auto retval = device_base_t::getRequiredDeviceFeatures(); + retval.rayTracingPipeline = true; + retval.accelerationStructure = true; + retval.rayQuery = true; + return retval; } inline SPhysicalDeviceFeatures getPreferredDeviceFeatures() const override { - auto retval = device_base_t::getPreferredDeviceFeatures(); - retval.accelerationStructureHostCommands = true; - return retval; + auto retval = device_base_t::getPreferredDeviceFeatures(); + retval.accelerationStructureHostCommands = true; + return retval; } inline core::vector getSurfaces() const override { - if (!m_surface) - { - { - auto windowCallback = core::make_smart_refctd_ptr(smart_refctd_ptr(m_inputSystem), smart_refctd_ptr(m_logger)); - IWindow::SCreationParams params = {}; - params.callback = core::make_smart_refctd_ptr(); - params.width = WIN_W; - params.height = WIN_H; - params.x = 32; - params.y = 32; - params.flags = ui::IWindow::ECF_HIDDEN | IWindow::ECF_BORDERLESS | IWindow::ECF_RESIZABLE; - params.windowCaption = "RaytracingPipelineApp"; - params.callback = windowCallback; - const_cast&>(m_window) = m_winMgr->createWindow(std::move(params)); - } - - auto surface = CSurfaceVulkanWin32::create(smart_refctd_ptr(m_api), smart_refctd_ptr_static_cast(m_window)); - const_cast&>(m_surface) = CSimpleResizeSurface::create(std::move(surface)); - } - - if (m_surface) - return { {m_surface->getSurface()/*,EQF_NONE*/} }; - - return {}; + if (!m_surface) + { + { + auto windowCallback = core::make_smart_refctd_ptr(smart_refctd_ptr(m_inputSystem), 
smart_refctd_ptr(m_logger)); + IWindow::SCreationParams params = {}; + params.callback = core::make_smart_refctd_ptr(); + params.width = WIN_W; + params.height = WIN_H; + params.x = 32; + params.y = 32; + params.flags = ui::IWindow::ECF_HIDDEN | IWindow::ECF_BORDERLESS | IWindow::ECF_RESIZABLE; + params.windowCaption = "RaytracingPipelineApp"; + params.callback = windowCallback; + const_cast&>(m_window) = m_winMgr->createWindow(std::move(params)); + } + + auto surface = CSurfaceVulkanWin32::create(smart_refctd_ptr(m_api), smart_refctd_ptr_static_cast(m_window)); + const_cast&>(m_surface) = CSimpleResizeSurface::create(std::move(surface)); + } + + if (m_surface) + return { {m_surface->getSurface()/*,EQF_NONE*/} }; + + return {}; } // so that we can use the same queue for asset converter and rendering inline core::vector getQueueRequirements() const override { - auto reqs = device_base_t::getQueueRequirements(); - reqs.front().requiredFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; - return reqs; + auto reqs = device_base_t::getQueueRequirements(); + reqs.front().requiredFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; + return reqs; } inline bool onAppInitialized(smart_refctd_ptr&& system) override { - m_inputSystem = make_smart_refctd_ptr(logger_opt_smart_ptr(smart_refctd_ptr(m_logger))); - - if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) - return false; - - if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system))) - return false; - - smart_refctd_ptr shaderReadCache = nullptr; - smart_refctd_ptr shaderWriteCache = core::make_smart_refctd_ptr(); - auto shaderCachePath = localOutputCWD / "main_pipeline_shader_cache.bin"; - - { - core::smart_refctd_ptr shaderReadCacheFile; - { - system::ISystem::future_t> future; - m_system->createFile(future, shaderCachePath.c_str(), system::IFile::ECF_READ); - if (future.wait()) - { - future.acquire().move_into(shaderReadCacheFile); - if (shaderReadCacheFile) - { - const size_t size = shaderReadCacheFile->getSize(); 
- if (size > 0ull) - { - std::vector contents(size); - system::IFile::success_t succ; - shaderReadCacheFile->read(succ, contents.data(), 0, size); - if (succ) - shaderReadCache = IShaderCompiler::CCache::deserialize(contents); - } - } - } - else - m_logger->log("Failed Openning Shader Cache File.", ILogger::ELL_ERROR); - } - - } - - // Load Custom Shader - auto loadCompileAndCreateShader = [&](const std::string& relPath) -> smart_refctd_ptr - { - IAssetLoader::SAssetLoadParams lp = {}; - lp.logger = m_logger.get(); - lp.workingDirectory = ""; // virtual root - auto assetBundle = m_assetMgr->getAsset(relPath, lp); - const auto assets = assetBundle.getContents(); - if (assets.empty()) - return nullptr; - - // lets go straight from ICPUSpecializedShader to IGPUSpecializedShader - auto sourceRaw = IAsset::castDown(assets[0]); - if (!sourceRaw) - return nullptr; - - return m_device->createShader({ sourceRaw.get(), nullptr, shaderReadCache.get(), shaderWriteCache.get() }); - }; - - // load shaders - const auto raygenShader = loadCompileAndCreateShader("app_resources/raytrace.rgen.hlsl"); - const auto closestHitShader = loadCompileAndCreateShader("app_resources/raytrace.rchit.hlsl"); - const auto proceduralClosestHitShader = loadCompileAndCreateShader("app_resources/raytrace_procedural.rchit.hlsl"); - const auto intersectionHitShader = loadCompileAndCreateShader("app_resources/raytrace.rint.hlsl"); - const auto anyHitShaderColorPayload = loadCompileAndCreateShader("app_resources/raytrace.rahit.hlsl"); - const auto anyHitShaderShadowPayload = loadCompileAndCreateShader("app_resources/raytrace_shadow.rahit.hlsl"); - const auto missShader = loadCompileAndCreateShader("app_resources/raytrace.rmiss.hlsl"); - const auto missShadowShader = loadCompileAndCreateShader("app_resources/raytrace_shadow.rmiss.hlsl"); - const auto directionalLightCallShader = loadCompileAndCreateShader("app_resources/light_directional.rcall.hlsl"); - const auto pointLightCallShader = 
loadCompileAndCreateShader("app_resources/light_point.rcall.hlsl"); - const auto spotLightCallShader = loadCompileAndCreateShader("app_resources/light_spot.rcall.hlsl"); - const auto fragmentShader = loadCompileAndCreateShader("app_resources/present.frag.hlsl"); - - core::smart_refctd_ptr shaderWriteCacheFile; - { - system::ISystem::future_t> future; - m_system->deleteFile(shaderCachePath); // temp solution instead of trimming, to make sure we won't have corrupted json - m_system->createFile(future, shaderCachePath.c_str(), system::IFile::ECF_WRITE); - if (future.wait()) - { - future.acquire().move_into(shaderWriteCacheFile); - if (shaderWriteCacheFile) - { - auto serializedCache = shaderWriteCache->serialize(); - if (shaderWriteCacheFile) - { - system::IFile::success_t succ; - shaderWriteCacheFile->write(succ, serializedCache->getPointer(), 0, serializedCache->getSize()); - if (!succ) - m_logger->log("Failed Writing To Shader Cache File.", ILogger::ELL_ERROR); - } - } - else - m_logger->log("Failed Creating Shader Cache File.", ILogger::ELL_ERROR); - } - else - m_logger->log("Failed Creating Shader Cache File.", ILogger::ELL_ERROR); - } - - m_semaphore = m_device->createSemaphore(m_realFrameIx); - if (!m_semaphore) - return logFail("Failed to Create a Semaphore!"); - - auto gQueue = getGraphicsQueue(); - - // Create renderpass and init surface - nbl::video::IGPURenderpass* renderpass; - { - ISwapchain::SCreationParams swapchainParams = { .surface = smart_refctd_ptr(m_surface->getSurface()) }; - if (!swapchainParams.deduceFormat(m_physicalDevice)) - return logFail("Could not choose a Surface Format for the Swapchain!"); - - const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] = - { - { - .srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, - .dstSubpass = 0, - .memoryBarrier = - { - .srcStageMask = asset::PIPELINE_STAGE_FLAGS::COPY_BIT, - .srcAccessMask = asset::ACCESS_FLAGS::TRANSFER_WRITE_BIT, - .dstStageMask = 
asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, - .dstAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT - } - }, - { - .srcSubpass = 0, - .dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, - .memoryBarrier = - { - .srcStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, - .srcAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT - } - }, - IGPURenderpass::SCreationParams::DependenciesEnd - }; - - auto scResources = std::make_unique(m_device.get(), swapchainParams.surfaceFormat.format, dependencies); - renderpass = scResources->getRenderpass(); - - if (!renderpass) - return logFail("Failed to create Renderpass!"); - - if (!m_surface || !m_surface->init(gQueue, std::move(scResources), swapchainParams.sharedParams)) - return logFail("Could not create Window & Surface or initialize the Surface!"); - } - - auto pool = m_device->createCommandPool(gQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - - m_converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} }); - - for (auto i = 0u; i < MaxFramesInFlight; i++) - { - if (!pool) - return logFail("Couldn't create Command Pool!"); - if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { m_cmdBufs.data() + i, 1 })) - return logFail("Couldn't create Command Buffer!"); - } - - m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H); - m_surface->recreateSwapchain(); - - - // create output images - m_hdrImage = m_device->createImage({ - { - .type = IGPUImage::ET_2D, - .samples = ICPUImage::ESCF_1_BIT, - .format = EF_R16G16B16A16_SFLOAT, - .extent = {WIN_W, WIN_H, 1}, - .mipLevels = 1, - .arrayLayers = 1, - .flags = IImage::ECF_NONE, - .usage = bitflag(IImage::EUF_STORAGE_BIT) | IImage::EUF_TRANSFER_SRC_BIT | IImage::EUF_SAMPLED_BIT - } - }); - - if (!m_hdrImage || !m_device->allocate(m_hdrImage->getMemoryReqs(), m_hdrImage.get()).isValid()) - return logFail("Could not create HDR 
Image"); - - m_hdrImageView = m_device->createImageView({ - .flags = IGPUImageView::ECF_NONE, - .subUsages = IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT | IGPUImage::E_USAGE_FLAGS::EUF_SAMPLED_BIT, - .image = m_hdrImage, - .viewType = IGPUImageView::E_TYPE::ET_2D, - .format = asset::EF_R16G16B16A16_SFLOAT - }); - - - - // ray trace pipeline and descriptor set layout setup - { - const IGPUDescriptorSetLayout::SBinding bindings[] = { - { - .binding = 0, - .type = asset::IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE, - .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, - .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_RAYGEN, - .count = 1, - }, - { - .binding = 1, - .type = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE, - .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, - .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_RAYGEN, - .count = 1, - } - }; - const auto descriptorSetLayout = m_device->createDescriptorSetLayout(bindings); - - const std::array dsLayoutPtrs = { descriptorSetLayout.get() }; - m_rayTracingDsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, std::span(dsLayoutPtrs.begin(), dsLayoutPtrs.end())); - m_rayTracingDs = m_rayTracingDsPool->createDescriptorSet(descriptorSetLayout); - - const SPushConstantRange pcRange = { - .stageFlags = IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING, - .offset = 0u, - .size = sizeof(SPushConstants), - }; - const auto pipelineLayout = m_device->createPipelineLayout({ &pcRange, 1 }, smart_refctd_ptr(descriptorSetLayout), nullptr, nullptr, nullptr); - - IGPURayTracingPipeline::SCreationParams params = {}; - - enum RtDemoShader - { - RTDS_RAYGEN, - RTDS_MISS, - RTDS_MISS_SHADOW, - RTDS_CLOSEST_HIT, - RTDS_SPHERE_CLOSEST_HIT, - RTDS_ANYHIT_PRIMARY, - RTDS_ANYHIT_SHADOW, - RTDS_INTERSECTION, - RTDS_DIRECTIONAL_CALL, - RTDS_POINT_CALL, - RTDS_SPOT_CALL, - RTDS_COUNT - }; - - IGPUShader::SSpecInfo shaders[RTDS_COUNT]; - 
shaders[RTDS_RAYGEN] = {.shader = raygenShader.get()}; - shaders[RTDS_MISS] = {.shader = missShader.get()}; - shaders[RTDS_MISS_SHADOW] = { .shader = missShadowShader.get() }; - shaders[RTDS_CLOSEST_HIT] = {.shader = closestHitShader.get()}; - shaders[RTDS_SPHERE_CLOSEST_HIT] = {.shader = proceduralClosestHitShader.get()}; - shaders[RTDS_ANYHIT_PRIMARY] = {.shader = anyHitShaderColorPayload.get()}; - shaders[RTDS_ANYHIT_SHADOW] = {.shader = anyHitShaderShadowPayload.get()}; - shaders[RTDS_INTERSECTION] = {.shader = intersectionHitShader.get() }; - shaders[RTDS_DIRECTIONAL_CALL] = {.shader = directionalLightCallShader.get()}; - shaders[RTDS_POINT_CALL] = {.shader = pointLightCallShader.get()}; - shaders[RTDS_SPOT_CALL] = {.shader = spotLightCallShader.get()}; - - params.layout = pipelineLayout.get(); - params.shaders = std::span(shaders); - using RayTracingFlags = IGPURayTracingPipeline::SCreationParams::FLAGS; - params.flags = core::bitflag(RayTracingFlags::NO_NULL_MISS_SHADERS) | - RayTracingFlags::NO_NULL_INTERSECTION_SHADERS | - RayTracingFlags::NO_NULL_ANY_HIT_SHADERS; - - auto& shaderGroups = params.shaderGroups; - - shaderGroups.raygen = { .index = RTDS_RAYGEN }; - - IRayTracingPipelineBase::SGeneralShaderGroup missGroups[EMT_COUNT]; - missGroups[EMT_PRIMARY] = { .index = RTDS_MISS }; - missGroups[EMT_OCCLUSION] = { .index = RTDS_MISS_SHADOW }; - shaderGroups.misses = missGroups; - - auto getHitGroupIndex = [](E_GEOM_TYPE geomType, E_RAY_TYPE rayType) - { - return geomType * ERT_COUNT + rayType; - }; - IRayTracingPipelineBase::SHitShaderGroup hitGroups[E_RAY_TYPE::ERT_COUNT * E_GEOM_TYPE::EGT_COUNT]; - hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_PRIMARY)] = { - .closestHit = RTDS_CLOSEST_HIT, - .anyHit = RTDS_ANYHIT_PRIMARY, - }; - hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_OCCLUSION)] = { - .closestHit = IGPURayTracingPipeline::SGeneralShaderGroup::Unused, - .anyHit = RTDS_ANYHIT_SHADOW, - }; - hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_PRIMARY)] = 
{ - .closestHit = RTDS_SPHERE_CLOSEST_HIT, - .anyHit = RTDS_ANYHIT_PRIMARY, - .intersection = RTDS_INTERSECTION, - }; - hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_OCCLUSION)] = { - .closestHit = IGPURayTracingPipeline::SGeneralShaderGroup::Unused, - .anyHit = RTDS_ANYHIT_SHADOW, - .intersection = RTDS_INTERSECTION, - }; - shaderGroups.hits = hitGroups; - - IRayTracingPipelineBase::SGeneralShaderGroup callableGroups[ELT_COUNT]; - callableGroups[ELT_DIRECTIONAL] = { .index = RTDS_DIRECTIONAL_CALL }; - callableGroups[ELT_POINT] = { .index = RTDS_POINT_CALL }; - callableGroups[ELT_SPOT] = { .index = RTDS_SPOT_CALL }; - shaderGroups.callables = callableGroups; - - params.cached.maxRecursionDepth = 1; - params.cached.dynamicStackSize = true; - - if (!m_device->createRayTracingPipelines(nullptr, { ¶ms, 1 }, &m_rayTracingPipeline)) - return logFail("Failed to create ray tracing pipeline"); - - calculateRayTracingStackSize(m_rayTracingPipeline); - - if (!createShaderBindingTable(gQueue, m_rayTracingPipeline)) - return logFail("Could not create shader binding table"); - - } - - auto assetManager = make_smart_refctd_ptr(smart_refctd_ptr(system)); - auto* geometryCreator = assetManager->getGeometryCreator(); - - if (!createIndirectBuffer(gQueue)) - return logFail("Could not create indirect buffer"); - - // create geometry objects - if (!createGeometries(gQueue, geometryCreator)) - return logFail("Could not create geometries from geometry creator"); - - if (!createAccelerationStructures(getComputeQueue())) - return logFail("Could not create acceleration structures"); - - ISampler::SParams samplerParams = { - .AnisotropicFilter = 0 - }; - auto defaultSampler = m_device->createSampler(samplerParams); - - { - const IGPUDescriptorSetLayout::SBinding bindings[] = { - { - .binding = 0u, - .type = nbl::asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER, - .createFlags = ICPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, - .stageFlags = 
IShader::E_SHADER_STAGE::ESS_FRAGMENT, - .count = 1u, - .immutableSamplers = &defaultSampler - } - }; - auto gpuPresentDescriptorSetLayout = m_device->createDescriptorSetLayout(bindings); - const video::IGPUDescriptorSetLayout* const layouts[] = { gpuPresentDescriptorSetLayout.get() }; - const uint32_t setCounts[] = { 1u }; - m_presentDsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_NONE, layouts, setCounts); - m_presentDs = m_presentDsPool->createDescriptorSet(gpuPresentDescriptorSetLayout); - - auto scRes = static_cast(m_surface->getSwapchainResources()); - ext::FullScreenTriangle::ProtoPipeline fsTriProtoPPln(m_assetMgr.get(), m_device.get(), m_logger.get()); - if (!fsTriProtoPPln) - return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!"); - - const IGPUShader::SSpecInfo fragSpec = { - .entryPoint = "main", - .shader = fragmentShader.get() - }; - - auto presentLayout = m_device->createPipelineLayout( - {}, - core::smart_refctd_ptr(gpuPresentDescriptorSetLayout), - nullptr, - nullptr, - nullptr - ); - m_presentPipeline = fsTriProtoPPln.createPipeline(fragSpec, presentLayout.get(), scRes->getRenderpass()); - if (!m_presentPipeline) - return logFail("Could not create Graphics Pipeline!"); - } - - // write descriptors - IGPUDescriptorSet::SDescriptorInfo infos[3]; - infos[0].desc = m_gpuTlas; - - infos[1].desc = m_hdrImageView; - if (!infos[1].desc) - return logFail("Failed to create image view"); - infos[1].info.image.imageLayout = IImage::LAYOUT::GENERAL; - - infos[2].desc = m_hdrImageView; - infos[2].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL; - - IGPUDescriptorSet::SWriteDescriptorSet writes[] = { - {.dstSet = m_rayTracingDs.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &infos[0]}, - {.dstSet = m_rayTracingDs.get(), .binding = 1, .arrayElement = 0, .count = 1, .info = &infos[1]}, - {.dstSet = m_presentDs.get(), .binding = 0, .arrayElement = 0, .count = 
1, .info = &infos[2] }, - }; - m_device->updateDescriptorSets(std::span(writes), {}); - - // gui descriptor setup - { - using binding_flags_t = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS; - { - IGPUSampler::SParams params; - params.AnisotropicFilter = 1u; - params.TextureWrapU = ETC_REPEAT; - params.TextureWrapV = ETC_REPEAT; - params.TextureWrapW = ETC_REPEAT; - - m_ui.samplers.gui = m_device->createSampler(params); - m_ui.samplers.gui->setObjectDebugName("Nabla IMGUI UI Sampler"); - } - - std::array, 69u> immutableSamplers; - for (auto& it : immutableSamplers) - it = smart_refctd_ptr(m_ui.samplers.scene); - - immutableSamplers[nbl::ext::imgui::UI::FontAtlasTexId] = smart_refctd_ptr(m_ui.samplers.gui); - - nbl::ext::imgui::UI::SCreationParameters params; - - params.resources.texturesInfo = { .setIx = 0u, .bindingIx = 0u }; - params.resources.samplersInfo = { .setIx = 0u, .bindingIx = 1u }; - params.assetManager = m_assetMgr; - params.pipelineCache = nullptr; - params.pipelineLayout = nbl::ext::imgui::UI::createDefaultPipelineLayout(m_utils->getLogicalDevice(), params.resources.texturesInfo, params.resources.samplersInfo, MaxUITextureCount); - params.renderpass = smart_refctd_ptr(renderpass); - params.streamingBuffer = nullptr; - params.subpassIx = 0u; - params.transfer = getTransferUpQueue(); - params.utilities = m_utils; - { - m_ui.manager = ext::imgui::UI::create(std::move(params)); - - // note that we use default layout provided by our extension, but you are free to create your own by filling nbl::ext::imgui::UI::S_CREATION_PARAMETERS::resources - const auto* descriptorSetLayout = m_ui.manager->getPipeline()->getLayout()->getDescriptorSetLayout(0u); - const auto& params = m_ui.manager->getCreationParameters(); - - IDescriptorPool::SCreateInfo descriptorPoolInfo = {}; - descriptorPoolInfo.maxDescriptorCount[static_cast(asset::IDescriptor::E_TYPE::ET_SAMPLER)] = (uint32_t)nbl::ext::imgui::UI::DefaultSamplerIx::COUNT; - 
descriptorPoolInfo.maxDescriptorCount[static_cast(asset::IDescriptor::E_TYPE::ET_SAMPLED_IMAGE)] = MaxUITextureCount; - descriptorPoolInfo.maxSets = 1u; - descriptorPoolInfo.flags = IDescriptorPool::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT; - - m_guiDescriptorSetPool = m_device->createDescriptorPool(std::move(descriptorPoolInfo)); - assert(m_guiDescriptorSetPool); - - m_guiDescriptorSetPool->createDescriptorSets(1u, &descriptorSetLayout, &m_ui.descriptorSet); - assert(m_ui.descriptorSet); - } - } - - m_ui.manager->registerListener( - [this]() -> void { - ImGuiIO& io = ImGui::GetIO(); - - m_camera.setProjectionMatrix([&]() - { - static matrix4SIMD projection; - - projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH( - core::radians(m_cameraSetting.fov), - io.DisplaySize.x / io.DisplaySize.y, - m_cameraSetting.zNear, - m_cameraSetting.zFar); - - return projection; - }()); - - ImGui::SetNextWindowPos(ImVec2(1024, 100), ImGuiCond_Appearing); - ImGui::SetNextWindowSize(ImVec2(256, 256), ImGuiCond_Appearing); - - // create a window and insert the inspector - ImGui::SetNextWindowPos(ImVec2(10, 10), ImGuiCond_Appearing); - ImGui::SetNextWindowSize(ImVec2(320, 340), ImGuiCond_Appearing); - ImGui::Begin("Controls"); - - ImGui::SameLine(); - - ImGui::Text("Camera"); - - ImGui::SliderFloat("Move speed", &m_cameraSetting.moveSpeed, 0.1f, 10.f); - ImGui::SliderFloat("Rotate speed", &m_cameraSetting.rotateSpeed, 0.1f, 10.f); - ImGui::SliderFloat("Fov", &m_cameraSetting.fov, 20.f, 150.f); - ImGui::SliderFloat("zNear", &m_cameraSetting.zNear, 0.1f, 100.f); - ImGui::SliderFloat("zFar", &m_cameraSetting.zFar, 110.f, 10000.f); - Light m_oldLight = m_light; - int light_type = m_light.type; - ImGui::ListBox("LightType", &light_type, s_lightTypeNames, ELT_COUNT); - m_light.type = static_cast(light_type); - if (m_light.type == ELT_DIRECTIONAL) - { - ImGui::SliderFloat3("Light Direction", &m_light.direction.x, -1.f, 1.f); - } else if (m_light.type == ELT_POINT) - { - 
ImGui::SliderFloat3("Light Position", &m_light.position.x, -20.f, 20.f); - } else if (m_light.type == ELT_SPOT) - { - ImGui::SliderFloat3("Light Direction", &m_light.direction.x, -1.f, 1.f); - ImGui::SliderFloat3("Light Position", &m_light.position.x, -20.f, 20.f); - - float32_t dOuterCutoff = hlsl::degrees(acos(m_light.outerCutoff)); - if (ImGui::SliderFloat("Light Outer Cutoff", &dOuterCutoff, 0.0f, 45.0f)) - { - m_light.outerCutoff = cos(hlsl::radians(dOuterCutoff)); - } - } - ImGui::Checkbox("Use Indirect Command", &m_useIndirectCommand); - if (m_light != m_oldLight) - { - m_frameAccumulationCounter = 0; - } - - ImGui::Text("X: %f Y: %f", io.MousePos.x, io.MousePos.y); - - ImGui::End(); - } - ); - - // Set Camera - { - core::vectorSIMDf cameraPosition(0, 5, -10); - matrix4SIMD proj = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH( - core::radians(60.0f), - WIN_W / WIN_H, - 0.01f, - 500.0f - ); - m_camera = Camera(cameraPosition, core::vectorSIMDf(0, 0, 0), proj); - } - - m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H); - m_surface->recreateSwapchain(); - m_winMgr->show(m_window.get()); - m_oracle.reportBeginFrameRecord(); - m_camera.mapKeysToWASD(); - - return true; + m_inputSystem = make_smart_refctd_ptr(logger_opt_smart_ptr(smart_refctd_ptr(m_logger))); + + if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + + if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + + smart_refctd_ptr shaderReadCache = nullptr; + smart_refctd_ptr shaderWriteCache = core::make_smart_refctd_ptr(); + auto shaderCachePath = localOutputCWD / "main_pipeline_shader_cache.bin"; + + { + core::smart_refctd_ptr shaderReadCacheFile; + { + system::ISystem::future_t> future; + m_system->createFile(future, shaderCachePath.c_str(), system::IFile::ECF_READ); + if (future.wait()) + { + future.acquire().move_into(shaderReadCacheFile); + if (shaderReadCacheFile) + { + const size_t size = shaderReadCacheFile->getSize(); + if (size > 
0ull) + { + std::vector contents(size); + system::IFile::success_t succ; + shaderReadCacheFile->read(succ, contents.data(), 0, size); + if (succ) + shaderReadCache = IShaderCompiler::CCache::deserialize(contents); + } + } + } + else + m_logger->log("Failed Openning Shader Cache File.", ILogger::ELL_ERROR); + } + + } + + // Load Custom Shader + auto loadCompileAndCreateShader = [&](const std::string& relPath) -> smart_refctd_ptr + { + IAssetLoader::SAssetLoadParams lp = {}; + lp.logger = m_logger.get(); + lp.workingDirectory = ""; // virtual root + auto assetBundle = m_assetMgr->getAsset(relPath, lp); + const auto assets = assetBundle.getContents(); + if (assets.empty()) + return nullptr; + + // lets go straight from ICPUSpecializedShader to IGPUSpecializedShader + auto sourceRaw = IAsset::castDown(assets[0]); + if (!sourceRaw) + return nullptr; + + return m_device->createShader({ sourceRaw.get(), nullptr, shaderReadCache.get(), shaderWriteCache.get() }); + }; + + // load shaders + const auto raygenShader = loadCompileAndCreateShader("app_resources/raytrace.rgen.hlsl"); + const auto closestHitShader = loadCompileAndCreateShader("app_resources/raytrace.rchit.hlsl"); + const auto proceduralClosestHitShader = loadCompileAndCreateShader("app_resources/raytrace_procedural.rchit.hlsl"); + const auto intersectionHitShader = loadCompileAndCreateShader("app_resources/raytrace.rint.hlsl"); + const auto anyHitShaderColorPayload = loadCompileAndCreateShader("app_resources/raytrace.rahit.hlsl"); + const auto anyHitShaderShadowPayload = loadCompileAndCreateShader("app_resources/raytrace_shadow.rahit.hlsl"); + const auto missShader = loadCompileAndCreateShader("app_resources/raytrace.rmiss.hlsl"); + const auto missShadowShader = loadCompileAndCreateShader("app_resources/raytrace_shadow.rmiss.hlsl"); + const auto directionalLightCallShader = loadCompileAndCreateShader("app_resources/light_directional.rcall.hlsl"); + const auto pointLightCallShader = 
loadCompileAndCreateShader("app_resources/light_point.rcall.hlsl"); + const auto spotLightCallShader = loadCompileAndCreateShader("app_resources/light_spot.rcall.hlsl"); + const auto fragmentShader = loadCompileAndCreateShader("app_resources/present.frag.hlsl"); + + core::smart_refctd_ptr shaderWriteCacheFile; + { + system::ISystem::future_t> future; + m_system->deleteFile(shaderCachePath); // temp solution instead of trimming, to make sure we won't have corrupted json + m_system->createFile(future, shaderCachePath.c_str(), system::IFile::ECF_WRITE); + if (future.wait()) + { + future.acquire().move_into(shaderWriteCacheFile); + if (shaderWriteCacheFile) + { + auto serializedCache = shaderWriteCache->serialize(); + if (shaderWriteCacheFile) + { + system::IFile::success_t succ; + shaderWriteCacheFile->write(succ, serializedCache->getPointer(), 0, serializedCache->getSize()); + if (!succ) + m_logger->log("Failed Writing To Shader Cache File.", ILogger::ELL_ERROR); + } + } + else + m_logger->log("Failed Creating Shader Cache File.", ILogger::ELL_ERROR); + } + else + m_logger->log("Failed Creating Shader Cache File.", ILogger::ELL_ERROR); + } + + m_semaphore = m_device->createSemaphore(m_realFrameIx); + if (!m_semaphore) + return logFail("Failed to Create a Semaphore!"); + + auto gQueue = getGraphicsQueue(); + + // Create renderpass and init surface + nbl::video::IGPURenderpass* renderpass; + { + ISwapchain::SCreationParams swapchainParams = { .surface = smart_refctd_ptr(m_surface->getSurface()) }; + if (!swapchainParams.deduceFormat(m_physicalDevice)) + return logFail("Could not choose a Surface Format for the Swapchain!"); + + const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] = + { + { + .srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, + .dstSubpass = 0, + .memoryBarrier = + { + .srcStageMask = asset::PIPELINE_STAGE_FLAGS::COPY_BIT, + .srcAccessMask = asset::ACCESS_FLAGS::TRANSFER_WRITE_BIT, + .dstStageMask = 
asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, + .dstAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT + } + }, + { + .srcSubpass = 0, + .dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, + .memoryBarrier = + { + .srcStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, + .srcAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT + } + }, + IGPURenderpass::SCreationParams::DependenciesEnd + }; + + auto scResources = std::make_unique(m_device.get(), swapchainParams.surfaceFormat.format, dependencies); + renderpass = scResources->getRenderpass(); + + if (!renderpass) + return logFail("Failed to create Renderpass!"); + + if (!m_surface || !m_surface->init(gQueue, std::move(scResources), swapchainParams.sharedParams)) + return logFail("Could not create Window & Surface or initialize the Surface!"); + } + + auto pool = m_device->createCommandPool(gQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + + m_converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} }); + + for (auto i = 0u; i < MaxFramesInFlight; i++) + { + if (!pool) + return logFail("Couldn't create Command Pool!"); + if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { m_cmdBufs.data() + i, 1 })) + return logFail("Couldn't create Command Buffer!"); + } + + m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H); + m_surface->recreateSwapchain(); + + + // create output images + m_hdrImage = m_device->createImage({ + { + .type = IGPUImage::ET_2D, + .samples = ICPUImage::ESCF_1_BIT, + .format = EF_R16G16B16A16_SFLOAT, + .extent = {WIN_W, WIN_H, 1}, + .mipLevels = 1, + .arrayLayers = 1, + .flags = IImage::ECF_NONE, + .usage = bitflag(IImage::EUF_STORAGE_BIT) | IImage::EUF_TRANSFER_SRC_BIT | IImage::EUF_SAMPLED_BIT + } + }); + + if (!m_hdrImage || !m_device->allocate(m_hdrImage->getMemoryReqs(), m_hdrImage.get()).isValid()) + return logFail("Could not create HDR 
Image"); + + m_hdrImageView = m_device->createImageView({ + .flags = IGPUImageView::ECF_NONE, + .subUsages = IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT | IGPUImage::E_USAGE_FLAGS::EUF_SAMPLED_BIT, + .image = m_hdrImage, + .viewType = IGPUImageView::E_TYPE::ET_2D, + .format = asset::EF_R16G16B16A16_SFLOAT + }); + + + + // ray trace pipeline and descriptor set layout setup + { + const IGPUDescriptorSetLayout::SBinding bindings[] = { + { + .binding = 0, + .type = asset::IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE, + .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_RAYGEN, + .count = 1, + }, + { + .binding = 1, + .type = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE, + .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_RAYGEN, + .count = 1, + } + }; + const auto descriptorSetLayout = m_device->createDescriptorSetLayout(bindings); + + const std::array dsLayoutPtrs = { descriptorSetLayout.get() }; + m_rayTracingDsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, std::span(dsLayoutPtrs.begin(), dsLayoutPtrs.end())); + m_rayTracingDs = m_rayTracingDsPool->createDescriptorSet(descriptorSetLayout); + + const SPushConstantRange pcRange = { + .stageFlags = IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING, + .offset = 0u, + .size = sizeof(SPushConstants), + }; + const auto pipelineLayout = m_device->createPipelineLayout({ &pcRange, 1 }, smart_refctd_ptr(descriptorSetLayout), nullptr, nullptr, nullptr); + + IGPURayTracingPipeline::SCreationParams params = {}; + + enum RtDemoShader + { + RTDS_RAYGEN, + RTDS_MISS, + RTDS_MISS_SHADOW, + RTDS_CLOSEST_HIT, + RTDS_SPHERE_CLOSEST_HIT, + RTDS_ANYHIT_PRIMARY, + RTDS_ANYHIT_SHADOW, + RTDS_INTERSECTION, + RTDS_DIRECTIONAL_CALL, + RTDS_POINT_CALL, + RTDS_SPOT_CALL, + RTDS_COUNT + }; + + IGPUShader::SSpecInfo shaders[RTDS_COUNT]; + 
shaders[RTDS_RAYGEN] = {.shader = raygenShader.get()}; + shaders[RTDS_MISS] = {.shader = missShader.get()}; + shaders[RTDS_MISS_SHADOW] = { .shader = missShadowShader.get() }; + shaders[RTDS_CLOSEST_HIT] = {.shader = closestHitShader.get()}; + shaders[RTDS_SPHERE_CLOSEST_HIT] = {.shader = proceduralClosestHitShader.get()}; + shaders[RTDS_ANYHIT_PRIMARY] = {.shader = anyHitShaderColorPayload.get()}; + shaders[RTDS_ANYHIT_SHADOW] = {.shader = anyHitShaderShadowPayload.get()}; + shaders[RTDS_INTERSECTION] = {.shader = intersectionHitShader.get() }; + shaders[RTDS_DIRECTIONAL_CALL] = {.shader = directionalLightCallShader.get()}; + shaders[RTDS_POINT_CALL] = {.shader = pointLightCallShader.get()}; + shaders[RTDS_SPOT_CALL] = {.shader = spotLightCallShader.get()}; + + params.layout = pipelineLayout.get(); + params.shaders = std::span(shaders); + using RayTracingFlags = IGPURayTracingPipeline::SCreationParams::FLAGS; + params.flags = core::bitflag(RayTracingFlags::NO_NULL_MISS_SHADERS) | + RayTracingFlags::NO_NULL_INTERSECTION_SHADERS | + RayTracingFlags::NO_NULL_ANY_HIT_SHADERS; + + auto& shaderGroups = params.shaderGroups; + + shaderGroups.raygen = { .index = RTDS_RAYGEN }; + + IRayTracingPipelineBase::SGeneralShaderGroup missGroups[EMT_COUNT]; + missGroups[EMT_PRIMARY] = { .index = RTDS_MISS }; + missGroups[EMT_OCCLUSION] = { .index = RTDS_MISS_SHADOW }; + shaderGroups.misses = missGroups; + + auto getHitGroupIndex = [](E_GEOM_TYPE geomType, E_RAY_TYPE rayType) + { + return geomType * ERT_COUNT + rayType; + }; + IRayTracingPipelineBase::SHitShaderGroup hitGroups[E_RAY_TYPE::ERT_COUNT * E_GEOM_TYPE::EGT_COUNT]; + hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_PRIMARY)] = { + .closestHit = RTDS_CLOSEST_HIT, + .anyHit = RTDS_ANYHIT_PRIMARY, + }; + hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_OCCLUSION)] = { + .closestHit = IGPURayTracingPipeline::SGeneralShaderGroup::Unused, + .anyHit = RTDS_ANYHIT_SHADOW, + }; + hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_PRIMARY)] = 
{ + .closestHit = RTDS_SPHERE_CLOSEST_HIT, + .anyHit = RTDS_ANYHIT_PRIMARY, + .intersection = RTDS_INTERSECTION, + }; + hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_OCCLUSION)] = { + .closestHit = IGPURayTracingPipeline::SGeneralShaderGroup::Unused, + .anyHit = RTDS_ANYHIT_SHADOW, + .intersection = RTDS_INTERSECTION, + }; + shaderGroups.hits = hitGroups; + + IRayTracingPipelineBase::SGeneralShaderGroup callableGroups[ELT_COUNT]; + callableGroups[ELT_DIRECTIONAL] = { .index = RTDS_DIRECTIONAL_CALL }; + callableGroups[ELT_POINT] = { .index = RTDS_POINT_CALL }; + callableGroups[ELT_SPOT] = { .index = RTDS_SPOT_CALL }; + shaderGroups.callables = callableGroups; + + params.cached.maxRecursionDepth = 1; + params.cached.dynamicStackSize = true; + + if (!m_device->createRayTracingPipelines(nullptr, { ¶ms, 1 }, &m_rayTracingPipeline)) + return logFail("Failed to create ray tracing pipeline"); + + calculateRayTracingStackSize(m_rayTracingPipeline); + + if (!createShaderBindingTable(gQueue, m_rayTracingPipeline)) + return logFail("Could not create shader binding table"); + + } + + auto assetManager = make_smart_refctd_ptr(smart_refctd_ptr(system)); + auto* geometryCreator = assetManager->getGeometryCreator(); + + if (!createIndirectBuffer(gQueue)) + return logFail("Could not create indirect buffer"); + + // create geometry objects + if (!createGeometries(gQueue, geometryCreator)) + return logFail("Could not create geometries from geometry creator"); + + if (!createAccelerationStructures(getComputeQueue())) + return logFail("Could not create acceleration structures"); + + ISampler::SParams samplerParams = { + .AnisotropicFilter = 0 + }; + auto defaultSampler = m_device->createSampler(samplerParams); + + { + const IGPUDescriptorSetLayout::SBinding bindings[] = { + { + .binding = 0u, + .type = nbl::asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER, + .createFlags = ICPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = 
IShader::E_SHADER_STAGE::ESS_FRAGMENT, + .count = 1u, + .immutableSamplers = &defaultSampler + } + }; + auto gpuPresentDescriptorSetLayout = m_device->createDescriptorSetLayout(bindings); + const video::IGPUDescriptorSetLayout* const layouts[] = { gpuPresentDescriptorSetLayout.get() }; + const uint32_t setCounts[] = { 1u }; + m_presentDsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_NONE, layouts, setCounts); + m_presentDs = m_presentDsPool->createDescriptorSet(gpuPresentDescriptorSetLayout); + + auto scRes = static_cast(m_surface->getSwapchainResources()); + ext::FullScreenTriangle::ProtoPipeline fsTriProtoPPln(m_assetMgr.get(), m_device.get(), m_logger.get()); + if (!fsTriProtoPPln) + return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!"); + + const IGPUShader::SSpecInfo fragSpec = { + .entryPoint = "main", + .shader = fragmentShader.get() + }; + + auto presentLayout = m_device->createPipelineLayout( + {}, + core::smart_refctd_ptr(gpuPresentDescriptorSetLayout), + nullptr, + nullptr, + nullptr + ); + m_presentPipeline = fsTriProtoPPln.createPipeline(fragSpec, presentLayout.get(), scRes->getRenderpass()); + if (!m_presentPipeline) + return logFail("Could not create Graphics Pipeline!"); + } + + // write descriptors + IGPUDescriptorSet::SDescriptorInfo infos[3]; + infos[0].desc = m_gpuTlas; + + infos[1].desc = m_hdrImageView; + if (!infos[1].desc) + return logFail("Failed to create image view"); + infos[1].info.image.imageLayout = IImage::LAYOUT::GENERAL; + + infos[2].desc = m_hdrImageView; + infos[2].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL; + + IGPUDescriptorSet::SWriteDescriptorSet writes[] = { + {.dstSet = m_rayTracingDs.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &infos[0]}, + {.dstSet = m_rayTracingDs.get(), .binding = 1, .arrayElement = 0, .count = 1, .info = &infos[1]}, + {.dstSet = m_presentDs.get(), .binding = 0, .arrayElement = 0, .count = 
1, .info = &infos[2] }, + }; + m_device->updateDescriptorSets(std::span(writes), {}); + + // gui descriptor setup + { + using binding_flags_t = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS; + { + IGPUSampler::SParams params; + params.AnisotropicFilter = 1u; + params.TextureWrapU = ETC_REPEAT; + params.TextureWrapV = ETC_REPEAT; + params.TextureWrapW = ETC_REPEAT; + + m_ui.samplers.gui = m_device->createSampler(params); + m_ui.samplers.gui->setObjectDebugName("Nabla IMGUI UI Sampler"); + } + + std::array, 69u> immutableSamplers; + for (auto& it : immutableSamplers) + it = smart_refctd_ptr(m_ui.samplers.scene); + + immutableSamplers[nbl::ext::imgui::UI::FontAtlasTexId] = smart_refctd_ptr(m_ui.samplers.gui); + + nbl::ext::imgui::UI::SCreationParameters params; + + params.resources.texturesInfo = { .setIx = 0u, .bindingIx = 0u }; + params.resources.samplersInfo = { .setIx = 0u, .bindingIx = 1u }; + params.assetManager = m_assetMgr; + params.pipelineCache = nullptr; + params.pipelineLayout = nbl::ext::imgui::UI::createDefaultPipelineLayout(m_utils->getLogicalDevice(), params.resources.texturesInfo, params.resources.samplersInfo, MaxUITextureCount); + params.renderpass = smart_refctd_ptr(renderpass); + params.streamingBuffer = nullptr; + params.subpassIx = 0u; + params.transfer = getTransferUpQueue(); + params.utilities = m_utils; + { + m_ui.manager = ext::imgui::UI::create(std::move(params)); + + // note that we use default layout provided by our extension, but you are free to create your own by filling nbl::ext::imgui::UI::S_CREATION_PARAMETERS::resources + const auto* descriptorSetLayout = m_ui.manager->getPipeline()->getLayout()->getDescriptorSetLayout(0u); + const auto& params = m_ui.manager->getCreationParameters(); + + IDescriptorPool::SCreateInfo descriptorPoolInfo = {}; + descriptorPoolInfo.maxDescriptorCount[static_cast(asset::IDescriptor::E_TYPE::ET_SAMPLER)] = (uint32_t)nbl::ext::imgui::UI::DefaultSamplerIx::COUNT; + 
descriptorPoolInfo.maxDescriptorCount[static_cast(asset::IDescriptor::E_TYPE::ET_SAMPLED_IMAGE)] = MaxUITextureCount; + descriptorPoolInfo.maxSets = 1u; + descriptorPoolInfo.flags = IDescriptorPool::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT; + + m_guiDescriptorSetPool = m_device->createDescriptorPool(std::move(descriptorPoolInfo)); + assert(m_guiDescriptorSetPool); + + m_guiDescriptorSetPool->createDescriptorSets(1u, &descriptorSetLayout, &m_ui.descriptorSet); + assert(m_ui.descriptorSet); + } + } + + m_ui.manager->registerListener( + [this]() -> void { + ImGuiIO& io = ImGui::GetIO(); + + m_camera.setProjectionMatrix([&]() + { + static matrix4SIMD projection; + + projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH( + core::radians(m_cameraSetting.fov), + io.DisplaySize.x / io.DisplaySize.y, + m_cameraSetting.zNear, + m_cameraSetting.zFar); + + return projection; + }()); + + ImGui::SetNextWindowPos(ImVec2(1024, 100), ImGuiCond_Appearing); + ImGui::SetNextWindowSize(ImVec2(256, 256), ImGuiCond_Appearing); + + // create a window and insert the inspector + ImGui::SetNextWindowPos(ImVec2(10, 10), ImGuiCond_Appearing); + ImGui::SetNextWindowSize(ImVec2(320, 340), ImGuiCond_Appearing); + ImGui::Begin("Controls"); + + ImGui::SameLine(); + + ImGui::Text("Camera"); + + ImGui::SliderFloat("Move speed", &m_cameraSetting.moveSpeed, 0.1f, 10.f); + ImGui::SliderFloat("Rotate speed", &m_cameraSetting.rotateSpeed, 0.1f, 10.f); + ImGui::SliderFloat("Fov", &m_cameraSetting.fov, 20.f, 150.f); + ImGui::SliderFloat("zNear", &m_cameraSetting.zNear, 0.1f, 100.f); + ImGui::SliderFloat("zFar", &m_cameraSetting.zFar, 110.f, 10000.f); + Light m_oldLight = m_light; + int light_type = m_light.type; + ImGui::ListBox("LightType", &light_type, s_lightTypeNames, ELT_COUNT); + m_light.type = static_cast(light_type); + if (m_light.type == ELT_DIRECTIONAL) + { + ImGui::SliderFloat3("Light Direction", &m_light.direction.x, -1.f, 1.f); + } else if (m_light.type == ELT_POINT) + { + 
ImGui::SliderFloat3("Light Position", &m_light.position.x, -20.f, 20.f); + } else if (m_light.type == ELT_SPOT) + { + ImGui::SliderFloat3("Light Direction", &m_light.direction.x, -1.f, 1.f); + ImGui::SliderFloat3("Light Position", &m_light.position.x, -20.f, 20.f); + + float32_t dOuterCutoff = hlsl::degrees(acos(m_light.outerCutoff)); + if (ImGui::SliderFloat("Light Outer Cutoff", &dOuterCutoff, 0.0f, 45.0f)) + { + m_light.outerCutoff = cos(hlsl::radians(dOuterCutoff)); + } + } + ImGui::Checkbox("Use Indirect Command", &m_useIndirectCommand); + if (m_light != m_oldLight) + { + m_frameAccumulationCounter = 0; + } + + ImGui::Text("X: %f Y: %f", io.MousePos.x, io.MousePos.y); + + ImGui::End(); + } + ); + + // Set Camera + { + core::vectorSIMDf cameraPosition(0, 5, -10); + matrix4SIMD proj = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH( + core::radians(60.0f), + WIN_W / WIN_H, + 0.01f, + 500.0f + ); + m_camera = Camera(cameraPosition, core::vectorSIMDf(0, 0, 0), proj); + } + + m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H); + m_surface->recreateSwapchain(); + m_winMgr->show(m_window.get()); + m_oracle.reportBeginFrameRecord(); + m_camera.mapKeysToWASD(); + + return true; } bool updateGUIDescriptorSet() { - // texture atlas, note we don't create info & write pair for the font sampler because UI extension's is immutable and baked into DS layout - static std::array descriptorInfo; - static IGPUDescriptorSet::SWriteDescriptorSet writes[MaxUITextureCount]; - - descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL; - descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].desc = smart_refctd_ptr(m_ui.manager->getFontAtlasView()); - - for (uint32_t i = 0; i < descriptorInfo.size(); ++i) - { - writes[i].dstSet = m_ui.descriptorSet.get(); - writes[i].binding = 0u; - writes[i].arrayElement = i; - writes[i].count = 1u; - } - writes[nbl::ext::imgui::UI::FontAtlasTexId].info = descriptorInfo.data() + 
nbl::ext::imgui::UI::FontAtlasTexId; - - return m_device->updateDescriptorSets(writes, {}); + // texture atlas, note we don't create info & write pair for the font sampler because UI extension's is immutable and baked into DS layout + static std::array descriptorInfo; + static IGPUDescriptorSet::SWriteDescriptorSet writes[MaxUITextureCount]; + + descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL; + descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].desc = smart_refctd_ptr(m_ui.manager->getFontAtlasView()); + + for (uint32_t i = 0; i < descriptorInfo.size(); ++i) + { + writes[i].dstSet = m_ui.descriptorSet.get(); + writes[i].binding = 0u; + writes[i].arrayElement = i; + writes[i].count = 1u; + } + writes[nbl::ext::imgui::UI::FontAtlasTexId].info = descriptorInfo.data() + nbl::ext::imgui::UI::FontAtlasTexId; + + return m_device->updateDescriptorSets(writes, {}); } inline void workLoopBody() override { - // framesInFlight: ensuring safe execution of command buffers and acquires, `framesInFlight` only affect semaphore waits, don't use this to index your resources because it can change with swapchain recreation. - const uint32_t framesInFlight = core::min(MaxFramesInFlight, m_surface->getMaxAcquiresInFlight()); - // We block for semaphores for 2 reasons here: - // A) Resource: Can't use resource like a command buffer BEFORE previous use is finished! [MaxFramesInFlight] - // B) Acquire: Can't have more acquires in flight than a certain threshold returned by swapchain or your surface helper class. 
[MaxAcquiresInFlight] - if (m_realFrameIx >= framesInFlight) - { - const ISemaphore::SWaitInfo cbDonePending[] = - { - { - .semaphore = m_semaphore.get(), - .value = m_realFrameIx + 1 - framesInFlight - } - }; - if (m_device->blockForSemaphores(cbDonePending) != ISemaphore::WAIT_RESULT::SUCCESS) - return; - } - const auto resourceIx = m_realFrameIx % MaxFramesInFlight; - - m_api->startCapture(); - - update(); - - auto queue = getGraphicsQueue(); - auto cmdbuf = m_cmdBufs[resourceIx].get(); - - if (!keepRunning()) - return; - - cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); - cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - cmdbuf->beginDebugMarker("RaytracingPipelineApp Frame"); - - const auto viewMatrix = m_camera.getViewMatrix(); - const auto projectionMatrix = m_camera.getProjectionMatrix(); - const auto viewProjectionMatrix = m_camera.getConcatenatedMatrix(); - - core::matrix3x4SIMD modelMatrix; - modelMatrix.setTranslation(nbl::core::vectorSIMDf(0, 0, 0, 0)); - modelMatrix.setRotation(quaternion(0, 0, 0)); - - core::matrix4SIMD modelViewProjectionMatrix = core::concatenateBFollowedByA(viewProjectionMatrix, modelMatrix); - if (m_cachedModelViewProjectionMatrix != modelViewProjectionMatrix) - { - m_frameAccumulationCounter = 0; - m_cachedModelViewProjectionMatrix = modelViewProjectionMatrix; - } - core::matrix4SIMD invModelViewProjectionMatrix; - modelViewProjectionMatrix.getInverseTransform(invModelViewProjectionMatrix); - - { - IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1]; - imageBarriers[0].barrier = { - .dep = { - .srcStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT, // previous frame read from framgent shader - .srcAccessMask = ACCESS_FLAGS::SHADER_READ_BITS, - .dstStageMask = PIPELINE_STAGE_FLAGS::RAY_TRACING_SHADER_BIT, - .dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS - } - }; - imageBarriers[0].image = m_hdrImage.get(); - imageBarriers[0].subresourceRange = { - 
.aspectMask = IImage::EAF_COLOR_BIT, - .baseMipLevel = 0u, - .levelCount = 1u, - .baseArrayLayer = 0u, - .layerCount = 1u - }; - imageBarriers[0].oldLayout = m_frameAccumulationCounter == 0 ? IImage::LAYOUT::UNDEFINED : IImage::LAYOUT::READ_ONLY_OPTIMAL; - imageBarriers[0].newLayout = IImage::LAYOUT::GENERAL; - cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers }); - } - - // Trace Rays Pass - { - SPushConstants pc; - pc.light = m_light; - pc.proceduralGeomInfoBuffer = m_proceduralGeomInfoBuffer->getDeviceAddress(); - pc.triangleGeomInfoBuffer = m_triangleGeomInfoBuffer->getDeviceAddress(); - pc.frameCounter = m_frameAccumulationCounter; - const core::vector3df camPos = m_camera.getPosition().getAsVector3df(); - pc.camPos = { camPos.X, camPos.Y, camPos.Z }; - memcpy(&pc.invMVP, invModelViewProjectionMatrix.pointer(), sizeof(pc.invMVP)); - - cmdbuf->bindRayTracingPipeline(m_rayTracingPipeline.get()); - cmdbuf->setRayTracingPipelineStackSize(m_rayTracingStackSize); - cmdbuf->pushConstants(m_rayTracingPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING, 0, sizeof(SPushConstants), &pc); - cmdbuf->bindDescriptorSets(EPBP_RAY_TRACING, m_rayTracingPipeline->getLayout(), 0, 1, &m_rayTracingDs.get()); - if (m_useIndirectCommand) - { - cmdbuf->traceRaysIndirect( - SBufferBinding{ - .offset = 0, - .buffer = m_indirectBuffer, - }); - }else - { - cmdbuf->traceRays( - m_shaderBindingTable.raygenGroupRange, - m_shaderBindingTable.missGroupsRange, m_shaderBindingTable.missGroupsStride, - m_shaderBindingTable.hitGroupsRange, m_shaderBindingTable.hitGroupsStride, - m_shaderBindingTable.callableGroupsRange, m_shaderBindingTable.callableGroupsStride, - WIN_W, WIN_H, 1); - } - } - - // pipeline barrier - { - IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1]; - imageBarriers[0].barrier = { - .dep = { - .srcStageMask = PIPELINE_STAGE_FLAGS::RAY_TRACING_SHADER_BIT, - .srcAccessMask = 
ACCESS_FLAGS::SHADER_WRITE_BITS, - .dstStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, - .dstAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT - } - }; - imageBarriers[0].image = m_hdrImage.get(); - imageBarriers[0].subresourceRange = { - .aspectMask = IImage::EAF_COLOR_BIT, - .baseMipLevel = 0u, - .levelCount = 1u, - .baseArrayLayer = 0u, - .layerCount = 1u - }; - imageBarriers[0].oldLayout = IImage::LAYOUT::GENERAL; - imageBarriers[0].newLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL; - - cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers }); - } - - { + // framesInFlight: ensuring safe execution of command buffers and acquires, `framesInFlight` only affect semaphore waits, don't use this to index your resources because it can change with swapchain recreation. + const uint32_t framesInFlight = core::min(MaxFramesInFlight, m_surface->getMaxAcquiresInFlight()); + // We block for semaphores for 2 reasons here: + // A) Resource: Can't use resource like a command buffer BEFORE previous use is finished! [MaxFramesInFlight] + // B) Acquire: Can't have more acquires in flight than a certain threshold returned by swapchain or your surface helper class. 
[MaxAcquiresInFlight] + if (m_realFrameIx >= framesInFlight) + { + const ISemaphore::SWaitInfo cbDonePending[] = + { + { + .semaphore = m_semaphore.get(), + .value = m_realFrameIx + 1 - framesInFlight + } + }; + if (m_device->blockForSemaphores(cbDonePending) != ISemaphore::WAIT_RESULT::SUCCESS) + return; + } + const auto resourceIx = m_realFrameIx % MaxFramesInFlight; + + m_api->startCapture(); + + update(); + + auto queue = getGraphicsQueue(); + auto cmdbuf = m_cmdBufs[resourceIx].get(); + + if (!keepRunning()) + return; + + cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); + cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + cmdbuf->beginDebugMarker("RaytracingPipelineApp Frame"); + + const auto viewMatrix = m_camera.getViewMatrix(); + const auto projectionMatrix = m_camera.getProjectionMatrix(); + const auto viewProjectionMatrix = m_camera.getConcatenatedMatrix(); + + core::matrix3x4SIMD modelMatrix; + modelMatrix.setTranslation(nbl::core::vectorSIMDf(0, 0, 0, 0)); + modelMatrix.setRotation(quaternion(0, 0, 0)); + + core::matrix4SIMD modelViewProjectionMatrix = core::concatenateBFollowedByA(viewProjectionMatrix, modelMatrix); + if (m_cachedModelViewProjectionMatrix != modelViewProjectionMatrix) + { + m_frameAccumulationCounter = 0; + m_cachedModelViewProjectionMatrix = modelViewProjectionMatrix; + } + core::matrix4SIMD invModelViewProjectionMatrix; + modelViewProjectionMatrix.getInverseTransform(invModelViewProjectionMatrix); + + { + IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1]; + imageBarriers[0].barrier = { + .dep = { + .srcStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT, // previous frame read from framgent shader + .srcAccessMask = ACCESS_FLAGS::SHADER_READ_BITS, + .dstStageMask = PIPELINE_STAGE_FLAGS::RAY_TRACING_SHADER_BIT, + .dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS + } + }; + imageBarriers[0].image = m_hdrImage.get(); + imageBarriers[0].subresourceRange = { + 
.aspectMask = IImage::EAF_COLOR_BIT, + .baseMipLevel = 0u, + .levelCount = 1u, + .baseArrayLayer = 0u, + .layerCount = 1u + }; + imageBarriers[0].oldLayout = m_frameAccumulationCounter == 0 ? IImage::LAYOUT::UNDEFINED : IImage::LAYOUT::READ_ONLY_OPTIMAL; + imageBarriers[0].newLayout = IImage::LAYOUT::GENERAL; + cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers }); + } + + // Trace Rays Pass + { + SPushConstants pc; + pc.light = m_light; + pc.proceduralGeomInfoBuffer = m_proceduralGeomInfoBuffer->getDeviceAddress(); + pc.triangleGeomInfoBuffer = m_triangleGeomInfoBuffer->getDeviceAddress(); + pc.frameCounter = m_frameAccumulationCounter; + const core::vector3df camPos = m_camera.getPosition().getAsVector3df(); + pc.camPos = { camPos.X, camPos.Y, camPos.Z }; + memcpy(&pc.invMVP, invModelViewProjectionMatrix.pointer(), sizeof(pc.invMVP)); + + cmdbuf->bindRayTracingPipeline(m_rayTracingPipeline.get()); + cmdbuf->setRayTracingPipelineStackSize(m_rayTracingStackSize); + cmdbuf->pushConstants(m_rayTracingPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING, 0, sizeof(SPushConstants), &pc); + cmdbuf->bindDescriptorSets(EPBP_RAY_TRACING, m_rayTracingPipeline->getLayout(), 0, 1, &m_rayTracingDs.get()); + if (m_useIndirectCommand) + { + cmdbuf->traceRaysIndirect( + SBufferBinding{ + .offset = 0, + .buffer = m_indirectBuffer, + }); + }else + { + cmdbuf->traceRays( + m_shaderBindingTable.raygenGroupRange, + m_shaderBindingTable.missGroupsRange, m_shaderBindingTable.missGroupsStride, + m_shaderBindingTable.hitGroupsRange, m_shaderBindingTable.hitGroupsStride, + m_shaderBindingTable.callableGroupsRange, m_shaderBindingTable.callableGroupsStride, + WIN_W, WIN_H, 1); + } + } + + // pipeline barrier + { + IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1]; + imageBarriers[0].barrier = { + .dep = { + .srcStageMask = PIPELINE_STAGE_FLAGS::RAY_TRACING_SHADER_BIT, + .srcAccessMask = 
ACCESS_FLAGS::SHADER_WRITE_BITS, + .dstStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, + .dstAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT + } + }; + imageBarriers[0].image = m_hdrImage.get(); + imageBarriers[0].subresourceRange = { + .aspectMask = IImage::EAF_COLOR_BIT, + .baseMipLevel = 0u, + .levelCount = 1u, + .baseArrayLayer = 0u, + .layerCount = 1u + }; + imageBarriers[0].oldLayout = IImage::LAYOUT::GENERAL; + imageBarriers[0].newLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL; + + cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers }); + } + + { asset::SViewport viewport; { viewport.minDepth = 1.f; @@ -795,993 +797,999 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, VkRect2D defaultScisors[] = { {.offset = {(int32_t)viewport.x, (int32_t)viewport.y}, .extent = {(uint32_t)viewport.width, (uint32_t)viewport.height}} }; cmdbuf->setScissor(defaultScisors); - auto scRes = static_cast(m_surface->getSwapchainResources()); - const VkRect2D currentRenderArea = - { - .offset = {0,0}, - .extent = {m_window->getWidth(),m_window->getHeight()} - }; - const IGPUCommandBuffer::SClearColorValue clearColor = { .float32 = {0.f,0.f,0.f,1.f} }; - const IGPUCommandBuffer::SRenderpassBeginInfo info = - { - .framebuffer = scRes->getFramebuffer(m_currentImageAcquire.imageIndex), - .colorClearValues = &clearColor, - .depthStencilClearValues = nullptr, - .renderArea = currentRenderArea - }; - nbl::video::ISemaphore::SWaitInfo waitInfo = { .semaphore = m_semaphore.get(), .value = m_realFrameIx + 1u }; - - cmdbuf->beginRenderPass(info, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE); - - cmdbuf->bindGraphicsPipeline(m_presentPipeline.get()); - cmdbuf->bindDescriptorSets(EPBP_GRAPHICS, m_presentPipeline->getLayout(), 0, 1u, &m_presentDs.get()); - ext::FullScreenTriangle::recordDrawCall(cmdbuf); - - const auto uiParams = m_ui.manager->getCreationParameters(); - auto* uiPipeline = m_ui.manager->getPipeline(); 
- cmdbuf->bindGraphicsPipeline(uiPipeline); - cmdbuf->bindDescriptorSets(EPBP_GRAPHICS, uiPipeline->getLayout(), uiParams.resources.texturesInfo.setIx, 1u, &m_ui.descriptorSet.get()); - m_ui.manager->render(cmdbuf, waitInfo); - - cmdbuf->endRenderPass(); - - } - - cmdbuf->endDebugMarker(); - cmdbuf->end(); - - { - const IQueue::SSubmitInfo::SSemaphoreInfo rendered[] = - { - { - .semaphore = m_semaphore.get(), - .value = ++m_realFrameIx, - .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS - } - }; - { - { - const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] = - { - {.cmdbuf = cmdbuf } - }; - - const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = - { - { - .semaphore = m_currentImageAcquire.semaphore, - .value = m_currentImageAcquire.acquireCount, - .stageMask = PIPELINE_STAGE_FLAGS::NONE - } - }; - const IQueue::SSubmitInfo infos[] = - { - { - .waitSemaphores = acquired, - .commandBuffers = commandBuffers, - .signalSemaphores = rendered - } - }; - - updateGUIDescriptorSet(); - - if (queue->submit(infos) != IQueue::RESULT::SUCCESS) - m_realFrameIx--; - } - } - - m_window->setCaption("[Nabla Engine] Ray Tracing Pipeline"); - m_surface->present(m_currentImageAcquire.imageIndex, rendered); - } - m_api->endCapture(); - m_frameAccumulationCounter++; - } - - inline void update() - { - m_camera.setMoveSpeed(m_cameraSetting.moveSpeed); - m_camera.setRotateSpeed(m_cameraSetting.rotateSpeed); - - static std::chrono::microseconds previousEventTimestamp{}; - - m_inputSystem->getDefaultMouse(&m_mouse); - m_inputSystem->getDefaultKeyboard(&m_keyboard); - - auto updatePresentationTimestamp = [&]() - { - m_currentImageAcquire = m_surface->acquireNextImage(); - - m_oracle.reportEndFrameRecord(); - const auto timestamp = m_oracle.getNextPresentationTimeStamp(); - m_oracle.reportBeginFrameRecord(); - - return timestamp; - }; - - const auto nextPresentationTimestamp = updatePresentationTimestamp(); - - struct - { - std::vector mouse{}; - std::vector keyboard{}; - } 
capturedEvents; - - m_camera.beginInputProcessing(nextPresentationTimestamp); - { - m_mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void - { - m_camera.mouseProcess(events); // don't capture the events, only let camera handle them with its impl - - for (const auto& e : events) // here capture - { - if (e.timeStamp < previousEventTimestamp) - continue; - - previousEventTimestamp = e.timeStamp; - capturedEvents.mouse.emplace_back(e); - - } - }, m_logger.get()); - - m_keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void - { - m_camera.keyboardProcess(events); // don't capture the events, only let camera handle them with its impl - - for (const auto& e : events) // here capture - { - if (e.timeStamp < previousEventTimestamp) - continue; - - previousEventTimestamp = e.timeStamp; - capturedEvents.keyboard.emplace_back(e); - } - }, m_logger.get()); + auto scRes = static_cast(m_surface->getSwapchainResources()); + const VkRect2D currentRenderArea = + { + .offset = {0,0}, + .extent = {m_window->getWidth(),m_window->getHeight()} + }; + const IGPUCommandBuffer::SClearColorValue clearColor = { .float32 = {0.f,0.f,0.f,1.f} }; + const IGPUCommandBuffer::SRenderpassBeginInfo info = + { + .framebuffer = scRes->getFramebuffer(m_currentImageAcquire.imageIndex), + .colorClearValues = &clearColor, + .depthStencilClearValues = nullptr, + .renderArea = currentRenderArea + }; + nbl::video::ISemaphore::SWaitInfo waitInfo = { .semaphore = m_semaphore.get(), .value = m_realFrameIx + 1u }; + + cmdbuf->beginRenderPass(info, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE); + + cmdbuf->bindGraphicsPipeline(m_presentPipeline.get()); + cmdbuf->bindDescriptorSets(EPBP_GRAPHICS, m_presentPipeline->getLayout(), 0, 1u, &m_presentDs.get()); + ext::FullScreenTriangle::recordDrawCall(cmdbuf); + + const auto uiParams = m_ui.manager->getCreationParameters(); + auto* uiPipeline = m_ui.manager->getPipeline(); + cmdbuf->bindGraphicsPipeline(uiPipeline); + 
cmdbuf->bindDescriptorSets(EPBP_GRAPHICS, uiPipeline->getLayout(), uiParams.resources.texturesInfo.setIx, 1u, &m_ui.descriptorSet.get()); + m_ui.manager->render(cmdbuf, waitInfo); + + cmdbuf->endRenderPass(); + + } + + cmdbuf->endDebugMarker(); + cmdbuf->end(); + + { + const IQueue::SSubmitInfo::SSemaphoreInfo rendered[] = + { + { + .semaphore = m_semaphore.get(), + .value = ++m_realFrameIx, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS + } + }; + { + { + const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] = + { + {.cmdbuf = cmdbuf } + }; + + const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = + { + { + .semaphore = m_currentImageAcquire.semaphore, + .value = m_currentImageAcquire.acquireCount, + .stageMask = PIPELINE_STAGE_FLAGS::NONE + } + }; + const IQueue::SSubmitInfo infos[] = + { + { + .waitSemaphores = acquired, + .commandBuffers = commandBuffers, + .signalSemaphores = rendered + } + }; - } - m_camera.endInputProcessing(nextPresentationTimestamp); + updateGUIDescriptorSet(); - const core::SRange mouseEvents(capturedEvents.mouse.data(), capturedEvents.mouse.data() + capturedEvents.mouse.size()); - const core::SRange keyboardEvents(capturedEvents.keyboard.data(), capturedEvents.keyboard.data() + capturedEvents.keyboard.size()); - const auto cursorPosition = m_window->getCursorControl()->getPosition(); - const auto mousePosition = float32_t2(cursorPosition.x, cursorPosition.y) - float32_t2(m_window->getX(), m_window->getY()); + if (queue->submit(infos) != IQueue::RESULT::SUCCESS) + m_realFrameIx--; + } + } - const ext::imgui::UI::SUpdateParameters params = - { - .mousePosition = mousePosition, - .displaySize = { m_window->getWidth(), m_window->getHeight() }, - .mouseEvents = mouseEvents, - .keyboardEvents = keyboardEvents - }; + m_window->setCaption("[Nabla Engine] Ray Tracing Pipeline"); + m_surface->present(m_currentImageAcquire.imageIndex, rendered); + } + m_api->endCapture(); + m_frameAccumulationCounter++; + } - 
m_ui.manager->update(params); + inline void update() + { + m_camera.setMoveSpeed(m_cameraSetting.moveSpeed); + m_camera.setRotateSpeed(m_cameraSetting.rotateSpeed); + + static std::chrono::microseconds previousEventTimestamp{}; + + m_inputSystem->getDefaultMouse(&m_mouse); + m_inputSystem->getDefaultKeyboard(&m_keyboard); + + auto updatePresentationTimestamp = [&]() + { + m_currentImageAcquire = m_surface->acquireNextImage(); + + m_oracle.reportEndFrameRecord(); + const auto timestamp = m_oracle.getNextPresentationTimeStamp(); + m_oracle.reportBeginFrameRecord(); + + return timestamp; + }; + + const auto nextPresentationTimestamp = updatePresentationTimestamp(); + + struct + { + std::vector mouse{}; + std::vector keyboard{}; + } capturedEvents; + + m_camera.beginInputProcessing(nextPresentationTimestamp); + { + const auto& io = ImGui::GetIO(); + m_mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void + { + if (!io.WantCaptureMouse) + m_camera.mouseProcess(events); // don't capture the events, only let camera handle them with its impl + + for (const auto& e : events) // here capture + { + if (e.timeStamp < previousEventTimestamp) + continue; + + previousEventTimestamp = e.timeStamp; + capturedEvents.mouse.emplace_back(e); + + } + }, m_logger.get()); + + m_keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void + { + if (!io.WantCaptureKeyboard) + m_camera.keyboardProcess(events); // don't capture the events, only let camera handle them with its impl + + for (const auto& e : events) // here capture + { + if (e.timeStamp < previousEventTimestamp) + continue; + + previousEventTimestamp = e.timeStamp; + capturedEvents.keyboard.emplace_back(e); + } + }, m_logger.get()); + + } + m_camera.endInputProcessing(nextPresentationTimestamp); + + const core::SRange mouseEvents(capturedEvents.mouse.data(), capturedEvents.mouse.data() + capturedEvents.mouse.size()); + const core::SRange keyboardEvents(capturedEvents.keyboard.data(), 
capturedEvents.keyboard.data() + capturedEvents.keyboard.size()); + const auto cursorPosition = m_window->getCursorControl()->getPosition(); + const auto mousePosition = float32_t2(cursorPosition.x, cursorPosition.y) - float32_t2(m_window->getX(), m_window->getY()); + + const ext::imgui::UI::SUpdateParameters params = + { + .mousePosition = mousePosition, + .displaySize = { m_window->getWidth(), m_window->getHeight() }, + .mouseEvents = mouseEvents, + .keyboardEvents = keyboardEvents + }; + + m_ui.manager->update(params); } inline bool keepRunning() override { - if (m_surface->irrecoverable()) - return false; + if (m_surface->irrecoverable()) + return false; - return true; + return true; } inline bool onAppTerminated() override { - return device_base_t::onAppTerminated(); + return device_base_t::onAppTerminated(); } private: uint32_t getWorkgroupCount(uint32_t dim, uint32_t size) { - return (dim + size - 1) / size; + return (dim + size - 1) / size; } smart_refctd_ptr createBuffer(IGPUBuffer::SCreationParams& params) { - smart_refctd_ptr buffer; - buffer = m_device->createBuffer(std::move(params)); - auto bufReqs = buffer->getMemoryReqs(); - bufReqs.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits(); - m_device->allocate(bufReqs, buffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); + smart_refctd_ptr buffer; + buffer = m_device->createBuffer(std::move(params)); + auto bufReqs = buffer->getMemoryReqs(); + bufReqs.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits(); + m_device->allocate(bufReqs, buffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); - return buffer; + return buffer; } smart_refctd_ptr getSingleUseCommandBufferAndBegin(smart_refctd_ptr pool) { - smart_refctd_ptr cmdbuf; - if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdbuf)) - return nullptr; + smart_refctd_ptr cmdbuf; + if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdbuf)) + return 
nullptr; - cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); - cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); + cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - return cmdbuf; + return cmdbuf; } void cmdbufSubmitAndWait(smart_refctd_ptr cmdbuf, CThreadSafeQueueAdapter* queue, uint64_t startValue) { - cmdbuf->end(); - - uint64_t finishedValue = startValue + 1; - - // submit builds - { - auto completed = m_device->createSemaphore(startValue); - - std::array signals; - { - auto& signal = signals.front(); - signal.value = finishedValue; - signal.stageMask = bitflag(PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS); - signal.semaphore = completed.get(); - } - - const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[1] = { { - .cmdbuf = cmdbuf.get() - } }; - - const IQueue::SSubmitInfo infos[] = - { - { - .waitSemaphores = {}, - .commandBuffers = commandBuffers, - .signalSemaphores = signals - } - }; - - if (queue->submit(infos) != IQueue::RESULT::SUCCESS) - { - m_logger->log("Failed to submit geometry transfer upload operations!", ILogger::ELL_ERROR); - return; - } - - const ISemaphore::SWaitInfo info[] = - { { - .semaphore = completed.get(), - .value = finishedValue - } }; - - m_device->blockForSemaphores(info); - } + cmdbuf->end(); + + uint64_t finishedValue = startValue + 1; + + // submit builds + { + auto completed = m_device->createSemaphore(startValue); + + std::array signals; + { + auto& signal = signals.front(); + signal.value = finishedValue; + signal.stageMask = bitflag(PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS); + signal.semaphore = completed.get(); + } + + const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[1] = { { + .cmdbuf = cmdbuf.get() + } }; + + const IQueue::SSubmitInfo infos[] = + { + { + .waitSemaphores = {}, + .commandBuffers = commandBuffers, + .signalSemaphores = signals + } + }; + + if (queue->submit(infos) != 
IQueue::RESULT::SUCCESS) + { + m_logger->log("Failed to submit geometry transfer upload operations!", ILogger::ELL_ERROR); + return; + } + + const ISemaphore::SWaitInfo info[] = + { { + .semaphore = completed.get(), + .value = finishedValue + } }; + + m_device->blockForSemaphores(info); + } } bool createIndirectBuffer(video::CThreadSafeQueueAdapter* queue) { - const auto getBufferRangeAddress = [](const SBufferRange& range) - { - return range.buffer->getDeviceAddress() + range.offset; - }; - const auto command = TraceRaysIndirectCommand_t{ - .raygenShaderRecordAddress = getBufferRangeAddress(m_shaderBindingTable.raygenGroupRange), - .raygenShaderRecordSize = m_shaderBindingTable.raygenGroupRange.size, - .missShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.missGroupsRange), - .missShaderBindingTableSize = m_shaderBindingTable.missGroupsRange.size, - .missShaderBindingTableStride = m_shaderBindingTable.missGroupsStride, - .hitShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.hitGroupsRange), - .hitShaderBindingTableSize = m_shaderBindingTable.hitGroupsRange.size, - .hitShaderBindingTableStride = m_shaderBindingTable.hitGroupsStride, - .callableShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.callableGroupsRange), - .callableShaderBindingTableSize = m_shaderBindingTable.callableGroupsRange.size, - .callableShaderBindingTableStride = m_shaderBindingTable.callableGroupsStride, - .width = WIN_W, - .height = WIN_H, - .depth = 1, - }; - IGPUBuffer::SCreationParams params; - params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INDIRECT_BUFFER_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - params.size = sizeof(TraceRaysIndirectCommand_t); - m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), &command).move_into(m_indirectBuffer); - return true; + const auto getBufferRangeAddress = [](const SBufferRange& range) + { + return 
range.buffer->getDeviceAddress() + range.offset; + }; + const auto command = TraceRaysIndirectCommand_t{ + .raygenShaderRecordAddress = getBufferRangeAddress(m_shaderBindingTable.raygenGroupRange), + .raygenShaderRecordSize = m_shaderBindingTable.raygenGroupRange.size, + .missShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.missGroupsRange), + .missShaderBindingTableSize = m_shaderBindingTable.missGroupsRange.size, + .missShaderBindingTableStride = m_shaderBindingTable.missGroupsStride, + .hitShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.hitGroupsRange), + .hitShaderBindingTableSize = m_shaderBindingTable.hitGroupsRange.size, + .hitShaderBindingTableStride = m_shaderBindingTable.hitGroupsStride, + .callableShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.callableGroupsRange), + .callableShaderBindingTableSize = m_shaderBindingTable.callableGroupsRange.size, + .callableShaderBindingTableStride = m_shaderBindingTable.callableGroupsStride, + .width = WIN_W, + .height = WIN_H, + .depth = 1, + }; + IGPUBuffer::SCreationParams params; + params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INDIRECT_BUFFER_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + params.size = sizeof(TraceRaysIndirectCommand_t); + m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), &command).move_into(m_indirectBuffer); + return true; } - bool createGeometries(video::CThreadSafeQueueAdapter* queue, const IGeometryCreator* gc) + void calculateRayTracingStackSize(const smart_refctd_ptr& pipeline) { - auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - if (!pool) - return logFail("Couldn't create Command Pool for geometry creation!"); - - const auto defaultMaterial = Material{ - .ambient = {0.2, 0.1, 0.1}, - .diffuse = {0.8, 0.3, 0.3}, - .specular = {0.8, 0.8, 0.8}, - .shininess = 1.0f, - .alpha 
= 1.0f, - }; - - auto getTranslationMatrix = [](float32_t x, float32_t y, float32_t z) - { - core::matrix3x4SIMD transform; - transform.setTranslation(nbl::core::vectorSIMDf(x, y, z, 0)); - return transform; - }; - - core::matrix3x4SIMD planeTransform; - planeTransform.setRotation(quaternion::fromAngleAxis(core::radians(-90.0f), vector3df_SIMD{ 1, 0, 0 })); - - const auto cpuObjects = std::array{ - ReferenceObjectCpu { - .meta = {.type = OT_RECTANGLE, .name = "Plane Mesh"}, - .data = gc->createRectangleMesh(nbl::core::vector2df_SIMD(10, 10)), - .material = defaultMaterial, - .transform = planeTransform, - }, - ReferenceObjectCpu { - .meta = {.type = OT_CUBE, .name = "Cube Mesh"}, - .data = gc->createCubeMesh(nbl::core::vector3df(1, 1, 1)), - .material = defaultMaterial, - .transform = getTranslationMatrix(0, 0.5f, 0), - }, - ReferenceObjectCpu { - .meta = {.type = OT_CUBE, .name = "Cube Mesh 2"}, - .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)), - .material = Material{ - .ambient = {0.1, 0.1, 0.2}, - .diffuse = {0.2, 0.2, 0.8}, - .specular = {0.8, 0.8, 0.8}, - .shininess = 1.0f, - }, - .transform = getTranslationMatrix(-5.0f, 1.0f, 0), - }, - ReferenceObjectCpu { - .meta = {.type = OT_CUBE, .name = "Transparent Cube Mesh"}, - .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)), - .material = Material{ - .ambient = {0.1, 0.2, 0.1}, - .diffuse = {0.2, 0.8, 0.2}, - .specular = {0.8, 0.8, 0.8}, - .shininess = 1.0f, - .alpha = 0.2, - }, - .transform = getTranslationMatrix(5.0f, 1.0f, 0), - }, - }; - - struct ScratchVIBindings - { - nbl::asset::SBufferBinding vertex, index; - }; - std::array scratchBuffers; - - for (uint32_t i = 0; i < cpuObjects.size(); i++) - { - const auto& cpuObject = cpuObjects[i]; - - auto vBuffer = smart_refctd_ptr(cpuObject.data.bindings[0].buffer); // no offset - auto vUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | - 
IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - vBuffer->addUsageFlags(vUsage); - vBuffer->setContentHash(vBuffer->computeContentHash()); - - auto iBuffer = smart_refctd_ptr(cpuObject.data.indexBuffer.buffer); // no offset - auto iUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | - IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - - if (cpuObject.data.indexType != EIT_UNKNOWN) - if (iBuffer) - { - iBuffer->addUsageFlags(iUsage); - iBuffer->setContentHash(iBuffer->computeContentHash()); - } - - scratchBuffers[i] = { - .vertex = {.offset = 0, .buffer = vBuffer}, - .index = {.offset = 0, .buffer = iBuffer}, - }; - - } - - auto cmdbuf = getSingleUseCommandBufferAndBegin(pool); - cmdbuf->beginDebugMarker("Build geometry vertex and index buffers"); - - CAssetConverter::SInputs inputs = {}; - inputs.logger = m_logger.get(); - std::array tmpBuffers; - { - for (uint32_t i = 0; i < cpuObjects.size(); i++) - { - tmpBuffers[2 * i + 0] = scratchBuffers[i].vertex.buffer.get(); - tmpBuffers[2 * i + 1] = scratchBuffers[i].index.buffer.get(); - } - - std::get>(inputs.assets) = tmpBuffers; - } - - auto reservation = m_converter->reserve(inputs); - { - auto prepass = [&](const auto & references) -> bool - { - auto objects = reservation.getGPUObjects(); - uint32_t counter = {}; - for (auto& object : objects) - { - auto gpu = object.value; - auto* reference = references[counter]; - - if (reference) - { - if (!gpu) - { - m_logger->log("Failed to convert a CPU object to GPU!", ILogger::ELL_ERROR); - return false; - } - } - counter++; - } - return true; - }; - - prepass.template operator() < ICPUBuffer > (tmpBuffers); - } - - auto geomInfoBuffer = ICPUBuffer::create({ std::size(cpuObjects) * sizeof(STriangleGeomInfo) }); - STriangleGeomInfo* geomInfos = 
reinterpret_cast(geomInfoBuffer->getPointer()); - - m_gpuTriangleGeometries.reserve(std::size(cpuObjects)); - // convert - { - // not sure if need this (probably not, originally for transition img view) - auto semaphore = m_device->createSemaphore(0u); - - std::array cmdbufs = {}; - cmdbufs.front().cmdbuf = cmdbuf.get(); - - SIntendedSubmitInfo transfer = {}; - transfer.queue = queue; - transfer.scratchCommandBuffers = cmdbufs; - transfer.scratchSemaphore = { - .semaphore = semaphore.get(), - .value = 0u, - .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS - }; - - CAssetConverter::SConvertParams params = {}; - params.utilities = m_utils.get(); - params.transfer = &transfer; - - auto future = reservation.convert(params); - if (future.copy() != IQueue::RESULT::SUCCESS) - { - m_logger->log("Failed to await submission feature!", ILogger::ELL_ERROR); - return false; - } - - auto&& buffers = reservation.getGPUObjects(); - for (uint32_t i = 0; i < cpuObjects.size(); i++) - { - auto& cpuObject = cpuObjects[i]; - - m_gpuTriangleGeometries.push_back(ReferenceObjectGpu{ - .meta = cpuObject.meta, - .bindings = { - .vertex = {.offset = 0, .buffer = buffers[2 * i + 0].value }, - .index = {.offset = 0, .buffer = buffers[2 * i + 1].value }, - }, - .vertexStride = cpuObject.data.inputParams.bindings[0].stride, - .indexType = cpuObject.data.indexType, - .indexCount = cpuObject.data.indexCount, - .material = hlsl::_static_cast(cpuObject.material), - .transform = cpuObject.transform, - }); - } - - for (uint32_t i = 0; i < m_gpuTriangleGeometries.size(); i++) - { - const auto& gpuObject = m_gpuTriangleGeometries[i]; - const uint64_t vertexBufferAddress = gpuObject.bindings.vertex.buffer->getDeviceAddress(); - geomInfos[i] = { - .material = gpuObject.material, - .vertexBufferAddress = vertexBufferAddress, - .indexBufferAddress = gpuObject.useIndex() ? 
gpuObject.bindings.index.buffer->getDeviceAddress() : vertexBufferAddress, - .vertexStride = gpuObject.vertexStride, - .objType = gpuObject.meta.type, - .indexType = gpuObject.indexType, - .smoothNormals = s_smoothNormals[gpuObject.meta.type], - }; - } - } - - { - IGPUBuffer::SCreationParams params; - params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - params.size = geomInfoBuffer->getSize(); - m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), geomInfos).move_into(m_triangleGeomInfoBuffer); - } - - // intersection geometries setup - { - core::vector proceduralGeoms; - proceduralGeoms.reserve(NumberOfProceduralGeometries); - using Aabb = IGPUBottomLevelAccelerationStructure::AABB_t; - core::vector aabbs; - aabbs.reserve(NumberOfProceduralGeometries); - for (int32_t i = 0; i < NumberOfProceduralGeometries; i++) - { - const auto middle_i = NumberOfProceduralGeometries / 2.0; - SProceduralGeomInfo sphere = { - .material = hlsl::_static_cast(Material{ - .ambient = {0.1, 0.05 * i, 0.1}, - .diffuse = {0.3, 0.2 * i, 0.3}, - .specular = {0.8, 0.8, 0.8}, - .shininess = 1.0f, - }), - .center = float32_t3((i - middle_i) * 4.0, 2, 5.0), - .radius = 1, - }; - - proceduralGeoms.push_back(sphere); - const auto sphereMin = sphere.center - sphere.radius; - const auto sphereMax = sphere.center + sphere.radius; - aabbs.emplace_back( - vector3d(sphereMin.x, sphereMin.y, sphereMin.z), - vector3d(sphereMax.x, sphereMax.y, sphereMax.z)); - } - - { - IGPUBuffer::SCreationParams params; - params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - params.size = proceduralGeoms.size() * sizeof(SProceduralGeomInfo); - m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, 
std::move(params), proceduralGeoms.data()).move_into(m_proceduralGeomInfoBuffer); - } - - { - IGPUBuffer::SCreationParams params; - params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT; - params.size = aabbs.size() * sizeof(Aabb); - m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), aabbs.data()).move_into(m_proceduralAabbBuffer); - } - } - - return true; + const auto raygenStackSize = pipeline->getRaygenStackSize(); + auto getMaxSize = [&](auto ranges, auto valProj) -> uint16_t + { + auto maxValue = 0; + for (const auto& val : ranges) + { + maxValue = std::max(maxValue, std::invoke(valProj, val)); + } + return maxValue; + }; + + const auto closestHitStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::closestHit); + const auto anyHitStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::anyHit); + const auto intersectionStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::intersection); + const auto missStackMax = getMaxSize(pipeline->getMissStackSizes(), std::identity{}); + const auto callableStackMax = getMaxSize(pipeline->getCallableStackSizes(), std::identity{}); + auto firstDepthStackSizeMax = std::max(closestHitStackMax, missStackMax); + firstDepthStackSizeMax = std::max(firstDepthStackSizeMax, intersectionStackMax + anyHitStackMax); + m_rayTracingStackSize = raygenStackSize + std::max(firstDepthStackSizeMax, callableStackMax); } - void calculateRayTracingStackSize(const smart_refctd_ptr& pipeline) + bool createShaderBindingTable(video::CThreadSafeQueueAdapter* queue, const smart_refctd_ptr& pipeline) { - const auto raygenStackSize = pipeline->getRaygenStackSize(); - auto getMaxSize = [&](auto ranges, auto valProj) -> uint16_t - { - auto maxValue = 0; - for (const auto& val 
: ranges) - { - maxValue = std::max(maxValue, std::invoke(valProj, val)); - } - return maxValue; - }; - - const auto closestHitStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::closestHit); - const auto anyHitStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::anyHit); - const auto intersectionStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::intersection); - const auto missStackMax = getMaxSize(pipeline->getMissStackSizes(), std::identity{}); - const auto callableStackMax = getMaxSize(pipeline->getCallableStackSizes(), std::identity{}); - auto firstDepthStackSizeMax = std::max(closestHitStackMax, missStackMax); - firstDepthStackSizeMax = std::max(firstDepthStackSizeMax, intersectionStackMax + anyHitStackMax); - m_rayTracingStackSize = raygenStackSize + std::max(firstDepthStackSizeMax, callableStackMax); + const auto& limits = m_device->getPhysicalDevice()->getLimits(); + const auto handleSize = SPhysicalDeviceLimits::ShaderGroupHandleSize; + const auto handleSizeAligned = nbl::core::alignUp(handleSize, limits.shaderGroupHandleAlignment); + + auto& raygenRange = m_shaderBindingTable.raygenGroupRange; + + auto& hitRange = m_shaderBindingTable.hitGroupsRange; + const auto hitHandles = pipeline->getHitHandles(); + + auto& missRange = m_shaderBindingTable.missGroupsRange; + const auto missHandles = pipeline->getMissHandles(); + + auto& callableRange = m_shaderBindingTable.callableGroupsRange; + const auto callableHandles = pipeline->getCallableHandles(); + + raygenRange = { + .offset = 0, + .size = core::alignUp(handleSizeAligned, limits.shaderGroupBaseAlignment) + }; + + missRange = { + .offset = raygenRange.size, + .size = core::alignUp(missHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment), + }; + m_shaderBindingTable.missGroupsStride = handleSizeAligned; + + hitRange = { + .offset = missRange.offset + 
missRange.size, + .size = core::alignUp(hitHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment), + }; + m_shaderBindingTable.hitGroupsStride = handleSizeAligned; + + callableRange = { + .offset = hitRange.offset + hitRange.size, + .size = core::alignUp(callableHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment), + }; + m_shaderBindingTable.callableGroupsStride = handleSizeAligned; + + const auto bufferSize = raygenRange.size + missRange.size + hitRange.size + callableRange.size; + + ICPUBuffer::SCreationParams cpuBufferParams; + cpuBufferParams.size = bufferSize; + auto cpuBuffer = ICPUBuffer::create(std::move(cpuBufferParams)); + uint8_t* pData = reinterpret_cast(cpuBuffer->getPointer()); + + // copy raygen region + memcpy(pData, &pipeline->getRaygen(), handleSize); + + // copy miss region + uint8_t* pMissData = pData + missRange.offset; + for (const auto& handle : missHandles) + { + memcpy(pMissData, &handle, handleSize); + pMissData += m_shaderBindingTable.missGroupsStride; + } + + // copy hit region + uint8_t* pHitData = pData + hitRange.offset; + for (const auto& handle : hitHandles) + { + memcpy(pHitData, &handle, handleSize); + pHitData += m_shaderBindingTable.hitGroupsStride; + } + + // copy callable region + uint8_t* pCallableData = pData + callableRange.offset; + for (const auto& handle : callableHandles) + { + memcpy(pCallableData, &handle, handleSize); + pCallableData += m_shaderBindingTable.callableGroupsStride; + } + + { + IGPUBuffer::SCreationParams params; + params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_SHADER_BINDING_TABLE_BIT; + params.size = bufferSize; + m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), pData).move_into(raygenRange.buffer); + missRange.buffer = core::smart_refctd_ptr(raygenRange.buffer); + hitRange.buffer = 
core::smart_refctd_ptr(raygenRange.buffer); + callableRange.buffer = core::smart_refctd_ptr(raygenRange.buffer); + } + + return true; } - bool createShaderBindingTable(video::CThreadSafeQueueAdapter* queue, const smart_refctd_ptr& pipeline) +#ifdef TEST_ASSET_CONV_AS +#else + bool createGeometries(video::CThreadSafeQueueAdapter* queue, const IGeometryCreator* gc) { - const auto& limits = m_device->getPhysicalDevice()->getLimits(); - const auto handleSize = SPhysicalDeviceLimits::ShaderGroupHandleSize; - const auto handleSizeAligned = nbl::core::alignUp(handleSize, limits.shaderGroupHandleAlignment); - - auto& raygenRange = m_shaderBindingTable.raygenGroupRange; - - auto& hitRange = m_shaderBindingTable.hitGroupsRange; - const auto hitHandles = pipeline->getHitHandles(); - - auto& missRange = m_shaderBindingTable.missGroupsRange; - const auto missHandles = pipeline->getMissHandles(); - - auto& callableRange = m_shaderBindingTable.callableGroupsRange; - const auto callableHandles = pipeline->getCallableHandles(); - - raygenRange = { - .offset = 0, - .size = core::alignUp(handleSizeAligned, limits.shaderGroupBaseAlignment) - }; - - missRange = { - .offset = raygenRange.size, - .size = core::alignUp(missHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment), - }; - m_shaderBindingTable.missGroupsStride = handleSizeAligned; - - hitRange = { - .offset = missRange.offset + missRange.size, - .size = core::alignUp(hitHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment), - }; - m_shaderBindingTable.hitGroupsStride = handleSizeAligned; - - callableRange = { - .offset = hitRange.offset + hitRange.size, - .size = core::alignUp(callableHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment), - }; - m_shaderBindingTable.callableGroupsStride = handleSizeAligned; - - const auto bufferSize = raygenRange.size + missRange.size + hitRange.size + callableRange.size; - - ICPUBuffer::SCreationParams cpuBufferParams; - cpuBufferParams.size = 
bufferSize; - auto cpuBuffer = ICPUBuffer::create(std::move(cpuBufferParams)); - uint8_t* pData = reinterpret_cast(cpuBuffer->getPointer()); - - // copy raygen region - memcpy(pData, &pipeline->getRaygen(), handleSize); - - // copy miss region - uint8_t* pMissData = pData + missRange.offset; - for (const auto& handle : missHandles) - { - memcpy(pMissData, &handle, handleSize); - pMissData += m_shaderBindingTable.missGroupsStride; - } - - // copy hit region - uint8_t* pHitData = pData + hitRange.offset; - for (const auto& handle : hitHandles) - { - memcpy(pHitData, &handle, handleSize); - pHitData += m_shaderBindingTable.hitGroupsStride; - } - - // copy callable region - uint8_t* pCallableData = pData + callableRange.offset; - for (const auto& handle : callableHandles) - { - memcpy(pCallableData, &handle, handleSize); - pCallableData += m_shaderBindingTable.callableGroupsStride; - } - - { - IGPUBuffer::SCreationParams params; - params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_SHADER_BINDING_TABLE_BIT; - params.size = bufferSize; - m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), pData).move_into(raygenRange.buffer); - missRange.buffer = core::smart_refctd_ptr(raygenRange.buffer); - hitRange.buffer = core::smart_refctd_ptr(raygenRange.buffer); - callableRange.buffer = core::smart_refctd_ptr(raygenRange.buffer); - } - - return true; + auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + if (!pool) + return logFail("Couldn't create Command Pool for geometry creation!"); + + const auto defaultMaterial = Material{ + .ambient = {0.2, 0.1, 0.1}, + .diffuse = {0.8, 0.3, 0.3}, + .specular = {0.8, 0.8, 0.8}, + .shininess = 1.0f, + .alpha = 1.0f, + }; + + auto getTranslationMatrix = [](float32_t x, float32_t y, float32_t z) + { + core::matrix3x4SIMD 
transform; + transform.setTranslation(nbl::core::vectorSIMDf(x, y, z, 0)); + return transform; + }; + + core::matrix3x4SIMD planeTransform; + planeTransform.setRotation(quaternion::fromAngleAxis(core::radians(-90.0f), vector3df_SIMD{ 1, 0, 0 })); + + const auto cpuObjects = std::array{ + ReferenceObjectCpu { + .meta = {.type = OT_RECTANGLE, .name = "Plane Mesh"}, + .data = gc->createRectangleMesh(nbl::core::vector2df_SIMD(10, 10)), + .material = defaultMaterial, + .transform = planeTransform, + }, + ReferenceObjectCpu { + .meta = {.type = OT_CUBE, .name = "Cube Mesh"}, + .data = gc->createCubeMesh(nbl::core::vector3df(1, 1, 1)), + .material = defaultMaterial, + .transform = getTranslationMatrix(0, 0.5f, 0), + }, + ReferenceObjectCpu { + .meta = {.type = OT_CUBE, .name = "Cube Mesh 2"}, + .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)), + .material = Material{ + .ambient = {0.1, 0.1, 0.2}, + .diffuse = {0.2, 0.2, 0.8}, + .specular = {0.8, 0.8, 0.8}, + .shininess = 1.0f, + }, + .transform = getTranslationMatrix(-5.0f, 1.0f, 0), + }, + ReferenceObjectCpu { + .meta = {.type = OT_CUBE, .name = "Transparent Cube Mesh"}, + .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)), + .material = Material{ + .ambient = {0.1, 0.2, 0.1}, + .diffuse = {0.2, 0.8, 0.2}, + .specular = {0.8, 0.8, 0.8}, + .shininess = 1.0f, + .alpha = 0.2, + }, + .transform = getTranslationMatrix(5.0f, 1.0f, 0), + }, + }; + + struct ScratchVIBindings + { + nbl::asset::SBufferBinding vertex, index; + }; + std::array scratchBuffers; + + for (uint32_t i = 0; i < cpuObjects.size(); i++) + { + const auto& cpuObject = cpuObjects[i]; + + auto vBuffer = smart_refctd_ptr(cpuObject.data.bindings[0].buffer); // no offset + auto vUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | + IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + 
vBuffer->addUsageFlags(vUsage); + vBuffer->setContentHash(vBuffer->computeContentHash()); + + auto iBuffer = smart_refctd_ptr(cpuObject.data.indexBuffer.buffer); // no offset + auto iUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | + IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + + if (cpuObject.data.indexType != EIT_UNKNOWN) + if (iBuffer) + { + iBuffer->addUsageFlags(iUsage); + iBuffer->setContentHash(iBuffer->computeContentHash()); + } + + scratchBuffers[i] = { + .vertex = {.offset = 0, .buffer = vBuffer}, + .index = {.offset = 0, .buffer = iBuffer}, + }; + + } + + auto cmdbuf = getSingleUseCommandBufferAndBegin(pool); + cmdbuf->beginDebugMarker("Build geometry vertex and index buffers"); + + CAssetConverter::SInputs inputs = {}; + inputs.logger = m_logger.get(); + std::array tmpBuffers; + { + for (uint32_t i = 0; i < cpuObjects.size(); i++) + { + tmpBuffers[2 * i + 0] = scratchBuffers[i].vertex.buffer.get(); + tmpBuffers[2 * i + 1] = scratchBuffers[i].index.buffer.get(); + } + + std::get>(inputs.assets) = tmpBuffers; + } + + auto reservation = m_converter->reserve(inputs); + { + auto prepass = [&](const auto & references) -> bool + { + auto objects = reservation.getGPUObjects(); + uint32_t counter = {}; + for (auto& object : objects) + { + auto gpu = object.value; + auto* reference = references[counter]; + + if (reference) + { + if (!gpu) + { + m_logger->log("Failed to convert a CPU object to GPU!", ILogger::ELL_ERROR); + return false; + } + } + counter++; + } + return true; + }; + + prepass.template operator() < ICPUBuffer > (tmpBuffers); + } + + auto geomInfoBuffer = ICPUBuffer::create({ std::size(cpuObjects) * sizeof(STriangleGeomInfo) }); + STriangleGeomInfo* geomInfos = reinterpret_cast(geomInfoBuffer->getPointer()); + + m_gpuTriangleGeometries.reserve(std::size(cpuObjects)); + // convert + { + // not sure if 
need this (probably not, originally for transition img view) + auto semaphore = m_device->createSemaphore(0u); + + std::array cmdbufs = {}; + cmdbufs.front().cmdbuf = cmdbuf.get(); + + SIntendedSubmitInfo transfer = {}; + transfer.queue = queue; + transfer.scratchCommandBuffers = cmdbufs; + transfer.scratchSemaphore = { + .semaphore = semaphore.get(), + .value = 0u, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS + }; + + CAssetConverter::SConvertParams params = {}; + params.utilities = m_utils.get(); + params.transfer = &transfer; + + auto future = reservation.convert(params); + if (future.copy() != IQueue::RESULT::SUCCESS) + { + m_logger->log("Failed to await submission feature!", ILogger::ELL_ERROR); + return false; + } + + auto&& buffers = reservation.getGPUObjects(); + for (uint32_t i = 0; i < cpuObjects.size(); i++) + { + auto& cpuObject = cpuObjects[i]; + + m_gpuTriangleGeometries.push_back(ReferenceObjectGpu{ + .meta = cpuObject.meta, + .bindings = { + .vertex = {.offset = 0, .buffer = buffers[2 * i + 0].value }, + .index = {.offset = 0, .buffer = buffers[2 * i + 1].value }, + }, + .vertexStride = cpuObject.data.inputParams.bindings[0].stride, + .indexType = cpuObject.data.indexType, + .indexCount = cpuObject.data.indexCount, + .material = hlsl::_static_cast(cpuObject.material), + .transform = cpuObject.transform, + }); + } + + for (uint32_t i = 0; i < m_gpuTriangleGeometries.size(); i++) + { + const auto& gpuObject = m_gpuTriangleGeometries[i]; + const uint64_t vertexBufferAddress = gpuObject.bindings.vertex.buffer->getDeviceAddress(); + geomInfos[i] = { + .material = gpuObject.material, + .vertexBufferAddress = vertexBufferAddress, + .indexBufferAddress = gpuObject.useIndex() ? 
gpuObject.bindings.index.buffer->getDeviceAddress() : vertexBufferAddress, + .vertexStride = gpuObject.vertexStride, + .objType = gpuObject.meta.type, + .indexType = gpuObject.indexType, + .smoothNormals = s_smoothNormals[gpuObject.meta.type], + }; + } + } + + { + IGPUBuffer::SCreationParams params; + params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + params.size = geomInfoBuffer->getSize(); + m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), geomInfos).move_into(m_triangleGeomInfoBuffer); + } + + // intersection geometries setup + { + core::vector proceduralGeoms; + proceduralGeoms.reserve(NumberOfProceduralGeometries); + using Aabb = IGPUBottomLevelAccelerationStructure::AABB_t; + core::vector aabbs; + aabbs.reserve(NumberOfProceduralGeometries); + for (int32_t i = 0; i < NumberOfProceduralGeometries; i++) + { + const auto middle_i = NumberOfProceduralGeometries / 2.0; + SProceduralGeomInfo sphere = { + .material = hlsl::_static_cast(Material{ + .ambient = {0.1, 0.05 * i, 0.1}, + .diffuse = {0.3, 0.2 * i, 0.3}, + .specular = {0.8, 0.8, 0.8}, + .shininess = 1.0f, + }), + .center = float32_t3((i - middle_i) * 4.0, 2, 5.0), + .radius = 1, + }; + + proceduralGeoms.push_back(sphere); + const auto sphereMin = sphere.center - sphere.radius; + const auto sphereMax = sphere.center + sphere.radius; + aabbs.emplace_back( + vector3d(sphereMin.x, sphereMin.y, sphereMin.z), + vector3d(sphereMax.x, sphereMax.y, sphereMax.z)); + } + + { + IGPUBuffer::SCreationParams params; + params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + params.size = proceduralGeoms.size() * sizeof(SProceduralGeomInfo); + m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, 
std::move(params), proceduralGeoms.data()).move_into(m_proceduralGeomInfoBuffer); + } + + { + IGPUBuffer::SCreationParams params; + params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT; + params.size = aabbs.size() * sizeof(Aabb); + m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), aabbs.data()).move_into(m_proceduralAabbBuffer); + } + } + + return true; } bool createAccelerationStructures(video::CThreadSafeQueueAdapter* queue) { - // plus 1 blas for procedural geometry contains {{var::NumberOfProcedural}} - // spheres. Each sphere is a primitive instead one instance or geometry - const auto blasCount = m_gpuTriangleGeometries.size() + 1; - const auto proceduralBlasIdx = m_gpuTriangleGeometries.size(); + // plus 1 blas for procedural geometry contains {{var::NumberOfProcedural}} + // spheres. Each sphere is a primitive instead one instance or geometry + const auto blasCount = m_gpuTriangleGeometries.size() + 1; + const auto proceduralBlasIdx = m_gpuTriangleGeometries.size(); - IQueryPool::SCreationParams qParams{ .queryCount = static_cast(blasCount), .queryType = IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE }; - smart_refctd_ptr queryPool = m_device->createQueryPool(std::move(qParams)); + IQueryPool::SCreationParams qParams{ .queryCount = static_cast(blasCount), .queryType = IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE }; + smart_refctd_ptr queryPool = m_device->createQueryPool(std::move(qParams)); - auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT); - if (!pool) - return logFail("Couldn't create Command Pool for blas/tlas creation!"); + auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | 
IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT); + if (!pool) + return logFail("Couldn't create Command Pool for blas/tlas creation!"); - m_api->startCapture(); + m_api->startCapture(); #ifdef TRY_BUILD_FOR_NGFX // NSight is "debugger-challenged" it can't capture anything not happenning "during a frame", so we need to trick it - m_currentImageAcquire = m_surface->acquireNextImage(); - { - const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { { - .semaphore = m_currentImageAcquire.semaphore, - .value = m_currentImageAcquire.acquireCount, - .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS - } }; - m_surface->present(m_currentImageAcquire.imageIndex, acquired); - } - m_currentImageAcquire = m_surface->acquireNextImage(); + m_currentImageAcquire = m_surface->acquireNextImage(); + { + const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { { + .semaphore = m_currentImageAcquire.semaphore, + .value = m_currentImageAcquire.acquireCount, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS + } }; + m_surface->present(m_currentImageAcquire.imageIndex, acquired); + } + m_currentImageAcquire = m_surface->acquireNextImage(); #endif - size_t totalScratchSize = 0; - const auto scratchOffsetAlignment = m_device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment; - - // build bottom level ASes - { - core::vector primitiveCounts(blasCount); - core::vector> triangles(m_gpuTriangleGeometries.size()); - core::vector scratchSizes(blasCount); - IGPUBottomLevelAccelerationStructure::AABBs aabbs; - - auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT; - if (m_physicalDevice->getProperties().limits.rayTracingPositionFetch) - blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_DATA_ACCESS; - - IGPUBottomLevelAccelerationStructure::DeviceBuildInfo initBuildInfo; - initBuildInfo.buildFlags = blasFlags; - 
initBuildInfo.geometryCount = 1; // only 1 geometry object per blas - initBuildInfo.srcAS = nullptr; - initBuildInfo.dstAS = nullptr; - initBuildInfo.scratch = {}; - - auto blasBuildInfos = core::vector(blasCount, initBuildInfo); - - m_gpuBlasList.resize(blasCount); - // setup blas info for triangle geometries - for (uint32_t i = 0; i < blasCount; i++) - { - const auto isProcedural = i == proceduralBlasIdx; - if (isProcedural) - { - aabbs.data.buffer = smart_refctd_ptr(m_proceduralAabbBuffer); - aabbs.data.offset = 0; - aabbs.stride = sizeof(IGPUBottomLevelAccelerationStructure::AABB_t); - aabbs.geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; // only allow opaque for now - - primitiveCounts[proceduralBlasIdx] = NumberOfProceduralGeometries; - blasBuildInfos[proceduralBlasIdx].aabbs = &aabbs; - blasBuildInfos[proceduralBlasIdx].buildFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT; - } else - { - const auto& gpuObject = m_gpuTriangleGeometries[i]; - - const uint32_t vertexStride = gpuObject.vertexStride; - const uint32_t numVertices = gpuObject.bindings.vertex.buffer->getSize() / vertexStride; - if (gpuObject.useIndex()) - primitiveCounts[i] = gpuObject.indexCount / 3; - else - primitiveCounts[i] = numVertices / 3; - - triangles[i].vertexData[0] = gpuObject.bindings.vertex; - triangles[i].indexData = gpuObject.useIndex() ? gpuObject.bindings.index : gpuObject.bindings.vertex; - triangles[i].maxVertex = numVertices - 1; - triangles[i].vertexStride = vertexStride; - triangles[i].vertexFormat = EF_R32G32B32_SFLOAT; - triangles[i].indexType = gpuObject.indexType; - triangles[i].geometryFlags = gpuObject.material.isTransparent() ? 
- IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::NO_DUPLICATE_ANY_HIT_INVOCATION_BIT : - IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; - - blasBuildInfos[i].triangles = &triangles[i]; - } - ILogicalDevice::AccelerationStructureBuildSizes buildSizes; - { - const uint32_t maxPrimCount[1] = { primitiveCounts[i] }; - if (isProcedural) - { - const auto* aabbData = &aabbs; - buildSizes = m_device->getAccelerationStructureBuildSizes(blasBuildInfos[i].buildFlags, false, std::span{ aabbData, 1}, maxPrimCount); - } - else - { - const auto* trianglesData = triangles.data(); - buildSizes = m_device->getAccelerationStructureBuildSizes(blasBuildInfos[i].buildFlags, false, std::span{trianglesData,1}, maxPrimCount); - } - if (!buildSizes) - return logFail("Failed to get BLAS build sizes"); - } - - scratchSizes[i] = buildSizes.buildScratchSize; - totalScratchSize = core::alignUp(totalScratchSize, scratchOffsetAlignment); - totalScratchSize += buildSizes.buildScratchSize; - - { - IGPUBuffer::SCreationParams params; - params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT; - params.size = buildSizes.accelerationStructureSize; - smart_refctd_ptr asBuffer = createBuffer(params); - - IGPUBottomLevelAccelerationStructure::SCreationParams blasParams; - blasParams.bufferRange.buffer = asBuffer; - blasParams.bufferRange.offset = 0u; - blasParams.bufferRange.size = buildSizes.accelerationStructureSize; - blasParams.flags = IGPUBottomLevelAccelerationStructure::SCreationParams::FLAGS::NONE; - m_gpuBlasList[i] = m_device->createBottomLevelAccelerationStructure(std::move(blasParams)); - if (!m_gpuBlasList[i]) - return logFail("Could not create BLAS"); - } - } - - - auto cmdbufBlas = getSingleUseCommandBufferAndBegin(pool); - cmdbufBlas->beginDebugMarker("Build BLAS"); - - cmdbufBlas->resetQueryPool(queryPool.get(), 0, blasCount); - - smart_refctd_ptr scratchBuffer; - { - IGPUBuffer::SCreationParams 
params; - params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT; - params.size = totalScratchSize; - scratchBuffer = createBuffer(params); - } - - core::vector buildRangeInfos(blasCount); - core::vector pRangeInfos(blasCount); - for (uint32_t i = 0; i < blasCount; i++) - { - blasBuildInfos[i].dstAS = m_gpuBlasList[i].get(); - blasBuildInfos[i].scratch.buffer = scratchBuffer; - if (i == 0) - { - blasBuildInfos[i].scratch.offset = 0u; - } else - { - const auto unalignedOffset = blasBuildInfos[i - 1].scratch.offset + scratchSizes[i - 1]; - blasBuildInfos[i].scratch.offset = core::alignUp(unalignedOffset, scratchOffsetAlignment); - } - - buildRangeInfos[i].primitiveCount = primitiveCounts[i]; - buildRangeInfos[i].primitiveByteOffset = 0u; - buildRangeInfos[i].firstVertex = 0u; - buildRangeInfos[i].transformByteOffset = 0u; - - pRangeInfos[i] = &buildRangeInfos[i]; - } - - if (!cmdbufBlas->buildAccelerationStructures(std::span(blasBuildInfos), pRangeInfos.data())) - return logFail("Failed to build BLAS"); - - { - SMemoryBarrier memBarrier; - memBarrier.srcStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT; - memBarrier.srcAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT; - memBarrier.dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT; - memBarrier.dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_READ_BIT; - cmdbufBlas->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} }); - } - - - core::vector ases(blasCount); - for (uint32_t i = 0; i < blasCount; i++) - ases[i] = m_gpuBlasList[i].get(); - if (!cmdbufBlas->writeAccelerationStructureProperties(std::span(ases), IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE, - queryPool.get(), 0)) - return logFail("Failed to write acceleration structure properties!"); - - cmdbufBlas->endDebugMarker(); - cmdbufSubmitAndWait(cmdbufBlas, queue, 39); - } - - auto cmdbufCompact = 
getSingleUseCommandBufferAndBegin(pool); - cmdbufCompact->beginDebugMarker("Compact BLAS"); - - // compact blas - { - core::vector asSizes(blasCount); - if (!m_device->getQueryPoolResults(queryPool.get(), 0, blasCount, asSizes.data(), sizeof(size_t), bitflag(IQueryPool::WAIT_BIT) | IQueryPool::_64_BIT)) - return logFail("Could not get query pool results for AS sizes"); - - core::vector> cleanupBlas(blasCount); - for (uint32_t i = 0; i < blasCount; i++) - { - if (asSizes[i] == 0) continue; - cleanupBlas[i] = m_gpuBlasList[i]; - { - IGPUBuffer::SCreationParams params; - params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT; - params.size = asSizes[i]; - smart_refctd_ptr asBuffer = createBuffer(params); - - IGPUBottomLevelAccelerationStructure::SCreationParams blasParams; - blasParams.bufferRange.buffer = asBuffer; - blasParams.bufferRange.offset = 0u; - blasParams.bufferRange.size = asSizes[i]; - blasParams.flags = IGPUBottomLevelAccelerationStructure::SCreationParams::FLAGS::NONE; - m_gpuBlasList[i] = m_device->createBottomLevelAccelerationStructure(std::move(blasParams)); - if (!m_gpuBlasList[i]) - return logFail("Could not create compacted BLAS"); - } - - IGPUBottomLevelAccelerationStructure::CopyInfo copyInfo; - copyInfo.src = cleanupBlas[i].get(); - copyInfo.dst = m_gpuBlasList[i].get(); - copyInfo.mode = IGPUBottomLevelAccelerationStructure::COPY_MODE::COMPACT; - if (!cmdbufCompact->copyAccelerationStructure(copyInfo)) - return logFail("Failed to copy AS to compact"); - } - } - - cmdbufCompact->endDebugMarker(); - cmdbufSubmitAndWait(cmdbufCompact, queue, 40); - - auto cmdbufTlas = getSingleUseCommandBufferAndBegin(pool); - cmdbufTlas->beginDebugMarker("Build TLAS"); - - // build top level AS - { - const uint32_t instancesCount = blasCount; - core::vector instances(instancesCount); - for (uint32_t i = 0; i < instancesCount; i++) - { - const auto isProceduralInstance = i == proceduralBlasIdx; - 
instances[i].base.blas.deviceAddress = m_gpuBlasList[i]->getReferenceForDeviceOperations().deviceAddress; - instances[i].base.mask = 0xFF; - instances[i].base.instanceCustomIndex = i; - instances[i].base.instanceShaderBindingTableRecordOffset = isProceduralInstance ? 2 : 0; - instances[i].base.flags = static_cast(IGPUTopLevelAccelerationStructure::INSTANCE_FLAGS::TRIANGLE_FACING_CULL_DISABLE_BIT); - instances[i].transform = isProceduralInstance ? matrix3x4SIMD() : m_gpuTriangleGeometries[i].transform; - } - - { - size_t bufSize = instancesCount * sizeof(IGPUTopLevelAccelerationStructure::DeviceStaticInstance); - IGPUBuffer::SCreationParams params; - params.usage = bitflag(IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT | - IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - params.size = bufSize; - m_instanceBuffer = createBuffer(params); - - SBufferRange range = { .offset = 0u, .size = bufSize, .buffer = m_instanceBuffer }; - cmdbufTlas->updateBuffer(range, instances.data()); - } - - // make sure instances upload complete first - { - SMemoryBarrier memBarrier; - memBarrier.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS; - memBarrier.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT; - memBarrier.dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT; - memBarrier.dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT; - cmdbufTlas->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} }); - } - - auto tlasFlags = bitflag(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT); - - IGPUTopLevelAccelerationStructure::DeviceBuildInfo tlasBuildInfo; - tlasBuildInfo.buildFlags = tlasFlags; - tlasBuildInfo.srcAS = nullptr; - tlasBuildInfo.dstAS = nullptr; - tlasBuildInfo.instanceData.buffer = m_instanceBuffer; - tlasBuildInfo.instanceData.offset = 0u; - tlasBuildInfo.scratch = 
{}; - - auto buildSizes = m_device->getAccelerationStructureBuildSizes(tlasFlags, false, instancesCount); - if (!buildSizes) - return logFail("Failed to get TLAS build sizes"); - - { - IGPUBuffer::SCreationParams params; - params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT; - params.size = buildSizes.accelerationStructureSize; - smart_refctd_ptr asBuffer = createBuffer(params); - - IGPUTopLevelAccelerationStructure::SCreationParams tlasParams; - tlasParams.bufferRange.buffer = asBuffer; - tlasParams.bufferRange.offset = 0u; - tlasParams.bufferRange.size = buildSizes.accelerationStructureSize; - tlasParams.flags = IGPUTopLevelAccelerationStructure::SCreationParams::FLAGS::NONE; - m_gpuTlas = m_device->createTopLevelAccelerationStructure(std::move(tlasParams)); - if (!m_gpuTlas) - return logFail("Could not create TLAS"); - } - - smart_refctd_ptr scratchBuffer; - { - IGPUBuffer::SCreationParams params; - params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT; - params.size = buildSizes.buildScratchSize; - scratchBuffer = createBuffer(params); - } - - tlasBuildInfo.dstAS = m_gpuTlas.get(); - tlasBuildInfo.scratch.buffer = scratchBuffer; - tlasBuildInfo.scratch.offset = 0u; - - IGPUTopLevelAccelerationStructure::BuildRangeInfo buildRangeInfo[1u]; - buildRangeInfo[0].instanceCount = instancesCount; - buildRangeInfo[0].instanceByteOffset = 0u; - IGPUTopLevelAccelerationStructure::BuildRangeInfo* pRangeInfos; - pRangeInfos = &buildRangeInfo[0]; - - if (!cmdbufTlas->buildAccelerationStructures({ &tlasBuildInfo, 1 }, pRangeInfos)) - return logFail("Failed to build TLAS"); - } - - cmdbufTlas->endDebugMarker(); - cmdbufSubmitAndWait(cmdbufTlas, queue, 45); + size_t totalScratchSize = 0; + const auto scratchOffsetAlignment = m_device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment; + + // build bottom level ASes + { + core::vector 
primitiveCounts(blasCount); + core::vector> triangles(m_gpuTriangleGeometries.size()); + core::vector scratchSizes(blasCount); + IGPUBottomLevelAccelerationStructure::AABBs aabbs; + + auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT; + if (m_physicalDevice->getProperties().limits.rayTracingPositionFetch) + blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_DATA_ACCESS; + + IGPUBottomLevelAccelerationStructure::DeviceBuildInfo initBuildInfo; + initBuildInfo.buildFlags = blasFlags; + initBuildInfo.geometryCount = 1; // only 1 geometry object per blas + initBuildInfo.srcAS = nullptr; + initBuildInfo.dstAS = nullptr; + initBuildInfo.scratch = {}; + + auto blasBuildInfos = core::vector(blasCount, initBuildInfo); + + m_gpuBlasList.resize(blasCount); + // setup blas info for triangle geometries + for (uint32_t i = 0; i < blasCount; i++) + { + const auto isProcedural = i == proceduralBlasIdx; + if (isProcedural) + { + aabbs.data.buffer = smart_refctd_ptr(m_proceduralAabbBuffer); + aabbs.data.offset = 0; + aabbs.stride = sizeof(IGPUBottomLevelAccelerationStructure::AABB_t); + aabbs.geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; // only allow opaque for now + + primitiveCounts[proceduralBlasIdx] = NumberOfProceduralGeometries; + blasBuildInfos[proceduralBlasIdx].aabbs = &aabbs; + blasBuildInfos[proceduralBlasIdx].buildFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT; + } else + { + const auto& gpuObject = m_gpuTriangleGeometries[i]; + + const uint32_t vertexStride = gpuObject.vertexStride; + const uint32_t numVertices = gpuObject.bindings.vertex.buffer->getSize() / vertexStride; + if (gpuObject.useIndex()) + primitiveCounts[i] = gpuObject.indexCount / 3; + else + primitiveCounts[i] = numVertices / 3; + + triangles[i].vertexData[0] = gpuObject.bindings.vertex; + 
triangles[i].indexData = gpuObject.useIndex() ? gpuObject.bindings.index : gpuObject.bindings.vertex; + triangles[i].maxVertex = numVertices - 1; + triangles[i].vertexStride = vertexStride; + triangles[i].vertexFormat = EF_R32G32B32_SFLOAT; + triangles[i].indexType = gpuObject.indexType; + triangles[i].geometryFlags = gpuObject.material.isTransparent() ? + IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::NO_DUPLICATE_ANY_HIT_INVOCATION_BIT : + IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; + + blasBuildInfos[i].triangles = &triangles[i]; + } + ILogicalDevice::AccelerationStructureBuildSizes buildSizes; + { + const uint32_t maxPrimCount[1] = { primitiveCounts[i] }; + if (isProcedural) + { + const auto* aabbData = &aabbs; + buildSizes = m_device->getAccelerationStructureBuildSizes(blasBuildInfos[i].buildFlags, false, std::span{ aabbData, 1}, maxPrimCount); + } + else + { + const auto* trianglesData = triangles.data(); + buildSizes = m_device->getAccelerationStructureBuildSizes(blasBuildInfos[i].buildFlags, false, std::span{trianglesData,1}, maxPrimCount); + } + if (!buildSizes) + return logFail("Failed to get BLAS build sizes"); + } + + scratchSizes[i] = buildSizes.buildScratchSize; + totalScratchSize = core::alignUp(totalScratchSize, scratchOffsetAlignment); + totalScratchSize += buildSizes.buildScratchSize; + + { + IGPUBuffer::SCreationParams params; + params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT; + params.size = buildSizes.accelerationStructureSize; + smart_refctd_ptr asBuffer = createBuffer(params); + + IGPUBottomLevelAccelerationStructure::SCreationParams blasParams; + blasParams.bufferRange.buffer = asBuffer; + blasParams.bufferRange.offset = 0u; + blasParams.bufferRange.size = buildSizes.accelerationStructureSize; + blasParams.flags = IGPUBottomLevelAccelerationStructure::SCreationParams::FLAGS::NONE; + m_gpuBlasList[i] = 
m_device->createBottomLevelAccelerationStructure(std::move(blasParams)); + if (!m_gpuBlasList[i]) + return logFail("Could not create BLAS"); + } + } + + + auto cmdbufBlas = getSingleUseCommandBufferAndBegin(pool); + cmdbufBlas->beginDebugMarker("Build BLAS"); + + cmdbufBlas->resetQueryPool(queryPool.get(), 0, blasCount); + + smart_refctd_ptr scratchBuffer; + { + IGPUBuffer::SCreationParams params; + params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT; + params.size = totalScratchSize; + scratchBuffer = createBuffer(params); + } + + core::vector buildRangeInfos(blasCount); + core::vector pRangeInfos(blasCount); + for (uint32_t i = 0; i < blasCount; i++) + { + blasBuildInfos[i].dstAS = m_gpuBlasList[i].get(); + blasBuildInfos[i].scratch.buffer = scratchBuffer; + if (i == 0) + { + blasBuildInfos[i].scratch.offset = 0u; + } else + { + const auto unalignedOffset = blasBuildInfos[i - 1].scratch.offset + scratchSizes[i - 1]; + blasBuildInfos[i].scratch.offset = core::alignUp(unalignedOffset, scratchOffsetAlignment); + } + + buildRangeInfos[i].primitiveCount = primitiveCounts[i]; + buildRangeInfos[i].primitiveByteOffset = 0u; + buildRangeInfos[i].firstVertex = 0u; + buildRangeInfos[i].transformByteOffset = 0u; + + pRangeInfos[i] = &buildRangeInfos[i]; + } + + if (!cmdbufBlas->buildAccelerationStructures(std::span(blasBuildInfos), pRangeInfos.data())) + return logFail("Failed to build BLAS"); + + { + SMemoryBarrier memBarrier; + memBarrier.srcStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT; + memBarrier.srcAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT; + memBarrier.dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT; + memBarrier.dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_READ_BIT; + cmdbufBlas->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} }); + } + + + core::vector ases(blasCount); + for (uint32_t i = 0; i < blasCount; i++) + 
ases[i] = m_gpuBlasList[i].get(); + if (!cmdbufBlas->writeAccelerationStructureProperties(std::span(ases), IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE, + queryPool.get(), 0)) + return logFail("Failed to write acceleration structure properties!"); + + cmdbufBlas->endDebugMarker(); + cmdbufSubmitAndWait(cmdbufBlas, queue, 39); + } + + auto cmdbufCompact = getSingleUseCommandBufferAndBegin(pool); + cmdbufCompact->beginDebugMarker("Compact BLAS"); + + // compact blas + { + core::vector asSizes(blasCount); + if (!m_device->getQueryPoolResults(queryPool.get(), 0, blasCount, asSizes.data(), sizeof(size_t), bitflag(IQueryPool::WAIT_BIT) | IQueryPool::_64_BIT)) + return logFail("Could not get query pool results for AS sizes"); + + core::vector> cleanupBlas(blasCount); + for (uint32_t i = 0; i < blasCount; i++) + { + if (asSizes[i] == 0) continue; + cleanupBlas[i] = m_gpuBlasList[i]; + { + IGPUBuffer::SCreationParams params; + params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT; + params.size = asSizes[i]; + smart_refctd_ptr asBuffer = createBuffer(params); + + IGPUBottomLevelAccelerationStructure::SCreationParams blasParams; + blasParams.bufferRange.buffer = asBuffer; + blasParams.bufferRange.offset = 0u; + blasParams.bufferRange.size = asSizes[i]; + blasParams.flags = IGPUBottomLevelAccelerationStructure::SCreationParams::FLAGS::NONE; + m_gpuBlasList[i] = m_device->createBottomLevelAccelerationStructure(std::move(blasParams)); + if (!m_gpuBlasList[i]) + return logFail("Could not create compacted BLAS"); + } + + IGPUBottomLevelAccelerationStructure::CopyInfo copyInfo; + copyInfo.src = cleanupBlas[i].get(); + copyInfo.dst = m_gpuBlasList[i].get(); + copyInfo.mode = IGPUBottomLevelAccelerationStructure::COPY_MODE::COMPACT; + if (!cmdbufCompact->copyAccelerationStructure(copyInfo)) + return logFail("Failed to copy AS to compact"); + } + } + + cmdbufCompact->endDebugMarker(); + 
cmdbufSubmitAndWait(cmdbufCompact, queue, 40); + + auto cmdbufTlas = getSingleUseCommandBufferAndBegin(pool); + cmdbufTlas->beginDebugMarker("Build TLAS"); + + // build top level AS + { + const uint32_t instancesCount = blasCount; + core::vector instances(instancesCount); + for (uint32_t i = 0; i < instancesCount; i++) + { + const auto isProceduralInstance = i == proceduralBlasIdx; + instances[i].base.blas.deviceAddress = m_gpuBlasList[i]->getReferenceForDeviceOperations().deviceAddress; + instances[i].base.mask = 0xFF; + instances[i].base.instanceCustomIndex = i; + instances[i].base.instanceShaderBindingTableRecordOffset = isProceduralInstance ? 2 : 0; + instances[i].base.flags = static_cast(IGPUTopLevelAccelerationStructure::INSTANCE_FLAGS::TRIANGLE_FACING_CULL_DISABLE_BIT); + instances[i].transform = isProceduralInstance ? matrix3x4SIMD() : m_gpuTriangleGeometries[i].transform; + } + + { + size_t bufSize = instancesCount * sizeof(IGPUTopLevelAccelerationStructure::DeviceStaticInstance); + IGPUBuffer::SCreationParams params; + params.usage = bitflag(IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT | + IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + params.size = bufSize; + m_instanceBuffer = createBuffer(params); + + SBufferRange range = { .offset = 0u, .size = bufSize, .buffer = m_instanceBuffer }; + cmdbufTlas->updateBuffer(range, instances.data()); + } + + // make sure instances upload complete first + { + SMemoryBarrier memBarrier; + memBarrier.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS; + memBarrier.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT; + memBarrier.dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT; + memBarrier.dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT; + cmdbufTlas->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} }); + } + + auto 
tlasFlags = bitflag(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT); + + IGPUTopLevelAccelerationStructure::DeviceBuildInfo tlasBuildInfo; + tlasBuildInfo.buildFlags = tlasFlags; + tlasBuildInfo.srcAS = nullptr; + tlasBuildInfo.dstAS = nullptr; + tlasBuildInfo.instanceData.buffer = m_instanceBuffer; + tlasBuildInfo.instanceData.offset = 0u; + tlasBuildInfo.scratch = {}; + + auto buildSizes = m_device->getAccelerationStructureBuildSizes(tlasFlags, false, instancesCount); + if (!buildSizes) + return logFail("Failed to get TLAS build sizes"); + + { + IGPUBuffer::SCreationParams params; + params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT; + params.size = buildSizes.accelerationStructureSize; + smart_refctd_ptr asBuffer = createBuffer(params); + + IGPUTopLevelAccelerationStructure::SCreationParams tlasParams; + tlasParams.bufferRange.buffer = asBuffer; + tlasParams.bufferRange.offset = 0u; + tlasParams.bufferRange.size = buildSizes.accelerationStructureSize; + tlasParams.flags = IGPUTopLevelAccelerationStructure::SCreationParams::FLAGS::NONE; + m_gpuTlas = m_device->createTopLevelAccelerationStructure(std::move(tlasParams)); + if (!m_gpuTlas) + return logFail("Could not create TLAS"); + } + + smart_refctd_ptr scratchBuffer; + { + IGPUBuffer::SCreationParams params; + params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT; + params.size = buildSizes.buildScratchSize; + scratchBuffer = createBuffer(params); + } + + tlasBuildInfo.dstAS = m_gpuTlas.get(); + tlasBuildInfo.scratch.buffer = scratchBuffer; + tlasBuildInfo.scratch.offset = 0u; + + IGPUTopLevelAccelerationStructure::BuildRangeInfo buildRangeInfo[1u]; + buildRangeInfo[0].instanceCount = instancesCount; + buildRangeInfo[0].instanceByteOffset = 0u; + IGPUTopLevelAccelerationStructure::BuildRangeInfo* pRangeInfos; + pRangeInfos = &buildRangeInfo[0]; + + if 
(!cmdbufTlas->buildAccelerationStructures({ &tlasBuildInfo, 1 }, pRangeInfos)) + return logFail("Failed to build TLAS"); + } + + cmdbufTlas->endDebugMarker(); + cmdbufSubmitAndWait(cmdbufTlas, queue, 45); #ifdef TRY_BUILD_FOR_NGFX - { - const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { { - .semaphore = m_currentImageAcquire.semaphore, - .value = m_currentImageAcquire.acquireCount, - .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS - } }; - m_surface->present(m_currentImageAcquire.imageIndex, acquired); - } + { + const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { { + .semaphore = m_currentImageAcquire.semaphore, + .value = m_currentImageAcquire.acquireCount, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS + } }; + m_surface->present(m_currentImageAcquire.imageIndex, acquired); + } #endif - m_api->endCapture(); + m_api->endCapture(); - return true; + return true; } +#endif // TEST_ASSET_CONV_AS smart_refctd_ptr m_window; @@ -1798,37 +1806,37 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, struct CameraSetting { - float fov = 60.f; - float zNear = 0.1f; - float zFar = 10000.f; - float moveSpeed = 1.f; - float rotateSpeed = 1.f; - float viewWidth = 10.f; - float camYAngle = 165.f / 180.f * 3.14159f; - float camXAngle = 32.f / 180.f * 3.14159f; - + float fov = 60.f; + float zNear = 0.1f; + float zFar = 10000.f; + float moveSpeed = 1.f; + float rotateSpeed = 1.f; + float viewWidth = 10.f; + float camYAngle = 165.f / 180.f * 3.14159f; + float camXAngle = 32.f / 180.f * 3.14159f; + } m_cameraSetting; Camera m_camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD()); Light m_light = { - .direction = {-1.0f, -1.0f, -0.4f}, - .position = {10.0f, 15.0f, 8.0f}, - .outerCutoff = 0.866025404f, // {cos(radians(30.0f))}, - .type = ELT_DIRECTIONAL + .direction = {-1.0f, -1.0f, -0.4f}, + .position = {10.0f, 15.0f, 8.0f}, + .outerCutoff = 0.866025404f, // {cos(radians(30.0f))}, + .type = 
ELT_DIRECTIONAL }; video::CDumbPresentationOracle m_oracle; struct C_UI { - nbl::core::smart_refctd_ptr manager; + nbl::core::smart_refctd_ptr manager; - struct - { - core::smart_refctd_ptr gui, scene; - } samplers; + struct + { + core::smart_refctd_ptr gui, scene; + } samplers; - core::smart_refctd_ptr descriptorSet; + core::smart_refctd_ptr descriptorSet; } m_ui; core::smart_refctd_ptr m_guiDescriptorSetPool; From b498e9cf6a5b2c0badf0ccc528c4306f055a015a Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 16 May 2025 17:13:38 +0700 Subject: [PATCH 179/296] triangles and aabbs into icpubuffers --- 71_RayTracingPipeline/main.cpp | 3409 +++++++++++++++++--------------- 1 file changed, 1780 insertions(+), 1629 deletions(-) diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index ad13b4a5d..528b2c314 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -10,778 +10,786 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication { - using device_base_t = examples::SimpleWindowedApplication; - using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; - using clock_t = std::chrono::steady_clock; - - constexpr static inline uint32_t WIN_W = 1280, WIN_H = 720; - constexpr static inline uint32_t MaxFramesInFlight = 3u; - constexpr static inline uint8_t MaxUITextureCount = 1u; - constexpr static inline uint32_t NumberOfProceduralGeometries = 5; - - static constexpr const char* s_lightTypeNames[E_LIGHT_TYPE::ELT_COUNT] = { - "Directional", - "Point", - "Spot" - }; - - constexpr static inline clock_t::duration DisplayImageDuration = std::chrono::milliseconds(900); - - struct ShaderBindingTable - { - SBufferRange raygenGroupRange; - SBufferRange hitGroupsRange; - uint32_t hitGroupsStride; - SBufferRange missGroupsRange; - uint32_t missGroupsStride; - SBufferRange callableGroupsRange; - 
uint32_t callableGroupsStride; - }; + using device_base_t = examples::SimpleWindowedApplication; + using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; + using clock_t = std::chrono::steady_clock; + + constexpr static inline uint32_t WIN_W = 1280, WIN_H = 720; + constexpr static inline uint32_t MaxFramesInFlight = 3u; + constexpr static inline uint8_t MaxUITextureCount = 1u; + constexpr static inline uint32_t NumberOfProceduralGeometries = 5; + + static constexpr const char* s_lightTypeNames[E_LIGHT_TYPE::ELT_COUNT] = { + "Directional", + "Point", + "Spot" + }; + + constexpr static inline clock_t::duration DisplayImageDuration = std::chrono::milliseconds(900); + + struct ShaderBindingTable + { + SBufferRange raygenGroupRange; + SBufferRange hitGroupsRange; + uint32_t hitGroupsStride; + SBufferRange missGroupsRange; + uint32_t missGroupsStride; + SBufferRange callableGroupsRange; + uint32_t callableGroupsStride; + }; public: - inline RaytracingPipelineApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) - : IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) - { - } - - inline SPhysicalDeviceFeatures getRequiredDeviceFeatures() const override - { - auto retval = device_base_t::getRequiredDeviceFeatures(); - retval.rayTracingPipeline = true; - retval.accelerationStructure = true; - retval.rayQuery = true; - return retval; - } - - inline SPhysicalDeviceFeatures getPreferredDeviceFeatures() const override - { - auto retval = device_base_t::getPreferredDeviceFeatures(); - retval.accelerationStructureHostCommands = true; - return retval; - } - - inline core::vector getSurfaces() const override - { - if (!m_surface) + inline RaytracingPipelineApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) + : IApplicationFramework(_localInputCWD, _localOutputCWD, 
_sharedInputCWD, _sharedOutputCWD) { - { - auto windowCallback = core::make_smart_refctd_ptr(smart_refctd_ptr(m_inputSystem), smart_refctd_ptr(m_logger)); - IWindow::SCreationParams params = {}; - params.callback = core::make_smart_refctd_ptr(); - params.width = WIN_W; - params.height = WIN_H; - params.x = 32; - params.y = 32; - params.flags = ui::IWindow::ECF_HIDDEN | IWindow::ECF_BORDERLESS | IWindow::ECF_RESIZABLE; - params.windowCaption = "RaytracingPipelineApp"; - params.callback = windowCallback; - const_cast&>(m_window) = m_winMgr->createWindow(std::move(params)); - } - - auto surface = CSurfaceVulkanWin32::create(smart_refctd_ptr(m_api), smart_refctd_ptr_static_cast(m_window)); - const_cast&>(m_surface) = CSimpleResizeSurface::create(std::move(surface)); } - if (m_surface) - return { {m_surface->getSurface()/*,EQF_NONE*/} }; + inline SPhysicalDeviceFeatures getRequiredDeviceFeatures() const override + { + auto retval = device_base_t::getRequiredDeviceFeatures(); + retval.rayTracingPipeline = true; + retval.accelerationStructure = true; + retval.rayQuery = true; + return retval; + } - return {}; - } + inline SPhysicalDeviceFeatures getPreferredDeviceFeatures() const override + { + auto retval = device_base_t::getPreferredDeviceFeatures(); + retval.accelerationStructureHostCommands = true; + return retval; + } - // so that we can use the same queue for asset converter and rendering - inline core::vector getQueueRequirements() const override - { - auto reqs = device_base_t::getQueueRequirements(); - reqs.front().requiredFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; - return reqs; - } + inline core::vector getSurfaces() const override + { + if (!m_surface) + { + { + auto windowCallback = core::make_smart_refctd_ptr(smart_refctd_ptr(m_inputSystem), smart_refctd_ptr(m_logger)); + IWindow::SCreationParams params = {}; + params.callback = core::make_smart_refctd_ptr(); + params.width = WIN_W; + params.height = WIN_H; + params.x = 32; + params.y = 32; + params.flags 
= ui::IWindow::ECF_HIDDEN | IWindow::ECF_BORDERLESS | IWindow::ECF_RESIZABLE; + params.windowCaption = "RaytracingPipelineApp"; + params.callback = windowCallback; + const_cast&>(m_window) = m_winMgr->createWindow(std::move(params)); + } - inline bool onAppInitialized(smart_refctd_ptr&& system) override - { - m_inputSystem = make_smart_refctd_ptr(logger_opt_smart_ptr(smart_refctd_ptr(m_logger))); + auto surface = CSurfaceVulkanWin32::create(smart_refctd_ptr(m_api), smart_refctd_ptr_static_cast(m_window)); + const_cast&>(m_surface) = CSimpleResizeSurface::create(std::move(surface)); + } - if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) - return false; + if (m_surface) + return { {m_surface->getSurface()/*,EQF_NONE*/} }; - if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system))) - return false; + return {}; + } - smart_refctd_ptr shaderReadCache = nullptr; - smart_refctd_ptr shaderWriteCache = core::make_smart_refctd_ptr(); - auto shaderCachePath = localOutputCWD / "main_pipeline_shader_cache.bin"; + // so that we can use the same queue for asset converter and rendering + inline core::vector getQueueRequirements() const override + { + auto reqs = device_base_t::getQueueRequirements(); + reqs.front().requiredFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; + return reqs; + } + inline bool onAppInitialized(smart_refctd_ptr&& system) override { - core::smart_refctd_ptr shaderReadCacheFile; + m_inputSystem = make_smart_refctd_ptr(logger_opt_smart_ptr(smart_refctd_ptr(m_logger))); + + if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + + if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + + smart_refctd_ptr shaderReadCache = nullptr; + smart_refctd_ptr shaderWriteCache = core::make_smart_refctd_ptr(); + auto shaderCachePath = localOutputCWD / "main_pipeline_shader_cache.bin"; + + { + core::smart_refctd_ptr shaderReadCacheFile; + { + system::ISystem::future_t> future; + 
m_system->createFile(future, shaderCachePath.c_str(), system::IFile::ECF_READ); + if (future.wait()) + { + future.acquire().move_into(shaderReadCacheFile); + if (shaderReadCacheFile) + { + const size_t size = shaderReadCacheFile->getSize(); + if (size > 0ull) + { + std::vector contents(size); + system::IFile::success_t succ; + shaderReadCacheFile->read(succ, contents.data(), 0, size); + if (succ) + shaderReadCache = IShaderCompiler::CCache::deserialize(contents); + } + } + } + else + m_logger->log("Failed Openning Shader Cache File.", ILogger::ELL_ERROR); + } + + } + + // Load Custom Shader + auto loadCompileAndCreateShader = [&](const std::string& relPath) -> smart_refctd_ptr + { + IAssetLoader::SAssetLoadParams lp = {}; + lp.logger = m_logger.get(); + lp.workingDirectory = ""; // virtual root + auto assetBundle = m_assetMgr->getAsset(relPath, lp); + const auto assets = assetBundle.getContents(); + if (assets.empty()) + return nullptr; + + // lets go straight from ICPUSpecializedShader to IGPUSpecializedShader + auto sourceRaw = IAsset::castDown(assets[0]); + if (!sourceRaw) + return nullptr; + + return m_device->createShader({ sourceRaw.get(), nullptr, shaderReadCache.get(), shaderWriteCache.get() }); + }; + + // load shaders + const auto raygenShader = loadCompileAndCreateShader("app_resources/raytrace.rgen.hlsl"); + const auto closestHitShader = loadCompileAndCreateShader("app_resources/raytrace.rchit.hlsl"); + const auto proceduralClosestHitShader = loadCompileAndCreateShader("app_resources/raytrace_procedural.rchit.hlsl"); + const auto intersectionHitShader = loadCompileAndCreateShader("app_resources/raytrace.rint.hlsl"); + const auto anyHitShaderColorPayload = loadCompileAndCreateShader("app_resources/raytrace.rahit.hlsl"); + const auto anyHitShaderShadowPayload = loadCompileAndCreateShader("app_resources/raytrace_shadow.rahit.hlsl"); + const auto missShader = loadCompileAndCreateShader("app_resources/raytrace.rmiss.hlsl"); + const auto missShadowShader = 
loadCompileAndCreateShader("app_resources/raytrace_shadow.rmiss.hlsl"); + const auto directionalLightCallShader = loadCompileAndCreateShader("app_resources/light_directional.rcall.hlsl"); + const auto pointLightCallShader = loadCompileAndCreateShader("app_resources/light_point.rcall.hlsl"); + const auto spotLightCallShader = loadCompileAndCreateShader("app_resources/light_spot.rcall.hlsl"); + const auto fragmentShader = loadCompileAndCreateShader("app_resources/present.frag.hlsl"); + + core::smart_refctd_ptr shaderWriteCacheFile; { system::ISystem::future_t> future; - m_system->createFile(future, shaderCachePath.c_str(), system::IFile::ECF_READ); + m_system->deleteFile(shaderCachePath); // temp solution instead of trimming, to make sure we won't have corrupted json + m_system->createFile(future, shaderCachePath.c_str(), system::IFile::ECF_WRITE); if (future.wait()) { - future.acquire().move_into(shaderReadCacheFile); - if (shaderReadCacheFile) + future.acquire().move_into(shaderWriteCacheFile); + if (shaderWriteCacheFile) { - const size_t size = shaderReadCacheFile->getSize(); - if (size > 0ull) + auto serializedCache = shaderWriteCache->serialize(); + if (shaderWriteCacheFile) { - std::vector contents(size); system::IFile::success_t succ; - shaderReadCacheFile->read(succ, contents.data(), 0, size); - if (succ) - shaderReadCache = IShaderCompiler::CCache::deserialize(contents); + shaderWriteCacheFile->write(succ, serializedCache->getPointer(), 0, serializedCache->getSize()); + if (!succ) + m_logger->log("Failed Writing To Shader Cache File.", ILogger::ELL_ERROR); } } + else + m_logger->log("Failed Creating Shader Cache File.", ILogger::ELL_ERROR); } else - m_logger->log("Failed Openning Shader Cache File.", ILogger::ELL_ERROR); + m_logger->log("Failed Creating Shader Cache File.", ILogger::ELL_ERROR); } - } + m_semaphore = m_device->createSemaphore(m_realFrameIx); + if (!m_semaphore) + return logFail("Failed to Create a Semaphore!"); - // Load Custom Shader - auto 
loadCompileAndCreateShader = [&](const std::string& relPath) -> smart_refctd_ptr - { - IAssetLoader::SAssetLoadParams lp = {}; - lp.logger = m_logger.get(); - lp.workingDirectory = ""; // virtual root - auto assetBundle = m_assetMgr->getAsset(relPath, lp); - const auto assets = assetBundle.getContents(); - if (assets.empty()) - return nullptr; - - // lets go straight from ICPUSpecializedShader to IGPUSpecializedShader - auto sourceRaw = IAsset::castDown(assets[0]); - if (!sourceRaw) - return nullptr; - - return m_device->createShader({ sourceRaw.get(), nullptr, shaderReadCache.get(), shaderWriteCache.get() }); - }; + auto gQueue = getGraphicsQueue(); - // load shaders - const auto raygenShader = loadCompileAndCreateShader("app_resources/raytrace.rgen.hlsl"); - const auto closestHitShader = loadCompileAndCreateShader("app_resources/raytrace.rchit.hlsl"); - const auto proceduralClosestHitShader = loadCompileAndCreateShader("app_resources/raytrace_procedural.rchit.hlsl"); - const auto intersectionHitShader = loadCompileAndCreateShader("app_resources/raytrace.rint.hlsl"); - const auto anyHitShaderColorPayload = loadCompileAndCreateShader("app_resources/raytrace.rahit.hlsl"); - const auto anyHitShaderShadowPayload = loadCompileAndCreateShader("app_resources/raytrace_shadow.rahit.hlsl"); - const auto missShader = loadCompileAndCreateShader("app_resources/raytrace.rmiss.hlsl"); - const auto missShadowShader = loadCompileAndCreateShader("app_resources/raytrace_shadow.rmiss.hlsl"); - const auto directionalLightCallShader = loadCompileAndCreateShader("app_resources/light_directional.rcall.hlsl"); - const auto pointLightCallShader = loadCompileAndCreateShader("app_resources/light_point.rcall.hlsl"); - const auto spotLightCallShader = loadCompileAndCreateShader("app_resources/light_spot.rcall.hlsl"); - const auto fragmentShader = loadCompileAndCreateShader("app_resources/present.frag.hlsl"); - - core::smart_refctd_ptr shaderWriteCacheFile; - { - system::ISystem::future_t> 
future; - m_system->deleteFile(shaderCachePath); // temp solution instead of trimming, to make sure we won't have corrupted json - m_system->createFile(future, shaderCachePath.c_str(), system::IFile::ECF_WRITE); - if (future.wait()) + // Create renderpass and init surface + nbl::video::IGPURenderpass* renderpass; { - future.acquire().move_into(shaderWriteCacheFile); - if (shaderWriteCacheFile) + ISwapchain::SCreationParams swapchainParams = { .surface = smart_refctd_ptr(m_surface->getSurface()) }; + if (!swapchainParams.deduceFormat(m_physicalDevice)) + return logFail("Could not choose a Surface Format for the Swapchain!"); + + const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] = { - auto serializedCache = shaderWriteCache->serialize(); - if (shaderWriteCacheFile) + { + .srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, + .dstSubpass = 0, + .memoryBarrier = { - system::IFile::success_t succ; - shaderWriteCacheFile->write(succ, serializedCache->getPointer(), 0, serializedCache->getSize()); - if (!succ) - m_logger->log("Failed Writing To Shader Cache File.", ILogger::ELL_ERROR); + .srcStageMask = asset::PIPELINE_STAGE_FLAGS::COPY_BIT, + .srcAccessMask = asset::ACCESS_FLAGS::TRANSFER_WRITE_BIT, + .dstStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, + .dstAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT } - } - else - m_logger->log("Failed Creating Shader Cache File.", ILogger::ELL_ERROR); - } - else - m_logger->log("Failed Creating Shader Cache File.", ILogger::ELL_ERROR); - } + }, + { + .srcSubpass = 0, + .dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, + .memoryBarrier = + { + .srcStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, + .srcAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT + } + }, + IGPURenderpass::SCreationParams::DependenciesEnd + }; - m_semaphore = m_device->createSemaphore(m_realFrameIx); - if 
(!m_semaphore) - return logFail("Failed to Create a Semaphore!"); + auto scResources = std::make_unique(m_device.get(), swapchainParams.surfaceFormat.format, dependencies); + renderpass = scResources->getRenderpass(); - auto gQueue = getGraphicsQueue(); + if (!renderpass) + return logFail("Failed to create Renderpass!"); - // Create renderpass and init surface - nbl::video::IGPURenderpass* renderpass; - { - ISwapchain::SCreationParams swapchainParams = { .surface = smart_refctd_ptr(m_surface->getSurface()) }; - if (!swapchainParams.deduceFormat(m_physicalDevice)) - return logFail("Could not choose a Surface Format for the Swapchain!"); + if (!m_surface || !m_surface->init(gQueue, std::move(scResources), swapchainParams.sharedParams)) + return logFail("Could not create Window & Surface or initialize the Surface!"); + } - const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] = - { - { - .srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, - .dstSubpass = 0, - .memoryBarrier = - { - .srcStageMask = asset::PIPELINE_STAGE_FLAGS::COPY_BIT, - .srcAccessMask = asset::ACCESS_FLAGS::TRANSFER_WRITE_BIT, - .dstStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, - .dstAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT - } - }, + auto pool = m_device->createCommandPool(gQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + + m_converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} }); + + for (auto i = 0u; i < MaxFramesInFlight; i++) { - .srcSubpass = 0, - .dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, - .memoryBarrier = - { - .srcStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, - .srcAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT - } - }, - IGPURenderpass::SCreationParams::DependenciesEnd - }; - - auto scResources = std::make_unique(m_device.get(), 
swapchainParams.surfaceFormat.format, dependencies); - renderpass = scResources->getRenderpass(); - - if (!renderpass) - return logFail("Failed to create Renderpass!"); - - if (!m_surface || !m_surface->init(gQueue, std::move(scResources), swapchainParams.sharedParams)) - return logFail("Could not create Window & Surface or initialize the Surface!"); - } + if (!pool) + return logFail("Couldn't create Command Pool!"); + if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { m_cmdBufs.data() + i, 1 })) + return logFail("Couldn't create Command Buffer!"); + } - auto pool = m_device->createCommandPool(gQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H); + m_surface->recreateSwapchain(); - m_converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} }); - for (auto i = 0u; i < MaxFramesInFlight; i++) - { - if (!pool) - return logFail("Couldn't create Command Pool!"); - if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { m_cmdBufs.data() + i, 1 })) - return logFail("Couldn't create Command Buffer!"); - } + // create output images + m_hdrImage = m_device->createImage({ + { + .type = IGPUImage::ET_2D, + .samples = ICPUImage::ESCF_1_BIT, + .format = EF_R16G16B16A16_SFLOAT, + .extent = {WIN_W, WIN_H, 1}, + .mipLevels = 1, + .arrayLayers = 1, + .flags = IImage::ECF_NONE, + .usage = bitflag(IImage::EUF_STORAGE_BIT) | IImage::EUF_TRANSFER_SRC_BIT | IImage::EUF_SAMPLED_BIT + } + }); + + if (!m_hdrImage || !m_device->allocate(m_hdrImage->getMemoryReqs(), m_hdrImage.get()).isValid()) + return logFail("Could not create HDR Image"); - m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H); - m_surface->recreateSwapchain(); + m_hdrImageView = m_device->createImageView({ + .flags = IGPUImageView::ECF_NONE, + .subUsages = IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT | IGPUImage::E_USAGE_FLAGS::EUF_SAMPLED_BIT, + .image = m_hdrImage, + 
.viewType = IGPUImageView::E_TYPE::ET_2D, + .format = asset::EF_R16G16B16A16_SFLOAT + }); - // create output images - m_hdrImage = m_device->createImage({ + + // ray trace pipeline and descriptor set layout setup { - .type = IGPUImage::ET_2D, - .samples = ICPUImage::ESCF_1_BIT, - .format = EF_R16G16B16A16_SFLOAT, - .extent = {WIN_W, WIN_H, 1}, - .mipLevels = 1, - .arrayLayers = 1, - .flags = IImage::ECF_NONE, - .usage = bitflag(IImage::EUF_STORAGE_BIT) | IImage::EUF_TRANSFER_SRC_BIT | IImage::EUF_SAMPLED_BIT + const IGPUDescriptorSetLayout::SBinding bindings[] = { + { + .binding = 0, + .type = asset::IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE, + .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_RAYGEN, + .count = 1, + }, + { + .binding = 1, + .type = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE, + .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_RAYGEN, + .count = 1, + } + }; + const auto descriptorSetLayout = m_device->createDescriptorSetLayout(bindings); + + const std::array dsLayoutPtrs = { descriptorSetLayout.get() }; + m_rayTracingDsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, std::span(dsLayoutPtrs.begin(), dsLayoutPtrs.end())); + m_rayTracingDs = m_rayTracingDsPool->createDescriptorSet(descriptorSetLayout); + + const SPushConstantRange pcRange = { + .stageFlags = IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING, + .offset = 0u, + .size = sizeof(SPushConstants), + }; + const auto pipelineLayout = m_device->createPipelineLayout({ &pcRange, 1 }, smart_refctd_ptr(descriptorSetLayout), nullptr, nullptr, nullptr); + + IGPURayTracingPipeline::SCreationParams params = {}; + + enum RtDemoShader + { + RTDS_RAYGEN, + RTDS_MISS, + RTDS_MISS_SHADOW, + RTDS_CLOSEST_HIT, + RTDS_SPHERE_CLOSEST_HIT, + RTDS_ANYHIT_PRIMARY, + RTDS_ANYHIT_SHADOW, + RTDS_INTERSECTION, + 
RTDS_DIRECTIONAL_CALL, + RTDS_POINT_CALL, + RTDS_SPOT_CALL, + RTDS_COUNT + }; + + IGPUShader::SSpecInfo shaders[RTDS_COUNT]; + shaders[RTDS_RAYGEN] = { .shader = raygenShader.get() }; + shaders[RTDS_MISS] = { .shader = missShader.get() }; + shaders[RTDS_MISS_SHADOW] = { .shader = missShadowShader.get() }; + shaders[RTDS_CLOSEST_HIT] = { .shader = closestHitShader.get() }; + shaders[RTDS_SPHERE_CLOSEST_HIT] = { .shader = proceduralClosestHitShader.get() }; + shaders[RTDS_ANYHIT_PRIMARY] = { .shader = anyHitShaderColorPayload.get() }; + shaders[RTDS_ANYHIT_SHADOW] = { .shader = anyHitShaderShadowPayload.get() }; + shaders[RTDS_INTERSECTION] = { .shader = intersectionHitShader.get() }; + shaders[RTDS_DIRECTIONAL_CALL] = { .shader = directionalLightCallShader.get() }; + shaders[RTDS_POINT_CALL] = { .shader = pointLightCallShader.get() }; + shaders[RTDS_SPOT_CALL] = { .shader = spotLightCallShader.get() }; + + params.layout = pipelineLayout.get(); + params.shaders = std::span(shaders); + using RayTracingFlags = IGPURayTracingPipeline::SCreationParams::FLAGS; + params.flags = core::bitflag(RayTracingFlags::NO_NULL_MISS_SHADERS) | + RayTracingFlags::NO_NULL_INTERSECTION_SHADERS | + RayTracingFlags::NO_NULL_ANY_HIT_SHADERS; + + auto& shaderGroups = params.shaderGroups; + + shaderGroups.raygen = { .index = RTDS_RAYGEN }; + + IRayTracingPipelineBase::SGeneralShaderGroup missGroups[EMT_COUNT]; + missGroups[EMT_PRIMARY] = { .index = RTDS_MISS }; + missGroups[EMT_OCCLUSION] = { .index = RTDS_MISS_SHADOW }; + shaderGroups.misses = missGroups; + + auto getHitGroupIndex = [](E_GEOM_TYPE geomType, E_RAY_TYPE rayType) + { + return geomType * ERT_COUNT + rayType; + }; + IRayTracingPipelineBase::SHitShaderGroup hitGroups[E_RAY_TYPE::ERT_COUNT * E_GEOM_TYPE::EGT_COUNT]; + hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_PRIMARY)] = { + .closestHit = RTDS_CLOSEST_HIT, + .anyHit = RTDS_ANYHIT_PRIMARY, + }; + hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_OCCLUSION)] = { + .closestHit = 
IGPURayTracingPipeline::SGeneralShaderGroup::Unused, + .anyHit = RTDS_ANYHIT_SHADOW, + }; + hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_PRIMARY)] = { + .closestHit = RTDS_SPHERE_CLOSEST_HIT, + .anyHit = RTDS_ANYHIT_PRIMARY, + .intersection = RTDS_INTERSECTION, + }; + hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_OCCLUSION)] = { + .closestHit = IGPURayTracingPipeline::SGeneralShaderGroup::Unused, + .anyHit = RTDS_ANYHIT_SHADOW, + .intersection = RTDS_INTERSECTION, + }; + shaderGroups.hits = hitGroups; + + IRayTracingPipelineBase::SGeneralShaderGroup callableGroups[ELT_COUNT]; + callableGroups[ELT_DIRECTIONAL] = { .index = RTDS_DIRECTIONAL_CALL }; + callableGroups[ELT_POINT] = { .index = RTDS_POINT_CALL }; + callableGroups[ELT_SPOT] = { .index = RTDS_SPOT_CALL }; + shaderGroups.callables = callableGroups; + + params.cached.maxRecursionDepth = 1; + params.cached.dynamicStackSize = true; + + if (!m_device->createRayTracingPipelines(nullptr, { ¶ms, 1 }, &m_rayTracingPipeline)) + return logFail("Failed to create ray tracing pipeline"); + + calculateRayTracingStackSize(m_rayTracingPipeline); + + if (!createShaderBindingTable(gQueue, m_rayTracingPipeline)) + return logFail("Could not create shader binding table"); + } - }); - if (!m_hdrImage || !m_device->allocate(m_hdrImage->getMemoryReqs(), m_hdrImage.get()).isValid()) - return logFail("Could not create HDR Image"); + auto assetManager = make_smart_refctd_ptr(smart_refctd_ptr(system)); + auto* geometryCreator = assetManager->getGeometryCreator(); - m_hdrImageView = m_device->createImageView({ - .flags = IGPUImageView::ECF_NONE, - .subUsages = IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT | IGPUImage::E_USAGE_FLAGS::EUF_SAMPLED_BIT, - .image = m_hdrImage, - .viewType = IGPUImageView::E_TYPE::ET_2D, - .format = asset::EF_R16G16B16A16_SFLOAT - }); + if (!createIndirectBuffer(gQueue)) + return logFail("Could not create indirect buffer"); +#ifdef TEST_ASSET_CONV_AS + if 
(!createAccelerationStructuresFromGeometry(getComputeQueue(), geometryCreator)) + return logFail("Could not create acceleration structures from geometry creator"); +#else + // create geometry objects + if (!createGeometries(gQueue, geometryCreator)) + return logFail("Could not create geometries from geometry creator"); + if (!createAccelerationStructures(getComputeQueue())) + return logFail("Could not create acceleration structures"); +#endif // TEST_ASSET_CONV_AS + + ISampler::SParams samplerParams = { + .AnisotropicFilter = 0 + }; + auto defaultSampler = m_device->createSampler(samplerParams); - // ray trace pipeline and descriptor set layout setup - { - const IGPUDescriptorSetLayout::SBinding bindings[] = { - { - .binding = 0, - .type = asset::IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE, - .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, - .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_RAYGEN, - .count = 1, - }, { - .binding = 1, - .type = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE, - .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, - .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_RAYGEN, - .count = 1, + const IGPUDescriptorSetLayout::SBinding bindings[] = { + { + .binding = 0u, + .type = nbl::asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER, + .createFlags = ICPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = IShader::E_SHADER_STAGE::ESS_FRAGMENT, + .count = 1u, + .immutableSamplers = &defaultSampler + } + }; + auto gpuPresentDescriptorSetLayout = m_device->createDescriptorSetLayout(bindings); + const video::IGPUDescriptorSetLayout* const layouts[] = { gpuPresentDescriptorSetLayout.get() }; + const uint32_t setCounts[] = { 1u }; + m_presentDsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_NONE, layouts, setCounts); + m_presentDs = m_presentDsPool->createDescriptorSet(gpuPresentDescriptorSetLayout); + + auto scRes = 
static_cast(m_surface->getSwapchainResources()); + ext::FullScreenTriangle::ProtoPipeline fsTriProtoPPln(m_assetMgr.get(), m_device.get(), m_logger.get()); + if (!fsTriProtoPPln) + return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!"); + + const IGPUShader::SSpecInfo fragSpec = { + .entryPoint = "main", + .shader = fragmentShader.get() + }; + + auto presentLayout = m_device->createPipelineLayout( + {}, + core::smart_refctd_ptr(gpuPresentDescriptorSetLayout), + nullptr, + nullptr, + nullptr + ); + m_presentPipeline = fsTriProtoPPln.createPipeline(fragSpec, presentLayout.get(), scRes->getRenderpass()); + if (!m_presentPipeline) + return logFail("Could not create Graphics Pipeline!"); } - }; - const auto descriptorSetLayout = m_device->createDescriptorSetLayout(bindings); - - const std::array dsLayoutPtrs = { descriptorSetLayout.get() }; - m_rayTracingDsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, std::span(dsLayoutPtrs.begin(), dsLayoutPtrs.end())); - m_rayTracingDs = m_rayTracingDsPool->createDescriptorSet(descriptorSetLayout); - - const SPushConstantRange pcRange = { - .stageFlags = IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING, - .offset = 0u, - .size = sizeof(SPushConstants), - }; - const auto pipelineLayout = m_device->createPipelineLayout({ &pcRange, 1 }, smart_refctd_ptr(descriptorSetLayout), nullptr, nullptr, nullptr); - - IGPURayTracingPipeline::SCreationParams params = {}; - - enum RtDemoShader - { - RTDS_RAYGEN, - RTDS_MISS, - RTDS_MISS_SHADOW, - RTDS_CLOSEST_HIT, - RTDS_SPHERE_CLOSEST_HIT, - RTDS_ANYHIT_PRIMARY, - RTDS_ANYHIT_SHADOW, - RTDS_INTERSECTION, - RTDS_DIRECTIONAL_CALL, - RTDS_POINT_CALL, - RTDS_SPOT_CALL, - RTDS_COUNT - }; - - IGPUShader::SSpecInfo shaders[RTDS_COUNT]; - shaders[RTDS_RAYGEN] = {.shader = raygenShader.get()}; - shaders[RTDS_MISS] = {.shader = missShader.get()}; - shaders[RTDS_MISS_SHADOW] = { .shader = missShadowShader.get() }; - 
shaders[RTDS_CLOSEST_HIT] = {.shader = closestHitShader.get()}; - shaders[RTDS_SPHERE_CLOSEST_HIT] = {.shader = proceduralClosestHitShader.get()}; - shaders[RTDS_ANYHIT_PRIMARY] = {.shader = anyHitShaderColorPayload.get()}; - shaders[RTDS_ANYHIT_SHADOW] = {.shader = anyHitShaderShadowPayload.get()}; - shaders[RTDS_INTERSECTION] = {.shader = intersectionHitShader.get() }; - shaders[RTDS_DIRECTIONAL_CALL] = {.shader = directionalLightCallShader.get()}; - shaders[RTDS_POINT_CALL] = {.shader = pointLightCallShader.get()}; - shaders[RTDS_SPOT_CALL] = {.shader = spotLightCallShader.get()}; - - params.layout = pipelineLayout.get(); - params.shaders = std::span(shaders); - using RayTracingFlags = IGPURayTracingPipeline::SCreationParams::FLAGS; - params.flags = core::bitflag(RayTracingFlags::NO_NULL_MISS_SHADERS) | - RayTracingFlags::NO_NULL_INTERSECTION_SHADERS | - RayTracingFlags::NO_NULL_ANY_HIT_SHADERS; - - auto& shaderGroups = params.shaderGroups; - - shaderGroups.raygen = { .index = RTDS_RAYGEN }; - - IRayTracingPipelineBase::SGeneralShaderGroup missGroups[EMT_COUNT]; - missGroups[EMT_PRIMARY] = { .index = RTDS_MISS }; - missGroups[EMT_OCCLUSION] = { .index = RTDS_MISS_SHADOW }; - shaderGroups.misses = missGroups; - - auto getHitGroupIndex = [](E_GEOM_TYPE geomType, E_RAY_TYPE rayType) - { - return geomType * ERT_COUNT + rayType; + + // write descriptors + IGPUDescriptorSet::SDescriptorInfo infos[3]; + infos[0].desc = m_gpuTlas; + + infos[1].desc = m_hdrImageView; + if (!infos[1].desc) + return logFail("Failed to create image view"); + infos[1].info.image.imageLayout = IImage::LAYOUT::GENERAL; + + infos[2].desc = m_hdrImageView; + infos[2].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL; + + IGPUDescriptorSet::SWriteDescriptorSet writes[] = { + {.dstSet = m_rayTracingDs.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &infos[0]}, + {.dstSet = m_rayTracingDs.get(), .binding = 1, .arrayElement = 0, .count = 1, .info = &infos[1]}, + {.dstSet = 
m_presentDs.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &infos[2] }, }; - IRayTracingPipelineBase::SHitShaderGroup hitGroups[E_RAY_TYPE::ERT_COUNT * E_GEOM_TYPE::EGT_COUNT]; - hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_PRIMARY)] = { - .closestHit = RTDS_CLOSEST_HIT, - .anyHit = RTDS_ANYHIT_PRIMARY, - }; - hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_OCCLUSION)] = { - .closestHit = IGPURayTracingPipeline::SGeneralShaderGroup::Unused, - .anyHit = RTDS_ANYHIT_SHADOW, - }; - hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_PRIMARY)] = { - .closestHit = RTDS_SPHERE_CLOSEST_HIT, - .anyHit = RTDS_ANYHIT_PRIMARY, - .intersection = RTDS_INTERSECTION, - }; - hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_OCCLUSION)] = { - .closestHit = IGPURayTracingPipeline::SGeneralShaderGroup::Unused, - .anyHit = RTDS_ANYHIT_SHADOW, - .intersection = RTDS_INTERSECTION, - }; - shaderGroups.hits = hitGroups; - - IRayTracingPipelineBase::SGeneralShaderGroup callableGroups[ELT_COUNT]; - callableGroups[ELT_DIRECTIONAL] = { .index = RTDS_DIRECTIONAL_CALL }; - callableGroups[ELT_POINT] = { .index = RTDS_POINT_CALL }; - callableGroups[ELT_SPOT] = { .index = RTDS_SPOT_CALL }; - shaderGroups.callables = callableGroups; - - params.cached.maxRecursionDepth = 1; - params.cached.dynamicStackSize = true; - - if (!m_device->createRayTracingPipelines(nullptr, { ¶ms, 1 }, &m_rayTracingPipeline)) - return logFail("Failed to create ray tracing pipeline"); - - calculateRayTracingStackSize(m_rayTracingPipeline); - - if (!createShaderBindingTable(gQueue, m_rayTracingPipeline)) - return logFail("Could not create shader binding table"); + m_device->updateDescriptorSets(std::span(writes), {}); - } + // gui descriptor setup + { + using binding_flags_t = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS; + { + IGPUSampler::SParams params; + params.AnisotropicFilter = 1u; + params.TextureWrapU = ETC_REPEAT; + params.TextureWrapV = ETC_REPEAT; + params.TextureWrapW = ETC_REPEAT; + + m_ui.samplers.gui 
= m_device->createSampler(params); + m_ui.samplers.gui->setObjectDebugName("Nabla IMGUI UI Sampler"); + } - auto assetManager = make_smart_refctd_ptr(smart_refctd_ptr(system)); - auto* geometryCreator = assetManager->getGeometryCreator(); + std::array, 69u> immutableSamplers; + for (auto& it : immutableSamplers) + it = smart_refctd_ptr(m_ui.samplers.scene); - if (!createIndirectBuffer(gQueue)) - return logFail("Could not create indirect buffer"); + immutableSamplers[nbl::ext::imgui::UI::FontAtlasTexId] = smart_refctd_ptr(m_ui.samplers.gui); - // create geometry objects - if (!createGeometries(gQueue, geometryCreator)) - return logFail("Could not create geometries from geometry creator"); + nbl::ext::imgui::UI::SCreationParameters params; - if (!createAccelerationStructures(getComputeQueue())) - return logFail("Could not create acceleration structures"); + params.resources.texturesInfo = { .setIx = 0u, .bindingIx = 0u }; + params.resources.samplersInfo = { .setIx = 0u, .bindingIx = 1u }; + params.assetManager = m_assetMgr; + params.pipelineCache = nullptr; + params.pipelineLayout = nbl::ext::imgui::UI::createDefaultPipelineLayout(m_utils->getLogicalDevice(), params.resources.texturesInfo, params.resources.samplersInfo, MaxUITextureCount); + params.renderpass = smart_refctd_ptr(renderpass); + params.streamingBuffer = nullptr; + params.subpassIx = 0u; + params.transfer = getTransferUpQueue(); + params.utilities = m_utils; + { + m_ui.manager = ext::imgui::UI::create(std::move(params)); - ISampler::SParams samplerParams = { - .AnisotropicFilter = 0 - }; - auto defaultSampler = m_device->createSampler(samplerParams); + // note that we use default layout provided by our extension, but you are free to create your own by filling nbl::ext::imgui::UI::S_CREATION_PARAMETERS::resources + const auto* descriptorSetLayout = m_ui.manager->getPipeline()->getLayout()->getDescriptorSetLayout(0u); + const auto& params = m_ui.manager->getCreationParameters(); - { - const 
IGPUDescriptorSetLayout::SBinding bindings[] = { - { - .binding = 0u, - .type = nbl::asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER, - .createFlags = ICPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, - .stageFlags = IShader::E_SHADER_STAGE::ESS_FRAGMENT, - .count = 1u, - .immutableSamplers = &defaultSampler - } - }; - auto gpuPresentDescriptorSetLayout = m_device->createDescriptorSetLayout(bindings); - const video::IGPUDescriptorSetLayout* const layouts[] = { gpuPresentDescriptorSetLayout.get() }; - const uint32_t setCounts[] = { 1u }; - m_presentDsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_NONE, layouts, setCounts); - m_presentDs = m_presentDsPool->createDescriptorSet(gpuPresentDescriptorSetLayout); - - auto scRes = static_cast(m_surface->getSwapchainResources()); - ext::FullScreenTriangle::ProtoPipeline fsTriProtoPPln(m_assetMgr.get(), m_device.get(), m_logger.get()); - if (!fsTriProtoPPln) - return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!"); - - const IGPUShader::SSpecInfo fragSpec = { - .entryPoint = "main", - .shader = fragmentShader.get() - }; - - auto presentLayout = m_device->createPipelineLayout( - {}, - core::smart_refctd_ptr(gpuPresentDescriptorSetLayout), - nullptr, - nullptr, - nullptr - ); - m_presentPipeline = fsTriProtoPPln.createPipeline(fragSpec, presentLayout.get(), scRes->getRenderpass()); - if (!m_presentPipeline) - return logFail("Could not create Graphics Pipeline!"); - } + IDescriptorPool::SCreateInfo descriptorPoolInfo = {}; + descriptorPoolInfo.maxDescriptorCount[static_cast(asset::IDescriptor::E_TYPE::ET_SAMPLER)] = (uint32_t)nbl::ext::imgui::UI::DefaultSamplerIx::COUNT; + descriptorPoolInfo.maxDescriptorCount[static_cast(asset::IDescriptor::E_TYPE::ET_SAMPLED_IMAGE)] = MaxUITextureCount; + descriptorPoolInfo.maxSets = 1u; + descriptorPoolInfo.flags = IDescriptorPool::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT; - // write 
descriptors - IGPUDescriptorSet::SDescriptorInfo infos[3]; - infos[0].desc = m_gpuTlas; + m_guiDescriptorSetPool = m_device->createDescriptorPool(std::move(descriptorPoolInfo)); + assert(m_guiDescriptorSetPool); - infos[1].desc = m_hdrImageView; - if (!infos[1].desc) - return logFail("Failed to create image view"); - infos[1].info.image.imageLayout = IImage::LAYOUT::GENERAL; + m_guiDescriptorSetPool->createDescriptorSets(1u, &descriptorSetLayout, &m_ui.descriptorSet); + assert(m_ui.descriptorSet); + } + } - infos[2].desc = m_hdrImageView; - infos[2].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL; + m_ui.manager->registerListener( + [this]() -> void { + ImGuiIO& io = ImGui::GetIO(); - IGPUDescriptorSet::SWriteDescriptorSet writes[] = { - {.dstSet = m_rayTracingDs.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &infos[0]}, - {.dstSet = m_rayTracingDs.get(), .binding = 1, .arrayElement = 0, .count = 1, .info = &infos[1]}, - {.dstSet = m_presentDs.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &infos[2] }, - }; - m_device->updateDescriptorSets(std::span(writes), {}); + m_camera.setProjectionMatrix([&]() + { + static matrix4SIMD projection; + + projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH( + core::radians(m_cameraSetting.fov), + io.DisplaySize.x / io.DisplaySize.y, + m_cameraSetting.zNear, + m_cameraSetting.zFar); + + return projection; + }()); + + ImGui::SetNextWindowPos(ImVec2(1024, 100), ImGuiCond_Appearing); + ImGui::SetNextWindowSize(ImVec2(256, 256), ImGuiCond_Appearing); + + // create a window and insert the inspector + ImGui::SetNextWindowPos(ImVec2(10, 10), ImGuiCond_Appearing); + ImGui::SetNextWindowSize(ImVec2(320, 340), ImGuiCond_Appearing); + ImGui::Begin("Controls"); + + ImGui::SameLine(); + + ImGui::Text("Camera"); + + ImGui::SliderFloat("Move speed", &m_cameraSetting.moveSpeed, 0.1f, 10.f); + ImGui::SliderFloat("Rotate speed", &m_cameraSetting.rotateSpeed, 0.1f, 10.f); + ImGui::SliderFloat("Fov", 
&m_cameraSetting.fov, 20.f, 150.f); + ImGui::SliderFloat("zNear", &m_cameraSetting.zNear, 0.1f, 100.f); + ImGui::SliderFloat("zFar", &m_cameraSetting.zFar, 110.f, 10000.f); + Light m_oldLight = m_light; + int light_type = m_light.type; + ImGui::ListBox("LightType", &light_type, s_lightTypeNames, ELT_COUNT); + m_light.type = static_cast(light_type); + if (m_light.type == ELT_DIRECTIONAL) + { + ImGui::SliderFloat3("Light Direction", &m_light.direction.x, -1.f, 1.f); + } + else if (m_light.type == ELT_POINT) + { + ImGui::SliderFloat3("Light Position", &m_light.position.x, -20.f, 20.f); + } + else if (m_light.type == ELT_SPOT) + { + ImGui::SliderFloat3("Light Direction", &m_light.direction.x, -1.f, 1.f); + ImGui::SliderFloat3("Light Position", &m_light.position.x, -20.f, 20.f); - // gui descriptor setup - { - using binding_flags_t = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS; - { - IGPUSampler::SParams params; - params.AnisotropicFilter = 1u; - params.TextureWrapU = ETC_REPEAT; - params.TextureWrapV = ETC_REPEAT; - params.TextureWrapW = ETC_REPEAT; - - m_ui.samplers.gui = m_device->createSampler(params); - m_ui.samplers.gui->setObjectDebugName("Nabla IMGUI UI Sampler"); - } - - std::array, 69u> immutableSamplers; - for (auto& it : immutableSamplers) - it = smart_refctd_ptr(m_ui.samplers.scene); - - immutableSamplers[nbl::ext::imgui::UI::FontAtlasTexId] = smart_refctd_ptr(m_ui.samplers.gui); - - nbl::ext::imgui::UI::SCreationParameters params; - - params.resources.texturesInfo = { .setIx = 0u, .bindingIx = 0u }; - params.resources.samplersInfo = { .setIx = 0u, .bindingIx = 1u }; - params.assetManager = m_assetMgr; - params.pipelineCache = nullptr; - params.pipelineLayout = nbl::ext::imgui::UI::createDefaultPipelineLayout(m_utils->getLogicalDevice(), params.resources.texturesInfo, params.resources.samplersInfo, MaxUITextureCount); - params.renderpass = smart_refctd_ptr(renderpass); - params.streamingBuffer = nullptr; - params.subpassIx = 0u; - params.transfer = 
getTransferUpQueue(); - params.utilities = m_utils; - { - m_ui.manager = ext::imgui::UI::create(std::move(params)); - - // note that we use default layout provided by our extension, but you are free to create your own by filling nbl::ext::imgui::UI::S_CREATION_PARAMETERS::resources - const auto* descriptorSetLayout = m_ui.manager->getPipeline()->getLayout()->getDescriptorSetLayout(0u); - const auto& params = m_ui.manager->getCreationParameters(); - - IDescriptorPool::SCreateInfo descriptorPoolInfo = {}; - descriptorPoolInfo.maxDescriptorCount[static_cast(asset::IDescriptor::E_TYPE::ET_SAMPLER)] = (uint32_t)nbl::ext::imgui::UI::DefaultSamplerIx::COUNT; - descriptorPoolInfo.maxDescriptorCount[static_cast(asset::IDescriptor::E_TYPE::ET_SAMPLED_IMAGE)] = MaxUITextureCount; - descriptorPoolInfo.maxSets = 1u; - descriptorPoolInfo.flags = IDescriptorPool::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT; - - m_guiDescriptorSetPool = m_device->createDescriptorPool(std::move(descriptorPoolInfo)); - assert(m_guiDescriptorSetPool); - - m_guiDescriptorSetPool->createDescriptorSets(1u, &descriptorSetLayout, &m_ui.descriptorSet); - assert(m_ui.descriptorSet); - } - } + float32_t dOuterCutoff = hlsl::degrees(acos(m_light.outerCutoff)); + if (ImGui::SliderFloat("Light Outer Cutoff", &dOuterCutoff, 0.0f, 45.0f)) + { + m_light.outerCutoff = cos(hlsl::radians(dOuterCutoff)); + } + } + ImGui::Checkbox("Use Indirect Command", &m_useIndirectCommand); + if (m_light != m_oldLight) + { + m_frameAccumulationCounter = 0; + } - m_ui.manager->registerListener( - [this]() -> void { - ImGuiIO& io = ImGui::GetIO(); + ImGui::Text("X: %f Y: %f", io.MousePos.x, io.MousePos.y); - m_camera.setProjectionMatrix([&]() - { - static matrix4SIMD projection; - - projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH( - core::radians(m_cameraSetting.fov), - io.DisplaySize.x / io.DisplaySize.y, - m_cameraSetting.zNear, - m_cameraSetting.zFar); - - return projection; - }()); - - 
ImGui::SetNextWindowPos(ImVec2(1024, 100), ImGuiCond_Appearing); - ImGui::SetNextWindowSize(ImVec2(256, 256), ImGuiCond_Appearing); - - // create a window and insert the inspector - ImGui::SetNextWindowPos(ImVec2(10, 10), ImGuiCond_Appearing); - ImGui::SetNextWindowSize(ImVec2(320, 340), ImGuiCond_Appearing); - ImGui::Begin("Controls"); - - ImGui::SameLine(); - - ImGui::Text("Camera"); - - ImGui::SliderFloat("Move speed", &m_cameraSetting.moveSpeed, 0.1f, 10.f); - ImGui::SliderFloat("Rotate speed", &m_cameraSetting.rotateSpeed, 0.1f, 10.f); - ImGui::SliderFloat("Fov", &m_cameraSetting.fov, 20.f, 150.f); - ImGui::SliderFloat("zNear", &m_cameraSetting.zNear, 0.1f, 100.f); - ImGui::SliderFloat("zFar", &m_cameraSetting.zFar, 110.f, 10000.f); - Light m_oldLight = m_light; - int light_type = m_light.type; - ImGui::ListBox("LightType", &light_type, s_lightTypeNames, ELT_COUNT); - m_light.type = static_cast(light_type); - if (m_light.type == ELT_DIRECTIONAL) - { - ImGui::SliderFloat3("Light Direction", &m_light.direction.x, -1.f, 1.f); - } else if (m_light.type == ELT_POINT) - { - ImGui::SliderFloat3("Light Position", &m_light.position.x, -20.f, 20.f); - } else if (m_light.type == ELT_SPOT) - { - ImGui::SliderFloat3("Light Direction", &m_light.direction.x, -1.f, 1.f); - ImGui::SliderFloat3("Light Position", &m_light.position.x, -20.f, 20.f); - - float32_t dOuterCutoff = hlsl::degrees(acos(m_light.outerCutoff)); - if (ImGui::SliderFloat("Light Outer Cutoff", &dOuterCutoff, 0.0f, 45.0f)) - { - m_light.outerCutoff = cos(hlsl::radians(dOuterCutoff)); - } - } - ImGui::Checkbox("Use Indirect Command", &m_useIndirectCommand); - if (m_light != m_oldLight) + ImGui::End(); + } + ); + + // Set Camera { - m_frameAccumulationCounter = 0; + core::vectorSIMDf cameraPosition(0, 5, -10); + matrix4SIMD proj = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH( + core::radians(60.0f), + WIN_W / WIN_H, + 0.01f, + 500.0f + ); + m_camera = Camera(cameraPosition, core::vectorSIMDf(0, 0, 0), 
proj); } - ImGui::Text("X: %f Y: %f", io.MousePos.x, io.MousePos.y); - - ImGui::End(); - } - ); + m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H); + m_surface->recreateSwapchain(); + m_winMgr->show(m_window.get()); + m_oracle.reportBeginFrameRecord(); + m_camera.mapKeysToWASD(); - // Set Camera - { - core::vectorSIMDf cameraPosition(0, 5, -10); - matrix4SIMD proj = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH( - core::radians(60.0f), - WIN_W / WIN_H, - 0.01f, - 500.0f - ); - m_camera = Camera(cameraPosition, core::vectorSIMDf(0, 0, 0), proj); + return true; } - m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H); - m_surface->recreateSwapchain(); - m_winMgr->show(m_window.get()); - m_oracle.reportBeginFrameRecord(); - m_camera.mapKeysToWASD(); - - return true; - } + bool updateGUIDescriptorSet() + { + // texture atlas, note we don't create info & write pair for the font sampler because UI extension's is immutable and baked into DS layout + static std::array descriptorInfo; + static IGPUDescriptorSet::SWriteDescriptorSet writes[MaxUITextureCount]; - bool updateGUIDescriptorSet() - { - // texture atlas, note we don't create info & write pair for the font sampler because UI extension's is immutable and baked into DS layout - static std::array descriptorInfo; - static IGPUDescriptorSet::SWriteDescriptorSet writes[MaxUITextureCount]; + descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL; + descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].desc = smart_refctd_ptr(m_ui.manager->getFontAtlasView()); - descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL; - descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].desc = smart_refctd_ptr(m_ui.manager->getFontAtlasView()); + for (uint32_t i = 0; i < descriptorInfo.size(); ++i) + { + writes[i].dstSet = m_ui.descriptorSet.get(); + writes[i].binding = 0u; + writes[i].arrayElement = i; + writes[i].count 
= 1u; + } + writes[nbl::ext::imgui::UI::FontAtlasTexId].info = descriptorInfo.data() + nbl::ext::imgui::UI::FontAtlasTexId; - for (uint32_t i = 0; i < descriptorInfo.size(); ++i) - { - writes[i].dstSet = m_ui.descriptorSet.get(); - writes[i].binding = 0u; - writes[i].arrayElement = i; - writes[i].count = 1u; + return m_device->updateDescriptorSets(writes, {}); } - writes[nbl::ext::imgui::UI::FontAtlasTexId].info = descriptorInfo.data() + nbl::ext::imgui::UI::FontAtlasTexId; - - return m_device->updateDescriptorSets(writes, {}); - } - - inline void workLoopBody() override - { - // framesInFlight: ensuring safe execution of command buffers and acquires, `framesInFlight` only affect semaphore waits, don't use this to index your resources because it can change with swapchain recreation. - const uint32_t framesInFlight = core::min(MaxFramesInFlight, m_surface->getMaxAcquiresInFlight()); - // We block for semaphores for 2 reasons here: - // A) Resource: Can't use resource like a command buffer BEFORE previous use is finished! [MaxFramesInFlight] - // B) Acquire: Can't have more acquires in flight than a certain threshold returned by swapchain or your surface helper class. [MaxAcquiresInFlight] - if (m_realFrameIx >= framesInFlight) + + inline void workLoopBody() override { - const ISemaphore::SWaitInfo cbDonePending[] = - { + // framesInFlight: ensuring safe execution of command buffers and acquires, `framesInFlight` only affect semaphore waits, don't use this to index your resources because it can change with swapchain recreation. + const uint32_t framesInFlight = core::min(MaxFramesInFlight, m_surface->getMaxAcquiresInFlight()); + // We block for semaphores for 2 reasons here: + // A) Resource: Can't use resource like a command buffer BEFORE previous use is finished! [MaxFramesInFlight] + // B) Acquire: Can't have more acquires in flight than a certain threshold returned by swapchain or your surface helper class. 
[MaxAcquiresInFlight] + if (m_realFrameIx >= framesInFlight) { - .semaphore = m_semaphore.get(), - .value = m_realFrameIx + 1 - framesInFlight + const ISemaphore::SWaitInfo cbDonePending[] = + { + { + .semaphore = m_semaphore.get(), + .value = m_realFrameIx + 1 - framesInFlight + } + }; + if (m_device->blockForSemaphores(cbDonePending) != ISemaphore::WAIT_RESULT::SUCCESS) + return; } - }; - if (m_device->blockForSemaphores(cbDonePending) != ISemaphore::WAIT_RESULT::SUCCESS) - return; - } - const auto resourceIx = m_realFrameIx % MaxFramesInFlight; + const auto resourceIx = m_realFrameIx % MaxFramesInFlight; - m_api->startCapture(); + m_api->startCapture(); - update(); + update(); - auto queue = getGraphicsQueue(); - auto cmdbuf = m_cmdBufs[resourceIx].get(); + auto queue = getGraphicsQueue(); + auto cmdbuf = m_cmdBufs[resourceIx].get(); - if (!keepRunning()) - return; + if (!keepRunning()) + return; - cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); - cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - cmdbuf->beginDebugMarker("RaytracingPipelineApp Frame"); + cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); + cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + cmdbuf->beginDebugMarker("RaytracingPipelineApp Frame"); - const auto viewMatrix = m_camera.getViewMatrix(); - const auto projectionMatrix = m_camera.getProjectionMatrix(); - const auto viewProjectionMatrix = m_camera.getConcatenatedMatrix(); + const auto viewMatrix = m_camera.getViewMatrix(); + const auto projectionMatrix = m_camera.getProjectionMatrix(); + const auto viewProjectionMatrix = m_camera.getConcatenatedMatrix(); - core::matrix3x4SIMD modelMatrix; - modelMatrix.setTranslation(nbl::core::vectorSIMDf(0, 0, 0, 0)); - modelMatrix.setRotation(quaternion(0, 0, 0)); + core::matrix3x4SIMD modelMatrix; + modelMatrix.setTranslation(nbl::core::vectorSIMDf(0, 0, 0, 0)); + modelMatrix.setRotation(quaternion(0, 0, 0)); - core::matrix4SIMD 
modelViewProjectionMatrix = core::concatenateBFollowedByA(viewProjectionMatrix, modelMatrix); - if (m_cachedModelViewProjectionMatrix != modelViewProjectionMatrix) - { - m_frameAccumulationCounter = 0; - m_cachedModelViewProjectionMatrix = modelViewProjectionMatrix; - } - core::matrix4SIMD invModelViewProjectionMatrix; - modelViewProjectionMatrix.getInverseTransform(invModelViewProjectionMatrix); + core::matrix4SIMD modelViewProjectionMatrix = core::concatenateBFollowedByA(viewProjectionMatrix, modelMatrix); + if (m_cachedModelViewProjectionMatrix != modelViewProjectionMatrix) + { + m_frameAccumulationCounter = 0; + m_cachedModelViewProjectionMatrix = modelViewProjectionMatrix; + } + core::matrix4SIMD invModelViewProjectionMatrix; + modelViewProjectionMatrix.getInverseTransform(invModelViewProjectionMatrix); - { - IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1]; - imageBarriers[0].barrier = { - .dep = { - .srcStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT, // previous frame read from framgent shader - .srcAccessMask = ACCESS_FLAGS::SHADER_READ_BITS, - .dstStageMask = PIPELINE_STAGE_FLAGS::RAY_TRACING_SHADER_BIT, - .dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS + { + IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1]; + imageBarriers[0].barrier = { + .dep = { + .srcStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT, // previous frame read from framgent shader + .srcAccessMask = ACCESS_FLAGS::SHADER_READ_BITS, + .dstStageMask = PIPELINE_STAGE_FLAGS::RAY_TRACING_SHADER_BIT, + .dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS + } + }; + imageBarriers[0].image = m_hdrImage.get(); + imageBarriers[0].subresourceRange = { + .aspectMask = IImage::EAF_COLOR_BIT, + .baseMipLevel = 0u, + .levelCount = 1u, + .baseArrayLayer = 0u, + .layerCount = 1u + }; + imageBarriers[0].oldLayout = m_frameAccumulationCounter == 0 ? 
IImage::LAYOUT::UNDEFINED : IImage::LAYOUT::READ_ONLY_OPTIMAL; + imageBarriers[0].newLayout = IImage::LAYOUT::GENERAL; + cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers }); } - }; - imageBarriers[0].image = m_hdrImage.get(); - imageBarriers[0].subresourceRange = { - .aspectMask = IImage::EAF_COLOR_BIT, - .baseMipLevel = 0u, - .levelCount = 1u, - .baseArrayLayer = 0u, - .layerCount = 1u - }; - imageBarriers[0].oldLayout = m_frameAccumulationCounter == 0 ? IImage::LAYOUT::UNDEFINED : IImage::LAYOUT::READ_ONLY_OPTIMAL; - imageBarriers[0].newLayout = IImage::LAYOUT::GENERAL; - cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers }); - } - // Trace Rays Pass - { - SPushConstants pc; - pc.light = m_light; - pc.proceduralGeomInfoBuffer = m_proceduralGeomInfoBuffer->getDeviceAddress(); - pc.triangleGeomInfoBuffer = m_triangleGeomInfoBuffer->getDeviceAddress(); - pc.frameCounter = m_frameAccumulationCounter; - const core::vector3df camPos = m_camera.getPosition().getAsVector3df(); - pc.camPos = { camPos.X, camPos.Y, camPos.Z }; - memcpy(&pc.invMVP, invModelViewProjectionMatrix.pointer(), sizeof(pc.invMVP)); - - cmdbuf->bindRayTracingPipeline(m_rayTracingPipeline.get()); - cmdbuf->setRayTracingPipelineStackSize(m_rayTracingStackSize); - cmdbuf->pushConstants(m_rayTracingPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING, 0, sizeof(SPushConstants), &pc); - cmdbuf->bindDescriptorSets(EPBP_RAY_TRACING, m_rayTracingPipeline->getLayout(), 0, 1, &m_rayTracingDs.get()); - if (m_useIndirectCommand) - { - cmdbuf->traceRaysIndirect( - SBufferBinding{ - .offset = 0, - .buffer = m_indirectBuffer, - }); - }else - { - cmdbuf->traceRays( - m_shaderBindingTable.raygenGroupRange, - m_shaderBindingTable.missGroupsRange, m_shaderBindingTable.missGroupsStride, - m_shaderBindingTable.hitGroupsRange, m_shaderBindingTable.hitGroupsStride, - m_shaderBindingTable.callableGroupsRange, 
m_shaderBindingTable.callableGroupsStride, - WIN_W, WIN_H, 1); - } - } + // Trace Rays Pass + { + SPushConstants pc; + pc.light = m_light; + pc.proceduralGeomInfoBuffer = m_proceduralGeomInfoBuffer->getDeviceAddress(); + pc.triangleGeomInfoBuffer = m_triangleGeomInfoBuffer->getDeviceAddress(); + pc.frameCounter = m_frameAccumulationCounter; + const core::vector3df camPos = m_camera.getPosition().getAsVector3df(); + pc.camPos = { camPos.X, camPos.Y, camPos.Z }; + memcpy(&pc.invMVP, invModelViewProjectionMatrix.pointer(), sizeof(pc.invMVP)); + + cmdbuf->bindRayTracingPipeline(m_rayTracingPipeline.get()); + cmdbuf->setRayTracingPipelineStackSize(m_rayTracingStackSize); + cmdbuf->pushConstants(m_rayTracingPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING, 0, sizeof(SPushConstants), &pc); + cmdbuf->bindDescriptorSets(EPBP_RAY_TRACING, m_rayTracingPipeline->getLayout(), 0, 1, &m_rayTracingDs.get()); + if (m_useIndirectCommand) + { + cmdbuf->traceRaysIndirect( + SBufferBinding{ + .offset = 0, + .buffer = m_indirectBuffer, + }); + } + else + { + cmdbuf->traceRays( + m_shaderBindingTable.raygenGroupRange, + m_shaderBindingTable.missGroupsRange, m_shaderBindingTable.missGroupsStride, + m_shaderBindingTable.hitGroupsRange, m_shaderBindingTable.hitGroupsStride, + m_shaderBindingTable.callableGroupsRange, m_shaderBindingTable.callableGroupsStride, + WIN_W, WIN_H, 1); + } + } - // pipeline barrier - { - IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1]; - imageBarriers[0].barrier = { - .dep = { - .srcStageMask = PIPELINE_STAGE_FLAGS::RAY_TRACING_SHADER_BIT, - .srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS, - .dstStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, - .dstAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT + // pipeline barrier + { + IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1]; + imageBarriers[0].barrier = { + .dep = { + .srcStageMask = 
PIPELINE_STAGE_FLAGS::RAY_TRACING_SHADER_BIT, + .srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS, + .dstStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, + .dstAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT + } + }; + imageBarriers[0].image = m_hdrImage.get(); + imageBarriers[0].subresourceRange = { + .aspectMask = IImage::EAF_COLOR_BIT, + .baseMipLevel = 0u, + .levelCount = 1u, + .baseArrayLayer = 0u, + .layerCount = 1u + }; + imageBarriers[0].oldLayout = IImage::LAYOUT::GENERAL; + imageBarriers[0].newLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL; + + cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers }); } - }; - imageBarriers[0].image = m_hdrImage.get(); - imageBarriers[0].subresourceRange = { - .aspectMask = IImage::EAF_COLOR_BIT, - .baseMipLevel = 0u, - .levelCount = 1u, - .baseArrayLayer = 0u, - .layerCount = 1u - }; - imageBarriers[0].oldLayout = IImage::LAYOUT::GENERAL; - imageBarriers[0].newLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL; - - cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers }); - } - { + { asset::SViewport viewport; { viewport.minDepth = 1.f; @@ -797,1080 +805,1223 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, VkRect2D defaultScisors[] = { {.offset = {(int32_t)viewport.x, (int32_t)viewport.y}, .extent = {(uint32_t)viewport.width, (uint32_t)viewport.height}} }; cmdbuf->setScissor(defaultScisors); - auto scRes = static_cast(m_surface->getSwapchainResources()); - const VkRect2D currentRenderArea = - { - .offset = {0,0}, - .extent = {m_window->getWidth(),m_window->getHeight()} - }; - const IGPUCommandBuffer::SClearColorValue clearColor = { .float32 = {0.f,0.f,0.f,1.f} }; - const IGPUCommandBuffer::SRenderpassBeginInfo info = - { - .framebuffer = scRes->getFramebuffer(m_currentImageAcquire.imageIndex), - .colorClearValues = &clearColor, - .depthStencilClearValues = nullptr, - .renderArea = currentRenderArea - }; - 
nbl::video::ISemaphore::SWaitInfo waitInfo = { .semaphore = m_semaphore.get(), .value = m_realFrameIx + 1u }; - - cmdbuf->beginRenderPass(info, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE); - - cmdbuf->bindGraphicsPipeline(m_presentPipeline.get()); - cmdbuf->bindDescriptorSets(EPBP_GRAPHICS, m_presentPipeline->getLayout(), 0, 1u, &m_presentDs.get()); - ext::FullScreenTriangle::recordDrawCall(cmdbuf); - - const auto uiParams = m_ui.manager->getCreationParameters(); - auto* uiPipeline = m_ui.manager->getPipeline(); - cmdbuf->bindGraphicsPipeline(uiPipeline); - cmdbuf->bindDescriptorSets(EPBP_GRAPHICS, uiPipeline->getLayout(), uiParams.resources.texturesInfo.setIx, 1u, &m_ui.descriptorSet.get()); - m_ui.manager->render(cmdbuf, waitInfo); - - cmdbuf->endRenderPass(); + auto scRes = static_cast(m_surface->getSwapchainResources()); + const VkRect2D currentRenderArea = + { + .offset = {0,0}, + .extent = {m_window->getWidth(),m_window->getHeight()} + }; + const IGPUCommandBuffer::SClearColorValue clearColor = { .float32 = {0.f,0.f,0.f,1.f} }; + const IGPUCommandBuffer::SRenderpassBeginInfo info = + { + .framebuffer = scRes->getFramebuffer(m_currentImageAcquire.imageIndex), + .colorClearValues = &clearColor, + .depthStencilClearValues = nullptr, + .renderArea = currentRenderArea + }; + nbl::video::ISemaphore::SWaitInfo waitInfo = { .semaphore = m_semaphore.get(), .value = m_realFrameIx + 1u }; + + cmdbuf->beginRenderPass(info, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE); - } + cmdbuf->bindGraphicsPipeline(m_presentPipeline.get()); + cmdbuf->bindDescriptorSets(EPBP_GRAPHICS, m_presentPipeline->getLayout(), 0, 1u, &m_presentDs.get()); + ext::FullScreenTriangle::recordDrawCall(cmdbuf); - cmdbuf->endDebugMarker(); - cmdbuf->end(); + const auto uiParams = m_ui.manager->getCreationParameters(); + auto* uiPipeline = m_ui.manager->getPipeline(); + cmdbuf->bindGraphicsPipeline(uiPipeline); + cmdbuf->bindDescriptorSets(EPBP_GRAPHICS, uiPipeline->getLayout(), 
uiParams.resources.texturesInfo.setIx, 1u, &m_ui.descriptorSet.get()); + m_ui.manager->render(cmdbuf, waitInfo); + + cmdbuf->endRenderPass(); - { - const IQueue::SSubmitInfo::SSemaphoreInfo rendered[] = - { - { - .semaphore = m_semaphore.get(), - .value = ++m_realFrameIx, - .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS } - }; - { - { - const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] = - { - {.cmdbuf = cmdbuf } - }; - const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = - { + cmdbuf->endDebugMarker(); + cmdbuf->end(); + + { + const IQueue::SSubmitInfo::SSemaphoreInfo rendered[] = { - .semaphore = m_currentImageAcquire.semaphore, - .value = m_currentImageAcquire.acquireCount, - .stageMask = PIPELINE_STAGE_FLAGS::NONE - } - }; - const IQueue::SSubmitInfo infos[] = - { + { + .semaphore = m_semaphore.get(), + .value = ++m_realFrameIx, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS + } + }; { - .waitSemaphores = acquired, - .commandBuffers = commandBuffers, - .signalSemaphores = rendered + { + const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] = + { + {.cmdbuf = cmdbuf } + }; + + const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = + { + { + .semaphore = m_currentImageAcquire.semaphore, + .value = m_currentImageAcquire.acquireCount, + .stageMask = PIPELINE_STAGE_FLAGS::NONE + } + }; + const IQueue::SSubmitInfo infos[] = + { + { + .waitSemaphores = acquired, + .commandBuffers = commandBuffers, + .signalSemaphores = rendered + } + }; + + updateGUIDescriptorSet(); + + if (queue->submit(infos) != IQueue::RESULT::SUCCESS) + m_realFrameIx--; + } } - }; - updateGUIDescriptorSet(); + m_window->setCaption("[Nabla Engine] Ray Tracing Pipeline"); + m_surface->present(m_currentImageAcquire.imageIndex, rendered); + } + m_api->endCapture(); + m_frameAccumulationCounter++; + } + + inline void update() + { + m_camera.setMoveSpeed(m_cameraSetting.moveSpeed); + m_camera.setRotateSpeed(m_cameraSetting.rotateSpeed); + + static 
std::chrono::microseconds previousEventTimestamp{}; + + m_inputSystem->getDefaultMouse(&m_mouse); + m_inputSystem->getDefaultKeyboard(&m_keyboard); + + auto updatePresentationTimestamp = [&]() + { + m_currentImageAcquire = m_surface->acquireNextImage(); + + m_oracle.reportEndFrameRecord(); + const auto timestamp = m_oracle.getNextPresentationTimeStamp(); + m_oracle.reportBeginFrameRecord(); + + return timestamp; + }; + + const auto nextPresentationTimestamp = updatePresentationTimestamp(); + + struct + { + std::vector mouse{}; + std::vector keyboard{}; + } capturedEvents; + + m_camera.beginInputProcessing(nextPresentationTimestamp); + { + const auto& io = ImGui::GetIO(); + m_mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void + { + if (!io.WantCaptureMouse) + m_camera.mouseProcess(events); // don't capture the events, only let camera handle them with its impl + + for (const auto& e : events) // here capture + { + if (e.timeStamp < previousEventTimestamp) + continue; + + previousEventTimestamp = e.timeStamp; + capturedEvents.mouse.emplace_back(e); + + } + }, m_logger.get()); + + m_keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void + { + if (!io.WantCaptureKeyboard) + m_camera.keyboardProcess(events); // don't capture the events, only let camera handle them with its impl + + for (const auto& e : events) // here capture + { + if (e.timeStamp < previousEventTimestamp) + continue; + + previousEventTimestamp = e.timeStamp; + capturedEvents.keyboard.emplace_back(e); + } + }, m_logger.get()); - if (queue->submit(infos) != IQueue::RESULT::SUCCESS) - m_realFrameIx--; } - } + m_camera.endInputProcessing(nextPresentationTimestamp); - m_window->setCaption("[Nabla Engine] Ray Tracing Pipeline"); - m_surface->present(m_currentImageAcquire.imageIndex, rendered); - } - m_api->endCapture(); - m_frameAccumulationCounter++; - } + const core::SRange mouseEvents(capturedEvents.mouse.data(), capturedEvents.mouse.data() + 
capturedEvents.mouse.size()); + const core::SRange keyboardEvents(capturedEvents.keyboard.data(), capturedEvents.keyboard.data() + capturedEvents.keyboard.size()); + const auto cursorPosition = m_window->getCursorControl()->getPosition(); + const auto mousePosition = float32_t2(cursorPosition.x, cursorPosition.y) - float32_t2(m_window->getX(), m_window->getY()); - inline void update() - { - m_camera.setMoveSpeed(m_cameraSetting.moveSpeed); - m_camera.setRotateSpeed(m_cameraSetting.rotateSpeed); + const ext::imgui::UI::SUpdateParameters params = + { + .mousePosition = mousePosition, + .displaySize = { m_window->getWidth(), m_window->getHeight() }, + .mouseEvents = mouseEvents, + .keyboardEvents = keyboardEvents + }; + + m_ui.manager->update(params); + } - static std::chrono::microseconds previousEventTimestamp{}; + inline bool keepRunning() override + { + if (m_surface->irrecoverable()) + return false; - m_inputSystem->getDefaultMouse(&m_mouse); - m_inputSystem->getDefaultKeyboard(&m_keyboard); + return true; + } - auto updatePresentationTimestamp = [&]() - { - m_currentImageAcquire = m_surface->acquireNextImage(); + inline bool onAppTerminated() override + { + return device_base_t::onAppTerminated(); + } - m_oracle.reportEndFrameRecord(); - const auto timestamp = m_oracle.getNextPresentationTimeStamp(); - m_oracle.reportBeginFrameRecord(); +private: + uint32_t getWorkgroupCount(uint32_t dim, uint32_t size) + { + return (dim + size - 1) / size; + } - return timestamp; - }; + smart_refctd_ptr createBuffer(IGPUBuffer::SCreationParams& params) + { + smart_refctd_ptr buffer; + buffer = m_device->createBuffer(std::move(params)); + auto bufReqs = buffer->getMemoryReqs(); + bufReqs.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits(); + m_device->allocate(bufReqs, buffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); - const auto nextPresentationTimestamp = updatePresentationTimestamp(); + return buffer; + } - struct + smart_refctd_ptr 
getSingleUseCommandBufferAndBegin(smart_refctd_ptr pool) { - std::vector mouse{}; - std::vector keyboard{}; - } capturedEvents; + smart_refctd_ptr cmdbuf; + if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdbuf)) + return nullptr; + + cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); + cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + + return cmdbuf; + } - m_camera.beginInputProcessing(nextPresentationTimestamp); + void cmdbufSubmitAndWait(smart_refctd_ptr cmdbuf, CThreadSafeQueueAdapter* queue, uint64_t startValue) { - const auto& io = ImGui::GetIO(); - m_mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void + cmdbuf->end(); + + uint64_t finishedValue = startValue + 1; + + // submit builds { - if (!io.WantCaptureMouse) - m_camera.mouseProcess(events); // don't capture the events, only let camera handle them with its impl + auto completed = m_device->createSemaphore(startValue); - for (const auto& e : events) // here capture - { - if (e.timeStamp < previousEventTimestamp) - continue; + std::array signals; + { + auto& signal = signals.front(); + signal.value = finishedValue; + signal.stageMask = bitflag(PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS); + signal.semaphore = completed.get(); + } - previousEventTimestamp = e.timeStamp; - capturedEvents.mouse.emplace_back(e); + const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[1] = { { + .cmdbuf = cmdbuf.get() + } }; - } - }, m_logger.get()); + const IQueue::SSubmitInfo infos[] = + { + { + .waitSemaphores = {}, + .commandBuffers = commandBuffers, + .signalSemaphores = signals + } + }; - m_keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void - { - if (!io.WantCaptureKeyboard) - m_camera.keyboardProcess(events); // don't capture the events, only let camera handle them with its impl + if (queue->submit(infos) != IQueue::RESULT::SUCCESS) + { + m_logger->log("Failed to submit geometry transfer upload 
operations!", ILogger::ELL_ERROR); + return; + } - for (const auto& e : events) // here capture - { - if (e.timeStamp < previousEventTimestamp) - continue; + const ISemaphore::SWaitInfo info[] = + { { + .semaphore = completed.get(), + .value = finishedValue + } }; - previousEventTimestamp = e.timeStamp; - capturedEvents.keyboard.emplace_back(e); - } - }, m_logger.get()); + m_device->blockForSemaphores(info); + } + } + bool createIndirectBuffer(video::CThreadSafeQueueAdapter* queue) + { + const auto getBufferRangeAddress = [](const SBufferRange& range) + { + return range.buffer->getDeviceAddress() + range.offset; + }; + const auto command = TraceRaysIndirectCommand_t{ + .raygenShaderRecordAddress = getBufferRangeAddress(m_shaderBindingTable.raygenGroupRange), + .raygenShaderRecordSize = m_shaderBindingTable.raygenGroupRange.size, + .missShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.missGroupsRange), + .missShaderBindingTableSize = m_shaderBindingTable.missGroupsRange.size, + .missShaderBindingTableStride = m_shaderBindingTable.missGroupsStride, + .hitShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.hitGroupsRange), + .hitShaderBindingTableSize = m_shaderBindingTable.hitGroupsRange.size, + .hitShaderBindingTableStride = m_shaderBindingTable.hitGroupsStride, + .callableShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.callableGroupsRange), + .callableShaderBindingTableSize = m_shaderBindingTable.callableGroupsRange.size, + .callableShaderBindingTableStride = m_shaderBindingTable.callableGroupsStride, + .width = WIN_W, + .height = WIN_H, + .depth = 1, + }; + IGPUBuffer::SCreationParams params; + params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INDIRECT_BUFFER_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + params.size = sizeof(TraceRaysIndirectCommand_t); + m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), 
&command).move_into(m_indirectBuffer); + return true; } - m_camera.endInputProcessing(nextPresentationTimestamp); - const core::SRange mouseEvents(capturedEvents.mouse.data(), capturedEvents.mouse.data() + capturedEvents.mouse.size()); - const core::SRange keyboardEvents(capturedEvents.keyboard.data(), capturedEvents.keyboard.data() + capturedEvents.keyboard.size()); - const auto cursorPosition = m_window->getCursorControl()->getPosition(); - const auto mousePosition = float32_t2(cursorPosition.x, cursorPosition.y) - float32_t2(m_window->getX(), m_window->getY()); + void calculateRayTracingStackSize(const smart_refctd_ptr& pipeline) + { + const auto raygenStackSize = pipeline->getRaygenStackSize(); + auto getMaxSize = [&](auto ranges, auto valProj) -> uint16_t + { + auto maxValue = 0; + for (const auto& val : ranges) + { + maxValue = std::max(maxValue, std::invoke(valProj, val)); + } + return maxValue; + }; + + const auto closestHitStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::closestHit); + const auto anyHitStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::anyHit); + const auto intersectionStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::intersection); + const auto missStackMax = getMaxSize(pipeline->getMissStackSizes(), std::identity{}); + const auto callableStackMax = getMaxSize(pipeline->getCallableStackSizes(), std::identity{}); + auto firstDepthStackSizeMax = std::max(closestHitStackMax, missStackMax); + firstDepthStackSizeMax = std::max(firstDepthStackSizeMax, intersectionStackMax + anyHitStackMax); + m_rayTracingStackSize = raygenStackSize + std::max(firstDepthStackSizeMax, callableStackMax); + } - const ext::imgui::UI::SUpdateParameters params = + bool createShaderBindingTable(video::CThreadSafeQueueAdapter* queue, const smart_refctd_ptr& pipeline) { - .mousePosition = mousePosition, - .displaySize = { 
m_window->getWidth(), m_window->getHeight() }, - .mouseEvents = mouseEvents, - .keyboardEvents = keyboardEvents - }; + const auto& limits = m_device->getPhysicalDevice()->getLimits(); + const auto handleSize = SPhysicalDeviceLimits::ShaderGroupHandleSize; + const auto handleSizeAligned = nbl::core::alignUp(handleSize, limits.shaderGroupHandleAlignment); - m_ui.manager->update(params); - } + auto& raygenRange = m_shaderBindingTable.raygenGroupRange; - inline bool keepRunning() override - { - if (m_surface->irrecoverable()) - return false; + auto& hitRange = m_shaderBindingTable.hitGroupsRange; + const auto hitHandles = pipeline->getHitHandles(); - return true; - } + auto& missRange = m_shaderBindingTable.missGroupsRange; + const auto missHandles = pipeline->getMissHandles(); - inline bool onAppTerminated() override - { - return device_base_t::onAppTerminated(); - } + auto& callableRange = m_shaderBindingTable.callableGroupsRange; + const auto callableHandles = pipeline->getCallableHandles(); -private: - uint32_t getWorkgroupCount(uint32_t dim, uint32_t size) - { - return (dim + size - 1) / size; - } + raygenRange = { + .offset = 0, + .size = core::alignUp(handleSizeAligned, limits.shaderGroupBaseAlignment) + }; - smart_refctd_ptr createBuffer(IGPUBuffer::SCreationParams& params) - { - smart_refctd_ptr buffer; - buffer = m_device->createBuffer(std::move(params)); - auto bufReqs = buffer->getMemoryReqs(); - bufReqs.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits(); - m_device->allocate(bufReqs, buffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); + missRange = { + .offset = raygenRange.size, + .size = core::alignUp(missHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment), + }; + m_shaderBindingTable.missGroupsStride = handleSizeAligned; - return buffer; - } + hitRange = { + .offset = missRange.offset + missRange.size, + .size = core::alignUp(hitHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment), + }; + 
m_shaderBindingTable.hitGroupsStride = handleSizeAligned; - smart_refctd_ptr getSingleUseCommandBufferAndBegin(smart_refctd_ptr pool) - { - smart_refctd_ptr cmdbuf; - if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdbuf)) - return nullptr; + callableRange = { + .offset = hitRange.offset + hitRange.size, + .size = core::alignUp(callableHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment), + }; + m_shaderBindingTable.callableGroupsStride = handleSizeAligned; - cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); - cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + const auto bufferSize = raygenRange.size + missRange.size + hitRange.size + callableRange.size; - return cmdbuf; - } + ICPUBuffer::SCreationParams cpuBufferParams; + cpuBufferParams.size = bufferSize; + auto cpuBuffer = ICPUBuffer::create(std::move(cpuBufferParams)); + uint8_t* pData = reinterpret_cast(cpuBuffer->getPointer()); - void cmdbufSubmitAndWait(smart_refctd_ptr cmdbuf, CThreadSafeQueueAdapter* queue, uint64_t startValue) - { - cmdbuf->end(); + // copy raygen region + memcpy(pData, &pipeline->getRaygen(), handleSize); - uint64_t finishedValue = startValue + 1; + // copy miss region + uint8_t* pMissData = pData + missRange.offset; + for (const auto& handle : missHandles) + { + memcpy(pMissData, &handle, handleSize); + pMissData += m_shaderBindingTable.missGroupsStride; + } - // submit builds - { - auto completed = m_device->createSemaphore(startValue); - - std::array signals; - { - auto& signal = signals.front(); - signal.value = finishedValue; - signal.stageMask = bitflag(PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS); - signal.semaphore = completed.get(); - } - - const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[1] = { { - .cmdbuf = cmdbuf.get() - } }; - - const IQueue::SSubmitInfo infos[] = - { + // copy hit region + uint8_t* pHitData = pData + hitRange.offset; + for (const auto& handle : hitHandles) { - 
.waitSemaphores = {}, - .commandBuffers = commandBuffers, - .signalSemaphores = signals + memcpy(pHitData, &handle, handleSize); + pHitData += m_shaderBindingTable.hitGroupsStride; } - }; - if (queue->submit(infos) != IQueue::RESULT::SUCCESS) - { - m_logger->log("Failed to submit geometry transfer upload operations!", ILogger::ELL_ERROR); - return; - } + // copy callable region + uint8_t* pCallableData = pData + callableRange.offset; + for (const auto& handle : callableHandles) + { + memcpy(pCallableData, &handle, handleSize); + pCallableData += m_shaderBindingTable.callableGroupsStride; + } - const ISemaphore::SWaitInfo info[] = - { { - .semaphore = completed.get(), - .value = finishedValue - } }; + { + IGPUBuffer::SCreationParams params; + params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_SHADER_BINDING_TABLE_BIT; + params.size = bufferSize; + m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), pData).move_into(raygenRange.buffer); + missRange.buffer = core::smart_refctd_ptr(raygenRange.buffer); + hitRange.buffer = core::smart_refctd_ptr(raygenRange.buffer); + callableRange.buffer = core::smart_refctd_ptr(raygenRange.buffer); + } - m_device->blockForSemaphores(info); + return true; } - } - - bool createIndirectBuffer(video::CThreadSafeQueueAdapter* queue) - { - const auto getBufferRangeAddress = [](const SBufferRange& range) - { - return range.buffer->getDeviceAddress() + range.offset; - }; - const auto command = TraceRaysIndirectCommand_t{ - .raygenShaderRecordAddress = getBufferRangeAddress(m_shaderBindingTable.raygenGroupRange), - .raygenShaderRecordSize = m_shaderBindingTable.raygenGroupRange.size, - .missShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.missGroupsRange), - .missShaderBindingTableSize = m_shaderBindingTable.missGroupsRange.size, - .missShaderBindingTableStride = 
m_shaderBindingTable.missGroupsStride, - .hitShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.hitGroupsRange), - .hitShaderBindingTableSize = m_shaderBindingTable.hitGroupsRange.size, - .hitShaderBindingTableStride = m_shaderBindingTable.hitGroupsStride, - .callableShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.callableGroupsRange), - .callableShaderBindingTableSize = m_shaderBindingTable.callableGroupsRange.size, - .callableShaderBindingTableStride = m_shaderBindingTable.callableGroupsStride, - .width = WIN_W, - .height = WIN_H, - .depth = 1, - }; - IGPUBuffer::SCreationParams params; - params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INDIRECT_BUFFER_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - params.size = sizeof(TraceRaysIndirectCommand_t); - m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), &command).move_into(m_indirectBuffer); - return true; - } - - void calculateRayTracingStackSize(const smart_refctd_ptr& pipeline) - { - const auto raygenStackSize = pipeline->getRaygenStackSize(); - auto getMaxSize = [&](auto ranges, auto valProj) -> uint16_t - { - auto maxValue = 0; - for (const auto& val : ranges) + +#ifdef TEST_ASSET_CONV_AS + bool createAccelerationStructuresFromGeometry(video::CThreadSafeQueueAdapter* queue, const IGeometryCreator* gc) + { + // get geometries into ICPUBuffers + auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + if (!pool) + return logFail("Couldn't create Command Pool for geometry creation!"); + + const auto defaultMaterial = Material{ + .ambient = {0.2, 0.1, 0.1}, + .diffuse = {0.8, 0.3, 0.3}, + .specular = {0.8, 0.8, 0.8}, + .shininess = 1.0f, + .alpha = 1.0f, + }; + + auto getTranslationMatrix = [](float32_t x, float32_t y, float32_t z) + { + core::matrix3x4SIMD transform; + transform.setTranslation(nbl::core::vectorSIMDf(x, y, z, 0)); + 
return transform; + }; + + core::matrix3x4SIMD planeTransform; + planeTransform.setRotation(quaternion::fromAngleAxis(core::radians(-90.0f), vector3df_SIMD{ 1, 0, 0 })); + + // triangles geometries + const auto cpuObjects = std::array{ + ReferenceObjectCpu { + .meta = {.type = OT_RECTANGLE, .name = "Plane Mesh"}, + .data = gc->createRectangleMesh(nbl::core::vector2df_SIMD(10, 10)), + .material = defaultMaterial, + .transform = planeTransform, + }, + ReferenceObjectCpu { + .meta = {.type = OT_CUBE, .name = "Cube Mesh"}, + .data = gc->createCubeMesh(nbl::core::vector3df(1, 1, 1)), + .material = defaultMaterial, + .transform = getTranslationMatrix(0, 0.5f, 0), + }, + ReferenceObjectCpu { + .meta = {.type = OT_CUBE, .name = "Cube Mesh 2"}, + .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)), + .material = Material{ + .ambient = {0.1, 0.1, 0.2}, + .diffuse = {0.2, 0.2, 0.8}, + .specular = {0.8, 0.8, 0.8}, + .shininess = 1.0f, + }, + .transform = getTranslationMatrix(-5.0f, 1.0f, 0), + }, + ReferenceObjectCpu { + .meta = {.type = OT_CUBE, .name = "Transparent Cube Mesh"}, + .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)), + .material = Material{ + .ambient = {0.1, 0.2, 0.1}, + .diffuse = {0.2, 0.8, 0.2}, + .specular = {0.8, 0.8, 0.8}, + .shininess = 1.0f, + .alpha = 0.2, + }, + .transform = getTranslationMatrix(5.0f, 1.0f, 0), + }, + }; + + struct CPUTriBufferBindings { - maxValue = std::max(maxValue, std::invoke(valProj, val)); - } - return maxValue; - }; - - const auto closestHitStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::closestHit); - const auto anyHitStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::anyHit); - const auto intersectionStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::intersection); - const auto missStackMax = getMaxSize(pipeline->getMissStackSizes(), std::identity{}); - const 
auto callableStackMax = getMaxSize(pipeline->getCallableStackSizes(), std::identity{}); - auto firstDepthStackSizeMax = std::max(closestHitStackMax, missStackMax); - firstDepthStackSizeMax = std::max(firstDepthStackSizeMax, intersectionStackMax + anyHitStackMax); - m_rayTracingStackSize = raygenStackSize + std::max(firstDepthStackSizeMax, callableStackMax); - } - - bool createShaderBindingTable(video::CThreadSafeQueueAdapter* queue, const smart_refctd_ptr& pipeline) - { - const auto& limits = m_device->getPhysicalDevice()->getLimits(); - const auto handleSize = SPhysicalDeviceLimits::ShaderGroupHandleSize; - const auto handleSizeAligned = nbl::core::alignUp(handleSize, limits.shaderGroupHandleAlignment); - - auto& raygenRange = m_shaderBindingTable.raygenGroupRange; - - auto& hitRange = m_shaderBindingTable.hitGroupsRange; - const auto hitHandles = pipeline->getHitHandles(); - - auto& missRange = m_shaderBindingTable.missGroupsRange; - const auto missHandles = pipeline->getMissHandles(); - - auto& callableRange = m_shaderBindingTable.callableGroupsRange; - const auto callableHandles = pipeline->getCallableHandles(); - - raygenRange = { - .offset = 0, - .size = core::alignUp(handleSizeAligned, limits.shaderGroupBaseAlignment) - }; + nbl::asset::SBufferBinding vertex, index; + }; + std::array cpuTriBuffers; - missRange = { - .offset = raygenRange.size, - .size = core::alignUp(missHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment), - }; - m_shaderBindingTable.missGroupsStride = handleSizeAligned; + for (uint32_t i = 0; i < cpuObjects.size(); i++) + { + const auto& cpuObject = cpuObjects[i]; - hitRange = { - .offset = missRange.offset + missRange.size, - .size = core::alignUp(hitHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment), - }; - m_shaderBindingTable.hitGroupsStride = handleSizeAligned; + auto vBuffer = smart_refctd_ptr(cpuObject.data.bindings[0].buffer); // no offset + auto vUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) 
| IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | + IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + vBuffer->addUsageFlags(vUsage); + vBuffer->setContentHash(vBuffer->computeContentHash()); - callableRange = { - .offset = hitRange.offset + hitRange.size, - .size = core::alignUp(callableHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment), - }; - m_shaderBindingTable.callableGroupsStride = handleSizeAligned; + auto iBuffer = smart_refctd_ptr(cpuObject.data.indexBuffer.buffer); // no offset + auto iUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | + IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - const auto bufferSize = raygenRange.size + missRange.size + hitRange.size + callableRange.size; + if (cpuObject.data.indexType != EIT_UNKNOWN) + if (iBuffer) + { + iBuffer->addUsageFlags(iUsage); + iBuffer->setContentHash(iBuffer->computeContentHash()); + } - ICPUBuffer::SCreationParams cpuBufferParams; - cpuBufferParams.size = bufferSize; - auto cpuBuffer = ICPUBuffer::create(std::move(cpuBufferParams)); - uint8_t* pData = reinterpret_cast(cpuBuffer->getPointer()); + cpuTriBuffers[i] = { + .vertex = {.offset = 0, .buffer = vBuffer}, + .index = {.offset = 0, .buffer = iBuffer}, + }; - // copy raygen region - memcpy(pData, &pipeline->getRaygen(), handleSize); + } - // copy miss region - uint8_t* pMissData = pData + missRange.offset; - for (const auto& handle : missHandles) - { - memcpy(pMissData, &handle, handleSize); - pMissData += m_shaderBindingTable.missGroupsStride; - } + // procedural geometries + using Aabb = IGPUBottomLevelAccelerationStructure::AABB_t; - // copy hit region - uint8_t* pHitData = pData + hitRange.offset; - for (const auto& handle : hitHandles) - { - memcpy(pHitData, &handle, handleSize); - pHitData += 
m_shaderBindingTable.hitGroupsStride; - } + smart_refctd_ptr cpuProcBuffer; + { + ICPUBuffer::SCreationParams params; + params.size = NumberOfProceduralGeometries * sizeof(Aabb); + cpuProcBuffer = ICPUBuffer::create(std::move(params)); + } - // copy callable region - uint8_t* pCallableData = pData + callableRange.offset; - for (const auto& handle : callableHandles) - { - memcpy(pCallableData, &handle, handleSize); - pCallableData += m_shaderBindingTable.callableGroupsStride; - } + core::vector proceduralGeoms; + proceduralGeoms.reserve(NumberOfProceduralGeometries); + auto proceduralGeometries = reinterpret_cast(cpuProcBuffer->getPointer()); + for (int32_t i = 0; i < NumberOfProceduralGeometries; i++) + { + const auto middle_i = NumberOfProceduralGeometries / 2.0; + SProceduralGeomInfo sphere = { + .material = hlsl::_static_cast(Material{ + .ambient = {0.1, 0.05 * i, 0.1}, + .diffuse = {0.3, 0.2 * i, 0.3}, + .specular = {0.8, 0.8, 0.8}, + .shininess = 1.0f, + }), + .center = float32_t3((i - middle_i) * 4.0, 2, 5.0), + .radius = 1, + }; + + proceduralGeoms.push_back(sphere); + const auto sphereMin = sphere.center - sphere.radius; + const auto sphereMax = sphere.center + sphere.radius; + proceduralGeometries[i] = { + vector3d(sphereMin.x, sphereMin.y, sphereMin.z), + vector3d(sphereMax.x, sphereMax.y, sphereMax.z) + }; + } - { - IGPUBuffer::SCreationParams params; - params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_SHADER_BINDING_TABLE_BIT; - params.size = bufferSize; - m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), pData).move_into(raygenRange.buffer); - missRange.buffer = core::smart_refctd_ptr(raygenRange.buffer); - hitRange.buffer = core::smart_refctd_ptr(raygenRange.buffer); - callableRange.buffer = core::smart_refctd_ptr(raygenRange.buffer); - } + // get ICPUBuffers into ICPUBLAS - return true; - } + // 
get ICPUBLAS into ICPUTLAS -#ifdef TEST_ASSET_CONV_AS + // reserve, convert + return true; + } #else - bool createGeometries(video::CThreadSafeQueueAdapter* queue, const IGeometryCreator* gc) - { - auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - if (!pool) - return logFail("Couldn't create Command Pool for geometry creation!"); - - const auto defaultMaterial = Material{ - .ambient = {0.2, 0.1, 0.1}, - .diffuse = {0.8, 0.3, 0.3}, - .specular = {0.8, 0.8, 0.8}, - .shininess = 1.0f, - .alpha = 1.0f, - }; - - auto getTranslationMatrix = [](float32_t x, float32_t y, float32_t z) - { - core::matrix3x4SIMD transform; - transform.setTranslation(nbl::core::vectorSIMDf(x, y, z, 0)); - return transform; - }; - - core::matrix3x4SIMD planeTransform; - planeTransform.setRotation(quaternion::fromAngleAxis(core::radians(-90.0f), vector3df_SIMD{ 1, 0, 0 })); - - const auto cpuObjects = std::array{ - ReferenceObjectCpu { - .meta = {.type = OT_RECTANGLE, .name = "Plane Mesh"}, - .data = gc->createRectangleMesh(nbl::core::vector2df_SIMD(10, 10)), - .material = defaultMaterial, - .transform = planeTransform, - }, - ReferenceObjectCpu { - .meta = {.type = OT_CUBE, .name = "Cube Mesh"}, - .data = gc->createCubeMesh(nbl::core::vector3df(1, 1, 1)), - .material = defaultMaterial, - .transform = getTranslationMatrix(0, 0.5f, 0), - }, - ReferenceObjectCpu { - .meta = {.type = OT_CUBE, .name = "Cube Mesh 2"}, - .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)), - .material = Material{ - .ambient = {0.1, 0.1, 0.2}, - .diffuse = {0.2, 0.2, 0.8}, - .specular = {0.8, 0.8, 0.8}, - .shininess = 1.0f, + bool createGeometries(video::CThreadSafeQueueAdapter* queue, const IGeometryCreator* gc) + { + auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + if (!pool) + return logFail("Couldn't create Command Pool for geometry creation!"); + + const 
auto defaultMaterial = Material{ + .ambient = {0.2, 0.1, 0.1}, + .diffuse = {0.8, 0.3, 0.3}, + .specular = {0.8, 0.8, 0.8}, + .shininess = 1.0f, + .alpha = 1.0f, + }; + + auto getTranslationMatrix = [](float32_t x, float32_t y, float32_t z) + { + core::matrix3x4SIMD transform; + transform.setTranslation(nbl::core::vectorSIMDf(x, y, z, 0)); + return transform; + }; + + core::matrix3x4SIMD planeTransform; + planeTransform.setRotation(quaternion::fromAngleAxis(core::radians(-90.0f), vector3df_SIMD{ 1, 0, 0 })); + + const auto cpuObjects = std::array{ + ReferenceObjectCpu { + .meta = {.type = OT_RECTANGLE, .name = "Plane Mesh"}, + .data = gc->createRectangleMesh(nbl::core::vector2df_SIMD(10, 10)), + .material = defaultMaterial, + .transform = planeTransform, }, - .transform = getTranslationMatrix(-5.0f, 1.0f, 0), - }, - ReferenceObjectCpu { - .meta = {.type = OT_CUBE, .name = "Transparent Cube Mesh"}, - .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)), - .material = Material{ - .ambient = {0.1, 0.2, 0.1}, - .diffuse = {0.2, 0.8, 0.2}, - .specular = {0.8, 0.8, 0.8}, - .shininess = 1.0f, - .alpha = 0.2, + ReferenceObjectCpu { + .meta = {.type = OT_CUBE, .name = "Cube Mesh"}, + .data = gc->createCubeMesh(nbl::core::vector3df(1, 1, 1)), + .material = defaultMaterial, + .transform = getTranslationMatrix(0, 0.5f, 0), }, - .transform = getTranslationMatrix(5.0f, 1.0f, 0), - }, - }; - - struct ScratchVIBindings - { - nbl::asset::SBufferBinding vertex, index; - }; - std::array scratchBuffers; - - for (uint32_t i = 0; i < cpuObjects.size(); i++) - { - const auto& cpuObject = cpuObjects[i]; - - auto vBuffer = smart_refctd_ptr(cpuObject.data.bindings[0].buffer); // no offset - auto vUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | - IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - vBuffer->addUsageFlags(vUsage); - 
vBuffer->setContentHash(vBuffer->computeContentHash()); - - auto iBuffer = smart_refctd_ptr(cpuObject.data.indexBuffer.buffer); // no offset - auto iUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | - IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - - if (cpuObject.data.indexType != EIT_UNKNOWN) - if (iBuffer) - { - iBuffer->addUsageFlags(iUsage); - iBuffer->setContentHash(iBuffer->computeContentHash()); - } + ReferenceObjectCpu { + .meta = {.type = OT_CUBE, .name = "Cube Mesh 2"}, + .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)), + .material = Material{ + .ambient = {0.1, 0.1, 0.2}, + .diffuse = {0.2, 0.2, 0.8}, + .specular = {0.8, 0.8, 0.8}, + .shininess = 1.0f, + }, + .transform = getTranslationMatrix(-5.0f, 1.0f, 0), + }, + ReferenceObjectCpu { + .meta = {.type = OT_CUBE, .name = "Transparent Cube Mesh"}, + .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)), + .material = Material{ + .ambient = {0.1, 0.2, 0.1}, + .diffuse = {0.2, 0.8, 0.2}, + .specular = {0.8, 0.8, 0.8}, + .shininess = 1.0f, + .alpha = 0.2, + }, + .transform = getTranslationMatrix(5.0f, 1.0f, 0), + }, + }; - scratchBuffers[i] = { - .vertex = {.offset = 0, .buffer = vBuffer}, - .index = {.offset = 0, .buffer = iBuffer}, - }; - - } - - auto cmdbuf = getSingleUseCommandBufferAndBegin(pool); - cmdbuf->beginDebugMarker("Build geometry vertex and index buffers"); - - CAssetConverter::SInputs inputs = {}; - inputs.logger = m_logger.get(); - std::array tmpBuffers; - { - for (uint32_t i = 0; i < cpuObjects.size(); i++) - { - tmpBuffers[2 * i + 0] = scratchBuffers[i].vertex.buffer.get(); - tmpBuffers[2 * i + 1] = scratchBuffers[i].index.buffer.get(); - } - - std::get>(inputs.assets) = tmpBuffers; - } - - auto reservation = m_converter->reserve(inputs); - { - auto prepass = [&](const auto & references) -> bool - { - auto objects = 
reservation.getGPUObjects(); - uint32_t counter = {}; - for (auto& object : objects) - { - auto gpu = object.value; - auto* reference = references[counter]; + struct ScratchVIBindings + { + nbl::asset::SBufferBinding vertex, index; + }; + std::array scratchBuffers; - if (reference) - { - if (!gpu) - { - m_logger->log("Failed to convert a CPU object to GPU!", ILogger::ELL_ERROR); - return false; - } - } - counter++; - } - return true; - }; - - prepass.template operator() < ICPUBuffer > (tmpBuffers); - } - - auto geomInfoBuffer = ICPUBuffer::create({ std::size(cpuObjects) * sizeof(STriangleGeomInfo) }); - STriangleGeomInfo* geomInfos = reinterpret_cast(geomInfoBuffer->getPointer()); - - m_gpuTriangleGeometries.reserve(std::size(cpuObjects)); - // convert - { - // not sure if need this (probably not, originally for transition img view) - auto semaphore = m_device->createSemaphore(0u); - - std::array cmdbufs = {}; - cmdbufs.front().cmdbuf = cmdbuf.get(); - - SIntendedSubmitInfo transfer = {}; - transfer.queue = queue; - transfer.scratchCommandBuffers = cmdbufs; - transfer.scratchSemaphore = { - .semaphore = semaphore.get(), - .value = 0u, - .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS - }; - - CAssetConverter::SConvertParams params = {}; - params.utilities = m_utils.get(); - params.transfer = &transfer; - - auto future = reservation.convert(params); - if (future.copy() != IQueue::RESULT::SUCCESS) - { - m_logger->log("Failed to await submission feature!", ILogger::ELL_ERROR); - return false; - } - - auto&& buffers = reservation.getGPUObjects(); - for (uint32_t i = 0; i < cpuObjects.size(); i++) - { - auto& cpuObject = cpuObjects[i]; - - m_gpuTriangleGeometries.push_back(ReferenceObjectGpu{ - .meta = cpuObject.meta, - .bindings = { - .vertex = {.offset = 0, .buffer = buffers[2 * i + 0].value }, - .index = {.offset = 0, .buffer = buffers[2 * i + 1].value }, - }, - .vertexStride = cpuObject.data.inputParams.bindings[0].stride, - .indexType = 
cpuObject.data.indexType, - .indexCount = cpuObject.data.indexCount, - .material = hlsl::_static_cast(cpuObject.material), - .transform = cpuObject.transform, - }); - } - - for (uint32_t i = 0; i < m_gpuTriangleGeometries.size(); i++) - { - const auto& gpuObject = m_gpuTriangleGeometries[i]; - const uint64_t vertexBufferAddress = gpuObject.bindings.vertex.buffer->getDeviceAddress(); - geomInfos[i] = { - .material = gpuObject.material, - .vertexBufferAddress = vertexBufferAddress, - .indexBufferAddress = gpuObject.useIndex() ? gpuObject.bindings.index.buffer->getDeviceAddress() : vertexBufferAddress, - .vertexStride = gpuObject.vertexStride, - .objType = gpuObject.meta.type, - .indexType = gpuObject.indexType, - .smoothNormals = s_smoothNormals[gpuObject.meta.type], - }; - } - } - - { - IGPUBuffer::SCreationParams params; - params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - params.size = geomInfoBuffer->getSize(); - m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), geomInfos).move_into(m_triangleGeomInfoBuffer); - } - - // intersection geometries setup - { - core::vector proceduralGeoms; - proceduralGeoms.reserve(NumberOfProceduralGeometries); - using Aabb = IGPUBottomLevelAccelerationStructure::AABB_t; - core::vector aabbs; - aabbs.reserve(NumberOfProceduralGeometries); - for (int32_t i = 0; i < NumberOfProceduralGeometries; i++) - { - const auto middle_i = NumberOfProceduralGeometries / 2.0; - SProceduralGeomInfo sphere = { - .material = hlsl::_static_cast(Material{ - .ambient = {0.1, 0.05 * i, 0.1}, - .diffuse = {0.3, 0.2 * i, 0.3}, - .specular = {0.8, 0.8, 0.8}, - .shininess = 1.0f, - }), - .center = float32_t3((i - middle_i) * 4.0, 2, 5.0), - .radius = 1, - }; - - proceduralGeoms.push_back(sphere); - const auto sphereMin = sphere.center - sphere.radius; - const auto sphereMax = 
sphere.center + sphere.radius; - aabbs.emplace_back( - vector3d(sphereMin.x, sphereMin.y, sphereMin.z), - vector3d(sphereMax.x, sphereMax.y, sphereMax.z)); - } - - { - IGPUBuffer::SCreationParams params; - params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - params.size = proceduralGeoms.size() * sizeof(SProceduralGeomInfo); - m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), proceduralGeoms.data()).move_into(m_proceduralGeomInfoBuffer); - } - - { - IGPUBuffer::SCreationParams params; - params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT; - params.size = aabbs.size() * sizeof(Aabb); - m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), aabbs.data()).move_into(m_proceduralAabbBuffer); - } - } - - return true; - } - - bool createAccelerationStructures(video::CThreadSafeQueueAdapter* queue) - { - // plus 1 blas for procedural geometry contains {{var::NumberOfProcedural}} - // spheres. 
Each sphere is a primitive instead one instance or geometry - const auto blasCount = m_gpuTriangleGeometries.size() + 1; - const auto proceduralBlasIdx = m_gpuTriangleGeometries.size(); - - IQueryPool::SCreationParams qParams{ .queryCount = static_cast(blasCount), .queryType = IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE }; - smart_refctd_ptr queryPool = m_device->createQueryPool(std::move(qParams)); - - auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT); - if (!pool) - return logFail("Couldn't create Command Pool for blas/tlas creation!"); - - m_api->startCapture(); -#ifdef TRY_BUILD_FOR_NGFX // NSight is "debugger-challenged" it can't capture anything not happenning "during a frame", so we need to trick it - m_currentImageAcquire = m_surface->acquireNextImage(); - { - const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { { - .semaphore = m_currentImageAcquire.semaphore, - .value = m_currentImageAcquire.acquireCount, - .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS - } }; - m_surface->present(m_currentImageAcquire.imageIndex, acquired); - } - m_currentImageAcquire = m_surface->acquireNextImage(); -#endif - size_t totalScratchSize = 0; - const auto scratchOffsetAlignment = m_device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment; + for (uint32_t i = 0; i < cpuObjects.size(); i++) + { + const auto& cpuObject = cpuObjects[i]; - // build bottom level ASes - { - core::vector primitiveCounts(blasCount); - core::vector> triangles(m_gpuTriangleGeometries.size()); - core::vector scratchSizes(blasCount); - IGPUBottomLevelAccelerationStructure::AABBs aabbs; - - auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT; - if (m_physicalDevice->getProperties().limits.rayTracingPositionFetch) - 
blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_DATA_ACCESS; - - IGPUBottomLevelAccelerationStructure::DeviceBuildInfo initBuildInfo; - initBuildInfo.buildFlags = blasFlags; - initBuildInfo.geometryCount = 1; // only 1 geometry object per blas - initBuildInfo.srcAS = nullptr; - initBuildInfo.dstAS = nullptr; - initBuildInfo.scratch = {}; - - auto blasBuildInfos = core::vector(blasCount, initBuildInfo); - - m_gpuBlasList.resize(blasCount); - // setup blas info for triangle geometries - for (uint32_t i = 0; i < blasCount; i++) - { - const auto isProcedural = i == proceduralBlasIdx; - if (isProcedural) + auto vBuffer = smart_refctd_ptr(cpuObject.data.bindings[0].buffer); // no offset + auto vUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | + IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + vBuffer->addUsageFlags(vUsage); + vBuffer->setContentHash(vBuffer->computeContentHash()); + + auto iBuffer = smart_refctd_ptr(cpuObject.data.indexBuffer.buffer); // no offset + auto iUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | + IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + + if (cpuObject.data.indexType != EIT_UNKNOWN) + if (iBuffer) + { + iBuffer->addUsageFlags(iUsage); + iBuffer->setContentHash(iBuffer->computeContentHash()); + } + + scratchBuffers[i] = { + .vertex = {.offset = 0, .buffer = vBuffer}, + .index = {.offset = 0, .buffer = iBuffer}, + }; + + } + + auto cmdbuf = getSingleUseCommandBufferAndBegin(pool); + cmdbuf->beginDebugMarker("Build geometry vertex and index buffers"); + + CAssetConverter::SInputs inputs = {}; + inputs.logger = m_logger.get(); + std::array tmpBuffers; { - aabbs.data.buffer = smart_refctd_ptr(m_proceduralAabbBuffer); - 
aabbs.data.offset = 0; - aabbs.stride = sizeof(IGPUBottomLevelAccelerationStructure::AABB_t); - aabbs.geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; // only allow opaque for now - - primitiveCounts[proceduralBlasIdx] = NumberOfProceduralGeometries; - blasBuildInfos[proceduralBlasIdx].aabbs = &aabbs; - blasBuildInfos[proceduralBlasIdx].buildFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT; - } else + for (uint32_t i = 0; i < cpuObjects.size(); i++) + { + tmpBuffers[2 * i + 0] = scratchBuffers[i].vertex.buffer.get(); + tmpBuffers[2 * i + 1] = scratchBuffers[i].index.buffer.get(); + } + + std::get>(inputs.assets) = tmpBuffers; + } + + auto reservation = m_converter->reserve(inputs); { - const auto& gpuObject = m_gpuTriangleGeometries[i]; - - const uint32_t vertexStride = gpuObject.vertexStride; - const uint32_t numVertices = gpuObject.bindings.vertex.buffer->getSize() / vertexStride; - if (gpuObject.useIndex()) - primitiveCounts[i] = gpuObject.indexCount / 3; - else - primitiveCounts[i] = numVertices / 3; - - triangles[i].vertexData[0] = gpuObject.bindings.vertex; - triangles[i].indexData = gpuObject.useIndex() ? gpuObject.bindings.index : gpuObject.bindings.vertex; - triangles[i].maxVertex = numVertices - 1; - triangles[i].vertexStride = vertexStride; - triangles[i].vertexFormat = EF_R32G32B32_SFLOAT; - triangles[i].indexType = gpuObject.indexType; - triangles[i].geometryFlags = gpuObject.material.isTransparent() ? 
- IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::NO_DUPLICATE_ANY_HIT_INVOCATION_BIT : - IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; - - blasBuildInfos[i].triangles = &triangles[i]; + auto prepass = [&](const auto & references) -> bool + { + auto objects = reservation.getGPUObjects(); + uint32_t counter = {}; + for (auto& object : objects) + { + auto gpu = object.value; + auto* reference = references[counter]; + + if (reference) + { + if (!gpu) + { + m_logger->log("Failed to convert a CPU object to GPU!", ILogger::ELL_ERROR); + return false; + } + } + counter++; + } + return true; + }; + + prepass.template operator() < ICPUBuffer > (tmpBuffers); } - ILogicalDevice::AccelerationStructureBuildSizes buildSizes; + + auto geomInfoBuffer = ICPUBuffer::create({ std::size(cpuObjects) * sizeof(STriangleGeomInfo) }); + STriangleGeomInfo* geomInfos = reinterpret_cast(geomInfoBuffer->getPointer()); + + m_gpuTriangleGeometries.reserve(std::size(cpuObjects)); + // convert { - const uint32_t maxPrimCount[1] = { primitiveCounts[i] }; - if (isProcedural) - { - const auto* aabbData = &aabbs; - buildSizes = m_device->getAccelerationStructureBuildSizes(blasBuildInfos[i].buildFlags, false, std::span{ aabbData, 1}, maxPrimCount); - } - else - { - const auto* trianglesData = triangles.data(); - buildSizes = m_device->getAccelerationStructureBuildSizes(blasBuildInfos[i].buildFlags, false, std::span{trianglesData,1}, maxPrimCount); - } - if (!buildSizes) - return logFail("Failed to get BLAS build sizes"); + // not sure if need this (probably not, originally for transition img view) + auto semaphore = m_device->createSemaphore(0u); + + std::array cmdbufs = {}; + cmdbufs.front().cmdbuf = cmdbuf.get(); + + SIntendedSubmitInfo transfer = {}; + transfer.queue = queue; + transfer.scratchCommandBuffers = cmdbufs; + transfer.scratchSemaphore = { + .semaphore = semaphore.get(), + .value = 0u, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS + }; + + 
CAssetConverter::SConvertParams params = {}; + params.utilities = m_utils.get(); + params.transfer = &transfer; + + auto future = reservation.convert(params); + if (future.copy() != IQueue::RESULT::SUCCESS) + { + m_logger->log("Failed to await submission feature!", ILogger::ELL_ERROR); + return false; + } + + auto&& buffers = reservation.getGPUObjects(); + for (uint32_t i = 0; i < cpuObjects.size(); i++) + { + auto& cpuObject = cpuObjects[i]; + + m_gpuTriangleGeometries.push_back(ReferenceObjectGpu{ + .meta = cpuObject.meta, + .bindings = { + .vertex = {.offset = 0, .buffer = buffers[2 * i + 0].value }, + .index = {.offset = 0, .buffer = buffers[2 * i + 1].value }, + }, + .vertexStride = cpuObject.data.inputParams.bindings[0].stride, + .indexType = cpuObject.data.indexType, + .indexCount = cpuObject.data.indexCount, + .material = hlsl::_static_cast(cpuObject.material), + .transform = cpuObject.transform, + }); + } + + for (uint32_t i = 0; i < m_gpuTriangleGeometries.size(); i++) + { + const auto& gpuObject = m_gpuTriangleGeometries[i]; + const uint64_t vertexBufferAddress = gpuObject.bindings.vertex.buffer->getDeviceAddress(); + geomInfos[i] = { + .material = gpuObject.material, + .vertexBufferAddress = vertexBufferAddress, + .indexBufferAddress = gpuObject.useIndex() ? 
gpuObject.bindings.index.buffer->getDeviceAddress() : vertexBufferAddress, + .vertexStride = gpuObject.vertexStride, + .objType = gpuObject.meta.type, + .indexType = gpuObject.indexType, + .smoothNormals = s_smoothNormals[gpuObject.meta.type], + }; + } } - scratchSizes[i] = buildSizes.buildScratchSize; - totalScratchSize = core::alignUp(totalScratchSize, scratchOffsetAlignment); - totalScratchSize += buildSizes.buildScratchSize; + { + IGPUBuffer::SCreationParams params; + params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + params.size = geomInfoBuffer->getSize(); + m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), geomInfos).move_into(m_triangleGeomInfoBuffer); + } + // intersection geometries setup { - IGPUBuffer::SCreationParams params; - params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT; - params.size = buildSizes.accelerationStructureSize; - smart_refctd_ptr asBuffer = createBuffer(params); - - IGPUBottomLevelAccelerationStructure::SCreationParams blasParams; - blasParams.bufferRange.buffer = asBuffer; - blasParams.bufferRange.offset = 0u; - blasParams.bufferRange.size = buildSizes.accelerationStructureSize; - blasParams.flags = IGPUBottomLevelAccelerationStructure::SCreationParams::FLAGS::NONE; - m_gpuBlasList[i] = m_device->createBottomLevelAccelerationStructure(std::move(blasParams)); - if (!m_gpuBlasList[i]) - return logFail("Could not create BLAS"); + core::vector proceduralGeoms; + proceduralGeoms.reserve(NumberOfProceduralGeometries); + using Aabb = IGPUBottomLevelAccelerationStructure::AABB_t; + core::vector aabbs; + aabbs.reserve(NumberOfProceduralGeometries); + for (int32_t i = 0; i < NumberOfProceduralGeometries; i++) + { + const auto middle_i = NumberOfProceduralGeometries / 2.0; + SProceduralGeomInfo 
sphere = { + .material = hlsl::_static_cast(Material{ + .ambient = {0.1, 0.05 * i, 0.1}, + .diffuse = {0.3, 0.2 * i, 0.3}, + .specular = {0.8, 0.8, 0.8}, + .shininess = 1.0f, + }), + .center = float32_t3((i - middle_i) * 4.0, 2, 5.0), + .radius = 1, + }; + + proceduralGeoms.push_back(sphere); + const auto sphereMin = sphere.center - sphere.radius; + const auto sphereMax = sphere.center + sphere.radius; + aabbs.emplace_back( + vector3d(sphereMin.x, sphereMin.y, sphereMin.z), + vector3d(sphereMax.x, sphereMax.y, sphereMax.z)); + } + + { + IGPUBuffer::SCreationParams params; + params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + params.size = proceduralGeoms.size() * sizeof(SProceduralGeomInfo); + m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), proceduralGeoms.data()).move_into(m_proceduralGeomInfoBuffer); + } + + { + IGPUBuffer::SCreationParams params; + params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT; + params.size = aabbs.size() * sizeof(Aabb); + m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), aabbs.data()).move_into(m_proceduralAabbBuffer); + } } - } + return true; + } - auto cmdbufBlas = getSingleUseCommandBufferAndBegin(pool); - cmdbufBlas->beginDebugMarker("Build BLAS"); + bool createAccelerationStructures(video::CThreadSafeQueueAdapter* queue) + { + // plus 1 blas for procedural geometry contains {{var::NumberOfProcedural}} + // spheres. 
Each sphere is a primitive instead one instance or geometry + const auto blasCount = m_gpuTriangleGeometries.size() + 1; + const auto proceduralBlasIdx = m_gpuTriangleGeometries.size(); - cmdbufBlas->resetQueryPool(queryPool.get(), 0, blasCount); + IQueryPool::SCreationParams qParams{ .queryCount = static_cast(blasCount), .queryType = IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE }; + smart_refctd_ptr queryPool = m_device->createQueryPool(std::move(qParams)); - smart_refctd_ptr scratchBuffer; - { - IGPUBuffer::SCreationParams params; - params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT; - params.size = totalScratchSize; - scratchBuffer = createBuffer(params); - } - - core::vector buildRangeInfos(blasCount); - core::vector pRangeInfos(blasCount); - for (uint32_t i = 0; i < blasCount; i++) - { - blasBuildInfos[i].dstAS = m_gpuBlasList[i].get(); - blasBuildInfos[i].scratch.buffer = scratchBuffer; - if (i == 0) - { - blasBuildInfos[i].scratch.offset = 0u; - } else + auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT); + if (!pool) + return logFail("Couldn't create Command Pool for blas/tlas creation!"); + + m_api->startCapture(); +#ifdef TRY_BUILD_FOR_NGFX // NSight is "debugger-challenged" it can't capture anything not happenning "during a frame", so we need to trick it + m_currentImageAcquire = m_surface->acquireNextImage(); { - const auto unalignedOffset = blasBuildInfos[i - 1].scratch.offset + scratchSizes[i - 1]; - blasBuildInfos[i].scratch.offset = core::alignUp(unalignedOffset, scratchOffsetAlignment); + const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { { + .semaphore = m_currentImageAcquire.semaphore, + .value = m_currentImageAcquire.acquireCount, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS + } }; + m_surface->present(m_currentImageAcquire.imageIndex, acquired); } + 
m_currentImageAcquire = m_surface->acquireNextImage(); +#endif + size_t totalScratchSize = 0; + const auto scratchOffsetAlignment = m_device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment; + + // build bottom level ASes + { + core::vector primitiveCounts(blasCount); + core::vector> triangles(m_gpuTriangleGeometries.size()); + core::vector scratchSizes(blasCount); + IGPUBottomLevelAccelerationStructure::AABBs aabbs; + + auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT; + if (m_physicalDevice->getProperties().limits.rayTracingPositionFetch) + blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_DATA_ACCESS; + + IGPUBottomLevelAccelerationStructure::DeviceBuildInfo initBuildInfo; + initBuildInfo.buildFlags = blasFlags; + initBuildInfo.geometryCount = 1; // only 1 geometry object per blas + initBuildInfo.srcAS = nullptr; + initBuildInfo.dstAS = nullptr; + initBuildInfo.scratch = {}; + + auto blasBuildInfos = core::vector(blasCount, initBuildInfo); + + m_gpuBlasList.resize(blasCount); + // setup blas info for triangle geometries + for (uint32_t i = 0; i < blasCount; i++) + { + const auto isProcedural = i == proceduralBlasIdx; + if (isProcedural) + { + aabbs.data.buffer = smart_refctd_ptr(m_proceduralAabbBuffer); + aabbs.data.offset = 0; + aabbs.stride = sizeof(IGPUBottomLevelAccelerationStructure::AABB_t); + aabbs.geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; // only allow opaque for now + + primitiveCounts[proceduralBlasIdx] = NumberOfProceduralGeometries; + blasBuildInfos[proceduralBlasIdx].aabbs = &aabbs; + blasBuildInfos[proceduralBlasIdx].buildFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT; + } + else + { + const auto& gpuObject = m_gpuTriangleGeometries[i]; + + const uint32_t vertexStride = gpuObject.vertexStride; + 
const uint32_t numVertices = gpuObject.bindings.vertex.buffer->getSize() / vertexStride; + if (gpuObject.useIndex()) + primitiveCounts[i] = gpuObject.indexCount / 3; + else + primitiveCounts[i] = numVertices / 3; + + triangles[i].vertexData[0] = gpuObject.bindings.vertex; + triangles[i].indexData = gpuObject.useIndex() ? gpuObject.bindings.index : gpuObject.bindings.vertex; + triangles[i].maxVertex = numVertices - 1; + triangles[i].vertexStride = vertexStride; + triangles[i].vertexFormat = EF_R32G32B32_SFLOAT; + triangles[i].indexType = gpuObject.indexType; + triangles[i].geometryFlags = gpuObject.material.isTransparent() ? + IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::NO_DUPLICATE_ANY_HIT_INVOCATION_BIT : + IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; + + blasBuildInfos[i].triangles = &triangles[i]; + } + ILogicalDevice::AccelerationStructureBuildSizes buildSizes; + { + const uint32_t maxPrimCount[1] = { primitiveCounts[i] }; + if (isProcedural) + { + const auto* aabbData = &aabbs; + buildSizes = m_device->getAccelerationStructureBuildSizes(blasBuildInfos[i].buildFlags, false, std::span{ aabbData, 1 }, maxPrimCount); + } + else + { + const auto* trianglesData = triangles.data(); + buildSizes = m_device->getAccelerationStructureBuildSizes(blasBuildInfos[i].buildFlags, false, std::span{ trianglesData,1 }, maxPrimCount); + } + if (!buildSizes) + return logFail("Failed to get BLAS build sizes"); + } - buildRangeInfos[i].primitiveCount = primitiveCounts[i]; - buildRangeInfos[i].primitiveByteOffset = 0u; - buildRangeInfos[i].firstVertex = 0u; - buildRangeInfos[i].transformByteOffset = 0u; + scratchSizes[i] = buildSizes.buildScratchSize; + totalScratchSize = core::alignUp(totalScratchSize, scratchOffsetAlignment); + totalScratchSize += buildSizes.buildScratchSize; - pRangeInfos[i] = &buildRangeInfos[i]; - } + { + IGPUBuffer::SCreationParams params; + params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | 
IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT; + params.size = buildSizes.accelerationStructureSize; + smart_refctd_ptr asBuffer = createBuffer(params); + + IGPUBottomLevelAccelerationStructure::SCreationParams blasParams; + blasParams.bufferRange.buffer = asBuffer; + blasParams.bufferRange.offset = 0u; + blasParams.bufferRange.size = buildSizes.accelerationStructureSize; + blasParams.flags = IGPUBottomLevelAccelerationStructure::SCreationParams::FLAGS::NONE; + m_gpuBlasList[i] = m_device->createBottomLevelAccelerationStructure(std::move(blasParams)); + if (!m_gpuBlasList[i]) + return logFail("Could not create BLAS"); + } + } - if (!cmdbufBlas->buildAccelerationStructures(std::span(blasBuildInfos), pRangeInfos.data())) - return logFail("Failed to build BLAS"); - { - SMemoryBarrier memBarrier; - memBarrier.srcStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT; - memBarrier.srcAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT; - memBarrier.dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT; - memBarrier.dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_READ_BIT; - cmdbufBlas->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} }); - } + auto cmdbufBlas = getSingleUseCommandBufferAndBegin(pool); + cmdbufBlas->beginDebugMarker("Build BLAS"); + cmdbufBlas->resetQueryPool(queryPool.get(), 0, blasCount); - core::vector ases(blasCount); - for (uint32_t i = 0; i < blasCount; i++) - ases[i] = m_gpuBlasList[i].get(); - if (!cmdbufBlas->writeAccelerationStructureProperties(std::span(ases), IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE, - queryPool.get(), 0)) - return logFail("Failed to write acceleration structure properties!"); + smart_refctd_ptr scratchBuffer; + { + IGPUBuffer::SCreationParams params; + params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT; + params.size = totalScratchSize; + scratchBuffer = createBuffer(params); + } - 
cmdbufBlas->endDebugMarker(); - cmdbufSubmitAndWait(cmdbufBlas, queue, 39); - } + core::vector buildRangeInfos(blasCount); + core::vector pRangeInfos(blasCount); + for (uint32_t i = 0; i < blasCount; i++) + { + blasBuildInfos[i].dstAS = m_gpuBlasList[i].get(); + blasBuildInfos[i].scratch.buffer = scratchBuffer; + if (i == 0) + { + blasBuildInfos[i].scratch.offset = 0u; + } + else + { + const auto unalignedOffset = blasBuildInfos[i - 1].scratch.offset + scratchSizes[i - 1]; + blasBuildInfos[i].scratch.offset = core::alignUp(unalignedOffset, scratchOffsetAlignment); + } - auto cmdbufCompact = getSingleUseCommandBufferAndBegin(pool); - cmdbufCompact->beginDebugMarker("Compact BLAS"); + buildRangeInfos[i].primitiveCount = primitiveCounts[i]; + buildRangeInfos[i].primitiveByteOffset = 0u; + buildRangeInfos[i].firstVertex = 0u; + buildRangeInfos[i].transformByteOffset = 0u; - // compact blas - { - core::vector asSizes(blasCount); - if (!m_device->getQueryPoolResults(queryPool.get(), 0, blasCount, asSizes.data(), sizeof(size_t), bitflag(IQueryPool::WAIT_BIT) | IQueryPool::_64_BIT)) - return logFail("Could not get query pool results for AS sizes"); - - core::vector> cleanupBlas(blasCount); - for (uint32_t i = 0; i < blasCount; i++) - { - if (asSizes[i] == 0) continue; - cleanupBlas[i] = m_gpuBlasList[i]; + pRangeInfos[i] = &buildRangeInfos[i]; + } + + if (!cmdbufBlas->buildAccelerationStructures(std::span(blasBuildInfos), pRangeInfos.data())) + return logFail("Failed to build BLAS"); + + { + SMemoryBarrier memBarrier; + memBarrier.srcStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT; + memBarrier.srcAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT; + memBarrier.dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT; + memBarrier.dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_READ_BIT; + cmdbufBlas->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} }); + } + + + core::vector ases(blasCount); + 
for (uint32_t i = 0; i < blasCount; i++) + ases[i] = m_gpuBlasList[i].get(); + if (!cmdbufBlas->writeAccelerationStructureProperties(std::span(ases), IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE, + queryPool.get(), 0)) + return logFail("Failed to write acceleration structure properties!"); + + cmdbufBlas->endDebugMarker(); + cmdbufSubmitAndWait(cmdbufBlas, queue, 39); + } + + auto cmdbufCompact = getSingleUseCommandBufferAndBegin(pool); + cmdbufCompact->beginDebugMarker("Compact BLAS"); + + // compact blas { - IGPUBuffer::SCreationParams params; - params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT; - params.size = asSizes[i]; - smart_refctd_ptr asBuffer = createBuffer(params); - - IGPUBottomLevelAccelerationStructure::SCreationParams blasParams; - blasParams.bufferRange.buffer = asBuffer; - blasParams.bufferRange.offset = 0u; - blasParams.bufferRange.size = asSizes[i]; - blasParams.flags = IGPUBottomLevelAccelerationStructure::SCreationParams::FLAGS::NONE; - m_gpuBlasList[i] = m_device->createBottomLevelAccelerationStructure(std::move(blasParams)); - if (!m_gpuBlasList[i]) - return logFail("Could not create compacted BLAS"); + core::vector asSizes(blasCount); + if (!m_device->getQueryPoolResults(queryPool.get(), 0, blasCount, asSizes.data(), sizeof(size_t), bitflag(IQueryPool::WAIT_BIT) | IQueryPool::_64_BIT)) + return logFail("Could not get query pool results for AS sizes"); + + core::vector> cleanupBlas(blasCount); + for (uint32_t i = 0; i < blasCount; i++) + { + if (asSizes[i] == 0) continue; + cleanupBlas[i] = m_gpuBlasList[i]; + { + IGPUBuffer::SCreationParams params; + params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT; + params.size = asSizes[i]; + smart_refctd_ptr asBuffer = createBuffer(params); + + IGPUBottomLevelAccelerationStructure::SCreationParams blasParams; + blasParams.bufferRange.buffer = asBuffer; + 
blasParams.bufferRange.offset = 0u; + blasParams.bufferRange.size = asSizes[i]; + blasParams.flags = IGPUBottomLevelAccelerationStructure::SCreationParams::FLAGS::NONE; + m_gpuBlasList[i] = m_device->createBottomLevelAccelerationStructure(std::move(blasParams)); + if (!m_gpuBlasList[i]) + return logFail("Could not create compacted BLAS"); + } + + IGPUBottomLevelAccelerationStructure::CopyInfo copyInfo; + copyInfo.src = cleanupBlas[i].get(); + copyInfo.dst = m_gpuBlasList[i].get(); + copyInfo.mode = IGPUBottomLevelAccelerationStructure::COPY_MODE::COMPACT; + if (!cmdbufCompact->copyAccelerationStructure(copyInfo)) + return logFail("Failed to copy AS to compact"); + } } - IGPUBottomLevelAccelerationStructure::CopyInfo copyInfo; - copyInfo.src = cleanupBlas[i].get(); - copyInfo.dst = m_gpuBlasList[i].get(); - copyInfo.mode = IGPUBottomLevelAccelerationStructure::COPY_MODE::COMPACT; - if (!cmdbufCompact->copyAccelerationStructure(copyInfo)) - return logFail("Failed to copy AS to compact"); - } - } + cmdbufCompact->endDebugMarker(); + cmdbufSubmitAndWait(cmdbufCompact, queue, 40); - cmdbufCompact->endDebugMarker(); - cmdbufSubmitAndWait(cmdbufCompact, queue, 40); + auto cmdbufTlas = getSingleUseCommandBufferAndBegin(pool); + cmdbufTlas->beginDebugMarker("Build TLAS"); - auto cmdbufTlas = getSingleUseCommandBufferAndBegin(pool); - cmdbufTlas->beginDebugMarker("Build TLAS"); + // build top level AS + { + const uint32_t instancesCount = blasCount; + core::vector instances(instancesCount); + for (uint32_t i = 0; i < instancesCount; i++) + { + const auto isProceduralInstance = i == proceduralBlasIdx; + instances[i].base.blas.deviceAddress = m_gpuBlasList[i]->getReferenceForDeviceOperations().deviceAddress; + instances[i].base.mask = 0xFF; + instances[i].base.instanceCustomIndex = i; + instances[i].base.instanceShaderBindingTableRecordOffset = isProceduralInstance ? 
2 : 0; + instances[i].base.flags = static_cast(IGPUTopLevelAccelerationStructure::INSTANCE_FLAGS::TRIANGLE_FACING_CULL_DISABLE_BIT); + instances[i].transform = isProceduralInstance ? matrix3x4SIMD() : m_gpuTriangleGeometries[i].transform; + } - // build top level AS - { - const uint32_t instancesCount = blasCount; - core::vector instances(instancesCount); - for (uint32_t i = 0; i < instancesCount; i++) - { - const auto isProceduralInstance = i == proceduralBlasIdx; - instances[i].base.blas.deviceAddress = m_gpuBlasList[i]->getReferenceForDeviceOperations().deviceAddress; - instances[i].base.mask = 0xFF; - instances[i].base.instanceCustomIndex = i; - instances[i].base.instanceShaderBindingTableRecordOffset = isProceduralInstance ? 2 : 0; - instances[i].base.flags = static_cast(IGPUTopLevelAccelerationStructure::INSTANCE_FLAGS::TRIANGLE_FACING_CULL_DISABLE_BIT); - instances[i].transform = isProceduralInstance ? matrix3x4SIMD() : m_gpuTriangleGeometries[i].transform; - } - - { - size_t bufSize = instancesCount * sizeof(IGPUTopLevelAccelerationStructure::DeviceStaticInstance); - IGPUBuffer::SCreationParams params; - params.usage = bitflag(IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT | - IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - params.size = bufSize; - m_instanceBuffer = createBuffer(params); - - SBufferRange range = { .offset = 0u, .size = bufSize, .buffer = m_instanceBuffer }; - cmdbufTlas->updateBuffer(range, instances.data()); - } - - // make sure instances upload complete first - { - SMemoryBarrier memBarrier; - memBarrier.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS; - memBarrier.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT; - memBarrier.dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT; - memBarrier.dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT; - 
cmdbufTlas->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} }); - } - - auto tlasFlags = bitflag(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT); - - IGPUTopLevelAccelerationStructure::DeviceBuildInfo tlasBuildInfo; - tlasBuildInfo.buildFlags = tlasFlags; - tlasBuildInfo.srcAS = nullptr; - tlasBuildInfo.dstAS = nullptr; - tlasBuildInfo.instanceData.buffer = m_instanceBuffer; - tlasBuildInfo.instanceData.offset = 0u; - tlasBuildInfo.scratch = {}; - - auto buildSizes = m_device->getAccelerationStructureBuildSizes(tlasFlags, false, instancesCount); - if (!buildSizes) - return logFail("Failed to get TLAS build sizes"); - - { - IGPUBuffer::SCreationParams params; - params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT; - params.size = buildSizes.accelerationStructureSize; - smart_refctd_ptr asBuffer = createBuffer(params); - - IGPUTopLevelAccelerationStructure::SCreationParams tlasParams; - tlasParams.bufferRange.buffer = asBuffer; - tlasParams.bufferRange.offset = 0u; - tlasParams.bufferRange.size = buildSizes.accelerationStructureSize; - tlasParams.flags = IGPUTopLevelAccelerationStructure::SCreationParams::FLAGS::NONE; - m_gpuTlas = m_device->createTopLevelAccelerationStructure(std::move(tlasParams)); - if (!m_gpuTlas) - return logFail("Could not create TLAS"); - } - - smart_refctd_ptr scratchBuffer; - { - IGPUBuffer::SCreationParams params; - params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT; - params.size = buildSizes.buildScratchSize; - scratchBuffer = createBuffer(params); - } - - tlasBuildInfo.dstAS = m_gpuTlas.get(); - tlasBuildInfo.scratch.buffer = scratchBuffer; - tlasBuildInfo.scratch.offset = 0u; - - IGPUTopLevelAccelerationStructure::BuildRangeInfo buildRangeInfo[1u]; - buildRangeInfo[0].instanceCount = instancesCount; - buildRangeInfo[0].instanceByteOffset = 0u; - 
IGPUTopLevelAccelerationStructure::BuildRangeInfo* pRangeInfos; - pRangeInfos = &buildRangeInfo[0]; - - if (!cmdbufTlas->buildAccelerationStructures({ &tlasBuildInfo, 1 }, pRangeInfos)) - return logFail("Failed to build TLAS"); - } + { + size_t bufSize = instancesCount * sizeof(IGPUTopLevelAccelerationStructure::DeviceStaticInstance); + IGPUBuffer::SCreationParams params; + params.usage = bitflag(IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT | + IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + params.size = bufSize; + m_instanceBuffer = createBuffer(params); + + SBufferRange range = { .offset = 0u, .size = bufSize, .buffer = m_instanceBuffer }; + cmdbufTlas->updateBuffer(range, instances.data()); + } + + // make sure instances upload complete first + { + SMemoryBarrier memBarrier; + memBarrier.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS; + memBarrier.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT; + memBarrier.dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT; + memBarrier.dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT; + cmdbufTlas->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} }); + } - cmdbufTlas->endDebugMarker(); - cmdbufSubmitAndWait(cmdbufTlas, queue, 45); + auto tlasFlags = bitflag(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT); + + IGPUTopLevelAccelerationStructure::DeviceBuildInfo tlasBuildInfo; + tlasBuildInfo.buildFlags = tlasFlags; + tlasBuildInfo.srcAS = nullptr; + tlasBuildInfo.dstAS = nullptr; + tlasBuildInfo.instanceData.buffer = m_instanceBuffer; + tlasBuildInfo.instanceData.offset = 0u; + tlasBuildInfo.scratch = {}; + + auto buildSizes = m_device->getAccelerationStructureBuildSizes(tlasFlags, false, instancesCount); + if (!buildSizes) + return logFail("Failed to get TLAS build sizes"); + + { + 
IGPUBuffer::SCreationParams params; + params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT; + params.size = buildSizes.accelerationStructureSize; + smart_refctd_ptr asBuffer = createBuffer(params); + + IGPUTopLevelAccelerationStructure::SCreationParams tlasParams; + tlasParams.bufferRange.buffer = asBuffer; + tlasParams.bufferRange.offset = 0u; + tlasParams.bufferRange.size = buildSizes.accelerationStructureSize; + tlasParams.flags = IGPUTopLevelAccelerationStructure::SCreationParams::FLAGS::NONE; + m_gpuTlas = m_device->createTopLevelAccelerationStructure(std::move(tlasParams)); + if (!m_gpuTlas) + return logFail("Could not create TLAS"); + } + + smart_refctd_ptr scratchBuffer; + { + IGPUBuffer::SCreationParams params; + params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT; + params.size = buildSizes.buildScratchSize; + scratchBuffer = createBuffer(params); + } + + tlasBuildInfo.dstAS = m_gpuTlas.get(); + tlasBuildInfo.scratch.buffer = scratchBuffer; + tlasBuildInfo.scratch.offset = 0u; + + IGPUTopLevelAccelerationStructure::BuildRangeInfo buildRangeInfo[1u]; + buildRangeInfo[0].instanceCount = instancesCount; + buildRangeInfo[0].instanceByteOffset = 0u; + IGPUTopLevelAccelerationStructure::BuildRangeInfo* pRangeInfos; + pRangeInfos = &buildRangeInfo[0]; + + if (!cmdbufTlas->buildAccelerationStructures({ &tlasBuildInfo, 1 }, pRangeInfos)) + return logFail("Failed to build TLAS"); + } + + cmdbufTlas->endDebugMarker(); + cmdbufSubmitAndWait(cmdbufTlas, queue, 45); #ifdef TRY_BUILD_FOR_NGFX - { - const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { { - .semaphore = m_currentImageAcquire.semaphore, - .value = m_currentImageAcquire.acquireCount, - .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS - } }; - m_surface->present(m_currentImageAcquire.imageIndex, acquired); - } + { + const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { { + .semaphore 
= m_currentImageAcquire.semaphore, + .value = m_currentImageAcquire.acquireCount, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS + } }; + m_surface->present(m_currentImageAcquire.imageIndex, acquired); + } #endif - m_api->endCapture(); + m_api->endCapture(); - return true; - } + return true; + } #endif // TEST_ASSET_CONV_AS - smart_refctd_ptr m_window; - smart_refctd_ptr> m_surface; - smart_refctd_ptr m_semaphore; - uint64_t m_realFrameIx = 0; - uint32_t m_frameAccumulationCounter = 0; - std::array, MaxFramesInFlight> m_cmdBufs; - ISimpleManagedSurface::SAcquireResult m_currentImageAcquire = {}; - - core::smart_refctd_ptr m_inputSystem; - InputSystem::ChannelReader m_mouse; - InputSystem::ChannelReader m_keyboard; - - struct CameraSetting - { - float fov = 60.f; - float zNear = 0.1f; - float zFar = 10000.f; - float moveSpeed = 1.f; - float rotateSpeed = 1.f; - float viewWidth = 10.f; - float camYAngle = 165.f / 180.f * 3.14159f; - float camXAngle = 32.f / 180.f * 3.14159f; - - } m_cameraSetting; - Camera m_camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD()); - - Light m_light = { - .direction = {-1.0f, -1.0f, -0.4f}, - .position = {10.0f, 15.0f, 8.0f}, - .outerCutoff = 0.866025404f, // {cos(radians(30.0f))}, - .type = ELT_DIRECTIONAL - }; - - video::CDumbPresentationOracle m_oracle; - - struct C_UI - { - nbl::core::smart_refctd_ptr manager; - - struct + smart_refctd_ptr m_window; + smart_refctd_ptr> m_surface; + smart_refctd_ptr m_semaphore; + uint64_t m_realFrameIx = 0; + uint32_t m_frameAccumulationCounter = 0; + std::array, MaxFramesInFlight> m_cmdBufs; + ISimpleManagedSurface::SAcquireResult m_currentImageAcquire = {}; + + core::smart_refctd_ptr m_inputSystem; + InputSystem::ChannelReader m_mouse; + InputSystem::ChannelReader m_keyboard; + + struct CameraSetting { - core::smart_refctd_ptr gui, scene; - } samplers; + float fov = 60.f; + float zNear = 0.1f; + float zFar = 10000.f; + float moveSpeed = 1.f; + float 
rotateSpeed = 1.f; + float viewWidth = 10.f; + float camYAngle = 165.f / 180.f * 3.14159f; + float camXAngle = 32.f / 180.f * 3.14159f; + + } m_cameraSetting; + Camera m_camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD()); + + Light m_light = { + .direction = {-1.0f, -1.0f, -0.4f}, + .position = {10.0f, 15.0f, 8.0f}, + .outerCutoff = 0.866025404f, // {cos(radians(30.0f))}, + .type = ELT_DIRECTIONAL + }; + + video::CDumbPresentationOracle m_oracle; + + struct C_UI + { + nbl::core::smart_refctd_ptr manager; + + struct + { + core::smart_refctd_ptr gui, scene; + } samplers; - core::smart_refctd_ptr descriptorSet; - } m_ui; - core::smart_refctd_ptr m_guiDescriptorSetPool; + core::smart_refctd_ptr descriptorSet; + } m_ui; + core::smart_refctd_ptr m_guiDescriptorSetPool; - core::vector m_gpuTriangleGeometries; - core::vector m_gpuIntersectionSpheres; - uint32_t m_intersectionHitGroupIdx; + core::vector m_gpuTriangleGeometries; + core::vector m_gpuIntersectionSpheres; + uint32_t m_intersectionHitGroupIdx; - std::vector> m_gpuBlasList; - smart_refctd_ptr m_gpuTlas; - smart_refctd_ptr m_instanceBuffer; + std::vector> m_gpuBlasList; + smart_refctd_ptr m_gpuTlas; + smart_refctd_ptr m_instanceBuffer; - smart_refctd_ptr m_triangleGeomInfoBuffer; - smart_refctd_ptr m_proceduralGeomInfoBuffer; - smart_refctd_ptr m_proceduralAabbBuffer; - smart_refctd_ptr m_indirectBuffer; + smart_refctd_ptr m_triangleGeomInfoBuffer; + smart_refctd_ptr m_proceduralGeomInfoBuffer; + smart_refctd_ptr m_proceduralAabbBuffer; + smart_refctd_ptr m_indirectBuffer; - smart_refctd_ptr m_hdrImage; - smart_refctd_ptr m_hdrImageView; + smart_refctd_ptr m_hdrImage; + smart_refctd_ptr m_hdrImageView; - smart_refctd_ptr m_rayTracingDsPool; - smart_refctd_ptr m_rayTracingDs; - smart_refctd_ptr m_rayTracingPipeline; - uint64_t m_rayTracingStackSize; - ShaderBindingTable m_shaderBindingTable; + smart_refctd_ptr m_rayTracingDsPool; + smart_refctd_ptr m_rayTracingDs; + 
smart_refctd_ptr m_rayTracingPipeline; + uint64_t m_rayTracingStackSize; + ShaderBindingTable m_shaderBindingTable; - smart_refctd_ptr m_presentDs; - smart_refctd_ptr m_presentDsPool; - smart_refctd_ptr m_presentPipeline; + smart_refctd_ptr m_presentDs; + smart_refctd_ptr m_presentDsPool; + smart_refctd_ptr m_presentPipeline; - smart_refctd_ptr m_converter; + smart_refctd_ptr m_converter; - core::matrix4SIMD m_cachedModelViewProjectionMatrix; - bool m_useIndirectCommand = false; + core::matrix4SIMD m_cachedModelViewProjectionMatrix; + bool m_useIndirectCommand = false; }; NBL_MAIN_FUNC(RaytracingPipelineApp) From b4c5ff2f37489357648bc6b629970d4767ceabd9 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 19 May 2025 13:51:10 +0700 Subject: [PATCH 180/296] use asset converter to build tlas --- 71_RayTracingPipeline/main.cpp | 406 +++++++++++++++++++++++++++++---- 1 file changed, 364 insertions(+), 42 deletions(-) diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index 528b2c314..faa392a46 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -1225,41 +1225,41 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, // triangles geometries const auto cpuObjects = std::array{ - ReferenceObjectCpu { - .meta = {.type = OT_RECTANGLE, .name = "Plane Mesh"}, - .data = gc->createRectangleMesh(nbl::core::vector2df_SIMD(10, 10)), - .material = defaultMaterial, - .transform = planeTransform, - }, - ReferenceObjectCpu { - .meta = {.type = OT_CUBE, .name = "Cube Mesh"}, - .data = gc->createCubeMesh(nbl::core::vector3df(1, 1, 1)), - .material = defaultMaterial, - .transform = getTranslationMatrix(0, 0.5f, 0), - }, - ReferenceObjectCpu { - .meta = {.type = OT_CUBE, .name = "Cube Mesh 2"}, - .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)), - .material = Material{ - .ambient = {0.1, 0.1, 0.2}, - .diffuse = {0.2, 0.2, 0.8}, - .specular = {0.8, 0.8, 0.8}, - .shininess = 1.0f, + 
ReferenceObjectCpu { + .meta = {.type = OT_RECTANGLE, .name = "Plane Mesh"}, + .data = gc->createRectangleMesh(nbl::core::vector2df_SIMD(10, 10)), + .material = defaultMaterial, + .transform = planeTransform, }, - .transform = getTranslationMatrix(-5.0f, 1.0f, 0), - }, - ReferenceObjectCpu { - .meta = {.type = OT_CUBE, .name = "Transparent Cube Mesh"}, - .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)), - .material = Material{ - .ambient = {0.1, 0.2, 0.1}, - .diffuse = {0.2, 0.8, 0.2}, - .specular = {0.8, 0.8, 0.8}, - .shininess = 1.0f, - .alpha = 0.2, + ReferenceObjectCpu { + .meta = {.type = OT_CUBE, .name = "Cube Mesh"}, + .data = gc->createCubeMesh(nbl::core::vector3df(1, 1, 1)), + .material = defaultMaterial, + .transform = getTranslationMatrix(0, 0.5f, 0), + }, + ReferenceObjectCpu { + .meta = {.type = OT_CUBE, .name = "Cube Mesh 2"}, + .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)), + .material = Material{ + .ambient = {0.1, 0.1, 0.2}, + .diffuse = {0.2, 0.2, 0.8}, + .specular = {0.8, 0.8, 0.8}, + .shininess = 1.0f, + }, + .transform = getTranslationMatrix(-5.0f, 1.0f, 0), + }, + ReferenceObjectCpu { + .meta = {.type = OT_CUBE, .name = "Transparent Cube Mesh"}, + .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)), + .material = Material{ + .ambient = {0.1, 0.2, 0.1}, + .diffuse = {0.2, 0.8, 0.2}, + .specular = {0.8, 0.8, 0.8}, + .shininess = 1.0f, + .alpha = 0.2, + }, + .transform = getTranslationMatrix(5.0f, 1.0f, 0), }, - .transform = getTranslationMatrix(5.0f, 1.0f, 0), - }, }; struct CPUTriBufferBindings @@ -1313,14 +1313,14 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, { const auto middle_i = NumberOfProceduralGeometries / 2.0; SProceduralGeomInfo sphere = { - .material = hlsl::_static_cast(Material{ - .ambient = {0.1, 0.05 * i, 0.1}, - .diffuse = {0.3, 0.2 * i, 0.3}, - .specular = {0.8, 0.8, 0.8}, - .shininess = 1.0f, - }), - .center = float32_t3((i - middle_i) * 4.0, 2, 
5.0), - .radius = 1, + .material = hlsl::_static_cast(Material{ + .ambient = {0.1, 0.05 * i, 0.1}, + .diffuse = {0.3, 0.2 * i, 0.3}, + .specular = {0.8, 0.8, 0.8}, + .shininess = 1.0f, + }), + .center = float32_t3((i - middle_i) * 4.0, 2, 5.0), + .radius = 1, }; proceduralGeoms.push_back(sphere); @@ -1332,11 +1332,333 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, }; } + { + IGPUBuffer::SCreationParams params; + params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + params.size = proceduralGeoms.size() * sizeof(SProceduralGeomInfo); + m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), proceduralGeoms.data()).move_into(m_proceduralGeomInfoBuffer); + } + // get ICPUBuffers into ICPUBLAS + // TODO use one BLAS and multiple triangles/aabbs in one + const auto blasCount = std::size(cpuObjects) + 1; + const auto proceduralBlasIdx = std::size(cpuObjects); + + std::array, std::size(cpuObjects)+1u> cpuBlas; + for (uint32_t i = 0; i < blasCount; i++) + { + auto& blas = cpuBlas[i]; + blas = make_smart_refctd_ptr(); + + if (i == proceduralBlasIdx) + { + auto aabbs = make_refctd_dynamic_array>>(1u); + auto primitiveCounts = make_refctd_dynamic_array>(1u); + + auto& aabb = aabbs->front(); + auto& primCount = primitiveCounts->front(); + + primCount = NumberOfProceduralGeometries; + aabb.data = { .offset = 0, .buffer = cpuProcBuffer }; + aabb.stride = sizeof(IGPUBottomLevelAccelerationStructure::AABB_t); + aabb.geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; // only allow opaque for now + + blas->setGeometries(std::move(aabbs), std::move(primitiveCounts)); + } + else + { + auto triangles = make_refctd_dynamic_array>>(1u); + auto primitiveCounts = make_refctd_dynamic_array>(1u); + + auto& tri = triangles->front(); + auto& primCount = 
primitiveCounts->front(); + const auto& geom = cpuObjects[i]; + const auto& cpuBuf = cpuTriBuffers[i]; + + const bool useIndex = geom.data.indexType != EIT_UNKNOWN; + const uint32_t vertexStride = geom.data.inputParams.bindings[0].stride; + const uint32_t numVertices = cpuBuf.vertex.buffer->getSize() / vertexStride; + + if (useIndex) + primCount = geom.data.indexCount / 3; + else + primCount = numVertices / 3; + + tri.vertexData[0] = cpuBuf.vertex; + tri.indexData = useIndex ? cpuBuf.index : cpuBuf.vertex; + tri.maxVertex = numVertices - 1; + tri.vertexStride = vertexStride; + tri.vertexFormat = EF_R32G32B32_SFLOAT; + tri.indexType = geom.data.indexType; + tri.geometryFlags = geom.material.isTransparent() ? + IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::NO_DUPLICATE_ANY_HIT_INVOCATION_BIT : + IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; + + blas->setGeometries(std::move(triangles), std::move(primitiveCounts)); + } + + auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT; + if (i == proceduralBlasIdx) + blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT; + + blas->setBuildFlags(blasFlags); + blas->setContentHash(blas->computeContentHash()); + } + + auto geomInfoBuffer = ICPUBuffer::create({ std::size(cpuObjects) * sizeof(STriangleGeomInfo) }); + STriangleGeomInfo* geomInfos = reinterpret_cast(geomInfoBuffer->getPointer()); // get ICPUBLAS into ICPUTLAS + auto geomInstances = make_refctd_dynamic_array>(blasCount); + { + uint32_t i = 0; + for (auto instance = geomInstances->begin(); instance != geomInstances->end(); instance++, i++) + { + const auto isProceduralInstance = i == proceduralBlasIdx; + ICPUTopLevelAccelerationStructure::StaticInstance inst; + inst.base.blas = cpuBlas[i]; + inst.base.flags = 
static_cast(IGPUTopLevelAccelerationStructure::INSTANCE_FLAGS::TRIANGLE_FACING_CULL_DISABLE_BIT); + inst.base.instanceCustomIndex = i; + inst.base.instanceShaderBindingTableRecordOffset = isProceduralInstance ? 2 : 0;; + inst.base.mask = 0xFF; + inst.transform = isProceduralInstance ? matrix3x4SIMD() : cpuObjects[i].transform; + + instance->instance = inst; + } + } + + auto cpuTlas = make_smart_refctd_ptr(); + cpuTlas->setInstances(std::move(geomInstances)); + cpuTlas->setBuildFlags(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT); + +//#define TEST_REBAR_FALLBACK + // convert with asset converter + smart_refctd_ptr converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} }); + struct MyInputs : CAssetConverter::SInputs + { +#ifndef TEST_REBAR_FALLBACK + inline uint32_t constrainMemoryTypeBits(const size_t groupCopyID, const IAsset* canonicalAsset, const blake3_hash_t& contentHash, const IDeviceMemoryBacked* memoryBacked) const override + { + assert(memoryBacked); + return memoryBacked->getObjectType() != IDeviceMemoryBacked::EOT_BUFFER ? 
(~0u) : rebarMemoryTypes; + } +#endif + uint32_t rebarMemoryTypes; + } inputs = {}; + inputs.logger = m_logger.get(); + inputs.rebarMemoryTypes = m_physicalDevice->getDirectVRAMAccessMemoryTypeBits(); +#ifndef TEST_REBAR_FALLBACK + struct MyAllocator final : public IDeviceMemoryAllocator + { + ILogicalDevice* getDeviceForAllocations() const override { return device; } + + SAllocation allocate(const SAllocateInfo& info) override + { + auto retval = device->allocate(info); + // map what is mappable by default so ReBAR checks succeed + if (retval.isValid() && retval.memory->isMappable()) + retval.memory->map({ .offset = 0,.length = info.size }); + return retval; + } + + ILogicalDevice* device; + } myalloc; + myalloc.device = m_device.get(); + inputs.allocator = &myalloc; +#endif + + std::array tmpTlas; + std::array tmpBuffers; + { + tmpTlas[0] = cpuTlas.get(); + for (uint32_t i = 0; i < cpuObjects.size(); i++) + { + tmpBuffers[2 * i + 0] = cpuTriBuffers[i].vertex.buffer.get(); + tmpBuffers[2 * i + 1] = cpuTriBuffers[i].index.buffer.get(); + } + tmpBuffers[2 * proceduralBlasIdx] = cpuProcBuffer.get(); + + std::get>(inputs.assets) = tmpTlas; + std::get>(inputs.assets) = tmpBuffers; + } + + auto reservation = converter->reserve(inputs); + { + auto prepass = [&](const auto & references) -> bool + { + auto objects = reservation.getGPUObjects(); + uint32_t counter = {}; + for (auto& object : objects) + { + auto gpu = object.value; + auto* reference = references[counter]; + + if (reference) + { + if (!gpu) + { + m_logger->log("Failed to convert a CPU object to GPU!", ILogger::ELL_ERROR); + return false; + } + } + counter++; + } + return true; + }; + + prepass.template operator() < ICPUTopLevelAccelerationStructure > (tmpTlas); + prepass.template operator() < ICPUBuffer > (tmpBuffers); + } + + constexpr auto XferBufferCount = 2; + std::array, XferBufferCount> xferBufs = {}; + std::array xferBufInfos = {}; + { + auto pool = 
m_device->createCommandPool(getTransferUpQueue()->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT); + pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, xferBufs); + xferBufs.front()->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + for (auto i = 0; i < XferBufferCount; i++) + xferBufInfos[i].cmdbuf = xferBufs[i].get(); + } + auto xferSema = m_device->createSemaphore(0u); + SIntendedSubmitInfo transfer = {}; + transfer.queue = getTransferUpQueue(); + transfer.scratchCommandBuffers = xferBufInfos; + transfer.scratchSemaphore = { + .semaphore = xferSema.get(), + .value = 0u, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS + }; + + constexpr auto CompBufferCount = 2; + std::array, CompBufferCount> compBufs = {}; + std::array compBufInfos = {}; + { + auto pool = m_device->createCommandPool(getComputeQueue()->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT); + pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, compBufs); + compBufs.front()->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + for (auto i = 0; i < CompBufferCount; i++) + compBufInfos[i].cmdbuf = compBufs[i].get(); + } + auto compSema = m_device->createSemaphore(0u); + SIntendedSubmitInfo compute = {}; + compute.queue = getComputeQueue(); + compute.scratchCommandBuffers = compBufInfos; + compute.scratchSemaphore = { + .semaphore = compSema.get(), + .value = 0u, + .stageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT | PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_COPY_BIT + }; + // convert + { + smart_refctd_ptr scratchAlloc; + { + constexpr auto MaxAlignment = 256; + constexpr auto MinAllocationSize = 1024; + const auto scratchSize = core::alignUp(reservation.getMinASBuildScratchSize(false), MaxAlignment); + + + IGPUBuffer::SCreationParams creationParams = {}; + creationParams.size = scratchSize; + 
creationParams.usage = IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_STORAGE_BUFFER_BIT; +#ifdef TEST_REBAR_FALLBACK + creationParams.usage |= IGPUBuffer::EUF_TRANSFER_DST_BIT; + core::unordered_set sharingSet = { compute.queue->getFamilyIndex(),transfer.queue->getFamilyIndex() }; + core::vector sharingIndices(sharingSet.begin(), sharingSet.end()); + if (sharingIndices.size() > 1) + creationParams.queueFamilyIndexCount = sharingIndices.size(); + creationParams.queueFamilyIndices = sharingIndices.data(); +#endif + auto scratchBuffer = m_device->createBuffer(std::move(creationParams)); + + auto reqs = scratchBuffer->getMemoryReqs(); +#ifndef TEST_REBAR_FALLBACK + reqs.memoryTypeBits &= m_physicalDevice->getDirectVRAMAccessMemoryTypeBits(); +#endif + auto allocation = m_device->allocate(reqs, scratchBuffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); +#ifndef TEST_REBAR_FALLBACK + allocation.memory->map({ .offset = 0,.length = reqs.size }); +#endif + + scratchAlloc = make_smart_refctd_ptr( + SBufferRange{0ull, scratchSize, std::move(scratchBuffer)}, + core::allocator(), MaxAlignment, MinAllocationSize + ); + } + + struct MyParams final : CAssetConverter::SConvertParams + { + inline uint32_t getFinalOwnerQueueFamily(const IGPUBuffer* buffer, const core::blake3_hash_t& createdFrom) override + { + return finalUser; + } + inline uint32_t getFinalOwnerQueueFamily(const IGPUAccelerationStructure* image, const core::blake3_hash_t& createdFrom) override + { + return finalUser; + } + + uint8_t finalUser; + } params = {}; +#undef TEST_REBAR_FALLBACK + params.utilities = m_utils.get(); + params.transfer = &transfer; + params.compute = &compute; + params.scratchForDeviceASBuild = scratchAlloc.get(); + params.finalUser = queue->getFamilyIndex(); + + auto future = reservation.convert(params); + if (future.copy() != IQueue::RESULT::SUCCESS) + { + m_logger->log("Failed to await submission 
feature!", ILogger::ELL_ERROR); + return false; + } + + // assign gpu objects to output + auto&& tlases = reservation.getGPUObjects(); + m_gpuTlas = tlases[0].value; + auto&& buffers = reservation.getGPUObjects(); + for (uint32_t i = 0; i < cpuObjects.size(); i++) + { + auto& cpuObject = cpuObjects[i]; + + m_gpuTriangleGeometries.push_back(ReferenceObjectGpu{ + .meta = cpuObject.meta, + .bindings = { + .vertex = {.offset = 0, .buffer = buffers[2 * i + 0].value }, + .index = {.offset = 0, .buffer = buffers[2 * i + 1].value }, + }, + .vertexStride = cpuObject.data.inputParams.bindings[0].stride, + .indexType = cpuObject.data.indexType, + .indexCount = cpuObject.data.indexCount, + .material = hlsl::_static_cast(cpuObject.material), + .transform = cpuObject.transform, + }); + } + m_proceduralAabbBuffer = buffers[2 * proceduralBlasIdx].value; + + for (uint32_t i = 0; i < m_gpuTriangleGeometries.size(); i++) + { + const auto& gpuObject = m_gpuTriangleGeometries[i]; + const uint64_t vertexBufferAddress = gpuObject.bindings.vertex.buffer->getDeviceAddress(); + geomInfos[i] = { + .material = gpuObject.material, + .vertexBufferAddress = vertexBufferAddress, + .indexBufferAddress = gpuObject.useIndex() ? 
gpuObject.bindings.index.buffer->getDeviceAddress() : vertexBufferAddress, + .vertexStride = gpuObject.vertexStride, + .objType = gpuObject.meta.type, + .indexType = gpuObject.indexType, + .smoothNormals = s_smoothNormals[gpuObject.meta.type], + }; + } + } + + { + IGPUBuffer::SCreationParams params; + params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + params.size = geomInfoBuffer->getSize(); + m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), geomInfos).move_into(m_triangleGeomInfoBuffer); + } - // reserve, convert return true; } #else From 086c21e3c5237149cc82289d0920f61940eb9d00 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 20 May 2025 11:02:23 +0700 Subject: [PATCH 181/296] use bda in unit test --- .../app_resources/shaderCommon.hlsl | 25 ++++--- .../app_resources/testWorkgroup.comp.hlsl | 13 ++-- .../app_resources/workgroupCommon.hlsl | 11 ++-- 23_Arithmetic2UnitTest/main.cpp | 66 ++++++++----------- 4 files changed, 61 insertions(+), 54 deletions(-) diff --git a/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl b/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl index 376f69579..45a1f8097 100644 --- a/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl @@ -16,10 +16,13 @@ uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1 typedef vector type_t; -// unfortunately DXC chokes on descriptors as static members -// https://github.com/microsoft/DirectXShaderCompiler/issues/5940 -[[vk::binding(0, 0)]] StructuredBuffer inputValue; -[[vk::binding(1, 0)]] RWByteAddressBuffer output[8]; +struct PushConstantData +{ + uint64_t inputBufAddress; + uint64_t outputAddressBufAddress; +}; + +[[vk::push_constant]] PushConstantData pc; // because subgroups don't match `gl_LocalInvocationIndex` 
snake curve addressing, we also can't load inputs that way uint32_t globalIndex(); @@ -41,19 +44,25 @@ static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) using config_t = nbl::hlsl::subgroup2::Configuration; using params_t = nbl::hlsl::subgroup2::ArithmeticParams::base_t, N, nbl::hlsl::jit::device_capabilities>; + const uint64_t outputBufAddr = vk::RawBufferLoad(pc.outputAddressBufAddress + binop::BindingIndex * sizeof(uint64_t), sizeof(uint64_t)); + if (globalIndex()==0u) - output[binop::BindingIndex].template Store(0,nbl::hlsl::glsl::gl_SubgroupSize()); - + vk::RawBufferStore(outputBufAddr, nbl::hlsl::glsl::gl_SubgroupSize()); + operation_t func; + type_t val = func(sourceVal); if (canStore()) - output[binop::BindingIndex].template Store(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func(sourceVal)); + [unroll] + for (uint32_t i = 0; i < N; i++) + vk::RawBufferStore(outputBufAddr+sizeof(uint32_t)+sizeof(type_t)*globalIndex()+i*sizeof(uint32_t), val[i]); + // vk::RawBufferStore(outputBufAddr + sizeof(uint32_t) + sizeof(dtype_t) * globalIndex(), value, sizeof(uint32_t)); TODO why won't this work??? 
} type_t test() { const uint32_t idx = globalIndex(); - type_t sourceVal = inputValue[idx]; + type_t sourceVal = vk::RawBufferLoad(pc.inputBufAddress + idx * sizeof(type_t)); subtest(sourceVal); subtest(sourceVal); diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl index 3aafc0aa7..9debd184d 100644 --- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl @@ -12,13 +12,17 @@ struct DataProxy void get(const uint32_t ix, NBL_REF_ARG(dtype_t) value) { const uint32_t workgroupOffset = nbl::hlsl::glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize; - value = inputValue[workgroupOffset + ix]; + value = vk::RawBufferLoad(pc.inputBufAddress + (workgroupOffset + ix) * sizeof(dtype_t)); } template void set(const uint32_t ix, const dtype_t value) { const uint32_t workgroupOffset = nbl::hlsl::glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize; - output[Binop::BindingIndex].template Store(sizeof(uint32_t) + sizeof(type_t) * (workgroupOffset+ix), value); + uint64_t outputBufAddr = vk::RawBufferLoad(pc.outputAddressBufAddress + Binop::BindingIndex * sizeof(uint64_t)); + [unroll] + for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) + vk::RawBufferStore(outputBufAddr+sizeof(uint32_t)+sizeof(dtype_t)*(workgroupOffset+ix)+i*sizeof(uint32_t), value[i]); + // vk::RawBufferStore(outputBufAddr + sizeof(uint32_t) + sizeof(dtype_t) * (workgroupOffset+ix), value, sizeof(uint32_t)); TODO why won't this work??? 
} void workgroupExecutionAndMemoryBarrier() @@ -49,8 +53,9 @@ struct operation_t template class binop, typename T, uint32_t N> static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) { + uint64_t outputBufAddr = vk::RawBufferLoad(pc.outputAddressBufAddress + binop::BindingIndex * sizeof(uint64_t)); if (globalIndex()==0u) - output[binop::BindingIndex].template Store(0,nbl::hlsl::glsl::gl_SubgroupSize()); + vk::RawBufferStore(outputBufAddr, nbl::hlsl::glsl::gl_SubgroupSize()); operation_t,nbl::hlsl::jit::device_capabilities> func; func(); // store is done with data accessor now @@ -59,7 +64,7 @@ static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) type_t test() { - const type_t sourceVal = inputValue[globalIndex()]; + type_t sourceVal = vk::RawBufferLoad(pc.inputBufAddress + globalIndex() * sizeof(type_t)); subtest(sourceVal); subtest(sourceVal); diff --git a/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl b/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl index b0ccbf295..c02d86969 100644 --- a/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl @@ -25,10 +25,13 @@ using config_t = nbl::hlsl::workgroup2::ArithmeticConfiguration type_t; -// unfortunately DXC chokes on descriptors as static members -// https://github.com/microsoft/DirectXShaderCompiler/issues/5940 -[[vk::binding(0, 0)]] StructuredBuffer inputValue; -[[vk::binding(1, 0)]] RWByteAddressBuffer output[8]; +struct PushConstantData +{ + uint64_t inputBufAddress; + uint64_t outputAddressBufAddress; +}; + +[[vk::push_constant]] PushConstantData pc; // because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way uint32_t globalIndex(); diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp index e7dfcefa1..282473d12 100644 --- a/23_Arithmetic2UnitTest/main.cpp +++ b/23_Arithmetic2UnitTest/main.cpp @@ -45,6 +45,12 @@ struct 
emulatedScanExclusive static inline constexpr const char* name = "exclusive_scan"; }; +struct PushConstantData +{ + uint64_t inputBufAddress; + uint64_t outputAddressBufAddress; +}; + class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueueApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication { using device_base_t = application_templates::BasicMultiQueueApplication; @@ -76,7 +82,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu IGPUBuffer::SCreationParams inputDataBufferCreationParams = {}; inputDataBufferCreationParams.size = sizeof(Output<>::data[0]) * elementCount; - inputDataBufferCreationParams.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT; + inputDataBufferCreationParams.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; m_utils->createFilledDeviceLocalBufferOnDedMem( SIntendedSubmitInfo{.queue=getTransferUpQueue()}, std::move(inputDataBufferCreationParams), @@ -89,53 +95,37 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu { IGPUBuffer::SCreationParams params = {}; params.size = sizeof(uint32_t) + gpuinputDataBuffer->getSize(); - params.usage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_SRC_BIT; + params.usage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_SRC_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; outputBuffers[i] = m_device->createBuffer(std::move(params)); auto mreq = outputBuffers[i]->getMemoryReqs(); mreq.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits(); assert(mreq.memoryTypeBits); - auto bufferMem = m_device->allocate(mreq, outputBuffers[i].get()); + auto bufferMem = m_device->allocate(mreq, outputBuffers[i].get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); assert(bufferMem.isValid()); } - // create Descriptor Set and Pipeline Layout 
+ // create buffer to store BDA of output buffers + smart_refctd_ptr gpuOutputAddressesBuffer; { - // create Descriptor Set Layout - smart_refctd_ptr dsLayout; - { - IGPUDescriptorSetLayout::SBinding binding[2]; - for (uint32_t i = 0u; i < 2; i++) - binding[i] = {{},i,IDescriptor::E_TYPE::ET_STORAGE_BUFFER,IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,IShader::E_SHADER_STAGE::ESS_COMPUTE,1u,nullptr }; - binding[1].count = OutputBufferCount; - dsLayout = m_device->createDescriptorSetLayout(binding); - } - - // set and transient pool - auto descPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE,{&dsLayout.get(),1}); - descriptorSet = descPool->createDescriptorSet(smart_refctd_ptr(dsLayout)); - { - IGPUDescriptorSet::SDescriptorInfo infos[1+OutputBufferCount]; - infos[0].desc = gpuinputDataBuffer; - infos[0].info.buffer = { 0u,gpuinputDataBuffer->getSize() }; - for (uint32_t i = 1u; i <= OutputBufferCount; i++) - { - auto buff = outputBuffers[i - 1]; - infos[i].info.buffer = { 0u,buff->getSize() }; - infos[i].desc = std::move(buff); // save an atomic in the refcount - - } - - IGPUDescriptorSet::SWriteDescriptorSet writes[2]; - for (uint32_t i=0u; i<2; i++) - writes[i] = {descriptorSet.get(),i,0u,1u,infos+i}; - writes[1].count = OutputBufferCount; + std::array outputAddresses; + for (uint32_t i = 0; i < OutputBufferCount; i++) + outputAddresses[i] = outputBuffers[i]->getDeviceAddress(); + + IGPUBuffer::SCreationParams params; + params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + params.size = OutputBufferCount * sizeof(uint64_t); + m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = getTransferUpQueue() }, std::move(params), outputAddresses.data()).move_into(gpuOutputAddressesBuffer); + } + pc.inputBufAddress = gpuinputDataBuffer->getDeviceAddress(); + pc.outputAddressBufAddress = 
gpuOutputAddressesBuffer->getDeviceAddress(); - m_device->updateDescriptorSets(2, writes, 0u, nullptr); - } + // create Pipeline Layout + { + SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, .offset = 0,.size = sizeof(PushConstantData) }; - pipelineLayout = m_device->createPipelineLayout({},std::move(dsLayout)); + pipelineLayout = m_device->createPipelineLayout({&pcRange, 1}); } // load shader source from file @@ -333,7 +323,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu } cmdbuf->begin(IGPUCommandBuffer::USAGE::NONE); cmdbuf->bindComputePipeline(pipeline.get()); - cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &descriptorSet.get()); + cmdbuf->pushConstants(pipelineLayout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(PushConstantData), &pc); cmdbuf->dispatch(workgroupCount, 1, 1); { IGPUCommandBuffer::SPipelineBarrierDependencyInfo::buffer_barrier_t memoryBarrier[OutputBufferCount]; @@ -467,8 +457,8 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu uint32_t* inputData = nullptr; constexpr static inline uint32_t OutputBufferCount = 8u; smart_refctd_ptr outputBuffers[OutputBufferCount]; - smart_refctd_ptr descriptorSet; smart_refctd_ptr pipelineLayout; + PushConstantData pc; smart_refctd_ptr sema; uint64_t timelineValue = 0; From f4af3edc1cd8d152f6c67bd15577b2595cb2a43f Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 20 May 2025 12:05:15 +0700 Subject: [PATCH 182/296] benchmarks use bda --- .../app_resources/benchmarkSubgroup.comp.hlsl | 8 ++- .../benchmarkWorkgroup.comp.hlsl | 6 +- .../app_resources/shaderCommon.hlsl | 22 ++++--- .../app_resources/workgroupCommon.hlsl | 11 ++-- 29_Arithmetic2Bench/main.cpp | 59 ++++++++++--------- 5 files changed, 63 insertions(+), 43 deletions(-) diff --git a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl index 
2f575d39a..e21d67fcb 100644 --- a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl +++ b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl @@ -25,18 +25,22 @@ static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal) using params_t = nbl::hlsl::subgroup2::ArithmeticParams::base_t, N, nbl::hlsl::jit::device_capabilities>; type_t value = sourceVal; + const uint64_t outputBufAddr = vk::RawBufferLoad(pc.outputAddressBufAddress + binop::BindingIndex * sizeof(uint64_t), sizeof(uint64_t)); + operation_t func; // [unroll] for (uint32_t i = 0; i < NUM_LOOPS; i++) value = func(value); - output[binop::BindingIndex].template Store(sizeof(uint32_t) + sizeof(type_t) * globalIndex(), value); + [unroll] + for (uint32_t i = 0; i < N; i++) + vk::RawBufferStore(outputBufAddr+sizeof(uint32_t)+sizeof(type_t)*globalIndex()+i*sizeof(uint32_t), val[i]); } void benchmark() { const uint32_t idx = globalIndex(); - type_t sourceVal = inputValue[idx]; + type_t sourceVal = vk::RawBufferLoad(pc.inputBufAddress + idx * sizeof(type_t)); subbench(sourceVal); subbench(sourceVal); diff --git a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl index 6e32bedbd..0194b2f75 100644 --- a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl +++ b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl @@ -55,8 +55,10 @@ struct operation_t template class binop, typename T, uint32_t N> static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal) { + const uint64_t outputBufAddr = vk::RawBufferLoad(pc.outputAddressBufAddress + binop::BindingIndex * sizeof(uint64_t), sizeof(uint64_t)); + if (globalIndex()==0u) - output[binop::BindingIndex].template Store(0,nbl::hlsl::glsl::gl_SubgroupSize()); + vk::RawBufferStore(outputBufAddr, nbl::hlsl::glsl::gl_SubgroupSize()); operation_t,nbl::hlsl::jit::device_capabilities> func; // TODO separate out store/load from DataProxy? 
so we don't do too many RW in benchmark @@ -67,7 +69,7 @@ static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal) type_t benchmark() { - const type_t sourceVal = inputValue[globalIndex()]; + const type_t sourceVal = vk::RawBufferLoad(pc.inputBufAddress + globalIndex() * sizeof(type_t)); subbench(sourceVal); subbench(sourceVal); diff --git a/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl b/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl index 376f69579..ae0f61f33 100644 --- a/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl +++ b/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl @@ -16,10 +16,13 @@ uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1 typedef vector type_t; -// unfortunately DXC chokes on descriptors as static members -// https://github.com/microsoft/DirectXShaderCompiler/issues/5940 -[[vk::binding(0, 0)]] StructuredBuffer inputValue; -[[vk::binding(1, 0)]] RWByteAddressBuffer output[8]; +struct PushConstantData +{ + uint64_t inputBufAddress; + uint64_t outputAddressBufAddress; +}; + +[[vk::push_constant]] PushConstantData pc; // because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way uint32_t globalIndex(); @@ -41,19 +44,24 @@ static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) using config_t = nbl::hlsl::subgroup2::Configuration; using params_t = nbl::hlsl::subgroup2::ArithmeticParams::base_t, N, nbl::hlsl::jit::device_capabilities>; + const uint64_t outputBufAddr = vk::RawBufferLoad(pc.outputAddressBufAddress + binop::BindingIndex * sizeof(uint64_t), sizeof(uint64_t)); + if (globalIndex()==0u) - output[binop::BindingIndex].template Store(0,nbl::hlsl::glsl::gl_SubgroupSize()); + vk::RawBufferStore(outputBufAddr, nbl::hlsl::glsl::gl_SubgroupSize()); operation_t func; if (canStore()) - output[binop::BindingIndex].template Store(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func(sourceVal)); + [unroll] + for (uint32_t i = 0; i < N; i++) + 
vk::RawBufferStore(outputBufAddr+sizeof(uint32_t)+sizeof(type_t)*globalIndex()+i*sizeof(uint32_t), val[i]); + // vk::RawBufferStore(outputBufAddr + sizeof(uint32_t) + sizeof(dtype_t) * globalIndex(), value, sizeof(uint32_t)); TODO why won't this work??? } type_t test() { const uint32_t idx = globalIndex(); - type_t sourceVal = inputValue[idx]; + type_t sourceVal = vk::RawBufferLoad(pc.inputBufAddress + idx * sizeof(type_t)); subtest(sourceVal); subtest(sourceVal); diff --git a/29_Arithmetic2Bench/app_resources/workgroupCommon.hlsl b/29_Arithmetic2Bench/app_resources/workgroupCommon.hlsl index b0ccbf295..c02d86969 100644 --- a/29_Arithmetic2Bench/app_resources/workgroupCommon.hlsl +++ b/29_Arithmetic2Bench/app_resources/workgroupCommon.hlsl @@ -25,10 +25,13 @@ using config_t = nbl::hlsl::workgroup2::ArithmeticConfiguration type_t; -// unfortunately DXC chokes on descriptors as static members -// https://github.com/microsoft/DirectXShaderCompiler/issues/5940 -[[vk::binding(0, 0)]] StructuredBuffer inputValue; -[[vk::binding(1, 0)]] RWByteAddressBuffer output[8]; +struct PushConstantData +{ + uint64_t inputBufAddress; + uint64_t outputAddressBufAddress; +}; + +[[vk::push_constant]] PushConstantData pc; // because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way uint32_t globalIndex(); diff --git a/29_Arithmetic2Bench/main.cpp b/29_Arithmetic2Bench/main.cpp index bf20d5faa..0772997dc 100644 --- a/29_Arithmetic2Bench/main.cpp +++ b/29_Arithmetic2Bench/main.cpp @@ -47,6 +47,12 @@ struct emulatedScanExclusive static inline constexpr const char* name = "exclusive_scan"; }; +struct PushConstantData +{ + uint64_t inputBufAddress; + uint64_t outputAddressBufAddress; +}; + // NOTE added swapchain + drawing frames to be able to profile with Nsight, which still doesn't support profiling headless compute shaders class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, public 
application_templates::MonoAssetManagerAndBuiltinResourceApplication { @@ -130,7 +136,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub const uint32_t elementCount = Output<>::ScanElementCount; // populate our random data buffer on the CPU and create a GPU copy inputData = new uint32_t[elementCount]; - smart_refctd_ptr gpuinputDataBuffer; { std::mt19937 randGenerator(0xdeadbeefu); for (uint32_t i = 0u; i < elementCount; i++) @@ -138,7 +143,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub IGPUBuffer::SCreationParams inputDataBufferCreationParams = {}; inputDataBufferCreationParams.size = sizeof(Output<>::data[0]) * elementCount; - inputDataBufferCreationParams.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT; + inputDataBufferCreationParams.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; m_utils->createFilledDeviceLocalBufferOnDedMem( SIntendedSubmitInfo{.queue=getTransferUpQueue()}, std::move(inputDataBufferCreationParams), @@ -151,17 +156,31 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub { IGPUBuffer::SCreationParams params = {}; params.size = sizeof(uint32_t) + gpuinputDataBuffer->getSize(); - params.usage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_SRC_BIT; + params.usage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_SRC_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; outputBuffers[i] = m_device->createBuffer(std::move(params)); auto mreq = outputBuffers[i]->getMemoryReqs(); mreq.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits(); assert(mreq.memoryTypeBits); - auto bufferMem = m_device->allocate(mreq, outputBuffers[i].get()); + auto bufferMem = m_device->allocate(mreq, outputBuffers[i].get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); assert(bufferMem.isValid()); } + // 
create buffer to store BDA of output buffers + { + std::array outputAddresses; + for (uint32_t i = 0; i < OutputBufferCount; i++) + outputAddresses[i] = outputBuffers[i]->getDeviceAddress(); + + IGPUBuffer::SCreationParams params; + params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + params.size = OutputBufferCount * sizeof(uint64_t); + m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = getTransferUpQueue() }, std::move(params), outputAddresses.data()).move_into(gpuOutputAddressesBuffer); + } + pc.inputBufAddress = gpuinputDataBuffer->getDeviceAddress(); + pc.outputAddressBufAddress = gpuOutputAddressesBuffer->getDeviceAddress(); + // create dummy image dummyImg = m_device->createImage({ { @@ -194,36 +213,16 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub // set and transient pool smart_refctd_ptr benchLayout; { - IGPUDescriptorSetLayout::SBinding binding[3]; - for (uint32_t i = 0u; i < 2; i++) - binding[i] = { {},i,IDescriptor::E_TYPE::ET_STORAGE_BUFFER,IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,IShader::E_SHADER_STAGE::ESS_COMPUTE,1u,nullptr }; - binding[1].count = OutputBufferCount; - binding[2] = { {},2,IDescriptor::E_TYPE::ET_STORAGE_IMAGE,IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT,IShader::E_SHADER_STAGE::ESS_COMPUTE,1u,nullptr }; + IGPUDescriptorSetLayout::SBinding binding[1]; + binding[0] = { {},2,IDescriptor::E_TYPE::ET_STORAGE_IMAGE,IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT,IShader::E_SHADER_STAGE::ESS_COMPUTE,1u,nullptr }; benchLayout = m_device->createDescriptorSetLayout(binding); } benchPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, { &benchLayout.get(),1 }); benchDs = benchPool->createDescriptorSet(smart_refctd_ptr(benchLayout)); - { - 
IGPUDescriptorSet::SDescriptorInfo infos[1 + OutputBufferCount]; - infos[0].desc = gpuinputDataBuffer; - infos[0].info.buffer = { 0u,gpuinputDataBuffer->getSize() }; - for (uint32_t i = 1u; i <= OutputBufferCount; i++) - { - auto buff = outputBuffers[i - 1]; - infos[i].info.buffer = { 0u,buff->getSize() }; - infos[i].desc = std::move(buff); // save an atomic in the refcount - } - // write swapchain image descriptor in loop - IGPUDescriptorSet::SWriteDescriptorSet writes[2]; - for (uint32_t i = 0u; i < 2; i++) - writes[i] = { benchDs.get(),i,0u,1u,infos + i }; - writes[1].count = OutputBufferCount; - - m_device->updateDescriptorSets(2, writes, 0u, nullptr); - } - benchPplnLayout = m_device->createPipelineLayout({}, std::move(benchLayout)); + SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, .offset = 0,.size = sizeof(PushConstantData) }; + benchPplnLayout = m_device->createPipelineLayout({ &pcRange, 1 }, std::move(benchLayout)); } // load shader source from file @@ -370,6 +369,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub const auto SubgroupSizeLog2 = hlsl::findMSB(MinSubgroupSize); cmdbuf->bindDescriptorSets(EPBP_COMPUTE, benchSets[0].pipeline->getLayout(), 0u, 1u, &benchDs.get()); + cmdbuf->pushConstants(benchSets[0].pipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(PushConstantData), &pc); for (uint32_t i = 0; i < benchSets.size(); i++) runBenchmark(cmdbuf, benchSets[i], elementCount, SubgroupSizeLog2); @@ -722,8 +722,11 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub smart_refctd_ptr benchDs; uint32_t* inputData = nullptr; + smart_refctd_ptr gpuinputDataBuffer; constexpr static inline uint32_t OutputBufferCount = 8u; smart_refctd_ptr outputBuffers[OutputBufferCount]; + smart_refctd_ptr gpuOutputAddressesBuffer; + PushConstantData pc; smart_refctd_ptr sema; uint64_t timelineValue = 0; From a394f2216ffa8a843350570ca4f4dafe66b27700 Mon 
Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 20 May 2025 15:47:40 +0700 Subject: [PATCH 183/296] use data accessor with preload data in reg --- .../app_resources/testWorkgroup.comp.hlsl | 53 ++++++++++++++++++- 1 file changed, 51 insertions(+), 2 deletions(-) diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl index 9debd184d..047572cde 100644 --- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl @@ -32,6 +32,53 @@ struct DataProxy } }; +template +struct PreloadedDataProxy +{ + using dtype_t = vector; + static_assert(nbl::hlsl::is_same_v); + + NBL_CONSTEXPR_STATIC_INLINE uint32_t PreloadedDataCount = Config::VirtualWorkgroupSize / Config::WorkgroupSize; + + template + void get(const uint32_t ix, NBL_REF_ARG(dtype_t) value) + { + value = preloaded[(ix-nbl::hlsl::workgroup::SubgroupContiguousIndex())>>Config::WorkgroupSizeLog2]; + } + template + void set(const uint32_t ix, const dtype_t value) + { + preloaded[(ix-nbl::hlsl::workgroup::SubgroupContiguousIndex())>>Config::WorkgroupSizeLog2] = value; + } + + void preload() + { + const uint32_t workgroupOffset = nbl::hlsl::glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize; + [unroll] + for (uint32_t idx = 0; idx < PreloadedDataCount; idx++) + preloaded[idx] = vk::RawBufferLoad(pc.inputBufAddress + (workgroupOffset + idx * Config::WorkgroupSize + nbl::hlsl::workgroup::SubgroupContiguousIndex()) * sizeof(dtype_t)); + } + void unload() + { + const uint32_t workgroupOffset = nbl::hlsl::glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize; + uint64_t outputBufAddr = vk::RawBufferLoad(pc.outputAddressBufAddress + Binop::BindingIndex * sizeof(uint64_t)); + [unroll] + for (uint32_t idx = 0; idx < PreloadedDataCount; idx++) + [unroll] + for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) + 
vk::RawBufferStore(outputBufAddr+sizeof(uint32_t)+sizeof(dtype_t)*(workgroupOffset + idx * Config::WorkgroupSize + nbl::hlsl::workgroup::SubgroupContiguousIndex())+i*sizeof(uint32_t), preloaded[idx][i]); + // vk::RawBufferStore(outputBufAddr + sizeof(uint32_t) + sizeof(dtype_t) * (workgroupOffset + idx * Config::WorkgroupSize + nbl::hlsl::workgroup::SubgroupContiguousIndex()), preloaded[idx], sizeof(uint32_t)); TODO why won't this work??? + } + + void workgroupExecutionAndMemoryBarrier() + { + nbl::hlsl::glsl::barrier(); + //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above + } + + dtype_t preloaded[PreloadedDataCount]; +}; + static ScratchProxy arithmeticAccessor; template @@ -42,10 +89,12 @@ struct operation_t void operator()() { - DataProxy dataAccessor; - nbl::hlsl::OPERATION::template __call, ScratchProxy>(dataAccessor,arithmeticAccessor); + PreloadedDataProxy dataAccessor; + dataAccessor.preload(); + nbl::hlsl::OPERATION::template __call, ScratchProxy>(dataAccessor,arithmeticAccessor); // we barrier before because we alias the accessors for Binop arithmeticAccessor.workgroupExecutionAndMemoryBarrier(); + dataAccessor.unload(); } }; From 44c34a8a65866fb6304c12032efd08e2338c7116 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 20 May 2025 16:32:53 +0700 Subject: [PATCH 184/296] use store with data type because it works now --- 23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl | 5 +---- .../app_resources/testWorkgroup.comp.hlsl | 10 ++-------- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl b/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl index 45a1f8097..05dcfb469 100644 --- a/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl @@ -52,10 +52,7 @@ static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) operation_t func; type_t val = func(sourceVal); if (canStore()) - [unroll] - for (uint32_t i = 0; i < 
N; i++) - vk::RawBufferStore(outputBufAddr+sizeof(uint32_t)+sizeof(type_t)*globalIndex()+i*sizeof(uint32_t), val[i]); - // vk::RawBufferStore(outputBufAddr + sizeof(uint32_t) + sizeof(dtype_t) * globalIndex(), value, sizeof(uint32_t)); TODO why won't this work??? + vk::RawBufferStore(outputBufAddr + sizeof(uint32_t) + sizeof(type_t) * globalIndex(), val, sizeof(uint32_t)); } diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl index 047572cde..38b6714bd 100644 --- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl @@ -19,10 +19,7 @@ struct DataProxy { const uint32_t workgroupOffset = nbl::hlsl::glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize; uint64_t outputBufAddr = vk::RawBufferLoad(pc.outputAddressBufAddress + Binop::BindingIndex * sizeof(uint64_t)); - [unroll] - for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) - vk::RawBufferStore(outputBufAddr+sizeof(uint32_t)+sizeof(dtype_t)*(workgroupOffset+ix)+i*sizeof(uint32_t), value[i]); - // vk::RawBufferStore(outputBufAddr + sizeof(uint32_t) + sizeof(dtype_t) * (workgroupOffset+ix), value, sizeof(uint32_t)); TODO why won't this work??? 
+ vk::RawBufferStore(outputBufAddr + sizeof(uint32_t) + sizeof(dtype_t) * (workgroupOffset+ix), value, sizeof(uint32_t)); } void workgroupExecutionAndMemoryBarrier() @@ -64,10 +61,7 @@ struct PreloadedDataProxy uint64_t outputBufAddr = vk::RawBufferLoad(pc.outputAddressBufAddress + Binop::BindingIndex * sizeof(uint64_t)); [unroll] for (uint32_t idx = 0; idx < PreloadedDataCount; idx++) - [unroll] - for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) - vk::RawBufferStore(outputBufAddr+sizeof(uint32_t)+sizeof(dtype_t)*(workgroupOffset + idx * Config::WorkgroupSize + nbl::hlsl::workgroup::SubgroupContiguousIndex())+i*sizeof(uint32_t), preloaded[idx][i]); - // vk::RawBufferStore(outputBufAddr + sizeof(uint32_t) + sizeof(dtype_t) * (workgroupOffset + idx * Config::WorkgroupSize + nbl::hlsl::workgroup::SubgroupContiguousIndex()), preloaded[idx], sizeof(uint32_t)); TODO why won't this work??? + vk::RawBufferStore(outputBufAddr + sizeof(uint32_t) + sizeof(dtype_t) * (workgroupOffset + idx * Config::WorkgroupSize + nbl::hlsl::workgroup::SubgroupContiguousIndex()), preloaded[idx], sizeof(uint32_t)); } void workgroupExecutionAndMemoryBarrier() From 52f5485d78f88c721ca4a971349d512664355ef7 Mon Sep 17 00:00:00 2001 From: devsh Date: Tue, 20 May 2025 14:25:10 +0200 Subject: [PATCH 185/296] make old code compile with new API and work with renderdoc --- 67_RayQueryGeometry/main.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp index 0d7494e9c..2b5145913 100644 --- a/67_RayQueryGeometry/main.cpp +++ b/67_RayQueryGeometry/main.cpp @@ -4,7 +4,7 @@ #include "common.hpp" -#define TEST_ASSET_CONV_AS +//#define TEST_ASSET_CONV_AS class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication { @@ -722,7 +722,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu 
cpuTlas->setInstances(std::move(geomInstances)); cpuTlas->setBuildFlags(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT); -//#define TEST_REBAR_FALLBACK +#define TEST_REBAR_FALLBACK // convert with asset converter smart_refctd_ptr converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} }); struct MyInputs : CAssetConverter::SInputs @@ -927,7 +927,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), geomInfos).move_into(geometryInfoBuffer); } - return true; + return bool(gpuTlas); } #else bool createGeometries(video::CThreadSafeQueueAdapter* queue, const IGeometryCreator* gc) @@ -1122,7 +1122,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu { IGPUBottomLevelAccelerationStructure::DeviceBuildInfo blasBuildInfos[OT_COUNT]; uint32_t primitiveCounts[OT_COUNT]; - IGPUBottomLevelAccelerationStructure::Triangles triangles[OT_COUNT]; + IGPUBottomLevelAccelerationStructure::Triangles triangles[OT_COUNT]; uint32_t scratchSizes[OT_COUNT]; for (uint32_t i = 0; i < objectsGpu.size(); i++) @@ -1159,7 +1159,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu { const auto* trianglesData = triangles; const uint32_t maxPrimCount[1] = { primitiveCounts[i] }; - buildSizes = m_device->getAccelerationStructureBuildSizes(blasFlags, false, std::span{trianglesData,1}, maxPrimCount); + buildSizes = m_device->getAccelerationStructureBuildSizes(false,blasFlags, false, std::span{trianglesData,1}, maxPrimCount); if (!buildSizes) return logFail("Failed to get BLAS build sizes"); } @@ -1252,7 +1252,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu // compact blas { std::array asSizes{ 0 }; - if (!m_device->getQueryPoolResults(queryPool.get(), 0, objectsGpu.size(), asSizes.data(), sizeof(size_t), 
IQueryPool::WAIT_BIT)) + if (!m_device->getQueryPoolResults(queryPool.get(), 0, objectsGpu.size(), asSizes.data(), sizeof(size_t), bitflag(IQueryPool::RESULTS_FLAGS::WAIT_BIT)|IQueryPool::_64_BIT)) return logFail("Could not get query pool results for AS sizes"); std::array, OT_COUNT> cleanupBlas; From 010a9e549619a3e6426474a2dd1625c43654d669 Mon Sep 17 00:00:00 2001 From: devsh Date: Tue, 20 May 2025 16:14:15 +0200 Subject: [PATCH 186/296] add missing ownership acquire and clean up the code a bit --- 67_RayQueryGeometry/main.cpp | 136 +++++++++++++++++++---------------- 1 file changed, 75 insertions(+), 61 deletions(-) diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp index 2b5145913..1248a1bf3 100644 --- a/67_RayQueryGeometry/main.cpp +++ b/67_RayQueryGeometry/main.cpp @@ -4,7 +4,7 @@ #include "common.hpp" -//#define TEST_ASSET_CONV_AS +#define TEST_ASSET_CONV_AS class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication { @@ -619,37 +619,6 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu const uint32_t byteOffsets[OT_COUNT] = { 18, 24, 24, 20, 20, 24, 16, 12 }; // based on normals data position const uint32_t smoothNormals[OT_COUNT] = { 0, 1, 1, 0, 0, 1, 1, 1 }; - struct CPUBufferBindings - { - nbl::asset::SBufferBinding vertex, index; - }; - std::array cpuBuffers; - - for (uint32_t i = 0; i < cpuBuffers.size(); i++) - { - const auto& geom = objectsCpu[i]; - auto& cpuObj = cpuBuffers[i]; - const bool useIndex = geom.data.indexType != EIT_UNKNOWN; - - auto vBuffer = smart_refctd_ptr(geom.data.bindings[0].buffer); // no offset - auto vUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - - auto iBuffer = smart_refctd_ptr(geom.data.indexBuffer.buffer); // no offset - auto iUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - - 
vBuffer->addUsageFlags(vUsage); - vBuffer->setContentHash(vBuffer->computeContentHash()); - cpuObj.vertex = { .offset = 0, .buffer = vBuffer }; - - if (useIndex) - if (iBuffer) - { - iBuffer->addUsageFlags(iUsage); - iBuffer->setContentHash(iBuffer->computeContentHash()); - } - cpuObj.index = { .offset = 0, .buffer = iBuffer }; - } - // get ICPUBuffers into ICPUBottomLevelAccelerationStructures std::array, OT_COUNT> cpuBlas; for (uint32_t i = 0; i < cpuBlas.size(); i++) @@ -660,11 +629,10 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu auto& tri = triangles->front(); auto& primCount = primitiveCounts->front(); const auto& geom = objectsCpu[i]; - const auto& cpuBuf = cpuBuffers[i]; const bool useIndex = geom.data.indexType != EIT_UNKNOWN; const uint32_t vertexStride = geom.data.inputParams.bindings[0].stride; - const uint32_t numVertices = cpuBuf.vertex.buffer->getSize() / vertexStride; + const uint32_t numVertices = (geom.data.bindings[0].buffer->getSize()-geom.data.bindings[0].offset) / vertexStride; if (useIndex) primCount = geom.data.indexCount / 3; @@ -675,11 +643,16 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu geomInfos[i].vertexStride = vertexStride; geomInfos[i].smoothNormals = smoothNormals[i]; - tri.vertexData[0] = cpuBuf.vertex; - tri.indexData = useIndex ? 
cpuBuf.index : cpuBuf.vertex; + geom.data.bindings[0].buffer->setContentHash(geom.data.bindings[0].buffer->computeContentHash()); + tri.vertexData[0] = geom.data.bindings[0]; + if (useIndex) + { + geom.data.indexBuffer.buffer->setContentHash(geom.data.indexBuffer.buffer->computeContentHash()); + tri.indexData = geom.data.indexBuffer; + } tri.maxVertex = numVertices - 1; tri.vertexStride = vertexStride; - tri.vertexFormat = EF_R32G32B32_SFLOAT; + tri.vertexFormat = static_cast(geom.data.inputParams.attributes[0].format); tri.indexType = geom.data.indexType; tri.geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; @@ -758,46 +731,36 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu inputs.allocator = &myalloc; #endif - std::array tmpTlas; - std::array tmpBuffers; + std::array tmpBuffers; { - tmpTlas[0] = cpuTlas.get(); for (uint32_t i = 0; i < objectsCpu.size(); i++) { - tmpBuffers[2 * i + 0] = cpuBuffers[i].vertex.buffer.get(); - tmpBuffers[2 * i + 1] = cpuBuffers[i].index.buffer.get(); + tmpBuffers[2 * i + 0] = cpuBlas[i]->getTriangleGeometries().front().vertexData[0].buffer.get(); + tmpBuffers[2 * i + 1] = cpuBlas[i]->getTriangleGeometries().front().indexData.buffer.get(); } - std::get>(inputs.assets) = tmpTlas; + std::get>(inputs.assets) = {&cpuTlas.get(),1}; + std::get>(inputs.assets) = {&cpuBlas.data()->get(),cpuBlas.size()}; std::get>(inputs.assets) = tmpBuffers; } auto reservation = converter->reserve(inputs); { - auto prepass = [&](const auto & references) -> bool + auto prepass = [&]() -> bool { auto objects = reservation.getGPUObjects(); - uint32_t counter = {}; for (auto& object : objects) + if (!object.value) { - auto gpu = object.value; - auto* reference = references[counter]; - - if (reference) - { - if (!gpu) - { - m_logger->log("Failed to convert a CPU object to GPU!", ILogger::ELL_ERROR); - return false; - } - } - counter++; + m_logger->log("Failed to convert a CPU object to GPU!", 
ILogger::ELL_ERROR); + return false; } return true; }; - prepass.template operator() < ICPUTopLevelAccelerationStructure > (tmpTlas); - prepass.template operator() < ICPUBuffer > (tmpBuffers); + prepass.template operator()(); + prepass.template operator()(); + prepass.template operator()(); } @@ -812,6 +775,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu xferBufInfos[i].cmdbuf = xferBufs[i].get(); } auto xferSema = m_device->createSemaphore(0u); + xferSema->setObjectDebugName("Transfer Semaphore"); SIntendedSubmitInfo transfer = {}; transfer.queue = getTransferUpQueue(); transfer.scratchCommandBuffers = xferBufInfos; @@ -832,6 +796,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu compBufInfos[i].cmdbuf = compBufs[i].get(); } auto compSema = m_device->createSemaphore(0u); + compSema->setObjectDebugName("Compute Semaphore"); SIntendedSubmitInfo compute = {}; compute.queue = getComputeQueue(); compute.scratchCommandBuffers = compBufInfos; @@ -841,6 +806,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu .stageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT|PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_COPY_BIT }; // convert + auto gQueue = getGraphicsQueue(); { smart_refctd_ptr scratchAlloc; { @@ -895,7 +861,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu params.transfer = &transfer; params.compute = &compute; params.scratchForDeviceASBuild = scratchAlloc.get(); - params.finalUser = queue->getFamilyIndex(); + params.finalUser = gQueue->getFamilyIndex(); auto future = reservation.convert(params); if (future.copy() != IQueue::RESULT::SUCCESS) @@ -920,11 +886,59 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu } } + // { IGPUBuffer::SCreationParams params; params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | 
IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; params.size = OT_COUNT * sizeof(SGeomInfo); - m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), geomInfos).move_into(geometryInfoBuffer); + m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = gQueue }, std::move(params), geomInfos).move_into(geometryInfoBuffer); + } + + // acquire ownership + if (const auto gQFI=gQueue->getFamilyIndex(), otherQueueFamilyIndex=queue->getFamilyIndex(); gQFI!=otherQueueFamilyIndex) + { + smart_refctd_ptr cmdbuf; + m_device->createCommandPool(gQFI,IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT)->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{&cmdbuf,1}); + cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + core::vector> bufBarriers; + auto acquireBufferRange = [&bufBarriers,otherQueueFamilyIndex](const SBufferRange& bufferRange) + { + bufBarriers.push_back({ + .barrier = { + .dep = { + .srcStageMask = PIPELINE_STAGE_FLAGS::NONE, + .srcAccessMask = ACCESS_FLAGS::NONE, + .dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, + .dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_READ_BIT|ACCESS_FLAGS::STORAGE_READ_BIT + }, + .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE, + .otherQueueFamilyIndex = otherQueueFamilyIndex + }, + .range = bufferRange + }); + }; + for (auto buffer : reservation.getGPUObjects()) + { + const auto& buff = buffer.value; + acquireBufferRange({.offset=0,.size=buff->getSize(),.buffer=buff}); + } + auto acquireAS = [&acquireBufferRange](const IGPUAccelerationStructure* as) + { + acquireBufferRange(as->getCreationParams().bufferRange); + }; + for (auto blas : reservation.getGPUObjects()) + acquireAS(blas.value.get()); + acquireAS(gpuTlas.get()); + cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,{.memBarriers={},.bufBarriers=bufBarriers}); + cmdbuf->end(); + const 
IQueue::SSubmitInfo::SCommandBufferInfo cmdbufInfo = { + .cmdbuf = cmdbuf.get() + }; + const IQueue::SSubmitInfo info = { + .waitSemaphores = {}, // we already waited with the host on the AS build + .commandBuffers = {&cmdbufInfo,1} + }; + gQueue->submit({&info,1}); } return bool(gpuTlas); From 0ccd26fc93d22587219b12291f855929949cef74 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 21 May 2025 15:01:30 +0700 Subject: [PATCH 187/296] save reduction returns to storage --- .../app_resources/testWorkgroup.comp.hlsl | 20 ++++++++++++++++++- 23_Arithmetic2UnitTest/main.cpp | 12 ++++++----- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl index 38b6714bd..58e293ba3 100644 --- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl @@ -81,6 +81,23 @@ struct operation_t using binop_base_t = typename Binop::base_t; using otype_t = typename Binop::type_t; + // workgroup reduction returns the value of the reduction + // workgroup scans do no return anything, but use the data accessor to do the storing directly +#if IS_REDUCTION + void operator()() + { + PreloadedDataProxy dataAccessor; + dataAccessor.preload(); + otype_t value = nbl::hlsl::OPERATION::template __call, ScratchProxy>(dataAccessor,arithmeticAccessor); + // we barrier before because we alias the accessors for Binop + arithmeticAccessor.workgroupExecutionAndMemoryBarrier(); + + [unroll] + for (uint32_t i = 0; i < PreloadedDataProxy::PreloadedDataCount; i++) + dataAccessor.preloaded[i] = value; + dataAccessor.unload(); + } +#else void operator()() { PreloadedDataProxy dataAccessor; @@ -90,6 +107,7 @@ struct operation_t arithmeticAccessor.workgroupExecutionAndMemoryBarrier(); dataAccessor.unload(); } +#endif }; @@ -101,7 +119,7 @@ static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) 
vk::RawBufferStore(outputBufAddr, nbl::hlsl::glsl::gl_SubgroupSize()); operation_t,nbl::hlsl::jit::device_capabilities> func; - func(); // store is done with data accessor now + func(); } diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp index 282473d12..2edd34439 100644 --- a/23_Arithmetic2UnitTest/main.cpp +++ b/23_Arithmetic2UnitTest/main.cpp @@ -271,22 +271,24 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu smart_refctd_ptr overriddenUnspecialized; if constexpr (WorkgroupTest) { - const std::string definitions[5] = { + const std::string definitions[6] = { "workgroup2::" + arith_name, std::to_string(workgroupSizeLog2), std::to_string(itemsPerWG), std::to_string(itemsPerInvoc), - std::to_string(subgroupSizeLog2) + std::to_string(subgroupSizeLog2), + std::to_string(arith_name=="reduction") }; - const IShaderCompiler::SMacroDefinition defines[5] = { + const IShaderCompiler::SMacroDefinition defines[6] = { { "OPERATION", definitions[0] }, { "WORKGROUP_SIZE_LOG2", definitions[1] }, { "ITEMS_PER_WG", definitions[2] }, { "ITEMS_PER_INVOCATION", definitions[3] }, - { "SUBGROUP_SIZE_LOG2", definitions[4] } + { "SUBGROUP_SIZE_LOG2", definitions[4] }, + { "IS_REDUCTION", definitions[5] } }; - options.preprocessorOptions.extraDefines = { defines, defines + 5 }; + options.preprocessorOptions.extraDefines = { defines, defines + 6 }; overriddenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options); } From f71b1d3117a2434c670bc857286a028e6c27b33d Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 21 May 2025 17:10:19 +0200 Subject: [PATCH 188/296] do the ownership acquires properly --- 67_RayQueryGeometry/main.cpp | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp index 1248a1bf3..6ddefdcf2 100644 --- a/67_RayQueryGeometry/main.cpp +++ b/67_RayQueryGeometry/main.cpp @@ -871,9 
+871,8 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu } // assign gpu objects to output - auto&& tlases = reservation.getGPUObjects(); - gpuTlas = tlases[0].value; auto&& buffers = reservation.getGPUObjects(); + gpuTlas = reservation.getGPUObjects().front().value; for (uint32_t i = 0; i < objectsCpu.size(); i++) { auto vBuffer = buffers[2 * i + 0].value; @@ -917,16 +916,17 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu .range = bufferRange }); }; - for (auto buffer : reservation.getGPUObjects()) + for (const auto& buffer : reservation.getGPUObjects()) { const auto& buff = buffer.value; - acquireBufferRange({.offset=0,.size=buff->getSize(),.buffer=buff}); + if (buff) + acquireBufferRange({.offset=0,.size=buff->getSize(),.buffer=buff}); } auto acquireAS = [&acquireBufferRange](const IGPUAccelerationStructure* as) { acquireBufferRange(as->getCreationParams().bufferRange); }; - for (auto blas : reservation.getGPUObjects()) + for (const auto& blas : reservation.getGPUObjects()) acquireAS(blas.value.get()); acquireAS(gpuTlas.get()); cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,{.memBarriers={},.bufBarriers=bufBarriers}); @@ -934,11 +934,18 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufInfo = { .cmdbuf = cmdbuf.get() }; + const IQueue::SSubmitInfo::SSemaphoreInfo signal = { + .semaphore = compute.scratchSemaphore.semaphore, + .value = compute.getFutureScratchSemaphore().value, + .stageMask = asset::PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS + }; const IQueue::SSubmitInfo info = { .waitSemaphores = {}, // we already waited with the host on the AS build - .commandBuffers = {&cmdbufInfo,1} + .commandBuffers = {&cmdbufInfo,1}, + .signalSemaphores = {&signal,1} }; - gQueue->submit({&info,1}); + if (const auto retval=gQueue->submit({&info,1}); retval!=IQueue::RESULT::SUCCESS) + m_logger->log("Failed to 
transfer ownership with code %d!",system::ILogger::ELL_ERROR,retval); } return bool(gpuTlas); From f5302ec98b37e5473a0563862f6afe2d12ec43d0 Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 22 May 2025 00:13:12 +0200 Subject: [PATCH 189/296] Do the QFOT Acquires properly, fix a bug due to missing BDA on sideband smooth normal info. Also fix alignment issues in BDA raw load --- 67_RayQueryGeometry/app_resources/common.hlsl | 1 + 67_RayQueryGeometry/main.cpp | 119 +++++++++++------- 2 files changed, 72 insertions(+), 48 deletions(-) diff --git a/67_RayQueryGeometry/app_resources/common.hlsl b/67_RayQueryGeometry/app_resources/common.hlsl index ecc811e3f..e39e7192b 100644 --- a/67_RayQueryGeometry/app_resources/common.hlsl +++ b/67_RayQueryGeometry/app_resources/common.hlsl @@ -14,6 +14,7 @@ struct SGeomInfo uint32_t vertexStride : 29; uint32_t indexType : 2; // 16 bit, 32 bit or none uint32_t smoothNormals : 1; // flat for cube, rectangle, disk + uint32_t padding; }; struct SPushConstants diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp index 6ddefdcf2..9d002c1f0 100644 --- a/67_RayQueryGeometry/main.cpp +++ b/67_RayQueryGeometry/main.cpp @@ -732,16 +732,21 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu #endif std::array tmpBuffers; + std::array, OT_COUNT * 2u> tmpBufferPatches; { for (uint32_t i = 0; i < objectsCpu.size(); i++) { tmpBuffers[2 * i + 0] = cpuBlas[i]->getTriangleGeometries().front().vertexData[0].buffer.get(); tmpBuffers[2 * i + 1] = cpuBlas[i]->getTriangleGeometries().front().indexData.buffer.get(); } + // make sure all buffers are BDA-readable + for (auto& patch : tmpBufferPatches) + patch.usage |= asset::IBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT; std::get>(inputs.assets) = {&cpuTlas.get(),1}; std::get>(inputs.assets) = {&cpuBlas.data()->get(),cpuBlas.size()}; std::get>(inputs.assets) = tmpBuffers; + std::get>(inputs.patches) = tmpBufferPatches; } auto reservation = 
converter->reserve(inputs); @@ -806,6 +811,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu .stageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT|PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_COPY_BIT }; // convert + m_api->startCapture(); auto gQueue = getGraphicsQueue(); { smart_refctd_ptr scratchAlloc; @@ -881,7 +887,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu const bool useIndex = geom.data.indexType != EIT_UNKNOWN; geomInfos[i].vertexBufferAddress = vBuffer->getDeviceAddress() + byteOffsets[i]; - geomInfos[i].indexBufferAddress = useIndex ? iBuffer->getDeviceAddress() : geomInfos[i].vertexBufferAddress; + geomInfos[i].indexBufferAddress = useIndex ? iBuffer->getDeviceAddress():0x0ull; } } @@ -894,60 +900,77 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu } // acquire ownership - if (const auto gQFI=gQueue->getFamilyIndex(), otherQueueFamilyIndex=queue->getFamilyIndex(); gQFI!=otherQueueFamilyIndex) { smart_refctd_ptr cmdbuf; - m_device->createCommandPool(gQFI,IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT)->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{&cmdbuf,1}); - cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - core::vector> bufBarriers; - auto acquireBufferRange = [&bufBarriers,otherQueueFamilyIndex](const SBufferRange& bufferRange) - { - bufBarriers.push_back({ - .barrier = { - .dep = { - .srcStageMask = PIPELINE_STAGE_FLAGS::NONE, - .srcAccessMask = ACCESS_FLAGS::NONE, - .dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, - .dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_READ_BIT|ACCESS_FLAGS::STORAGE_READ_BIT - }, - .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE, - .otherQueueFamilyIndex = otherQueueFamilyIndex - }, - .range = bufferRange - }); - }; - for (const auto& buffer : reservation.getGPUObjects()) { - const auto& buff = buffer.value; - if 
(buff) - acquireBufferRange({.offset=0,.size=buff->getSize(),.buffer=buff}); + const auto gQFI = gQueue->getFamilyIndex(); + m_device->createCommandPool(gQFI,IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT)->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{&cmdbuf,1}); + cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + { + core::vector> bufBarriers; + auto acquireBufferRange = [&bufBarriers](const uint8_t otherQueueFamilyIndex, const SBufferRange& bufferRange) + { + bufBarriers.push_back({ + .barrier = { + .dep = { + .srcStageMask = PIPELINE_STAGE_FLAGS::NONE, + .srcAccessMask = ACCESS_FLAGS::NONE, + .dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, + // we don't care what exactly, uncomplex our code + .dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS + }, + .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE, + .otherQueueFamilyIndex = otherQueueFamilyIndex + }, + .range = bufferRange + }); + }; + if (const auto otherQueueFamilyIndex=transfer.queue->getFamilyIndex(); gQFI!=otherQueueFamilyIndex) + for (const auto& buffer : reservation.getGPUObjects()) + { + const auto& buff = buffer.value; + if (buff) + acquireBufferRange(otherQueueFamilyIndex,{.offset=0,.size=buff->getSize(),.buffer=buff}); + } + if (const auto otherQueueFamilyIndex=compute.queue->getFamilyIndex(); gQFI!=otherQueueFamilyIndex) + { + auto acquireAS = [&acquireBufferRange,otherQueueFamilyIndex](const IGPUAccelerationStructure* as) + { + acquireBufferRange(otherQueueFamilyIndex,as->getCreationParams().bufferRange); + }; + for (const auto& blas : reservation.getGPUObjects()) + acquireAS(blas.value.get()); + acquireAS(gpuTlas.get()); + } + if (!bufBarriers.empty()) + cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,{.memBarriers={},.bufBarriers=bufBarriers}); + } + cmdbuf->end(); } - auto acquireAS = [&acquireBufferRange](const IGPUAccelerationStructure* as) + if (!cmdbuf->empty()) { - 
acquireBufferRange(as->getCreationParams().bufferRange); - }; - for (const auto& blas : reservation.getGPUObjects()) - acquireAS(blas.value.get()); - acquireAS(gpuTlas.get()); - cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,{.memBarriers={},.bufBarriers=bufBarriers}); - cmdbuf->end(); - const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufInfo = { - .cmdbuf = cmdbuf.get() - }; - const IQueue::SSubmitInfo::SSemaphoreInfo signal = { - .semaphore = compute.scratchSemaphore.semaphore, - .value = compute.getFutureScratchSemaphore().value, - .stageMask = asset::PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS - }; - const IQueue::SSubmitInfo info = { - .waitSemaphores = {}, // we already waited with the host on the AS build - .commandBuffers = {&cmdbufInfo,1}, - .signalSemaphores = {&signal,1} - }; - if (const auto retval=gQueue->submit({&info,1}); retval!=IQueue::RESULT::SUCCESS) - m_logger->log("Failed to transfer ownership with code %d!",system::ILogger::ELL_ERROR,retval); + const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufInfo = { + .cmdbuf = cmdbuf.get() + }; + const IQueue::SSubmitInfo::SSemaphoreInfo signal = { + .semaphore = compute.scratchSemaphore.semaphore, + .value = compute.getFutureScratchSemaphore().value, + .stageMask = asset::PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS + }; + auto wait = signal; + wait.value--; + const IQueue::SSubmitInfo info = { + .waitSemaphores = {&wait,1}, // we already waited with the host on the AS build + .commandBuffers = {&cmdbufInfo,1}, + .signalSemaphores = {&signal,1} + }; + if (const auto retval=gQueue->submit({&info,1}); retval!=IQueue::RESULT::SUCCESS) + m_logger->log("Failed to transfer ownership with code %d!",system::ILogger::ELL_ERROR,retval); + } } + m_api->endCapture(); + return bool(gpuTlas); } #else From 0bd7fbad07365c205c7b0014d3a5c713937fbef6 Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 22 May 2025 00:17:25 +0200 Subject: [PATCH 190/296] ReBAR AS-conversion codepath tested and enabled by default now --- 
67_RayQueryGeometry/main.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp index 9d002c1f0..7a6abd1af 100644 --- a/67_RayQueryGeometry/main.cpp +++ b/67_RayQueryGeometry/main.cpp @@ -695,7 +695,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu cpuTlas->setInstances(std::move(geomInstances)); cpuTlas->setBuildFlags(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT); -#define TEST_REBAR_FALLBACK +//#define TEST_REBAR_FALLBACK // convert with asset converter smart_refctd_ptr converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} }); struct MyInputs : CAssetConverter::SInputs @@ -862,7 +862,6 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu uint8_t finalUser; } params = {}; -#undef TEST_REBAR_FALLBACK params.utilities = m_utils.get(); params.transfer = &transfer; params.compute = &compute; @@ -925,6 +924,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu .range = bufferRange }); }; +#ifndef TEST_REBAR_FALLBACK if (const auto otherQueueFamilyIndex=transfer.queue->getFamilyIndex(); gQFI!=otherQueueFamilyIndex) for (const auto& buffer : reservation.getGPUObjects()) { @@ -932,6 +932,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu if (buff) acquireBufferRange(otherQueueFamilyIndex,{.offset=0,.size=buff->getSize(),.buffer=buff}); } +#endif if (const auto otherQueueFamilyIndex=compute.queue->getFamilyIndex(); gQFI!=otherQueueFamilyIndex) { auto acquireAS = [&acquireBufferRange,otherQueueFamilyIndex](const IGPUAccelerationStructure* as) @@ -968,6 +969,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu m_logger->log("Failed to transfer ownership with code %d!",system::ILogger::ELL_ERROR,retval); } } +#undef TEST_REBAR_FALLBACK m_api->endCapture(); From 
ff962daed731df536a909ee8d12d2f8d3579431b Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 22 May 2025 01:04:12 +0200 Subject: [PATCH 191/296] typo and no point announcing ass-conv failures if the ass-conv will do it for us --- 67_RayQueryGeometry/main.cpp | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp index 7a6abd1af..8690f55bc 100644 --- a/67_RayQueryGeometry/main.cpp +++ b/67_RayQueryGeometry/main.cpp @@ -750,24 +750,6 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu } auto reservation = converter->reserve(inputs); - { - auto prepass = [&]() -> bool - { - auto objects = reservation.getGPUObjects(); - for (auto& object : objects) - if (!object.value) - { - m_logger->log("Failed to convert a CPU object to GPU!", ILogger::ELL_ERROR); - return false; - } - return true; - }; - - prepass.template operator()(); - prepass.template operator()(); - prepass.template operator()(); - } - constexpr auto XferBufferCount = 2; std::array,XferBufferCount> xferBufs = {}; @@ -893,7 +875,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu // { IGPUBuffer::SCreationParams params; - params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; params.size = OT_COUNT * sizeof(SGeomInfo); m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = gQueue }, std::move(params), geomInfos).move_into(geometryInfoBuffer); } @@ -924,7 +906,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu .range = bufferRange }); }; -#ifndef TEST_REBAR_FALLBACK +#ifdef TEST_REBAR_FALLBACK if (const auto 
otherQueueFamilyIndex=transfer.queue->getFamilyIndex(); gQFI!=otherQueueFamilyIndex) for (const auto& buffer : reservation.getGPUObjects()) { From 09f0d32cf936dd5d200d39cd7eddc3f610766e0b Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 22 May 2025 01:47:53 +0200 Subject: [PATCH 192/296] stuff was crashing cause IGPUBuffers backing BDA were going out of scope --- .../app_resources/render.comp.hlsl | 4 +- 67_RayQueryGeometry/main.cpp | 584 +----------------- 2 files changed, 27 insertions(+), 561 deletions(-) diff --git a/67_RayQueryGeometry/app_resources/render.comp.hlsl b/67_RayQueryGeometry/app_resources/render.comp.hlsl index e3d78f385..aa4524124 100644 --- a/67_RayQueryGeometry/app_resources/render.comp.hlsl +++ b/67_RayQueryGeometry/app_resources/render.comp.hlsl @@ -125,8 +125,8 @@ void main(uint32_t3 threadID : SV_DispatchThreadID) const int primID = spirv::rayQueryGetIntersectionPrimitiveIndexKHR(query, true); // TODO: candidate for `bda::__ptr` - const SGeomInfo geom = vk::RawBufferLoad(pc.geometryInfoBuffer + instID * sizeof(SGeomInfo)); - + const SGeomInfo geom = vk::RawBufferLoad(pc.geometryInfoBuffer + instID * sizeof(SGeomInfo),8); + float3 normals; if (jit::device_capabilities::rayTracingPositionFetch) { diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp index 8690f55bc..1faeaf196 100644 --- a/67_RayQueryGeometry/main.cpp +++ b/67_RayQueryGeometry/main.cpp @@ -1,11 +1,8 @@ // Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h - #include "common.hpp" -#define TEST_ASSET_CONV_AS - class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication { using device_base_t = examples::SimpleWindowedApplication; @@ -128,14 +125,6 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu auto cQueue = getComputeQueue(); -#ifdef TEST_ASSET_CONV_AS - if (!createAccelerationStructuresFromGeometry(cQueue, geometryCreator)) - return logFail("Could not create acceleration structures from provided geometry creator"); -#else - // create geometry objects - if (!createGeometries(gQueue, geometryCreator)) - return logFail("Could not create geometries from geometry creator"); - // create blas/tlas //#define TRY_BUILD_FOR_NGFX // Validation errors on the fake Acquire-Presents, TODO fix #ifdef TRY_BUILD_FOR_NGFX @@ -148,12 +137,11 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu std::this_thread::yield(); } // Nsight is special and can't capture anything not on the queue that performs the swapchain acquire/release - if (!createAccelerationStructures(gQueue)) + if (!createAccelerationStructuresFromGeometry(gQueue,geometryCreator)) #else - if (!createAccelerationStructures(cQueue)) + if (!createAccelerationStructuresFromGeometry(cQueue,geometryCreator)) #endif return logFail("Could not create acceleration structures"); -#endif // TEST_ASSET_CONV_AS // create pipelines { @@ -197,10 +185,10 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu auto descriptorSetLayout = m_device->createDescriptorSetLayout(bindings); const std::array dsLayoutPtrs = { descriptorSetLayout.get() }; - renderPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, std::span(dsLayoutPtrs.begin(), dsLayoutPtrs.end())); - if (!renderPool) + auto pool = 
m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, std::span(dsLayoutPtrs.begin(), dsLayoutPtrs.end())); + if (!pool) return logFail("Could not create descriptor pool"); - renderDs = renderPool->createDescriptorSet(descriptorSetLayout); + renderDs = pool->createDescriptorSet(descriptorSetLayout); if (!renderDs) return logFail("Could not create descriptor set"); @@ -288,7 +276,6 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu static bool first = true; if (first) { - m_api->startCapture(); first = false; } @@ -527,79 +514,6 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu return (dim + size - 1) / size; } - smart_refctd_ptr createBuffer(IGPUBuffer::SCreationParams& params) - { - smart_refctd_ptr buffer; - buffer = m_device->createBuffer(std::move(params)); - auto bufReqs = buffer->getMemoryReqs(); - bufReqs.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits(); - m_device->allocate(bufReqs, buffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); - - return buffer; - } - -#ifndef TEST_ASSET_CONV_AS - smart_refctd_ptr getSingleUseCommandBufferAndBegin(smart_refctd_ptr pool) - { - smart_refctd_ptr cmdbuf; - if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdbuf)) - return nullptr; - - cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); - cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - - return cmdbuf; - } - - void cmdbufSubmitAndWait(smart_refctd_ptr cmdbuf, CThreadSafeQueueAdapter* queue, uint64_t startValue) - { - cmdbuf->end(); - - uint64_t finishedValue = startValue + 1; - - // submit builds - { - auto completed = m_device->createSemaphore(startValue); - - std::array signals; - { - auto& signal = signals.front(); - signal.value = finishedValue; - signal.stageMask = bitflag(PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS); - signal.semaphore = completed.get(); - } - - const 
IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[1] = { { - .cmdbuf = cmdbuf.get() - } }; - - const IQueue::SSubmitInfo infos[] = - { - { - .waitSemaphores = {}, - .commandBuffers = commandBuffers, - .signalSemaphores = signals - } - }; - - if (queue->submit(infos) != IQueue::RESULT::SUCCESS) - { - m_logger->log("Failed to submit geometry transfer upload operations!", ILogger::ELL_ERROR); - return; - } - - const ISemaphore::SWaitInfo info[] = - { { - .semaphore = completed.get(), - .value = finishedValue - } }; - - m_device->blockForSemaphores(info); - } - } -#endif - -#ifdef TEST_ASSET_CONV_AS bool createAccelerationStructuresFromGeometry(video::CThreadSafeQueueAdapter* queue, const IGeometryCreator* gc) { // get geometries in ICPUBuffers @@ -793,6 +707,18 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu .stageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT|PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_COPY_BIT }; // convert +#ifdef TRY_BUILD_FOR_NGFX // NSight is "debugger-challenged" it can't capture anything not happenning "during a frame", so we need to trick it + m_currentImageAcquire = m_surface->acquireNextImage(); + { + const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { { + .semaphore = m_currentImageAcquire.semaphore, + .value = m_currentImageAcquire.acquireCount, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS + } }; + m_surface->present(m_currentImageAcquire.imageIndex,acquired); + } + m_currentImageAcquire = m_surface->acquireNextImage(); +#endif m_api->startCapture(); auto gQueue = getGraphicsQueue(); { @@ -858,12 +784,14 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu } // assign gpu objects to output - auto&& buffers = reservation.getGPUObjects(); gpuTlas = reservation.getGPUObjects().front().value; + for (const auto& buffer : reservation.getGPUObjects()) + if (buffer) + retainedBuffers.push_back(buffer.value); for (uint32_t i = 0; i < 
objectsCpu.size(); i++) { - auto vBuffer = buffers[2 * i + 0].value; - auto iBuffer = buffers[2 * i + 1].value; + auto vBuffer = retainedBuffers[2 * i + 0].get(); + auto iBuffer = retainedBuffers[2 * i + 1].get(); const auto& geom = objectsCpu[i]; const bool useIndex = geom.data.indexType != EIT_UNKNOWN; @@ -952,465 +880,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu } } #undef TEST_REBAR_FALLBACK - - m_api->endCapture(); - - return bool(gpuTlas); - } -#else - bool createGeometries(video::CThreadSafeQueueAdapter* queue, const IGeometryCreator* gc) - { - auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - if (!pool) - return logFail("Couldn't create Command Pool for geometry creation!"); - - std::array objectsCpu; - objectsCpu[OT_CUBE] = ReferenceObjectCpu{ .meta = {.type = OT_CUBE, .name = "Cube Mesh" }, .shadersType = GP_BASIC, .data = gc->createCubeMesh(nbl::core::vector3df(1.f, 1.f, 1.f)) }; - objectsCpu[OT_SPHERE] = ReferenceObjectCpu{ .meta = {.type = OT_SPHERE, .name = "Sphere Mesh" }, .shadersType = GP_BASIC, .data = gc->createSphereMesh(2, 16, 16) }; - objectsCpu[OT_CYLINDER] = ReferenceObjectCpu{ .meta = {.type = OT_CYLINDER, .name = "Cylinder Mesh" }, .shadersType = GP_BASIC, .data = gc->createCylinderMesh(2, 2, 20) }; - objectsCpu[OT_RECTANGLE] = ReferenceObjectCpu{ .meta = {.type = OT_RECTANGLE, .name = "Rectangle Mesh" }, .shadersType = GP_BASIC, .data = gc->createRectangleMesh(nbl::core::vector2df_SIMD(1.5, 3)) }; - objectsCpu[OT_DISK] = ReferenceObjectCpu{ .meta = {.type = OT_DISK, .name = "Disk Mesh" }, .shadersType = GP_BASIC, .data = gc->createDiskMesh(2, 30) }; - objectsCpu[OT_ARROW] = ReferenceObjectCpu{ .meta = {.type = OT_ARROW, .name = "Arrow Mesh" }, .shadersType = GP_BASIC, .data = gc->createArrowMesh() }; - objectsCpu[OT_CONE] = ReferenceObjectCpu{ .meta = {.type = OT_CONE, .name = "Cone Mesh" }, .shadersType = GP_CONE, .data = 
gc->createConeMesh(2, 3, 10) }; - objectsCpu[OT_ICOSPHERE] = ReferenceObjectCpu{ .meta = {.type = OT_ICOSPHERE, .name = "Icosphere Mesh" }, .shadersType = GP_ICO, .data = gc->createIcoSphere(1, 3, true) }; - - struct ScratchVIBindings - { - nbl::asset::SBufferBinding vertex, index; - }; - std::array scratchBuffers; - //std::array geomInfos; - auto geomInfoBuffer = ICPUBuffer::create({ OT_COUNT * sizeof(SGeomInfo) }); - SGeomInfo* geomInfos = reinterpret_cast(geomInfoBuffer->getPointer()); - const uint32_t byteOffsets[OT_COUNT] = { 18, 24, 24, 20, 20, 24, 16, 12 }; // based on normals data position - const uint32_t smoothNormals[OT_COUNT] = { 0, 1, 1, 0, 0, 1, 1, 1 }; - - for (uint32_t i = 0; i < objectsCpu.size(); i++) - { - const auto& geom = objectsCpu[i]; - auto& obj = objectsGpu[i]; - auto& scratchObj = scratchBuffers[i]; - - obj.meta.name = geom.meta.name; - obj.meta.type = geom.meta.type; - - obj.indexCount = geom.data.indexCount; - obj.indexType = geom.data.indexType; - obj.vertexStride = geom.data.inputParams.bindings[0].stride; - - geomInfos[i].indexType = obj.indexType; - geomInfos[i].vertexStride = obj.vertexStride; - geomInfos[i].smoothNormals = smoothNormals[i]; - - auto vBuffer = smart_refctd_ptr(geom.data.bindings[0].buffer); // no offset - auto vUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | - IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - obj.bindings.vertex.offset = 0u; - - auto iBuffer = smart_refctd_ptr(geom.data.indexBuffer.buffer); // no offset - auto iUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | - IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - obj.bindings.index.offset = 0u; - - vBuffer->addUsageFlags(vUsage); - 
vBuffer->setContentHash(vBuffer->computeContentHash()); - scratchObj.vertex = { .offset = 0, .buffer = vBuffer }; - - if (geom.data.indexType != EIT_UNKNOWN) - if (iBuffer) - { - iBuffer->addUsageFlags(iUsage); - iBuffer->setContentHash(iBuffer->computeContentHash()); - } - scratchObj.index = { .offset = 0, .buffer = iBuffer }; - } - - auto cmdbuf = getSingleUseCommandBufferAndBegin(pool); - cmdbuf->beginDebugMarker("Build geometry vertex and index buffers"); - - smart_refctd_ptr converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} }); - CAssetConverter::SInputs inputs = {}; - inputs.logger = m_logger.get(); - - std::array tmpBuffers; - { - for (uint32_t i = 0; i < objectsCpu.size(); i++) - { - tmpBuffers[2 * i + 0] = scratchBuffers[i].vertex.buffer.get(); - tmpBuffers[2 * i + 1] = scratchBuffers[i].index.buffer.get(); - } - - std::get>(inputs.assets) = tmpBuffers; - } - - auto reservation = converter->reserve(inputs); - { - auto prepass = [&](const auto & references) -> bool - { - auto objects = reservation.getGPUObjects(); - uint32_t counter = {}; - for (auto& object : objects) - { - auto gpu = object.value; - auto* reference = references[counter]; - - if (reference) - { - if (!gpu) - { - m_logger->log("Failed to convert a CPU object to GPU!", ILogger::ELL_ERROR); - return false; - } - } - counter++; - } - return true; - }; - - prepass.template operator() < ICPUBuffer > (tmpBuffers); - } - - // not sure if need this (probably not, originally for transition img view) - auto semaphore = m_device->createSemaphore(0u); - - std::array cmdbufs = {}; - cmdbufs.front().cmdbuf = cmdbuf.get(); - - SIntendedSubmitInfo transfer = {}; - transfer.queue = queue; - transfer.scratchCommandBuffers = cmdbufs; - transfer.scratchSemaphore = { - .semaphore = semaphore.get(), - .value = 0u, - .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS - }; - // convert - { - CAssetConverter::SConvertParams params = {}; - params.utilities = m_utils.get(); - 
params.transfer = &transfer; - - auto future = reservation.convert(params); - if (future.copy() != IQueue::RESULT::SUCCESS) - { - m_logger->log("Failed to await submission feature!", ILogger::ELL_ERROR); - return false; - } - - // assign gpu objects to output - auto&& buffers = reservation.getGPUObjects(); - for (uint32_t i = 0; i < objectsCpu.size(); i++) - { - auto& obj = objectsGpu[i]; - obj.bindings.vertex = { .offset = 0, .buffer = buffers[2 * i + 0].value }; - obj.bindings.index = { .offset = 0, .buffer = buffers[2 * i + 1].value }; - - geomInfos[i].vertexBufferAddress = obj.bindings.vertex.buffer->getDeviceAddress() + byteOffsets[i]; - geomInfos[i].indexBufferAddress = obj.useIndex() ? obj.bindings.index.buffer->getDeviceAddress() : geomInfos[i].vertexBufferAddress; - } - } - - { - IGPUBuffer::SCreationParams params; - params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - params.size = OT_COUNT * sizeof(SGeomInfo); - m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{.queue = queue}, std::move(params), geomInfos).move_into(geometryInfoBuffer); - } - - return true; - } - - bool createAccelerationStructures(video::CThreadSafeQueueAdapter* queue) - { - IQueryPool::SCreationParams qParams{ .queryCount = OT_COUNT, .queryType = IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE }; - smart_refctd_ptr queryPool = m_device->createQueryPool(std::move(qParams)); - - auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT); - if (!pool) - return logFail("Couldn't create Command Pool for blas/tlas creation!"); - - m_api->startCapture(); -#ifdef TRY_BUILD_FOR_NGFX // NSight is "debugger-challenged" it can't capture anything not happenning "during a frame", so we need to trick it - m_currentImageAcquire = 
m_surface->acquireNextImage(); - { - const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { { - .semaphore = m_currentImageAcquire.semaphore, - .value = m_currentImageAcquire.acquireCount, - .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS - } }; - m_surface->present(m_currentImageAcquire.imageIndex,acquired); - } - m_currentImageAcquire = m_surface->acquireNextImage(); -#endif - size_t totalScratchSize = 0; - const auto scratchOffsetAlignment = m_device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment; - - // build bottom level ASes - { - IGPUBottomLevelAccelerationStructure::DeviceBuildInfo blasBuildInfos[OT_COUNT]; - uint32_t primitiveCounts[OT_COUNT]; - IGPUBottomLevelAccelerationStructure::Triangles triangles[OT_COUNT]; - uint32_t scratchSizes[OT_COUNT]; - - for (uint32_t i = 0; i < objectsGpu.size(); i++) - { - const auto& obj = objectsGpu[i]; - - const uint32_t vertexStride = obj.vertexStride; - const uint32_t numVertices = obj.bindings.vertex.buffer->getSize() / vertexStride; - if (obj.useIndex()) - primitiveCounts[i] = obj.indexCount / 3; - else - primitiveCounts[i] = numVertices / 3; - - triangles[i].vertexData[0] = obj.bindings.vertex; - triangles[i].indexData = obj.useIndex() ? 
obj.bindings.index : obj.bindings.vertex; - triangles[i].maxVertex = numVertices - 1; - triangles[i].vertexStride = vertexStride; - triangles[i].vertexFormat = EF_R32G32B32_SFLOAT; - triangles[i].indexType = obj.indexType; - triangles[i].geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; - - auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT; - if (m_physicalDevice->getProperties().limits.rayTracingPositionFetch) - blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_DATA_ACCESS; - - blasBuildInfos[i].buildFlags = blasFlags; - blasBuildInfos[i].geometryCount = 1; // only 1 geometry object per blas - blasBuildInfos[i].srcAS = nullptr; - blasBuildInfos[i].dstAS = nullptr; - blasBuildInfos[i].triangles = &triangles[i]; - blasBuildInfos[i].scratch = {}; - - ILogicalDevice::AccelerationStructureBuildSizes buildSizes; - { - const auto* trianglesData = triangles; - const uint32_t maxPrimCount[1] = { primitiveCounts[i] }; - buildSizes = m_device->getAccelerationStructureBuildSizes(false,blasFlags, false, std::span{trianglesData,1}, maxPrimCount); - if (!buildSizes) - return logFail("Failed to get BLAS build sizes"); - } - - scratchSizes[i] = buildSizes.buildScratchSize; - totalScratchSize = core::alignUp(totalScratchSize, scratchOffsetAlignment); - totalScratchSize += buildSizes.buildScratchSize; - - { - IGPUBuffer::SCreationParams params; - params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT; - params.size = buildSizes.accelerationStructureSize; - smart_refctd_ptr asBuffer = createBuffer(params); - - IGPUBottomLevelAccelerationStructure::SCreationParams blasParams; - blasParams.bufferRange.buffer = asBuffer; - blasParams.bufferRange.offset = 0u; - blasParams.bufferRange.size = buildSizes.accelerationStructureSize; - blasParams.flags = 
IGPUBottomLevelAccelerationStructure::SCreationParams::FLAGS::NONE; - gpuBlas[i] = m_device->createBottomLevelAccelerationStructure(std::move(blasParams)); - if (!gpuBlas[i]) - return logFail("Could not create BLAS"); - } - } - - auto cmdbufBlas = getSingleUseCommandBufferAndBegin(pool); - cmdbufBlas->beginDebugMarker("Build BLAS"); - - cmdbufBlas->resetQueryPool(queryPool.get(), 0, objectsGpu.size()); - - smart_refctd_ptr scratchBuffer; - { - IGPUBuffer::SCreationParams params; - params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT; - params.size = totalScratchSize; - scratchBuffer = createBuffer(params); - } - - uint32_t queryCount = 0; - IGPUBottomLevelAccelerationStructure::BuildRangeInfo buildRangeInfos[OT_COUNT]; - IGPUBottomLevelAccelerationStructure::BuildRangeInfo* pRangeInfos[OT_COUNT]; - for (uint32_t i = 0; i < objectsGpu.size(); i++) - { - blasBuildInfos[i].dstAS = gpuBlas[i].get(); - blasBuildInfos[i].scratch.buffer = scratchBuffer; - if (i == 0) - { - blasBuildInfos[i].scratch.offset = 0u; - } - else - { - const auto unalignedOffset = blasBuildInfos[i - 1].scratch.offset + scratchSizes[i - 1]; - blasBuildInfos[i].scratch.offset = core::alignUp(unalignedOffset, scratchOffsetAlignment); - } - - buildRangeInfos[i].primitiveCount = primitiveCounts[i]; - buildRangeInfos[i].primitiveByteOffset = 0u; - buildRangeInfos[i].firstVertex = 0u; - buildRangeInfos[i].transformByteOffset = 0u; - - pRangeInfos[i] = &buildRangeInfos[i]; - } - - if (!cmdbufBlas->buildAccelerationStructures({ blasBuildInfos, OT_COUNT }, pRangeInfos)) - return logFail("Failed to build BLAS"); - - { - SMemoryBarrier memBarrier; - memBarrier.srcStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT; - memBarrier.srcAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT; - memBarrier.dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT; - memBarrier.dstAccessMask = 
ACCESS_FLAGS::ACCELERATION_STRUCTURE_READ_BIT; - cmdbufBlas->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} }); - } - - const IGPUAccelerationStructure* ases[OT_COUNT]; - for (uint32_t i = 0; i < objectsGpu.size(); i++) - ases[i] = gpuBlas[i].get(); - if (!cmdbufBlas->writeAccelerationStructureProperties({ ases, OT_COUNT }, IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE, - queryPool.get(), queryCount++)) - return logFail("Failed to write acceleration structure properties!"); - - cmdbufBlas->endDebugMarker(); - cmdbufSubmitAndWait(cmdbufBlas, queue, 39); - } - - auto cmdbufCompact = getSingleUseCommandBufferAndBegin(pool); - cmdbufCompact->beginDebugMarker("Compact BLAS"); - - // compact blas - { - std::array asSizes{ 0 }; - if (!m_device->getQueryPoolResults(queryPool.get(), 0, objectsGpu.size(), asSizes.data(), sizeof(size_t), bitflag(IQueryPool::RESULTS_FLAGS::WAIT_BIT)|IQueryPool::_64_BIT)) - return logFail("Could not get query pool results for AS sizes"); - - std::array, OT_COUNT> cleanupBlas; - for (uint32_t i = 0; i < objectsGpu.size(); i++) - { - cleanupBlas[i] = gpuBlas[i]; - { - IGPUBuffer::SCreationParams params; - params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT; - params.size = asSizes[i]; - smart_refctd_ptr asBuffer = createBuffer(params); - - IGPUBottomLevelAccelerationStructure::SCreationParams blasParams; - blasParams.bufferRange.buffer = asBuffer; - blasParams.bufferRange.offset = 0u; - blasParams.bufferRange.size = asSizes[i]; - blasParams.flags = IGPUBottomLevelAccelerationStructure::SCreationParams::FLAGS::NONE; - gpuBlas[i] = m_device->createBottomLevelAccelerationStructure(std::move(blasParams)); - if (!gpuBlas[i]) - return logFail("Could not create compacted BLAS"); - } - - IGPUBottomLevelAccelerationStructure::CopyInfo copyInfo; - copyInfo.src = cleanupBlas[i].get(); - copyInfo.dst = gpuBlas[i].get(); - copyInfo.mode = 
IGPUBottomLevelAccelerationStructure::COPY_MODE::COMPACT; - if (!cmdbufCompact->copyAccelerationStructure(copyInfo)) - return logFail("Failed to copy AS to compact"); - } - } - - cmdbufCompact->endDebugMarker(); - cmdbufSubmitAndWait(cmdbufCompact, queue, 40); - - auto cmdbufTlas = getSingleUseCommandBufferAndBegin(pool); - cmdbufTlas->beginDebugMarker("Build TLAS"); - - // build top level AS - { - const uint32_t instancesCount = objectsGpu.size(); - IGPUTopLevelAccelerationStructure::DeviceStaticInstance instances[OT_COUNT]; - for (uint32_t i = 0; i < instancesCount; i++) - { - core::matrix3x4SIMD transform; - transform.setTranslation(nbl::core::vectorSIMDf(5.f * i, 0, 0, 0)); - instances[i].base.blas.deviceAddress = gpuBlas[i]->getReferenceForDeviceOperations().deviceAddress; - instances[i].base.mask = 0xFF; - instances[i].base.instanceCustomIndex = i; - instances[i].base.instanceShaderBindingTableRecordOffset = 0; - instances[i].base.flags = static_cast(IGPUTopLevelAccelerationStructure::INSTANCE_FLAGS::TRIANGLE_FACING_CULL_DISABLE_BIT); - instances[i].transform = transform; - } - - { - size_t bufSize = instancesCount * sizeof(IGPUTopLevelAccelerationStructure::DeviceStaticInstance); - IGPUBuffer::SCreationParams params; - params.usage = bitflag(IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT | - IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - params.size = bufSize; - instancesBuffer = createBuffer(params); - - SBufferRange range = { .offset = 0u, .size = bufSize, .buffer = instancesBuffer }; - cmdbufTlas->updateBuffer(range, instances); - } - - // make sure instances upload complete first - { - SMemoryBarrier memBarrier; - memBarrier.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS; - memBarrier.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT; - memBarrier.dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT; 
- memBarrier.dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT; - cmdbufTlas->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} }); - } - - auto tlasFlags = bitflag(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT); - - IGPUTopLevelAccelerationStructure::DeviceBuildInfo tlasBuildInfo; - tlasBuildInfo.buildFlags = tlasFlags; - tlasBuildInfo.srcAS = nullptr; - tlasBuildInfo.dstAS = nullptr; - tlasBuildInfo.instanceData.buffer = instancesBuffer; - tlasBuildInfo.instanceData.offset = 0u; - tlasBuildInfo.scratch = {}; - - auto buildSizes = m_device->getAccelerationStructureBuildSizes(tlasFlags, false, instancesCount); - if (!buildSizes) - return logFail("Failed to get TLAS build sizes"); - - { - IGPUBuffer::SCreationParams params; - params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT; - params.size = buildSizes.accelerationStructureSize; - smart_refctd_ptr asBuffer = createBuffer(params); - - IGPUTopLevelAccelerationStructure::SCreationParams tlasParams; - tlasParams.bufferRange.buffer = asBuffer; - tlasParams.bufferRange.offset = 0u; - tlasParams.bufferRange.size = buildSizes.accelerationStructureSize; - tlasParams.flags = IGPUTopLevelAccelerationStructure::SCreationParams::FLAGS::NONE; - gpuTlas = m_device->createTopLevelAccelerationStructure(std::move(tlasParams)); - if (!gpuTlas) - return logFail("Could not create TLAS"); - } - - smart_refctd_ptr scratchBuffer; - { - IGPUBuffer::SCreationParams params; - params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT; - params.size = buildSizes.buildScratchSize; - scratchBuffer = createBuffer(params); - } - - tlasBuildInfo.dstAS = gpuTlas.get(); - tlasBuildInfo.scratch.buffer = scratchBuffer; - tlasBuildInfo.scratch.offset = 0u; - - IGPUTopLevelAccelerationStructure::BuildRangeInfo buildRangeInfo[1u]; - buildRangeInfo[0].instanceCount = 
instancesCount; - buildRangeInfo[0].instanceByteOffset = 0u; - IGPUTopLevelAccelerationStructure::BuildRangeInfo* pRangeInfos; - pRangeInfos = &buildRangeInfo[0]; - - if (!cmdbufTlas->buildAccelerationStructures({ &tlasBuildInfo, 1 }, pRangeInfos)) - return logFail("Failed to build TLAS"); - } - - cmdbufTlas->endDebugMarker(); - cmdbufSubmitAndWait(cmdbufTlas, queue, 45); - #ifdef TRY_BUILD_FOR_NGFX { const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { { @@ -1423,9 +893,8 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu #endif m_api->endCapture(); - return true; + return bool(gpuTlas); } -#endif // TEST_ASSET_CONV_AS smart_refctd_ptr m_window; @@ -1442,18 +911,15 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu Camera camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD()); video::CDumbPresentationOracle oracle; - std::array objectsGpu; - - std::array, OT_COUNT> gpuBlas; + // TODO: maybe convert the descriptor set from ICPU as well? 
smart_refctd_ptr gpuTlas; - smart_refctd_ptr instancesBuffer; smart_refctd_ptr geometryInfoBuffer; + core::vector> retainedBuffers; smart_refctd_ptr outHDRImage; smart_refctd_ptr renderPipeline; smart_refctd_ptr renderDs; - smart_refctd_ptr renderPool; uint16_t gcIndex = {}; From cac9ea184c1eae6cb4279f6d8603b4f97de59699 Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 22 May 2025 01:48:23 +0200 Subject: [PATCH 193/296] typo --- 67_RayQueryGeometry/main.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp index 1faeaf196..e096c1b71 100644 --- a/67_RayQueryGeometry/main.cpp +++ b/67_RayQueryGeometry/main.cpp @@ -786,7 +786,6 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu // assign gpu objects to output gpuTlas = reservation.getGPUObjects().front().value; for (const auto& buffer : reservation.getGPUObjects()) - if (buffer) retainedBuffers.push_back(buffer.value); for (uint32_t i = 0; i < objectsCpu.size(); i++) { From 2a991a95c7eb891e616aa8e79a0b624a43217a86 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 22 May 2025 14:42:26 +0700 Subject: [PATCH 194/296] combined headers between subgroup, workgroup stuff, restored spirv cache test --- .../app_resources/shaderCommon.hlsl | 43 ----------- .../app_resources/testSubgroup.comp.hlsl | 41 ++++++++++ .../app_resources/testWorkgroup.comp.hlsl | 41 +++++++++- .../app_resources/workgroupCommon.hlsl | 74 ------------------- 23_Arithmetic2UnitTest/main.cpp | 49 +++++++++++- .../app_resources/benchmarkSubgroup.comp.hlsl | 17 ++--- .../benchmarkWorkgroup.comp.hlsl | 47 ++++++++++-- .../app_resources/shaderCommon.hlsl | 48 ++---------- .../app_resources/workgroupCommon.hlsl | 74 ------------------- 29_Arithmetic2Bench/main.cpp | 1 - 10 files changed, 181 insertions(+), 254 deletions(-) delete mode 100644 23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl delete mode 100644 
29_Arithmetic2Bench/app_resources/workgroupCommon.hlsl diff --git a/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl b/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl index 05dcfb469..31d59121b 100644 --- a/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl @@ -1,10 +1,5 @@ #include "common.hlsl" -#include "nbl/builtin/hlsl/glsl_compat/core.hlsl" -#include "nbl/builtin/hlsl/subgroup/basic.hlsl" -#include "nbl/builtin/hlsl/subgroup/arithmetic_portability.hlsl" -#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" - #include "nbl/builtin/hlsl/jit/device_capabilities.hlsl" // https://github.com/microsoft/DirectXShaderCompiler/issues/6144 @@ -14,8 +9,6 @@ uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1 #error "Define ITEMS_PER_INVOCATION!" #endif -typedef vector type_t; - struct PushConstantData { uint64_t inputBufAddress; @@ -36,39 +29,3 @@ bool canStore(); #ifndef SUBGROUP_SIZE_LOG2 #error "Define SUBGROUP_SIZE_LOG2!" 
#endif -template class binop, typename T, uint32_t N> -static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) -{ - // TODO static assert vector == type_t - //using type_t = vector; - using config_t = nbl::hlsl::subgroup2::Configuration; - using params_t = nbl::hlsl::subgroup2::ArithmeticParams::base_t, N, nbl::hlsl::jit::device_capabilities>; - - const uint64_t outputBufAddr = vk::RawBufferLoad(pc.outputAddressBufAddress + binop::BindingIndex * sizeof(uint64_t), sizeof(uint64_t)); - - if (globalIndex()==0u) - vk::RawBufferStore(outputBufAddr, nbl::hlsl::glsl::gl_SubgroupSize()); - - operation_t func; - type_t val = func(sourceVal); - if (canStore()) - vk::RawBufferStore(outputBufAddr + sizeof(uint32_t) + sizeof(type_t) * globalIndex(), val, sizeof(uint32_t)); -} - - -type_t test() -{ - const uint32_t idx = globalIndex(); - type_t sourceVal = vk::RawBufferLoad(pc.inputBufAddress + idx * sizeof(type_t)); - - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - return sourceVal; -} - -#include "nbl/builtin/hlsl/workgroup/basic.hlsl" diff --git a/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl index 2cc1ccb60..c5a030851 100644 --- a/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl @@ -2,7 +2,48 @@ #define operation_t nbl::hlsl::OPERATION +#include "nbl/builtin/hlsl/glsl_compat/core.hlsl" +#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl" +#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" + #include "shaderCommon.hlsl" +#include "nbl/builtin/hlsl/workgroup/basic.hlsl" + +typedef vector type_t; + +template class binop, typename T, uint32_t N> +static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) +{ + // TODO static assert vector == type_t + //using type_t = vector; + using config_t = 
nbl::hlsl::subgroup2::Configuration; + using params_t = nbl::hlsl::subgroup2::ArithmeticParams::base_t, N, nbl::hlsl::jit::device_capabilities>; + + const uint64_t outputBufAddr = vk::RawBufferLoad(pc.outputAddressBufAddress + binop::BindingIndex * sizeof(uint64_t), sizeof(uint64_t)); + + if (globalIndex()==0u) + vk::RawBufferStore(outputBufAddr, nbl::hlsl::glsl::gl_SubgroupSize()); + + operation_t func; + type_t val = func(sourceVal); + if (canStore()) + vk::RawBufferStore(outputBufAddr + sizeof(uint32_t) + sizeof(type_t) * globalIndex(), val, sizeof(uint32_t)); +} + +type_t test() +{ + const uint32_t idx = globalIndex(); + type_t sourceVal = vk::RawBufferLoad(pc.inputBufAddress + idx * sizeof(type_t)); + + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + return sourceVal; +} uint32_t globalIndex() { diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl index 58e293ba3..51f556797 100644 --- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl @@ -1,6 +1,45 @@ #pragma shader_stage(compute) -#include "workgroupCommon.hlsl" +#include "nbl/builtin/hlsl/glsl_compat/core.hlsl" +#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl" +#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" +#include "nbl/builtin/hlsl/workgroup2/arithmetic.hlsl" + +static const uint32_t WORKGROUP_SIZE = 1u << WORKGROUP_SIZE_LOG2; + +#include "shaderCommon.hlsl" + +using config_t = nbl::hlsl::workgroup2::ArithmeticConfiguration; + +typedef vector type_t; + +// final (level 1/2) scan needs to fit in one subgroup exactly +groupshared uint32_t scratch[config_t::ElementCount]; + +struct ScratchProxy +{ + template + void get(const uint32_t ix, NBL_REF_ARG(AccessType) value) + { + value = scratch[ix]; + } + template + 
void set(const uint32_t ix, const AccessType value) + { + scratch[ix] = value; + } + + uint32_t atomicOr(const uint32_t ix, const uint32_t value) + { + return nbl::hlsl::glsl::atomicOr(scratch[ix],value); + } + + void workgroupExecutionAndMemoryBarrier() + { + nbl::hlsl::glsl::barrier(); + //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above + } +}; template struct DataProxy diff --git a/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl b/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl deleted file mode 100644 index c02d86969..000000000 --- a/23_Arithmetic2UnitTest/app_resources/workgroupCommon.hlsl +++ /dev/null @@ -1,74 +0,0 @@ -#include "nbl/builtin/hlsl/workgroup/scratch_size.hlsl" - -#include "nbl/builtin/hlsl/workgroup2/arithmetic.hlsl" - -#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl" - -#include "nbl/builtin/hlsl/glsl_compat/core.hlsl" -#include "nbl/builtin/hlsl/subgroup/basic.hlsl" -#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" - -#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl" - -#include "common.hlsl" - -static const uint32_t WORKGROUP_SIZE = 1u << WORKGROUP_SIZE_LOG2; - -// https://github.com/microsoft/DirectXShaderCompiler/issues/6144 -uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);} - -#ifndef ITEMS_PER_INVOCATION -#error "Define ITEMS_PER_INVOCATION!" 
-#endif - -using config_t = nbl::hlsl::workgroup2::ArithmeticConfiguration; - -typedef vector type_t; - -struct PushConstantData -{ - uint64_t inputBufAddress; - uint64_t outputAddressBufAddress; -}; - -[[vk::push_constant]] PushConstantData pc; - -// because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way -uint32_t globalIndex(); -// since we test ITEMS_PER_WG - void get(const uint32_t ix, NBL_REF_ARG(AccessType) value) - { - value = scratch[ix]; - } - template - void set(const uint32_t ix, const AccessType value) - { - scratch[ix] = value; - } - - uint32_t atomicOr(const uint32_t ix, const uint32_t value) - { - return nbl::hlsl::glsl::atomicOr(scratch[ix],value); - } - - void workgroupExecutionAndMemoryBarrier() - { - nbl::hlsl::glsl::barrier(); - //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above - } -}; diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp index 2edd34439..2daa772ae 100644 --- a/23_Arithmetic2UnitTest/main.cpp +++ b/23_Arithmetic2UnitTest/main.cpp @@ -124,10 +124,42 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu // create Pipeline Layout { SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, .offset = 0,.size = sizeof(PushConstantData) }; - pipelineLayout = m_device->createPipelineLayout({&pcRange, 1}); } + const auto spirv_isa_cache_path = localOutputCWD / "spirv_isa_cache.bin"; + // enclose to make sure file goes out of scope and we can reopen it + { + smart_refctd_ptr spirv_isa_cache_input; + // try to load SPIR-V to ISA cache + { + ISystem::future_t> fileCreate; + m_system->createFile(fileCreate, spirv_isa_cache_path, IFile::ECF_READ | IFile::ECF_MAPPABLE | IFile::ECF_COHERENT); + if (auto lock = fileCreate.acquire()) + spirv_isa_cache_input = *lock; + } + // create the cache + { + std::span spirv_isa_cache_data = {}; + if (spirv_isa_cache_input) + spirv_isa_cache_data = { 
reinterpret_cast(spirv_isa_cache_input->getMappedPointer()),spirv_isa_cache_input->getSize() }; + else + m_logger->log("Failed to load SPIR-V 2 ISA cache!", ILogger::ELL_PERFORMANCE); + // Normally we'd deserialize a `ICPUPipelineCache` properly and pass that instead + m_spirv_isa_cache = m_device->createPipelineCache(spirv_isa_cache_data); + } + } + { + // TODO: rename `deleteDirectory` to just `delete`? and a `IFile::setSize()` ? + m_system->deleteDirectory(spirv_isa_cache_path); + ISystem::future_t> fileCreate; + m_system->createFile(fileCreate, spirv_isa_cache_path, IFile::ECF_WRITE); + // I can be relatively sure I'll succeed to acquire the future, the pointer to created file might be null though. + m_spirv_isa_cache_output = *fileCreate.acquire(); + if (!m_spirv_isa_cache_output) + logFail("Failed to Create SPIR-V to ISA cache file."); + } + // load shader source from file auto getShaderSource = [&](const char* filePath) -> auto { @@ -192,6 +224,17 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu logTestOutcome(passed, itemsPerWG); } m_api->endCapture(); + + // save cache every now and then + { + auto cpu = m_spirv_isa_cache->convertToCPUCache(); + // Normally we'd beautifully JSON serialize the thing, allow multiple devices & drivers + metadata + auto bin = cpu->getEntries().begin()->second.bin; + IFile::success_t success; + m_spirv_isa_cache_output->write(success, bin->data(), 0ull, bin->size()); + if (!success) + logFail("Could not write Create SPIR-V to ISA cache to disk!"); + } } } @@ -238,7 +281,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu .requireFullSubgroups = true }; core::smart_refctd_ptr pipeline; - if (!m_device->createComputePipelines(nullptr,{¶ms,1},&pipeline)) + if (!m_device->createComputePipelines(m_spirv_isa_cache.get(),{¶ms,1},&pipeline)) return nullptr; return pipeline; } @@ -455,6 +498,8 @@ class Workgroup2ScanTestApp final : public 
application_templates::BasicMultiQueu IQueue* transferDownQueue; IQueue* computeQueue; + smart_refctd_ptr m_spirv_isa_cache; + smart_refctd_ptr m_spirv_isa_cache_output; uint32_t* inputData = nullptr; constexpr static inline uint32_t OutputBufferCount = 8u; diff --git a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl index e21d67fcb..cb033a5bb 100644 --- a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl +++ b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl @@ -2,10 +2,14 @@ #define operation_t nbl::hlsl::OPERATION +#include "nbl/builtin/hlsl/glsl_compat/core.hlsl" +#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl" +#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" + #include "shaderCommon.hlsl" +#include "nbl/builtin/hlsl/workgroup/basic.hlsl" -// NOTE added dummy output image to be able to profile with Nsight, which still doesn't support profiling headless compute shaders -[[vk::binding(2, 0)]] RWTexture2D outImage; // dummy +typedef vector type_t; uint32_t globalIndex() { @@ -14,10 +18,6 @@ uint32_t globalIndex() bool canStore() {return true;} -#ifndef NUM_LOOPS -#error "Define NUM_LOOPS!" 
-#endif - template class binop, typename T, uint32_t N> static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal) { @@ -32,9 +32,8 @@ static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal) for (uint32_t i = 0; i < NUM_LOOPS; i++) value = func(value); - [unroll] - for (uint32_t i = 0; i < N; i++) - vk::RawBufferStore(outputBufAddr+sizeof(uint32_t)+sizeof(type_t)*globalIndex()+i*sizeof(uint32_t), val[i]); + if (canStore()) + vk::RawBufferStore(outputBufAddr + sizeof(uint32_t) + sizeof(type_t) * globalIndex(), value, sizeof(uint32_t)); } void benchmark() diff --git a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl index 0194b2f75..8815eb037 100644 --- a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl +++ b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl @@ -1,9 +1,46 @@ #pragma shader_stage(compute) -#include "workgroupCommon.hlsl" +#include "nbl/builtin/hlsl/glsl_compat/core.hlsl" +#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl" +#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" +#include "nbl/builtin/hlsl/workgroup2/arithmetic.hlsl" + +static const uint32_t WORKGROUP_SIZE = 1u << WORKGROUP_SIZE_LOG2; + +#include "shaderCommon.hlsl" + +using config_t = nbl::hlsl::workgroup2::ArithmeticConfiguration; + +typedef vector type_t; + +// final (level 1/2) scan needs to fit in one subgroup exactly +groupshared uint32_t scratch[config_t::ElementCount]; + +struct ScratchProxy +{ + template + void get(const uint32_t ix, NBL_REF_ARG(AccessType) value) + { + value = scratch[ix]; + } + template + void set(const uint32_t ix, const AccessType value) + { + scratch[ix] = value; + } + + uint32_t atomicOr(const uint32_t ix, const uint32_t value) + { + return nbl::hlsl::glsl::atomicOr(scratch[ix],value); + } + + void workgroupExecutionAndMemoryBarrier() + { + nbl::hlsl::glsl::barrier(); + //nbl::hlsl::glsl::memoryBarrierShared(); implied by the 
above + } +}; -// NOTE added dummy output image to be able to profile with Nsight, which still doesn't support profiling headless compute shaders -[[vk::binding(2, 0)]] RWTexture2D outImage; // dummy template struct DataProxy @@ -48,10 +85,6 @@ struct operation_t } }; -#ifndef NUM_LOOPS -#error "Define NUM_LOOPS!" -#endif - template class binop, typename T, uint32_t N> static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal) { diff --git a/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl b/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl index ae0f61f33..a14986e0d 100644 --- a/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl +++ b/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl @@ -1,10 +1,5 @@ #include "common.hlsl" -#include "nbl/builtin/hlsl/glsl_compat/core.hlsl" -#include "nbl/builtin/hlsl/subgroup/basic.hlsl" -#include "nbl/builtin/hlsl/subgroup/arithmetic_portability.hlsl" -#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" - #include "nbl/builtin/hlsl/jit/device_capabilities.hlsl" // https://github.com/microsoft/DirectXShaderCompiler/issues/6144 @@ -14,8 +9,6 @@ uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1 #error "Define ITEMS_PER_INVOCATION!" #endif -typedef vector type_t; - struct PushConstantData { uint64_t inputBufAddress; @@ -36,41 +29,10 @@ bool canStore(); #ifndef SUBGROUP_SIZE_LOG2 #error "Define SUBGROUP_SIZE_LOG2!" 
#endif -template class binop, typename T, uint32_t N> -static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) -{ - // TODO static assert vector == type_t - //using type_t = vector; - using config_t = nbl::hlsl::subgroup2::Configuration; - using params_t = nbl::hlsl::subgroup2::ArithmeticParams::base_t, N, nbl::hlsl::jit::device_capabilities>; - - const uint64_t outputBufAddr = vk::RawBufferLoad(pc.outputAddressBufAddress + binop::BindingIndex * sizeof(uint64_t), sizeof(uint64_t)); - - if (globalIndex()==0u) - vk::RawBufferStore(outputBufAddr, nbl::hlsl::glsl::gl_SubgroupSize()); - - operation_t func; - if (canStore()) - [unroll] - for (uint32_t i = 0; i < N; i++) - vk::RawBufferStore(outputBufAddr+sizeof(uint32_t)+sizeof(type_t)*globalIndex()+i*sizeof(uint32_t), val[i]); - // vk::RawBufferStore(outputBufAddr + sizeof(uint32_t) + sizeof(dtype_t) * globalIndex(), value, sizeof(uint32_t)); TODO why won't this work??? -} +#ifndef NUM_LOOPS +#error "Define NUM_LOOPS!" +#endif -type_t test() -{ - const uint32_t idx = globalIndex(); - type_t sourceVal = vk::RawBufferLoad(pc.inputBufAddress + idx * sizeof(type_t)); - - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - return sourceVal; -} - -#include "nbl/builtin/hlsl/workgroup/basic.hlsl" +// NOTE added dummy output image to be able to profile with Nsight, which still doesn't support profiling headless compute shaders +[[vk::binding(2, 0)]] RWTexture2D outImage; // dummy diff --git a/29_Arithmetic2Bench/app_resources/workgroupCommon.hlsl b/29_Arithmetic2Bench/app_resources/workgroupCommon.hlsl deleted file mode 100644 index c02d86969..000000000 --- a/29_Arithmetic2Bench/app_resources/workgroupCommon.hlsl +++ /dev/null @@ -1,74 +0,0 @@ -#include "nbl/builtin/hlsl/workgroup/scratch_size.hlsl" - -#include "nbl/builtin/hlsl/workgroup2/arithmetic.hlsl" - -#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl" - -#include 
"nbl/builtin/hlsl/glsl_compat/core.hlsl" -#include "nbl/builtin/hlsl/subgroup/basic.hlsl" -#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" - -#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl" - -#include "common.hlsl" - -static const uint32_t WORKGROUP_SIZE = 1u << WORKGROUP_SIZE_LOG2; - -// https://github.com/microsoft/DirectXShaderCompiler/issues/6144 -uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);} - -#ifndef ITEMS_PER_INVOCATION -#error "Define ITEMS_PER_INVOCATION!" -#endif - -using config_t = nbl::hlsl::workgroup2::ArithmeticConfiguration; - -typedef vector type_t; - -struct PushConstantData -{ - uint64_t inputBufAddress; - uint64_t outputAddressBufAddress; -}; - -[[vk::push_constant]] PushConstantData pc; - -// because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way -uint32_t globalIndex(); -// since we test ITEMS_PER_WG - void get(const uint32_t ix, NBL_REF_ARG(AccessType) value) - { - value = scratch[ix]; - } - template - void set(const uint32_t ix, const AccessType value) - { - scratch[ix] = value; - } - - uint32_t atomicOr(const uint32_t ix, const uint32_t value) - { - return nbl::hlsl::glsl::atomicOr(scratch[ix],value); - } - - void workgroupExecutionAndMemoryBarrier() - { - nbl::hlsl::glsl::barrier(); - //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above - } -}; diff --git a/29_Arithmetic2Bench/main.cpp b/29_Arithmetic2Bench/main.cpp index 0772997dc..5b8792040 100644 --- a/29_Arithmetic2Bench/main.cpp +++ b/29_Arithmetic2Bench/main.cpp @@ -708,7 +708,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub uint32_t numSubmits = 0; /* PARAMETERS TO CHANGE FOR DIFFERENT BENCHMARKS */ - constexpr static inline bool DoWorkgroupBenchmarks = true; uint32_t ItemsPerInvocation = 4u; constexpr static inline uint32_t NumLoops = 1000u; From e4735a4e840e870803e30fa78325f89fcf01df60 Mon Sep 17 00:00:00 
2001 From: keptsecret Date: Thu, 22 May 2025 15:03:10 +0700 Subject: [PATCH 195/296] simplified test,benchmark function template params --- .../app_resources/testSubgroup.comp.hlsl | 22 +++++++++---------- .../app_resources/testWorkgroup.comp.hlsl | 20 ++++++++--------- .../app_resources/benchmarkSubgroup.comp.hlsl | 20 ++++++++--------- .../benchmarkWorkgroup.comp.hlsl | 21 +++++++++--------- 29_Arithmetic2Bench/main.cpp | 4 ++-- 5 files changed, 42 insertions(+), 45 deletions(-) diff --git a/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl index c5a030851..838f7adf9 100644 --- a/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl @@ -11,15 +11,13 @@ typedef vector type_t; -template class binop, typename T, uint32_t N> +template static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) { - // TODO static assert vector == type_t - //using type_t = vector; using config_t = nbl::hlsl::subgroup2::Configuration; - using params_t = nbl::hlsl::subgroup2::ArithmeticParams::base_t, N, nbl::hlsl::jit::device_capabilities>; + using params_t = nbl::hlsl::subgroup2::ArithmeticParams; - const uint64_t outputBufAddr = vk::RawBufferLoad(pc.outputAddressBufAddress + binop::BindingIndex * sizeof(uint64_t), sizeof(uint64_t)); + const uint64_t outputBufAddr = vk::RawBufferLoad(pc.outputAddressBufAddress + Binop::BindingIndex * sizeof(uint64_t), sizeof(uint64_t)); if (globalIndex()==0u) vk::RawBufferStore(outputBufAddr, nbl::hlsl::glsl::gl_SubgroupSize()); @@ -35,13 +33,13 @@ type_t test() const uint32_t idx = globalIndex(); type_t sourceVal = vk::RawBufferLoad(pc.inputBufAddress + idx * sizeof(type_t)); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); + subtest, ITEMS_PER_INVOCATION>(sourceVal); + subtest, 
ITEMS_PER_INVOCATION>(sourceVal); + subtest, ITEMS_PER_INVOCATION>(sourceVal); + subtest, ITEMS_PER_INVOCATION>(sourceVal); + subtest, ITEMS_PER_INVOCATION>(sourceVal); + subtest, ITEMS_PER_INVOCATION>(sourceVal); + subtest, ITEMS_PER_INVOCATION>(sourceVal); return sourceVal; } diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl index 51f556797..e2256d2f1 100644 --- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl @@ -150,14 +150,14 @@ struct operation_t }; -template class binop, typename T, uint32_t N> +template static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) { - uint64_t outputBufAddr = vk::RawBufferLoad(pc.outputAddressBufAddress + binop::BindingIndex * sizeof(uint64_t)); + uint64_t outputBufAddr = vk::RawBufferLoad(pc.outputAddressBufAddress + Binop::BindingIndex * sizeof(uint64_t)); if (globalIndex()==0u) vk::RawBufferStore(outputBufAddr, nbl::hlsl::glsl::gl_SubgroupSize()); - operation_t,nbl::hlsl::jit::device_capabilities> func; + operation_t func; func(); } @@ -166,13 +166,13 @@ type_t test() { type_t sourceVal = vk::RawBufferLoad(pc.inputBufAddress + globalIndex() * sizeof(type_t)); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); + subtest >(sourceVal); + subtest >(sourceVal); + subtest >(sourceVal); + subtest >(sourceVal); + subtest >(sourceVal); + subtest >(sourceVal); + subtest >(sourceVal); return sourceVal; } diff --git a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl index cb033a5bb..113ec2bae 100644 --- a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl +++ b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl @@ -18,14 +18,14 @@ uint32_t globalIndex() bool canStore() 
{return true;} -template class binop, typename T, uint32_t N> +template static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal) { using config_t = nbl::hlsl::subgroup2::Configuration; - using params_t = nbl::hlsl::subgroup2::ArithmeticParams::base_t, N, nbl::hlsl::jit::device_capabilities>; + using params_t = nbl::hlsl::subgroup2::ArithmeticParams; type_t value = sourceVal; - const uint64_t outputBufAddr = vk::RawBufferLoad(pc.outputAddressBufAddress + binop::BindingIndex * sizeof(uint64_t), sizeof(uint64_t)); + const uint64_t outputBufAddr = vk::RawBufferLoad(pc.outputAddressBufAddress + Binop::BindingIndex * sizeof(uint64_t), sizeof(uint64_t)); operation_t func; // [unroll] @@ -41,13 +41,13 @@ void benchmark() const uint32_t idx = globalIndex(); type_t sourceVal = vk::RawBufferLoad(pc.inputBufAddress + idx * sizeof(type_t)); - subbench(sourceVal); - subbench(sourceVal); - subbench(sourceVal); - subbench(sourceVal); - subbench(sourceVal); - subbench(sourceVal); - subbench(sourceVal); + subbench, ITEMS_PER_INVOCATION>(sourceVal); + subbench, ITEMS_PER_INVOCATION>(sourceVal); + subbench, ITEMS_PER_INVOCATION>(sourceVal); + subbench, ITEMS_PER_INVOCATION>(sourceVal); + subbench, ITEMS_PER_INVOCATION>(sourceVal); + subbench, ITEMS_PER_INVOCATION>(sourceVal); + subbench, ITEMS_PER_INVOCATION>(sourceVal); } [numthreads(WORKGROUP_SIZE,1,1)] diff --git a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl index 8815eb037..cdd5a9f4e 100644 --- a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl +++ b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl @@ -85,16 +85,15 @@ struct operation_t } }; -template class binop, typename T, uint32_t N> +template static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal) { - const uint64_t outputBufAddr = vk::RawBufferLoad(pc.outputAddressBufAddress + binop::BindingIndex * sizeof(uint64_t), sizeof(uint64_t)); + const uint64_t 
outputBufAddr = vk::RawBufferLoad(pc.outputAddressBufAddress + Binop::BindingIndex * sizeof(uint64_t), sizeof(uint64_t)); if (globalIndex()==0u) vk::RawBufferStore(outputBufAddr, nbl::hlsl::glsl::gl_SubgroupSize()); - operation_t,nbl::hlsl::jit::device_capabilities> func; - // TODO separate out store/load from DataProxy? so we don't do too many RW in benchmark + operation_t func; for (uint32_t i = 0; i < NUM_LOOPS; i++) func(); // store is done with data accessor now } @@ -104,13 +103,13 @@ type_t benchmark() { const type_t sourceVal = vk::RawBufferLoad(pc.inputBufAddress + globalIndex() * sizeof(type_t)); - subbench(sourceVal); - subbench(sourceVal); - subbench(sourceVal); - subbench(sourceVal); - subbench(sourceVal); - subbench(sourceVal); - subbench(sourceVal); + subbench >(sourceVal); + subbench >(sourceVal); + subbench >(sourceVal); + subbench >(sourceVal); + subbench >(sourceVal); + subbench >(sourceVal); + subbench >(sourceVal); return sourceVal; } diff --git a/29_Arithmetic2Bench/main.cpp b/29_Arithmetic2Bench/main.cpp index 5b8792040..ce2b915b1 100644 --- a/29_Arithmetic2Bench/main.cpp +++ b/29_Arithmetic2Bench/main.cpp @@ -708,13 +708,13 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub uint32_t numSubmits = 0; /* PARAMETERS TO CHANGE FOR DIFFERENT BENCHMARKS */ - constexpr static inline bool DoWorkgroupBenchmarks = true; + constexpr static inline bool DoWorkgroupBenchmarks = false; uint32_t ItemsPerInvocation = 4u; constexpr static inline uint32_t NumLoops = 1000u; constexpr static inline uint32_t NumBenchmarks = 6u; constexpr static inline std::array workgroupSizes = { 32, 64, 128, 256, 512, 1024 }; template - using ArithmeticOp = emulatedReduction; // change this to test other arithmetic ops + using ArithmeticOp = emulatedScanInclusive; // change this to test other arithmetic ops std::array benchSets; smart_refctd_ptr benchPool; From 13ae89f7d3fc666124486b5e18f13922995d3569 Mon Sep 17 00:00:00 2001 From: keptsecret 
Date: Thu, 22 May 2025 15:04:00 +0700 Subject: [PATCH 196/296] revert test to default params --- 29_Arithmetic2Bench/main.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/29_Arithmetic2Bench/main.cpp b/29_Arithmetic2Bench/main.cpp index ce2b915b1..5b8792040 100644 --- a/29_Arithmetic2Bench/main.cpp +++ b/29_Arithmetic2Bench/main.cpp @@ -708,13 +708,13 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub uint32_t numSubmits = 0; /* PARAMETERS TO CHANGE FOR DIFFERENT BENCHMARKS */ - constexpr static inline bool DoWorkgroupBenchmarks = false; + constexpr static inline bool DoWorkgroupBenchmarks = true; uint32_t ItemsPerInvocation = 4u; constexpr static inline uint32_t NumLoops = 1000u; constexpr static inline uint32_t NumBenchmarks = 6u; constexpr static inline std::array workgroupSizes = { 32, 64, 128, 256, 512, 1024 }; template - using ArithmeticOp = emulatedScanInclusive; // change this to test other arithmetic ops + using ArithmeticOp = emulatedReduction; // change this to test other arithmetic ops std::array benchSets; smart_refctd_ptr benchPool; From a8774db88d1d08d0a3fe9f2a30e7dc376120493a Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 22 May 2025 17:02:32 +0700 Subject: [PATCH 197/296] use preloaded data in benchmark --- .../benchmarkWorkgroup.comp.hlsl | 70 +++++++++++++++++-- 29_Arithmetic2Bench/main.cpp | 12 ++-- 2 files changed, 73 insertions(+), 9 deletions(-) diff --git a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl index cdd5a9f4e..31284c520 100644 --- a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl +++ b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl @@ -68,6 +68,50 @@ struct DataProxy } }; +template +struct PreloadedDataProxy +{ + using dtype_t = vector; + static_assert(nbl::hlsl::is_same_v); + + NBL_CONSTEXPR_STATIC_INLINE uint32_t PreloadedDataCount = 
Config::VirtualWorkgroupSize / Config::WorkgroupSize; + + template + void get(const uint32_t ix, NBL_REF_ARG(dtype_t) value) + { + value = preloaded[(ix-nbl::hlsl::workgroup::SubgroupContiguousIndex())>>Config::WorkgroupSizeLog2]; + } + template + void set(const uint32_t ix, const dtype_t value) + { + preloaded[(ix-nbl::hlsl::workgroup::SubgroupContiguousIndex())>>Config::WorkgroupSizeLog2] = value; + } + + void preload() + { + const uint32_t workgroupOffset = nbl::hlsl::glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize; + [unroll] + for (uint32_t idx = 0; idx < PreloadedDataCount; idx++) + preloaded[idx] = vk::RawBufferLoad(pc.inputBufAddress + (workgroupOffset + idx * Config::WorkgroupSize + nbl::hlsl::workgroup::SubgroupContiguousIndex()) * sizeof(dtype_t)); + } + void unload() + { + const uint32_t workgroupOffset = nbl::hlsl::glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize; + uint64_t outputBufAddr = vk::RawBufferLoad(pc.outputAddressBufAddress + Binop::BindingIndex * sizeof(uint64_t)); + [unroll] + for (uint32_t idx = 0; idx < PreloadedDataCount; idx++) + vk::RawBufferStore(outputBufAddr + sizeof(uint32_t) + sizeof(dtype_t) * (workgroupOffset + idx * Config::WorkgroupSize + nbl::hlsl::workgroup::SubgroupContiguousIndex()), preloaded[idx], sizeof(uint32_t)); + } + + void workgroupExecutionAndMemoryBarrier() + { + nbl::hlsl::glsl::barrier(); + //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above + } + + dtype_t preloaded[PreloadedDataCount]; +}; + static ScratchProxy arithmeticAccessor; template @@ -76,13 +120,26 @@ struct operation_t using binop_base_t = typename Binop::base_t; using otype_t = typename Binop::type_t; - void operator()() +#if IS_REDUCTION + void operator()(PreloadedDataProxy dataAccessor) { - DataProxy dataAccessor; - nbl::hlsl::OPERATION::template __call, ScratchProxy>(dataAccessor,arithmeticAccessor); + otype_t value = nbl::hlsl::OPERATION::template __call, ScratchProxy>(dataAccessor,arithmeticAccessor); // we barrier 
before because we alias the accessors for Binop arithmeticAccessor.workgroupExecutionAndMemoryBarrier(); + + [unroll] + for (uint32_t i = 0; i < PreloadedDataProxy::PreloadedDataCount; i++) + dataAccessor.preloaded[i] = value; } +#else + void operator()(PreloadedDataProxy dataAccessor) + { + nbl::hlsl::OPERATION::template __call, ScratchProxy>(dataAccessor,arithmeticAccessor); + // we barrier before because we alias the accessors for Binop + arithmeticAccessor.workgroupExecutionAndMemoryBarrier(); + } +#endif + }; template @@ -93,9 +150,14 @@ static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal) if (globalIndex()==0u) vk::RawBufferStore(outputBufAddr, nbl::hlsl::glsl::gl_SubgroupSize()); + PreloadedDataProxy dataAccessor; + dataAccessor.preload(); + operation_t func; for (uint32_t i = 0; i < NUM_LOOPS; i++) - func(); // store is done with data accessor now + func(dataAccessor); + + dataAccessor.unload(); } diff --git a/29_Arithmetic2Bench/main.cpp b/29_Arithmetic2Bench/main.cpp index 5b8792040..165427750 100644 --- a/29_Arithmetic2Bench/main.cpp +++ b/29_Arithmetic2Bench/main.cpp @@ -599,24 +599,26 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub if constexpr (WorkgroupBench) { const uint32_t workgroupSizeLog2 = hlsl::findMSB(workgroupSize); - const std::string definitions[6] = { + const std::string definitions[7] = { "workgroup2::" + arith_name, std::to_string(workgroupSizeLog2), std::to_string(itemsPerWG), std::to_string(itemsPerInvoc), std::to_string(subgroupSizeLog2), - std::to_string(numLoops) + std::to_string(numLoops), + std::to_string(arith_name=="reduction") }; - const IShaderCompiler::SMacroDefinition defines[6] = { + const IShaderCompiler::SMacroDefinition defines[7] = { { "OPERATION", definitions[0] }, { "WORKGROUP_SIZE_LOG2", definitions[1] }, { "ITEMS_PER_WG", definitions[2] }, { "ITEMS_PER_INVOCATION", definitions[3] }, { "SUBGROUP_SIZE_LOG2", definitions[4] }, - { "NUM_LOOPS", definitions[5] } + { "NUM_LOOPS", 
definitions[5] }, + { "IS_REDUCTION", definitions[6] } }; - options.preprocessorOptions.extraDefines = { defines, defines + 6 }; + options.preprocessorOptions.extraDefines = { defines, defines + 7 }; overriddenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options); } From 8b729d5fae76ac0c63a0744b802ee5d206f7018d Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 22 May 2025 14:26:10 +0200 Subject: [PATCH 198/296] test compaction of BLASes --- 67_RayQueryGeometry/main.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp index e096c1b71..362126332 100644 --- a/67_RayQueryGeometry/main.cpp +++ b/67_RayQueryGeometry/main.cpp @@ -644,10 +644,14 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu myalloc.device = m_device.get(); inputs.allocator = &myalloc; #endif - + + std::array,OT_COUNT> tmpBLASPatches = {}; std::array tmpBuffers; std::array, OT_COUNT * 2u> tmpBufferPatches; { + tmpBLASPatches.front().compactAfterBuild = true; + std::fill(tmpBLASPatches.begin(),tmpBLASPatches.end(),tmpBLASPatches.front()); + // for (uint32_t i = 0; i < objectsCpu.size(); i++) { tmpBuffers[2 * i + 0] = cpuBlas[i]->getTriangleGeometries().front().vertexData[0].buffer.get(); @@ -659,6 +663,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu std::get>(inputs.assets) = {&cpuTlas.get(),1}; std::get>(inputs.assets) = {&cpuBlas.data()->get(),cpuBlas.size()}; + std::get>(inputs.patches) = tmpBLASPatches; std::get>(inputs.assets) = tmpBuffers; std::get>(inputs.patches) = tmpBufferPatches; } From 435212210dbf9392143e3ec8204051013215bd86 Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 22 May 2025 14:39:51 +0200 Subject: [PATCH 199/296] test TLAS compaction --- 67_RayQueryGeometry/main.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp index 
362126332..6fcf6b0d1 100644 --- a/67_RayQueryGeometry/main.cpp +++ b/67_RayQueryGeometry/main.cpp @@ -645,6 +645,8 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu inputs.allocator = &myalloc; #endif + CAssetConverter::patch_t blasPatch = {}; + blasPatch.compactAfterBuild = true; std::array,OT_COUNT> tmpBLASPatches = {}; std::array tmpBuffers; std::array, OT_COUNT * 2u> tmpBufferPatches; @@ -662,6 +664,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu patch.usage |= asset::IBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT; std::get>(inputs.assets) = {&cpuTlas.get(),1}; + std::get>(inputs.patches) = {&blasPatch,1}; std::get>(inputs.assets) = {&cpuBlas.data()->get(),cpuBlas.size()}; std::get>(inputs.patches) = tmpBLASPatches; std::get>(inputs.assets) = tmpBuffers; From 20fed8cc920f787e134323565ac0d8d30fcbfb99 Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 23 May 2025 13:25:55 +0200 Subject: [PATCH 200/296] test Descriptor Set conversion with TLAS rewrites --- 67_RayQueryGeometry/main.cpp | 102 ++++++++++++++++------------------- 1 file changed, 45 insertions(+), 57 deletions(-) diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp index 6fcf6b0d1..ce9eaee1f 100644 --- a/67_RayQueryGeometry/main.cpp +++ b/67_RayQueryGeometry/main.cpp @@ -126,6 +126,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu auto cQueue = getComputeQueue(); // create blas/tlas + renderDs = //#define TRY_BUILD_FOR_NGFX // Validation errors on the fake Acquire-Presents, TODO fix #ifdef TRY_BUILD_FOR_NGFX // Nsight is special and can't do debugger delay so you can debug your CPU stuff during a capture @@ -137,11 +138,12 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu std::this_thread::yield(); } // Nsight is special and can't capture anything not on the queue that performs the swapchain acquire/release - if 
(!createAccelerationStructuresFromGeometry(gQueue,geometryCreator)) + createAccelerationStructureDS(gQueue,geometryCreator); #else - if (!createAccelerationStructuresFromGeometry(cQueue,geometryCreator)) + createAccelerationStructureDS(cQueue,geometryCreator); #endif - return logFail("Could not create acceleration structures"); + if (!renderDs) + return logFail("Could not create acceleration structures and descriptor set"); // create pipelines { @@ -165,35 +167,8 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu if (!shader) return logFail("Failed to create shader!"); - // descriptors - IGPUDescriptorSetLayout::SBinding bindings[] = { - { - .binding = 0, - .type = asset::IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE, - .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, - .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, - .count = 1, - }, - { - .binding = 1, - .type = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE, - .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, - .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, - .count = 1, - } - }; - auto descriptorSetLayout = m_device->createDescriptorSetLayout(bindings); - - const std::array dsLayoutPtrs = { descriptorSetLayout.get() }; - auto pool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, std::span(dsLayoutPtrs.begin(), dsLayoutPtrs.end())); - if (!pool) - return logFail("Could not create descriptor pool"); - renderDs = pool->createDescriptorSet(descriptorSetLayout); - if (!renderDs) - return logFail("Could not create descriptor set"); - SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, .offset = 0u, .size = sizeof(SPushConstants)}; - auto pipelineLayout = m_device->createPipelineLayout({ &pcRange, 1 }, smart_refctd_ptr(descriptorSetLayout), nullptr, nullptr, nullptr); + auto pipelineLayout = m_device->createPipelineLayout({ &pcRange, 1 }, 
smart_refctd_ptr(renderDs->getLayout()), nullptr, nullptr, nullptr); IGPUComputePipeline::SCreationParams params = {}; params.layout = pipelineLayout.get(); @@ -203,23 +178,21 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu } // write descriptors - IGPUDescriptorSet::SDescriptorInfo infos[2]; - infos[0].desc = gpuTlas; - infos[1].desc = m_device->createImageView({ - .flags = IGPUImageView::ECF_NONE, - .subUsages = IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT, - .image = outHDRImage, - .viewType = IGPUImageView::E_TYPE::ET_2D, - .format = asset::EF_R16G16B16A16_SFLOAT - }); - if (!infos[1].desc) - return logFail("Failed to create image view"); - infos[1].info.image.imageLayout = IImage::LAYOUT::GENERAL; - IGPUDescriptorSet::SWriteDescriptorSet writes[3] = { - {.dstSet = renderDs.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &infos[0]}, - {.dstSet = renderDs.get(), .binding = 1, .arrayElement = 0, .count = 1, .info = &infos[1]} - }; - m_device->updateDescriptorSets(std::span(writes, 2), {}); + { + IGPUDescriptorSet::SDescriptorInfo info = {}; + info.desc = m_device->createImageView({ + .flags = IGPUImageView::ECF_NONE, + .subUsages = IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT, + .image = outHDRImage, + .viewType = IGPUImageView::E_TYPE::ET_2D, + .format = asset::EF_R16G16B16A16_SFLOAT + }); + if (!info.desc) + return logFail("Failed to create image view"); + info.info.image.imageLayout = IImage::LAYOUT::GENERAL; + const IGPUDescriptorSet::SWriteDescriptorSet write = {.dstSet=renderDs.get(), .binding=1, .arrayElement=0, .count=1, .info=&info}; + m_device->updateDescriptorSets({&write,1}, {}); + } // camera { @@ -514,7 +487,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu return (dim + size - 1) / size; } - bool createAccelerationStructuresFromGeometry(video::CThreadSafeQueueAdapter* queue, const IGeometryCreator* gc) + smart_refctd_ptr 
createAccelerationStructureDS(video::CThreadSafeQueueAdapter* queue, const IGeometryCreator* gc) { // get geometries in ICPUBuffers std::array objectsCpu; @@ -582,8 +555,6 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu blas->setContentHash(blas->computeContentHash()); } - // TODO: when does compact blas happen? - // get ICPUBottomLevelAccelerationStructure into ICPUTopLevelAccelerationStructure auto geomInstances = make_refctd_dynamic_array>(OT_COUNT); { @@ -608,6 +579,26 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu auto cpuTlas = make_smart_refctd_ptr(); cpuTlas->setInstances(std::move(geomInstances)); cpuTlas->setBuildFlags(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT); + + // descriptor set and layout + ICPUDescriptorSetLayout::SBinding bindings[] = { + { + .binding = 0, + .type = asset::IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE, + .createFlags = IDescriptorSetLayoutBase::SBindingBase::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, + .count = 1, + }, + { + .binding = 1, + .type = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE, + .createFlags = IDescriptorSetLayoutBase::SBindingBase::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, + .count = 1, + } + }; + auto descriptorSet = core::make_smart_refctd_ptr(core::make_smart_refctd_ptr(bindings)); + descriptorSet->getDescriptorInfos(IDescriptorSetLayoutBase::CBindingRedirect::binding_number_t{0},IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE).front().desc = cpuTlas; //#define TEST_REBAR_FALLBACK // convert with asset converter @@ -663,6 +654,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu for (auto& patch : tmpBufferPatches) patch.usage |= asset::IBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT; + std::get>(inputs.assets) = {&descriptorSet.get(),1}; std::get>(inputs.assets) = {&cpuTlas.get(),1}; 
std::get>(inputs.patches) = {&blasPatch,1}; std::get>(inputs.assets) = {&cpuBlas.data()->get(),cpuBlas.size()}; @@ -792,7 +784,6 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu } // assign gpu objects to output - gpuTlas = reservation.getGPUObjects().front().value; for (const auto& buffer : reservation.getGPUObjects()) retainedBuffers.push_back(buffer.value); for (uint32_t i = 0; i < objectsCpu.size(); i++) @@ -858,7 +849,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu }; for (const auto& blas : reservation.getGPUObjects()) acquireAS(blas.value.get()); - acquireAS(gpuTlas.get()); + acquireAS(reservation.getGPUObjects().front().value.get()); } if (!bufBarriers.empty()) cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,{.memBarriers={},.bufBarriers=bufBarriers}); @@ -900,7 +891,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu #endif m_api->endCapture(); - return bool(gpuTlas); + return reservation.getGPUObjects().front().value; } @@ -918,9 +909,6 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu Camera camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD()); video::CDumbPresentationOracle oracle; - // TODO: maybe convert the descriptor set from ICPU as well? 
- smart_refctd_ptr gpuTlas; - smart_refctd_ptr geometryInfoBuffer; core::vector> retainedBuffers; smart_refctd_ptr outHDRImage; From c2023dfbe83d19d3efab475b33a15e3dce2d1681 Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 23 May 2025 14:32:16 +0200 Subject: [PATCH 201/296] name variables correctly --- 67_RayQueryGeometry/main.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp index ce9eaee1f..7371cf1ea 100644 --- a/67_RayQueryGeometry/main.cpp +++ b/67_RayQueryGeometry/main.cpp @@ -636,8 +636,8 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu inputs.allocator = &myalloc; #endif - CAssetConverter::patch_t blasPatch = {}; - blasPatch.compactAfterBuild = true; + CAssetConverter::patch_t tlasPatch = {}; + tlasPatch.compactAfterBuild = true; std::array,OT_COUNT> tmpBLASPatches = {}; std::array tmpBuffers; std::array, OT_COUNT * 2u> tmpBufferPatches; @@ -656,7 +656,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu std::get>(inputs.assets) = {&descriptorSet.get(),1}; std::get>(inputs.assets) = {&cpuTlas.get(),1}; - std::get>(inputs.patches) = {&blasPatch,1}; + std::get>(inputs.patches) = {&tlasPatch,1}; std::get>(inputs.assets) = {&cpuBlas.data()->get(),cpuBlas.size()}; std::get>(inputs.patches) = tmpBLASPatches; std::get>(inputs.assets) = tmpBuffers; @@ -780,7 +780,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu if (future.copy() != IQueue::RESULT::SUCCESS) { m_logger->log("Failed to await submission feature!", ILogger::ELL_ERROR); - return false; + return {}; } // assign gpu objects to output From fae6490244a69223321488214a4606cd4c5044d1 Mon Sep 17 00:00:00 2001 From: devsh Date: Sun, 25 May 2025 19:46:53 +0200 Subject: [PATCH 202/296] get old Acceleration Structure code workin in ex 71 after API change --- 71_RayTracingPipeline/main.cpp | 14 +++++++------- 1 file changed, 7 
insertions(+), 7 deletions(-) diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index faa392a46..54a37a3d3 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -6,7 +6,7 @@ #include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h" #include "nbl/builtin/hlsl/indirect_commands.hlsl" -#define TEST_ASSET_CONV_AS +//#define TEST_ASSET_CONV_AS class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication { @@ -1955,9 +1955,9 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, // build bottom level ASes { core::vector primitiveCounts(blasCount); - core::vector> triangles(m_gpuTriangleGeometries.size()); + core::vector> triangles(m_gpuTriangleGeometries.size()); core::vector scratchSizes(blasCount); - IGPUBottomLevelAccelerationStructure::AABBs aabbs; + IGPUBottomLevelAccelerationStructure::AABBs aabbs; auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT; if (m_physicalDevice->getProperties().limits.rayTracingPositionFetch) @@ -2017,12 +2017,12 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, if (isProcedural) { const auto* aabbData = &aabbs; - buildSizes = m_device->getAccelerationStructureBuildSizes(blasBuildInfos[i].buildFlags, false, std::span{ aabbData, 1 }, maxPrimCount); + buildSizes = m_device->getAccelerationStructureBuildSizes(false, blasBuildInfos[i].buildFlags, false, std::span{ aabbData, 1 }, maxPrimCount); } else { const auto* trianglesData = triangles.data(); - buildSizes = m_device->getAccelerationStructureBuildSizes(blasBuildInfos[i].buildFlags, false, std::span{ trianglesData,1 }, maxPrimCount); + buildSizes = m_device->getAccelerationStructureBuildSizes(false, blasBuildInfos[i].buildFlags, false, std::span{ 
trianglesData,1 }, maxPrimCount); } if (!buildSizes) return logFail("Failed to get BLAS build sizes"); @@ -2144,8 +2144,8 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, IGPUBottomLevelAccelerationStructure::CopyInfo copyInfo; copyInfo.src = cleanupBlas[i].get(); copyInfo.dst = m_gpuBlasList[i].get(); - copyInfo.mode = IGPUBottomLevelAccelerationStructure::COPY_MODE::COMPACT; - if (!cmdbufCompact->copyAccelerationStructure(copyInfo)) + copyInfo.compact = true; + if (!cmdbufCompact->copyAccelerationStructure(copyInfo)) return logFail("Failed to copy AS to compact"); } } From c8d4b607a281dfd04e7554253499f88ca055b087 Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 26 May 2025 09:44:06 +0200 Subject: [PATCH 203/296] the multi-queue was badly implemented, just use Graphics & Compute throughout --- 71_RayTracingPipeline/main.cpp | 754 +-------------------------------- 1 file changed, 18 insertions(+), 736 deletions(-) diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index 54a37a3d3..0b6b4d724 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -6,7 +6,6 @@ #include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h" #include "nbl/builtin/hlsl/indirect_commands.hlsl" -//#define TEST_ASSET_CONV_AS class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication { @@ -25,8 +24,6 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, "Spot" }; - constexpr static inline clock_t::duration DisplayImageDuration = std::chrono::milliseconds(900); - struct ShaderBindingTable { SBufferRange raygenGroupRange; @@ -93,7 +90,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, inline core::vector getQueueRequirements() const override { auto reqs = device_base_t::getQueueRequirements(); - reqs.front().requiredFlags |= 
IQueue::FAMILY_FLAGS::TRANSFER_BIT; + reqs.front().requiredFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; return reqs; } @@ -405,7 +402,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, calculateRayTracingStackSize(m_rayTracingPipeline); - if (!createShaderBindingTable(gQueue, m_rayTracingPipeline)) + if (!createShaderBindingTable(m_rayTracingPipeline)) return logFail("Could not create shader binding table"); } @@ -413,20 +410,11 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, auto assetManager = make_smart_refctd_ptr(smart_refctd_ptr(system)); auto* geometryCreator = assetManager->getGeometryCreator(); - if (!createIndirectBuffer(gQueue)) + if (!createIndirectBuffer()) return logFail("Could not create indirect buffer"); -#ifdef TEST_ASSET_CONV_AS - if (!createAccelerationStructuresFromGeometry(getComputeQueue(), geometryCreator)) + if (!createAccelerationStructuresFromGeometry(geometryCreator)) return logFail("Could not create acceleration structures from geometry creator"); -#else - // create geometry objects - if (!createGeometries(gQueue, geometryCreator)) - return logFail("Could not create geometries from geometry creator"); - - if (!createAccelerationStructures(getComputeQueue())) - return logFail("Could not create acceleration structures"); -#endif // TEST_ASSET_CONV_AS ISampler::SParams samplerParams = { .AnisotropicFilter = 0 @@ -521,7 +509,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, params.renderpass = smart_refctd_ptr(renderpass); params.streamingBuffer = nullptr; params.subpassIx = 0u; - params.transfer = getTransferUpQueue(); + params.transfer = getGraphicsQueue(); params.utilities = m_utils; { m_ui.manager = ext::imgui::UI::create(std::move(params)); @@ -988,77 +976,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, return (dim + size - 1) / size; } - smart_refctd_ptr createBuffer(IGPUBuffer::SCreationParams& 
params) - { - smart_refctd_ptr buffer; - buffer = m_device->createBuffer(std::move(params)); - auto bufReqs = buffer->getMemoryReqs(); - bufReqs.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits(); - m_device->allocate(bufReqs, buffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); - - return buffer; - } - - smart_refctd_ptr getSingleUseCommandBufferAndBegin(smart_refctd_ptr pool) - { - smart_refctd_ptr cmdbuf; - if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdbuf)) - return nullptr; - - cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); - cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - - return cmdbuf; - } - - void cmdbufSubmitAndWait(smart_refctd_ptr cmdbuf, CThreadSafeQueueAdapter* queue, uint64_t startValue) - { - cmdbuf->end(); - - uint64_t finishedValue = startValue + 1; - - // submit builds - { - auto completed = m_device->createSemaphore(startValue); - - std::array signals; - { - auto& signal = signals.front(); - signal.value = finishedValue; - signal.stageMask = bitflag(PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS); - signal.semaphore = completed.get(); - } - - const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[1] = { { - .cmdbuf = cmdbuf.get() - } }; - - const IQueue::SSubmitInfo infos[] = - { - { - .waitSemaphores = {}, - .commandBuffers = commandBuffers, - .signalSemaphores = signals - } - }; - - if (queue->submit(infos) != IQueue::RESULT::SUCCESS) - { - m_logger->log("Failed to submit geometry transfer upload operations!", ILogger::ELL_ERROR); - return; - } - - const ISemaphore::SWaitInfo info[] = - { { - .semaphore = completed.get(), - .value = finishedValue - } }; - - m_device->blockForSemaphores(info); - } - } - - bool createIndirectBuffer(video::CThreadSafeQueueAdapter* queue) + bool createIndirectBuffer() { const auto getBufferRangeAddress = [](const SBufferRange& range) { @@ -1083,7 +1001,7 @@ class RaytracingPipelineApp final : public 
examples::SimpleWindowedApplication, IGPUBuffer::SCreationParams params; params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INDIRECT_BUFFER_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; params.size = sizeof(TraceRaysIndirectCommand_t); - m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), &command).move_into(m_indirectBuffer); + m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = getGraphicsQueue() }, std::move(params), &command).move_into(m_indirectBuffer); return true; } @@ -1110,7 +1028,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, m_rayTracingStackSize = raygenStackSize + std::max(firstDepthStackSizeMax, callableStackMax); } - bool createShaderBindingTable(video::CThreadSafeQueueAdapter* queue, const smart_refctd_ptr& pipeline) + bool createShaderBindingTable(const smart_refctd_ptr& pipeline) { const auto& limits = m_device->getPhysicalDevice()->getLimits(); const auto handleSize = SPhysicalDeviceLimits::ShaderGroupHandleSize; @@ -1188,7 +1106,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, IGPUBuffer::SCreationParams params; params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_SHADER_BINDING_TABLE_BIT; params.size = bufferSize; - m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), pData).move_into(raygenRange.buffer); + m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = getGraphicsQueue() }, std::move(params), pData).move_into(raygenRange.buffer); missRange.buffer = core::smart_refctd_ptr(raygenRange.buffer); hitRange.buffer = core::smart_refctd_ptr(raygenRange.buffer); callableRange.buffer = core::smart_refctd_ptr(raygenRange.buffer); @@ -1197,9 +1115,9 @@ class RaytracingPipelineApp final : public 
examples::SimpleWindowedApplication, return true; } -#ifdef TEST_ASSET_CONV_AS - bool createAccelerationStructuresFromGeometry(video::CThreadSafeQueueAdapter* queue, const IGeometryCreator* gc) + bool createAccelerationStructuresFromGeometry(const IGeometryCreator* gc) { + auto queue = getGraphicsQueue(); // get geometries into ICPUBuffers auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); if (!pool) @@ -1431,23 +1349,23 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, cpuTlas->setInstances(std::move(geomInstances)); cpuTlas->setBuildFlags(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT); -//#define TEST_REBAR_FALLBACK // convert with asset converter smart_refctd_ptr converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} }); struct MyInputs : CAssetConverter::SInputs { -#ifndef TEST_REBAR_FALLBACK + // For the GPU Buffers to be directly writeable and so that we don't need a Transfer Queue submit at all inline uint32_t constrainMemoryTypeBits(const size_t groupCopyID, const IAsset* canonicalAsset, const blake3_hash_t& contentHash, const IDeviceMemoryBacked* memoryBacked) const override { assert(memoryBacked); return memoryBacked->getObjectType() != IDeviceMemoryBacked::EOT_BUFFER ? 
(~0u) : rebarMemoryTypes; } -#endif + uint32_t rebarMemoryTypes; } inputs = {}; inputs.logger = m_logger.get(); inputs.rebarMemoryTypes = m_physicalDevice->getDirectVRAMAccessMemoryTypeBits(); -#ifndef TEST_REBAR_FALLBACK + // the allocator needs to be overriden to hand out memory ranges which have already been mapped so that the ReBAR fast-path can kick in + // (multiple buffers can be bound to same memory, but memory can only be mapped once at one place, so Asset Converter can't do it) struct MyAllocator final : public IDeviceMemoryAllocator { ILogicalDevice* getDeviceForAllocations() const override { return device; } @@ -1465,7 +1383,6 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, } myalloc; myalloc.device = m_device.get(); inputs.allocator = &myalloc; -#endif std::array tmpTlas; std::array tmpBuffers; @@ -1510,31 +1427,11 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, prepass.template operator() < ICPUBuffer > (tmpBuffers); } - constexpr auto XferBufferCount = 2; - std::array, XferBufferCount> xferBufs = {}; - std::array xferBufInfos = {}; - { - auto pool = m_device->createCommandPool(getTransferUpQueue()->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT); - pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, xferBufs); - xferBufs.front()->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - for (auto i = 0; i < XferBufferCount; i++) - xferBufInfos[i].cmdbuf = xferBufs[i].get(); - } - auto xferSema = m_device->createSemaphore(0u); - SIntendedSubmitInfo transfer = {}; - transfer.queue = getTransferUpQueue(); - transfer.scratchCommandBuffers = xferBufInfos; - transfer.scratchSemaphore = { - .semaphore = xferSema.get(), - .value = 0u, - .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS - }; - constexpr auto CompBufferCount = 2; std::array, CompBufferCount> compBufs = {}; std::array compBufInfos = {}; 
{ - auto pool = m_device->createCommandPool(getComputeQueue()->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT); + auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT); pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, compBufs); compBufs.front()->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); for (auto i = 0; i < CompBufferCount; i++) @@ -1542,7 +1439,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, } auto compSema = m_device->createSemaphore(0u); SIntendedSubmitInfo compute = {}; - compute.queue = getComputeQueue(); + compute.queue = queue; compute.scratchCommandBuffers = compBufInfos; compute.scratchSemaphore = { .semaphore = compSema.get(), @@ -1561,24 +1458,13 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, IGPUBuffer::SCreationParams creationParams = {}; creationParams.size = scratchSize; creationParams.usage = IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_STORAGE_BUFFER_BIT; -#ifdef TEST_REBAR_FALLBACK - creationParams.usage |= IGPUBuffer::EUF_TRANSFER_DST_BIT; - core::unordered_set sharingSet = { compute.queue->getFamilyIndex(),transfer.queue->getFamilyIndex() }; - core::vector sharingIndices(sharingSet.begin(), sharingSet.end()); - if (sharingIndices.size() > 1) - creationParams.queueFamilyIndexCount = sharingIndices.size(); - creationParams.queueFamilyIndices = sharingIndices.data(); -#endif auto scratchBuffer = m_device->createBuffer(std::move(creationParams)); auto reqs = scratchBuffer->getMemoryReqs(); -#ifndef TEST_REBAR_FALLBACK reqs.memoryTypeBits &= m_physicalDevice->getDirectVRAMAccessMemoryTypeBits(); -#endif + auto allocation = m_device->allocate(reqs, scratchBuffer.get(), 
IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); -#ifndef TEST_REBAR_FALLBACK allocation.memory->map({ .offset = 0,.length = reqs.size }); -#endif scratchAlloc = make_smart_refctd_ptr( SBufferRange{0ull, scratchSize, std::move(scratchBuffer)}, @@ -1599,9 +1485,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, uint8_t finalUser; } params = {}; -#undef TEST_REBAR_FALLBACK params.utilities = m_utils.get(); - params.transfer = &transfer; params.compute = &compute; params.scratchForDeviceASBuild = scratchAlloc.get(); params.finalUser = queue->getFamilyIndex(); @@ -1661,608 +1545,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, return true; } -#else - bool createGeometries(video::CThreadSafeQueueAdapter* queue, const IGeometryCreator* gc) - { - auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - if (!pool) - return logFail("Couldn't create Command Pool for geometry creation!"); - - const auto defaultMaterial = Material{ - .ambient = {0.2, 0.1, 0.1}, - .diffuse = {0.8, 0.3, 0.3}, - .specular = {0.8, 0.8, 0.8}, - .shininess = 1.0f, - .alpha = 1.0f, - }; - - auto getTranslationMatrix = [](float32_t x, float32_t y, float32_t z) - { - core::matrix3x4SIMD transform; - transform.setTranslation(nbl::core::vectorSIMDf(x, y, z, 0)); - return transform; - }; - - core::matrix3x4SIMD planeTransform; - planeTransform.setRotation(quaternion::fromAngleAxis(core::radians(-90.0f), vector3df_SIMD{ 1, 0, 0 })); - - const auto cpuObjects = std::array{ - ReferenceObjectCpu { - .meta = {.type = OT_RECTANGLE, .name = "Plane Mesh"}, - .data = gc->createRectangleMesh(nbl::core::vector2df_SIMD(10, 10)), - .material = defaultMaterial, - .transform = planeTransform, - }, - ReferenceObjectCpu { - .meta = {.type = OT_CUBE, .name = "Cube Mesh"}, - .data = gc->createCubeMesh(nbl::core::vector3df(1, 1, 1)), - .material = defaultMaterial, - .transform = 
getTranslationMatrix(0, 0.5f, 0), - }, - ReferenceObjectCpu { - .meta = {.type = OT_CUBE, .name = "Cube Mesh 2"}, - .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)), - .material = Material{ - .ambient = {0.1, 0.1, 0.2}, - .diffuse = {0.2, 0.2, 0.8}, - .specular = {0.8, 0.8, 0.8}, - .shininess = 1.0f, - }, - .transform = getTranslationMatrix(-5.0f, 1.0f, 0), - }, - ReferenceObjectCpu { - .meta = {.type = OT_CUBE, .name = "Transparent Cube Mesh"}, - .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)), - .material = Material{ - .ambient = {0.1, 0.2, 0.1}, - .diffuse = {0.2, 0.8, 0.2}, - .specular = {0.8, 0.8, 0.8}, - .shininess = 1.0f, - .alpha = 0.2, - }, - .transform = getTranslationMatrix(5.0f, 1.0f, 0), - }, - }; - - struct ScratchVIBindings - { - nbl::asset::SBufferBinding vertex, index; - }; - std::array scratchBuffers; - - for (uint32_t i = 0; i < cpuObjects.size(); i++) - { - const auto& cpuObject = cpuObjects[i]; - - auto vBuffer = smart_refctd_ptr(cpuObject.data.bindings[0].buffer); // no offset - auto vUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | - IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - vBuffer->addUsageFlags(vUsage); - vBuffer->setContentHash(vBuffer->computeContentHash()); - - auto iBuffer = smart_refctd_ptr(cpuObject.data.indexBuffer.buffer); // no offset - auto iUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | - IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - - if (cpuObject.data.indexType != EIT_UNKNOWN) - if (iBuffer) - { - iBuffer->addUsageFlags(iUsage); - iBuffer->setContentHash(iBuffer->computeContentHash()); - } - - scratchBuffers[i] = { - .vertex = {.offset = 0, .buffer = vBuffer}, - .index = {.offset = 0, .buffer = 
iBuffer}, - }; - - } - - auto cmdbuf = getSingleUseCommandBufferAndBegin(pool); - cmdbuf->beginDebugMarker("Build geometry vertex and index buffers"); - - CAssetConverter::SInputs inputs = {}; - inputs.logger = m_logger.get(); - std::array tmpBuffers; - { - for (uint32_t i = 0; i < cpuObjects.size(); i++) - { - tmpBuffers[2 * i + 0] = scratchBuffers[i].vertex.buffer.get(); - tmpBuffers[2 * i + 1] = scratchBuffers[i].index.buffer.get(); - } - - std::get>(inputs.assets) = tmpBuffers; - } - - auto reservation = m_converter->reserve(inputs); - { - auto prepass = [&](const auto & references) -> bool - { - auto objects = reservation.getGPUObjects(); - uint32_t counter = {}; - for (auto& object : objects) - { - auto gpu = object.value; - auto* reference = references[counter]; - - if (reference) - { - if (!gpu) - { - m_logger->log("Failed to convert a CPU object to GPU!", ILogger::ELL_ERROR); - return false; - } - } - counter++; - } - return true; - }; - - prepass.template operator() < ICPUBuffer > (tmpBuffers); - } - - auto geomInfoBuffer = ICPUBuffer::create({ std::size(cpuObjects) * sizeof(STriangleGeomInfo) }); - STriangleGeomInfo* geomInfos = reinterpret_cast(geomInfoBuffer->getPointer()); - - m_gpuTriangleGeometries.reserve(std::size(cpuObjects)); - // convert - { - // not sure if need this (probably not, originally for transition img view) - auto semaphore = m_device->createSemaphore(0u); - - std::array cmdbufs = {}; - cmdbufs.front().cmdbuf = cmdbuf.get(); - - SIntendedSubmitInfo transfer = {}; - transfer.queue = queue; - transfer.scratchCommandBuffers = cmdbufs; - transfer.scratchSemaphore = { - .semaphore = semaphore.get(), - .value = 0u, - .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS - }; - - CAssetConverter::SConvertParams params = {}; - params.utilities = m_utils.get(); - params.transfer = &transfer; - - auto future = reservation.convert(params); - if (future.copy() != IQueue::RESULT::SUCCESS) - { - m_logger->log("Failed to await submission feature!", 
ILogger::ELL_ERROR); - return false; - } - - auto&& buffers = reservation.getGPUObjects(); - for (uint32_t i = 0; i < cpuObjects.size(); i++) - { - auto& cpuObject = cpuObjects[i]; - - m_gpuTriangleGeometries.push_back(ReferenceObjectGpu{ - .meta = cpuObject.meta, - .bindings = { - .vertex = {.offset = 0, .buffer = buffers[2 * i + 0].value }, - .index = {.offset = 0, .buffer = buffers[2 * i + 1].value }, - }, - .vertexStride = cpuObject.data.inputParams.bindings[0].stride, - .indexType = cpuObject.data.indexType, - .indexCount = cpuObject.data.indexCount, - .material = hlsl::_static_cast(cpuObject.material), - .transform = cpuObject.transform, - }); - } - - for (uint32_t i = 0; i < m_gpuTriangleGeometries.size(); i++) - { - const auto& gpuObject = m_gpuTriangleGeometries[i]; - const uint64_t vertexBufferAddress = gpuObject.bindings.vertex.buffer->getDeviceAddress(); - geomInfos[i] = { - .material = gpuObject.material, - .vertexBufferAddress = vertexBufferAddress, - .indexBufferAddress = gpuObject.useIndex() ? 
gpuObject.bindings.index.buffer->getDeviceAddress() : vertexBufferAddress, - .vertexStride = gpuObject.vertexStride, - .objType = gpuObject.meta.type, - .indexType = gpuObject.indexType, - .smoothNormals = s_smoothNormals[gpuObject.meta.type], - }; - } - } - - { - IGPUBuffer::SCreationParams params; - params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - params.size = geomInfoBuffer->getSize(); - m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), geomInfos).move_into(m_triangleGeomInfoBuffer); - } - - // intersection geometries setup - { - core::vector proceduralGeoms; - proceduralGeoms.reserve(NumberOfProceduralGeometries); - using Aabb = IGPUBottomLevelAccelerationStructure::AABB_t; - core::vector aabbs; - aabbs.reserve(NumberOfProceduralGeometries); - for (int32_t i = 0; i < NumberOfProceduralGeometries; i++) - { - const auto middle_i = NumberOfProceduralGeometries / 2.0; - SProceduralGeomInfo sphere = { - .material = hlsl::_static_cast(Material{ - .ambient = {0.1, 0.05 * i, 0.1}, - .diffuse = {0.3, 0.2 * i, 0.3}, - .specular = {0.8, 0.8, 0.8}, - .shininess = 1.0f, - }), - .center = float32_t3((i - middle_i) * 4.0, 2, 5.0), - .radius = 1, - }; - - proceduralGeoms.push_back(sphere); - const auto sphereMin = sphere.center - sphere.radius; - const auto sphereMax = sphere.center + sphere.radius; - aabbs.emplace_back( - vector3d(sphereMin.x, sphereMin.y, sphereMin.z), - vector3d(sphereMax.x, sphereMax.y, sphereMax.z)); - } - - { - IGPUBuffer::SCreationParams params; - params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - params.size = proceduralGeoms.size() * sizeof(SProceduralGeomInfo); - m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, 
std::move(params), proceduralGeoms.data()).move_into(m_proceduralGeomInfoBuffer); - } - - { - IGPUBuffer::SCreationParams params; - params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT; - params.size = aabbs.size() * sizeof(Aabb); - m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), aabbs.data()).move_into(m_proceduralAabbBuffer); - } - } - - return true; - } - - bool createAccelerationStructures(video::CThreadSafeQueueAdapter* queue) - { - // plus 1 blas for procedural geometry contains {{var::NumberOfProcedural}} - // spheres. Each sphere is a primitive instead one instance or geometry - const auto blasCount = m_gpuTriangleGeometries.size() + 1; - const auto proceduralBlasIdx = m_gpuTriangleGeometries.size(); - - IQueryPool::SCreationParams qParams{ .queryCount = static_cast(blasCount), .queryType = IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE }; - smart_refctd_ptr queryPool = m_device->createQueryPool(std::move(qParams)); - - auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT); - if (!pool) - return logFail("Couldn't create Command Pool for blas/tlas creation!"); - - m_api->startCapture(); -#ifdef TRY_BUILD_FOR_NGFX // NSight is "debugger-challenged" it can't capture anything not happenning "during a frame", so we need to trick it - m_currentImageAcquire = m_surface->acquireNextImage(); - { - const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { { - .semaphore = m_currentImageAcquire.semaphore, - .value = m_currentImageAcquire.acquireCount, - .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS - } }; - m_surface->present(m_currentImageAcquire.imageIndex, acquired); - } - m_currentImageAcquire = m_surface->acquireNextImage(); -#endif - size_t totalScratchSize = 0; - const auto 
scratchOffsetAlignment = m_device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment; - - // build bottom level ASes - { - core::vector primitiveCounts(blasCount); - core::vector> triangles(m_gpuTriangleGeometries.size()); - core::vector scratchSizes(blasCount); - IGPUBottomLevelAccelerationStructure::AABBs aabbs; - - auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT; - if (m_physicalDevice->getProperties().limits.rayTracingPositionFetch) - blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_DATA_ACCESS; - - IGPUBottomLevelAccelerationStructure::DeviceBuildInfo initBuildInfo; - initBuildInfo.buildFlags = blasFlags; - initBuildInfo.geometryCount = 1; // only 1 geometry object per blas - initBuildInfo.srcAS = nullptr; - initBuildInfo.dstAS = nullptr; - initBuildInfo.scratch = {}; - - auto blasBuildInfos = core::vector(blasCount, initBuildInfo); - - m_gpuBlasList.resize(blasCount); - // setup blas info for triangle geometries - for (uint32_t i = 0; i < blasCount; i++) - { - const auto isProcedural = i == proceduralBlasIdx; - if (isProcedural) - { - aabbs.data.buffer = smart_refctd_ptr(m_proceduralAabbBuffer); - aabbs.data.offset = 0; - aabbs.stride = sizeof(IGPUBottomLevelAccelerationStructure::AABB_t); - aabbs.geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; // only allow opaque for now - - primitiveCounts[proceduralBlasIdx] = NumberOfProceduralGeometries; - blasBuildInfos[proceduralBlasIdx].aabbs = &aabbs; - blasBuildInfos[proceduralBlasIdx].buildFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT; - } - else - { - const auto& gpuObject = m_gpuTriangleGeometries[i]; - - const uint32_t vertexStride = gpuObject.vertexStride; - const uint32_t numVertices = gpuObject.bindings.vertex.buffer->getSize() / vertexStride; - if 
(gpuObject.useIndex()) - primitiveCounts[i] = gpuObject.indexCount / 3; - else - primitiveCounts[i] = numVertices / 3; - - triangles[i].vertexData[0] = gpuObject.bindings.vertex; - triangles[i].indexData = gpuObject.useIndex() ? gpuObject.bindings.index : gpuObject.bindings.vertex; - triangles[i].maxVertex = numVertices - 1; - triangles[i].vertexStride = vertexStride; - triangles[i].vertexFormat = EF_R32G32B32_SFLOAT; - triangles[i].indexType = gpuObject.indexType; - triangles[i].geometryFlags = gpuObject.material.isTransparent() ? - IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::NO_DUPLICATE_ANY_HIT_INVOCATION_BIT : - IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; - - blasBuildInfos[i].triangles = &triangles[i]; - } - ILogicalDevice::AccelerationStructureBuildSizes buildSizes; - { - const uint32_t maxPrimCount[1] = { primitiveCounts[i] }; - if (isProcedural) - { - const auto* aabbData = &aabbs; - buildSizes = m_device->getAccelerationStructureBuildSizes(false, blasBuildInfos[i].buildFlags, false, std::span{ aabbData, 1 }, maxPrimCount); - } - else - { - const auto* trianglesData = triangles.data(); - buildSizes = m_device->getAccelerationStructureBuildSizes(false, blasBuildInfos[i].buildFlags, false, std::span{ trianglesData,1 }, maxPrimCount); - } - if (!buildSizes) - return logFail("Failed to get BLAS build sizes"); - } - - scratchSizes[i] = buildSizes.buildScratchSize; - totalScratchSize = core::alignUp(totalScratchSize, scratchOffsetAlignment); - totalScratchSize += buildSizes.buildScratchSize; - - { - IGPUBuffer::SCreationParams params; - params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT; - params.size = buildSizes.accelerationStructureSize; - smart_refctd_ptr asBuffer = createBuffer(params); - - IGPUBottomLevelAccelerationStructure::SCreationParams blasParams; - blasParams.bufferRange.buffer = asBuffer; - blasParams.bufferRange.offset = 0u; - 
blasParams.bufferRange.size = buildSizes.accelerationStructureSize; - blasParams.flags = IGPUBottomLevelAccelerationStructure::SCreationParams::FLAGS::NONE; - m_gpuBlasList[i] = m_device->createBottomLevelAccelerationStructure(std::move(blasParams)); - if (!m_gpuBlasList[i]) - return logFail("Could not create BLAS"); - } - } - - - auto cmdbufBlas = getSingleUseCommandBufferAndBegin(pool); - cmdbufBlas->beginDebugMarker("Build BLAS"); - - cmdbufBlas->resetQueryPool(queryPool.get(), 0, blasCount); - - smart_refctd_ptr scratchBuffer; - { - IGPUBuffer::SCreationParams params; - params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT; - params.size = totalScratchSize; - scratchBuffer = createBuffer(params); - } - - core::vector buildRangeInfos(blasCount); - core::vector pRangeInfos(blasCount); - for (uint32_t i = 0; i < blasCount; i++) - { - blasBuildInfos[i].dstAS = m_gpuBlasList[i].get(); - blasBuildInfos[i].scratch.buffer = scratchBuffer; - if (i == 0) - { - blasBuildInfos[i].scratch.offset = 0u; - } - else - { - const auto unalignedOffset = blasBuildInfos[i - 1].scratch.offset + scratchSizes[i - 1]; - blasBuildInfos[i].scratch.offset = core::alignUp(unalignedOffset, scratchOffsetAlignment); - } - - buildRangeInfos[i].primitiveCount = primitiveCounts[i]; - buildRangeInfos[i].primitiveByteOffset = 0u; - buildRangeInfos[i].firstVertex = 0u; - buildRangeInfos[i].transformByteOffset = 0u; - - pRangeInfos[i] = &buildRangeInfos[i]; - } - - if (!cmdbufBlas->buildAccelerationStructures(std::span(blasBuildInfos), pRangeInfos.data())) - return logFail("Failed to build BLAS"); - - { - SMemoryBarrier memBarrier; - memBarrier.srcStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT; - memBarrier.srcAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT; - memBarrier.dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT; - memBarrier.dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_READ_BIT; - 
cmdbufBlas->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} }); - } - - - core::vector ases(blasCount); - for (uint32_t i = 0; i < blasCount; i++) - ases[i] = m_gpuBlasList[i].get(); - if (!cmdbufBlas->writeAccelerationStructureProperties(std::span(ases), IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE, - queryPool.get(), 0)) - return logFail("Failed to write acceleration structure properties!"); - - cmdbufBlas->endDebugMarker(); - cmdbufSubmitAndWait(cmdbufBlas, queue, 39); - } - - auto cmdbufCompact = getSingleUseCommandBufferAndBegin(pool); - cmdbufCompact->beginDebugMarker("Compact BLAS"); - // compact blas - { - core::vector asSizes(blasCount); - if (!m_device->getQueryPoolResults(queryPool.get(), 0, blasCount, asSizes.data(), sizeof(size_t), bitflag(IQueryPool::WAIT_BIT) | IQueryPool::_64_BIT)) - return logFail("Could not get query pool results for AS sizes"); - - core::vector> cleanupBlas(blasCount); - for (uint32_t i = 0; i < blasCount; i++) - { - if (asSizes[i] == 0) continue; - cleanupBlas[i] = m_gpuBlasList[i]; - { - IGPUBuffer::SCreationParams params; - params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT; - params.size = asSizes[i]; - smart_refctd_ptr asBuffer = createBuffer(params); - - IGPUBottomLevelAccelerationStructure::SCreationParams blasParams; - blasParams.bufferRange.buffer = asBuffer; - blasParams.bufferRange.offset = 0u; - blasParams.bufferRange.size = asSizes[i]; - blasParams.flags = IGPUBottomLevelAccelerationStructure::SCreationParams::FLAGS::NONE; - m_gpuBlasList[i] = m_device->createBottomLevelAccelerationStructure(std::move(blasParams)); - if (!m_gpuBlasList[i]) - return logFail("Could not create compacted BLAS"); - } - - IGPUBottomLevelAccelerationStructure::CopyInfo copyInfo; - copyInfo.src = cleanupBlas[i].get(); - copyInfo.dst = m_gpuBlasList[i].get(); - copyInfo.compact = true; - if 
(!cmdbufCompact->copyAccelerationStructure(copyInfo)) - return logFail("Failed to copy AS to compact"); - } - } - - cmdbufCompact->endDebugMarker(); - cmdbufSubmitAndWait(cmdbufCompact, queue, 40); - - auto cmdbufTlas = getSingleUseCommandBufferAndBegin(pool); - cmdbufTlas->beginDebugMarker("Build TLAS"); - - // build top level AS - { - const uint32_t instancesCount = blasCount; - core::vector instances(instancesCount); - for (uint32_t i = 0; i < instancesCount; i++) - { - const auto isProceduralInstance = i == proceduralBlasIdx; - instances[i].base.blas.deviceAddress = m_gpuBlasList[i]->getReferenceForDeviceOperations().deviceAddress; - instances[i].base.mask = 0xFF; - instances[i].base.instanceCustomIndex = i; - instances[i].base.instanceShaderBindingTableRecordOffset = isProceduralInstance ? 2 : 0; - instances[i].base.flags = static_cast(IGPUTopLevelAccelerationStructure::INSTANCE_FLAGS::TRIANGLE_FACING_CULL_DISABLE_BIT); - instances[i].transform = isProceduralInstance ? matrix3x4SIMD() : m_gpuTriangleGeometries[i].transform; - } - - { - size_t bufSize = instancesCount * sizeof(IGPUTopLevelAccelerationStructure::DeviceStaticInstance); - IGPUBuffer::SCreationParams params; - params.usage = bitflag(IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT | - IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - params.size = bufSize; - m_instanceBuffer = createBuffer(params); - - SBufferRange range = { .offset = 0u, .size = bufSize, .buffer = m_instanceBuffer }; - cmdbufTlas->updateBuffer(range, instances.data()); - } - - // make sure instances upload complete first - { - SMemoryBarrier memBarrier; - memBarrier.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS; - memBarrier.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT; - memBarrier.dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT; - memBarrier.dstAccessMask = 
ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT; - cmdbufTlas->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} }); - } - - auto tlasFlags = bitflag(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT); - - IGPUTopLevelAccelerationStructure::DeviceBuildInfo tlasBuildInfo; - tlasBuildInfo.buildFlags = tlasFlags; - tlasBuildInfo.srcAS = nullptr; - tlasBuildInfo.dstAS = nullptr; - tlasBuildInfo.instanceData.buffer = m_instanceBuffer; - tlasBuildInfo.instanceData.offset = 0u; - tlasBuildInfo.scratch = {}; - - auto buildSizes = m_device->getAccelerationStructureBuildSizes(tlasFlags, false, instancesCount); - if (!buildSizes) - return logFail("Failed to get TLAS build sizes"); - - { - IGPUBuffer::SCreationParams params; - params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT; - params.size = buildSizes.accelerationStructureSize; - smart_refctd_ptr asBuffer = createBuffer(params); - - IGPUTopLevelAccelerationStructure::SCreationParams tlasParams; - tlasParams.bufferRange.buffer = asBuffer; - tlasParams.bufferRange.offset = 0u; - tlasParams.bufferRange.size = buildSizes.accelerationStructureSize; - tlasParams.flags = IGPUTopLevelAccelerationStructure::SCreationParams::FLAGS::NONE; - m_gpuTlas = m_device->createTopLevelAccelerationStructure(std::move(tlasParams)); - if (!m_gpuTlas) - return logFail("Could not create TLAS"); - } - - smart_refctd_ptr scratchBuffer; - { - IGPUBuffer::SCreationParams params; - params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT; - params.size = buildSizes.buildScratchSize; - scratchBuffer = createBuffer(params); - } - - tlasBuildInfo.dstAS = m_gpuTlas.get(); - tlasBuildInfo.scratch.buffer = scratchBuffer; - tlasBuildInfo.scratch.offset = 0u; - - IGPUTopLevelAccelerationStructure::BuildRangeInfo buildRangeInfo[1u]; - buildRangeInfo[0].instanceCount = instancesCount; - 
buildRangeInfo[0].instanceByteOffset = 0u; - IGPUTopLevelAccelerationStructure::BuildRangeInfo* pRangeInfos; - pRangeInfos = &buildRangeInfo[0]; - - if (!cmdbufTlas->buildAccelerationStructures({ &tlasBuildInfo, 1 }, pRangeInfos)) - return logFail("Failed to build TLAS"); - } - - cmdbufTlas->endDebugMarker(); - cmdbufSubmitAndWait(cmdbufTlas, queue, 45); - -#ifdef TRY_BUILD_FOR_NGFX - { - const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { { - .semaphore = m_currentImageAcquire.semaphore, - .value = m_currentImageAcquire.acquireCount, - .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS - } }; - m_surface->present(m_currentImageAcquire.imageIndex, acquired); - } -#endif - m_api->endCapture(); - - return true; - } -#endif // TEST_ASSET_CONV_AS smart_refctd_ptr m_window; @@ -2317,7 +1600,6 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, core::vector m_gpuIntersectionSpheres; uint32_t m_intersectionHitGroupIdx; - std::vector> m_gpuBlasList; smart_refctd_ptr m_gpuTlas; smart_refctd_ptr m_instanceBuffer; From e30938c2615dd5d3ab69cadca3ba11d1e03f8233 Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 26 May 2025 10:05:49 +0200 Subject: [PATCH 204/296] test that we're not overflown submitted when providing correct max size scratch buffer --- 71_RayTracingPipeline/main.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index 0b6b4d724..968f7c42e 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -1452,7 +1452,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, { constexpr auto MaxAlignment = 256; constexpr auto MinAllocationSize = 1024; - const auto scratchSize = core::alignUp(reservation.getMinASBuildScratchSize(false), MaxAlignment); + const auto scratchSize = core::alignUp(reservation.getMaxASBuildScratchSize(false), MaxAlignment); IGPUBuffer::SCreationParams creationParams = {}; 
@@ -1496,6 +1496,9 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, m_logger->log("Failed to await submission feature!", ILogger::ELL_ERROR); return false; } + // 2 submits, BLAS build, TLAS build, DO NOT ADD COMPACTIONS IN THIS EXAMPLE! + if (compute.getFutureScratchSemaphore().value>3) + m_logger->log("Overflow submitted on Compute Queue despite using ReBAR (no transfer submits or usage of staging buffer) and providing a AS Build Scratch Buffer of correctly queried max size!",system::ILogger::ELL_ERROR); // assign gpu objects to output auto&& tlases = reservation.getGPUObjects(); From 2a85f4e0911185a85df31f798b92e6902db3383e Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 27 May 2025 11:24:43 +0700 Subject: [PATCH 205/296] refactor config member name --- 23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl | 2 +- 29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl index e2256d2f1..048ccf316 100644 --- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl @@ -14,7 +14,7 @@ using config_t = nbl::hlsl::workgroup2::ArithmeticConfiguration type_t; // final (level 1/2) scan needs to fit in one subgroup exactly -groupshared uint32_t scratch[config_t::ElementCount]; +groupshared uint32_t scratch[config_t::SharedScratchElementCount]; struct ScratchProxy { diff --git a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl index 31284c520..fe340cf0c 100644 --- a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl +++ b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl @@ -14,7 +14,7 @@ using config_t = nbl::hlsl::workgroup2::ArithmeticConfiguration type_t; 
// final (level 1/2) scan needs to fit in one subgroup exactly -groupshared uint32_t scratch[config_t::ElementCount]; +groupshared uint32_t scratch[config_t::SharedScratchElementCount]; struct ScratchProxy { From 99f6dfe5b4345cc8bbe7ff2ab2353993e395d3bd Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 27 May 2025 13:49:02 +0700 Subject: [PATCH 206/296] fit new accessor concepts --- .../app_resources/testWorkgroup.comp.hlsl | 4 ++-- .../app_resources/benchmarkWorkgroup.comp.hlsl | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl index 048ccf316..bda735b44 100644 --- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl @@ -18,12 +18,12 @@ groupshared uint32_t scratch[config_t::SharedScratchElementCount]; struct ScratchProxy { - template + template void get(const uint32_t ix, NBL_REF_ARG(AccessType) value) { value = scratch[ix]; } - template + template void set(const uint32_t ix, const AccessType value) { scratch[ix] = value; diff --git a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl index fe340cf0c..bfbe30ac9 100644 --- a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl +++ b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl @@ -18,13 +18,13 @@ groupshared uint32_t scratch[config_t::SharedScratchElementCount]; struct ScratchProxy { - template - void get(const uint32_t ix, NBL_REF_ARG(AccessType) value) + template + void get(const IndexType ix, NBL_REF_ARG(AccessType) value) { value = scratch[ix]; } - template - void set(const uint32_t ix, const AccessType value) + template + void set(const IndexType ix, const AccessType value) { scratch[ix] = value; } From 0ed8dc4d42b8e11820f813b1c8281701ef1eebf0 Mon Sep 17 00:00:00 2001 From: devsh 
Date: Tue, 27 May 2025 10:51:09 +0200 Subject: [PATCH 207/296] Lets remake the STL/PLY loaders --- 27_PLYSTLDemo/config.json.template | 28 - 27_PLYSTLDemo/main.cpp | 579 ------ .../CMakeLists.txt | 0 .../config.json.template | 2 +- 29_MeshLoaders/main.cpp | 1634 +++++++++++++++++ .../pipeline.groovy | 0 29_SpecializationConstants/CMakeLists.txt | 7 - 29_SpecializationConstants/main.cpp | 566 ------ 29_SpecializationConstants/particles.comp | 39 - 29_SpecializationConstants/particles.frag | 12 - 29_SpecializationConstants/particles.vert | 21 - 29_SpecializationConstants/pipeline.groovy | 50 - CMakeLists.txt | 1 + 13 files changed, 1636 insertions(+), 1303 deletions(-) delete mode 100644 27_PLYSTLDemo/config.json.template delete mode 100644 27_PLYSTLDemo/main.cpp rename {27_PLYSTLDemo => 29_MeshLoaders}/CMakeLists.txt (100%) rename {29_SpecializationConstants => 29_MeshLoaders}/config.json.template (90%) create mode 100644 29_MeshLoaders/main.cpp rename {27_PLYSTLDemo => 29_MeshLoaders}/pipeline.groovy (100%) delete mode 100644 29_SpecializationConstants/CMakeLists.txt delete mode 100644 29_SpecializationConstants/main.cpp delete mode 100644 29_SpecializationConstants/particles.comp delete mode 100644 29_SpecializationConstants/particles.frag delete mode 100644 29_SpecializationConstants/particles.vert delete mode 100644 29_SpecializationConstants/pipeline.groovy diff --git a/27_PLYSTLDemo/config.json.template b/27_PLYSTLDemo/config.json.template deleted file mode 100644 index cb1b3b7a7..000000000 --- a/27_PLYSTLDemo/config.json.template +++ /dev/null @@ -1,28 +0,0 @@ -{ - "enableParallelBuild": true, - "threadsPerBuildProcess" : 2, - "isExecuted": false, - "scriptPath": "", - "cmake": { - "configurations": [ "Release", "Debug", "RelWithDebInfo" ], - "buildModes": [], - "requiredOptions": [ "NBL_BUILD_MITSUBA_LOADER", "NBL_BUILD_OPTIX" ] - }, - "profiles": [ - { - "backend": "vulkan", - "platform": "windows", - "buildModes": [], - "runConfiguration": "Release", - 
"gpuArchitectures": [] - } - ], - "dependencies": [], - "data": [ - { - "dependencies": [], - "command": [""], - "outputs": [] - } - ] -} \ No newline at end of file diff --git a/27_PLYSTLDemo/main.cpp b/27_PLYSTLDemo/main.cpp deleted file mode 100644 index 1e6d470e2..000000000 --- a/27_PLYSTLDemo/main.cpp +++ /dev/null @@ -1,579 +0,0 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". -// For conditions of distribution and use, see copyright notice in nabla.h - -#define _NBL_STATIC_LIB_ -#include -#include -#include - -#include "CCamera.hpp" -#include "../common/CommonAPI.h" -#include "nbl/ext/ScreenShot/ScreenShot.h" - -using namespace nbl; -using namespace core; - -/* - Uncomment for more detailed logging -*/ - -// #define NBL_MORE_LOGS - -/* - Uncomment for writing assets -*/ - -#define WRITE_ASSETS - -class PLYSTLDemo : public ApplicationBase -{ - static constexpr uint32_t WIN_W = 1280; - static constexpr uint32_t WIN_H = 720; - static constexpr uint32_t SC_IMG_COUNT = 3u; - static constexpr uint32_t FRAMES_IN_FLIGHT = 5u; - static constexpr uint64_t MAX_TIMEOUT = 99999999999999ull; - static constexpr size_t NBL_FRAMES_TO_AVERAGE = 100ull; - static_assert(FRAMES_IN_FLIGHT > SC_IMG_COUNT); - - using RENDERPASS_INDEPENDENT_PIPELINE_ADRESS = size_t; - using GPU_PIPELINE_HASH_CONTAINER = std::map>; - using DependentDrawData = std::tuple, core::smart_refctd_ptr, core::smart_refctd_ptr, uint32_t, const asset::IRenderpassIndependentPipelineMetadata*>; - -public: - nbl::core::smart_refctd_ptr windowManager; - nbl::core::smart_refctd_ptr window; - nbl::core::smart_refctd_ptr windowCallback; - nbl::core::smart_refctd_ptr gl; - nbl::core::smart_refctd_ptr surface; - nbl::core::smart_refctd_ptr utilities; - nbl::core::smart_refctd_ptr logicalDevice; - nbl::video::IPhysicalDevice* gpuPhysicalDevice; - std::array queues = { nullptr, nullptr, nullptr, nullptr }; - nbl::core::smart_refctd_ptr swapchain; - 
nbl::core::smart_refctd_ptr renderpass; - nbl::core::smart_refctd_dynamic_array> fbos; - std::array, CommonAPI::InitOutput::MaxFramesInFlight>, CommonAPI::InitOutput::MaxQueuesCount> commandPools; - nbl::core::smart_refctd_ptr system; - nbl::core::smart_refctd_ptr assetManager; - nbl::video::IGPUObjectFromAssetConverter::SParams cpu2gpuParams; - nbl::core::smart_refctd_ptr logger; - nbl::core::smart_refctd_ptr inputSystem; - - nbl::core::smart_refctd_ptr gpuTransferFence; - nbl::core::smart_refctd_ptr gpuComputeFence; - nbl::video::IGPUObjectFromAssetConverter cpu2gpu; - - uint32_t acquiredNextFBO = {}; - int resourceIx = -1; - - core::smart_refctd_ptr commandBuffers[FRAMES_IN_FLIGHT]; - - core::smart_refctd_ptr frameComplete[FRAMES_IN_FLIGHT] = { nullptr }; - core::smart_refctd_ptr imageAcquire[FRAMES_IN_FLIGHT] = { nullptr }; - core::smart_refctd_ptr renderFinished[FRAMES_IN_FLIGHT] = { nullptr }; - - nbl::video::ISwapchain::SCreationParams m_swapchainCreationParams; - - std::chrono::system_clock::time_point lastTime; - bool frameDataFilled = false; - size_t frame_count = 0ull; - double time_sum = 0; - double dtList[NBL_FRAMES_TO_AVERAGE] = {}; - - CommonAPI::InputSystem::ChannelReader mouse; - CommonAPI::InputSystem::ChannelReader keyboard; - - Camera camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD()); - - GPU_PIPELINE_HASH_CONTAINER gpuPipelinesPly; - GPU_PIPELINE_HASH_CONTAINER gpuPipelinesStl; - - DependentDrawData plyDrawData; - DependentDrawData stlDrawData; - - void setWindow(core::smart_refctd_ptr&& wnd) override - { - window = std::move(wnd); - } - nbl::ui::IWindow* getWindow() override - { - return window.get(); - } - void setSystem(core::smart_refctd_ptr&& s) override - { - system = std::move(s); - } - video::IAPIConnection* getAPIConnection() override - { - return gl.get(); - } - video::ILogicalDevice* getLogicalDevice() override - { - return logicalDevice.get(); - } - video::IGPURenderpass* getRenderpass() 
override - { - return renderpass.get(); - } - void setSurface(core::smart_refctd_ptr&& s) override - { - surface = std::move(s); - } - void setFBOs(std::vector>& f) override - { - for (int i = 0; i < f.size(); i++) - { - fbos->begin()[i] = core::smart_refctd_ptr(f[i]); - } - } - void setSwapchain(core::smart_refctd_ptr&& s) override - { - swapchain = std::move(s); - } - uint32_t getSwapchainImageCount() override - { - return swapchain->getImageCount(); - } - virtual nbl::asset::E_FORMAT getDepthFormat() override - { - return nbl::asset::EF_D32_SFLOAT; - } - -APP_CONSTRUCTOR(PLYSTLDemo) - - void onAppInitialized_impl() override - { - const auto swapchainImageUsage = static_cast(asset::IImage::EUF_COLOR_ATTACHMENT_BIT); - CommonAPI::InitParams initParams; - initParams.window = core::smart_refctd_ptr(window); - initParams.apiType = video::EAT_VULKAN; - initParams.appName = { _NBL_APP_NAME_ }; - initParams.framesInFlight = FRAMES_IN_FLIGHT; - initParams.windowWidth = WIN_W; - initParams.windowHeight = WIN_H; - initParams.swapchainImageCount = SC_IMG_COUNT; - initParams.swapchainImageUsage = swapchainImageUsage; - initParams.depthFormat = nbl::asset::EF_D32_SFLOAT; - auto initOutput = CommonAPI::InitWithDefaultExt(std::move(initParams)); - - window = std::move(initParams.window); - gl = std::move(initOutput.apiConnection); - surface = std::move(initOutput.surface); - gpuPhysicalDevice = std::move(initOutput.physicalDevice); - logicalDevice = std::move(initOutput.logicalDevice); - queues = std::move(initOutput.queues); - renderpass = std::move(initOutput.renderToSwapchainRenderpass); - commandPools = std::move(initOutput.commandPools); - assetManager = std::move(initOutput.assetManager); - logger = std::move(initOutput.logger); - inputSystem = std::move(initOutput.inputSystem); - system = std::move(initOutput.system); - windowCallback = std::move(initParams.windowCb); - utilities = std::move(initOutput.utilities); - m_swapchainCreationParams = 
std::move(initOutput.swapchainCreationParams); - - CommonAPI::createSwapchain(std::move(logicalDevice), m_swapchainCreationParams, WIN_W, WIN_H, swapchain); - assert(swapchain); - fbos = CommonAPI::createFBOWithSwapchainImages( - swapchain->getImageCount(), WIN_W, WIN_H, - logicalDevice, swapchain, renderpass, - nbl::asset::EF_D32_SFLOAT - ); - - auto defaultComputeCommandPool = commandPools[CommonAPI::InitOutput::EQT_COMPUTE][0]; - auto defaultTransferUpCommandPool = commandPools[CommonAPI::InitOutput::EQT_TRANSFER_UP][0]; - - nbl::video::IGPUObjectFromAssetConverter cpu2gpu; - nbl::video::IGPUObjectFromAssetConverter::SParams cpu2gpuParams; - - nbl::core::smart_refctd_ptr gpuTransferFence; - nbl::core::smart_refctd_ptr gpuTransferSemaphore; - - nbl::core::smart_refctd_ptr gpuComputeFence; - nbl::core::smart_refctd_ptr gpuComputeSemaphore; - - { - gpuTransferFence = logicalDevice->createFence(static_cast(0)); - gpuTransferSemaphore = logicalDevice->createSemaphore(); - - gpuComputeFence = logicalDevice->createFence(static_cast(0)); - gpuComputeSemaphore = logicalDevice->createSemaphore(); - - cpu2gpuParams.utilities = utilities.get(); - cpu2gpuParams.device = logicalDevice.get(); - cpu2gpuParams.assetManager = assetManager.get(); - cpu2gpuParams.pipelineCache = nullptr; - cpu2gpuParams.limits = gpuPhysicalDevice->getLimits(); - cpu2gpuParams.finalQueueFamIx = queues[decltype(initOutput)::EQT_GRAPHICS]->getFamilyIndex(); - - logicalDevice->createCommandBuffers(defaultTransferUpCommandPool.get(),video::IGPUCommandBuffer::EL_PRIMARY,1u,&cpu2gpuParams.perQueue[nbl::video::IGPUObjectFromAssetConverter::EQU_TRANSFER].cmdbuf); - cpu2gpuParams.perQueue[nbl::video::IGPUObjectFromAssetConverter::EQU_TRANSFER].queue = queues[decltype(initOutput)::EQT_TRANSFER_UP]; - cpu2gpuParams.perQueue[nbl::video::IGPUObjectFromAssetConverter::EQU_TRANSFER].semaphore = &gpuTransferSemaphore; - - 
logicalDevice->createCommandBuffers(defaultComputeCommandPool.get(),video::IGPUCommandBuffer::EL_PRIMARY,1u,&cpu2gpuParams.perQueue[nbl::video::IGPUObjectFromAssetConverter::EQU_COMPUTE].cmdbuf); - cpu2gpuParams.perQueue[nbl::video::IGPUObjectFromAssetConverter::EQU_COMPUTE].queue = queues[decltype(initOutput)::EQT_COMPUTE]; - cpu2gpuParams.perQueue[nbl::video::IGPUObjectFromAssetConverter::EQU_COMPUTE].semaphore = &gpuComputeSemaphore; - - cpu2gpuParams.beginCommandBuffers(); - } - - auto loadAndGetCpuMesh = [&](system::path path) -> std::pair, const asset::IAssetMetadata*> - { - auto meshes_bundle = assetManager->getAsset(path.string(), {}); - { - bool status = !meshes_bundle.getContents().empty(); - assert(status); - } - - auto mesh = core::smart_refctd_ptr_static_cast(meshes_bundle.getContents().begin()[0]); - auto metadata = meshes_bundle.getMetadata(); - return std::make_pair(mesh, metadata); - //return std::make_pair(core::smart_refctd_ptr_static_cast(meshes_bundle.getContents().begin()[0]), meshes_bundle.getMetadata()); - }; - - auto cpuBundlePLYData = loadAndGetCpuMesh(sharedInputCWD / "ply/Spanner-ply.ply"); - auto cpuBundleSTLData = loadAndGetCpuMesh(sharedInputCWD / "extrusionLogo_TEST_fixed.stl"); - - core::smart_refctd_ptr cpuMeshPly = cpuBundlePLYData.first; - auto metadataPly = cpuBundlePLYData.second->selfCast(); - - core::smart_refctd_ptr cpuMeshStl = cpuBundleSTLData.first; - auto metadataStl = cpuBundleSTLData.second->selfCast(); - -#ifdef WRITE_ASSETS - { - asset::IAssetWriter::SAssetWriteParams wp(cpuMeshPly.get()); - bool status = assetManager->writeAsset("Spanner_ply.ply", wp); - assert(status); - } - - { - asset::IAssetWriter::SAssetWriteParams wp(cpuMeshStl.get()); - bool status = assetManager->writeAsset("extrusionLogo_TEST_fixedTest.stl", wp); - assert(status); - } -#endif // WRITE_ASSETS - - /* - For the testing puposes we can safely assume all meshbuffers within mesh loaded from PLY & STL has same DS1 layout (used for camera-specific 
data) - */ - - auto getMeshDependentDrawData = [&](core::smart_refctd_ptr cpuMesh, bool isPLY) -> DependentDrawData - { - const asset::ICPUMeshBuffer* const firstMeshBuffer = cpuMesh->getMeshBuffers().begin()[0]; - const asset::ICPUDescriptorSetLayout* ds1layout = firstMeshBuffer->getPipeline()->getLayout()->getDescriptorSetLayout(1u); //! DS1 - const asset::IRenderpassIndependentPipelineMetadata* pipelineMetadata; - { - if (isPLY) - pipelineMetadata = metadataPly->getAssetSpecificMetadata(firstMeshBuffer->getPipeline()); - else - pipelineMetadata = metadataStl->getAssetSpecificMetadata(firstMeshBuffer->getPipeline()); - } - - /* - So we can create just one DescriptorSet - */ - - const uint32_t ds1UboBinding = ds1layout->getDescriptorRedirect(asset::IDescriptor::E_TYPE::ET_UNIFORM_BUFFER).getBinding(asset::ICPUDescriptorSetLayout::CBindingRedirect::storage_range_index_t{ 0 }).data; - - auto getNeededDS1UboByteSize = [&]() - { - size_t neededDS1UboSize = 0ull; - { - for (const auto& shaderInputs : pipelineMetadata->m_inputSemantics) - if (shaderInputs.descriptorSection.type == asset::IRenderpassIndependentPipelineMetadata::ShaderInput::E_TYPE::ET_UNIFORM_BUFFER && shaderInputs.descriptorSection.uniformBufferObject.set == 1u && shaderInputs.descriptorSection.uniformBufferObject.binding == ds1UboBinding) - neededDS1UboSize = std::max(neededDS1UboSize, shaderInputs.descriptorSection.uniformBufferObject.relByteoffset + shaderInputs.descriptorSection.uniformBufferObject.bytesize); - } - return neededDS1UboSize; - }; - - const uint64_t uboDS1ByteSize = getNeededDS1UboByteSize(); - - core::smart_refctd_ptr gpuds1layout; - { - auto gpu_array = cpu2gpu.getGPUObjectsFromAssets(&ds1layout, &ds1layout + 1, cpu2gpuParams); - if (!gpu_array || gpu_array->size() < 1u || !(*gpu_array)[0]) - assert(false); - - gpuds1layout = (*gpu_array)[0]; - } - - const uint32_t setCount = 1; - auto gpuUBODescriptorPool = 
logicalDevice->createDescriptorPoolForDSLayouts(video::IDescriptorPool::ECF_NONE, &gpuds1layout.get(), &gpuds1layout.get()+1ull, &setCount); - - video::IGPUBuffer::SCreationParams creationParams; - creationParams.usage = asset::IBuffer::E_USAGE_FLAGS(asset::IBuffer::EUF_UNIFORM_BUFFER_BIT | asset::IBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF); - creationParams.queueFamilyIndices = 0u; - creationParams.queueFamilyIndices = nullptr; - creationParams.size = uboDS1ByteSize; - - auto gpuubo = logicalDevice->createBuffer(std::move(creationParams)); - auto gpuuboMemReqs = gpuubo->getMemoryReqs(); - gpuuboMemReqs.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); - logicalDevice->allocate(gpuuboMemReqs, gpuubo.get()); - - auto gpuds1 = gpuUBODescriptorPool->createDescriptorSet(std::move(gpuds1layout)); - { - video::IGPUDescriptorSet::SWriteDescriptorSet write; - write.dstSet = gpuds1.get(); - write.binding = ds1UboBinding; - write.count = 1u; - write.arrayElement = 0u; - write.descriptorType = asset::IDescriptor::E_TYPE::ET_UNIFORM_BUFFER; - video::IGPUDescriptorSet::SDescriptorInfo info; - { - info.desc = gpuubo; - info.info.buffer.offset = 0ull; - info.info.buffer.size = uboDS1ByteSize; - } - write.info = &info; - logicalDevice->updateDescriptorSets(1u, &write, 0u, nullptr); - } - - core::smart_refctd_ptr gpumesh; - { - auto gpu_array = cpu2gpu.getGPUObjectsFromAssets(&cpuMesh.get(), &cpuMesh.get() + 1, cpu2gpuParams); - cpu2gpuParams.waitForCreationToComplete(true); - cpu2gpuParams.beginCommandBuffers(); - if (!gpu_array || gpu_array->size() < 1u || !(*gpu_array)[0]) - assert(false); - - gpumesh = (*gpu_array)[0]; - } - - return std::make_tuple(gpumesh, gpuubo, gpuds1, ds1UboBinding, pipelineMetadata); - }; - - plyDrawData = getMeshDependentDrawData(cpuMeshPly, true); - stlDrawData = getMeshDependentDrawData(cpuMeshStl, false); - - { - auto fillGpuPipeline = [&](GPU_PIPELINE_HASH_CONTAINER& container, video::IGPUMesh* gpuMesh) - { - for 
(size_t i = 0; i < gpuMesh->getMeshBuffers().size(); ++i) - { - auto gpuIndependentPipeline = gpuMesh->getMeshBuffers().begin()[i]->getPipeline(); - - nbl::video::IGPUGraphicsPipeline::SCreationParams graphicsPipelineParams; - graphicsPipelineParams.renderpassIndependent = core::smart_refctd_ptr(const_cast(gpuIndependentPipeline)); - graphicsPipelineParams.renderpass = core::smart_refctd_ptr(renderpass); - - const RENDERPASS_INDEPENDENT_PIPELINE_ADRESS adress = reinterpret_cast(graphicsPipelineParams.renderpassIndependent.get()); - container[adress] = logicalDevice->createGraphicsPipeline(nullptr, std::move(graphicsPipelineParams)); - } - }; - - fillGpuPipeline(gpuPipelinesPly, std::get>(plyDrawData).get()); - fillGpuPipeline(gpuPipelinesStl, std::get>(stlDrawData).get()); - } - - core::vectorSIMDf cameraPosition(0, 5, -10); - matrix4SIMD projectionMatrix = matrix4SIMD::buildProjectionMatrixPerspectiveFovLH(core::radians(60.0f), video::ISurface::getTransformedAspectRatio(swapchain->getPreTransform(), WIN_W, WIN_H), 0.001, 1000); - camera = Camera(cameraPosition, core::vectorSIMDf(0, 0, 0), projectionMatrix, 0.01f, 1.f); - lastTime = std::chrono::system_clock::now(); - - for (size_t i = 0ull; i < NBL_FRAMES_TO_AVERAGE; ++i) - dtList[i] = 0.0; - - const auto& graphicsCommandPools = commandPools[CommonAPI::InitOutput::EQT_GRAPHICS]; - for (uint32_t i = 0u; i < FRAMES_IN_FLIGHT; i++) - { - logicalDevice->createCommandBuffers(graphicsCommandPools[i].get(), video::IGPUCommandBuffer::EL_PRIMARY, 1, commandBuffers+i); - imageAcquire[i] = logicalDevice->createSemaphore(); - renderFinished[i] = logicalDevice->createSemaphore(); - } - } - - void onAppTerminated_impl() override - { - const auto& fboCreationParams = fbos->begin()[acquiredNextFBO]->getCreationParameters(); - auto gpuSourceImageView = fboCreationParams.attachments[0]; - - //TODO: - bool status = ext::ScreenShot::createScreenShot( - logicalDevice.get(), - queues[CommonAPI::InitOutput::EQT_TRANSFER_UP], - 
renderFinished[resourceIx].get(), - gpuSourceImageView.get(), - assetManager.get(), - "ScreenShot.png", - asset::IImage::EL_PRESENT_SRC, - asset::EAF_NONE); - assert(status); - } - - void workLoopBody() override - { - ++resourceIx; - if (resourceIx >= FRAMES_IN_FLIGHT) - resourceIx = 0; - - auto& commandBuffer = commandBuffers[resourceIx]; - auto& fence = frameComplete[resourceIx]; - - if (fence) - while (logicalDevice->waitForFences(1u, &fence.get(), false, MAX_TIMEOUT) == video::IGPUFence::ES_TIMEOUT) {} - else - fence = logicalDevice->createFence(static_cast(0)); - - auto renderStart = std::chrono::system_clock::now(); - const auto renderDt = std::chrono::duration_cast(renderStart - lastTime).count(); - lastTime = renderStart; - { // Calculate Simple Moving Average for FrameTime - time_sum -= dtList[frame_count]; - time_sum += renderDt; - dtList[frame_count] = renderDt; - frame_count++; - if (frame_count >= NBL_FRAMES_TO_AVERAGE) - { - frameDataFilled = true; - frame_count = 0; - } - - } - const double averageFrameTime = frameDataFilled ? 
(time_sum / (double)NBL_FRAMES_TO_AVERAGE) : (time_sum / frame_count); - -#ifdef NBL_MORE_LOGS - logger->log("renderDt = %f ------ averageFrameTime = %f", system::ILogger::ELL_INFO, renderDt, averageFrameTime); -#endif // NBL_MORE_LOGS - - auto averageFrameTimeDuration = std::chrono::duration(averageFrameTime); - auto nextPresentationTime = renderStart + averageFrameTimeDuration; - auto nextPresentationTimeStamp = std::chrono::duration_cast(nextPresentationTime.time_since_epoch()); - - inputSystem->getDefaultMouse(&mouse); - inputSystem->getDefaultKeyboard(&keyboard); - - camera.beginInputProcessing(nextPresentationTimeStamp); - mouse.consumeEvents([&](const ui::IMouseEventChannel::range_t& events) -> void { camera.mouseProcess(events); }, logger.get()); - keyboard.consumeEvents([&](const ui::IKeyboardEventChannel::range_t& events) -> void { camera.keyboardProcess(events); }, logger.get()); - camera.endInputProcessing(nextPresentationTimeStamp); - - const auto& viewMatrix = camera.getViewMatrix(); - const auto& viewProjectionMatrix = matrix4SIMD::concatenateBFollowedByAPrecisely( - video::ISurface::getSurfaceTransformationMatrix(swapchain->getPreTransform()), - camera.getConcatenatedMatrix() - ); - - commandBuffer->reset(nbl::video::IGPUCommandBuffer::ERF_RELEASE_RESOURCES_BIT); - commandBuffer->begin(video::IGPUCommandBuffer::EU_ONE_TIME_SUBMIT_BIT); // TODO: Reset Frame's CommandPool - - asset::SViewport viewport; - viewport.minDepth = 1.f; - viewport.maxDepth = 0.f; - viewport.x = 0u; - viewport.y = 0u; - viewport.width = WIN_W; - viewport.height = WIN_H; - commandBuffer->setViewport(0u, 1u, &viewport); - - swapchain->acquireNextImage(MAX_TIMEOUT, imageAcquire[resourceIx].get(), nullptr, &acquiredNextFBO); - - nbl::video::IGPUCommandBuffer::SRenderpassBeginInfo beginInfo; - { - VkRect2D area; - area.offset = { 0,0 }; - area.extent = { WIN_W, WIN_H }; - asset::SClearValue clear[2] = {}; - clear[0].color.float32[0] = 1.f; - clear[0].color.float32[1] = 1.f; - 
clear[0].color.float32[2] = 1.f; - clear[0].color.float32[3] = 1.f; - clear[1].depthStencil.depth = 0.f; - - beginInfo.clearValueCount = 2u; - beginInfo.framebuffer = fbos->begin()[acquiredNextFBO]; - beginInfo.renderpass = renderpass; - beginInfo.renderArea = area; - beginInfo.clearValues = clear; - } - - commandBuffer->beginRenderPass(&beginInfo, nbl::asset::ESC_INLINE); - - auto renderMesh = [&](GPU_PIPELINE_HASH_CONTAINER& gpuPipelines, DependentDrawData& drawData, uint32_t index) - { - auto gpuMesh = std::get>(drawData); - auto gpuubo = std::get>(drawData); - auto gpuds1 = std::get>(drawData); - auto ds1UboBinding = std::get(drawData); - const auto* pipelineMetadata = std::get(drawData); - - core::matrix3x4SIMD modelMatrix; - - if (index == 1) - modelMatrix.setScale(core::vectorSIMDf(10, 10, 10)); - modelMatrix.setTranslation(nbl::core::vectorSIMDf(index * 150, 0, 0, 0)); - - core::matrix4SIMD mvp = core::concatenateBFollowedByA(viewProjectionMatrix, modelMatrix); - - core::vector uboData(gpuubo->getSize()); - for (const auto& shaderInputs : pipelineMetadata->m_inputSemantics) - { - if (shaderInputs.descriptorSection.type == asset::IRenderpassIndependentPipelineMetadata::ShaderInput::E_TYPE::ET_UNIFORM_BUFFER && shaderInputs.descriptorSection.uniformBufferObject.set == 1u && shaderInputs.descriptorSection.uniformBufferObject.binding == ds1UboBinding) - { - switch (shaderInputs.type) - { - case asset::IRenderpassIndependentPipelineMetadata::ECSI_WORLD_VIEW_PROJ: - { - memcpy(uboData.data() + shaderInputs.descriptorSection.uniformBufferObject.relByteoffset, mvp.pointer(), shaderInputs.descriptorSection.uniformBufferObject.bytesize); - } break; - - case asset::IRenderpassIndependentPipelineMetadata::ECSI_WORLD_VIEW: - { - memcpy(uboData.data() + shaderInputs.descriptorSection.uniformBufferObject.relByteoffset, viewMatrix.pointer(), shaderInputs.descriptorSection.uniformBufferObject.bytesize); - } break; - - case 
asset::IRenderpassIndependentPipelineMetadata::ECSI_WORLD_VIEW_INVERSE_TRANSPOSE: - { - memcpy(uboData.data() + shaderInputs.descriptorSection.uniformBufferObject.relByteoffset, viewMatrix.pointer(), shaderInputs.descriptorSection.uniformBufferObject.bytesize); - } break; - } - } - } - - commandBuffer->updateBuffer(gpuubo.get(), 0ull, gpuubo->getSize(), uboData.data()); - - for (auto gpuMeshBuffer : gpuMesh->getMeshBuffers()) - { - auto gpuGraphicsPipeline = gpuPipelines[reinterpret_cast(gpuMeshBuffer->getPipeline())]; - - const video::IGPURenderpassIndependentPipeline* gpuRenderpassIndependentPipeline = gpuMeshBuffer->getPipeline(); - const video::IGPUDescriptorSet* ds3 = gpuMeshBuffer->getAttachedDescriptorSet(); - - commandBuffer->bindGraphicsPipeline(gpuGraphicsPipeline.get()); - - const video::IGPUDescriptorSet* gpuds1_ptr = gpuds1.get(); - commandBuffer->bindDescriptorSets(asset::EPBP_GRAPHICS, gpuRenderpassIndependentPipeline->getLayout(), 1u, 1u, &gpuds1_ptr, 0u); - const video::IGPUDescriptorSet* gpuds3_ptr = gpuMeshBuffer->getAttachedDescriptorSet(); - - if (gpuds3_ptr) - commandBuffer->bindDescriptorSets(asset::EPBP_GRAPHICS, gpuRenderpassIndependentPipeline->getLayout(), 3u, 1u, &gpuds3_ptr, 0u); - if (gpuRenderpassIndependentPipeline->getLayout()->m_pushConstantRanges) - commandBuffer->pushConstants(gpuRenderpassIndependentPipeline->getLayout(), video::IGPUShader::ESS_FRAGMENT, 0u, gpuMeshBuffer->MAX_PUSH_CONSTANT_BYTESIZE, gpuMeshBuffer->getPushConstantsDataPtr()); - - commandBuffer->drawMeshBuffer(gpuMeshBuffer); - } - }; - - /* - Record PLY and STL rendering commands - */ - - renderMesh(gpuPipelinesPly, plyDrawData, 0); - renderMesh(gpuPipelinesStl, stlDrawData, 1); - - commandBuffer->endRenderPass(); - commandBuffer->end(); - - CommonAPI::Submit(logicalDevice.get(), commandBuffer.get(), queues[CommonAPI::InitOutput::EQT_GRAPHICS], imageAcquire[resourceIx].get(), renderFinished[resourceIx].get(), fence.get()); - 
CommonAPI::Present(logicalDevice.get(), swapchain.get(), queues[CommonAPI::InitOutput::EQT_GRAPHICS], renderFinished[resourceIx].get(), acquiredNextFBO); - } - - bool keepRunning() override - { - return windowCallback->isWindowOpen(); - } -}; - -NBL_COMMON_API_MAIN(PLYSTLDemo) \ No newline at end of file diff --git a/27_PLYSTLDemo/CMakeLists.txt b/29_MeshLoaders/CMakeLists.txt similarity index 100% rename from 27_PLYSTLDemo/CMakeLists.txt rename to 29_MeshLoaders/CMakeLists.txt diff --git a/29_SpecializationConstants/config.json.template b/29_MeshLoaders/config.json.template similarity index 90% rename from 29_SpecializationConstants/config.json.template rename to 29_MeshLoaders/config.json.template index f961745c1..2c42b001d 100644 --- a/29_SpecializationConstants/config.json.template +++ b/29_MeshLoaders/config.json.template @@ -6,7 +6,7 @@ "cmake": { "configurations": [ "Release", "Debug", "RelWithDebInfo" ], "buildModes": [], - "requiredOptions": [] + "requiredOptions": [ "NBL_BUILD_MITSUBA_LOADER" ] }, "profiles": [ { diff --git a/29_MeshLoaders/main.cpp b/29_MeshLoaders/main.cpp new file mode 100644 index 000000000..968f7c42e --- /dev/null +++ b/29_MeshLoaders/main.cpp @@ -0,0 +1,1634 @@ +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h + +#include "common.hpp" +#include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h" +#include "nbl/builtin/hlsl/indirect_commands.hlsl" + + +class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication +{ + using device_base_t = examples::SimpleWindowedApplication; + using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; + using clock_t = std::chrono::steady_clock; + + constexpr static inline uint32_t WIN_W = 1280, WIN_H = 720; + constexpr static inline uint32_t MaxFramesInFlight = 3u; + constexpr static inline uint8_t MaxUITextureCount = 1u; + constexpr static inline uint32_t NumberOfProceduralGeometries = 5; + + static constexpr const char* s_lightTypeNames[E_LIGHT_TYPE::ELT_COUNT] = { + "Directional", + "Point", + "Spot" + }; + + struct ShaderBindingTable + { + SBufferRange raygenGroupRange; + SBufferRange hitGroupsRange; + uint32_t hitGroupsStride; + SBufferRange missGroupsRange; + uint32_t missGroupsStride; + SBufferRange callableGroupsRange; + uint32_t callableGroupsStride; + }; + + +public: + inline RaytracingPipelineApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) + : IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) + { + } + + inline SPhysicalDeviceFeatures getRequiredDeviceFeatures() const override + { + auto retval = device_base_t::getRequiredDeviceFeatures(); + retval.rayTracingPipeline = true; + retval.accelerationStructure = true; + retval.rayQuery = true; + return retval; + } + + inline SPhysicalDeviceFeatures getPreferredDeviceFeatures() const override + { + auto retval = device_base_t::getPreferredDeviceFeatures(); + retval.accelerationStructureHostCommands = true; + return retval; + } + + inline core::vector getSurfaces() const 
override + { + if (!m_surface) + { + { + auto windowCallback = core::make_smart_refctd_ptr(smart_refctd_ptr(m_inputSystem), smart_refctd_ptr(m_logger)); + IWindow::SCreationParams params = {}; + params.callback = core::make_smart_refctd_ptr(); + params.width = WIN_W; + params.height = WIN_H; + params.x = 32; + params.y = 32; + params.flags = ui::IWindow::ECF_HIDDEN | IWindow::ECF_BORDERLESS | IWindow::ECF_RESIZABLE; + params.windowCaption = "RaytracingPipelineApp"; + params.callback = windowCallback; + const_cast&>(m_window) = m_winMgr->createWindow(std::move(params)); + } + + auto surface = CSurfaceVulkanWin32::create(smart_refctd_ptr(m_api), smart_refctd_ptr_static_cast(m_window)); + const_cast&>(m_surface) = CSimpleResizeSurface::create(std::move(surface)); + } + + if (m_surface) + return { {m_surface->getSurface()/*,EQF_NONE*/} }; + + return {}; + } + + // so that we can use the same queue for asset converter and rendering + inline core::vector getQueueRequirements() const override + { + auto reqs = device_base_t::getQueueRequirements(); + reqs.front().requiredFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; + return reqs; + } + + inline bool onAppInitialized(smart_refctd_ptr&& system) override + { + m_inputSystem = make_smart_refctd_ptr(logger_opt_smart_ptr(smart_refctd_ptr(m_logger))); + + if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + + if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + + smart_refctd_ptr shaderReadCache = nullptr; + smart_refctd_ptr shaderWriteCache = core::make_smart_refctd_ptr(); + auto shaderCachePath = localOutputCWD / "main_pipeline_shader_cache.bin"; + + { + core::smart_refctd_ptr shaderReadCacheFile; + { + system::ISystem::future_t> future; + m_system->createFile(future, shaderCachePath.c_str(), system::IFile::ECF_READ); + if (future.wait()) + { + future.acquire().move_into(shaderReadCacheFile); + if (shaderReadCacheFile) + { + const size_t size = 
shaderReadCacheFile->getSize(); + if (size > 0ull) + { + std::vector contents(size); + system::IFile::success_t succ; + shaderReadCacheFile->read(succ, contents.data(), 0, size); + if (succ) + shaderReadCache = IShaderCompiler::CCache::deserialize(contents); + } + } + } + else + m_logger->log("Failed Openning Shader Cache File.", ILogger::ELL_ERROR); + } + + } + + // Load Custom Shader + auto loadCompileAndCreateShader = [&](const std::string& relPath) -> smart_refctd_ptr + { + IAssetLoader::SAssetLoadParams lp = {}; + lp.logger = m_logger.get(); + lp.workingDirectory = ""; // virtual root + auto assetBundle = m_assetMgr->getAsset(relPath, lp); + const auto assets = assetBundle.getContents(); + if (assets.empty()) + return nullptr; + + // lets go straight from ICPUSpecializedShader to IGPUSpecializedShader + auto sourceRaw = IAsset::castDown(assets[0]); + if (!sourceRaw) + return nullptr; + + return m_device->createShader({ sourceRaw.get(), nullptr, shaderReadCache.get(), shaderWriteCache.get() }); + }; + + // load shaders + const auto raygenShader = loadCompileAndCreateShader("app_resources/raytrace.rgen.hlsl"); + const auto closestHitShader = loadCompileAndCreateShader("app_resources/raytrace.rchit.hlsl"); + const auto proceduralClosestHitShader = loadCompileAndCreateShader("app_resources/raytrace_procedural.rchit.hlsl"); + const auto intersectionHitShader = loadCompileAndCreateShader("app_resources/raytrace.rint.hlsl"); + const auto anyHitShaderColorPayload = loadCompileAndCreateShader("app_resources/raytrace.rahit.hlsl"); + const auto anyHitShaderShadowPayload = loadCompileAndCreateShader("app_resources/raytrace_shadow.rahit.hlsl"); + const auto missShader = loadCompileAndCreateShader("app_resources/raytrace.rmiss.hlsl"); + const auto missShadowShader = loadCompileAndCreateShader("app_resources/raytrace_shadow.rmiss.hlsl"); + const auto directionalLightCallShader = loadCompileAndCreateShader("app_resources/light_directional.rcall.hlsl"); + const auto 
pointLightCallShader = loadCompileAndCreateShader("app_resources/light_point.rcall.hlsl"); + const auto spotLightCallShader = loadCompileAndCreateShader("app_resources/light_spot.rcall.hlsl"); + const auto fragmentShader = loadCompileAndCreateShader("app_resources/present.frag.hlsl"); + + core::smart_refctd_ptr shaderWriteCacheFile; + { + system::ISystem::future_t> future; + m_system->deleteFile(shaderCachePath); // temp solution instead of trimming, to make sure we won't have corrupted json + m_system->createFile(future, shaderCachePath.c_str(), system::IFile::ECF_WRITE); + if (future.wait()) + { + future.acquire().move_into(shaderWriteCacheFile); + if (shaderWriteCacheFile) + { + auto serializedCache = shaderWriteCache->serialize(); + if (shaderWriteCacheFile) + { + system::IFile::success_t succ; + shaderWriteCacheFile->write(succ, serializedCache->getPointer(), 0, serializedCache->getSize()); + if (!succ) + m_logger->log("Failed Writing To Shader Cache File.", ILogger::ELL_ERROR); + } + } + else + m_logger->log("Failed Creating Shader Cache File.", ILogger::ELL_ERROR); + } + else + m_logger->log("Failed Creating Shader Cache File.", ILogger::ELL_ERROR); + } + + m_semaphore = m_device->createSemaphore(m_realFrameIx); + if (!m_semaphore) + return logFail("Failed to Create a Semaphore!"); + + auto gQueue = getGraphicsQueue(); + + // Create renderpass and init surface + nbl::video::IGPURenderpass* renderpass; + { + ISwapchain::SCreationParams swapchainParams = { .surface = smart_refctd_ptr(m_surface->getSurface()) }; + if (!swapchainParams.deduceFormat(m_physicalDevice)) + return logFail("Could not choose a Surface Format for the Swapchain!"); + + const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] = + { + { + .srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, + .dstSubpass = 0, + .memoryBarrier = + { + .srcStageMask = asset::PIPELINE_STAGE_FLAGS::COPY_BIT, + .srcAccessMask = 
asset::ACCESS_FLAGS::TRANSFER_WRITE_BIT, + .dstStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, + .dstAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT + } + }, + { + .srcSubpass = 0, + .dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, + .memoryBarrier = + { + .srcStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, + .srcAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT + } + }, + IGPURenderpass::SCreationParams::DependenciesEnd + }; + + auto scResources = std::make_unique(m_device.get(), swapchainParams.surfaceFormat.format, dependencies); + renderpass = scResources->getRenderpass(); + + if (!renderpass) + return logFail("Failed to create Renderpass!"); + + if (!m_surface || !m_surface->init(gQueue, std::move(scResources), swapchainParams.sharedParams)) + return logFail("Could not create Window & Surface or initialize the Surface!"); + } + + auto pool = m_device->createCommandPool(gQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + + m_converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} }); + + for (auto i = 0u; i < MaxFramesInFlight; i++) + { + if (!pool) + return logFail("Couldn't create Command Pool!"); + if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { m_cmdBufs.data() + i, 1 })) + return logFail("Couldn't create Command Buffer!"); + } + + m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H); + m_surface->recreateSwapchain(); + + + // create output images + m_hdrImage = m_device->createImage({ + { + .type = IGPUImage::ET_2D, + .samples = ICPUImage::ESCF_1_BIT, + .format = EF_R16G16B16A16_SFLOAT, + .extent = {WIN_W, WIN_H, 1}, + .mipLevels = 1, + .arrayLayers = 1, + .flags = IImage::ECF_NONE, + .usage = bitflag(IImage::EUF_STORAGE_BIT) | IImage::EUF_TRANSFER_SRC_BIT | IImage::EUF_SAMPLED_BIT + } + }); + + if (!m_hdrImage || !m_device->allocate(m_hdrImage->getMemoryReqs(), 
m_hdrImage.get()).isValid()) + return logFail("Could not create HDR Image"); + + m_hdrImageView = m_device->createImageView({ + .flags = IGPUImageView::ECF_NONE, + .subUsages = IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT | IGPUImage::E_USAGE_FLAGS::EUF_SAMPLED_BIT, + .image = m_hdrImage, + .viewType = IGPUImageView::E_TYPE::ET_2D, + .format = asset::EF_R16G16B16A16_SFLOAT + }); + + + + // ray trace pipeline and descriptor set layout setup + { + const IGPUDescriptorSetLayout::SBinding bindings[] = { + { + .binding = 0, + .type = asset::IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE, + .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_RAYGEN, + .count = 1, + }, + { + .binding = 1, + .type = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE, + .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_RAYGEN, + .count = 1, + } + }; + const auto descriptorSetLayout = m_device->createDescriptorSetLayout(bindings); + + const std::array dsLayoutPtrs = { descriptorSetLayout.get() }; + m_rayTracingDsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, std::span(dsLayoutPtrs.begin(), dsLayoutPtrs.end())); + m_rayTracingDs = m_rayTracingDsPool->createDescriptorSet(descriptorSetLayout); + + const SPushConstantRange pcRange = { + .stageFlags = IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING, + .offset = 0u, + .size = sizeof(SPushConstants), + }; + const auto pipelineLayout = m_device->createPipelineLayout({ &pcRange, 1 }, smart_refctd_ptr(descriptorSetLayout), nullptr, nullptr, nullptr); + + IGPURayTracingPipeline::SCreationParams params = {}; + + enum RtDemoShader + { + RTDS_RAYGEN, + RTDS_MISS, + RTDS_MISS_SHADOW, + RTDS_CLOSEST_HIT, + RTDS_SPHERE_CLOSEST_HIT, + RTDS_ANYHIT_PRIMARY, + RTDS_ANYHIT_SHADOW, + RTDS_INTERSECTION, + RTDS_DIRECTIONAL_CALL, + RTDS_POINT_CALL, + RTDS_SPOT_CALL, + RTDS_COUNT + 
}; + + IGPUShader::SSpecInfo shaders[RTDS_COUNT]; + shaders[RTDS_RAYGEN] = { .shader = raygenShader.get() }; + shaders[RTDS_MISS] = { .shader = missShader.get() }; + shaders[RTDS_MISS_SHADOW] = { .shader = missShadowShader.get() }; + shaders[RTDS_CLOSEST_HIT] = { .shader = closestHitShader.get() }; + shaders[RTDS_SPHERE_CLOSEST_HIT] = { .shader = proceduralClosestHitShader.get() }; + shaders[RTDS_ANYHIT_PRIMARY] = { .shader = anyHitShaderColorPayload.get() }; + shaders[RTDS_ANYHIT_SHADOW] = { .shader = anyHitShaderShadowPayload.get() }; + shaders[RTDS_INTERSECTION] = { .shader = intersectionHitShader.get() }; + shaders[RTDS_DIRECTIONAL_CALL] = { .shader = directionalLightCallShader.get() }; + shaders[RTDS_POINT_CALL] = { .shader = pointLightCallShader.get() }; + shaders[RTDS_SPOT_CALL] = { .shader = spotLightCallShader.get() }; + + params.layout = pipelineLayout.get(); + params.shaders = std::span(shaders); + using RayTracingFlags = IGPURayTracingPipeline::SCreationParams::FLAGS; + params.flags = core::bitflag(RayTracingFlags::NO_NULL_MISS_SHADERS) | + RayTracingFlags::NO_NULL_INTERSECTION_SHADERS | + RayTracingFlags::NO_NULL_ANY_HIT_SHADERS; + + auto& shaderGroups = params.shaderGroups; + + shaderGroups.raygen = { .index = RTDS_RAYGEN }; + + IRayTracingPipelineBase::SGeneralShaderGroup missGroups[EMT_COUNT]; + missGroups[EMT_PRIMARY] = { .index = RTDS_MISS }; + missGroups[EMT_OCCLUSION] = { .index = RTDS_MISS_SHADOW }; + shaderGroups.misses = missGroups; + + auto getHitGroupIndex = [](E_GEOM_TYPE geomType, E_RAY_TYPE rayType) + { + return geomType * ERT_COUNT + rayType; + }; + IRayTracingPipelineBase::SHitShaderGroup hitGroups[E_RAY_TYPE::ERT_COUNT * E_GEOM_TYPE::EGT_COUNT]; + hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_PRIMARY)] = { + .closestHit = RTDS_CLOSEST_HIT, + .anyHit = RTDS_ANYHIT_PRIMARY, + }; + hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_OCCLUSION)] = { + .closestHit = IGPURayTracingPipeline::SGeneralShaderGroup::Unused, + .anyHit = 
RTDS_ANYHIT_SHADOW, + }; + hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_PRIMARY)] = { + .closestHit = RTDS_SPHERE_CLOSEST_HIT, + .anyHit = RTDS_ANYHIT_PRIMARY, + .intersection = RTDS_INTERSECTION, + }; + hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_OCCLUSION)] = { + .closestHit = IGPURayTracingPipeline::SGeneralShaderGroup::Unused, + .anyHit = RTDS_ANYHIT_SHADOW, + .intersection = RTDS_INTERSECTION, + }; + shaderGroups.hits = hitGroups; + + IRayTracingPipelineBase::SGeneralShaderGroup callableGroups[ELT_COUNT]; + callableGroups[ELT_DIRECTIONAL] = { .index = RTDS_DIRECTIONAL_CALL }; + callableGroups[ELT_POINT] = { .index = RTDS_POINT_CALL }; + callableGroups[ELT_SPOT] = { .index = RTDS_SPOT_CALL }; + shaderGroups.callables = callableGroups; + + params.cached.maxRecursionDepth = 1; + params.cached.dynamicStackSize = true; + + if (!m_device->createRayTracingPipelines(nullptr, { ¶ms, 1 }, &m_rayTracingPipeline)) + return logFail("Failed to create ray tracing pipeline"); + + calculateRayTracingStackSize(m_rayTracingPipeline); + + if (!createShaderBindingTable(m_rayTracingPipeline)) + return logFail("Could not create shader binding table"); + + } + + auto assetManager = make_smart_refctd_ptr(smart_refctd_ptr(system)); + auto* geometryCreator = assetManager->getGeometryCreator(); + + if (!createIndirectBuffer()) + return logFail("Could not create indirect buffer"); + + if (!createAccelerationStructuresFromGeometry(geometryCreator)) + return logFail("Could not create acceleration structures from geometry creator"); + + ISampler::SParams samplerParams = { + .AnisotropicFilter = 0 + }; + auto defaultSampler = m_device->createSampler(samplerParams); + + { + const IGPUDescriptorSetLayout::SBinding bindings[] = { + { + .binding = 0u, + .type = nbl::asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER, + .createFlags = ICPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = IShader::E_SHADER_STAGE::ESS_FRAGMENT, + .count = 1u, + .immutableSamplers = 
&defaultSampler + } + }; + auto gpuPresentDescriptorSetLayout = m_device->createDescriptorSetLayout(bindings); + const video::IGPUDescriptorSetLayout* const layouts[] = { gpuPresentDescriptorSetLayout.get() }; + const uint32_t setCounts[] = { 1u }; + m_presentDsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_NONE, layouts, setCounts); + m_presentDs = m_presentDsPool->createDescriptorSet(gpuPresentDescriptorSetLayout); + + auto scRes = static_cast(m_surface->getSwapchainResources()); + ext::FullScreenTriangle::ProtoPipeline fsTriProtoPPln(m_assetMgr.get(), m_device.get(), m_logger.get()); + if (!fsTriProtoPPln) + return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!"); + + const IGPUShader::SSpecInfo fragSpec = { + .entryPoint = "main", + .shader = fragmentShader.get() + }; + + auto presentLayout = m_device->createPipelineLayout( + {}, + core::smart_refctd_ptr(gpuPresentDescriptorSetLayout), + nullptr, + nullptr, + nullptr + ); + m_presentPipeline = fsTriProtoPPln.createPipeline(fragSpec, presentLayout.get(), scRes->getRenderpass()); + if (!m_presentPipeline) + return logFail("Could not create Graphics Pipeline!"); + } + + // write descriptors + IGPUDescriptorSet::SDescriptorInfo infos[3]; + infos[0].desc = m_gpuTlas; + + infos[1].desc = m_hdrImageView; + if (!infos[1].desc) + return logFail("Failed to create image view"); + infos[1].info.image.imageLayout = IImage::LAYOUT::GENERAL; + + infos[2].desc = m_hdrImageView; + infos[2].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL; + + IGPUDescriptorSet::SWriteDescriptorSet writes[] = { + {.dstSet = m_rayTracingDs.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &infos[0]}, + {.dstSet = m_rayTracingDs.get(), .binding = 1, .arrayElement = 0, .count = 1, .info = &infos[1]}, + {.dstSet = m_presentDs.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &infos[2] }, + }; + 
m_device->updateDescriptorSets(std::span(writes), {}); + + // gui descriptor setup + { + using binding_flags_t = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS; + { + IGPUSampler::SParams params; + params.AnisotropicFilter = 1u; + params.TextureWrapU = ETC_REPEAT; + params.TextureWrapV = ETC_REPEAT; + params.TextureWrapW = ETC_REPEAT; + + m_ui.samplers.gui = m_device->createSampler(params); + m_ui.samplers.gui->setObjectDebugName("Nabla IMGUI UI Sampler"); + } + + std::array, 69u> immutableSamplers; + for (auto& it : immutableSamplers) + it = smart_refctd_ptr(m_ui.samplers.scene); + + immutableSamplers[nbl::ext::imgui::UI::FontAtlasTexId] = smart_refctd_ptr(m_ui.samplers.gui); + + nbl::ext::imgui::UI::SCreationParameters params; + + params.resources.texturesInfo = { .setIx = 0u, .bindingIx = 0u }; + params.resources.samplersInfo = { .setIx = 0u, .bindingIx = 1u }; + params.assetManager = m_assetMgr; + params.pipelineCache = nullptr; + params.pipelineLayout = nbl::ext::imgui::UI::createDefaultPipelineLayout(m_utils->getLogicalDevice(), params.resources.texturesInfo, params.resources.samplersInfo, MaxUITextureCount); + params.renderpass = smart_refctd_ptr(renderpass); + params.streamingBuffer = nullptr; + params.subpassIx = 0u; + params.transfer = getGraphicsQueue(); + params.utilities = m_utils; + { + m_ui.manager = ext::imgui::UI::create(std::move(params)); + + // note that we use default layout provided by our extension, but you are free to create your own by filling nbl::ext::imgui::UI::S_CREATION_PARAMETERS::resources + const auto* descriptorSetLayout = m_ui.manager->getPipeline()->getLayout()->getDescriptorSetLayout(0u); + const auto& params = m_ui.manager->getCreationParameters(); + + IDescriptorPool::SCreateInfo descriptorPoolInfo = {}; + descriptorPoolInfo.maxDescriptorCount[static_cast(asset::IDescriptor::E_TYPE::ET_SAMPLER)] = (uint32_t)nbl::ext::imgui::UI::DefaultSamplerIx::COUNT; + 
descriptorPoolInfo.maxDescriptorCount[static_cast(asset::IDescriptor::E_TYPE::ET_SAMPLED_IMAGE)] = MaxUITextureCount; + descriptorPoolInfo.maxSets = 1u; + descriptorPoolInfo.flags = IDescriptorPool::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT; + + m_guiDescriptorSetPool = m_device->createDescriptorPool(std::move(descriptorPoolInfo)); + assert(m_guiDescriptorSetPool); + + m_guiDescriptorSetPool->createDescriptorSets(1u, &descriptorSetLayout, &m_ui.descriptorSet); + assert(m_ui.descriptorSet); + } + } + + m_ui.manager->registerListener( + [this]() -> void { + ImGuiIO& io = ImGui::GetIO(); + + m_camera.setProjectionMatrix([&]() + { + static matrix4SIMD projection; + + projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH( + core::radians(m_cameraSetting.fov), + io.DisplaySize.x / io.DisplaySize.y, + m_cameraSetting.zNear, + m_cameraSetting.zFar); + + return projection; + }()); + + ImGui::SetNextWindowPos(ImVec2(1024, 100), ImGuiCond_Appearing); + ImGui::SetNextWindowSize(ImVec2(256, 256), ImGuiCond_Appearing); + + // create a window and insert the inspector + ImGui::SetNextWindowPos(ImVec2(10, 10), ImGuiCond_Appearing); + ImGui::SetNextWindowSize(ImVec2(320, 340), ImGuiCond_Appearing); + ImGui::Begin("Controls"); + + ImGui::SameLine(); + + ImGui::Text("Camera"); + + ImGui::SliderFloat("Move speed", &m_cameraSetting.moveSpeed, 0.1f, 10.f); + ImGui::SliderFloat("Rotate speed", &m_cameraSetting.rotateSpeed, 0.1f, 10.f); + ImGui::SliderFloat("Fov", &m_cameraSetting.fov, 20.f, 150.f); + ImGui::SliderFloat("zNear", &m_cameraSetting.zNear, 0.1f, 100.f); + ImGui::SliderFloat("zFar", &m_cameraSetting.zFar, 110.f, 10000.f); + Light m_oldLight = m_light; + int light_type = m_light.type; + ImGui::ListBox("LightType", &light_type, s_lightTypeNames, ELT_COUNT); + m_light.type = static_cast(light_type); + if (m_light.type == ELT_DIRECTIONAL) + { + ImGui::SliderFloat3("Light Direction", &m_light.direction.x, -1.f, 1.f); + } + else if (m_light.type == ELT_POINT) + { + 
ImGui::SliderFloat3("Light Position", &m_light.position.x, -20.f, 20.f); + } + else if (m_light.type == ELT_SPOT) + { + ImGui::SliderFloat3("Light Direction", &m_light.direction.x, -1.f, 1.f); + ImGui::SliderFloat3("Light Position", &m_light.position.x, -20.f, 20.f); + + // the cutoff is stored as a cosine; convert to degrees for the slider and back on change + float32_t dOuterCutoff = hlsl::degrees(acos(m_light.outerCutoff)); + if (ImGui::SliderFloat("Light Outer Cutoff", &dOuterCutoff, 0.0f, 45.0f)) + { + m_light.outerCutoff = cos(hlsl::radians(dOuterCutoff)); + } + } + ImGui::Checkbox("Use Indirect Command", &m_useIndirectCommand); + // any change to the light invalidates the accumulated frames + if (m_light != m_oldLight) + { + m_frameAccumulationCounter = 0; + } + + ImGui::Text("X: %f Y: %f", io.MousePos.x, io.MousePos.y); + + ImGui::End(); + } + ); + + // Set Camera + { + core::vectorSIMDf cameraPosition(0, 5, -10); + // WIN_W and WIN_H are uint32_t: divide as floats, an integer division would truncate 1280 / 720 to an aspect ratio of 1 + matrix4SIMD proj = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH( + core::radians(60.0f), + float(WIN_W) / float(WIN_H), + 0.01f, + 500.0f + ); + m_camera = Camera(cameraPosition, core::vectorSIMDf(0, 0, 0), proj); + } + + m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H); + m_surface->recreateSwapchain(); + m_winMgr->show(m_window.get()); + m_oracle.reportBeginFrameRecord(); + m_camera.mapKeysToWASD(); + + return true; + } + + bool updateGUIDescriptorSet() + { + // texture atlas, note we don't create info & write pair for the font sampler because UI extension's is immutable and baked into DS layout + static std::array descriptorInfo; + static IGPUDescriptorSet::SWriteDescriptorSet writes[MaxUITextureCount]; + + descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL; + descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].desc = smart_refctd_ptr(m_ui.manager->getFontAtlasView()); + + for (uint32_t i = 0; i < descriptorInfo.size(); ++i) + { + writes[i].dstSet = m_ui.descriptorSet.get(); + writes[i].binding = 0u; + writes[i].arrayElement = i; + writes[i].count = 1u; + } + writes[nbl::ext::imgui::UI::FontAtlasTexId].info = descriptorInfo.data() +
nbl::ext::imgui::UI::FontAtlasTexId; + + return m_device->updateDescriptorSets(writes, {}); + } + + inline void workLoopBody() override + { + // framesInFlight: ensuring safe execution of command buffers and acquires, `framesInFlight` only affect semaphore waits, don't use this to index your resources because it can change with swapchain recreation. + const uint32_t framesInFlight = core::min(MaxFramesInFlight, m_surface->getMaxAcquiresInFlight()); + // We block for semaphores for 2 reasons here: + // A) Resource: Can't use resource like a command buffer BEFORE previous use is finished! [MaxFramesInFlight] + // B) Acquire: Can't have more acquires in flight than a certain threshold returned by swapchain or your surface helper class. [MaxAcquiresInFlight] + if (m_realFrameIx >= framesInFlight) + { + const ISemaphore::SWaitInfo cbDonePending[] = + { + { + .semaphore = m_semaphore.get(), + .value = m_realFrameIx + 1 - framesInFlight + } + }; + if (m_device->blockForSemaphores(cbDonePending) != ISemaphore::WAIT_RESULT::SUCCESS) + return; + } + const auto resourceIx = m_realFrameIx % MaxFramesInFlight; + + m_api->startCapture(); + + update(); + + auto queue = getGraphicsQueue(); + auto cmdbuf = m_cmdBufs[resourceIx].get(); + + if (!keepRunning()) + return; + + cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); + cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + cmdbuf->beginDebugMarker("RaytracingPipelineApp Frame"); + + const auto viewMatrix = m_camera.getViewMatrix(); + const auto projectionMatrix = m_camera.getProjectionMatrix(); + const auto viewProjectionMatrix = m_camera.getConcatenatedMatrix(); + + core::matrix3x4SIMD modelMatrix; + modelMatrix.setTranslation(nbl::core::vectorSIMDf(0, 0, 0, 0)); + modelMatrix.setRotation(quaternion(0, 0, 0)); + + core::matrix4SIMD modelViewProjectionMatrix = core::concatenateBFollowedByA(viewProjectionMatrix, modelMatrix); + if (m_cachedModelViewProjectionMatrix != modelViewProjectionMatrix) + { + 
m_frameAccumulationCounter = 0; + m_cachedModelViewProjectionMatrix = modelViewProjectionMatrix; + } + core::matrix4SIMD invModelViewProjectionMatrix; + modelViewProjectionMatrix.getInverseTransform(invModelViewProjectionMatrix); + + { + IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1]; + imageBarriers[0].barrier = { + .dep = { + .srcStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT, // previous frame read from framgent shader + .srcAccessMask = ACCESS_FLAGS::SHADER_READ_BITS, + .dstStageMask = PIPELINE_STAGE_FLAGS::RAY_TRACING_SHADER_BIT, + .dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS + } + }; + imageBarriers[0].image = m_hdrImage.get(); + imageBarriers[0].subresourceRange = { + .aspectMask = IImage::EAF_COLOR_BIT, + .baseMipLevel = 0u, + .levelCount = 1u, + .baseArrayLayer = 0u, + .layerCount = 1u + }; + imageBarriers[0].oldLayout = m_frameAccumulationCounter == 0 ? IImage::LAYOUT::UNDEFINED : IImage::LAYOUT::READ_ONLY_OPTIMAL; + imageBarriers[0].newLayout = IImage::LAYOUT::GENERAL; + cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers }); + } + + // Trace Rays Pass + { + SPushConstants pc; + pc.light = m_light; + pc.proceduralGeomInfoBuffer = m_proceduralGeomInfoBuffer->getDeviceAddress(); + pc.triangleGeomInfoBuffer = m_triangleGeomInfoBuffer->getDeviceAddress(); + pc.frameCounter = m_frameAccumulationCounter; + const core::vector3df camPos = m_camera.getPosition().getAsVector3df(); + pc.camPos = { camPos.X, camPos.Y, camPos.Z }; + memcpy(&pc.invMVP, invModelViewProjectionMatrix.pointer(), sizeof(pc.invMVP)); + + cmdbuf->bindRayTracingPipeline(m_rayTracingPipeline.get()); + cmdbuf->setRayTracingPipelineStackSize(m_rayTracingStackSize); + cmdbuf->pushConstants(m_rayTracingPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING, 0, sizeof(SPushConstants), &pc); + cmdbuf->bindDescriptorSets(EPBP_RAY_TRACING, m_rayTracingPipeline->getLayout(), 0, 1, &m_rayTracingDs.get()); + if 
(m_useIndirectCommand) + { + // dispatch parameters (SBT addresses and dimensions) are read from the indirect buffer + cmdbuf->traceRaysIndirect( + SBufferBinding{ + .offset = 0, + .buffer = m_indirectBuffer, + }); + } + else + { + cmdbuf->traceRays( + m_shaderBindingTable.raygenGroupRange, + m_shaderBindingTable.missGroupsRange, m_shaderBindingTable.missGroupsStride, + m_shaderBindingTable.hitGroupsRange, m_shaderBindingTable.hitGroupsStride, + m_shaderBindingTable.callableGroupsRange, m_shaderBindingTable.callableGroupsStride, + WIN_W, WIN_H, 1); + } + } + + // pipeline barrier: make the ray tracing writes to the HDR image visible to the fragment shader that samples it in the present pass + { + IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1]; + imageBarriers[0].barrier = { + .dep = { + .srcStageMask = PIPELINE_STAGE_FLAGS::RAY_TRACING_SHADER_BIT, + .srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS, + // the consumer is a sampled read in the fragment stage, not a color attachment write + .dstStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT, + .dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS + } + }; + imageBarriers[0].image = m_hdrImage.get(); + imageBarriers[0].subresourceRange = { + .aspectMask = IImage::EAF_COLOR_BIT, + .baseMipLevel = 0u, + .levelCount = 1u, + .baseArrayLayer = 0u, + .layerCount = 1u + }; + imageBarriers[0].oldLayout = IImage::LAYOUT::GENERAL; + imageBarriers[0].newLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL; + + cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers }); + } + + { + asset::SViewport viewport; + { + viewport.minDepth = 1.f; + viewport.maxDepth = 0.f; + viewport.x = 0u; + viewport.y = 0u; + viewport.width = WIN_W; + viewport.height = WIN_H; + } + cmdbuf->setViewport(0u, 1u, &viewport); + + + VkRect2D defaultScisors[] = { {.offset = {(int32_t)viewport.x, (int32_t)viewport.y}, .extent = {(uint32_t)viewport.width, (uint32_t)viewport.height}} }; + cmdbuf->setScissor(defaultScisors); + + auto scRes = static_cast(m_surface->getSwapchainResources()); + const VkRect2D currentRenderArea = + { + .offset = {0,0}, + .extent = {m_window->getWidth(),m_window->getHeight()} + }; + const IGPUCommandBuffer::SClearColorValue clearColor = { .float32
= {0.f,0.f,0.f,1.f} }; + const IGPUCommandBuffer::SRenderpassBeginInfo info = + { + .framebuffer = scRes->getFramebuffer(m_currentImageAcquire.imageIndex), + .colorClearValues = &clearColor, + .depthStencilClearValues = nullptr, + .renderArea = currentRenderArea + }; + nbl::video::ISemaphore::SWaitInfo waitInfo = { .semaphore = m_semaphore.get(), .value = m_realFrameIx + 1u }; + + cmdbuf->beginRenderPass(info, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE); + + cmdbuf->bindGraphicsPipeline(m_presentPipeline.get()); + cmdbuf->bindDescriptorSets(EPBP_GRAPHICS, m_presentPipeline->getLayout(), 0, 1u, &m_presentDs.get()); + ext::FullScreenTriangle::recordDrawCall(cmdbuf); + + const auto uiParams = m_ui.manager->getCreationParameters(); + auto* uiPipeline = m_ui.manager->getPipeline(); + cmdbuf->bindGraphicsPipeline(uiPipeline); + cmdbuf->bindDescriptorSets(EPBP_GRAPHICS, uiPipeline->getLayout(), uiParams.resources.texturesInfo.setIx, 1u, &m_ui.descriptorSet.get()); + m_ui.manager->render(cmdbuf, waitInfo); + + cmdbuf->endRenderPass(); + + } + + cmdbuf->endDebugMarker(); + cmdbuf->end(); + + { + const IQueue::SSubmitInfo::SSemaphoreInfo rendered[] = + { + { + .semaphore = m_semaphore.get(), + .value = ++m_realFrameIx, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS + } + }; + { + { + const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] = + { + {.cmdbuf = cmdbuf } + }; + + const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = + { + { + .semaphore = m_currentImageAcquire.semaphore, + .value = m_currentImageAcquire.acquireCount, + .stageMask = PIPELINE_STAGE_FLAGS::NONE + } + }; + const IQueue::SSubmitInfo infos[] = + { + { + .waitSemaphores = acquired, + .commandBuffers = commandBuffers, + .signalSemaphores = rendered + } + }; + + updateGUIDescriptorSet(); + + if (queue->submit(infos) != IQueue::RESULT::SUCCESS) + m_realFrameIx--; + } + } + + m_window->setCaption("[Nabla Engine] Ray Tracing Pipeline"); + m_surface->present(m_currentImageAcquire.imageIndex, 
rendered); + } + m_api->endCapture(); + m_frameAccumulationCounter++; + } + + inline void update() + { + m_camera.setMoveSpeed(m_cameraSetting.moveSpeed); + m_camera.setRotateSpeed(m_cameraSetting.rotateSpeed); + + static std::chrono::microseconds previousEventTimestamp{}; + + m_inputSystem->getDefaultMouse(&m_mouse); + m_inputSystem->getDefaultKeyboard(&m_keyboard); + + auto updatePresentationTimestamp = [&]() + { + m_currentImageAcquire = m_surface->acquireNextImage(); + + m_oracle.reportEndFrameRecord(); + const auto timestamp = m_oracle.getNextPresentationTimeStamp(); + m_oracle.reportBeginFrameRecord(); + + return timestamp; + }; + + const auto nextPresentationTimestamp = updatePresentationTimestamp(); + + struct + { + std::vector mouse{}; + std::vector keyboard{}; + } capturedEvents; + + m_camera.beginInputProcessing(nextPresentationTimestamp); + { + const auto& io = ImGui::GetIO(); + m_mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void + { + if (!io.WantCaptureMouse) + m_camera.mouseProcess(events); // don't capture the events, only let camera handle them with its impl + + for (const auto& e : events) // here capture + { + if (e.timeStamp < previousEventTimestamp) + continue; + + previousEventTimestamp = e.timeStamp; + capturedEvents.mouse.emplace_back(e); + + } + }, m_logger.get()); + + m_keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void + { + if (!io.WantCaptureKeyboard) + m_camera.keyboardProcess(events); // don't capture the events, only let camera handle them with its impl + + for (const auto& e : events) // here capture + { + if (e.timeStamp < previousEventTimestamp) + continue; + + previousEventTimestamp = e.timeStamp; + capturedEvents.keyboard.emplace_back(e); + } + }, m_logger.get()); + + } + m_camera.endInputProcessing(nextPresentationTimestamp); + + const core::SRange mouseEvents(capturedEvents.mouse.data(), capturedEvents.mouse.data() + capturedEvents.mouse.size()); + const core::SRange 
keyboardEvents(capturedEvents.keyboard.data(), capturedEvents.keyboard.data() + capturedEvents.keyboard.size()); + const auto cursorPosition = m_window->getCursorControl()->getPosition(); + const auto mousePosition = float32_t2(cursorPosition.x, cursorPosition.y) - float32_t2(m_window->getX(), m_window->getY()); + + const ext::imgui::UI::SUpdateParameters params = + { + .mousePosition = mousePosition, + .displaySize = { m_window->getWidth(), m_window->getHeight() }, + .mouseEvents = mouseEvents, + .keyboardEvents = keyboardEvents + }; + + m_ui.manager->update(params); + } + + // The loop stops once the surface reports an irrecoverable state. + inline bool keepRunning() override + { + if (m_surface->irrecoverable()) + return false; + + return true; + } + + inline bool onAppTerminated() override + { + return device_base_t::onAppTerminated(); + } + +private: + // Ceiling division: how many groups of `size` are needed to cover `dim` items. + // Written as quotient plus remainder test so that `dim + size - 1` cannot wrap around uint32_t for large `dim`. + // `size` must be non-zero. + uint32_t getWorkgroupCount(uint32_t dim, uint32_t size) + { + return dim / size + ((dim % size) != 0u ? 1u : 0u); + } + + // Fills a device-local buffer with a TraceRaysIndirectCommand_t built from the shader binding table ranges, + // so the shader binding table must already have been created when this is called. + bool createIndirectBuffer() + { + const auto getBufferRangeAddress = [](const SBufferRange& range) + { + return range.buffer->getDeviceAddress() + range.offset; + }; + const auto command = TraceRaysIndirectCommand_t{ + .raygenShaderRecordAddress = getBufferRangeAddress(m_shaderBindingTable.raygenGroupRange), + .raygenShaderRecordSize = m_shaderBindingTable.raygenGroupRange.size, + .missShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.missGroupsRange), + .missShaderBindingTableSize = m_shaderBindingTable.missGroupsRange.size, + .missShaderBindingTableStride = m_shaderBindingTable.missGroupsStride, + .hitShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.hitGroupsRange), + .hitShaderBindingTableSize = m_shaderBindingTable.hitGroupsRange.size, + .hitShaderBindingTableStride = m_shaderBindingTable.hitGroupsStride, + .callableShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.callableGroupsRange), + .callableShaderBindingTableSize = m_shaderBindingTable.callableGroupsRange.size, + .callableShaderBindingTableStride =
m_shaderBindingTable.callableGroupsStride, + .width = WIN_W, + .height = WIN_H, + .depth = 1, + }; + IGPUBuffer::SCreationParams params; + params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INDIRECT_BUFFER_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + params.size = sizeof(TraceRaysIndirectCommand_t); + m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = getGraphicsQueue() }, std::move(params), &command).move_into(m_indirectBuffer); + return true; + } + + void calculateRayTracingStackSize(const smart_refctd_ptr& pipeline) + { + const auto raygenStackSize = pipeline->getRaygenStackSize(); + auto getMaxSize = [&](auto ranges, auto valProj) -> uint16_t + { + auto maxValue = 0; + for (const auto& val : ranges) + { + maxValue = std::max(maxValue, std::invoke(valProj, val)); + } + return maxValue; + }; + + const auto closestHitStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::closestHit); + const auto anyHitStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::anyHit); + const auto intersectionStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::intersection); + const auto missStackMax = getMaxSize(pipeline->getMissStackSizes(), std::identity{}); + const auto callableStackMax = getMaxSize(pipeline->getCallableStackSizes(), std::identity{}); + auto firstDepthStackSizeMax = std::max(closestHitStackMax, missStackMax); + firstDepthStackSizeMax = std::max(firstDepthStackSizeMax, intersectionStackMax + anyHitStackMax); + m_rayTracingStackSize = raygenStackSize + std::max(firstDepthStackSizeMax, callableStackMax); + } + + bool createShaderBindingTable(const smart_refctd_ptr& pipeline) + { + const auto& limits = m_device->getPhysicalDevice()->getLimits(); + const auto handleSize = SPhysicalDeviceLimits::ShaderGroupHandleSize; + const auto handleSizeAligned = nbl::core::alignUp(handleSize, 
limits.shaderGroupHandleAlignment); + + auto& raygenRange = m_shaderBindingTable.raygenGroupRange; + + auto& hitRange = m_shaderBindingTable.hitGroupsRange; + const auto hitHandles = pipeline->getHitHandles(); + + auto& missRange = m_shaderBindingTable.missGroupsRange; + const auto missHandles = pipeline->getMissHandles(); + + auto& callableRange = m_shaderBindingTable.callableGroupsRange; + const auto callableHandles = pipeline->getCallableHandles(); + + raygenRange = { + .offset = 0, + .size = core::alignUp(handleSizeAligned, limits.shaderGroupBaseAlignment) + }; + + missRange = { + .offset = raygenRange.size, + .size = core::alignUp(missHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment), + }; + m_shaderBindingTable.missGroupsStride = handleSizeAligned; + + hitRange = { + .offset = missRange.offset + missRange.size, + .size = core::alignUp(hitHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment), + }; + m_shaderBindingTable.hitGroupsStride = handleSizeAligned; + + callableRange = { + .offset = hitRange.offset + hitRange.size, + .size = core::alignUp(callableHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment), + }; + m_shaderBindingTable.callableGroupsStride = handleSizeAligned; + + const auto bufferSize = raygenRange.size + missRange.size + hitRange.size + callableRange.size; + + ICPUBuffer::SCreationParams cpuBufferParams; + cpuBufferParams.size = bufferSize; + auto cpuBuffer = ICPUBuffer::create(std::move(cpuBufferParams)); + uint8_t* pData = reinterpret_cast(cpuBuffer->getPointer()); + + // copy raygen region + memcpy(pData, &pipeline->getRaygen(), handleSize); + + // copy miss region + uint8_t* pMissData = pData + missRange.offset; + for (const auto& handle : missHandles) + { + memcpy(pMissData, &handle, handleSize); + pMissData += m_shaderBindingTable.missGroupsStride; + } + + // copy hit region + uint8_t* pHitData = pData + hitRange.offset; + for (const auto& handle : hitHandles) + { + memcpy(pHitData, 
&handle, handleSize); + pHitData += m_shaderBindingTable.hitGroupsStride; + } + + // copy callable region + uint8_t* pCallableData = pData + callableRange.offset; + for (const auto& handle : callableHandles) + { + memcpy(pCallableData, &handle, handleSize); + pCallableData += m_shaderBindingTable.callableGroupsStride; + } + + { + IGPUBuffer::SCreationParams params; + params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_SHADER_BINDING_TABLE_BIT; + params.size = bufferSize; + m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = getGraphicsQueue() }, std::move(params), pData).move_into(raygenRange.buffer); + missRange.buffer = core::smart_refctd_ptr(raygenRange.buffer); + hitRange.buffer = core::smart_refctd_ptr(raygenRange.buffer); + callableRange.buffer = core::smart_refctd_ptr(raygenRange.buffer); + } + + return true; + } + + bool createAccelerationStructuresFromGeometry(const IGeometryCreator* gc) + { + auto queue = getGraphicsQueue(); + // get geometries into ICPUBuffers + auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + if (!pool) + return logFail("Couldn't create Command Pool for geometry creation!"); + + const auto defaultMaterial = Material{ + .ambient = {0.2, 0.1, 0.1}, + .diffuse = {0.8, 0.3, 0.3}, + .specular = {0.8, 0.8, 0.8}, + .shininess = 1.0f, + .alpha = 1.0f, + }; + + auto getTranslationMatrix = [](float32_t x, float32_t y, float32_t z) + { + core::matrix3x4SIMD transform; + transform.setTranslation(nbl::core::vectorSIMDf(x, y, z, 0)); + return transform; + }; + + core::matrix3x4SIMD planeTransform; + planeTransform.setRotation(quaternion::fromAngleAxis(core::radians(-90.0f), vector3df_SIMD{ 1, 0, 0 })); + + // triangles geometries + const auto cpuObjects = std::array{ + ReferenceObjectCpu { + .meta = {.type = OT_RECTANGLE, .name = "Plane Mesh"}, + .data 
= gc->createRectangleMesh(nbl::core::vector2df_SIMD(10, 10)), + .material = defaultMaterial, + .transform = planeTransform, + }, + ReferenceObjectCpu { + .meta = {.type = OT_CUBE, .name = "Cube Mesh"}, + .data = gc->createCubeMesh(nbl::core::vector3df(1, 1, 1)), + .material = defaultMaterial, + .transform = getTranslationMatrix(0, 0.5f, 0), + }, + ReferenceObjectCpu { + .meta = {.type = OT_CUBE, .name = "Cube Mesh 2"}, + .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)), + .material = Material{ + .ambient = {0.1, 0.1, 0.2}, + .diffuse = {0.2, 0.2, 0.8}, + .specular = {0.8, 0.8, 0.8}, + .shininess = 1.0f, + }, + .transform = getTranslationMatrix(-5.0f, 1.0f, 0), + }, + ReferenceObjectCpu { + .meta = {.type = OT_CUBE, .name = "Transparent Cube Mesh"}, + .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)), + .material = Material{ + .ambient = {0.1, 0.2, 0.1}, + .diffuse = {0.2, 0.8, 0.2}, + .specular = {0.8, 0.8, 0.8}, + .shininess = 1.0f, + .alpha = 0.2, + }, + .transform = getTranslationMatrix(5.0f, 1.0f, 0), + }, + }; + + struct CPUTriBufferBindings + { + nbl::asset::SBufferBinding vertex, index; + }; + std::array cpuTriBuffers; + + for (uint32_t i = 0; i < cpuObjects.size(); i++) + { + const auto& cpuObject = cpuObjects[i]; + + auto vBuffer = smart_refctd_ptr(cpuObject.data.bindings[0].buffer); // no offset + auto vUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | + IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + vBuffer->addUsageFlags(vUsage); + vBuffer->setContentHash(vBuffer->computeContentHash()); + + auto iBuffer = smart_refctd_ptr(cpuObject.data.indexBuffer.buffer); // no offset + auto iUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | + IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | 
IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + + if (cpuObject.data.indexType != EIT_UNKNOWN) + if (iBuffer) + { + iBuffer->addUsageFlags(iUsage); + iBuffer->setContentHash(iBuffer->computeContentHash()); + } + + cpuTriBuffers[i] = { + .vertex = {.offset = 0, .buffer = vBuffer}, + .index = {.offset = 0, .buffer = iBuffer}, + }; + + } + + // procedural geometries + using Aabb = IGPUBottomLevelAccelerationStructure::AABB_t; + + smart_refctd_ptr cpuProcBuffer; + { + ICPUBuffer::SCreationParams params; + params.size = NumberOfProceduralGeometries * sizeof(Aabb); + cpuProcBuffer = ICPUBuffer::create(std::move(params)); + } + + core::vector proceduralGeoms; + proceduralGeoms.reserve(NumberOfProceduralGeometries); + auto proceduralGeometries = reinterpret_cast(cpuProcBuffer->getPointer()); + for (int32_t i = 0; i < NumberOfProceduralGeometries; i++) + { + const auto middle_i = NumberOfProceduralGeometries / 2.0; + SProceduralGeomInfo sphere = { + .material = hlsl::_static_cast(Material{ + .ambient = {0.1, 0.05 * i, 0.1}, + .diffuse = {0.3, 0.2 * i, 0.3}, + .specular = {0.8, 0.8, 0.8}, + .shininess = 1.0f, + }), + .center = float32_t3((i - middle_i) * 4.0, 2, 5.0), + .radius = 1, + }; + + proceduralGeoms.push_back(sphere); + const auto sphereMin = sphere.center - sphere.radius; + const auto sphereMax = sphere.center + sphere.radius; + proceduralGeometries[i] = { + vector3d(sphereMin.x, sphereMin.y, sphereMin.z), + vector3d(sphereMax.x, sphereMax.y, sphereMax.z) + }; + } + + { + IGPUBuffer::SCreationParams params; + params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + params.size = proceduralGeoms.size() * sizeof(SProceduralGeomInfo); + m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), proceduralGeoms.data()).move_into(m_proceduralGeomInfoBuffer); + } + + // get ICPUBuffers into ICPUBLAS + // TODO use 
one BLAS and multiple triangles/aabbs in one + const auto blasCount = std::size(cpuObjects) + 1; + const auto proceduralBlasIdx = std::size(cpuObjects); + + std::array, std::size(cpuObjects)+1u> cpuBlas; + for (uint32_t i = 0; i < blasCount; i++) + { + auto& blas = cpuBlas[i]; + blas = make_smart_refctd_ptr(); + + if (i == proceduralBlasIdx) + { + auto aabbs = make_refctd_dynamic_array>>(1u); + auto primitiveCounts = make_refctd_dynamic_array>(1u); + + auto& aabb = aabbs->front(); + auto& primCount = primitiveCounts->front(); + + primCount = NumberOfProceduralGeometries; + aabb.data = { .offset = 0, .buffer = cpuProcBuffer }; + aabb.stride = sizeof(IGPUBottomLevelAccelerationStructure::AABB_t); + aabb.geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; // only allow opaque for now + + blas->setGeometries(std::move(aabbs), std::move(primitiveCounts)); + } + else + { + auto triangles = make_refctd_dynamic_array>>(1u); + auto primitiveCounts = make_refctd_dynamic_array>(1u); + + auto& tri = triangles->front(); + auto& primCount = primitiveCounts->front(); + const auto& geom = cpuObjects[i]; + const auto& cpuBuf = cpuTriBuffers[i]; + + const bool useIndex = geom.data.indexType != EIT_UNKNOWN; + const uint32_t vertexStride = geom.data.inputParams.bindings[0].stride; + const uint32_t numVertices = cpuBuf.vertex.buffer->getSize() / vertexStride; + + if (useIndex) + primCount = geom.data.indexCount / 3; + else + primCount = numVertices / 3; + + tri.vertexData[0] = cpuBuf.vertex; + tri.indexData = useIndex ? cpuBuf.index : cpuBuf.vertex; + tri.maxVertex = numVertices - 1; + tri.vertexStride = vertexStride; + tri.vertexFormat = EF_R32G32B32_SFLOAT; + tri.indexType = geom.data.indexType; + tri.geometryFlags = geom.material.isTransparent() ? 
+ IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::NO_DUPLICATE_ANY_HIT_INVOCATION_BIT : + IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; + + blas->setGeometries(std::move(triangles), std::move(primitiveCounts)); + } + + auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT; + if (i == proceduralBlasIdx) + blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT; + + blas->setBuildFlags(blasFlags); + blas->setContentHash(blas->computeContentHash()); + } + + auto geomInfoBuffer = ICPUBuffer::create({ std::size(cpuObjects) * sizeof(STriangleGeomInfo) }); + STriangleGeomInfo* geomInfos = reinterpret_cast(geomInfoBuffer->getPointer()); + + // get ICPUBLAS into ICPUTLAS + auto geomInstances = make_refctd_dynamic_array>(blasCount); + { + uint32_t i = 0; + for (auto instance = geomInstances->begin(); instance != geomInstances->end(); instance++, i++) + { + const auto isProceduralInstance = i == proceduralBlasIdx; + ICPUTopLevelAccelerationStructure::StaticInstance inst; + inst.base.blas = cpuBlas[i]; + inst.base.flags = static_cast(IGPUTopLevelAccelerationStructure::INSTANCE_FLAGS::TRIANGLE_FACING_CULL_DISABLE_BIT); + inst.base.instanceCustomIndex = i; + inst.base.instanceShaderBindingTableRecordOffset = isProceduralInstance ? 2 : 0;; + inst.base.mask = 0xFF; + inst.transform = isProceduralInstance ? 
matrix3x4SIMD() : cpuObjects[i].transform; + + instance->instance = inst; + } + } + + auto cpuTlas = make_smart_refctd_ptr(); + cpuTlas->setInstances(std::move(geomInstances)); + cpuTlas->setBuildFlags(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT); + + // convert with asset converter + smart_refctd_ptr converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} }); + struct MyInputs : CAssetConverter::SInputs + { + // For the GPU Buffers to be directly writeable and so that we don't need a Transfer Queue submit at all + inline uint32_t constrainMemoryTypeBits(const size_t groupCopyID, const IAsset* canonicalAsset, const blake3_hash_t& contentHash, const IDeviceMemoryBacked* memoryBacked) const override + { + assert(memoryBacked); + return memoryBacked->getObjectType() != IDeviceMemoryBacked::EOT_BUFFER ? (~0u) : rebarMemoryTypes; + } + + uint32_t rebarMemoryTypes; + } inputs = {}; + inputs.logger = m_logger.get(); + inputs.rebarMemoryTypes = m_physicalDevice->getDirectVRAMAccessMemoryTypeBits(); + // the allocator needs to be overriden to hand out memory ranges which have already been mapped so that the ReBAR fast-path can kick in + // (multiple buffers can be bound to same memory, but memory can only be mapped once at one place, so Asset Converter can't do it) + struct MyAllocator final : public IDeviceMemoryAllocator + { + ILogicalDevice* getDeviceForAllocations() const override { return device; } + + SAllocation allocate(const SAllocateInfo& info) override + { + auto retval = device->allocate(info); + // map what is mappable by default so ReBAR checks succeed + if (retval.isValid() && retval.memory->isMappable()) + retval.memory->map({ .offset = 0,.length = info.size }); + return retval; + } + + ILogicalDevice* device; + } myalloc; + myalloc.device = m_device.get(); + inputs.allocator = &myalloc; + + std::array tmpTlas; + std::array tmpBuffers; + { + tmpTlas[0] = cpuTlas.get(); + for (uint32_t i = 0; i < 
cpuObjects.size(); i++) + { + tmpBuffers[2 * i + 0] = cpuTriBuffers[i].vertex.buffer.get(); + tmpBuffers[2 * i + 1] = cpuTriBuffers[i].index.buffer.get(); + } + tmpBuffers[2 * proceduralBlasIdx] = cpuProcBuffer.get(); + + std::get>(inputs.assets) = tmpTlas; + std::get>(inputs.assets) = tmpBuffers; + } + + auto reservation = converter->reserve(inputs); + { + auto prepass = [&](const auto & references) -> bool + { + auto objects = reservation.getGPUObjects(); + uint32_t counter = {}; + for (auto& object : objects) + { + auto gpu = object.value; + auto* reference = references[counter]; + + if (reference) + { + if (!gpu) + { + m_logger->log("Failed to convert a CPU object to GPU!", ILogger::ELL_ERROR); + return false; + } + } + counter++; + } + return true; + }; + + prepass.template operator() < ICPUTopLevelAccelerationStructure > (tmpTlas); + prepass.template operator() < ICPUBuffer > (tmpBuffers); + } + + constexpr auto CompBufferCount = 2; + std::array, CompBufferCount> compBufs = {}; + std::array compBufInfos = {}; + { + auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT); + pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, compBufs); + compBufs.front()->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + for (auto i = 0; i < CompBufferCount; i++) + compBufInfos[i].cmdbuf = compBufs[i].get(); + } + auto compSema = m_device->createSemaphore(0u); + SIntendedSubmitInfo compute = {}; + compute.queue = queue; + compute.scratchCommandBuffers = compBufInfos; + compute.scratchSemaphore = { + .semaphore = compSema.get(), + .value = 0u, + .stageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT | PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_COPY_BIT + }; + // convert + { + smart_refctd_ptr scratchAlloc; + { + constexpr auto MaxAlignment = 256; + constexpr auto MinAllocationSize = 1024; + const auto scratchSize = 
core::alignUp(reservation.getMaxASBuildScratchSize(false), MaxAlignment); + + + IGPUBuffer::SCreationParams creationParams = {}; + creationParams.size = scratchSize; + creationParams.usage = IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_STORAGE_BUFFER_BIT; + auto scratchBuffer = m_device->createBuffer(std::move(creationParams)); + + auto reqs = scratchBuffer->getMemoryReqs(); + reqs.memoryTypeBits &= m_physicalDevice->getDirectVRAMAccessMemoryTypeBits(); + + auto allocation = m_device->allocate(reqs, scratchBuffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); + allocation.memory->map({ .offset = 0,.length = reqs.size }); + + scratchAlloc = make_smart_refctd_ptr( + SBufferRange{0ull, scratchSize, std::move(scratchBuffer)}, + core::allocator(), MaxAlignment, MinAllocationSize + ); + } + + struct MyParams final : CAssetConverter::SConvertParams + { + inline uint32_t getFinalOwnerQueueFamily(const IGPUBuffer* buffer, const core::blake3_hash_t& createdFrom) override + { + return finalUser; + } + inline uint32_t getFinalOwnerQueueFamily(const IGPUAccelerationStructure* image, const core::blake3_hash_t& createdFrom) override + { + return finalUser; + } + + uint8_t finalUser; + } params = {}; + params.utilities = m_utils.get(); + params.compute = &compute; + params.scratchForDeviceASBuild = scratchAlloc.get(); + params.finalUser = queue->getFamilyIndex(); + + auto future = reservation.convert(params); + if (future.copy() != IQueue::RESULT::SUCCESS) + { + m_logger->log("Failed to await submission feature!", ILogger::ELL_ERROR); + return false; + } + // 2 submits, BLAS build, TLAS build, DO NOT ADD COMPACTIONS IN THIS EXAMPLE! 
+ if (compute.getFutureScratchSemaphore().value>3) + m_logger->log("Overflow submitted on Compute Queue despite using ReBAR (no transfer submits or usage of staging buffer) and providing a AS Build Scratch Buffer of correctly queried max size!",system::ILogger::ELL_ERROR); + + // assign gpu objects to output + auto&& tlases = reservation.getGPUObjects(); + m_gpuTlas = tlases[0].value; + auto&& buffers = reservation.getGPUObjects(); + for (uint32_t i = 0; i < cpuObjects.size(); i++) + { + auto& cpuObject = cpuObjects[i]; + + m_gpuTriangleGeometries.push_back(ReferenceObjectGpu{ + .meta = cpuObject.meta, + .bindings = { + .vertex = {.offset = 0, .buffer = buffers[2 * i + 0].value }, + .index = {.offset = 0, .buffer = buffers[2 * i + 1].value }, + }, + .vertexStride = cpuObject.data.inputParams.bindings[0].stride, + .indexType = cpuObject.data.indexType, + .indexCount = cpuObject.data.indexCount, + .material = hlsl::_static_cast(cpuObject.material), + .transform = cpuObject.transform, + }); + } + m_proceduralAabbBuffer = buffers[2 * proceduralBlasIdx].value; + + for (uint32_t i = 0; i < m_gpuTriangleGeometries.size(); i++) + { + const auto& gpuObject = m_gpuTriangleGeometries[i]; + const uint64_t vertexBufferAddress = gpuObject.bindings.vertex.buffer->getDeviceAddress(); + geomInfos[i] = { + .material = gpuObject.material, + .vertexBufferAddress = vertexBufferAddress, + .indexBufferAddress = gpuObject.useIndex() ? 
gpuObject.bindings.index.buffer->getDeviceAddress() : vertexBufferAddress, + .vertexStride = gpuObject.vertexStride, + .objType = gpuObject.meta.type, + .indexType = gpuObject.indexType, + .smoothNormals = s_smoothNormals[gpuObject.meta.type], + }; + } + } + + { + IGPUBuffer::SCreationParams params; + params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + params.size = geomInfoBuffer->getSize(); + m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), geomInfos).move_into(m_triangleGeomInfoBuffer); + } + + return true; + } + + + + smart_refctd_ptr m_window; + smart_refctd_ptr> m_surface; + smart_refctd_ptr m_semaphore; + uint64_t m_realFrameIx = 0; + uint32_t m_frameAccumulationCounter = 0; + std::array, MaxFramesInFlight> m_cmdBufs; + ISimpleManagedSurface::SAcquireResult m_currentImageAcquire = {}; + + core::smart_refctd_ptr m_inputSystem; + InputSystem::ChannelReader m_mouse; + InputSystem::ChannelReader m_keyboard; + + struct CameraSetting + { + float fov = 60.f; + float zNear = 0.1f; + float zFar = 10000.f; + float moveSpeed = 1.f; + float rotateSpeed = 1.f; + float viewWidth = 10.f; + float camYAngle = 165.f / 180.f * 3.14159f; + float camXAngle = 32.f / 180.f * 3.14159f; + + } m_cameraSetting; + Camera m_camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD()); + + Light m_light = { + .direction = {-1.0f, -1.0f, -0.4f}, + .position = {10.0f, 15.0f, 8.0f}, + .outerCutoff = 0.866025404f, // {cos(radians(30.0f))}, + .type = ELT_DIRECTIONAL + }; + + video::CDumbPresentationOracle m_oracle; + + struct C_UI + { + nbl::core::smart_refctd_ptr manager; + + struct + { + core::smart_refctd_ptr gui, scene; + } samplers; + + core::smart_refctd_ptr descriptorSet; + } m_ui; + core::smart_refctd_ptr m_guiDescriptorSetPool; + + core::vector m_gpuTriangleGeometries; + 
core::vector m_gpuIntersectionSpheres; + uint32_t m_intersectionHitGroupIdx; + + smart_refctd_ptr m_gpuTlas; + smart_refctd_ptr m_instanceBuffer; + + smart_refctd_ptr m_triangleGeomInfoBuffer; + smart_refctd_ptr m_proceduralGeomInfoBuffer; + smart_refctd_ptr m_proceduralAabbBuffer; + smart_refctd_ptr m_indirectBuffer; + + smart_refctd_ptr m_hdrImage; + smart_refctd_ptr m_hdrImageView; + + smart_refctd_ptr m_rayTracingDsPool; + smart_refctd_ptr m_rayTracingDs; + smart_refctd_ptr m_rayTracingPipeline; + uint64_t m_rayTracingStackSize; + ShaderBindingTable m_shaderBindingTable; + + smart_refctd_ptr m_presentDs; + smart_refctd_ptr m_presentDsPool; + smart_refctd_ptr m_presentPipeline; + + smart_refctd_ptr m_converter; + + + core::matrix4SIMD m_cachedModelViewProjectionMatrix; + bool m_useIndirectCommand = false; + +}; +NBL_MAIN_FUNC(RaytracingPipelineApp) diff --git a/27_PLYSTLDemo/pipeline.groovy b/29_MeshLoaders/pipeline.groovy similarity index 100% rename from 27_PLYSTLDemo/pipeline.groovy rename to 29_MeshLoaders/pipeline.groovy diff --git a/29_SpecializationConstants/CMakeLists.txt b/29_SpecializationConstants/CMakeLists.txt deleted file mode 100644 index a476b6203..000000000 --- a/29_SpecializationConstants/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ - -include(common RESULT_VARIABLE RES) -if(NOT RES) - message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory") -endif() - -nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") \ No newline at end of file diff --git a/29_SpecializationConstants/main.cpp b/29_SpecializationConstants/main.cpp deleted file mode 100644 index 11b73a330..000000000 --- a/29_SpecializationConstants/main.cpp +++ /dev/null @@ -1,566 +0,0 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". 
-// For conditions of distribution and use, see copyright notice in nabla.h - -#define _NBL_STATIC_LIB_ -#include - -#include "../common/CommonAPI.h" -using namespace nbl; -using namespace core; -using namespace ui; - -struct UBOCompute -{ - //xyz - gravity point, w - dt - core::vectorSIMDf gravPointAndDt; -}; - -class SpecializationConstantsSampleApp : public ApplicationBase -{ - constexpr static uint32_t WIN_W = 1280u; - constexpr static uint32_t WIN_H = 720u; - constexpr static uint32_t SC_IMG_COUNT = 3u; - constexpr static uint32_t FRAMES_IN_FLIGHT = 5u; - static constexpr uint64_t MAX_TIMEOUT = 99999999999999ull; - static_assert(FRAMES_IN_FLIGHT > SC_IMG_COUNT); - - core::smart_refctd_ptr window; - core::smart_refctd_ptr system; - core::smart_refctd_ptr windowCb; - core::smart_refctd_ptr api; - core::smart_refctd_ptr surface; - core::smart_refctd_ptr utils; - core::smart_refctd_ptr device; - video::IPhysicalDevice* gpu; - std::array queues; - core::smart_refctd_ptr swapchain; - core::smart_refctd_ptr renderpass; - nbl::core::smart_refctd_dynamic_array> fbo; - std::array, CommonAPI::InitOutput::MaxFramesInFlight>, CommonAPI::InitOutput::MaxQueuesCount> commandPools; - core::smart_refctd_ptr filesystem; - core::smart_refctd_ptr assetManager; - video::IGPUObjectFromAssetConverter::SParams cpu2gpuParams; - core::smart_refctd_ptr logger; - core::smart_refctd_ptr inputSystem; - video::IGPUObjectFromAssetConverter cpu2gpu; - - constexpr static uint32_t COMPUTE_SET = 0u; - constexpr static uint32_t PARTICLE_BUF_BINDING = 0u; - constexpr static uint32_t COMPUTE_DATA_UBO_BINDING = 1u; - constexpr static uint32_t WORKGROUP_SIZE = 256u; - constexpr static uint32_t PARTICLE_COUNT = 1u << 21; - constexpr static uint32_t PARTICLE_COUNT_PER_AXIS = 1u << 7; - constexpr static uint32_t POS_BUF_IX = 0u; - constexpr static uint32_t VEL_BUF_IX = 1u; - constexpr static uint32_t BUF_COUNT = 2u; - constexpr static uint32_t GRAPHICS_SET = 0u; - constexpr static uint32_t 
GRAPHICS_DATA_UBO_BINDING = 0u; - - std::chrono::high_resolution_clock::time_point m_lastTime; - int32_t m_resourceIx = -1; - core::smart_refctd_ptr m_cmdbuf[FRAMES_IN_FLIGHT]; - core::smart_refctd_ptr m_frameComplete[FRAMES_IN_FLIGHT] = { nullptr }; - core::smart_refctd_ptr m_imageAcquire[FRAMES_IN_FLIGHT] = { nullptr }; - core::smart_refctd_ptr m_renderFinished[FRAMES_IN_FLIGHT] = { nullptr }; - core::vectorSIMDf m_cameraPosition; - core::vectorSIMDf m_camFront; - UBOCompute m_uboComputeData; - asset::SBufferRange m_computeUBORange; - asset::SBufferRange m_graphicsUBORange; - core::smart_refctd_ptr m_gpuComputePipeline; - core::smart_refctd_ptr m_graphicsPipeline; - core::smart_refctd_ptr m_gpuds0Compute; - core::smart_refctd_ptr m_gpuds0Graphics; - asset::SBasicViewParameters m_viewParams; - core::matrix4SIMD m_viewProj; - core::smart_refctd_ptr m_gpuParticleBuf; - core::smart_refctd_ptr m_rpIndependentPipeline; - nbl::video::ISwapchain::SCreationParams m_swapchainCreationParams; - -public: - - void setWindow(core::smart_refctd_ptr&& wnd) override - { - window = std::move(wnd); - } - void setSystem(core::smart_refctd_ptr&& s) override - { - system = std::move(s); - } - nbl::ui::IWindow* getWindow() override - { - return window.get(); - } - video::IAPIConnection* getAPIConnection() override - { - return api.get(); - } - video::ILogicalDevice* getLogicalDevice() override - { - return device.get(); - } - video::IGPURenderpass* getRenderpass() override - { - return renderpass.get(); - } - void setSurface(core::smart_refctd_ptr&& s) override - { - surface = std::move(s); - } - void setFBOs(std::vector>& f) override - { - for (int i = 0; i < f.size(); i++) - { - fbo->begin()[i] = core::smart_refctd_ptr(f[i]); - } - } - void setSwapchain(core::smart_refctd_ptr&& s) override - { - swapchain = std::move(s); - } - uint32_t getSwapchainImageCount() override - { - return swapchain->getImageCount(); - } - virtual nbl::asset::E_FORMAT getDepthFormat() override - { - return 
nbl::asset::EF_UNKNOWN; - } - - APP_CONSTRUCTOR(SpecializationConstantsSampleApp); - - void onAppInitialized_impl() override - { - const auto swapchainImageUsage = static_cast(asset::IImage::EUF_COLOR_ATTACHMENT_BIT | asset::IImage::EUF_STORAGE_BIT); - const asset::E_FORMAT depthFormat = asset::EF_UNKNOWN; - CommonAPI::InitParams initParams; - initParams.window = core::smart_refctd_ptr(window); - initParams.apiType = video::EAT_VULKAN; - initParams.appName = { _NBL_APP_NAME_ }; - initParams.framesInFlight = FRAMES_IN_FLIGHT; - initParams.windowWidth = WIN_W; - initParams.windowHeight = WIN_H; - initParams.swapchainImageCount = SC_IMG_COUNT; - initParams.swapchainImageUsage = swapchainImageUsage; - initParams.depthFormat = depthFormat; - initParams.physicalDeviceFilter.minimumLimits.workgroupSizeFromSpecConstant = true; - auto initOutp = CommonAPI::InitWithDefaultExt(std::move(initParams)); - - window = std::move(initParams.window); - system = std::move(initOutp.system); - windowCb = std::move(initParams.windowCb); - api = std::move(initOutp.apiConnection); - surface = std::move(initOutp.surface); - device = std::move(initOutp.logicalDevice); - gpu = std::move(initOutp.physicalDevice); - queues = std::move(initOutp.queues); - renderpass = std::move(initOutp.renderToSwapchainRenderpass); - commandPools = std::move(initOutp.commandPools); - assetManager = std::move(initOutp.assetManager); - filesystem = std::move(initOutp.system); - cpu2gpuParams = std::move(initOutp.cpu2gpuParams); - utils = std::move(initOutp.utilities); - m_swapchainCreationParams = std::move(initOutp.swapchainCreationParams); - - CommonAPI::createSwapchain(std::move(device), m_swapchainCreationParams, WIN_W, WIN_H, swapchain); - assert(swapchain); - fbo = CommonAPI::createFBOWithSwapchainImages( - swapchain->getImageCount(), WIN_W, WIN_H, - device, swapchain, renderpass, - depthFormat - ); - - video::IGPUObjectFromAssetConverter CPU2GPU; - m_cameraPosition = core::vectorSIMDf(0, 0, -10); - 
matrix4SIMD proj = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(core::radians(90.0f), video::ISurface::getTransformedAspectRatio(swapchain->getPreTransform(), WIN_W, WIN_H), 0.01, 100); - matrix3x4SIMD view = matrix3x4SIMD::buildCameraLookAtMatrixRH(m_cameraPosition, core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 1, 0)); - m_viewProj = matrix4SIMD::concatenateBFollowedByAPrecisely( - video::ISurface::getSurfaceTransformationMatrix(swapchain->getPreTransform()), - matrix4SIMD::concatenateBFollowedByA(proj, matrix4SIMD(view)) - ); - m_camFront = view[2]; - - // auto glslExts = device->getSupportedGLSLExtensions(); - asset::CSPIRVIntrospector introspector; - - const char* pathToCompShader = "../particles.comp"; - auto compilerSet = assetManager->getCompilerSet(); - core::smart_refctd_ptr computeUnspec = nullptr; - core::smart_refctd_ptr computeUnspecSPIRV = nullptr; - { - auto csBundle = assetManager->getAsset(pathToCompShader, {}); - auto csContents = csBundle.getContents(); - if (csContents.empty()) - assert(false); - - asset::ICPUSpecializedShader* csSpec = static_cast(csContents.begin()->get()); - computeUnspec = core::smart_refctd_ptr(csSpec->getUnspecialized()); - - auto compiler = compilerSet->getShaderCompiler(computeUnspec->getContentType()); - - asset::IShaderCompiler::SPreprocessorOptions preprocessOptions = {}; - preprocessOptions.sourceIdentifier = pathToCompShader; - preprocessOptions.includeFinder = compiler->getDefaultIncludeFinder(); - computeUnspec = compilerSet->preprocessShader(computeUnspec.get(), preprocessOptions); - } - - core::smart_refctd_ptr introspection = nullptr; - { - //! 
This example first preprocesses and then compiles the shader, although it could've been done by calling compileToSPIRV with setting compilerOptions.preprocessorOptions - asset::IShaderCompiler::SCompilerOptions compilerOptions = {}; - // compilerOptions.entryPoint = "main"; - compilerOptions.stage = computeUnspec->getStage(); - compilerOptions.debugInfoFlags = asset::IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_SOURCE_BIT; // should be DIF_SOURCE_BIT for introspection - compilerOptions.preprocessorOptions.sourceIdentifier = computeUnspec->getFilepathHint(); // already preprocessed but for logging it's best to fill sourceIdentifier - computeUnspecSPIRV = compilerSet->compileToSPIRV(computeUnspec.get(), compilerOptions); - - asset::CSPIRVIntrospector::SIntrospectionParams params = { "main", computeUnspecSPIRV }; - introspection = introspector.introspect(params); - } - - asset::ISpecializedShader::SInfo specInfo; - { - struct SpecConstants - { - int32_t wg_size; - int32_t particle_count; - int32_t pos_buf_ix; - int32_t vel_buf_ix; - int32_t buf_count; - }; - SpecConstants swapchain{ WORKGROUP_SIZE, PARTICLE_COUNT, POS_BUF_IX, VEL_BUF_IX, BUF_COUNT }; - - auto it_particleBufDescIntro = std::find_if(introspection->descriptorSetBindings[COMPUTE_SET].begin(), introspection->descriptorSetBindings[COMPUTE_SET].end(), - [=](auto b) { return b.binding == PARTICLE_BUF_BINDING; } - ); - assert(it_particleBufDescIntro->descCountIsSpecConstant); - const uint32_t buf_count_specID = it_particleBufDescIntro->count_specID; - auto& particleDataArrayIntro = it_particleBufDescIntro->get().members.array[0]; - assert(particleDataArrayIntro.countIsSpecConstant); - const uint32_t particle_count_specID = particleDataArrayIntro.count_specID; - - auto backbuf = asset::ICPUBuffer::create({ sizeof(swapchain) }); - memcpy(backbuf->getPointer(), &swapchain, sizeof(swapchain)); - auto entries = core::make_refctd_dynamic_array>(5u); - (*entries)[0] = { 0u,offsetof(SpecConstants,wg_size),sizeof(int32_t) 
};//currently local_size_{x|y|z}_id is not queryable via introspection API - (*entries)[1] = { particle_count_specID,offsetof(SpecConstants,particle_count),sizeof(int32_t) }; - (*entries)[2] = { 2u,offsetof(SpecConstants,pos_buf_ix),sizeof(int32_t) }; - (*entries)[3] = { 3u,offsetof(SpecConstants,vel_buf_ix),sizeof(int32_t) }; - (*entries)[4] = { buf_count_specID,offsetof(SpecConstants,buf_count),sizeof(int32_t) }; - - specInfo = asset::ISpecializedShader::SInfo(std::move(entries), std::move(backbuf), "main"); - } - - auto compute = core::make_smart_refctd_ptr(std::move(computeUnspecSPIRV), std::move(specInfo)); - - auto computePipeline = introspector.createApproximateComputePipelineFromIntrospection(compute.get()); - auto computeLayout = core::make_smart_refctd_ptr(nullptr, nullptr, core::smart_refctd_ptr(computePipeline->getLayout()->getDescriptorSetLayout(0))); - computePipeline->setLayout(core::smart_refctd_ptr(computeLayout)); - - // These conversions don't require command buffers - m_gpuComputePipeline = CPU2GPU.getGPUObjectsFromAssets(&computePipeline.get(), &computePipeline.get() + 1, cpu2gpuParams)->front(); - auto* ds0layoutCompute = computeLayout->getDescriptorSetLayout(0); - core::smart_refctd_ptr gpuDs0layoutCompute = CPU2GPU.getGPUObjectsFromAssets(&ds0layoutCompute, &ds0layoutCompute + 1, cpu2gpuParams)->front(); - - core::vector particlePosAndVel; - particlePosAndVel.reserve(PARTICLE_COUNT * 2); - for (int32_t i = 0; i < PARTICLE_COUNT_PER_AXIS; ++i) - for (int32_t j = 0; j < PARTICLE_COUNT_PER_AXIS; ++j) - for (int32_t k = 0; k < PARTICLE_COUNT_PER_AXIS; ++k) - particlePosAndVel.push_back(core::vector3df_SIMD(i, j, k) * 0.5f); - - for (int32_t i = 0; i < PARTICLE_COUNT; ++i) - particlePosAndVel.push_back(core::vector3df_SIMD(0.0f)); - - constexpr size_t BUF_SZ = 4ull * sizeof(float) * PARTICLE_COUNT; - video::IGPUBuffer::SCreationParams bufferCreationParams = {}; - bufferCreationParams.usage = static_cast(asset::IBuffer::EUF_TRANSFER_DST_BIT | 
asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_VERTEX_BUFFER_BIT); - bufferCreationParams.size = 2ull * BUF_SZ; - m_gpuParticleBuf = device->createBuffer(std::move(bufferCreationParams)); - m_gpuParticleBuf->setObjectDebugName("m_gpuParticleBuf"); - auto particleBufMemReqs = m_gpuParticleBuf->getMemoryReqs(); - particleBufMemReqs.memoryTypeBits &= device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); - device->allocate(particleBufMemReqs, m_gpuParticleBuf.get()); - asset::SBufferRange range; - range.buffer = m_gpuParticleBuf; - range.offset = 0ull; - range.size = BUF_SZ * 2ull; - utils->updateBufferRangeViaStagingBufferAutoSubmit(range, particlePosAndVel.data(), queues[CommonAPI::InitOutput::EQT_GRAPHICS]); - particlePosAndVel.clear(); - - video::IGPUBuffer::SCreationParams uboComputeCreationParams = {}; - uboComputeCreationParams.usage = static_cast(asset::IBuffer::EUF_UNIFORM_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT | asset::IBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF); - uboComputeCreationParams.size = core::roundUp(sizeof(UBOCompute), 64ull); - auto gpuUboCompute = device->createBuffer(std::move(uboComputeCreationParams)); - auto gpuUboComputeMemReqs = gpuUboCompute->getMemoryReqs(); - gpuUboComputeMemReqs.memoryTypeBits &= device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); - device->allocate(gpuUboComputeMemReqs, gpuUboCompute.get()); - - asset::SBufferBinding vtxBindings[video::IGPUMeshBuffer::MAX_ATTR_BUF_BINDING_COUNT]; - vtxBindings[0].buffer = m_gpuParticleBuf; - vtxBindings[0].offset = 0u; - //auto meshbuffer = core::make_smart_refctd_ptr(nullptr, nullptr, vtxBindings, asset::SBufferBinding{}); - //meshbuffer->setIndexCount(PARTICLE_COUNT); - //meshbuffer->setIndexType(asset::EIT_UNKNOWN); - - - auto createSpecShader = [&](const char* filepath, asset::IShader::E_SHADER_STAGE stage) - { - auto shaderBundle = assetManager->getAsset(filepath, {}); - auto shaderContents = shaderBundle.getContents(); - if 
(shaderContents.empty()) - assert(false); - - auto specializedShader = static_cast(shaderContents.begin()->get()); - auto unspecShader = specializedShader->getUnspecialized(); - - auto compiler = compilerSet->getShaderCompiler(computeUnspec->getContentType()); - asset::IShaderCompiler::SCompilerOptions compilerOptions = {}; - // compilerOptions.entryPoint = specializedShader->getSpecializationInfo().entryPoint; - compilerOptions.stage = unspecShader->getStage(); - compilerOptions.debugInfoFlags = asset::IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_SOURCE_BIT; - compilerOptions.preprocessorOptions.sourceIdentifier = unspecShader->getFilepathHint(); // already preprocessed but for logging it's best to fill sourceIdentifier - compilerOptions.preprocessorOptions.includeFinder = compiler->getDefaultIncludeFinder(); - auto unspecSPIRV = compilerSet->compileToSPIRV(unspecShader, compilerOptions); - - return core::make_smart_refctd_ptr(std::move(unspecSPIRV), asset::ISpecializedShader::SInfo(specializedShader->getSpecializationInfo())); - }; - auto vs = createSpecShader("../particles.vert", asset::IShader::ESS_VERTEX); - auto fs = createSpecShader("../particles.frag", asset::IShader::ESS_FRAGMENT); - - asset::ICPUSpecializedShader* shaders[2] = { vs.get(),fs.get() }; - auto pipeline = introspector.createApproximateRenderpassIndependentPipelineFromIntrospection({ shaders, shaders + 2 }); - { - auto& vtxParams = pipeline->getVertexInputParams(); - vtxParams.attributes[0].binding = 0u; - vtxParams.attributes[0].format = asset::EF_R32G32B32_SFLOAT; - vtxParams.attributes[0].relativeOffset = 0u; - vtxParams.bindings[0].inputRate = asset::EVIR_PER_VERTEX; - vtxParams.bindings[0].stride = 4u * sizeof(float); - - pipeline->getPrimitiveAssemblyParams().primitiveType = asset::EPT_POINT_LIST; - - auto& blendParams = pipeline->getBlendParams(); - blendParams.logicOpEnable = false; - blendParams.logicOp = nbl::asset::ELO_NO_OP; - } - auto gfxLayout = core::make_smart_refctd_ptr(nullptr, 
nullptr, core::smart_refctd_ptr(pipeline->getLayout()->getDescriptorSetLayout(0))); - pipeline->setLayout(core::smart_refctd_ptr(gfxLayout)); - - m_rpIndependentPipeline = CPU2GPU.getGPUObjectsFromAssets(&pipeline.get(), &pipeline.get() + 1, cpu2gpuParams)->front(); - auto* ds0layoutGraphics = gfxLayout->getDescriptorSetLayout(0); - core::smart_refctd_ptr gpuDs0layoutGraphics = CPU2GPU.getGPUObjectsFromAssets(&ds0layoutGraphics, &ds0layoutGraphics + 1, cpu2gpuParams)->front(); - - video::IGPUDescriptorSetLayout* gpuDSLayouts_raw[2] = { gpuDs0layoutCompute.get(), gpuDs0layoutGraphics.get() }; - const uint32_t setCount[2] = { 1u, 1u }; - auto dscPool = device->createDescriptorPoolForDSLayouts(video::IDescriptorPool::ECF_NONE, gpuDSLayouts_raw, gpuDSLayouts_raw + 2ull, setCount); - - m_gpuds0Compute = dscPool->createDescriptorSet(std::move(gpuDs0layoutCompute)); - { - video::IGPUDescriptorSet::SDescriptorInfo i[3]; - video::IGPUDescriptorSet::SWriteDescriptorSet w[2]; - w[0].arrayElement = 0u; - w[0].binding = PARTICLE_BUF_BINDING; - w[0].count = BUF_COUNT; - w[0].descriptorType = asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER; - w[0].dstSet = m_gpuds0Compute.get(); - w[0].info = i; - w[1].arrayElement = 0u; - w[1].binding = COMPUTE_DATA_UBO_BINDING; - w[1].count = 1u; - w[1].descriptorType = asset::IDescriptor::E_TYPE::ET_UNIFORM_BUFFER; - w[1].dstSet = m_gpuds0Compute.get(); - w[1].info = i + 2u; - i[0].desc = m_gpuParticleBuf; - i[0].info.buffer.offset = 0ull; - i[0].info.buffer.size = BUF_SZ; - i[1].desc = m_gpuParticleBuf; - i[1].info.buffer.offset = BUF_SZ; - i[1].info.buffer.size = BUF_SZ; - i[2].desc = gpuUboCompute; - i[2].info.buffer.offset = 0ull; - i[2].info.buffer.size = gpuUboCompute->getSize(); - - device->updateDescriptorSets(2u, w, 0u, nullptr); - } - - - m_gpuds0Graphics = dscPool->createDescriptorSet(std::move(gpuDs0layoutGraphics)); - - video::IGPUGraphicsPipeline::SCreationParams gp_params; - gp_params.rasterizationSamples = 
asset::IImage::ESCF_1_BIT; - gp_params.renderpass = core::smart_refctd_ptr(renderpass); - gp_params.renderpassIndependent = core::smart_refctd_ptr(m_rpIndependentPipeline); - gp_params.subpassIx = 0u; - - m_graphicsPipeline = device->createGraphicsPipeline(nullptr, std::move(gp_params)); - - video::IGPUBuffer::SCreationParams gfxUboCreationParams = {}; - gfxUboCreationParams.usage = static_cast(asset::IBuffer::EUF_UNIFORM_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT | asset::IBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF); - gfxUboCreationParams.size = sizeof(m_viewParams); - auto gpuUboGraphics = device->createBuffer(std::move(gfxUboCreationParams)); - auto gpuUboGraphicsMemReqs = gpuUboGraphics->getMemoryReqs(); - gpuUboGraphicsMemReqs.memoryTypeBits &= device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); - - device->allocate(gpuUboGraphicsMemReqs, gpuUboGraphics.get()); - { - video::IGPUDescriptorSet::SWriteDescriptorSet w; - video::IGPUDescriptorSet::SDescriptorInfo i; - w.arrayElement = 0u; - w.binding = GRAPHICS_DATA_UBO_BINDING; - w.count = 1u; - w.descriptorType = asset::IDescriptor::E_TYPE::ET_UNIFORM_BUFFER; - w.dstSet = m_gpuds0Graphics.get(); - w.info = &i; - i.desc = gpuUboGraphics; - i.info.buffer.offset = 0u; - i.info.buffer.size = gpuUboGraphics->getSize(); // gpuUboGraphics->getSize(); - - device->updateDescriptorSets(1u, &w, 0u, nullptr); - } - - m_lastTime = std::chrono::high_resolution_clock::now(); - constexpr uint32_t FRAME_COUNT = 500000u; - constexpr uint64_t MAX_TIMEOUT = 99999999999999ull; - m_computeUBORange = { 0, gpuUboCompute->getSize(), gpuUboCompute }; - m_graphicsUBORange = { 0, gpuUboGraphics->getSize(), gpuUboGraphics }; - - const auto& graphicsCommandPools = commandPools[CommonAPI::InitOutput::EQT_GRAPHICS]; - for (uint32_t i = 0u; i < FRAMES_IN_FLIGHT; i++) - { - device->createCommandBuffers(graphicsCommandPools[i].get(), video::IGPUCommandBuffer::EL_PRIMARY, 1, m_cmdbuf+i); - m_imageAcquire[i] = device->createSemaphore(); - 
m_renderFinished[i] = device->createSemaphore(); - } - } - - void onAppTerminated_impl() override - { - device->waitIdle(); - } - - void workLoopBody() override - { - m_resourceIx++; - if (m_resourceIx >= FRAMES_IN_FLIGHT) - m_resourceIx = 0; - - auto& cb = m_cmdbuf[m_resourceIx]; - auto& fence = m_frameComplete[m_resourceIx]; - if (fence) - { - auto retval = device->waitForFences(1u, &fence.get(), false, MAX_TIMEOUT); - assert(retval == video::IGPUFence::ES_TIMEOUT || retval == video::IGPUFence::ES_SUCCESS); - device->resetFences(1u, &fence.get()); - } - else - { - fence = device->createFence(static_cast(0)); - } - - // safe to proceed - cb->begin(video::IGPUCommandBuffer::EU_ONE_TIME_SUBMIT_BIT); // TODO: Reset Frame's CommandPool - - { - auto time = std::chrono::high_resolution_clock::now(); - core::vector3df_SIMD gravPoint = m_cameraPosition + m_camFront * 250.f; - m_uboComputeData.gravPointAndDt = gravPoint; - m_uboComputeData.gravPointAndDt.w = std::chrono::duration_cast(time - m_lastTime).count() * 1e-4; - - m_lastTime = time; - cb->updateBuffer(m_computeUBORange.buffer.get(), m_computeUBORange.offset, m_computeUBORange.size, &m_uboComputeData); - } - cb->bindComputePipeline(m_gpuComputePipeline.get()); - cb->bindDescriptorSets(asset::EPBP_COMPUTE, - m_gpuComputePipeline->getLayout(), - COMPUTE_SET, - 1u, - &m_gpuds0Compute.get(), - 0u); - cb->dispatch(PARTICLE_COUNT / WORKGROUP_SIZE, 1u, 1u); - - asset::SMemoryBarrier memBarrier; - memBarrier.srcAccessMask = asset::EAF_SHADER_WRITE_BIT; - memBarrier.dstAccessMask = asset::EAF_VERTEX_ATTRIBUTE_READ_BIT; - cb->pipelineBarrier( - asset::EPSF_COMPUTE_SHADER_BIT, - asset::EPSF_VERTEX_INPUT_BIT, - static_cast(0u), - 1, &memBarrier, - 0, nullptr, - 0, nullptr); - - { - memcpy(m_viewParams.MVP, &m_viewProj, sizeof(m_viewProj)); - cb->updateBuffer(m_graphicsUBORange.buffer.get(), m_graphicsUBORange.offset, m_graphicsUBORange.size, &m_viewParams); - } - { - asset::SViewport vp; - vp.minDepth = 1.f; - vp.maxDepth = 
0.f; - vp.x = 0u; - vp.y = 0u; - vp.width = WIN_W; - vp.height = WIN_H; - cb->setViewport(0u, 1u, &vp); - - VkRect2D scissor; - scissor.offset = { 0, 0 }; - scissor.extent = { WIN_W, WIN_H }; - cb->setScissor(0u, 1u, &scissor); - } - // renderpass - uint32_t imgnum = 0u; - swapchain->acquireNextImage(MAX_TIMEOUT, m_imageAcquire[m_resourceIx].get(), nullptr, &imgnum); - { - video::IGPUCommandBuffer::SRenderpassBeginInfo info; - asset::SClearValue clear; - clear.color.float32[0] = 0.f; - clear.color.float32[1] = 0.f; - clear.color.float32[2] = 0.f; - clear.color.float32[3] = 1.f; - info.renderpass = renderpass; - info.framebuffer = fbo->begin()[imgnum]; - info.clearValueCount = 1u; - info.clearValues = &clear; - info.renderArea.offset = { 0, 0 }; - info.renderArea.extent = { WIN_W, WIN_H }; - cb->beginRenderPass(&info, asset::ESC_INLINE); - } - // individual draw - { - cb->bindGraphicsPipeline(m_graphicsPipeline.get()); - size_t vbOffset = 0; - cb->bindVertexBuffers(0, 1, &m_gpuParticleBuf.get(), &vbOffset); - cb->bindDescriptorSets(asset::EPBP_GRAPHICS, m_rpIndependentPipeline->getLayout(), GRAPHICS_SET, 1u, &m_gpuds0Graphics.get(), 0u); - cb->draw(PARTICLE_COUNT, 1, 0, 0); - } - cb->endRenderPass(); - cb->end(); - - CommonAPI::Submit( - device.get(), - cb.get(), - queues[CommonAPI::InitOutput::EQT_GRAPHICS], - m_imageAcquire[m_resourceIx].get(), - m_renderFinished[m_resourceIx].get(), - fence.get()); - - CommonAPI::Present( - device.get(), - swapchain.get(), - queues[CommonAPI::InitOutput::EQT_GRAPHICS], - m_renderFinished[m_resourceIx].get(), - imgnum); - } - - bool keepRunning() override - { - return windowCb->isWindowOpen(); - } -}; - -NBL_COMMON_API_MAIN(SpecializationConstantsSampleApp) - -extern "C" { _declspec(dllexport) DWORD NvOptimusEnablement = 0x00000001; } \ No newline at end of file diff --git a/29_SpecializationConstants/particles.comp b/29_SpecializationConstants/particles.comp deleted file mode 100644 index 5889af74c..000000000 --- 
a/29_SpecializationConstants/particles.comp +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". -// For conditions of distribution and use, see copyright notice in nabla.h - -#version 430 core - -layout (constant_id = 1) const int PARTICLE_COUNT = 256; -layout (constant_id = 2) const int POS_BUF_IX = 0; -layout (constant_id = 3) const int VEL_BUF_IX = 1; -layout (constant_id = 4) const int BUF_COUNT = 2; - -layout (local_size_x_id = 0) in; - -layout (set = 0, binding = 0, std430) restrict buffer PARTICLE_DATA -{ - vec3 p[PARTICLE_COUNT]; -} data[BUF_COUNT]; -layout (set = 0, binding = 1, std140) uniform UBO -{ - vec3 gravP; - float dt; -} ubo; - -void main() -{ - uint GID = gl_GlobalInvocationID.x; - - vec3 p = data[POS_BUF_IX].p[GID]; - vec3 v = data[VEL_BUF_IX].p[GID]; - - v *= 1.0 - 0.99*ubo.dt; - float d = distance(ubo.gravP,p); - float a = 10000.0 / max(1.0, 0.01*pow(d,1.5)); - v += (ubo.gravP-p)/d * a * ubo.dt; - p += v*ubo.dt; - - data[POS_BUF_IX].p[GID] = p; - data[VEL_BUF_IX].p[GID] = v; -} \ No newline at end of file diff --git a/29_SpecializationConstants/particles.frag b/29_SpecializationConstants/particles.frag deleted file mode 100644 index c03ba9afc..000000000 --- a/29_SpecializationConstants/particles.frag +++ /dev/null @@ -1,12 +0,0 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". -// For conditions of distribution and use, see copyright notice in nabla.h - -#version 430 core - -layout (location = 0) out vec4 Color; - -void main() -{ - Color = vec4(1.0); -} \ No newline at end of file diff --git a/29_SpecializationConstants/particles.vert b/29_SpecializationConstants/particles.vert deleted file mode 100644 index f87486cac..000000000 --- a/29_SpecializationConstants/particles.vert +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. 
-// This file is part of the "Nabla Engine". -// For conditions of distribution and use, see copyright notice in nabla.h - -#version 430 core - -layout (location = 0) in vec3 vPos; - -#include -#include - -layout (set = 0, binding = 0, row_major, std140) uniform UBO -{ - nbl_glsl_SBasicViewParameters params; -} CamData; - -void main() -{ - gl_PointSize = 1; - gl_Position = nbl_glsl_pseudoMul4x4with3x1(CamData.params.MVP, vPos); -} \ No newline at end of file diff --git a/29_SpecializationConstants/pipeline.groovy b/29_SpecializationConstants/pipeline.groovy deleted file mode 100644 index d61a3c808..000000000 --- a/29_SpecializationConstants/pipeline.groovy +++ /dev/null @@ -1,50 +0,0 @@ -import org.DevshGraphicsProgramming.Agent -import org.DevshGraphicsProgramming.BuilderInfo -import org.DevshGraphicsProgramming.IBuilder - -class CSpecializationConstantsBuilder extends IBuilder -{ - public CSpecializationConstantsBuilder(Agent _agent, _info) - { - super(_agent, _info) - } - - @Override - public boolean prepare(Map axisMapping) - { - return true - } - - @Override - public boolean build(Map axisMapping) - { - IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION") - IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE") - - def nameOfBuildDirectory = getNameOfBuildDirectory(buildType) - def nameOfConfig = getNameOfConfig(config) - - agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v") - - return true - } - - @Override - public boolean test(Map axisMapping) - { - return true - } - - @Override - public boolean install(Map axisMapping) - { - return true - } -} - -def create(Agent _agent, _info) -{ - return new CSpecializationConstantsBuilder(_agent, _info) -} - -return this \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 0b3279a48..0c0584ebe 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ 
-64,6 +64,7 @@ if(NBL_BUILD_EXAMPLES) add_subdirectory(26_Blur EXCLUDE_FROM_ALL) add_subdirectory(27_MPMCScheduler EXCLUDE_FROM_ALL) add_subdirectory(28_FFTBloom EXCLUDE_FROM_ALL) + add_subdirectory(29_MeshLoaders EXCLUDE_FROM_ALL) # add_subdirectory(36_CUDAInterop EXCLUDE_FROM_ALL) # Showcase compute pathtracing From 3d898943fb9bd4690aa3b92b7a80f5a61198f0de Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 27 May 2025 16:12:55 +0700 Subject: [PATCH 208/296] fix template accessors --- .../app_resources/testWorkgroup.comp.hlsl | 24 +++++++++---------- .../benchmarkWorkgroup.comp.hlsl | 20 ++++++++-------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl index bda735b44..0a7fde9ba 100644 --- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl @@ -18,12 +18,12 @@ groupshared uint32_t scratch[config_t::SharedScratchElementCount]; struct ScratchProxy { - template + template void get(const uint32_t ix, NBL_REF_ARG(AccessType) value) { value = scratch[ix]; } - template + template void set(const uint32_t ix, const AccessType value) { scratch[ix] = value; @@ -47,18 +47,18 @@ struct DataProxy using dtype_t = vector; static_assert(nbl::hlsl::is_same_v); - template - void get(const uint32_t ix, NBL_REF_ARG(dtype_t) value) + template + void get(const IndexType ix, NBL_REF_ARG(AccessType) value) { const uint32_t workgroupOffset = nbl::hlsl::glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize; - value = vk::RawBufferLoad(pc.inputBufAddress + (workgroupOffset + ix) * sizeof(dtype_t)); + value = vk::RawBufferLoad(pc.inputBufAddress + (workgroupOffset + ix) * sizeof(AccessType)); } - template - void set(const uint32_t ix, const dtype_t value) + template + void set(const IndexType ix, const AccessType value) { const uint32_t workgroupOffset = 
nbl::hlsl::glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize; uint64_t outputBufAddr = vk::RawBufferLoad(pc.outputAddressBufAddress + Binop::BindingIndex * sizeof(uint64_t)); - vk::RawBufferStore(outputBufAddr + sizeof(uint32_t) + sizeof(dtype_t) * (workgroupOffset+ix), value, sizeof(uint32_t)); + vk::RawBufferStore(outputBufAddr + sizeof(uint32_t) + sizeof(AccessType) * (workgroupOffset+ix), value, sizeof(uint32_t)); } void workgroupExecutionAndMemoryBarrier() @@ -76,13 +76,13 @@ struct PreloadedDataProxy NBL_CONSTEXPR_STATIC_INLINE uint32_t PreloadedDataCount = Config::VirtualWorkgroupSize / Config::WorkgroupSize; - template - void get(const uint32_t ix, NBL_REF_ARG(dtype_t) value) + template + void get(const IndexType ix, NBL_REF_ARG(AccessType) value) { value = preloaded[(ix-nbl::hlsl::workgroup::SubgroupContiguousIndex())>>Config::WorkgroupSizeLog2]; } - template - void set(const uint32_t ix, const dtype_t value) + template + void set(const IndexType ix, const AccessType value) { preloaded[(ix-nbl::hlsl::workgroup::SubgroupContiguousIndex())>>Config::WorkgroupSizeLog2] = value; } diff --git a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl index bfbe30ac9..e44bf4f06 100644 --- a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl +++ b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl @@ -18,12 +18,12 @@ groupshared uint32_t scratch[config_t::SharedScratchElementCount]; struct ScratchProxy { - template + template void get(const IndexType ix, NBL_REF_ARG(AccessType) value) { value = scratch[ix]; } - template + template void set(const IndexType ix, const AccessType value) { scratch[ix] = value; @@ -49,14 +49,14 @@ struct DataProxy static_assert(nbl::hlsl::is_same_v); // we don't want to write/read storage multiple times in loop; doesn't seem optimized out in generated spirv - template - void get(const uint32_t ix, NBL_REF_ARG(dtype_t) value) + template + 
void get(const IndexType ix, NBL_REF_ARG(dtype_t) value) { // value = inputValue[ix]; value = nbl::hlsl::promote(globalIndex()); } - template - void set(const uint32_t ix, const dtype_t value) + template + void set(const IndexType ix, const dtype_t value) { // output[Binop::BindingIndex].template Store(sizeof(uint32_t) + sizeof(type_t) * ix, value); } @@ -76,13 +76,13 @@ struct PreloadedDataProxy NBL_CONSTEXPR_STATIC_INLINE uint32_t PreloadedDataCount = Config::VirtualWorkgroupSize / Config::WorkgroupSize; - template - void get(const uint32_t ix, NBL_REF_ARG(dtype_t) value) + template + void get(const IndexType ix, NBL_REF_ARG(AccessType) value) { value = preloaded[(ix-nbl::hlsl::workgroup::SubgroupContiguousIndex())>>Config::WorkgroupSizeLog2]; } - template - void set(const uint32_t ix, const dtype_t value) + template + void set(const IndexType ix, const AccessType value) { preloaded[(ix-nbl::hlsl::workgroup::SubgroupContiguousIndex())>>Config::WorkgroupSizeLog2] = value; } From 3d63ed732838c3073dfb7993d3eb1305fb5882be Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 27 May 2025 16:30:55 +0700 Subject: [PATCH 209/296] add accessor index template type --- 28_FFTBloom/app_resources/fft_common.hlsl | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/28_FFTBloom/app_resources/fft_common.hlsl b/28_FFTBloom/app_resources/fft_common.hlsl index 41f8821cc..9f2be1432 100644 --- a/28_FFTBloom/app_resources/fft_common.hlsl +++ b/28_FFTBloom/app_resources/fft_common.hlsl @@ -5,13 +5,13 @@ groupshared uint32_t sharedmem[FFTParameters::SharedMemoryDWORDs]; struct SharedMemoryAccessor { - template + template void set(IndexType idx, AccessType value) { sharedmem[idx] = value; } - template + template void get(IndexType idx, NBL_REF_ARG(AccessType) value) { value = sharedmem[idx]; @@ -36,14 +36,14 @@ struct PreloadedAccessorCommonBase struct PreloadedAccessorBase : PreloadedAccessorCommonBase { - template - void set(uint32_t idx, AccessType 
value) + template + void set(IndexType idx, AccessType value) { preloaded[idx >> WorkgroupSizeLog2] = value; } - template - void get(uint32_t idx, NBL_REF_ARG(AccessType) value) + template + void get(IndexType idx, NBL_REF_ARG(AccessType) value) { value = preloaded[idx >> WorkgroupSizeLog2]; } @@ -54,14 +54,14 @@ struct PreloadedAccessorBase : PreloadedAccessorCommonBase // In the case for preloading all channels at once we make it stateful so we track which channel we're running FFT on struct MultiChannelPreloadedAccessorBase : PreloadedAccessorCommonBase { - template - void set(uint32_t idx, AccessType value) + template + void set(IndexType idx, AccessType value) { preloaded[currentChannel][idx >> WorkgroupSizeLog2] = value; } - template - void get(uint32_t idx, NBL_REF_ARG(AccessType) value) + template + void get(IndexType idx, NBL_REF_ARG(AccessType) value) { value = preloaded[currentChannel][idx >> WorkgroupSizeLog2]; } From 1100876a9901f66adf362c17fb85ff23e6addf27 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 28 May 2025 15:06:52 +0700 Subject: [PATCH 210/296] limit workgroup count --- 23_Arithmetic2UnitTest/main.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp index 2daa772ae..176ef993e 100644 --- a/23_Arithmetic2UnitTest/main.cpp +++ b/23_Arithmetic2UnitTest/main.cpp @@ -365,7 +365,9 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu { itemsPerWG = workgroupSize; workgroupCount = elementCount / (itemsPerWG * itemsPerInvoc); - } + } + workgroupCount = min(workgroupCount, m_physicalDevice->getLimits().maxComputeWorkGroupCount[0]); + cmdbuf->begin(IGPUCommandBuffer::USAGE::NONE); cmdbuf->bindComputePipeline(pipeline.get()); cmdbuf->pushConstants(pipelineLayout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(PushConstantData), &pc); From f202ef563249c172d4a6c699379c6793ae939863 Mon Sep 17 00:00:00 2001 From: keptsecret Date: 
Thu, 29 May 2025 17:29:00 +0700 Subject: [PATCH 211/296] utility func to get items per wg --- 23_Arithmetic2UnitTest/main.cpp | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp index 176ef993e..73e6a144e 100644 --- a/23_Arithmetic2UnitTest/main.cpp +++ b/23_Arithmetic2UnitTest/main.cpp @@ -214,7 +214,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, itemsPerInvocation) && passed; logTestOutcome(passed, workgroupSize); - const uint32_t itemsPerWG = workgroupSize <= subgroupSize ? workgroupSize * itemsPerInvocation : itemsPerInvocation * max(workgroupSize >> subgroupSizeLog2, subgroupSize) << subgroupSizeLog2; // TODO use Config somehow + const uint32_t itemsPerWG = calculateItemsPerWorkgroup(workgroupSize, subgroupSize, itemsPerInvocation); m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG); passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG, itemsPerInvocation) && passed; logTestOutcome(passed, itemsPerWG); @@ -267,6 +267,27 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu } } + // reflects calculations in workgroup2::ArithmeticConfiguration + uint32_t calculateItemsPerWorkgroup(const uint32_t workgroupSize, const uint32_t subgroupSize, const uint32_t itemsPerInvocation) + { + if (workgroupSize <= subgroupSize) + return workgroupSize * itemsPerInvocation; + + const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize); + const uint8_t workgroupSizeLog2 = hlsl::findMSB(workgroupSize); + + const uint16_t levels = (workgroupSizeLog2 == subgroupSizeLog2) ? 1 : + (workgroupSizeLog2 > subgroupSizeLog2 * 2 + 2) ? 
3 : 2; + + const uint16_t itemsPerInvocationProductLog2 = max(workgroupSizeLog2 - subgroupSizeLog2 * levels, 0); + uint16_t itemsPerInvocation1 = (levels == 3) ? min(itemsPerInvocationProductLog2, 2) : itemsPerInvocationProductLog2; + itemsPerInvocation1 = uint16_t(1u) << itemsPerInvocation1; + + uint32_t virtualWorkgroupSize = 1u << max(subgroupSizeLog2 * levels, workgroupSizeLog2); + + return itemsPerInvocation * virtualWorkgroupSize; + } + // create pipeline (specialized every test) [TODO: turn into a future/async] smart_refctd_ptr createPipeline(const ICPUShader* overridenUnspecialized, const uint8_t subgroupSizeLog2) { From 93b78108b433cfb85407c5f6816adc4c58b0fb7b Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 30 May 2025 15:56:16 +0700 Subject: [PATCH 212/296] added check for vk spec requirement --- 23_Arithmetic2UnitTest/main.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp index 73e6a144e..b172d79e7 100644 --- a/23_Arithmetic2UnitTest/main.cpp +++ b/23_Arithmetic2UnitTest/main.cpp @@ -193,11 +193,15 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations; const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize; const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize; + const auto MaxComputeWorkgroupSubgroups = m_physicalDevice->getLimits().maxComputeWorkgroupSubgroups; for (auto subgroupSize=MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u) { const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize); for (uint32_t workgroupSize = subgroupSize; workgroupSize <= MaxWorkgroupSize; workgroupSize *= 2u) { + if (workgroupSize > subgroupSize * MaxComputeWorkgroupSubgroups) + continue; // vk spec requirement: 
https://vulkan.lunarg.com/doc/view/1.4.304.0/windows/1.4-extensions/vkspec.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02756 + // make sure renderdoc captures everything for debugging m_api->startCapture(); m_logger->log("Testing Workgroup Size %u with Subgroup Size %u", ILogger::ELL_INFO, workgroupSize, subgroupSize); From 3a3aaa9fce04cda7726170e2128124d466252a27 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 2 Jun 2025 11:31:03 +0700 Subject: [PATCH 213/296] removed maxComputeWorkgroupSubgroups*subgroupsize check --- 23_Arithmetic2UnitTest/main.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp index b172d79e7..73e6a144e 100644 --- a/23_Arithmetic2UnitTest/main.cpp +++ b/23_Arithmetic2UnitTest/main.cpp @@ -193,15 +193,11 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations; const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize; const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize; - const auto MaxComputeWorkgroupSubgroups = m_physicalDevice->getLimits().maxComputeWorkgroupSubgroups; for (auto subgroupSize=MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u) { const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize); for (uint32_t workgroupSize = subgroupSize; workgroupSize <= MaxWorkgroupSize; workgroupSize *= 2u) { - if (workgroupSize > subgroupSize * MaxComputeWorkgroupSubgroups) - continue; // vk spec requirement: https://vulkan.lunarg.com/doc/view/1.4.304.0/windows/1.4-extensions/vkspec.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02756 - // make sure renderdoc captures everything for debugging m_api->startCapture(); m_logger->log("Testing Workgroup Size %u with Subgroup Size %u", ILogger::ELL_INFO, workgroupSize, subgroupSize); From fef1cd5f1502ce9cf356f6cfd1045ee9bfb6bd21 Mon Sep 17 00:00:00 
2001 From: devsh Date: Tue, 3 Jun 2025 14:59:19 +0200 Subject: [PATCH 214/296] first outline --- 29_MeshLoaders/CMakeLists.txt | 36 +- 29_MeshLoaders/main.cpp | 1603 ++++++++++++++------------------- 2 files changed, 723 insertions(+), 916 deletions(-) diff --git a/29_MeshLoaders/CMakeLists.txt b/29_MeshLoaders/CMakeLists.txt index a476b6203..07b0fd396 100644 --- a/29_MeshLoaders/CMakeLists.txt +++ b/29_MeshLoaders/CMakeLists.txt @@ -1,7 +1,37 @@ - include(common RESULT_VARIABLE RES) if(NOT RES) - message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory") + message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory") endif() -nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") \ No newline at end of file +if(NBL_BUILD_IMGUI) + set(NBL_INCLUDE_SERACH_DIRECTORIES + "${CMAKE_CURRENT_SOURCE_DIR}/include" + ) + + list(APPEND NBL_LIBRARIES + imtestengine + "${NBL_EXT_IMGUI_UI_LIB}" + ) + + nbl_create_executable_project("" "" "${NBL_INCLUDE_SERACH_DIRECTORIES}" "${NBL_LIBRARIES}" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") + + if(NBL_EMBED_BUILTIN_RESOURCES) + set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData) + set(RESOURCE_DIR "app_resources") + + get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE) + + file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*") + foreach(RES_FILE ${BUILTIN_RESOURCE_FILES}) + LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}") + endforeach() + + ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" 
"${_OUTPUT_DIRECTORY_SOURCE_}") + + LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_}) + endif() +endif() + + diff --git a/29_MeshLoaders/main.cpp b/29_MeshLoaders/main.cpp index 968f7c42e..feb52936a 100644 --- a/29_MeshLoaders/main.cpp +++ b/29_MeshLoaders/main.cpp @@ -2,139 +2,101 @@ // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#include "common.hpp" -#include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h" -#include "nbl/builtin/hlsl/indirect_commands.hlsl" +#include +#include "nbl/asset/utils/CGeometryCreator.h" +#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" +#include "SimpleWindowedApplication.hpp" -class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication -{ - using device_base_t = examples::SimpleWindowedApplication; - using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; - using clock_t = std::chrono::steady_clock; - - constexpr static inline uint32_t WIN_W = 1280, WIN_H = 720; - constexpr static inline uint32_t MaxFramesInFlight = 3u; - constexpr static inline uint8_t MaxUITextureCount = 1u; - constexpr static inline uint32_t NumberOfProceduralGeometries = 5; - - static constexpr const char* s_lightTypeNames[E_LIGHT_TYPE::ELT_COUNT] = { - "Directional", - "Point", - "Spot" - }; - - struct ShaderBindingTable - { - SBufferRange raygenGroupRange; - SBufferRange hitGroupsRange; - uint32_t hitGroupsStride; - SBufferRange missGroupsRange; - uint32_t missGroupsStride; - SBufferRange callableGroupsRange; - uint32_t callableGroupsStride; - }; - - -public: - inline RaytracingPipelineApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) - : IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) - { - } 
+#include "InputSystem.hpp" +#include "CEventCallback.hpp" - inline SPhysicalDeviceFeatures getRequiredDeviceFeatures() const override - { - auto retval = device_base_t::getRequiredDeviceFeatures(); - retval.rayTracingPipeline = true; - retval.accelerationStructure = true; - retval.rayQuery = true; - return retval; - } - - inline SPhysicalDeviceFeatures getPreferredDeviceFeatures() const override - { - auto retval = device_base_t::getPreferredDeviceFeatures(); - retval.accelerationStructureHostCommands = true; - return retval; - } +#include "CCamera.hpp" - inline core::vector getSurfaces() const override - { - if (!m_surface) - { - { - auto windowCallback = core::make_smart_refctd_ptr(smart_refctd_ptr(m_inputSystem), smart_refctd_ptr(m_logger)); - IWindow::SCreationParams params = {}; - params.callback = core::make_smart_refctd_ptr(); - params.width = WIN_W; - params.height = WIN_H; - params.x = 32; - params.y = 32; - params.flags = ui::IWindow::ECF_HIDDEN | IWindow::ECF_BORDERLESS | IWindow::ECF_RESIZABLE; - params.windowCaption = "RaytracingPipelineApp"; - params.callback = windowCallback; - const_cast&>(m_window) = m_winMgr->createWindow(std::move(params)); - } +#include +#include - auto surface = CSurfaceVulkanWin32::create(smart_refctd_ptr(m_api), smart_refctd_ptr_static_cast(m_window)); - const_cast&>(m_surface) = CSimpleResizeSurface::create(std::move(surface)); - } +using namespace nbl; +using namespace core; +using namespace hlsl; +using namespace system; +using namespace asset; +using namespace ui; +using namespace video; - if (m_surface) - return { {m_surface->getSurface()/*,EQF_NONE*/} }; - return {}; - } +class MeshLoadersApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication +{ + using device_base_t = examples::SimpleWindowedApplication; + using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; - // so that we can use the same queue for 
asset converter and rendering - inline core::vector getQueueRequirements() const override - { - auto reqs = device_base_t::getQueueRequirements(); - reqs.front().requiredFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; - return reqs; - } + constexpr static inline uint32_t WIN_W = 1280, WIN_H = 720; + constexpr static inline uint32_t MaxFramesInFlight = 3u; + constexpr static inline uint8_t MaxUITextureCount = 1u; - inline bool onAppInitialized(smart_refctd_ptr&& system) override - { - m_inputSystem = make_smart_refctd_ptr(logger_opt_smart_ptr(smart_refctd_ptr(m_logger))); - if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) - return false; - - if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system))) - return false; + public: + inline MeshLoadersApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) + : IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) + { + } - smart_refctd_ptr shaderReadCache = nullptr; - smart_refctd_ptr shaderWriteCache = core::make_smart_refctd_ptr(); - auto shaderCachePath = localOutputCWD / "main_pipeline_shader_cache.bin"; + inline SPhysicalDeviceFeatures getPreferredDeviceFeatures() const override + { + auto retval = device_base_t::getPreferredDeviceFeatures(); + retval.accelerationStructure = true; + retval.rayQuery = true; + return retval; + } + inline core::vector getSurfaces() const override { - core::smart_refctd_ptr shaderReadCacheFile; + if (!m_surface) { - system::ISystem::future_t> future; - m_system->createFile(future, shaderCachePath.c_str(), system::IFile::ECF_READ); - if (future.wait()) { - future.acquire().move_into(shaderReadCacheFile); - if (shaderReadCacheFile) - { - const size_t size = shaderReadCacheFile->getSize(); - if (size > 0ull) - { - std::vector contents(size); - system::IFile::success_t succ; - shaderReadCacheFile->read(succ, contents.data(), 0, size); - if (succ) - shaderReadCache = 
IShaderCompiler::CCache::deserialize(contents); - } - } + auto windowCallback = core::make_smart_refctd_ptr(smart_refctd_ptr(m_inputSystem), smart_refctd_ptr(m_logger)); + IWindow::SCreationParams params = {}; + params.callback = core::make_smart_refctd_ptr(); + params.width = WIN_W; + params.height = WIN_H; + params.x = 32; + params.y = 32; + params.flags = ui::IWindow::ECF_HIDDEN | IWindow::ECF_BORDERLESS | IWindow::ECF_RESIZABLE; + params.windowCaption = "MeshLoadersApp"; + params.callback = windowCallback; + const_cast&>(m_window) = m_winMgr->createWindow(std::move(params)); } - else - m_logger->log("Failed Openning Shader Cache File.", ILogger::ELL_ERROR); + + auto surface = CSurfaceVulkanWin32::create(smart_refctd_ptr(m_api), smart_refctd_ptr_static_cast(m_window)); + const_cast&>(m_surface) = CSimpleResizeSurface::create(std::move(surface)); } + if (m_surface) + return { {m_surface->getSurface()/*,EQF_NONE*/} }; + + return {}; } + // so that we can use the same queue for asset converter and rendering + inline core::vector getQueueRequirements() const override + { + auto reqs = device_base_t::getQueueRequirements(); + reqs.front().requiredFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; + reqs.front().requiredFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; + return reqs; + } + + inline bool onAppInitialized(smart_refctd_ptr&& system) override + { + m_inputSystem = make_smart_refctd_ptr(logger_opt_smart_ptr(smart_refctd_ptr(m_logger))); + + if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + + if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + +#if 0 // Load Custom Shader auto loadCompileAndCreateShader = [&](const std::string& relPath) -> smart_refctd_ptr { @@ -167,82 +129,57 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, const auto pointLightCallShader = loadCompileAndCreateShader("app_resources/light_point.rcall.hlsl"); const auto spotLightCallShader = 
loadCompileAndCreateShader("app_resources/light_spot.rcall.hlsl"); const auto fragmentShader = loadCompileAndCreateShader("app_resources/present.frag.hlsl"); +#endif - core::smart_refctd_ptr shaderWriteCacheFile; - { - system::ISystem::future_t> future; - m_system->deleteFile(shaderCachePath); // temp solution instead of trimming, to make sure we won't have corrupted json - m_system->createFile(future, shaderCachePath.c_str(), system::IFile::ECF_WRITE); - if (future.wait()) - { - future.acquire().move_into(shaderWriteCacheFile); - if (shaderWriteCacheFile) - { - auto serializedCache = shaderWriteCache->serialize(); - if (shaderWriteCacheFile) - { - system::IFile::success_t succ; - shaderWriteCacheFile->write(succ, serializedCache->getPointer(), 0, serializedCache->getSize()); - if (!succ) - m_logger->log("Failed Writing To Shader Cache File.", ILogger::ELL_ERROR); - } - } - else - m_logger->log("Failed Creating Shader Cache File.", ILogger::ELL_ERROR); - } - else - m_logger->log("Failed Creating Shader Cache File.", ILogger::ELL_ERROR); - } - - m_semaphore = m_device->createSemaphore(m_realFrameIx); - if (!m_semaphore) - return logFail("Failed to Create a Semaphore!"); + m_semaphore = m_device->createSemaphore(m_realFrameIx); + if (!m_semaphore) + return logFail("Failed to Create a Semaphore!"); - auto gQueue = getGraphicsQueue(); + auto gQueue = getGraphicsQueue(); - // Create renderpass and init surface - nbl::video::IGPURenderpass* renderpass; - { - ISwapchain::SCreationParams swapchainParams = { .surface = smart_refctd_ptr(m_surface->getSurface()) }; - if (!swapchainParams.deduceFormat(m_physicalDevice)) - return logFail("Could not choose a Surface Format for the Swapchain!"); - - const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] = + // Create renderpass and init surface + nbl::video::IGPURenderpass* renderpass; { - { - .srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, - .dstSubpass = 0, - .memoryBarrier 
= - { - .srcStageMask = asset::PIPELINE_STAGE_FLAGS::COPY_BIT, - .srcAccessMask = asset::ACCESS_FLAGS::TRANSFER_WRITE_BIT, - .dstStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, - .dstAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT - } - }, - { - .srcSubpass = 0, - .dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, - .memoryBarrier = - { - .srcStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, - .srcAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT - } - }, - IGPURenderpass::SCreationParams::DependenciesEnd - }; + ISwapchain::SCreationParams swapchainParams = { .surface = smart_refctd_ptr(m_surface->getSurface()) }; + if (!swapchainParams.deduceFormat(m_physicalDevice)) + return logFail("Could not choose a Surface Format for the Swapchain!"); - auto scResources = std::make_unique(m_device.get(), swapchainParams.surfaceFormat.format, dependencies); - renderpass = scResources->getRenderpass(); + const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] = + { + { + .srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, + .dstSubpass = 0, + .memoryBarrier = + { + .srcStageMask = asset::PIPELINE_STAGE_FLAGS::COPY_BIT, + .srcAccessMask = asset::ACCESS_FLAGS::TRANSFER_WRITE_BIT, + .dstStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, + .dstAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT + } + }, + { + .srcSubpass = 0, + .dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, + .memoryBarrier = + { + .srcStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, + .srcAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT + } + }, + IGPURenderpass::SCreationParams::DependenciesEnd + }; - if (!renderpass) - return logFail("Failed to create Renderpass!"); + auto scResources = std::make_unique(m_device.get(), swapchainParams.surfaceFormat.format, dependencies); + renderpass 
= scResources->getRenderpass(); - if (!m_surface || !m_surface->init(gQueue, std::move(scResources), swapchainParams.sharedParams)) - return logFail("Could not create Window & Surface or initialize the Surface!"); - } + if (!renderpass) + return logFail("Failed to create Renderpass!"); - auto pool = m_device->createCommandPool(gQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + if (!m_surface || !m_surface->init(gQueue, std::move(scResources), swapchainParams.sharedParams)) + return logFail("Could not create Window & Surface or initialize the Surface!"); + } +#if 0 + auto pool = m_device->createCommandPool(gQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); m_converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} }); @@ -253,11 +190,11 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { m_cmdBufs.data() + i, 1 })) return logFail("Couldn't create Command Buffer!"); } +#endif + m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H); + m_surface->recreateSwapchain(); - m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H); - m_surface->recreateSwapchain(); - - +#if 0 // create output images m_hdrImage = m_device->createImage({ { @@ -600,84 +537,84 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, ImGui::End(); } ); +#endif + // Set Camera + { + core::vectorSIMDf cameraPosition(0, 5, -10); + matrix4SIMD proj = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH( + core::radians(60.0f), + WIN_W / WIN_H, + 0.01f, + 500.0f + ); + m_camera = Camera(cameraPosition, core::vectorSIMDf(0, 0, 0), proj); + } - // Set Camera - { - core::vectorSIMDf cameraPosition(0, 5, -10); - matrix4SIMD proj = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH( - core::radians(60.0f), - WIN_W / WIN_H, - 0.01f, - 500.0f - ); - m_camera = Camera(cameraPosition, 
core::vectorSIMDf(0, 0, 0), proj); - } + m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H); + m_surface->recreateSwapchain(); + m_winMgr->show(m_window.get()); + m_oracle.reportBeginFrameRecord(); + m_camera.mapKeysToWASD(); - m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H); - m_surface->recreateSwapchain(); - m_winMgr->show(m_window.get()); - m_oracle.reportBeginFrameRecord(); - m_camera.mapKeysToWASD(); + return true; + } - return true; - } + bool updateGUIDescriptorSet() + { + // texture atlas, note we don't create info & write pair for the font sampler because UI extension's is immutable and baked into DS layout + static std::array descriptorInfo; + static IGPUDescriptorSet::SWriteDescriptorSet writes[MaxUITextureCount]; - bool updateGUIDescriptorSet() - { - // texture atlas, note we don't create info & write pair for the font sampler because UI extension's is immutable and baked into DS layout - static std::array descriptorInfo; - static IGPUDescriptorSet::SWriteDescriptorSet writes[MaxUITextureCount]; + descriptorInfo[ext::imgui::UI::FontAtlasTexId].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL; + descriptorInfo[ext::imgui::UI::FontAtlasTexId].desc = smart_refctd_ptr(m_ui.manager->getFontAtlasView()); - descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL; - descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].desc = smart_refctd_ptr(m_ui.manager->getFontAtlasView()); + for (uint32_t i = 0; i < descriptorInfo.size(); ++i) + { + writes[i].dstSet = m_ui.descriptorSet.get(); + writes[i].binding = 0u; + writes[i].arrayElement = i; + writes[i].count = 1u; + } + writes[ext::imgui::UI::FontAtlasTexId].info = descriptorInfo.data() + ext::imgui::UI::FontAtlasTexId; - for (uint32_t i = 0; i < descriptorInfo.size(); ++i) - { - writes[i].dstSet = m_ui.descriptorSet.get(); - writes[i].binding = 0u; - writes[i].arrayElement = i; - writes[i].count = 1u; + return 
m_device->updateDescriptorSets(writes, {}); } - writes[nbl::ext::imgui::UI::FontAtlasTexId].info = descriptorInfo.data() + nbl::ext::imgui::UI::FontAtlasTexId; - - return m_device->updateDescriptorSets(writes, {}); - } - inline void workLoopBody() override - { - // framesInFlight: ensuring safe execution of command buffers and acquires, `framesInFlight` only affect semaphore waits, don't use this to index your resources because it can change with swapchain recreation. - const uint32_t framesInFlight = core::min(MaxFramesInFlight, m_surface->getMaxAcquiresInFlight()); - // We block for semaphores for 2 reasons here: - // A) Resource: Can't use resource like a command buffer BEFORE previous use is finished! [MaxFramesInFlight] - // B) Acquire: Can't have more acquires in flight than a certain threshold returned by swapchain or your surface helper class. [MaxAcquiresInFlight] - if (m_realFrameIx >= framesInFlight) + inline void workLoopBody() override { - const ISemaphore::SWaitInfo cbDonePending[] = + // framesInFlight: ensuring safe execution of command buffers and acquires, `framesInFlight` only affect semaphore waits, don't use this to index your resources because it can change with swapchain recreation. + const uint32_t framesInFlight = core::min(MaxFramesInFlight, m_surface->getMaxAcquiresInFlight()); + // We block for semaphores for 2 reasons here: + // A) Resource: Can't use resource like a command buffer BEFORE previous use is finished! [MaxFramesInFlight] + // B) Acquire: Can't have more acquires in flight than a certain threshold returned by swapchain or your surface helper class. 
[MaxAcquiresInFlight] + if (m_realFrameIx >= framesInFlight) { - { - .semaphore = m_semaphore.get(), - .value = m_realFrameIx + 1 - framesInFlight - } - }; - if (m_device->blockForSemaphores(cbDonePending) != ISemaphore::WAIT_RESULT::SUCCESS) - return; - } - const auto resourceIx = m_realFrameIx % MaxFramesInFlight; - - m_api->startCapture(); + const ISemaphore::SWaitInfo cbDonePending[] = + { + { + .semaphore = m_semaphore.get(), + .value = m_realFrameIx + 1 - framesInFlight + } + }; + if (m_device->blockForSemaphores(cbDonePending) != ISemaphore::WAIT_RESULT::SUCCESS) + return; + } + const auto resourceIx = m_realFrameIx % MaxFramesInFlight; - update(); + m_api->startCapture(); - auto queue = getGraphicsQueue(); - auto cmdbuf = m_cmdBufs[resourceIx].get(); +// update(); - if (!keepRunning()) - return; + auto queue = getGraphicsQueue(); + auto cmdbuf = m_cmdBufs[resourceIx].get(); - cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); - cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - cmdbuf->beginDebugMarker("RaytracingPipelineApp Frame"); + if (!keepRunning()) + return; + cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); + cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + cmdbuf->beginDebugMarker("Frame"); +#if 0 const auto viewMatrix = m_camera.getViewMatrix(); const auto projectionMatrix = m_camera.getProjectionMatrix(); const auto viewProjectionMatrix = m_camera.getConcatenatedMatrix(); @@ -686,12 +623,6 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, modelMatrix.setTranslation(nbl::core::vectorSIMDf(0, 0, 0, 0)); modelMatrix.setRotation(quaternion(0, 0, 0)); - core::matrix4SIMD modelViewProjectionMatrix = core::concatenateBFollowedByA(viewProjectionMatrix, modelMatrix); - if (m_cachedModelViewProjectionMatrix != modelViewProjectionMatrix) - { - m_frameAccumulationCounter = 0; - m_cachedModelViewProjectionMatrix = modelViewProjectionMatrix; - } 
core::matrix4SIMD invModelViewProjectionMatrix; modelViewProjectionMatrix.getInverseTransform(invModelViewProjectionMatrix); @@ -824,768 +755,619 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, cmdbuf->endRenderPass(); } +#endif + cmdbuf->endDebugMarker(); + cmdbuf->end(); - cmdbuf->endDebugMarker(); - cmdbuf->end(); - - { - const IQueue::SSubmitInfo::SSemaphoreInfo rendered[] = - { - { - .semaphore = m_semaphore.get(), - .value = ++m_realFrameIx, - .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS - } - }; { + const IQueue::SSubmitInfo::SSemaphoreInfo rendered[] = + { + { + .semaphore = m_semaphore.get(), + .value = ++m_realFrameIx, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS + } + }; { - const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] = { - {.cmdbuf = cmdbuf } - }; + const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] = + { + {.cmdbuf = cmdbuf } + }; - const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = - { - { - .semaphore = m_currentImageAcquire.semaphore, - .value = m_currentImageAcquire.acquireCount, - .stageMask = PIPELINE_STAGE_FLAGS::NONE - } - }; - const IQueue::SSubmitInfo infos[] = - { - { - .waitSemaphores = acquired, - .commandBuffers = commandBuffers, - .signalSemaphores = rendered - } - }; + const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = + { + { + .semaphore = m_currentImageAcquire.semaphore, + .value = m_currentImageAcquire.acquireCount, + .stageMask = PIPELINE_STAGE_FLAGS::NONE + } + }; + const IQueue::SSubmitInfo infos[] = + { + { + .waitSemaphores = acquired, + .commandBuffers = commandBuffers, + .signalSemaphores = rendered + } + }; - updateGUIDescriptorSet(); +// updateGUIDescriptorSet(); - if (queue->submit(infos) != IQueue::RESULT::SUCCESS) - m_realFrameIx--; + if (queue->submit(infos) != IQueue::RESULT::SUCCESS) + m_realFrameIx--; + } } - } - m_window->setCaption("[Nabla Engine] Ray Tracing Pipeline"); - m_surface->present(m_currentImageAcquire.imageIndex, 
rendered); + m_window->setCaption("[Nabla Engine] Ray Tracing Pipeline"); + m_surface->present(m_currentImageAcquire.imageIndex, rendered); + } + m_api->endCapture(); + m_frameAccumulationCounter++; } - m_api->endCapture(); - m_frameAccumulationCounter++; - } - - inline void update() - { - m_camera.setMoveSpeed(m_cameraSetting.moveSpeed); - m_camera.setRotateSpeed(m_cameraSetting.rotateSpeed); - - static std::chrono::microseconds previousEventTimestamp{}; - - m_inputSystem->getDefaultMouse(&m_mouse); - m_inputSystem->getDefaultKeyboard(&m_keyboard); - - auto updatePresentationTimestamp = [&]() - { - m_currentImageAcquire = m_surface->acquireNextImage(); - - m_oracle.reportEndFrameRecord(); - const auto timestamp = m_oracle.getNextPresentationTimeStamp(); - m_oracle.reportBeginFrameRecord(); +#if 0 + inline void update() + { + m_camera.setMoveSpeed(m_cameraSetting.moveSpeed); + m_camera.setRotateSpeed(m_cameraSetting.rotateSpeed); - return timestamp; - }; + static std::chrono::microseconds previousEventTimestamp{}; - const auto nextPresentationTimestamp = updatePresentationTimestamp(); + m_inputSystem->getDefaultMouse(&m_mouse); + m_inputSystem->getDefaultKeyboard(&m_keyboard); - struct - { - std::vector mouse{}; - std::vector keyboard{}; - } capturedEvents; - - m_camera.beginInputProcessing(nextPresentationTimestamp); - { - const auto& io = ImGui::GetIO(); - m_mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void + auto updatePresentationTimestamp = [&]() { - if (!io.WantCaptureMouse) - m_camera.mouseProcess(events); // don't capture the events, only let camera handle them with its impl + m_currentImageAcquire = m_surface->acquireNextImage(); - for (const auto& e : events) // here capture - { - if (e.timeStamp < previousEventTimestamp) - continue; + m_oracle.reportEndFrameRecord(); + const auto timestamp = m_oracle.getNextPresentationTimeStamp(); + m_oracle.reportBeginFrameRecord(); - previousEventTimestamp = e.timeStamp; - 
capturedEvents.mouse.emplace_back(e); + return timestamp; + }; - } - }, m_logger.get()); + const auto nextPresentationTimestamp = updatePresentationTimestamp(); - m_keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void - { - if (!io.WantCaptureKeyboard) - m_camera.keyboardProcess(events); // don't capture the events, only let camera handle them with its impl + struct + { + std::vector mouse{}; + std::vector keyboard{}; + } capturedEvents; - for (const auto& e : events) // here capture + m_camera.beginInputProcessing(nextPresentationTimestamp); + { + const auto& io = ImGui::GetIO(); + m_mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void { - if (e.timeStamp < previousEventTimestamp) - continue; - - previousEventTimestamp = e.timeStamp; - capturedEvents.keyboard.emplace_back(e); - } - }, m_logger.get()); + if (!io.WantCaptureMouse) + m_camera.mouseProcess(events); // don't capture the events, only let camera handle them with its impl - } - m_camera.endInputProcessing(nextPresentationTimestamp); + for (const auto& e : events) // here capture + { + if (e.timeStamp < previousEventTimestamp) + continue; - const core::SRange mouseEvents(capturedEvents.mouse.data(), capturedEvents.mouse.data() + capturedEvents.mouse.size()); - const core::SRange keyboardEvents(capturedEvents.keyboard.data(), capturedEvents.keyboard.data() + capturedEvents.keyboard.size()); - const auto cursorPosition = m_window->getCursorControl()->getPosition(); - const auto mousePosition = float32_t2(cursorPosition.x, cursorPosition.y) - float32_t2(m_window->getX(), m_window->getY()); + previousEventTimestamp = e.timeStamp; + capturedEvents.mouse.emplace_back(e); - const ext::imgui::UI::SUpdateParameters params = - { - .mousePosition = mousePosition, - .displaySize = { m_window->getWidth(), m_window->getHeight() }, - .mouseEvents = mouseEvents, - .keyboardEvents = keyboardEvents - }; + } + }, m_logger.get()); - m_ui.manager->update(params); - } + 
m_keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void + { + if (!io.WantCaptureKeyboard) + m_camera.keyboardProcess(events); // don't capture the events, only let camera handle them with its impl - inline bool keepRunning() override - { - if (m_surface->irrecoverable()) - return false; + for (const auto& e : events) // here capture + { + if (e.timeStamp < previousEventTimestamp) + continue; - return true; - } + previousEventTimestamp = e.timeStamp; + capturedEvents.keyboard.emplace_back(e); + } + }, m_logger.get()); - inline bool onAppTerminated() override - { - return device_base_t::onAppTerminated(); - } + } + m_camera.endInputProcessing(nextPresentationTimestamp); -private: - uint32_t getWorkgroupCount(uint32_t dim, uint32_t size) - { - return (dim + size - 1) / size; - } + const core::SRange mouseEvents(capturedEvents.mouse.data(), capturedEvents.mouse.data() + capturedEvents.mouse.size()); + const core::SRange keyboardEvents(capturedEvents.keyboard.data(), capturedEvents.keyboard.data() + capturedEvents.keyboard.size()); + const auto cursorPosition = m_window->getCursorControl()->getPosition(); + const auto mousePosition = float32_t2(cursorPosition.x, cursorPosition.y) - float32_t2(m_window->getX(), m_window->getY()); - bool createIndirectBuffer() - { - const auto getBufferRangeAddress = [](const SBufferRange& range) + const ext::imgui::UI::SUpdateParameters params = { - return range.buffer->getDeviceAddress() + range.offset; - }; - const auto command = TraceRaysIndirectCommand_t{ - .raygenShaderRecordAddress = getBufferRangeAddress(m_shaderBindingTable.raygenGroupRange), - .raygenShaderRecordSize = m_shaderBindingTable.raygenGroupRange.size, - .missShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.missGroupsRange), - .missShaderBindingTableSize = m_shaderBindingTable.missGroupsRange.size, - .missShaderBindingTableStride = m_shaderBindingTable.missGroupsStride, - .hitShaderBindingTableAddress = 
getBufferRangeAddress(m_shaderBindingTable.hitGroupsRange), - .hitShaderBindingTableSize = m_shaderBindingTable.hitGroupsRange.size, - .hitShaderBindingTableStride = m_shaderBindingTable.hitGroupsStride, - .callableShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.callableGroupsRange), - .callableShaderBindingTableSize = m_shaderBindingTable.callableGroupsRange.size, - .callableShaderBindingTableStride = m_shaderBindingTable.callableGroupsStride, - .width = WIN_W, - .height = WIN_H, - .depth = 1, - }; - IGPUBuffer::SCreationParams params; - params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INDIRECT_BUFFER_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - params.size = sizeof(TraceRaysIndirectCommand_t); - m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = getGraphicsQueue() }, std::move(params), &command).move_into(m_indirectBuffer); - return true; - } - - void calculateRayTracingStackSize(const smart_refctd_ptr& pipeline) - { - const auto raygenStackSize = pipeline->getRaygenStackSize(); - auto getMaxSize = [&](auto ranges, auto valProj) -> uint16_t - { - auto maxValue = 0; - for (const auto& val : ranges) - { - maxValue = std::max(maxValue, std::invoke(valProj, val)); - } - return maxValue; + .mousePosition = mousePosition, + .displaySize = { m_window->getWidth(), m_window->getHeight() }, + .mouseEvents = mouseEvents, + .keyboardEvents = keyboardEvents }; - const auto closestHitStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::closestHit); - const auto anyHitStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::anyHit); - const auto intersectionStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::intersection); - const auto missStackMax = getMaxSize(pipeline->getMissStackSizes(), std::identity{}); - const auto callableStackMax = 
getMaxSize(pipeline->getCallableStackSizes(), std::identity{}); - auto firstDepthStackSizeMax = std::max(closestHitStackMax, missStackMax); - firstDepthStackSizeMax = std::max(firstDepthStackSizeMax, intersectionStackMax + anyHitStackMax); - m_rayTracingStackSize = raygenStackSize + std::max(firstDepthStackSizeMax, callableStackMax); - } - - bool createShaderBindingTable(const smart_refctd_ptr& pipeline) - { - const auto& limits = m_device->getPhysicalDevice()->getLimits(); - const auto handleSize = SPhysicalDeviceLimits::ShaderGroupHandleSize; - const auto handleSizeAligned = nbl::core::alignUp(handleSize, limits.shaderGroupHandleAlignment); - - auto& raygenRange = m_shaderBindingTable.raygenGroupRange; - - auto& hitRange = m_shaderBindingTable.hitGroupsRange; - const auto hitHandles = pipeline->getHitHandles(); - - auto& missRange = m_shaderBindingTable.missGroupsRange; - const auto missHandles = pipeline->getMissHandles(); - - auto& callableRange = m_shaderBindingTable.callableGroupsRange; - const auto callableHandles = pipeline->getCallableHandles(); - - raygenRange = { - .offset = 0, - .size = core::alignUp(handleSizeAligned, limits.shaderGroupBaseAlignment) - }; - - missRange = { - .offset = raygenRange.size, - .size = core::alignUp(missHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment), - }; - m_shaderBindingTable.missGroupsStride = handleSizeAligned; - - hitRange = { - .offset = missRange.offset + missRange.size, - .size = core::alignUp(hitHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment), - }; - m_shaderBindingTable.hitGroupsStride = handleSizeAligned; - - callableRange = { - .offset = hitRange.offset + hitRange.size, - .size = core::alignUp(callableHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment), - }; - m_shaderBindingTable.callableGroupsStride = handleSizeAligned; - - const auto bufferSize = raygenRange.size + missRange.size + hitRange.size + callableRange.size; - - ICPUBuffer::SCreationParams 
cpuBufferParams; - cpuBufferParams.size = bufferSize; - auto cpuBuffer = ICPUBuffer::create(std::move(cpuBufferParams)); - uint8_t* pData = reinterpret_cast(cpuBuffer->getPointer()); - - // copy raygen region - memcpy(pData, &pipeline->getRaygen(), handleSize); - - // copy miss region - uint8_t* pMissData = pData + missRange.offset; - for (const auto& handle : missHandles) - { - memcpy(pMissData, &handle, handleSize); - pMissData += m_shaderBindingTable.missGroupsStride; + m_ui.manager->update(params); } - - // copy hit region - uint8_t* pHitData = pData + hitRange.offset; - for (const auto& handle : hitHandles) +#endif + inline bool keepRunning() override { - memcpy(pHitData, &handle, handleSize); - pHitData += m_shaderBindingTable.hitGroupsStride; - } + if (m_surface->irrecoverable()) + return false; - // copy callable region - uint8_t* pCallableData = pData + callableRange.offset; - for (const auto& handle : callableHandles) - { - memcpy(pCallableData, &handle, handleSize); - pCallableData += m_shaderBindingTable.callableGroupsStride; + return true; } + inline bool onAppTerminated() override { - IGPUBuffer::SCreationParams params; - params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_SHADER_BINDING_TABLE_BIT; - params.size = bufferSize; - m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = getGraphicsQueue() }, std::move(params), pData).move_into(raygenRange.buffer); - missRange.buffer = core::smart_refctd_ptr(raygenRange.buffer); - hitRange.buffer = core::smart_refctd_ptr(raygenRange.buffer); - callableRange.buffer = core::smart_refctd_ptr(raygenRange.buffer); + return device_base_t::onAppTerminated(); } - return true; - } + private: +#if 0 + bool createAccelerationStructuresFromGeometry(const IGeometryCreator* gc) + { + auto queue = getGraphicsQueue(); + // get geometries into ICPUBuffers + auto pool = 
m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + if (!pool) + return logFail("Couldn't create Command Pool for geometry creation!"); + + const auto defaultMaterial = Material{ + .ambient = {0.2, 0.1, 0.1}, + .diffuse = {0.8, 0.3, 0.3}, + .specular = {0.8, 0.8, 0.8}, + .shininess = 1.0f, + .alpha = 1.0f, + }; - bool createAccelerationStructuresFromGeometry(const IGeometryCreator* gc) - { - auto queue = getGraphicsQueue(); - // get geometries into ICPUBuffers - auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - if (!pool) - return logFail("Couldn't create Command Pool for geometry creation!"); - - const auto defaultMaterial = Material{ - .ambient = {0.2, 0.1, 0.1}, - .diffuse = {0.8, 0.3, 0.3}, - .specular = {0.8, 0.8, 0.8}, - .shininess = 1.0f, - .alpha = 1.0f, - }; + auto getTranslationMatrix = [](float32_t x, float32_t y, float32_t z) + { + core::matrix3x4SIMD transform; + transform.setTranslation(nbl::core::vectorSIMDf(x, y, z, 0)); + return transform; + }; - auto getTranslationMatrix = [](float32_t x, float32_t y, float32_t z) - { - core::matrix3x4SIMD transform; - transform.setTranslation(nbl::core::vectorSIMDf(x, y, z, 0)); - return transform; - }; + core::matrix3x4SIMD planeTransform; + planeTransform.setRotation(quaternion::fromAngleAxis(core::radians(-90.0f), vector3df_SIMD{ 1, 0, 0 })); - core::matrix3x4SIMD planeTransform; - planeTransform.setRotation(quaternion::fromAngleAxis(core::radians(-90.0f), vector3df_SIMD{ 1, 0, 0 })); - - // triangles geometries - const auto cpuObjects = std::array{ - ReferenceObjectCpu { - .meta = {.type = OT_RECTANGLE, .name = "Plane Mesh"}, - .data = gc->createRectangleMesh(nbl::core::vector2df_SIMD(10, 10)), - .material = defaultMaterial, - .transform = planeTransform, - }, - ReferenceObjectCpu { - .meta = {.type = OT_CUBE, .name = "Cube Mesh"}, - .data = 
gc->createCubeMesh(nbl::core::vector3df(1, 1, 1)), - .material = defaultMaterial, - .transform = getTranslationMatrix(0, 0.5f, 0), - }, - ReferenceObjectCpu { - .meta = {.type = OT_CUBE, .name = "Cube Mesh 2"}, - .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)), - .material = Material{ - .ambient = {0.1, 0.1, 0.2}, - .diffuse = {0.2, 0.2, 0.8}, - .specular = {0.8, 0.8, 0.8}, - .shininess = 1.0f, + // triangles geometries + const auto cpuObjects = std::array{ + ReferenceObjectCpu { + .meta = {.type = OT_RECTANGLE, .name = "Plane Mesh"}, + .data = gc->createRectangleMesh(nbl::core::vector2df_SIMD(10, 10)), + .material = defaultMaterial, + .transform = planeTransform, }, - .transform = getTranslationMatrix(-5.0f, 1.0f, 0), - }, - ReferenceObjectCpu { - .meta = {.type = OT_CUBE, .name = "Transparent Cube Mesh"}, - .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)), - .material = Material{ - .ambient = {0.1, 0.2, 0.1}, - .diffuse = {0.2, 0.8, 0.2}, - .specular = {0.8, 0.8, 0.8}, - .shininess = 1.0f, - .alpha = 0.2, + ReferenceObjectCpu { + .meta = {.type = OT_CUBE, .name = "Cube Mesh"}, + .data = gc->createCubeMesh(nbl::core::vector3df(1, 1, 1)), + .material = defaultMaterial, + .transform = getTranslationMatrix(0, 0.5f, 0), }, - .transform = getTranslationMatrix(5.0f, 1.0f, 0), - }, - }; - - struct CPUTriBufferBindings - { - nbl::asset::SBufferBinding vertex, index; - }; - std::array cpuTriBuffers; - - for (uint32_t i = 0; i < cpuObjects.size(); i++) - { - const auto& cpuObject = cpuObjects[i]; - - auto vBuffer = smart_refctd_ptr(cpuObject.data.bindings[0].buffer); // no offset - auto vUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | - IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - vBuffer->addUsageFlags(vUsage); - vBuffer->setContentHash(vBuffer->computeContentHash()); - - auto iBuffer = 
smart_refctd_ptr(cpuObject.data.indexBuffer.buffer); // no offset - auto iUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | - IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - - if (cpuObject.data.indexType != EIT_UNKNOWN) - if (iBuffer) - { - iBuffer->addUsageFlags(iUsage); - iBuffer->setContentHash(iBuffer->computeContentHash()); - } + ReferenceObjectCpu { + .meta = {.type = OT_CUBE, .name = "Cube Mesh 2"}, + .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)), + .material = Material{ + .ambient = {0.1, 0.1, 0.2}, + .diffuse = {0.2, 0.2, 0.8}, + .specular = {0.8, 0.8, 0.8}, + .shininess = 1.0f, + }, + .transform = getTranslationMatrix(-5.0f, 1.0f, 0), + }, + ReferenceObjectCpu { + .meta = {.type = OT_CUBE, .name = "Transparent Cube Mesh"}, + .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)), + .material = Material{ + .ambient = {0.1, 0.2, 0.1}, + .diffuse = {0.2, 0.8, 0.2}, + .specular = {0.8, 0.8, 0.8}, + .shininess = 1.0f, + .alpha = 0.2, + }, + .transform = getTranslationMatrix(5.0f, 1.0f, 0), + }, + }; - cpuTriBuffers[i] = { - .vertex = {.offset = 0, .buffer = vBuffer}, - .index = {.offset = 0, .buffer = iBuffer}, + struct CPUTriBufferBindings + { + nbl::asset::SBufferBinding vertex, index; }; + std::array cpuTriBuffers; - } + for (uint32_t i = 0; i < cpuObjects.size(); i++) + { + const auto& cpuObject = cpuObjects[i]; - // procedural geometries - using Aabb = IGPUBottomLevelAccelerationStructure::AABB_t; + auto vBuffer = smart_refctd_ptr(cpuObject.data.bindings[0].buffer); // no offset + auto vUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | + IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + vBuffer->addUsageFlags(vUsage); + 
vBuffer->setContentHash(vBuffer->computeContentHash()); - smart_refctd_ptr cpuProcBuffer; - { - ICPUBuffer::SCreationParams params; - params.size = NumberOfProceduralGeometries * sizeof(Aabb); - cpuProcBuffer = ICPUBuffer::create(std::move(params)); - } + auto iBuffer = smart_refctd_ptr(cpuObject.data.indexBuffer.buffer); // no offset + auto iUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | + IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - core::vector proceduralGeoms; - proceduralGeoms.reserve(NumberOfProceduralGeometries); - auto proceduralGeometries = reinterpret_cast(cpuProcBuffer->getPointer()); - for (int32_t i = 0; i < NumberOfProceduralGeometries; i++) - { - const auto middle_i = NumberOfProceduralGeometries / 2.0; - SProceduralGeomInfo sphere = { - .material = hlsl::_static_cast(Material{ - .ambient = {0.1, 0.05 * i, 0.1}, - .diffuse = {0.3, 0.2 * i, 0.3}, - .specular = {0.8, 0.8, 0.8}, - .shininess = 1.0f, - }), - .center = float32_t3((i - middle_i) * 4.0, 2, 5.0), - .radius = 1, - }; + if (cpuObject.data.indexType != EIT_UNKNOWN) + if (iBuffer) + { + iBuffer->addUsageFlags(iUsage); + iBuffer->setContentHash(iBuffer->computeContentHash()); + } - proceduralGeoms.push_back(sphere); - const auto sphereMin = sphere.center - sphere.radius; - const auto sphereMax = sphere.center + sphere.radius; - proceduralGeometries[i] = { - vector3d(sphereMin.x, sphereMin.y, sphereMin.z), - vector3d(sphereMax.x, sphereMax.y, sphereMax.z) - }; - } + cpuTriBuffers[i] = { + .vertex = {.offset = 0, .buffer = vBuffer}, + .index = {.offset = 0, .buffer = iBuffer}, + }; - { - IGPUBuffer::SCreationParams params; - params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - params.size = proceduralGeoms.size() * 
sizeof(SProceduralGeomInfo); - m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), proceduralGeoms.data()).move_into(m_proceduralGeomInfoBuffer); - } + } - // get ICPUBuffers into ICPUBLAS - // TODO use one BLAS and multiple triangles/aabbs in one - const auto blasCount = std::size(cpuObjects) + 1; - const auto proceduralBlasIdx = std::size(cpuObjects); + // procedural geometries + using Aabb = IGPUBottomLevelAccelerationStructure::AABB_t; - std::array, std::size(cpuObjects)+1u> cpuBlas; - for (uint32_t i = 0; i < blasCount; i++) - { - auto& blas = cpuBlas[i]; - blas = make_smart_refctd_ptr(); + smart_refctd_ptr cpuProcBuffer; + { + ICPUBuffer::SCreationParams params; + params.size = NumberOfProceduralGeometries * sizeof(Aabb); + cpuProcBuffer = ICPUBuffer::create(std::move(params)); + } - if (i == proceduralBlasIdx) + core::vector proceduralGeoms; + proceduralGeoms.reserve(NumberOfProceduralGeometries); + auto proceduralGeometries = reinterpret_cast(cpuProcBuffer->getPointer()); + for (int32_t i = 0; i < NumberOfProceduralGeometries; i++) { - auto aabbs = make_refctd_dynamic_array>>(1u); - auto primitiveCounts = make_refctd_dynamic_array>(1u); + const auto middle_i = NumberOfProceduralGeometries / 2.0; + SProceduralGeomInfo sphere = { + .material = hlsl::_static_cast(Material{ + .ambient = {0.1, 0.05 * i, 0.1}, + .diffuse = {0.3, 0.2 * i, 0.3}, + .specular = {0.8, 0.8, 0.8}, + .shininess = 1.0f, + }), + .center = float32_t3((i - middle_i) * 4.0, 2, 5.0), + .radius = 1, + }; - auto& aabb = aabbs->front(); - auto& primCount = primitiveCounts->front(); - - primCount = NumberOfProceduralGeometries; - aabb.data = { .offset = 0, .buffer = cpuProcBuffer }; - aabb.stride = sizeof(IGPUBottomLevelAccelerationStructure::AABB_t); - aabb.geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; // only allow opaque for now + proceduralGeoms.push_back(sphere); + const auto sphereMin = sphere.center - 
sphere.radius; + const auto sphereMax = sphere.center + sphere.radius; + proceduralGeometries[i] = { + vector3d(sphereMin.x, sphereMin.y, sphereMin.z), + vector3d(sphereMax.x, sphereMax.y, sphereMax.z) + }; + } - blas->setGeometries(std::move(aabbs), std::move(primitiveCounts)); + { + IGPUBuffer::SCreationParams params; + params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + params.size = proceduralGeoms.size() * sizeof(SProceduralGeomInfo); + m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), proceduralGeoms.data()).move_into(m_proceduralGeomInfoBuffer); } - else + + // get ICPUBuffers into ICPUBLAS + // TODO use one BLAS and multiple triangles/aabbs in one + const auto blasCount = std::size(cpuObjects) + 1; + const auto proceduralBlasIdx = std::size(cpuObjects); + + std::array, std::size(cpuObjects)+1u> cpuBlas; + for (uint32_t i = 0; i < blasCount; i++) { - auto triangles = make_refctd_dynamic_array>>(1u); - auto primitiveCounts = make_refctd_dynamic_array>(1u); + auto& blas = cpuBlas[i]; + blas = make_smart_refctd_ptr(); - auto& tri = triangles->front(); - auto& primCount = primitiveCounts->front(); - const auto& geom = cpuObjects[i]; - const auto& cpuBuf = cpuTriBuffers[i]; + if (i == proceduralBlasIdx) + { + auto aabbs = make_refctd_dynamic_array>>(1u); + auto primitiveCounts = make_refctd_dynamic_array>(1u); - const bool useIndex = geom.data.indexType != EIT_UNKNOWN; - const uint32_t vertexStride = geom.data.inputParams.bindings[0].stride; - const uint32_t numVertices = cpuBuf.vertex.buffer->getSize() / vertexStride; + auto& aabb = aabbs->front(); + auto& primCount = primitiveCounts->front(); + + primCount = NumberOfProceduralGeometries; + aabb.data = { .offset = 0, .buffer = cpuProcBuffer }; + aabb.stride = sizeof(IGPUBottomLevelAccelerationStructure::AABB_t); + aabb.geometryFlags = 
IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; // only allow opaque for now - if (useIndex) - primCount = geom.data.indexCount / 3; + blas->setGeometries(std::move(aabbs), std::move(primitiveCounts)); + } else - primCount = numVertices / 3; - - tri.vertexData[0] = cpuBuf.vertex; - tri.indexData = useIndex ? cpuBuf.index : cpuBuf.vertex; - tri.maxVertex = numVertices - 1; - tri.vertexStride = vertexStride; - tri.vertexFormat = EF_R32G32B32_SFLOAT; - tri.indexType = geom.data.indexType; - tri.geometryFlags = geom.material.isTransparent() ? - IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::NO_DUPLICATE_ANY_HIT_INVOCATION_BIT : - IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; - - blas->setGeometries(std::move(triangles), std::move(primitiveCounts)); - } + { + auto triangles = make_refctd_dynamic_array>>(1u); + auto primitiveCounts = make_refctd_dynamic_array>(1u); + + auto& tri = triangles->front(); + auto& primCount = primitiveCounts->front(); + const auto& geom = cpuObjects[i]; + const auto& cpuBuf = cpuTriBuffers[i]; + + const bool useIndex = geom.data.indexType != EIT_UNKNOWN; + const uint32_t vertexStride = geom.data.inputParams.bindings[0].stride; + const uint32_t numVertices = cpuBuf.vertex.buffer->getSize() / vertexStride; + + if (useIndex) + primCount = geom.data.indexCount / 3; + else + primCount = numVertices / 3; + + tri.vertexData[0] = cpuBuf.vertex; + tri.indexData = useIndex ? cpuBuf.index : cpuBuf.vertex; + tri.maxVertex = numVertices - 1; + tri.vertexStride = vertexStride; + tri.vertexFormat = EF_R32G32B32_SFLOAT; + tri.indexType = geom.data.indexType; + tri.geometryFlags = geom.material.isTransparent() ? 
+ IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::NO_DUPLICATE_ANY_HIT_INVOCATION_BIT : + IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; + + blas->setGeometries(std::move(triangles), std::move(primitiveCounts)); + } - auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT; - if (i == proceduralBlasIdx) - blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT; + auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT; + if (i == proceduralBlasIdx) + blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT; - blas->setBuildFlags(blasFlags); - blas->setContentHash(blas->computeContentHash()); - } + blas->setBuildFlags(blasFlags); + blas->setContentHash(blas->computeContentHash()); + } - auto geomInfoBuffer = ICPUBuffer::create({ std::size(cpuObjects) * sizeof(STriangleGeomInfo) }); - STriangleGeomInfo* geomInfos = reinterpret_cast(geomInfoBuffer->getPointer()); + auto geomInfoBuffer = ICPUBuffer::create({ std::size(cpuObjects) * sizeof(STriangleGeomInfo) }); + STriangleGeomInfo* geomInfos = reinterpret_cast(geomInfoBuffer->getPointer()); - // get ICPUBLAS into ICPUTLAS - auto geomInstances = make_refctd_dynamic_array>(blasCount); - { - uint32_t i = 0; - for (auto instance = geomInstances->begin(); instance != geomInstances->end(); instance++, i++) + // get ICPUBLAS into ICPUTLAS + auto geomInstances = make_refctd_dynamic_array>(blasCount); { - const auto isProceduralInstance = i == proceduralBlasIdx; - ICPUTopLevelAccelerationStructure::StaticInstance inst; - inst.base.blas = cpuBlas[i]; - inst.base.flags = static_cast(IGPUTopLevelAccelerationStructure::INSTANCE_FLAGS::TRIANGLE_FACING_CULL_DISABLE_BIT); - inst.base.instanceCustomIndex = i; - 
inst.base.instanceShaderBindingTableRecordOffset = isProceduralInstance ? 2 : 0;; - inst.base.mask = 0xFF; - inst.transform = isProceduralInstance ? matrix3x4SIMD() : cpuObjects[i].transform; - - instance->instance = inst; + uint32_t i = 0; + for (auto instance = geomInstances->begin(); instance != geomInstances->end(); instance++, i++) + { + const auto isProceduralInstance = i == proceduralBlasIdx; + ICPUTopLevelAccelerationStructure::StaticInstance inst; + inst.base.blas = cpuBlas[i]; + inst.base.flags = static_cast(IGPUTopLevelAccelerationStructure::INSTANCE_FLAGS::TRIANGLE_FACING_CULL_DISABLE_BIT); + inst.base.instanceCustomIndex = i; + inst.base.instanceShaderBindingTableRecordOffset = isProceduralInstance ? 2 : 0;; + inst.base.mask = 0xFF; + inst.transform = isProceduralInstance ? matrix3x4SIMD() : cpuObjects[i].transform; + + instance->instance = inst; + } } - } - auto cpuTlas = make_smart_refctd_ptr(); - cpuTlas->setInstances(std::move(geomInstances)); - cpuTlas->setBuildFlags(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT); + auto cpuTlas = make_smart_refctd_ptr(); + cpuTlas->setInstances(std::move(geomInstances)); + cpuTlas->setBuildFlags(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT); - // convert with asset converter - smart_refctd_ptr converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} }); - struct MyInputs : CAssetConverter::SInputs - { - // For the GPU Buffers to be directly writeable and so that we don't need a Transfer Queue submit at all - inline uint32_t constrainMemoryTypeBits(const size_t groupCopyID, const IAsset* canonicalAsset, const blake3_hash_t& contentHash, const IDeviceMemoryBacked* memoryBacked) const override + // convert with asset converter + smart_refctd_ptr converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} }); + struct MyInputs : CAssetConverter::SInputs { - assert(memoryBacked); - return memoryBacked->getObjectType() != 
IDeviceMemoryBacked::EOT_BUFFER ? (~0u) : rebarMemoryTypes; - } - - uint32_t rebarMemoryTypes; - } inputs = {}; - inputs.logger = m_logger.get(); - inputs.rebarMemoryTypes = m_physicalDevice->getDirectVRAMAccessMemoryTypeBits(); - // the allocator needs to be overriden to hand out memory ranges which have already been mapped so that the ReBAR fast-path can kick in - // (multiple buffers can be bound to same memory, but memory can only be mapped once at one place, so Asset Converter can't do it) - struct MyAllocator final : public IDeviceMemoryAllocator - { - ILogicalDevice* getDeviceForAllocations() const override { return device; } + // For the GPU Buffers to be directly writeable and so that we don't need a Transfer Queue submit at all + inline uint32_t constrainMemoryTypeBits(const size_t groupCopyID, const IAsset* canonicalAsset, const blake3_hash_t& contentHash, const IDeviceMemoryBacked* memoryBacked) const override + { + assert(memoryBacked); + return memoryBacked->getObjectType() != IDeviceMemoryBacked::EOT_BUFFER ? 
(~0u) : rebarMemoryTypes; + } - SAllocation allocate(const SAllocateInfo& info) override + uint32_t rebarMemoryTypes; + } inputs = {}; + inputs.logger = m_logger.get(); + inputs.rebarMemoryTypes = m_physicalDevice->getDirectVRAMAccessMemoryTypeBits(); + // the allocator needs to be overriden to hand out memory ranges which have already been mapped so that the ReBAR fast-path can kick in + // (multiple buffers can be bound to same memory, but memory can only be mapped once at one place, so Asset Converter can't do it) + struct MyAllocator final : public IDeviceMemoryAllocator { - auto retval = device->allocate(info); - // map what is mappable by default so ReBAR checks succeed - if (retval.isValid() && retval.memory->isMappable()) - retval.memory->map({ .offset = 0,.length = info.size }); - return retval; - } + ILogicalDevice* getDeviceForAllocations() const override { return device; } - ILogicalDevice* device; - } myalloc; - myalloc.device = m_device.get(); - inputs.allocator = &myalloc; + SAllocation allocate(const SAllocateInfo& info) override + { + auto retval = device->allocate(info); + // map what is mappable by default so ReBAR checks succeed + if (retval.isValid() && retval.memory->isMappable()) + retval.memory->map({ .offset = 0,.length = info.size }); + return retval; + } - std::array tmpTlas; - std::array tmpBuffers; - { - tmpTlas[0] = cpuTlas.get(); - for (uint32_t i = 0; i < cpuObjects.size(); i++) + ILogicalDevice* device; + } myalloc; + myalloc.device = m_device.get(); + inputs.allocator = &myalloc; + + std::array tmpTlas; + std::array tmpBuffers; { - tmpBuffers[2 * i + 0] = cpuTriBuffers[i].vertex.buffer.get(); - tmpBuffers[2 * i + 1] = cpuTriBuffers[i].index.buffer.get(); - } - tmpBuffers[2 * proceduralBlasIdx] = cpuProcBuffer.get(); + tmpTlas[0] = cpuTlas.get(); + for (uint32_t i = 0; i < cpuObjects.size(); i++) + { + tmpBuffers[2 * i + 0] = cpuTriBuffers[i].vertex.buffer.get(); + tmpBuffers[2 * i + 1] = cpuTriBuffers[i].index.buffer.get(); + } + 
tmpBuffers[2 * proceduralBlasIdx] = cpuProcBuffer.get(); - std::get>(inputs.assets) = tmpTlas; - std::get>(inputs.assets) = tmpBuffers; - } + std::get>(inputs.assets) = tmpTlas; + std::get>(inputs.assets) = tmpBuffers; + } - auto reservation = converter->reserve(inputs); - { - auto prepass = [&](const auto & references) -> bool + auto reservation = converter->reserve(inputs); { - auto objects = reservation.getGPUObjects(); - uint32_t counter = {}; - for (auto& object : objects) + auto prepass = [&](const auto & references) -> bool { - auto gpu = object.value; - auto* reference = references[counter]; - - if (reference) + auto objects = reservation.getGPUObjects(); + uint32_t counter = {}; + for (auto& object : objects) { - if (!gpu) + auto gpu = object.value; + auto* reference = references[counter]; + + if (reference) { - m_logger->log("Failed to convert a CPU object to GPU!", ILogger::ELL_ERROR); - return false; + if (!gpu) + { + m_logger->log("Failed to convert a CPU object to GPU!", ILogger::ELL_ERROR); + return false; + } } + counter++; } - counter++; - } - return true; - }; + return true; + }; - prepass.template operator() < ICPUTopLevelAccelerationStructure > (tmpTlas); - prepass.template operator() < ICPUBuffer > (tmpBuffers); - } + prepass.template operator() < ICPUTopLevelAccelerationStructure > (tmpTlas); + prepass.template operator() < ICPUBuffer > (tmpBuffers); + } - constexpr auto CompBufferCount = 2; - std::array, CompBufferCount> compBufs = {}; - std::array compBufInfos = {}; - { - auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT); - pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, compBufs); - compBufs.front()->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - for (auto i = 0; i < CompBufferCount; i++) - compBufInfos[i].cmdbuf = compBufs[i].get(); - } - auto compSema = m_device->createSemaphore(0u); - 
SIntendedSubmitInfo compute = {}; - compute.queue = queue; - compute.scratchCommandBuffers = compBufInfos; - compute.scratchSemaphore = { - .semaphore = compSema.get(), - .value = 0u, - .stageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT | PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_COPY_BIT - }; - // convert - { - smart_refctd_ptr scratchAlloc; + constexpr auto CompBufferCount = 2; + std::array, CompBufferCount> compBufs = {}; + std::array compBufInfos = {}; { - constexpr auto MaxAlignment = 256; - constexpr auto MinAllocationSize = 1024; - const auto scratchSize = core::alignUp(reservation.getMaxASBuildScratchSize(false), MaxAlignment); + auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT); + pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, compBufs); + compBufs.front()->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + for (auto i = 0; i < CompBufferCount; i++) + compBufInfos[i].cmdbuf = compBufs[i].get(); + } + auto compSema = m_device->createSemaphore(0u); + SIntendedSubmitInfo compute = {}; + compute.queue = queue; + compute.scratchCommandBuffers = compBufInfos; + compute.scratchSemaphore = { + .semaphore = compSema.get(), + .value = 0u, + .stageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT | PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_COPY_BIT + }; + // convert + { + smart_refctd_ptr scratchAlloc; + { + constexpr auto MaxAlignment = 256; + constexpr auto MinAllocationSize = 1024; + const auto scratchSize = core::alignUp(reservation.getMaxASBuildScratchSize(false), MaxAlignment); - IGPUBuffer::SCreationParams creationParams = {}; - creationParams.size = scratchSize; - creationParams.usage = IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_STORAGE_BUFFER_BIT; - auto scratchBuffer = 
m_device->createBuffer(std::move(creationParams)); + IGPUBuffer::SCreationParams creationParams = {}; + creationParams.size = scratchSize; + creationParams.usage = IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_STORAGE_BUFFER_BIT; + auto scratchBuffer = m_device->createBuffer(std::move(creationParams)); - auto reqs = scratchBuffer->getMemoryReqs(); - reqs.memoryTypeBits &= m_physicalDevice->getDirectVRAMAccessMemoryTypeBits(); + auto reqs = scratchBuffer->getMemoryReqs(); + reqs.memoryTypeBits &= m_physicalDevice->getDirectVRAMAccessMemoryTypeBits(); - auto allocation = m_device->allocate(reqs, scratchBuffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); - allocation.memory->map({ .offset = 0,.length = reqs.size }); + auto allocation = m_device->allocate(reqs, scratchBuffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); + allocation.memory->map({ .offset = 0,.length = reqs.size }); - scratchAlloc = make_smart_refctd_ptr( - SBufferRange{0ull, scratchSize, std::move(scratchBuffer)}, - core::allocator(), MaxAlignment, MinAllocationSize - ); - } + scratchAlloc = make_smart_refctd_ptr( + SBufferRange{0ull, scratchSize, std::move(scratchBuffer)}, + core::allocator(), MaxAlignment, MinAllocationSize + ); + } - struct MyParams final : CAssetConverter::SConvertParams - { - inline uint32_t getFinalOwnerQueueFamily(const IGPUBuffer* buffer, const core::blake3_hash_t& createdFrom) override + struct MyParams final : CAssetConverter::SConvertParams { - return finalUser; + inline uint32_t getFinalOwnerQueueFamily(const IGPUBuffer* buffer, const core::blake3_hash_t& createdFrom) override + { + return finalUser; + } + inline uint32_t getFinalOwnerQueueFamily(const IGPUAccelerationStructure* image, const core::blake3_hash_t& createdFrom) override + { + return finalUser; + } + + uint8_t finalUser; + } params = {}; + params.utilities = m_utils.get(); + params.compute = &compute; + 
params.scratchForDeviceASBuild = scratchAlloc.get(); + params.finalUser = queue->getFamilyIndex(); + + auto future = reservation.convert(params); + if (future.copy() != IQueue::RESULT::SUCCESS) + { + m_logger->log("Failed to await submission feature!", ILogger::ELL_ERROR); + return false; } - inline uint32_t getFinalOwnerQueueFamily(const IGPUAccelerationStructure* image, const core::blake3_hash_t& createdFrom) override + // 2 submits, BLAS build, TLAS build, DO NOT ADD COMPACTIONS IN THIS EXAMPLE! + if (compute.getFutureScratchSemaphore().value>3) + m_logger->log("Overflow submitted on Compute Queue despite using ReBAR (no transfer submits or usage of staging buffer) and providing a AS Build Scratch Buffer of correctly queried max size!",system::ILogger::ELL_ERROR); + + // assign gpu objects to output + auto&& tlases = reservation.getGPUObjects(); + m_gpuTlas = tlases[0].value; + auto&& buffers = reservation.getGPUObjects(); + for (uint32_t i = 0; i < cpuObjects.size(); i++) { - return finalUser; + auto& cpuObject = cpuObjects[i]; + + m_gpuTriangleGeometries.push_back(ReferenceObjectGpu{ + .meta = cpuObject.meta, + .bindings = { + .vertex = {.offset = 0, .buffer = buffers[2 * i + 0].value }, + .index = {.offset = 0, .buffer = buffers[2 * i + 1].value }, + }, + .vertexStride = cpuObject.data.inputParams.bindings[0].stride, + .indexType = cpuObject.data.indexType, + .indexCount = cpuObject.data.indexCount, + .material = hlsl::_static_cast(cpuObject.material), + .transform = cpuObject.transform, + }); } + m_proceduralAabbBuffer = buffers[2 * proceduralBlasIdx].value; - uint8_t finalUser; - } params = {}; - params.utilities = m_utils.get(); - params.compute = &compute; - params.scratchForDeviceASBuild = scratchAlloc.get(); - params.finalUser = queue->getFamilyIndex(); - - auto future = reservation.convert(params); - if (future.copy() != IQueue::RESULT::SUCCESS) - { - m_logger->log("Failed to await submission feature!", ILogger::ELL_ERROR); - return false; - } - // 2 
submits, BLAS build, TLAS build, DO NOT ADD COMPACTIONS IN THIS EXAMPLE! - if (compute.getFutureScratchSemaphore().value>3) - m_logger->log("Overflow submitted on Compute Queue despite using ReBAR (no transfer submits or usage of staging buffer) and providing a AS Build Scratch Buffer of correctly queried max size!",system::ILogger::ELL_ERROR); - - // assign gpu objects to output - auto&& tlases = reservation.getGPUObjects(); - m_gpuTlas = tlases[0].value; - auto&& buffers = reservation.getGPUObjects(); - for (uint32_t i = 0; i < cpuObjects.size(); i++) - { - auto& cpuObject = cpuObjects[i]; - - m_gpuTriangleGeometries.push_back(ReferenceObjectGpu{ - .meta = cpuObject.meta, - .bindings = { - .vertex = {.offset = 0, .buffer = buffers[2 * i + 0].value }, - .index = {.offset = 0, .buffer = buffers[2 * i + 1].value }, - }, - .vertexStride = cpuObject.data.inputParams.bindings[0].stride, - .indexType = cpuObject.data.indexType, - .indexCount = cpuObject.data.indexCount, - .material = hlsl::_static_cast(cpuObject.material), - .transform = cpuObject.transform, - }); + for (uint32_t i = 0; i < m_gpuTriangleGeometries.size(); i++) + { + const auto& gpuObject = m_gpuTriangleGeometries[i]; + const uint64_t vertexBufferAddress = gpuObject.bindings.vertex.buffer->getDeviceAddress(); + geomInfos[i] = { + .material = gpuObject.material, + .vertexBufferAddress = vertexBufferAddress, + .indexBufferAddress = gpuObject.useIndex() ? 
gpuObject.bindings.index.buffer->getDeviceAddress() : vertexBufferAddress, + .vertexStride = gpuObject.vertexStride, + .objType = gpuObject.meta.type, + .indexType = gpuObject.indexType, + .smoothNormals = s_smoothNormals[gpuObject.meta.type], + }; + } } - m_proceduralAabbBuffer = buffers[2 * proceduralBlasIdx].value; - for (uint32_t i = 0; i < m_gpuTriangleGeometries.size(); i++) { - const auto& gpuObject = m_gpuTriangleGeometries[i]; - const uint64_t vertexBufferAddress = gpuObject.bindings.vertex.buffer->getDeviceAddress(); - geomInfos[i] = { - .material = gpuObject.material, - .vertexBufferAddress = vertexBufferAddress, - .indexBufferAddress = gpuObject.useIndex() ? gpuObject.bindings.index.buffer->getDeviceAddress() : vertexBufferAddress, - .vertexStride = gpuObject.vertexStride, - .objType = gpuObject.meta.type, - .indexType = gpuObject.indexType, - .smoothNormals = s_smoothNormals[gpuObject.meta.type], - }; + IGPUBuffer::SCreationParams params; + params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + params.size = geomInfoBuffer->getSize(); + m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), geomInfos).move_into(m_triangleGeomInfoBuffer); } - } - { - IGPUBuffer::SCreationParams params; - params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - params.size = geomInfoBuffer->getSize(); - m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), geomInfos).move_into(m_triangleGeomInfoBuffer); + return true; } +#endif + smart_refctd_ptr m_converter; + + smart_refctd_ptr m_window; + smart_refctd_ptr> m_surface; + smart_refctd_ptr m_semaphore; + uint64_t m_realFrameIx = 0; +uint32_t m_frameAccumulationCounter = 0; + std::array, 
MaxFramesInFlight> m_cmdBufs; + ISimpleManagedSurface::SAcquireResult m_currentImageAcquire = {}; + + core::smart_refctd_ptr m_inputSystem; + InputSystem::ChannelReader m_mouse; + InputSystem::ChannelReader m_keyboard; + + struct CameraSetting + { + float fov = 60.f; + float zNear = 0.1f; + float zFar = 10000.f; + float moveSpeed = 1.f; + float rotateSpeed = 1.f; + float viewWidth = 10.f; + float camYAngle = 165.f / 180.f * 3.14159f; + float camXAngle = 32.f / 180.f * 3.14159f; - return true; - } - - - - smart_refctd_ptr m_window; - smart_refctd_ptr> m_surface; - smart_refctd_ptr m_semaphore; - uint64_t m_realFrameIx = 0; - uint32_t m_frameAccumulationCounter = 0; - std::array, MaxFramesInFlight> m_cmdBufs; - ISimpleManagedSurface::SAcquireResult m_currentImageAcquire = {}; + } m_cameraSetting; + Camera m_camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD()); - core::smart_refctd_ptr m_inputSystem; - InputSystem::ChannelReader m_mouse; - InputSystem::ChannelReader m_keyboard; - - struct CameraSetting - { - float fov = 60.f; - float zNear = 0.1f; - float zFar = 10000.f; - float moveSpeed = 1.f; - float rotateSpeed = 1.f; - float viewWidth = 10.f; - float camYAngle = 165.f / 180.f * 3.14159f; - float camXAngle = 32.f / 180.f * 3.14159f; - - } m_cameraSetting; - Camera m_camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD()); - - Light m_light = { - .direction = {-1.0f, -1.0f, -0.4f}, - .position = {10.0f, 15.0f, 8.0f}, - .outerCutoff = 0.866025404f, // {cos(radians(30.0f))}, - .type = ELT_DIRECTIONAL - }; - - video::CDumbPresentationOracle m_oracle; + video::CDumbPresentationOracle m_oracle; +#if 0 struct C_UI { nbl::core::smart_refctd_ptr manager; @@ -1624,11 +1406,6 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, smart_refctd_ptr m_presentDsPool; smart_refctd_ptr m_presentPipeline; - smart_refctd_ptr m_converter; - - - core::matrix4SIMD 
m_cachedModelViewProjectionMatrix; - bool m_useIndirectCommand = false; - +#endif }; -NBL_MAIN_FUNC(RaytracingPipelineApp) +NBL_MAIN_FUNC(MeshLoadersApp) From 157bd8f407e108f258356205b8d5a8c36c2eee5c Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 4 Jun 2025 12:06:27 +0200 Subject: [PATCH 215/296] outline how I want stuff reorganised --- common/CMakeLists.txt | 21 +--- common/CommonAPI.h | 111 ------------------ common/CommonPCH/PCH.hpp | 13 -- common/include/CEventCallback.hpp | 49 -------- common/include/nbl/examples/PCH.hpp | 22 ++++ .../{ => nbl/examples/cameras}/CCamera.hpp | 9 +- .../nbl/examples/common/CEventCallback.hpp | 49 ++++++++ .../{ => nbl/examples/common}/InputSystem.hpp | 0 .../common}/SBasicViewParameters.hlsl | 0 .../common}/SimpleWindowedApplication.hpp | 3 +- .../geometry/CGeometryCreatorScene.hpp} | 19 +-- common/src/camera/CMakeLists.txt | 7 -- common/src/empty.cpp | 0 common/src/geometry/CMakeLists.txt | 1 - common/src/geometry/creator/CMakeLists.txt | 69 ----------- common/src/{ => nbl/examples}/CMakeLists.txt | 10 +- .../src/nbl/examples/cameras/CMakeLists.txt | 7 ++ .../src/nbl/examples/geometry/CMakeLists.txt | 69 +++++++++++ .../geometry}/shaders/gc.basic.fragment.hlsl | 0 .../geometry}/shaders/gc.basic.vertex.hlsl | 0 .../geometry}/shaders/gc.cone.vertex.hlsl | 0 .../geometry}/shaders/gc.ico.vertex.hlsl | 0 .../geometry}/shaders/grid.fragment.hlsl | 0 .../geometry}/shaders/grid.vertex.hlsl | 0 .../template/gc.basic.vertex.input.hlsl | 0 .../geometry}/shaders/template/gc.common.hlsl | 2 +- .../template/gc.cone.vertex.input.hlsl | 0 .../shaders/template/gc.ico.vertex.input.hlsl | 0 .../geometry}/shaders/template/gc.vertex.hlsl | 0 .../shaders/template/grid.common.hlsl | 2 +- .../nbl/examples/pch}/CMakeLists.txt | 0 .../nbl/examples/pch}/main.cpp | 0 32 files changed, 179 insertions(+), 284 deletions(-) delete mode 100644 common/CommonAPI.h delete mode 100644 common/CommonPCH/PCH.hpp delete mode 100644 common/include/CEventCallback.hpp 
create mode 100644 common/include/nbl/examples/PCH.hpp rename common/include/{ => nbl/examples/cameras}/CCamera.hpp (99%) create mode 100644 common/include/nbl/examples/common/CEventCallback.hpp rename common/include/{ => nbl/examples/common}/InputSystem.hpp (100%) rename common/include/{ => nbl/examples/common}/SBasicViewParameters.hlsl (100%) rename common/include/{ => nbl/examples/common}/SimpleWindowedApplication.hpp (99%) rename common/include/{CGeomtryCreatorScene.hpp => nbl/examples/geometry/CGeometryCreatorScene.hpp} (99%) delete mode 100644 common/src/camera/CMakeLists.txt delete mode 100644 common/src/empty.cpp delete mode 100644 common/src/geometry/CMakeLists.txt delete mode 100644 common/src/geometry/creator/CMakeLists.txt rename common/src/{ => nbl/examples}/CMakeLists.txt (64%) create mode 100644 common/src/nbl/examples/cameras/CMakeLists.txt create mode 100644 common/src/nbl/examples/geometry/CMakeLists.txt rename common/src/{geometry/creator => nbl/examples/geometry}/shaders/gc.basic.fragment.hlsl (100%) rename common/src/{geometry/creator => nbl/examples/geometry}/shaders/gc.basic.vertex.hlsl (100%) rename common/src/{geometry/creator => nbl/examples/geometry}/shaders/gc.cone.vertex.hlsl (100%) rename common/src/{geometry/creator => nbl/examples/geometry}/shaders/gc.ico.vertex.hlsl (100%) rename common/src/{geometry/creator => nbl/examples/geometry}/shaders/grid.fragment.hlsl (100%) rename common/src/{geometry/creator => nbl/examples/geometry}/shaders/grid.vertex.hlsl (100%) rename common/src/{geometry/creator => nbl/examples/geometry}/shaders/template/gc.basic.vertex.input.hlsl (100%) rename common/src/{geometry/creator => nbl/examples/geometry}/shaders/template/gc.common.hlsl (88%) rename common/src/{geometry/creator => nbl/examples/geometry}/shaders/template/gc.cone.vertex.input.hlsl (100%) rename common/src/{geometry/creator => nbl/examples/geometry}/shaders/template/gc.ico.vertex.input.hlsl (100%) rename common/src/{geometry/creator => 
nbl/examples/geometry}/shaders/template/gc.vertex.hlsl (100%) rename common/src/{geometry/creator => nbl/examples/geometry}/shaders/template/grid.common.hlsl (95%) rename common/{CommonPCH => src/nbl/examples/pch}/CMakeLists.txt (100%) rename common/{CommonPCH => src/nbl/examples/pch}/main.cpp (100%) diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index d9073f273..32c0ed6cf 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -7,22 +7,11 @@ ## # interface libraries don't have build rules (except custom commands however it doesn't matter here) but properties -add_library(nblCommonAPI INTERFACE) +add_library(nblExamplesAPI INTERFACE) +# TODO: change every variable prefix from `NBL_COMMON_API` to `NBL_EXAMPLES_API` here and elsewhere set(NBL_COMMON_API_INCLUDE_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/include") -target_include_directories(nblCommonAPI INTERFACE "${NBL_COMMON_API_INCLUDE_DIRECTORY}") +target_include_directories(nblExamplesAPI INTERFACE "${NBL_COMMON_API_INCLUDE_DIRECTORY}") -add_subdirectory(src EXCLUDE_FROM_ALL) +add_subdirectory("src/nbl/examples" EXCLUDE_FROM_ALL) -########## <- -# TODO: disable this CommonPCH thing! + DEPRICATED! 
-# TODO: move asset converer into separate library - -nbl_create_ext_library_project(CommonAPI "" "${CMAKE_CURRENT_SOURCE_DIR}/src/empty.cpp" "" "" "") -set(NBL_EXECUTABLE_COMMON_API_TARGET "${LIB_NAME}" CACHE INTERNAL "") - -add_subdirectory(CommonPCH EXCLUDE_FROM_ALL) - -#target_precompile_headers("${NBL_EXECUTABLE_COMMON_API_TARGET}" REUSE_FROM "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") -########## <- - -set(NBL_COMMON_API_TARGETS nblCommonAPI ${NBL_COMMON_API_TARGETS} ${NBL_EXECUTABLE_COMMON_API_TARGET} PARENT_SCOPE) +set(NBL_COMMON_API_TARGETS nblExamplesAPI ${NBL_COMMON_API_TARGETS} ${NBL_EXECUTABLE_COMMON_API_TARGET} PARENT_SCOPE) diff --git a/common/CommonAPI.h b/common/CommonAPI.h deleted file mode 100644 index aca8c0741..000000000 --- a/common/CommonAPI.h +++ /dev/null @@ -1,111 +0,0 @@ -#ifndef __NBL_COMMON_API_H_INCLUDED__ -#define __NBL_COMMON_API_H_INCLUDED__ - -#include - -#include "MonoSystemMonoLoggerApplication.hpp" - -#include "nbl/ui/CGraphicalApplicationAndroid.h" -#include "nbl/ui/CWindowManagerAndroid.h" - -// TODO: see TODO below -// TODO: make these include themselves via `nabla.h` - -#include "nbl/video/utilities/SPhysicalDeviceFilter.h" - -#if 0 -class CommonAPI -{ - CommonAPI() = delete; -public: - class CommonAPIEventCallback : public nbl::ui::IWindow::IEventCallback - { - public: - CommonAPIEventCallback(nbl::core::smart_refctd_ptr&& inputSystem, nbl::system::logger_opt_smart_ptr&& logger) : m_inputSystem(std::move(inputSystem)), m_logger(std::move(logger)), m_gotWindowClosedMsg(false){} - CommonAPIEventCallback() {} - bool isWindowOpen() const {return !m_gotWindowClosedMsg;} - void setLogger(nbl::system::logger_opt_smart_ptr& logger) - { - m_logger = logger; - } - void setInputSystem(nbl::core::smart_refctd_ptr&& inputSystem) - { - m_inputSystem = std::move(inputSystem); - } - private: - - bool onWindowClosed_impl() override - { - m_logger.log("Window closed"); - m_gotWindowClosedMsg = true; - return true; - } - - void 
onMouseConnected_impl(nbl::core::smart_refctd_ptr&& mch) override - { - m_logger.log("A mouse %p has been connected", nbl::system::ILogger::ELL_INFO, mch.get()); - m_inputSystem.get()->add(m_inputSystem.get()->m_mouse,std::move(mch)); - } - void onMouseDisconnected_impl(nbl::ui::IMouseEventChannel* mch) override - { - m_logger.log("A mouse %p has been disconnected", nbl::system::ILogger::ELL_INFO, mch); - m_inputSystem.get()->remove(m_inputSystem.get()->m_mouse,mch); - } - void onKeyboardConnected_impl(nbl::core::smart_refctd_ptr&& kbch) override - { - m_logger.log("A keyboard %p has been connected", nbl::system::ILogger::ELL_INFO, kbch.get()); - m_inputSystem.get()->add(m_inputSystem.get()->m_keyboard,std::move(kbch)); - } - void onKeyboardDisconnected_impl(nbl::ui::IKeyboardEventChannel* kbch) override - { - m_logger.log("A keyboard %p has been disconnected", nbl::system::ILogger::ELL_INFO, kbch); - m_inputSystem.get()->remove(m_inputSystem.get()->m_keyboard,kbch); - } - - private: - nbl::core::smart_refctd_ptr m_inputSystem = nullptr; - nbl::system::logger_opt_smart_ptr m_logger = nullptr; - bool m_gotWindowClosedMsg; - }; - - // old code from init - { - // ... - - result.inputSystem = nbl::core::make_smart_refctd_ptr(system::logger_opt_smart_ptr(nbl::core::smart_refctd_ptr(result.logger))); - result.assetManager = nbl::core::make_smart_refctd_ptr(nbl::core::smart_refctd_ptr(result.system), nbl::core::smart_refctd_ptr(result.compilerSet)); // we should let user choose it? 
- - if (!headlessCompute) - { - params.windowCb->setInputSystem(nbl::core::smart_refctd_ptr(result.inputSystem)); - if (!params.window) - { - #ifdef _NBL_PLATFORM_WINDOWS_ - result.windowManager = ui::IWindowManagerWin32::create(); // on the Windows path - #elif defined(_NBL_PLATFORM_LINUX_) - result.windowManager = nbl::core::make_smart_refctd_ptr(); // on the Android path - #else - #error "Unsupported platform" - #endif - - nbl::ui::IWindow::SCreationParams windowsCreationParams; - windowsCreationParams.width = params.windowWidth; - windowsCreationParams.height = params.windowHeight; - windowsCreationParams.x = 64u; - windowsCreationParams.y = 64u; - windowsCreationParams.flags = nbl::ui::IWindow::ECF_RESIZABLE; - windowsCreationParams.windowCaption = params.appName.data(); - windowsCreationParams.callback = params.windowCb; - - params.window = result.windowManager->createWindow(std::move(windowsCreationParams)); - } - params.windowCb = nbl::core::smart_refctd_ptr((CommonAPIEventCallback*) params.window->getEventCallback()); - } - - // ... - } -}; - -#endif - -#endif diff --git a/common/CommonPCH/PCH.hpp b/common/CommonPCH/PCH.hpp deleted file mode 100644 index 5b9d6a433..000000000 --- a/common/CommonPCH/PCH.hpp +++ /dev/null @@ -1,13 +0,0 @@ -// Copyright (C) 2018-2022 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". 
-// For conditions of distribution and use, see copyright notice in nabla.h -#ifndef _EXAMPLES_COMMON_PCH_HPP_ -#define _EXAMPLES_COMMON_PCH_HPP_ - -#include - -#include -#include -#include - -#endif // _EXAMPLES_COMMON_PCH_HPP_ \ No newline at end of file diff --git a/common/include/CEventCallback.hpp b/common/include/CEventCallback.hpp deleted file mode 100644 index 2d4e36932..000000000 --- a/common/include/CEventCallback.hpp +++ /dev/null @@ -1,49 +0,0 @@ -#ifndef __NBL_C_EVENT_CALLBACK_HPP_INCLUDED__ -#define __NBL_C_EVENT_CALLBACK_HPP_INCLUDED__ - -#include "nbl/video/utilities/CSimpleResizeSurface.h" -#include "InputSystem.hpp" - -class CEventCallback : public nbl::video::ISimpleManagedSurface::ICallback -{ -public: - CEventCallback(nbl::core::smart_refctd_ptr&& m_inputSystem, nbl::system::logger_opt_smart_ptr&& logger) : m_inputSystem(std::move(m_inputSystem)), m_logger(std::move(logger)) {} - CEventCallback() {} - - void setLogger(nbl::system::logger_opt_smart_ptr& logger) - { - m_logger = logger; - } - void setInputSystem(nbl::core::smart_refctd_ptr&& m_inputSystem) - { - m_inputSystem = std::move(m_inputSystem); - } -private: - - void onMouseConnected_impl(nbl::core::smart_refctd_ptr&& mch) override - { - m_logger.log("A mouse %p has been connected", nbl::system::ILogger::ELL_INFO, mch.get()); - m_inputSystem.get()->add(m_inputSystem.get()->m_mouse, std::move(mch)); - } - void onMouseDisconnected_impl(nbl::ui::IMouseEventChannel* mch) override - { - m_logger.log("A mouse %p has been disconnected", nbl::system::ILogger::ELL_INFO, mch); - m_inputSystem.get()->remove(m_inputSystem.get()->m_mouse, mch); - } - void onKeyboardConnected_impl(nbl::core::smart_refctd_ptr&& kbch) override - { - m_logger.log("A keyboard %p has been connected", nbl::system::ILogger::ELL_INFO, kbch.get()); - m_inputSystem.get()->add(m_inputSystem.get()->m_keyboard, std::move(kbch)); - } - void onKeyboardDisconnected_impl(nbl::ui::IKeyboardEventChannel* kbch) override - { - 
m_logger.log("A keyboard %p has been disconnected", nbl::system::ILogger::ELL_INFO, kbch); - m_inputSystem.get()->remove(m_inputSystem.get()->m_keyboard, kbch); - } - -private: - nbl::core::smart_refctd_ptr m_inputSystem = nullptr; - nbl::system::logger_opt_smart_ptr m_logger = nullptr; -}; - -#endif // __NBL_C_EVENT_CALLBACK_HPP_INCLUDED__ \ No newline at end of file diff --git a/common/include/nbl/examples/PCH.hpp b/common/include/nbl/examples/PCH.hpp new file mode 100644 index 000000000..7a1b6bdc6 --- /dev/null +++ b/common/include/nbl/examples/PCH.hpp @@ -0,0 +1,22 @@ +// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_EXAMPLES_PCH_HPP_ +#define _NBL_EXAMPLES_PCH_HPP_ + + +#include + +// #include "nbl/ui/CGraphicalApplicationAndroid.h" +// #include "nbl/ui/CWindowManagerAndroid.h" + +#include "nbl/examples/common/SimpleWindowedApplication.hpp" +#include "nbl/examples/common/InputSystem.hpp" +#include "nbl/examples/common/CEventCallback.hpp" + +#include "nbl/examples/cameras/CCamera.hpp" + +#include "nbl/examples/geometry/CGeometryCreatorScene.hpp" + + +#endif // _NBL_EXAMPLES_COMMON_PCH_HPP_ \ No newline at end of file diff --git a/common/include/CCamera.hpp b/common/include/nbl/examples/cameras/CCamera.hpp similarity index 99% rename from common/include/CCamera.hpp rename to common/include/nbl/examples/cameras/CCamera.hpp index 1b0fe9c0f..3b3cd38d8 100644 --- a/common/include/CCamera.hpp +++ b/common/include/nbl/examples/cameras/CCamera.hpp @@ -1,16 +1,18 @@ // Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_COMMON_CAMERA_IMPL_ +#define _NBL_COMMON_CAMERA_IMPL_ -#ifndef _CAMERA_IMPL_ -#define _CAMERA_IMPL_ #include + #include #include #include #include + class Camera { public: @@ -322,5 +324,4 @@ class Camera std::chrono::microseconds nextPresentationTimeStamp, lastVirtualUpTimeStamp; }; - -#endif // _CAMERA_IMPL_ \ No newline at end of file +#endif \ No newline at end of file diff --git a/common/include/nbl/examples/common/CEventCallback.hpp b/common/include/nbl/examples/common/CEventCallback.hpp new file mode 100644 index 000000000..4670ca7f6 --- /dev/null +++ b/common/include/nbl/examples/common/CEventCallback.hpp @@ -0,0 +1,49 @@ +#ifndef _NBL_C_EVENT_CALLBACK_HPP_INCLUDED_ +#define _NBL_C_EVENT_CALLBACK_HPP_INCLUDED_ + +#include "nbl/video/utilities/CSimpleResizeSurface.h" +#include "InputSystem.hpp" + +class CEventCallback : public nbl::video::ISimpleManagedSurface::ICallback // routes mouse/keyboard channel connect and disconnect events into the InputSystem +{ + public: + CEventCallback(nbl::core::smart_refctd_ptr<InputSystem>&& inputSystem, nbl::system::logger_opt_smart_ptr&& logger) : m_inputSystem(std::move(inputSystem)), m_logger(std::move(logger)) {} + CEventCallback() {} + + void setLogger(nbl::system::logger_opt_smart_ptr& logger) + { + m_logger = logger; + } + void setInputSystem(nbl::core::smart_refctd_ptr<InputSystem>&& inputSystem) + { + m_inputSystem = std::move(inputSystem); // distinct parameter name: assigns the member instead of self-moving the argument + } + + private: + void onMouseConnected_impl(nbl::core::smart_refctd_ptr<nbl::ui::IMouseEventChannel>&& mch) override + { + m_logger.log("A mouse %p has been connected", nbl::system::ILogger::ELL_INFO, mch.get()); + m_inputSystem.get()->add(m_inputSystem.get()->m_mouse, std::move(mch)); + } + void onMouseDisconnected_impl(nbl::ui::IMouseEventChannel* mch) override + { + m_logger.log("A mouse %p has been disconnected", nbl::system::ILogger::ELL_INFO, mch); + m_inputSystem.get()->remove(m_inputSystem.get()->m_mouse, mch); + } + void onKeyboardConnected_impl(nbl::core::smart_refctd_ptr<nbl::ui::IKeyboardEventChannel>&& kbch) override + { 
m_logger.log("A keyboard %p has been connected", nbl::system::ILogger::ELL_INFO, kbch.get()); + m_inputSystem.get()->add(m_inputSystem.get()->m_keyboard, std::move(kbch)); + } + void onKeyboardDisconnected_impl(nbl::ui::IKeyboardEventChannel* kbch) override + { + m_logger.log("A keyboard %p has been disconnected", nbl::system::ILogger::ELL_INFO, kbch); + m_inputSystem.get()->remove(m_inputSystem.get()->m_keyboard, kbch); + } + + private: + nbl::core::smart_refctd_ptr<InputSystem> m_inputSystem = nullptr; + nbl::system::logger_opt_smart_ptr m_logger = nullptr; +}; + +#endif // _NBL_C_EVENT_CALLBACK_HPP_INCLUDED_ \ No newline at end of file diff --git a/common/include/InputSystem.hpp b/common/include/nbl/examples/common/InputSystem.hpp similarity index 100% rename from common/include/InputSystem.hpp rename to common/include/nbl/examples/common/InputSystem.hpp diff --git a/common/include/SBasicViewParameters.hlsl b/common/include/nbl/examples/common/SBasicViewParameters.hlsl similarity index 100% rename from common/include/SBasicViewParameters.hlsl rename to common/include/nbl/examples/common/SBasicViewParameters.hlsl diff --git a/common/include/SimpleWindowedApplication.hpp b/common/include/nbl/examples/common/SimpleWindowedApplication.hpp similarity index 99% rename from common/include/SimpleWindowedApplication.hpp rename to common/include/nbl/examples/common/SimpleWindowedApplication.hpp index 802a93188..ddb510eb7 100644 --- a/common/include/SimpleWindowedApplication.hpp +++ b/common/include/nbl/examples/common/SimpleWindowedApplication.hpp @@ -88,5 +88,4 @@ class SimpleWindowedApplication : public virtual application_templates::BasicMul }; } - -#endif // _CAMERA_IMPL_ \ No newline at end of file +#endif \ No newline at end of file diff --git a/common/include/CGeomtryCreatorScene.hpp b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp similarity index 99% rename from common/include/CGeomtryCreatorScene.hpp rename to 
common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp index 0d9bc6edd..7a3f253f3 100644 --- a/common/include/CGeomtryCreatorScene.hpp +++ b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp @@ -1,14 +1,19 @@ -#ifndef _NBL_GEOMETRY_CREATOR_SCENE_H_INCLUDED_ -#define _NBL_GEOMETRY_CREATOR_SCENE_H_INCLUDED_ +#ifndef _NBL_EXAMPLES_C_GEOMETRY_CREATOR_SCENE_H_INCLUDED_ +#define _NBL_EXAMPLES_C_GEOMETRY_CREATOR_SCENE_H_INCLUDED_ + #include -#include "nbl/asset/utils/CGeometryCreator.h" -#include "SBasicViewParameters.hlsl" -#include "geometry/creator/spirv/builtin/CArchive.h" -#include "geometry/creator/spirv/builtin/builtinResources.h" +#include "nbl/asset/utils/CPolygonGeometryCreator.h" + +// soon to be deprecated! +#include "nbl/examples/common/SBasicViewParameters.hlsl" + +#include "nbl/examples/geometry/creator/spirv/builtin/CArchive.h" +#include "nbl/examples/geometry/creator/spirv/builtin/builtinResources.h" + -namespace nbl::scene::geometrycreator +namespace nbl::examples { enum ObjectType : uint8_t diff --git a/common/src/camera/CMakeLists.txt b/common/src/camera/CMakeLists.txt deleted file mode 100644 index eedf690aa..000000000 --- a/common/src/camera/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -# header only currently - -#set(NBL_LIB_SOURCES -# "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp" -#) - -#nbl_create_ext_library_project(Camera "" "${NBL_LIB_SOURCES}" "" "" "") \ No newline at end of file diff --git a/common/src/empty.cpp b/common/src/empty.cpp deleted file mode 100644 index e69de29bb..000000000 diff --git a/common/src/geometry/CMakeLists.txt b/common/src/geometry/CMakeLists.txt deleted file mode 100644 index fb33ec637..000000000 --- a/common/src/geometry/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -add_subdirectory(creator EXCLUDE_FROM_ALL) \ No newline at end of file diff --git a/common/src/geometry/creator/CMakeLists.txt b/common/src/geometry/creator/CMakeLists.txt deleted file mode 100644 index 336d32fe5..000000000 --- 
a/common/src/geometry/creator/CMakeLists.txt +++ /dev/null @@ -1,69 +0,0 @@ -# shaders IO directories -set(NBL_THIS_EXAMPLE_INPUT_SHADERS_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/shaders") -get_filename_component(_THIS_EXAMPLE_SPIRV_BR_BUNDLE_SEARCH_DIRECTORY_ "${CMAKE_CURRENT_BINARY_DIR}/shaders/include" ABSOLUTE) -get_filename_component(_THIS_EXAMPLE_SPIRV_BR_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/builtin/include" ABSOLUTE) -get_filename_component(_THIS_EXAMPLE_SPIRV_BR_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/builtin/src" ABSOLUTE) -set(NBL_THIS_EXAMPLE_OUTPUT_SPIRV_DIRECTORY "${_THIS_EXAMPLE_SPIRV_BR_BUNDLE_SEARCH_DIRECTORY_}/nbl/geometryCreator/spirv") - -# list of input source shaders -set(NBL_THIS_EXAMPLE_INPUT_SHADERS - # geometry creator - "${NBL_THIS_EXAMPLE_INPUT_SHADERS_DIRECTORY}/gc.basic.fragment.hlsl" - "${NBL_THIS_EXAMPLE_INPUT_SHADERS_DIRECTORY}/gc.basic.vertex.hlsl" - "${NBL_THIS_EXAMPLE_INPUT_SHADERS_DIRECTORY}/gc.cone.vertex.hlsl" - "${NBL_THIS_EXAMPLE_INPUT_SHADERS_DIRECTORY}/gc.ico.vertex.hlsl" - - # grid - "${NBL_THIS_EXAMPLE_INPUT_SHADERS_DIRECTORY}/grid.vertex.hlsl" - "${NBL_THIS_EXAMPLE_INPUT_SHADERS_DIRECTORY}/grid.fragment.hlsl" -) - -file(GLOB_RECURSE NBL_THIS_EXAMPLE_INPUT_COMMONS CONFIGURE_DEPENDS "${NBL_THIS_EXAMPLE_INPUT_SHADERS_DIRECTORY}/template/*.hlsl") - -include("${NBL_ROOT_PATH}/src/nbl/builtin/utils.cmake") - -foreach(NBL_INPUT_SHADER IN LISTS NBL_THIS_EXAMPLE_INPUT_SHADERS) - cmake_path(GET NBL_INPUT_SHADER FILENAME NBL_INPUT_SHADER_FILENAME) - cmake_path(GET NBL_INPUT_SHADER_FILENAME STEM LAST_ONLY NBL_SHADER_STEM) # filename without .hlsl extension - cmake_path(GET NBL_SHADER_STEM EXTENSION LAST_ONLY NBL_SHADER_TYPE) # . 
- - set(NBL_OUTPUT_SPIRV_FILENAME "${NBL_SHADER_STEM}.spv") - set(NBL_OUTPUT_SPIRV_PATH "${NBL_THIS_EXAMPLE_OUTPUT_SPIRV_DIRECTORY}/${NBL_OUTPUT_SPIRV_FILENAME}") - - if(NBL_SHADER_TYPE STREQUAL .vertex) - set(NBL_NSC_COMPILE_OPTIONS -T vs_6_7 -E VSMain) - elseif(NBL_SHADER_TYPE STREQUAL .geometry) - set(NBL_NSC_COMPILE_OPTIONS -T gs_6_7 -E GSMain) - elseif(NBL_SHADER_TYPE STREQUAL .fragment) - set(NBL_NSC_COMPILE_OPTIONS -T ps_6_7 -E PSMain) - else() - message(FATAL_ERROR "Input shader is supposed to be ..hlsl!") - endif() - - set(NBL_NSC_COMPILE_COMMAND - "$" - -Fc "${NBL_OUTPUT_SPIRV_PATH}" - -I "${NBL_COMMON_API_INCLUDE_DIRECTORY}" - ${NBL_NSC_COMPILE_OPTIONS} # this should come from shader's [#pragma WAVE ] but our NSC doesn't seem to work properly currently - "${NBL_INPUT_SHADER}" - ) - - set(NBL_DEPENDS - "${NBL_INPUT_SHADER}" - ${NBL_THIS_EXAMPLE_INPUT_COMMONS} - ) - - add_custom_command(OUTPUT "${NBL_OUTPUT_SPIRV_PATH}" - COMMAND ${NBL_NSC_COMPILE_COMMAND} - DEPENDS ${NBL_DEPENDS} - WORKING_DIRECTORY "${NBL_THIS_EXAMPLE_INPUT_SHADERS_DIRECTORY}" - COMMENT "Generating \"${NBL_OUTPUT_SPIRV_PATH}\"" - VERBATIM - COMMAND_EXPAND_LISTS - ) - - list(APPEND NBL_THIS_EXAMPLE_OUTPUT_SPIRV_BUILTINS "${NBL_OUTPUT_SPIRV_PATH}") - LIST_BUILTIN_RESOURCE(GEOMETRY_CREATOR_SPIRV_RESOURCES_TO_EMBED "geometryCreator/spirv/${NBL_OUTPUT_SPIRV_FILENAME}") -endforeach() - -ADD_CUSTOM_BUILTIN_RESOURCES(geometryCreatorSpirvBRD GEOMETRY_CREATOR_SPIRV_RESOURCES_TO_EMBED "${_THIS_EXAMPLE_SPIRV_BR_BUNDLE_SEARCH_DIRECTORY_}" "nbl" "geometry::creator::spirv::builtin" "${_THIS_EXAMPLE_SPIRV_BR_OUTPUT_DIRECTORY_HEADER_}" "${_THIS_EXAMPLE_SPIRV_BR_OUTPUT_DIRECTORY_SOURCE_}" "STATIC" "INTERNAL") \ No newline at end of file diff --git a/common/src/CMakeLists.txt b/common/src/nbl/examples/CMakeLists.txt similarity index 64% rename from common/src/CMakeLists.txt rename to common/src/nbl/examples/CMakeLists.txt index 1399b949e..96ccaabea 100644 --- a/common/src/CMakeLists.txt +++ 
b/common/src/nbl/examples/CMakeLists.txt @@ -1,5 +1,8 @@ +# TODO: @AnastaZluk redo the PCH +# add_subdirectory(pch EXCLUDE_FROM_ALL) + # we add common libraries -# add_subdirectory(camera EXCLUDE_FROM_ALL) # header only currently +# add_subdirectory(cameras EXCLUDE_FROM_ALL) # header only currently add_subdirectory(geometry EXCLUDE_FROM_ALL) # we get all available targets inclusive & below this directory @@ -7,8 +10,9 @@ NBL_GET_ALL_TARGETS(NBL_SUBDIRECTORY_TARGETS) # then we expose common include search directories to all common libraries + create link interface foreach(NBL_TARGET IN LISTS NBL_SUBDIRECTORY_TARGETS) - target_include_directories(${NBL_TARGET} PUBLIC $) - target_link_libraries(nblCommonAPI INTERFACE ${NBL_TARGET}) + target_include_directories(${NBL_TARGET} PUBLIC $) + target_link_libraries(nblExamplesAPI INTERFACE ${NBL_TARGET}) endforeach() +# set(NBL_COMMON_API_TARGETS ${NBL_SUBDIRECTORY_TARGETS} PARENT_SCOPE) \ No newline at end of file diff --git a/common/src/nbl/examples/cameras/CMakeLists.txt b/common/src/nbl/examples/cameras/CMakeLists.txt new file mode 100644 index 000000000..0b0e59cdc --- /dev/null +++ b/common/src/nbl/examples/cameras/CMakeLists.txt @@ -0,0 +1,7 @@ +# header only currently + +#set(NBL_EXAMPLES_CAMERA_LIB_SOURCES +# "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp" +#) + +#nbl_create_ext_library_project(ExampleCameras "" "${NBL_EXAMPLES_CAMERA_LIB_SOURCES}" "" "" "") \ No newline at end of file diff --git a/common/src/nbl/examples/geometry/CMakeLists.txt b/common/src/nbl/examples/geometry/CMakeLists.txt new file mode 100644 index 000000000..0eb09263b --- /dev/null +++ b/common/src/nbl/examples/geometry/CMakeLists.txt @@ -0,0 +1,69 @@ +# shaders IO directories +set(NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/shaders") +get_filename_component(_EXAMPLES_GEOMETRY_SPIRV_BR_BUNDLE_SEARCH_DIRECTORY_ "${CMAKE_CURRENT_BINARY_DIR}/shaders/include" ABSOLUTE) 
+get_filename_component(_EXAMPLES_GEOMETRY_SPIRV_BR_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/builtin/include" ABSOLUTE) +get_filename_component(_EXAMPLES_GEOMETRY_SPIRV_BR_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/builtin/src" ABSOLUTE) +set(NBL_EXAMPLES_GEOMETRY_OUTPUT_SPIRV_DIRECTORY "${_EXAMPLES_GEOMETRY_SPIRV_BR_BUNDLE_SEARCH_DIRECTORY_}/nbl/examples/geometry/spirv") + +# list of input source shaders +set(NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS + # geometry creator + "${NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS_DIRECTORY}/gc.basic.fragment.hlsl" + "${NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS_DIRECTORY}/gc.basic.vertex.hlsl" + "${NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS_DIRECTORY}/gc.cone.vertex.hlsl" + "${NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS_DIRECTORY}/gc.ico.vertex.hlsl" + + # grid + "${NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS_DIRECTORY}/grid.vertex.hlsl" + "${NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS_DIRECTORY}/grid.fragment.hlsl" +) + +file(GLOB_RECURSE NBL_EXAMPLES_GEOMETRY_INPUT_COMMONS CONFIGURE_DEPENDS "${NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS_DIRECTORY}/template/*.hlsl") + +include("${NBL_ROOT_PATH}/src/nbl/builtin/utils.cmake") + +foreach(NBL_INPUT_SHADER IN LISTS NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS) + cmake_path(GET NBL_INPUT_SHADER FILENAME NBL_INPUT_SHADER_FILENAME) + cmake_path(GET NBL_INPUT_SHADER_FILENAME STEM LAST_ONLY NBL_SHADER_STEM) # filename without .hlsl extension + cmake_path(GET NBL_SHADER_STEM EXTENSION LAST_ONLY NBL_SHADER_TYPE) # . 
+ + set(NBL_OUTPUT_SPIRV_FILENAME "${NBL_SHADER_STEM}.spv") + set(NBL_OUTPUT_SPIRV_PATH "${NBL_EXAMPLES_GEOMETRY_OUTPUT_SPIRV_DIRECTORY}/${NBL_OUTPUT_SPIRV_FILENAME}") + + if(NBL_SHADER_TYPE STREQUAL .vertex) + set(NBL_NSC_COMPILE_OPTIONS -T vs_6_8 -E VSMain) + elseif(NBL_SHADER_TYPE STREQUAL .geometry) + set(NBL_NSC_COMPILE_OPTIONS -T gs_6_8 -E GSMain) + elseif(NBL_SHADER_TYPE STREQUAL .fragment) + set(NBL_NSC_COMPILE_OPTIONS -T ps_6_8 -E PSMain) + else() + message(FATAL_ERROR "Input shader is supposed to be ..hlsl!") + endif() + + set(NBL_NSC_COMPILE_COMMAND + "$" + -Fc "${NBL_OUTPUT_SPIRV_PATH}" + -I "${NBL_COMMON_API_INCLUDE_DIRECTORY}" + ${NBL_NSC_COMPILE_OPTIONS} # this should come from shader's [#pragma WAVE ] but our NSC doesn't seem to work properly currently + "${NBL_INPUT_SHADER}" + ) + + set(NBL_DEPENDS + "${NBL_INPUT_SHADER}" + ${NBL_EXAMPLES_GEOMETRY_INPUT_COMMONS} + ) + + add_custom_command(OUTPUT "${NBL_OUTPUT_SPIRV_PATH}" + COMMAND ${NBL_NSC_COMPILE_COMMAND} + DEPENDS ${NBL_DEPENDS} + WORKING_DIRECTORY "${NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS_DIRECTORY}" + COMMENT "Generating \"${NBL_OUTPUT_SPIRV_PATH}\"" + VERBATIM + COMMAND_EXPAND_LISTS + ) + + list(APPEND NBL_EXAMPLES_GEOMETRY_OUTPUT_SPIRV_BUILTINS "${NBL_OUTPUT_SPIRV_PATH}") + LIST_BUILTIN_RESOURCE(GEOMETRY_CREATOR_SPIRV_RESOURCES_TO_EMBED "geometry/spirv/${NBL_OUTPUT_SPIRV_FILENAME}") +endforeach() + +ADD_CUSTOM_BUILTIN_RESOURCES(geometryCreatorSpirvBRD GEOMETRY_CREATOR_SPIRV_RESOURCES_TO_EMBED "${_EXAMPLES_GEOMETRY_SPIRV_BR_BUNDLE_SEARCH_DIRECTORY_}" "nbl" "geometry::spirv::builtin" "${_EXAMPLES_GEOMETRY_SPIRV_BR_OUTPUT_DIRECTORY_HEADER_}" "${_EXAMPLES_GEOMETRY_SPIRV_BR_OUTPUT_DIRECTORY_SOURCE_}" "STATIC" "INTERNAL") \ No newline at end of file diff --git a/common/src/geometry/creator/shaders/gc.basic.fragment.hlsl b/common/src/nbl/examples/geometry/shaders/gc.basic.fragment.hlsl similarity index 100% rename from common/src/geometry/creator/shaders/gc.basic.fragment.hlsl rename to 
common/src/nbl/examples/geometry/shaders/gc.basic.fragment.hlsl diff --git a/common/src/geometry/creator/shaders/gc.basic.vertex.hlsl b/common/src/nbl/examples/geometry/shaders/gc.basic.vertex.hlsl similarity index 100% rename from common/src/geometry/creator/shaders/gc.basic.vertex.hlsl rename to common/src/nbl/examples/geometry/shaders/gc.basic.vertex.hlsl diff --git a/common/src/geometry/creator/shaders/gc.cone.vertex.hlsl b/common/src/nbl/examples/geometry/shaders/gc.cone.vertex.hlsl similarity index 100% rename from common/src/geometry/creator/shaders/gc.cone.vertex.hlsl rename to common/src/nbl/examples/geometry/shaders/gc.cone.vertex.hlsl diff --git a/common/src/geometry/creator/shaders/gc.ico.vertex.hlsl b/common/src/nbl/examples/geometry/shaders/gc.ico.vertex.hlsl similarity index 100% rename from common/src/geometry/creator/shaders/gc.ico.vertex.hlsl rename to common/src/nbl/examples/geometry/shaders/gc.ico.vertex.hlsl diff --git a/common/src/geometry/creator/shaders/grid.fragment.hlsl b/common/src/nbl/examples/geometry/shaders/grid.fragment.hlsl similarity index 100% rename from common/src/geometry/creator/shaders/grid.fragment.hlsl rename to common/src/nbl/examples/geometry/shaders/grid.fragment.hlsl diff --git a/common/src/geometry/creator/shaders/grid.vertex.hlsl b/common/src/nbl/examples/geometry/shaders/grid.vertex.hlsl similarity index 100% rename from common/src/geometry/creator/shaders/grid.vertex.hlsl rename to common/src/nbl/examples/geometry/shaders/grid.vertex.hlsl diff --git a/common/src/geometry/creator/shaders/template/gc.basic.vertex.input.hlsl b/common/src/nbl/examples/geometry/shaders/template/gc.basic.vertex.input.hlsl similarity index 100% rename from common/src/geometry/creator/shaders/template/gc.basic.vertex.input.hlsl rename to common/src/nbl/examples/geometry/shaders/template/gc.basic.vertex.input.hlsl diff --git a/common/src/geometry/creator/shaders/template/gc.common.hlsl 
b/common/src/nbl/examples/geometry/shaders/template/gc.common.hlsl similarity index 88% rename from common/src/geometry/creator/shaders/template/gc.common.hlsl rename to common/src/nbl/examples/geometry/shaders/template/gc.common.hlsl index 4590cd4a3..26e2885f7 100644 --- a/common/src/geometry/creator/shaders/template/gc.common.hlsl +++ b/common/src/nbl/examples/geometry/shaders/template/gc.common.hlsl @@ -9,7 +9,7 @@ }; #endif // __HLSL_VERSION -#include "SBasicViewParameters.hlsl" +#include "common/SBasicViewParameters.hlsl" #endif // _THIS_EXAMPLE_GC_COMMON_HLSL_ diff --git a/common/src/geometry/creator/shaders/template/gc.cone.vertex.input.hlsl b/common/src/nbl/examples/geometry/shaders/template/gc.cone.vertex.input.hlsl similarity index 100% rename from common/src/geometry/creator/shaders/template/gc.cone.vertex.input.hlsl rename to common/src/nbl/examples/geometry/shaders/template/gc.cone.vertex.input.hlsl diff --git a/common/src/geometry/creator/shaders/template/gc.ico.vertex.input.hlsl b/common/src/nbl/examples/geometry/shaders/template/gc.ico.vertex.input.hlsl similarity index 100% rename from common/src/geometry/creator/shaders/template/gc.ico.vertex.input.hlsl rename to common/src/nbl/examples/geometry/shaders/template/gc.ico.vertex.input.hlsl diff --git a/common/src/geometry/creator/shaders/template/gc.vertex.hlsl b/common/src/nbl/examples/geometry/shaders/template/gc.vertex.hlsl similarity index 100% rename from common/src/geometry/creator/shaders/template/gc.vertex.hlsl rename to common/src/nbl/examples/geometry/shaders/template/gc.vertex.hlsl diff --git a/common/src/geometry/creator/shaders/template/grid.common.hlsl b/common/src/nbl/examples/geometry/shaders/template/grid.common.hlsl similarity index 95% rename from common/src/geometry/creator/shaders/template/grid.common.hlsl rename to common/src/nbl/examples/geometry/shaders/template/grid.common.hlsl index bc6516600..616412245 100644 --- 
a/common/src/geometry/creator/shaders/template/grid.common.hlsl +++ b/common/src/nbl/examples/geometry/shaders/template/grid.common.hlsl @@ -31,7 +31,7 @@ } #endif // __HLSL_VERSION -#include "SBasicViewParameters.hlsl" +#include "common/SBasicViewParameters.hlsl" #endif // _THIS_EXAMPLE_GRID_COMMON_HLSL_ diff --git a/common/CommonPCH/CMakeLists.txt b/common/src/nbl/examples/pch/CMakeLists.txt similarity index 100% rename from common/CommonPCH/CMakeLists.txt rename to common/src/nbl/examples/pch/CMakeLists.txt diff --git a/common/CommonPCH/main.cpp b/common/src/nbl/examples/pch/main.cpp similarity index 100% rename from common/CommonPCH/main.cpp rename to common/src/nbl/examples/pch/main.cpp From 837071974d01e908a4cbce8ff0cca05bd5aecf39 Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 4 Jun 2025 12:32:07 +0200 Subject: [PATCH 216/296] temporarily disable some things that I couldn't figure out without Arek's help Yes geometry creator scene default SPIR-V shaders are gone --- 09_GeometryCreator/CMakeLists.txt | 6 ++++-- 09_GeometryCreator/include/common.hpp | 20 +++++-------------- 61_UI/CMakeLists.txt | 8 +++++--- CMakeLists.txt | 10 +++++----- common/CMakeLists.txt | 8 ++++---- .../geometry/CGeometryCreatorScene.hpp | 7 ++++--- .../src/nbl/examples/geometry/CMakeLists.txt | 8 ++++++-- common/src/nbl/examples/pch/CMakeLists.txt | 5 ++++- 8 files changed, 37 insertions(+), 35 deletions(-) diff --git a/09_GeometryCreator/CMakeLists.txt b/09_GeometryCreator/CMakeLists.txt index 928ef5761..2dd253226 100644 --- a/09_GeometryCreator/CMakeLists.txt +++ b/09_GeometryCreator/CMakeLists.txt @@ -2,5 +2,7 @@ set(NBL_INCLUDE_SERACH_DIRECTORIES "${CMAKE_CURRENT_SOURCE_DIR}/include" ) -nbl_create_executable_project("" "" "${NBL_INCLUDE_SERACH_DIRECTORIES}" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") -LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} geometryCreatorSpirvBRD) \ No newline at end of file + # TODO; Arek I removed `NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET` 
from the last parameter here, doesn't this macro have 4 arguments anyway !? +nbl_create_executable_project("" "" "${NBL_INCLUDE_SERACH_DIRECTORIES}" "" "") +# TODO: Arek temporarily disabled cause I haven't figured out how to make this target yet +# LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} nblExamplesGeometrySpirvBRD) \ No newline at end of file diff --git a/09_GeometryCreator/include/common.hpp b/09_GeometryCreator/include/common.hpp index 3661e5697..946f2982f 100644 --- a/09_GeometryCreator/include/common.hpp +++ b/09_GeometryCreator/include/common.hpp @@ -1,20 +1,10 @@ -#ifndef __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__ -#define __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__ +#ifndef _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_ +#define _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_ #include -#include "nbl/asset/utils/CGeometryCreator.h" -#include "SimpleWindowedApplication.hpp" -#include "InputSystem.hpp" -#include "CEventCallback.hpp" - -#include "CCamera.hpp" -#include "SBasicViewParameters.hlsl" - -#include "geometry/creator/spirv/builtin/CArchive.h" -#include "geometry/creator/spirv/builtin/builtinResources.h" - -#include "CGeomtryCreatorScene.hpp" +// TODO: @AnastaZIuk do we even make that explicit? 
+#include "nbl/examples/PCH.hpp" using namespace nbl; using namespace core; @@ -24,6 +14,6 @@ using namespace asset; using namespace ui; using namespace video; using namespace scene; -using namespace geometrycreator; +using namespace examples; #endif // __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__ \ No newline at end of file diff --git a/61_UI/CMakeLists.txt b/61_UI/CMakeLists.txt index a34e46ce6..5d0021f61 100644 --- a/61_UI/CMakeLists.txt +++ b/61_UI/CMakeLists.txt @@ -12,7 +12,9 @@ if(NBL_BUILD_IMGUI) imguizmo "${NBL_EXT_IMGUI_UI_LIB}" ) - - nbl_create_executable_project("${NBL_EXTRA_SOURCES}" "" "${NBL_INCLUDE_SERACH_DIRECTORIES}" "${NBL_LIBRARIES}" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") - LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} geometryCreatorSpirvBRD) + + # TODO; Arek I removed `NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET` from the last parameter here, doesn't this macro have 4 arguments anyway !? + nbl_create_executable_project("${NBL_EXTRA_SOURCES}" "" "${NBL_INCLUDE_SERACH_DIRECTORIES}" "${NBL_LIBRARIES}") + # TODO: Arek temporarily disabled cause I haven't figured out how to make this target yet + # LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} nblExamplesGeometrySpirvBRD) endif() \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 0c0584ebe..789e96937 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,8 +3,8 @@ # For conditions of distribution and use, see copyright notice in nabla.h function(NBL_HOOK_COMMON_API NBL_EXCLUDE_TARGETS_LIST) - if(NOT TARGET nblCommonAPI) - message(FATAL_ERROR "nblCommonAPI not defined!") + if(NOT TARGET nblExamplesAPI) + message(FATAL_ERROR "nblExamplesAPI not defined!") endif() NBL_GET_ALL_TARGETS(NBL_TARGETS) @@ -13,8 +13,8 @@ function(NBL_HOOK_COMMON_API NBL_EXCLUDE_TARGETS_LIST) # TODO: exclude builtin targets created by examples as well - doesn't impact anything at all now if(NOT ${NBL_TARGET} IN_LIST NBL_EXCLUDE_TARGETS_LIST) - target_include_directories(${NBL_TARGET} 
PRIVATE $) - target_link_libraries(${NBL_TARGET} PRIVATE nblCommonAPI) + target_include_directories(${NBL_TARGET} PRIVATE $) + target_link_libraries(${NBL_TARGET} PRIVATE nblExamplesAPI) endif() endforeach() endfunction() @@ -92,5 +92,5 @@ if(NBL_BUILD_EXAMPLES) add_subdirectory(70_FLIPFluids EXCLUDE_FROM_ALL) add_subdirectory(71_RayTracingPipeline EXCLUDE_FROM_ALL) - NBL_HOOK_COMMON_API("${NBL_COMMON_API_TARGETS}") + NBL_HOOK_COMMON_API("${NBL_EXAMPLES_API_TARGETS}") endif() diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 32c0ed6cf..9560a8f42 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -8,10 +8,10 @@ # interface libraries don't have build rules (except custom commands however it doesn't matter here) but properties add_library(nblExamplesAPI INTERFACE) -# TODO: change every variable prefix from `NBL_COMMON_API` to `NBL_EXAMPLES_API` here and elsewhere -set(NBL_COMMON_API_INCLUDE_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/include") -target_include_directories(nblExamplesAPI INTERFACE "${NBL_COMMON_API_INCLUDE_DIRECTORY}") +set(NBL_EXAMPLES_API_INCLUDE_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/include") +target_include_directories(nblExamplesAPI INTERFACE "${NBL_EXAMPLES_API_INCLUDE_DIRECTORY}") add_subdirectory("src/nbl/examples" EXCLUDE_FROM_ALL) -set(NBL_COMMON_API_TARGETS nblExamplesAPI ${NBL_COMMON_API_TARGETS} ${NBL_EXECUTABLE_COMMON_API_TARGET} PARENT_SCOPE) +# TODO: Arek what was `NBL_EXECUTABLE_COMMON_API_TARGET` ? I removed it. 
+set(NBL_EXAMPLES_API_TARGETS nblExamplesAPI ${NBL_EXAMPLES_API_TARGETS} PARENT_SCOPE) diff --git a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp index 7a3f253f3..9ebd244aa 100644 --- a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp +++ b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp @@ -4,13 +4,14 @@ #include -#include "nbl/asset/utils/CPolygonGeometryCreator.h" +#include "nbl/asset/utils/CGeometryCreator.h" // soon to be deprecated! #include "nbl/examples/common/SBasicViewParameters.hlsl" -#include "nbl/examples/geometry/creator/spirv/builtin/CArchive.h" -#include "nbl/examples/geometry/creator/spirv/builtin/builtinResources.h" +// TODO: Arek bring back +//#include "nbl/examples/geometry/spirv/builtin/CArchive.h" +//#include "nbl/examples/geometry/spirv/builtin/builtinResources.h" namespace nbl::examples diff --git a/common/src/nbl/examples/geometry/CMakeLists.txt b/common/src/nbl/examples/geometry/CMakeLists.txt index 0eb09263b..c402a2b8a 100644 --- a/common/src/nbl/examples/geometry/CMakeLists.txt +++ b/common/src/nbl/examples/geometry/CMakeLists.txt @@ -1,3 +1,6 @@ +# TODO: let arek figure out how to redo the shaders +#[===[ + # shaders IO directories set(NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/shaders") get_filename_component(_EXAMPLES_GEOMETRY_SPIRV_BR_BUNDLE_SEARCH_DIRECTORY_ "${CMAKE_CURRENT_BINARY_DIR}/shaders/include" ABSOLUTE) @@ -43,7 +46,7 @@ foreach(NBL_INPUT_SHADER IN LISTS NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS) set(NBL_NSC_COMPILE_COMMAND "$" -Fc "${NBL_OUTPUT_SPIRV_PATH}" - -I "${NBL_COMMON_API_INCLUDE_DIRECTORY}" + -I "${NBL_EXAMPLES_API_INCLUDE_DIRECTORY}" ${NBL_NSC_COMPILE_OPTIONS} # this should come from shader's [#pragma WAVE ] but our NSC doesn't seem to work properly currently "${NBL_INPUT_SHADER}" ) @@ -66,4 +69,5 @@ foreach(NBL_INPUT_SHADER IN LISTS NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS) 
LIST_BUILTIN_RESOURCE(GEOMETRY_CREATOR_SPIRV_RESOURCES_TO_EMBED "geometry/spirv/${NBL_OUTPUT_SPIRV_FILENAME}") endforeach() -ADD_CUSTOM_BUILTIN_RESOURCES(geometryCreatorSpirvBRD GEOMETRY_CREATOR_SPIRV_RESOURCES_TO_EMBED "${_EXAMPLES_GEOMETRY_SPIRV_BR_BUNDLE_SEARCH_DIRECTORY_}" "nbl" "geometry::spirv::builtin" "${_EXAMPLES_GEOMETRY_SPIRV_BR_OUTPUT_DIRECTORY_HEADER_}" "${_EXAMPLES_GEOMETRY_SPIRV_BR_OUTPUT_DIRECTORY_SOURCE_}" "STATIC" "INTERNAL") \ No newline at end of file +ADD_CUSTOM_BUILTIN_RESOURCES(geometryCreatorSpirvBRD GEOMETRY_CREATOR_SPIRV_RESOURCES_TO_EMBED "${_EXAMPLES_GEOMETRY_SPIRV_BR_BUNDLE_SEARCH_DIRECTORY_}" "nbl" "geometry::spirv::builtin" "${_EXAMPLES_GEOMETRY_SPIRV_BR_OUTPUT_DIRECTORY_HEADER_}" "${_EXAMPLES_GEOMETRY_SPIRV_BR_OUTPUT_DIRECTORY_SOURCE_}" "STATIC" "INTERNAL") +]===] \ No newline at end of file diff --git a/common/src/nbl/examples/pch/CMakeLists.txt b/common/src/nbl/examples/pch/CMakeLists.txt index 5e62f885f..34f16c2d2 100644 --- a/common/src/nbl/examples/pch/CMakeLists.txt +++ b/common/src/nbl/examples/pch/CMakeLists.txt @@ -1,3 +1,5 @@ +# TODO: let arek figure out how to redo the PCH +#[===[ include(common RESULT_VARIABLE RES) if(NOT RES) message(FATAL_ERROR "common.cmake not found. 
Should be in '${NBL_ROOT_PATH}/cmake' directory") @@ -12,4 +14,5 @@ target_precompile_headers("${EXECUTABLE_NAME}" PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/PCH.hpp" # Common PCH for examples "${NBL_NABLA_TARGET_SOURCE_DIR}/pch.h" # Nabla's PCH ) -unset(NBL_NABLA_TARGET_SOURCE_DIR) \ No newline at end of file +unset(NBL_NABLA_TARGET_SOURCE_DIR) +]===] \ No newline at end of file From 83443a75be9d732989cc77fe8cdfd18b7e6fa52f Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 4 Jun 2025 15:55:08 +0200 Subject: [PATCH 217/296] factored out the `CSwapchainFramebuffersAndDepth` into `nbl/examples/common` --- 09_GeometryCreator/include/common.hpp | 2 - 09_GeometryCreator/main.cpp | 394 ++++-------------- common/include/nbl/examples/PCH.hpp | 1 + .../nbl/examples/common/CEventCallback.hpp | 15 +- .../common/CSwapchainFramebuffersAndDepth.hpp | 101 +++++ .../nbl/examples/common/InputSystem.hpp | 37 +- .../examples/common/MonoWindowApplication.hpp | 189 +++++++++ .../geometry/CGeometryCreatorScene.hpp | 35 +- 8 files changed, 433 insertions(+), 341 deletions(-) create mode 100644 common/include/nbl/examples/common/CSwapchainFramebuffersAndDepth.hpp create mode 100644 common/include/nbl/examples/common/MonoWindowApplication.hpp diff --git a/09_GeometryCreator/include/common.hpp b/09_GeometryCreator/include/common.hpp index 946f2982f..02197171d 100644 --- a/09_GeometryCreator/include/common.hpp +++ b/09_GeometryCreator/include/common.hpp @@ -1,8 +1,6 @@ #ifndef _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_ #define _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_ -#include - // TODO: @AnastaZIuk do we even make that explicit? #include "nbl/examples/PCH.hpp" diff --git a/09_GeometryCreator/main.cpp b/09_GeometryCreator/main.cpp index 4ac527e08..2a3a1553e 100644 --- a/09_GeometryCreator/main.cpp +++ b/09_GeometryCreator/main.cpp @@ -1,205 +1,34 @@ -// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O. 
// This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h #include "common.hpp" -class CSwapchainFramebuffersAndDepth final : public nbl::video::CDefaultSwapchainFramebuffers +class GeometryCreatorApp final : public examples::MonoWindowApplication { - using base_t = CDefaultSwapchainFramebuffers; - -public: - template - inline CSwapchainFramebuffersAndDepth(ILogicalDevice* device, const asset::E_FORMAT _desiredDepthFormat, Args&&... args) : CDefaultSwapchainFramebuffers(device, std::forward(args)...) - { - const IPhysicalDevice::SImageFormatPromotionRequest req = { - .originalFormat = _desiredDepthFormat, - .usages = {IGPUImage::EUF_RENDER_ATTACHMENT_BIT} - }; - m_depthFormat = m_device->getPhysicalDevice()->promoteImageFormat(req, IGPUImage::TILING::OPTIMAL); - - const static IGPURenderpass::SCreationParams::SDepthStencilAttachmentDescription depthAttachments[] = { - {{ - { - .format = m_depthFormat, - .samples = IGPUImage::ESCF_1_BIT, - .mayAlias = false - }, - /*.loadOp = */{IGPURenderpass::LOAD_OP::CLEAR}, - /*.storeOp = */{IGPURenderpass::STORE_OP::STORE}, - /*.initialLayout = */{IGPUImage::LAYOUT::UNDEFINED}, // because we clear we don't care about contents - /*.finalLayout = */{IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL} // transition to presentation right away so we can skip a barrier - }}, - IGPURenderpass::SCreationParams::DepthStencilAttachmentsEnd - }; - m_params.depthStencilAttachments = depthAttachments; - - static IGPURenderpass::SCreationParams::SSubpassDescription subpasses[] = { - m_params.subpasses[0], - IGPURenderpass::SCreationParams::SubpassesEnd - }; - subpasses[0].depthStencilAttachment.render = { .attachmentIndex = 0,.layout = IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL }; - m_params.subpasses = subpasses; - } - -protected: - inline bool onCreateSwapchain_impl(const uint8_t qFam) override - { - auto device = const_cast(m_renderpass->getOriginDevice()); - - const auto depthFormat = 
m_renderpass->getCreationParameters().depthStencilAttachments[0].format; - const auto& sharedParams = getSwapchain()->getCreationParameters().sharedParams; - auto image = device->createImage({ IImage::SCreationParams{ - .type = IGPUImage::ET_2D, - .samples = IGPUImage::ESCF_1_BIT, - .format = depthFormat, - .extent = {sharedParams.width,sharedParams.height,1}, - .mipLevels = 1, - .arrayLayers = 1, - .depthUsage = IGPUImage::EUF_RENDER_ATTACHMENT_BIT - } }); - - device->allocate(image->getMemoryReqs(), image.get()); - - m_depthBuffer = device->createImageView({ - .flags = IGPUImageView::ECF_NONE, - .subUsages = IGPUImage::EUF_RENDER_ATTACHMENT_BIT, - .image = std::move(image), - .viewType = IGPUImageView::ET_2D, - .format = depthFormat, - .subresourceRange = {IGPUImage::EAF_DEPTH_BIT,0,1,0,1} - }); - - const auto retval = base_t::onCreateSwapchain_impl(qFam); - m_depthBuffer = nullptr; - return retval; - } - - inline smart_refctd_ptr createFramebuffer(IGPUFramebuffer::SCreationParams&& params) override - { - params.depthStencilAttachments = &m_depthBuffer.get(); - return m_device->createFramebuffer(std::move(params)); - } - - E_FORMAT m_depthFormat; - // only used to pass a parameter from `onCreateSwapchain_impl` to `createFramebuffer` - smart_refctd_ptr m_depthBuffer; -}; - -class GeometryCreatorApp final : public examples::SimpleWindowedApplication -{ - using device_base_t = examples::SimpleWindowedApplication; - using clock_t = std::chrono::steady_clock; - - constexpr static inline uint32_t WIN_W = 1280, WIN_H = 720; - - // Maximum frames which can be simultaneously submitted, used to cycle through our per-frame resources like command buffers - constexpr static inline uint32_t MaxFramesInFlight = 3u; - - constexpr static inline clock_t::duration DisplayImageDuration = std::chrono::milliseconds(900); + using base_t = examples::MonoWindowApplication; public: - inline GeometryCreatorApp(const path& _localInputCWD, const path& _localOutputCWD, const path& 
_sharedInputCWD, const path& _sharedOutputCWD) - : IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} + GeometryCreatorApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) + : base_t({1280,720}, EF_D16_UNORM, _localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} - virtual SPhysicalDeviceFeatures getRequiredDeviceFeatures() const override + SPhysicalDeviceFeatures getRequiredDeviceFeatures() const override { - auto retval = device_base_t::getRequiredDeviceFeatures(); + auto retval = base_t::getRequiredDeviceFeatures(); retval.geometryShader = true; return retval; } - inline core::vector getSurfaces() const override - { - if (!m_surface) - { - { - auto windowCallback = core::make_smart_refctd_ptr(smart_refctd_ptr(m_inputSystem), smart_refctd_ptr(m_logger)); - IWindow::SCreationParams params = {}; - params.callback = core::make_smart_refctd_ptr(); - params.width = WIN_W; - params.height = WIN_H; - params.x = 32; - params.y = 32; - params.flags = ui::IWindow::ECF_HIDDEN | IWindow::ECF_BORDERLESS | IWindow::ECF_RESIZABLE; - params.windowCaption = "GeometryCreatorApp"; - params.callback = windowCallback; - const_cast&>(m_window) = m_winMgr->createWindow(std::move(params)); - } - - auto surface = CSurfaceVulkanWin32::create(smart_refctd_ptr(m_api), smart_refctd_ptr_static_cast(m_window)); - const_cast&>(m_surface) = nbl::video::CSimpleResizeSurface::create(std::move(surface)); - } - - if (m_surface) - return { {m_surface->getSurface()/*,EQF_NONE*/} }; - - return {}; - } - inline bool onAppInitialized(smart_refctd_ptr&& system) override { - m_inputSystem = make_smart_refctd_ptr(logger_opt_smart_ptr(smart_refctd_ptr(m_logger))); - - if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) + if (!base_t::onAppInitialized(smart_refctd_ptr(system))) return false; m_semaphore = m_device->createSemaphore(m_realFrameIx); if (!m_semaphore) return 
logFail("Failed to Create a Semaphore!"); - ISwapchain::SCreationParams swapchainParams = { .surface = m_surface->getSurface() }; - if (!swapchainParams.deduceFormat(m_physicalDevice)) - return logFail("Could not choose a Surface Format for the Swapchain!"); - - // Subsequent submits don't wait for each other, hence its important to have External Dependencies which prevent users of the depth attachment overlapping. - const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] = { - // wipe-transition of Color to ATTACHMENT_OPTIMAL - { - .srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, - .dstSubpass = 0, - .memoryBarrier = { - // last place where the depth can get modified in previous frame - .srcStageMask = PIPELINE_STAGE_FLAGS::LATE_FRAGMENT_TESTS_BIT, - // only write ops, reads can't be made available - .srcAccessMask = ACCESS_FLAGS::DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, - // destination needs to wait as early as possible - .dstStageMask = PIPELINE_STAGE_FLAGS::EARLY_FRAGMENT_TESTS_BIT, - // because of depth test needing a read and a write - .dstAccessMask = ACCESS_FLAGS::DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | ACCESS_FLAGS::DEPTH_STENCIL_ATTACHMENT_READ_BIT - } - // leave view offsets and flags default - }, - // color from ATTACHMENT_OPTIMAL to PRESENT_SRC - { - .srcSubpass = 0, - .dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, - .memoryBarrier = { - // last place where the depth can get modified - .srcStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, - // only write ops, reads can't be made available - .srcAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT - // spec says nothing is needed when presentation is the destination - } - // leave view offsets and flags default - }, - IGPURenderpass::SCreationParams::DependenciesEnd - }; - - // TODO: promote the depth format if D16 not supported, or quote the spec if there's guaranteed support for it - auto scResources = 
std::make_unique(m_device.get(), EF_D16_UNORM, swapchainParams.surfaceFormat.format, dependencies); - - auto* renderpass = scResources->getRenderpass(); - - if (!renderpass) - return logFail("Failed to create Renderpass!"); - - auto gQueue = getGraphicsQueue(); - if (!m_surface || !m_surface->init(gQueue, std::move(scResources), swapchainParams.sharedParams)) - return logFail("Could not create Window & Surface or initialize the Surface!"); - - auto pool = m_device->createCommandPool(gQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - + auto pool = m_device->createCommandPool(getGraphicsQueue()->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); for (auto i = 0u; i < MaxFramesInFlight; i++) { if (!pool) @@ -208,12 +37,7 @@ class GeometryCreatorApp final : public examples::SimpleWindowedApplication return logFail("Couldn't create Command Buffer!"); } - m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H); - m_surface->recreateSwapchain(); - - auto assetManager = make_smart_refctd_ptr(smart_refctd_ptr(system)); - auto* geometry = assetManager->getGeometryCreator(); - +#if 0 //using Builder = typename CScene::CreateResourcesDirectlyWithDevice::Builder; using Builder = typename CScene::CreateResourcesWithAssetConverter::Builder; auto oneRunCmd = CScene::createCommandBuffer(m_utils->getLogicalDevice(), m_utils->getLogger(), gQueue->getFamilyIndex()); @@ -227,61 +51,25 @@ class GeometryCreatorApp final : public examples::SimpleWindowedApplication } else m_logger->log("Could not build resource objects!", ILogger::ELL_ERROR); - +#endif // camera { core::vectorSIMDf cameraPosition(-5.81655884, 2.58630896, -4.23974705); core::vectorSIMDf cameraTarget(-0.349590302, -0.213266611, 0.317821503); - matrix4SIMD projectionMatrix = matrix4SIMD::buildProjectionMatrixPerspectiveFovLH(core::radians(60.0f), float(WIN_W) / WIN_H, 0.1, 10000); + matrix4SIMD projectionMatrix = 
matrix4SIMD::buildProjectionMatrixPerspectiveFovLH(core::radians(60.0f), float(m_initialResolution.x)/float(m_initialResolution.y), 0.1, 10000); camera = Camera(cameraPosition, cameraTarget, projectionMatrix, 1.069f, 0.4f); } - m_winMgr->show(m_window.get()); - oracle.reportBeginFrameRecord(); - + onAppInitializedFinish(); return true; } - inline void workLoopBody() override + inline IQueue::SSubmitInfo::SSemaphoreInfo renderFrame(const std::chrono::microseconds nextPresentationTimestamp) override { - // framesInFlight: ensuring safe execution of command buffers and acquires, `framesInFlight` only affect semaphore waits, don't use this to index your resources because it can change with swapchain recreation. - const uint32_t framesInFlight = core::min(MaxFramesInFlight, m_surface->getMaxAcquiresInFlight()); - // We block for semaphores for 2 reasons here: - // A) Resource: Can't use resource like a command buffer BEFORE previous use is finished! [MaxFramesInFlight] - // B) Acquire: Can't have more acquires in flight than a certain threshold returned by swapchain or your surface helper class. 
[MaxAcquiresInFlight] - if (m_realFrameIx >= framesInFlight) - { - const ISemaphore::SWaitInfo cbDonePending[] = - { - { - .semaphore = m_semaphore.get(), - .value = m_realFrameIx + 1 - framesInFlight - } - }; - if (m_device->blockForSemaphores(cbDonePending) != ISemaphore::WAIT_RESULT::SUCCESS) - return; - } - - const auto resourceIx = m_realFrameIx % MaxFramesInFlight; - m_inputSystem->getDefaultMouse(&mouse); m_inputSystem->getDefaultKeyboard(&keyboard); - auto updatePresentationTimestamp = [&]() - { - m_currentImageAcquire = m_surface->acquireNextImage(); - - oracle.reportEndFrameRecord(); - const auto timestamp = oracle.getNextPresentationTimeStamp(); - oracle.reportBeginFrameRecord(); - - return timestamp; - }; - - const auto nextPresentationTimestamp = updatePresentationTimestamp(); - - if (!m_currentImageAcquire) - return; + const auto resourceIx = m_realFrameIx % base_t::MaxFramesInFlight; auto* const cb = m_cmdBufs.data()[resourceIx].get(); cb->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); @@ -292,12 +80,13 @@ class GeometryCreatorApp final : public examples::SimpleWindowedApplication mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void { camera.mouseProcess(events); mouseProcess(events); }, m_logger.get()); keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void { camera.keyboardProcess(events); }, m_logger.get()); camera.endInputProcessing(nextPresentationTimestamp); - +#if 0 const auto type = static_cast(gcIndex); const auto& [gpu, meta] = resources.objects[type]; object.meta.type = type; object.meta.name = meta.name; +#endif } const auto viewMatrix = camera.getViewMatrix(); @@ -312,7 +101,7 @@ class GeometryCreatorApp final : public examples::SimpleWindowedApplication core::matrix3x4SIMD normalMatrix; modelViewMatrix.getSub3x3InverseTranspose(normalMatrix); - +#if 0 SBasicViewParameters uboData; memcpy(uboData.MVP, modelViewProjectionMatrix.pointer(), sizeof(uboData.MVP)); 
memcpy(uboData.MV, modelViewMatrix.pointer(), sizeof(uboData.MV)); @@ -324,7 +113,7 @@ class GeometryCreatorApp final : public examples::SimpleWindowedApplication cb->updateBuffer(range, &uboData); } - +#endif auto* queue = getGraphicsQueue(); asset::SViewport viewport; @@ -357,7 +146,7 @@ class GeometryCreatorApp final : public examples::SimpleWindowedApplication auto scRes = static_cast(m_surface->getSwapchainResources()); const IGPUCommandBuffer::SRenderpassBeginInfo info = { - .framebuffer = scRes->getFramebuffer(m_currentImageAcquire.imageIndex), + .framebuffer = scRes->getFramebuffer(base_t::getCurrentAcquire().imageIndex), .colorClearValues = &clearValue, .depthStencilClearValues = &depthValue, .renderArea = currentRenderArea @@ -365,7 +154,7 @@ class GeometryCreatorApp final : public examples::SimpleWindowedApplication cb->beginRenderPass(info, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE); } - +#if 0 const auto& [hook, meta] = resources.objects[object.meta.type]; auto* rawPipeline = hook.pipeline.get(); @@ -382,96 +171,97 @@ class GeometryCreatorApp final : public examples::SimpleWindowedApplication } else cb->draw(hook.indexCount, 1, 0, 0); - +#endif cb->endRenderPass(); cb->end(); + + IQueue::SSubmitInfo::SSemaphoreInfo retval = + { + .semaphore = m_semaphore.get(), + .value = ++m_realFrameIx, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_GRAPHICS_BITS + }; + const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] = + { + {.cmdbuf = cb } + }; + const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { + {.semaphore = base_t::getCurrentAcquire().semaphore, .value = base_t::getCurrentAcquire().acquireCount, .stageMask = PIPELINE_STAGE_FLAGS::NONE} + }; + const IQueue::SSubmitInfo infos[] = { - const IQueue::SSubmitInfo::SSemaphoreInfo rendered[] = - { - { - .semaphore = m_semaphore.get(), - .value = ++m_realFrameIx, - .stageMask = PIPELINE_STAGE_FLAGS::ALL_GRAPHICS_BITS - } - }; { - { - const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] = - { - 
{.cmdbuf = cb } - }; - - const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = - { - { - .semaphore = m_currentImageAcquire.semaphore, - .value = m_currentImageAcquire.acquireCount, - .stageMask = PIPELINE_STAGE_FLAGS::NONE - } - }; - const IQueue::SSubmitInfo infos[] = - { - { - .waitSemaphores = acquired, - .commandBuffers = commandBuffers, - .signalSemaphores = rendered - } - }; - - if (queue->submit(infos) == IQueue::RESULT::SUCCESS) - { - const nbl::video::ISemaphore::SWaitInfo waitInfos[] = - { { - .semaphore = m_semaphore.get(), - .value = m_realFrameIx - } }; - - m_device->blockForSemaphores(waitInfos); // this is not solution, quick wa to not throw validation errors - } - else - --m_realFrameIx; - } + .waitSemaphores = acquired, + .commandBuffers = commandBuffers, + .signalSemaphores = {&retval,1} } + }; - std::string caption = "[Nabla Engine] Geometry Creator"; - { - caption += ", displaying [" + std::string(object.meta.name.data()) + "]"; - m_window->setCaption(caption); - } - m_surface->present(m_currentImageAcquire.imageIndex, rendered); + if (queue->submit(infos) != IQueue::RESULT::SUCCESS) + { + retval.semaphore = nullptr; // so that we don't wait on semaphore that will never signal + m_realFrameIx--; } - } - inline bool keepRunning() override - { - if (m_surface->irrecoverable()) - return false; - - return true; + std::string caption = "[Nabla Engine] Geometry Creator"; + { +// caption += ", displaying [" + std::string(object.meta.name.data()) + "]"; + m_window->setCaption(caption); + } + return retval; } - - inline bool onAppTerminated() override + + protected: + const video::IGPURenderpass::SCreationParams::SSubpassDependency* getDefaultSubpassDependencies() const override { - return device_base_t::onAppTerminated(); + // Subsequent submits don't wait for each other, hence its important to have External Dependencies which prevent users of the depth attachment overlapping. 
+ const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] = { + // wipe-transition of Color to ATTACHMENT_OPTIMAL + { + .srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, + .dstSubpass = 0, + .memoryBarrier = { + // last place where the depth can get modified in previous frame + .srcStageMask = PIPELINE_STAGE_FLAGS::LATE_FRAGMENT_TESTS_BIT, + // only write ops, reads can't be made available + .srcAccessMask = ACCESS_FLAGS::DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, + // destination needs to wait as early as possible + .dstStageMask = PIPELINE_STAGE_FLAGS::EARLY_FRAGMENT_TESTS_BIT, + // because of depth test needing a read and a write + .dstAccessMask = ACCESS_FLAGS::DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | ACCESS_FLAGS::DEPTH_STENCIL_ATTACHMENT_READ_BIT + } + // leave view offsets and flags default + }, + // color from ATTACHMENT_OPTIMAL to PRESENT_SRC + { + .srcSubpass = 0, + .dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, + .memoryBarrier = { + // last place where the depth can get modified + .srcStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, + // only write ops, reads can't be made available + .srcAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT + // spec says nothing is needed when presentation is the destination + } + // leave view offsets and flags default + }, + IGPURenderpass::SCreationParams::DependenciesEnd + }; + return dependencies; } private: - smart_refctd_ptr m_window; - smart_refctd_ptr> m_surface; smart_refctd_ptr m_semaphore; uint64_t m_realFrameIx = 0; - std::array, MaxFramesInFlight> m_cmdBufs; - ISimpleManagedSurface::SAcquireResult m_currentImageAcquire = {}; + std::array,base_t::MaxFramesInFlight> m_cmdBufs; - core::smart_refctd_ptr m_inputSystem; InputSystem::ChannelReader mouse; InputSystem::ChannelReader keyboard; Camera camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD()); - video::CDumbPresentationOracle 
oracle; - ResourcesBundle resources; - ObjectDrawHookCpu object; +// ResourcesBundle resources; +// ObjectDrawHookCpu object; uint16_t gcIndex = {}; void mouseProcess(const nbl::ui::IMouseEventChannel::range_t& events) @@ -481,7 +271,7 @@ class GeometryCreatorApp final : public examples::SimpleWindowedApplication auto ev = *eventIt; if (ev.type == nbl::ui::SMouseEvent::EET_SCROLL) - gcIndex = std::clamp(int16_t(gcIndex) + int16_t(core::sign(ev.scrollEvent.verticalScroll)), int64_t(0), int64_t(OT_COUNT - (uint8_t)1u)); + gcIndex = std::clamp(int16_t(gcIndex) + int16_t(core::sign(ev.scrollEvent.verticalScroll)), int64_t(0), int64_t(CGeometryCreatorScene::OT_COUNT - (uint8_t)1u)); } } }; diff --git a/common/include/nbl/examples/PCH.hpp b/common/include/nbl/examples/PCH.hpp index 7a1b6bdc6..179c9f037 100644 --- a/common/include/nbl/examples/PCH.hpp +++ b/common/include/nbl/examples/PCH.hpp @@ -11,6 +11,7 @@ // #include "nbl/ui/CWindowManagerAndroid.h" #include "nbl/examples/common/SimpleWindowedApplication.hpp" +#include "nbl/examples/common/MonoWindowApplication.hpp" #include "nbl/examples/common/InputSystem.hpp" #include "nbl/examples/common/CEventCallback.hpp" diff --git a/common/include/nbl/examples/common/CEventCallback.hpp b/common/include/nbl/examples/common/CEventCallback.hpp index 4670ca7f6..cae6dc7de 100644 --- a/common/include/nbl/examples/common/CEventCallback.hpp +++ b/common/include/nbl/examples/common/CEventCallback.hpp @@ -1,9 +1,14 @@ -#ifndef _NBL_COMMON_C_EVENT_CALLBACK_HPP_INCLUDED_ -#define _NBL_C_EVENT_CALLBACK_HPP_INCLUDED_ +#ifndef _NBL_EXAMPLES_COMMON_C_EVENT_CALLBACK_HPP_INCLUDED_ +#define _NBL_EXAMPLES_COMMON_C_EVENT_CALLBACK_HPP_INCLUDED_ + #include "nbl/video/utilities/CSimpleResizeSurface.h" -#include "InputSystem.hpp" +#include "nbl/examples/common/InputSystem.hpp" + + +namespace nbl::examples +{ class CEventCallback : public nbl::video::ISimpleManagedSurface::ICallback { public: @@ -45,5 +50,5 @@ class CEventCallback : public 
nbl::video::ISimpleManagedSurface::ICallback nbl::core::smart_refctd_ptr m_inputSystem = nullptr; nbl::system::logger_opt_smart_ptr m_logger = nullptr; }; - -#endif // _NBL_C_EVENT_CALLBACK_HPP_INCLUDED_ \ No newline at end of file +} +#endif \ No newline at end of file diff --git a/common/include/nbl/examples/common/CSwapchainFramebuffersAndDepth.hpp b/common/include/nbl/examples/common/CSwapchainFramebuffersAndDepth.hpp new file mode 100644 index 000000000..a79d59730 --- /dev/null +++ b/common/include/nbl/examples/common/CSwapchainFramebuffersAndDepth.hpp @@ -0,0 +1,101 @@ +// Copyright (C) 2023-2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_EXAMPLES_COMMON_C_SWAPCHAIN_FRAMEBUFFERS_AND_DEPTH_HPP_INCLUDED_ +#define _NBL_EXAMPLES_COMMON_C_SWAPCHAIN_FRAMEBUFFERS_AND_DEPTH_HPP_INCLUDED_ + +// Build on top of the previous one +#include "nbl/application_templates/BasicMultiQueueApplication.hpp" + +namespace nbl::examples +{ + +class CSwapchainFramebuffersAndDepth final : public video::CDefaultSwapchainFramebuffers +{ + using base_t = CDefaultSwapchainFramebuffers; + + public: + template + inline CSwapchainFramebuffersAndDepth(video::ILogicalDevice* device, const asset::E_FORMAT _desiredDepthFormat, Args&&... args) : base_t(device,std::forward(args)...) 
+ { + using namespace nbl::asset; + using namespace nbl::video; + const IPhysicalDevice::SImageFormatPromotionRequest req = { + .originalFormat = _desiredDepthFormat, + .usages = {IGPUImage::EUF_RENDER_ATTACHMENT_BIT} + }; + m_depthFormat = m_device->getPhysicalDevice()->promoteImageFormat(req,IGPUImage::TILING::OPTIMAL); + + const static IGPURenderpass::SCreationParams::SDepthStencilAttachmentDescription depthAttachments[] = { + {{ + { + .format = m_depthFormat, + .samples = IGPUImage::ESCF_1_BIT, + .mayAlias = false + }, + /*.loadOp = */{IGPURenderpass::LOAD_OP::CLEAR}, + /*.storeOp = */{IGPURenderpass::STORE_OP::STORE}, + /*.initialLayout = */{IGPUImage::LAYOUT::UNDEFINED}, // because we clear we don't care about contents + /*.finalLayout = */{IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL} // transition to presentation right away so we can skip a barrier + }}, + IGPURenderpass::SCreationParams::DepthStencilAttachmentsEnd + }; + m_params.depthStencilAttachments = depthAttachments; + + static IGPURenderpass::SCreationParams::SSubpassDescription subpasses[] = { + m_params.subpasses[0], + IGPURenderpass::SCreationParams::SubpassesEnd + }; + subpasses[0].depthStencilAttachment.render = { .attachmentIndex = 0,.layout = IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL }; + m_params.subpasses = subpasses; + } + + protected: + inline bool onCreateSwapchain_impl(const uint8_t qFam) override + { + using namespace nbl::asset; + using namespace nbl::video; + // DOCS: why are we not using `m_device` here? any particular reason? 
+ auto device = const_cast(m_renderpass->getOriginDevice()); + + const auto depthFormat = m_renderpass->getCreationParameters().depthStencilAttachments[0].format; + const auto& sharedParams = getSwapchain()->getCreationParameters().sharedParams; + auto image = device->createImage({ IImage::SCreationParams{ + .type = IGPUImage::ET_2D, + .samples = IGPUImage::ESCF_1_BIT, + .format = depthFormat, + .extent = {sharedParams.width,sharedParams.height,1}, + .mipLevels = 1, + .arrayLayers = 1, + .depthUsage = IGPUImage::EUF_RENDER_ATTACHMENT_BIT + } }); + + device->allocate(image->getMemoryReqs(), image.get()); + + m_depthBuffer = device->createImageView({ + .flags = IGPUImageView::ECF_NONE, + .subUsages = IGPUImage::EUF_RENDER_ATTACHMENT_BIT, + .image = std::move(image), + .viewType = IGPUImageView::ET_2D, + .format = depthFormat, + .subresourceRange = {IGPUImage::EAF_DEPTH_BIT,0,1,0,1} + }); + + const auto retval = base_t::onCreateSwapchain_impl(qFam); + m_depthBuffer = nullptr; + return retval; + } + + inline core::smart_refctd_ptr createFramebuffer(video::IGPUFramebuffer::SCreationParams&& params) override + { + params.depthStencilAttachments = &m_depthBuffer.get(); + return m_device->createFramebuffer(std::move(params)); + } + + asset::E_FORMAT m_depthFormat; + // only used to pass a parameter from `onCreateSwapchain_impl` to `createFramebuffer` + core::smart_refctd_ptr m_depthBuffer; +}; + +} +#endif \ No newline at end of file diff --git a/common/include/nbl/examples/common/InputSystem.hpp b/common/include/nbl/examples/common/InputSystem.hpp index c42b738d0..c30fc1212 100644 --- a/common/include/nbl/examples/common/InputSystem.hpp +++ b/common/include/nbl/examples/common/InputSystem.hpp @@ -4,16 +4,19 @@ #ifndef _NBL_EXAMPLES_COMMON_INPUT_SYSTEM_HPP_INCLUDED_ #define _NBL_EXAMPLES_COMMON_INPUT_SYSTEM_HPP_INCLUDED_ -class InputSystem : public nbl::core::IReferenceCounted +namespace nbl::examples +{ + +class InputSystem : public core::IReferenceCounted { public: 
template struct Channels { - nbl::core::mutex lock; + core::mutex lock; std::condition_variable added; - nbl::core::vector> channels; - nbl::core::vector timeStamps; + core::vector> channels; + core::vector timeStamps; uint32_t defaultChannelIndex = 0; }; // TODO: move to "nbl/ui/InputEventChannel.h" once the interface of this utility struct matures, also maybe rename to `Consumer` ? @@ -21,7 +24,7 @@ class InputSystem : public nbl::core::IReferenceCounted struct ChannelReader { template - inline void consumeEvents(F&& processFunc, nbl::system::logger_opt_ptr logger=nullptr) + inline void consumeEvents(F&& processFunc, system::logger_opt_ptr logger=nullptr) { auto events = channel->getEvents(); const auto frontBufferCapacity = channel->getFrontBufferCapacity(); @@ -29,7 +32,7 @@ class InputSystem : public nbl::core::IReferenceCounted { logger.log( "Detected overflow, %d unconsumed events in channel of size %d!", - nbl::system::ILogger::ELL_ERROR,events.size()-consumedCounter,frontBufferCapacity + system::ILogger::ELL_ERROR,events.size()-consumedCounter,frontBufferCapacity ); consumedCounter = events.size()-frontBufferCapacity; } @@ -38,22 +41,22 @@ class InputSystem : public nbl::core::IReferenceCounted consumedCounter = events.size(); } - nbl::core::smart_refctd_ptr channel = nullptr; + core::smart_refctd_ptr channel = nullptr; uint64_t consumedCounter = 0ull; }; - InputSystem(nbl::system::logger_opt_smart_ptr&& logger) : m_logger(std::move(logger)) {} + InputSystem(system::logger_opt_smart_ptr&& logger) : m_logger(std::move(logger)) {} - void getDefaultMouse(ChannelReader* reader) + void getDefaultMouse(ChannelReader* reader) { getDefault(m_mouse,reader); } - void getDefaultKeyboard(ChannelReader* reader) + void getDefaultKeyboard(ChannelReader* reader) { getDefault(m_keyboard,reader); } template - void add(Channels& channels, nbl::core::smart_refctd_ptr&& channel) + void add(Channels& channels, core::smart_refctd_ptr&& channel) { std::unique_lock 
lock(channels.lock); channels.channels.push_back(std::move(channel)); @@ -94,7 +97,7 @@ class InputSystem : public nbl::core::IReferenceCounted std::unique_lock lock(channels.lock); while (channels.channels.empty()) { - m_logger.log("Waiting For Input Device to be connected...",nbl::system::ILogger::ELL_INFO); + m_logger.log("Waiting For Input Device to be connected...",system::ILogger::ELL_INFO); channels.added.wait(lock); } @@ -159,7 +162,7 @@ class InputSystem : public nbl::core::IReferenceCounted } if(defaultIdx != newDefaultIdx) { - m_logger.log("Default InputChannel for ChannelType changed from %u to %u",nbl::system::ILogger::ELL_INFO, defaultIdx, newDefaultIdx); + m_logger.log("Default InputChannel for ChannelType changed from %u to %u",system::ILogger::ELL_INFO, defaultIdx, newDefaultIdx); defaultIdx = newDefaultIdx; channels.defaultChannelIndex = newDefaultIdx; @@ -177,10 +180,10 @@ class InputSystem : public nbl::core::IReferenceCounted reader->consumedCounter = consumedCounter; } - nbl::system::logger_opt_smart_ptr m_logger; - Channels m_mouse; - Channels m_keyboard; + system::logger_opt_smart_ptr m_logger; + Channels m_mouse; + Channels m_keyboard; }; - +} #endif diff --git a/common/include/nbl/examples/common/MonoWindowApplication.hpp b/common/include/nbl/examples/common/MonoWindowApplication.hpp new file mode 100644 index 000000000..0f18012c0 --- /dev/null +++ b/common/include/nbl/examples/common/MonoWindowApplication.hpp @@ -0,0 +1,189 @@ +// Copyright (C) 2023-2023 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_EXAMPLES_COMMON_MONO_WINDOW_APPLICATION_HPP_INCLUDED_ +#define _NBL_EXAMPLES_COMMON_MONO_WINDOW_APPLICATION_HPP_INCLUDED_ + +// Build on top of the previous one +#include "nbl/examples/common/SimpleWindowedApplication.hpp" +#include "nbl/examples/common/CSwapchainFramebuffersAndDepth.hpp" +#include "nbl/examples/common/CEventCallback.hpp" + +namespace nbl::examples +{ + +// Virtual Inheritance because apps might end up doing diamond inheritance +class MonoWindowApplication : public virtual SimpleWindowedApplication +{ + using base_t = SimpleWindowedApplication; + + public: + // Maximum frames which can be simultaneously submitted, used to cycle through our per-frame resources like command buffers + constexpr static inline uint8_t MaxFramesInFlight = 3; + + template + MonoWindowApplication(const hlsl::uint16_t2 _initialResolution, const asset::E_FORMAT _depthFormat, Args&&... args) : + base_t(std::forward(args)...), m_initialResolution(_initialResolution), m_depthFormat(_depthFormat) {} + + // + inline core::vector getSurfaces() const override final + { + if (!m_surface) + { + using namespace nbl::core; + using namespace nbl::ui; + using namespace nbl::video; + { + auto windowCallback = make_smart_refctd_ptr(smart_refctd_ptr(m_inputSystem),smart_refctd_ptr(m_logger)); + IWindow::SCreationParams params = {}; + params.callback = make_smart_refctd_ptr(); + params.width = m_initialResolution[0]; + params.height = m_initialResolution[1]; + params.x = 32; + params.y = 32; + params.flags = ui::IWindow::ECF_HIDDEN | IWindow::ECF_BORDERLESS | IWindow::ECF_RESIZABLE; + params.windowCaption = "MonoWindowApplication"; + params.callback = windowCallback; + const_cast&>(m_window) = m_winMgr->createWindow(std::move(params)); + } + + auto surface = CSurfaceVulkanWin32::create(smart_refctd_ptr(m_api), smart_refctd_ptr_static_cast(m_window)); + const_cast&>(m_surface) = 
CSimpleResizeSurface::create(std::move(surface)); + } + + if (m_surface) + return { {m_surface->getSurface()/*,EQF_NONE*/} }; + + return {}; + } + + virtual inline bool onAppInitialized(core::smart_refctd_ptr&& system) override + { + using namespace nbl::core; + using namespace nbl::video; + // want to have a usable system and logger first + if (!MonoSystemMonoLoggerApplication::onAppInitialized(std::move(system))) + return false; + + m_inputSystem = make_smart_refctd_ptr(system::logger_opt_smart_ptr(smart_refctd_ptr(m_logger))); + if (!base_t::onAppInitialized(std::move(system))) + return false; + + ISwapchain::SCreationParams swapchainParams = { .surface = smart_refctd_ptr(m_surface->getSurface()) }; + if (!swapchainParams.deduceFormat(m_physicalDevice)) + return logFail("Could not choose a Surface Format for the Swapchain!"); + + // TODO: option without depth + auto scResources = std::make_unique(m_device.get(),m_depthFormat,swapchainParams.surfaceFormat.format,getDefaultSubpassDependencies()); + auto* renderpass = scResources->getRenderpass(); + + if (!renderpass) + return logFail("Failed to create Renderpass!"); + + auto gQueue = getGraphicsQueue(); + if (!m_surface || !m_surface->init(gQueue,std::move(scResources),swapchainParams.sharedParams)) + return logFail("Could not create Window & Surface or initialize the Surface!"); + + m_winMgr->setWindowSize(m_window.get(),m_initialResolution[0],m_initialResolution[1]); + m_surface->recreateSwapchain(); + + return true; + } + + // we do slight inversion of control here + inline void workLoopBody() override final + { + using namespace nbl::core; + using namespace nbl::video; + // framesInFlight: ensuring safe execution of command buffers and acquires, `framesInFlight` only affect semaphore waits, don't use this to index your resources because it can change with swapchain recreation. 
+ const uint32_t framesInFlightCount = hlsl::min(MaxFramesInFlight,m_surface->getMaxAcquiresInFlight()); + // We block for semaphores for 2 reasons here: + // A) Resource: Can't use resource like a command buffer BEFORE previous use is finished! [MaxFramesInFlight] + // B) Acquire: Can't have more acquires in flight than a certain threshold returned by swapchain or your surface helper class. [MaxAcquiresInFlight] + if (m_framesInFlight.size()>=framesInFlightCount) + { + const ISemaphore::SWaitInfo framesDone[] = + { + { + .semaphore = m_framesInFlight.front().semaphore.get(), + .value = m_framesInFlight.front().value + } + }; + if (m_device->blockForSemaphores(framesDone)!=ISemaphore::WAIT_RESULT::SUCCESS) + return; + m_framesInFlight.pop_front(); + } + + auto updatePresentationTimestamp = [&]() + { + m_currentImageAcquire = m_surface->acquireNextImage(); + + // TODO: better frame pacing than this + oracle.reportEndFrameRecord(); + const auto timestamp = oracle.getNextPresentationTimeStamp(); + oracle.reportBeginFrameRecord(); + + return timestamp; + }; + + const auto nextPresentationTimestamp = updatePresentationTimestamp(); + + if (!m_currentImageAcquire) + return; + + const IQueue::SSubmitInfo::SSemaphoreInfo rendered[] = {renderFrame(nextPresentationTimestamp)}; + m_surface->present(m_currentImageAcquire.imageIndex,rendered); + if (rendered->semaphore) + m_framesInFlight.emplace_back(smart_refctd_ptr(rendered->semaphore),rendered->value); + } + + // + virtual inline bool keepRunning() override + { + if (m_surface->irrecoverable()) + return false; + + return true; + } + + // + virtual inline bool onAppTerminated() + { + m_inputSystem = nullptr; + m_device->waitIdle(); + m_framesInFlight.clear(); + m_surface = nullptr; + m_window = nullptr; + return base_t::onAppTerminated(); + } + + protected: + inline void onAppInitializedFinish() + { + m_winMgr->show(m_window.get()); + oracle.reportBeginFrameRecord(); + } + inline const auto& getCurrentAcquire() const {return 
m_currentImageAcquire;} + + virtual const video::IGPURenderpass::SCreationParams::SSubpassDependency* getDefaultSubpassDependencies() const = 0; + virtual video::IQueue::SSubmitInfo::SSemaphoreInfo renderFrame(const std::chrono::microseconds nextPresentationTimestamp) = 0; + + const hlsl::uint16_t2 m_initialResolution; + const asset::E_FORMAT m_depthFormat; + core::smart_refctd_ptr m_inputSystem; + core::smart_refctd_ptr m_window; + core::smart_refctd_ptr> m_surface; + + private: + struct SSubmittedFrame + { + core::smart_refctd_ptr semaphore; + uint64_t value; + }; + core::deque m_framesInFlight; + video::ISimpleManagedSurface::SAcquireResult m_currentImageAcquire = {}; + video::CDumbPresentationOracle oracle; +}; + +} +#endif \ No newline at end of file diff --git a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp index 9ebd244aa..e68441ffe 100644 --- a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp +++ b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp @@ -17,20 +17,25 @@ namespace nbl::examples { -enum ObjectType : uint8_t +class CGeometryCreatorScene { - OT_CUBE, - OT_SPHERE, - OT_CYLINDER, - OT_RECTANGLE, - OT_DISK, - OT_ARROW, - OT_CONE, - OT_ICOSPHERE, - - OT_COUNT, - OT_UNKNOWN = std::numeric_limits::max() + public: + enum ObjectType : uint8_t + { + OT_CUBE, + OT_SPHERE, + OT_CYLINDER, + OT_RECTANGLE, + OT_DISK, + OT_ARROW, + OT_CONE, + OT_ICOSPHERE, + + OT_COUNT, + OT_UNKNOWN = std::numeric_limits::max() + }; }; +#if 0 struct ObjectMeta { @@ -1346,7 +1351,7 @@ class CScene final : public nbl::core::IReferenceCounted ResourcesBundle resources; }; +#endif -} // nbl::scene::geometrycreator - -#endif // _NBL_GEOMETRY_CREATOR_SCENE_H_INCLUDED_ \ No newline at end of file +} +#endif \ No newline at end of file From 90ba9265ae0ed7cdb460ff90bb5bb2a1c439655c Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 5 Jun 2025 15:00:14 +0700 Subject: [PATCH 
218/296] various minor adjustments to unit tests --- .../app_resources/common.hlsl | 9 ++ .../app_resources/shaderCommon.hlsl | 14 +-- .../app_resources/testSubgroup.comp.hlsl | 40 +++---- .../app_resources/testWorkgroup.comp.hlsl | 110 ++++++++---------- 23_Arithmetic2UnitTest/main.cpp | 26 ++--- 5 files changed, 88 insertions(+), 111 deletions(-) diff --git a/23_Arithmetic2UnitTest/app_resources/common.hlsl b/23_Arithmetic2UnitTest/app_resources/common.hlsl index 10892a2b9..2daffa56c 100644 --- a/23_Arithmetic2UnitTest/app_resources/common.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/common.hlsl @@ -10,6 +10,14 @@ struct Output uint32_t data[ScanElementCount]; }; +struct PushConstantData +{ + uint64_t pInputBuf; + uint64_t ppOutputBuf; +}; + +namespace arithmetic +{ // Thanks to our unified HLSL/C++ STD lib we're able to remove a whole load of code template struct bit_and : nbl::hlsl::bit_and @@ -92,5 +100,6 @@ struct ballot : nbl::hlsl::plus static inline constexpr const char* name = "bitcount"; #endif }; +} #include "nbl/builtin/hlsl/subgroup/basic.hlsl" \ No newline at end of file diff --git a/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl b/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl index 31d59121b..9045d62e8 100644 --- a/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl @@ -2,6 +2,9 @@ #include "nbl/builtin/hlsl/jit/device_capabilities.hlsl" +using namespace nbl; +using namespace hlsl; + // https://github.com/microsoft/DirectXShaderCompiler/issues/6144 uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);} @@ -9,19 +12,8 @@ uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1 #error "Define ITEMS_PER_INVOCATION!" 
#endif -struct PushConstantData -{ - uint64_t inputBufAddress; - uint64_t outputAddressBufAddress; -}; - [[vk::push_constant]] PushConstantData pc; -// because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way -uint32_t globalIndex(); -// since we test ITEMS_PER_WG type_t; +uint32_t globalIndex() +{ + return glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+workgroup::SubgroupContiguousIndex(); +} + +bool canStore() { return true; } + template static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) { - using config_t = nbl::hlsl::subgroup2::Configuration; - using params_t = nbl::hlsl::subgroup2::ArithmeticParams; + using config_t = subgroup2::Configuration; + using params_t = subgroup2::ArithmeticParams; - const uint64_t outputBufAddr = vk::RawBufferLoad(pc.outputAddressBufAddress + Binop::BindingIndex * sizeof(uint64_t), sizeof(uint64_t)); + const uint64_t outputBufAddr = vk::RawBufferLoad(pc.ppOutputBuf + Binop::BindingIndex * sizeof(uint64_t), sizeof(uint64_t)); if (globalIndex()==0u) - vk::RawBufferStore(outputBufAddr, nbl::hlsl::glsl::gl_SubgroupSize()); + vk::RawBufferStore(outputBufAddr, glsl::gl_SubgroupSize()); operation_t func; type_t val = func(sourceVal); @@ -31,25 +38,18 @@ static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) type_t test() { const uint32_t idx = globalIndex(); - type_t sourceVal = vk::RawBufferLoad(pc.inputBufAddress + idx * sizeof(type_t)); - - subtest, ITEMS_PER_INVOCATION>(sourceVal); - subtest, ITEMS_PER_INVOCATION>(sourceVal); - subtest, ITEMS_PER_INVOCATION>(sourceVal); - subtest, ITEMS_PER_INVOCATION>(sourceVal); - subtest, ITEMS_PER_INVOCATION>(sourceVal); - subtest, ITEMS_PER_INVOCATION>(sourceVal); - subtest, ITEMS_PER_INVOCATION>(sourceVal); + type_t sourceVal = vk::RawBufferLoad(pc.pInputBuf + idx * sizeof(type_t)); + + subtest, ITEMS_PER_INVOCATION>(sourceVal); + subtest, ITEMS_PER_INVOCATION>(sourceVal); + subtest, ITEMS_PER_INVOCATION>(sourceVal); + subtest, 
ITEMS_PER_INVOCATION>(sourceVal); + subtest, ITEMS_PER_INVOCATION>(sourceVal); + subtest, ITEMS_PER_INVOCATION>(sourceVal); + subtest, ITEMS_PER_INVOCATION>(sourceVal); return sourceVal; } -uint32_t globalIndex() -{ - return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex(); -} - -bool canStore() {return true;} - [numthreads(WORKGROUP_SIZE,1,1)] void main() { diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl index 0a7fde9ba..0f97c7b54 100644 --- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl @@ -9,12 +9,12 @@ static const uint32_t WORKGROUP_SIZE = 1u << WORKGROUP_SIZE_LOG2; #include "shaderCommon.hlsl" -using config_t = nbl::hlsl::workgroup2::ArithmeticConfiguration; +using config_t = workgroup2::ArithmeticConfiguration; typedef vector type_t; // final (level 1/2) scan needs to fit in one subgroup exactly -groupshared uint32_t scratch[config_t::SharedScratchElementCount]; +groupshared uint32_t scratch[mpl::max_v]; struct ScratchProxy { @@ -31,13 +31,13 @@ struct ScratchProxy uint32_t atomicOr(const uint32_t ix, const uint32_t value) { - return nbl::hlsl::glsl::atomicOr(scratch[ix],value); + return glsl::atomicOr(scratch[ix],value); } void workgroupExecutionAndMemoryBarrier() { - nbl::hlsl::glsl::barrier(); - //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above + glsl::barrier(); + //glsl::memoryBarrierShared(); implied by the above } }; @@ -45,26 +45,26 @@ template struct DataProxy { using dtype_t = vector; - static_assert(nbl::hlsl::is_same_v); + static_assert(is_same_v); template void get(const IndexType ix, NBL_REF_ARG(AccessType) value) { - const uint32_t workgroupOffset = nbl::hlsl::glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize; - value = vk::RawBufferLoad(pc.inputBufAddress + (workgroupOffset + ix) * sizeof(AccessType)); + 
const uint32_t workgroupOffset = glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize; + value = vk::RawBufferLoad(pc.pInputBuf + (workgroupOffset + ix) * sizeof(AccessType)); } template void set(const IndexType ix, const AccessType value) { - const uint32_t workgroupOffset = nbl::hlsl::glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize; - uint64_t outputBufAddr = vk::RawBufferLoad(pc.outputAddressBufAddress + Binop::BindingIndex * sizeof(uint64_t)); + const uint32_t workgroupOffset = glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize; + uint64_t outputBufAddr = vk::RawBufferLoad(pc.ppOutputBuf + Binop::BindingIndex * sizeof(uint64_t)); vk::RawBufferStore(outputBufAddr + sizeof(uint32_t) + sizeof(AccessType) * (workgroupOffset+ix), value, sizeof(uint32_t)); } void workgroupExecutionAndMemoryBarrier() { - nbl::hlsl::glsl::barrier(); - //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above + glsl::barrier(); + //glsl::memoryBarrierShared(); implied by the above } }; @@ -72,41 +72,41 @@ template struct PreloadedDataProxy { using dtype_t = vector; - static_assert(nbl::hlsl::is_same_v); + static_assert(is_same_v); NBL_CONSTEXPR_STATIC_INLINE uint32_t PreloadedDataCount = Config::VirtualWorkgroupSize / Config::WorkgroupSize; template void get(const IndexType ix, NBL_REF_ARG(AccessType) value) { - value = preloaded[(ix-nbl::hlsl::workgroup::SubgroupContiguousIndex())>>Config::WorkgroupSizeLog2]; + value = preloaded[ix>>Config::WorkgroupSizeLog2]; } template void set(const IndexType ix, const AccessType value) { - preloaded[(ix-nbl::hlsl::workgroup::SubgroupContiguousIndex())>>Config::WorkgroupSizeLog2] = value; + preloaded[ix>>Config::WorkgroupSizeLog2] = value; } void preload() { - const uint32_t workgroupOffset = nbl::hlsl::glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize; + const uint32_t workgroupOffset = glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize; [unroll] for (uint32_t idx = 0; idx < PreloadedDataCount; idx++) - 
preloaded[idx] = vk::RawBufferLoad(pc.inputBufAddress + (workgroupOffset + idx * Config::WorkgroupSize + nbl::hlsl::workgroup::SubgroupContiguousIndex()) * sizeof(dtype_t)); + preloaded[idx] = vk::RawBufferLoad(pc.pInputBuf + (workgroupOffset + idx * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex()) * sizeof(dtype_t)); } void unload() { - const uint32_t workgroupOffset = nbl::hlsl::glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize; - uint64_t outputBufAddr = vk::RawBufferLoad(pc.outputAddressBufAddress + Binop::BindingIndex * sizeof(uint64_t)); + const uint32_t workgroupOffset = glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize; + uint64_t outputBufAddr = vk::RawBufferLoad(pc.ppOutputBuf + Binop::BindingIndex * sizeof(uint64_t)); [unroll] for (uint32_t idx = 0; idx < PreloadedDataCount; idx++) - vk::RawBufferStore(outputBufAddr + sizeof(uint32_t) + sizeof(dtype_t) * (workgroupOffset + idx * Config::WorkgroupSize + nbl::hlsl::workgroup::SubgroupContiguousIndex()), preloaded[idx], sizeof(uint32_t)); + vk::RawBufferStore(outputBufAddr + sizeof(uint32_t) + sizeof(dtype_t) * (workgroupOffset + idx * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex()), preloaded[idx], sizeof(uint32_t)); } void workgroupExecutionAndMemoryBarrier() { - nbl::hlsl::glsl::barrier(); - //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above + glsl::barrier(); + //glsl::memoryBarrierShared(); implied by the above } dtype_t preloaded[PreloadedDataCount]; @@ -122,73 +122,55 @@ struct operation_t // workgroup reduction returns the value of the reduction // workgroup scans do no return anything, but use the data accessor to do the storing directly -#if IS_REDUCTION void operator()() { PreloadedDataProxy dataAccessor; dataAccessor.preload(); - otype_t value = nbl::hlsl::OPERATION::template __call, ScratchProxy>(dataAccessor,arithmeticAccessor); +#if IS_REDUCTION + otype_t value = +#endif + OPERATION::template __call, 
ScratchProxy>(dataAccessor,arithmeticAccessor); // we barrier before because we alias the accessors for Binop arithmeticAccessor.workgroupExecutionAndMemoryBarrier(); - +#if IS_REDUCTION [unroll] for (uint32_t i = 0; i < PreloadedDataProxy::PreloadedDataCount; i++) dataAccessor.preloaded[i] = value; +#endif dataAccessor.unload(); } -#else - void operator()() - { - PreloadedDataProxy dataAccessor; - dataAccessor.preload(); - nbl::hlsl::OPERATION::template __call, ScratchProxy>(dataAccessor,arithmeticAccessor); - // we barrier before because we alias the accessors for Binop - arithmeticAccessor.workgroupExecutionAndMemoryBarrier(); - dataAccessor.unload(); - } -#endif }; -template -static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) +uint32_t globalIndex() { - uint64_t outputBufAddr = vk::RawBufferLoad(pc.outputAddressBufAddress + Binop::BindingIndex * sizeof(uint64_t)); - if (globalIndex()==0u) - vk::RawBufferStore(outputBufAddr, nbl::hlsl::glsl::gl_SubgroupSize()); - - operation_t func; - func(); + return glsl::gl_WorkGroupID().x*ITEMS_PER_WG+workgroup::SubgroupContiguousIndex(); } - -type_t test() +template +static void subtest() { - type_t sourceVal = vk::RawBufferLoad(pc.inputBufAddress + globalIndex() * sizeof(type_t)); - - subtest >(sourceVal); - subtest >(sourceVal); - subtest >(sourceVal); - subtest >(sourceVal); - subtest >(sourceVal); - subtest >(sourceVal); - subtest >(sourceVal); - return sourceVal; -} - + uint64_t outputBufAddr = vk::RawBufferLoad(pc.ppOutputBuf + Binop::BindingIndex * sizeof(uint64_t)); + if (globalIndex()==0u) + vk::RawBufferStore(outputBufAddr, glsl::gl_SubgroupSize()); -uint32_t globalIndex() -{ - return nbl::hlsl::glsl::gl_WorkGroupID().x*ITEMS_PER_WG+nbl::hlsl::workgroup::SubgroupContiguousIndex(); + operation_t func; + func(); } -bool canStore() +void test() { - return nbl::hlsl::workgroup::SubgroupContiguousIndex() >(); + subtest >(); + subtest >(); + subtest >(); + subtest >(); + subtest >(); + subtest >(); } 
[numthreads(WORKGROUP_SIZE,1,1)] void main() { - const type_t sourceVal = test(); + test(); } \ No newline at end of file diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp index 73e6a144e..71642b631 100644 --- a/23_Arithmetic2UnitTest/main.cpp +++ b/23_Arithmetic2UnitTest/main.cpp @@ -45,12 +45,6 @@ struct emulatedScanExclusive static inline constexpr const char* name = "exclusive_scan"; }; -struct PushConstantData -{ - uint64_t inputBufAddress; - uint64_t outputAddressBufAddress; -}; - class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueueApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication { using device_base_t = application_templates::BasicMultiQueueApplication; @@ -118,8 +112,8 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu params.size = OutputBufferCount * sizeof(uint64_t); m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = getTransferUpQueue() }, std::move(params), outputAddresses.data()).move_into(gpuOutputAddressesBuffer); } - pc.inputBufAddress = gpuinputDataBuffer->getDeviceAddress(); - pc.outputAddressBufAddress = gpuOutputAddressesBuffer->getDeviceAddress(); + pc.pInputBuf = gpuinputDataBuffer->getDeviceAddress(); + pc.ppOutputBuf = gpuOutputAddressesBuffer->getDeviceAddress(); // create Pipeline Layout { @@ -310,7 +304,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu template class Arithmetic, bool WorkgroupTest> bool runTest(const smart_refctd_ptr& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u, uint32_t itemsPerInvoc = 1u) { - std::string arith_name = Arithmetic>::name; + std::string arith_name = Arithmetic>::name; const uint32_t workgroupSizeLog2 = hlsl::findMSB(workgroupSize); auto compiler = make_smart_refctd_ptr(smart_refctd_ptr(m_system)); @@ -423,13 +417,13 @@ class 
Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu m_device->blockForSemaphores(wait); // check results - bool passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc); - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed; - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed; - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed; - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed; - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed; - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed; + bool passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc); + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed; return passed; } From 19d7fe0fa35a0e6ddf7061b1ed22460ebdb56273 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 5 Jun 2025 15:21:42 +0700 Subject: [PATCH 219/296] simplified data accessors --- .../app_resources/testWorkgroup.comp.hlsl | 37 +++++++++++++------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl index 0f97c7b54..5cb316578 100644 --- 
a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl @@ -47,18 +47,23 @@ struct DataProxy using dtype_t = vector; static_assert(is_same_v); + static DataProxy create() + { + DataProxy retval; + retval.workgroupOffset = glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize; + retval.outputBufAddr = sizeof(uint32_t) + vk::RawBufferLoad(pc.ppOutputBuf + Binop::BindingIndex * sizeof(uint64_t)); + return retval; + } + template void get(const IndexType ix, NBL_REF_ARG(AccessType) value) { - const uint32_t workgroupOffset = glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize; value = vk::RawBufferLoad(pc.pInputBuf + (workgroupOffset + ix) * sizeof(AccessType)); } template void set(const IndexType ix, const AccessType value) { - const uint32_t workgroupOffset = glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize; - uint64_t outputBufAddr = vk::RawBufferLoad(pc.ppOutputBuf + Binop::BindingIndex * sizeof(uint64_t)); - vk::RawBufferStore(outputBufAddr + sizeof(uint32_t) + sizeof(AccessType) * (workgroupOffset+ix), value, sizeof(uint32_t)); + vk::RawBufferStore(outputBufAddr + sizeof(AccessType) * (workgroupOffset+ix), value, sizeof(uint32_t)); } void workgroupExecutionAndMemoryBarrier() @@ -66,6 +71,9 @@ struct DataProxy glsl::barrier(); //glsl::memoryBarrierShared(); implied by the above } + + uint32_t workgroupOffset; + uint64_t outputBufAddr; }; template @@ -76,6 +84,13 @@ struct PreloadedDataProxy NBL_CONSTEXPR_STATIC_INLINE uint32_t PreloadedDataCount = Config::VirtualWorkgroupSize / Config::WorkgroupSize; + static PreloadedDataProxy create() + { + PreloadedDataProxy retval; + retval.data = DataProxy::create(); + return retval; + } + template void get(const IndexType ix, NBL_REF_ARG(AccessType) value) { @@ -89,18 +104,15 @@ struct PreloadedDataProxy void preload() { - const uint32_t workgroupOffset = glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize; [unroll] - for (uint32_t idx 
= 0; idx < PreloadedDataCount; idx++) - preloaded[idx] = vk::RawBufferLoad(pc.pInputBuf + (workgroupOffset + idx * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex()) * sizeof(dtype_t)); + for (uint16_t idx = 0; idx < PreloadedDataCount; idx++) + data.template get(idx * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex(), preloaded[idx]); } void unload() { - const uint32_t workgroupOffset = glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize; - uint64_t outputBufAddr = vk::RawBufferLoad(pc.ppOutputBuf + Binop::BindingIndex * sizeof(uint64_t)); [unroll] - for (uint32_t idx = 0; idx < PreloadedDataCount; idx++) - vk::RawBufferStore(outputBufAddr + sizeof(uint32_t) + sizeof(dtype_t) * (workgroupOffset + idx * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex()), preloaded[idx], sizeof(uint32_t)); + for (uint16_t idx = 0; idx < PreloadedDataCount; idx++) + data.template set(idx * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex(), preloaded[idx]); } void workgroupExecutionAndMemoryBarrier() @@ -109,6 +121,7 @@ struct PreloadedDataProxy //glsl::memoryBarrierShared(); implied by the above } + DataProxy data; dtype_t preloaded[PreloadedDataCount]; }; @@ -124,7 +137,7 @@ struct operation_t // workgroup scans do no return anything, but use the data accessor to do the storing directly void operator()() { - PreloadedDataProxy dataAccessor; + PreloadedDataProxy dataAccessor = PreloadedDataProxy::create(); dataAccessor.preload(); #if IS_REDUCTION otype_t value = From fdace317db64525773dcf0cca9bc647331db7540 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 5 Jun 2025 15:47:23 +0700 Subject: [PATCH 220/296] tests for native and emulated subgroup op --- .../app_resources/shaderCommon.hlsl | 11 +- .../app_resources/testSubgroup.comp.hlsl | 2 +- .../app_resources/testWorkgroup.comp.hlsl | 2 +- 23_Arithmetic2UnitTest/main.cpp | 112 +++++++++++------- 4 files changed, 80 insertions(+), 47 deletions(-) diff --git 
a/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl b/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl index 9045d62e8..6b9575ccd 100644 --- a/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl @@ -1,7 +1,5 @@ #include "common.hlsl" -#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl" - using namespace nbl; using namespace hlsl; @@ -14,6 +12,15 @@ uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1 [[vk::push_constant]] PushConstantData pc; +struct device_capabilities +{ +#ifdef TEST_NATIVE + NBL_CONSTEXPR_STATIC_INLINE bool shaderSubgroupArithmetic = true; +#else + NBL_CONSTEXPR_STATIC_INLINE bool shaderSubgroupArithmetic = false; +#endif +}; + #ifndef OPERATION #error "Define OPERATION!" #endif diff --git a/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl index 585a8498c..8d8557ccd 100644 --- a/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl @@ -22,7 +22,7 @@ template static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) { using config_t = subgroup2::Configuration; - using params_t = subgroup2::ArithmeticParams; + using params_t = subgroup2::ArithmeticParams; const uint64_t outputBufAddr = vk::RawBufferLoad(pc.ppOutputBuf + Binop::BindingIndex * sizeof(uint64_t), sizeof(uint64_t)); diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl index 5cb316578..cdd4af4b2 100644 --- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl @@ -167,7 +167,7 @@ static void subtest() if (globalIndex()==0u) vk::RawBufferStore(outputBufAddr, glsl::gl_SubgroupSize()); - operation_t func; + operation_t func; func(); } diff --git a/23_Arithmetic2UnitTest/main.cpp 
b/23_Arithmetic2UnitTest/main.cpp index 71642b631..98a9def2e 100644 --- a/23_Arithmetic2UnitTest/main.cpp +++ b/23_Arithmetic2UnitTest/main.cpp @@ -187,47 +187,65 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations; const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize; const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize; - for (auto subgroupSize=MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u) + for (uint32_t useNative = 0; useNative < 2; useNative++) { - const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize); - for (uint32_t workgroupSize = subgroupSize; workgroupSize <= MaxWorkgroupSize; workgroupSize *= 2u) + bool b_useNative = false; + if (!m_physicalDevice->getProperties().limits.shaderSubgroupArithmetic && useNative == 0) { - // make sure renderdoc captures everything for debugging - m_api->startCapture(); - m_logger->log("Testing Workgroup Size %u with Subgroup Size %u", ILogger::ELL_INFO, workgroupSize, subgroupSize); + m_logger->log("Device property shaderSubgroupArithmetic is false! 
Skipping to emulated arithmetic...", ILogger::ELL_INFO); + continue; + } - for (uint32_t j = 0; j < ItemsPerInvocations.size(); j++) - { - const uint32_t itemsPerInvocation = ItemsPerInvocations[j]; - m_logger->log("Testing Items per Invocation %u", ILogger::ELL_INFO, itemsPerInvocation); - bool passed = true; - passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, itemsPerInvocation) && passed; - logTestOutcome(passed, workgroupSize); - passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, itemsPerInvocation) && passed; - logTestOutcome(passed, workgroupSize); - passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, itemsPerInvocation) && passed; - logTestOutcome(passed, workgroupSize); - - const uint32_t itemsPerWG = calculateItemsPerWorkgroup(workgroupSize, subgroupSize, itemsPerInvocation); - m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG); - passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG, itemsPerInvocation) && passed; - logTestOutcome(passed, itemsPerWG); - passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG, itemsPerInvocation) && passed; - logTestOutcome(passed, itemsPerWG); - passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG, itemsPerInvocation) && passed; - logTestOutcome(passed, itemsPerWG); - } - m_api->endCapture(); + if (useNative) + m_logger->log("Testing with emulated subgroup arithmetic", ILogger::ELL_INFO); + else + { + m_logger->log("Testing with native subgroup arithmetic", ILogger::ELL_INFO); + b_useNative = true; + } - // save cache every now and then + for (auto subgroupSize = MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u) + { + const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize); + for (uint32_t workgroupSize = subgroupSize; workgroupSize <= 
MaxWorkgroupSize; workgroupSize *= 2u) { - auto cpu = m_spirv_isa_cache->convertToCPUCache(); - // Normally we'd beautifully JSON serialize the thing, allow multiple devices & drivers + metadata - auto bin = cpu->getEntries().begin()->second.bin; - IFile::success_t success; - m_spirv_isa_cache_output->write(success, bin->data(), 0ull, bin->size()); - if (!success) - logFail("Could not write Create SPIR-V to ISA cache to disk!"); + // make sure renderdoc captures everything for debugging + m_api->startCapture(); + m_logger->log("Testing Workgroup Size %u with Subgroup Size %u", ILogger::ELL_INFO, workgroupSize, subgroupSize); + + for (uint32_t j = 0; j < ItemsPerInvocations.size(); j++) + { + const uint32_t itemsPerInvocation = ItemsPerInvocations[j]; + m_logger->log("Testing Items per Invocation %u", ILogger::ELL_INFO, itemsPerInvocation); + bool passed = true; + passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, b_useNative, ~0u, itemsPerInvocation) && passed; + logTestOutcome(passed, workgroupSize); + passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, b_useNative, ~0u, itemsPerInvocation) && passed; + logTestOutcome(passed, workgroupSize); + passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, b_useNative, ~0u, itemsPerInvocation) && passed; + logTestOutcome(passed, workgroupSize); + + const uint32_t itemsPerWG = calculateItemsPerWorkgroup(workgroupSize, subgroupSize, itemsPerInvocation); + m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG); + passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, b_useNative, itemsPerWG, itemsPerInvocation) && passed; + logTestOutcome(passed, itemsPerWG); + passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, b_useNative, itemsPerWG, itemsPerInvocation) && passed; + logTestOutcome(passed, itemsPerWG); + passed = runTest(workgroupTestSource, elementCount, 
subgroupSizeLog2, workgroupSize, b_useNative, itemsPerWG, itemsPerInvocation) && passed; + logTestOutcome(passed, itemsPerWG); + } + m_api->endCapture(); + + // save cache every now and then + { + auto cpu = m_spirv_isa_cache->convertToCPUCache(); + // Normally we'd beautifully JSON serialize the thing, allow multiple devices & drivers + metadata + auto bin = cpu->getEntries().begin()->second.bin; + IFile::success_t success; + m_spirv_isa_cache_output->write(success, bin->data(), 0ull, bin->size()); + if (!success) + logFail("Could not write Create SPIR-V to ISA cache to disk!"); + } } } } @@ -302,7 +320,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu } template class Arithmetic, bool WorkgroupTest> - bool runTest(const smart_refctd_ptr& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u, uint32_t itemsPerInvoc = 1u) + bool runTest(const smart_refctd_ptr& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, bool useNative, uint32_t itemsPerWG = ~0u, uint32_t itemsPerInvoc = 1u) { std::string arith_name = Arithmetic>::name; const uint32_t workgroupSizeLog2 = hlsl::findMSB(workgroupSize); @@ -338,15 +356,19 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu std::to_string(arith_name=="reduction") }; - const IShaderCompiler::SMacroDefinition defines[6] = { + const IShaderCompiler::SMacroDefinition defines[7] = { { "OPERATION", definitions[0] }, { "WORKGROUP_SIZE_LOG2", definitions[1] }, { "ITEMS_PER_WG", definitions[2] }, { "ITEMS_PER_INVOCATION", definitions[3] }, { "SUBGROUP_SIZE_LOG2", definitions[4] }, - { "IS_REDUCTION", definitions[5] } + { "IS_REDUCTION", definitions[5] }, + { "TEST_NATIVE", "1" } }; - options.preprocessorOptions.extraDefines = { defines, defines + 6 }; + if (useNative) + options.preprocessorOptions.extraDefines = { defines, defines + 7 }; + else + 
options.preprocessorOptions.extraDefines = { defines, defines + 6 }; overriddenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options); } @@ -359,13 +381,17 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu std::to_string(subgroupSizeLog2) }; - const IShaderCompiler::SMacroDefinition defines[4] = { + const IShaderCompiler::SMacroDefinition defines[5] = { { "OPERATION", definitions[0] }, { "WORKGROUP_SIZE", definitions[1] }, { "ITEMS_PER_INVOCATION", definitions[2] }, - { "SUBGROUP_SIZE_LOG2", definitions[3] } + { "SUBGROUP_SIZE_LOG2", definitions[3] }, + { "TEST_NATIVE", "1" } }; - options.preprocessorOptions.extraDefines = { defines, defines + 4 }; + if (useNative) + options.preprocessorOptions.extraDefines = { defines, defines + 5 }; + else + options.preprocessorOptions.extraDefines = { defines, defines + 4 }; overriddenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options); } From d6680f2996d7acf56085b9e072c29698d9d06469 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 5 Jun 2025 15:58:32 +0700 Subject: [PATCH 221/296] removed redundant stuff --- 23_Arithmetic2UnitTest/app_resources/common.hlsl | 9 --------- 23_Arithmetic2UnitTest/main.cpp | 5 ++--- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/23_Arithmetic2UnitTest/app_resources/common.hlsl b/23_Arithmetic2UnitTest/app_resources/common.hlsl index 2daffa56c..ddf5dc00f 100644 --- a/23_Arithmetic2UnitTest/app_resources/common.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/common.hlsl @@ -1,15 +1,6 @@ #include "nbl/builtin/hlsl/cpp_compat.hlsl" #include "nbl/builtin/hlsl/functional.hlsl" -template -struct Output -{ - NBL_CONSTEXPR_STATIC_INLINE uint32_t ScanElementCount = kScanElementCount; - - uint32_t subgroupSize; - uint32_t data[ScanElementCount]; -}; - struct PushConstantData { uint64_t pInputBuf; diff --git a/23_Arithmetic2UnitTest/main.cpp 
b/23_Arithmetic2UnitTest/main.cpp index 98a9def2e..326c9e57f 100644 --- a/23_Arithmetic2UnitTest/main.cpp +++ b/23_Arithmetic2UnitTest/main.cpp @@ -65,7 +65,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu computeQueue = getComputeQueue(); // TODO: get the element count from argv - const uint32_t elementCount = Output<>::ScanElementCount; + const uint32_t elementCount = 1024 * 1024; // populate our random data buffer on the CPU and create a GPU copy inputData = new uint32_t[elementCount]; smart_refctd_ptr gpuinputDataBuffer; @@ -75,7 +75,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu inputData[i] = randGenerator(); // TODO: change to using xoroshiro, then we can skip having the input buffer at all IGPUBuffer::SCreationParams inputDataBufferCreationParams = {}; - inputDataBufferCreationParams.size = sizeof(Output<>::data[0]) * elementCount; + inputDataBufferCreationParams.size = sizeof(uint32_t) * elementCount; inputDataBufferCreationParams.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; m_utils->createFilledDeviceLocalBufferOnDedMem( SIntendedSubmitInfo{.queue=getTransferUpQueue()}, @@ -341,7 +341,6 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu options.preprocessorOptions.logger = m_logger.get(); auto* includeFinder = compiler->getDefaultIncludeFinder(); - includeFinder->addSearchPath("nbl/builtin/hlsl/jit", core::make_smart_refctd_ptr(m_physicalDevice->getLimits(), m_device->getEnabledFeatures())); options.preprocessorOptions.includeFinder = includeFinder; smart_refctd_ptr overriddenUnspecialized; From bafad3ecd353863a1f12feada096814799a1ee04 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 6 Jun 2025 11:04:06 +0700 Subject: [PATCH 222/296] bind swapchain image directly, explicit surface format swapchain --- 29_Arithmetic2Bench/main.cpp | 286 
++++++++++++++++++++++------------- 1 file changed, 181 insertions(+), 105 deletions(-) diff --git a/29_Arithmetic2Bench/main.cpp b/29_Arithmetic2Bench/main.cpp index 165427750..9f59f38d8 100644 --- a/29_Arithmetic2Bench/main.cpp +++ b/29_Arithmetic2Bench/main.cpp @@ -53,6 +53,160 @@ struct PushConstantData uint64_t outputAddressBufAddress; }; +template requires std::is_base_of_v +class CExplicitSurfaceFormatResizeSurface final : public ISimpleManagedSurface +{ +public: + using this_t = CExplicitSurfaceFormatResizeSurface; + + // Factory method so we can fail, requires a `_surface` created from a window and with a callback that inherits from `ICallback` declared just above + template requires std::is_base_of_v, Surface> + static inline core::smart_refctd_ptr create(core::smart_refctd_ptr&& _surface) + { + if (!_surface) + return nullptr; + + auto _window = _surface->getWindow(); + ICallback* cb = nullptr; + if (_window) + cb = dynamic_cast(_window->getEventCallback()); + + return core::smart_refctd_ptr(new this_t(std::move(_surface), cb), core::dont_grab); + } + + // Factory method so we can fail, requires a `_surface` created from a native surface + template requires std::is_base_of_v, Surface> + static inline core::smart_refctd_ptr create(core::smart_refctd_ptr&& _surface, ICallback* cb) + { + if (!_surface) + return nullptr; + + return core::smart_refctd_ptr(new this_t(std::move(_surface), cb), core::dont_grab); + } + + // + inline bool init(CThreadSafeQueueAdapter* queue, std::unique_ptr&& scResources, const ISwapchain::SSharedCreationParams& sharedParams = {}) + { + if (!scResources || !base_init(queue)) + return init_fail(); + + m_sharedParams = sharedParams; + if (!m_sharedParams.deduce(queue->getOriginDevice()->getPhysicalDevice(), getSurface())) + return init_fail(); + + m_swapchainResources = std::move(scResources); + return true; + } + + // Can be public because we don't need to worry about mutexes unlike the Smooth Resize class + inline 
ISwapchainResources* getSwapchainResources() override { return m_swapchainResources.get(); } + + // need to see if the swapchain is invalidated (e.g. because we're starting from 0-area old Swapchain) and try to recreate the swapchain + inline SAcquireResult acquireNextImage() + { + if (!isWindowOpen()) + { + becomeIrrecoverable(); + return {}; + } + + if (!m_swapchainResources || (m_swapchainResources->getStatus() != ISwapchainResources::STATUS::USABLE && !recreateSwapchain(m_surfaceFormat))) + return {}; + + return ISimpleManagedSurface::acquireNextImage(); + } + + // its enough to just foward though + inline bool present(const uint8_t imageIndex, const std::span waitSemaphores) + { + return ISimpleManagedSurface::present(imageIndex, waitSemaphores); + } + + // + inline bool recreateSwapchain(const ISurface::SFormat& explicitSurfaceFormat) + { + assert(m_swapchainResources); + // dont assign straight to `m_swapchainResources` because of complex refcounting and cycles + core::smart_refctd_ptr newSwapchain; + // TODO: This block of code could be rolled up into `ISimpleManagedSurface::ISwapchainResources` eventually + { + auto* surface = getSurface(); + auto device = const_cast(getAssignedQueue()->getOriginDevice()); + // 0s are invalid values, so they indicate we want them deduced + m_sharedParams.width = 0; + m_sharedParams.height = 0; + // Question: should we re-query the supported queues, formats, present modes, etc. just-in-time?? + auto* swapchain = m_swapchainResources->getSwapchain(); + if (swapchain ? 
swapchain->deduceRecreationParams(m_sharedParams) : m_sharedParams.deduce(device->getPhysicalDevice(), surface)) + { + // super special case, we can't re-create the swapchain but its possible to recover later on + if (m_sharedParams.width == 0 || m_sharedParams.height == 0) + { + // we need to keep the old-swapchain around, but can drop the rest + m_swapchainResources->invalidate(); + return false; + } + // now lets try to create a new swapchain + if (swapchain) + newSwapchain = swapchain->recreate(m_sharedParams); + else + { + ISwapchain::SCreationParams params = { + .surface = core::smart_refctd_ptr(surface), + .surfaceFormat = explicitSurfaceFormat, + .sharedParams = m_sharedParams + // we're not going to support concurrent sharing in this simple class + }; + m_surfaceFormat = explicitSurfaceFormat; + newSwapchain = CVulkanSwapchain::create(core::smart_refctd_ptr(device), std::move(params)); + } + } + else // parameter deduction failed + return false; + } + + if (newSwapchain) + { + m_swapchainResources->invalidate(); + return m_swapchainResources->onCreateSwapchain(getAssignedQueue()->getFamilyIndex(), std::move(newSwapchain)); + } + else + becomeIrrecoverable(); + + return false; + } + +protected: + using ISimpleManagedSurface::ISimpleManagedSurface; + + // + inline void deinit_impl() override final + { + becomeIrrecoverable(); + } + + // + inline void becomeIrrecoverable() override { m_swapchainResources = nullptr; } + + // gets called when OUT_OF_DATE upon an acquire + inline SAcquireResult handleOutOfDate() override final + { + // recreate swapchain and try to acquire again + if (recreateSwapchain(m_surfaceFormat)) + return ISimpleManagedSurface::acquireNextImage(); + return {}; + } + +private: + // Because the surface can start minimized (extent={0,0}) we might not be able to create the swapchain right away, so store creation parameters until we can create it. 
+ ISwapchain::SSharedCreationParams m_sharedParams = {}; + // The swapchain might not be possible to create or recreate right away, so this might be + // either nullptr before the first successful acquire or the old to-be-retired swapchain. + std::unique_ptr m_swapchainResources = {}; + + ISurface::SFormat m_surfaceFormat = {}; +}; + // NOTE added swapchain + drawing frames to be able to profile with Nsight, which still doesn't support profiling headless compute shaders class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication { @@ -86,7 +240,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub } auto surface = CSurfaceVulkanWin32::create(smart_refctd_ptr(m_api), smart_refctd_ptr_static_cast(m_window)); - const_cast&>(m_surface) = CSimpleResizeSurface::create(std::move(surface)); + const_cast&>(m_surface) = CExplicitSurfaceFormatResizeSurface::create(std::move(surface)); } if (m_surface) @@ -109,9 +263,12 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub return logFail("Failed to Create a Semaphore!"); ISwapchain::SCreationParams swapchainParams = { .surface = m_surface->getSurface() }; - if (!swapchainParams.deduceFormat(m_physicalDevice)) + asset::E_FORMAT preferredFormats[] = { asset::EF_R8G8B8A8_UNORM }; + if (!swapchainParams.deduceFormat(m_physicalDevice, preferredFormats)) return logFail("Could not choose a Surface Format for the Swapchain!"); + swapchainParams.sharedParams.imageUsage = IGPUImage::E_USAGE_FLAGS::EUF_RENDER_ATTACHMENT_BIT | IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT; + auto graphicsQueue = getGraphicsQueue(); if (!m_surface || !m_surface->init(graphicsQueue, std::make_unique(), swapchainParams.sharedParams)) return logFail("Could not create Window & Surface or initialize the Surface!"); @@ -127,7 +284,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, 
pub } m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H); - m_surface->recreateSwapchain(); + m_surface->recreateSwapchain(swapchainParams.surfaceFormat); transferDownQueue = getTransferDownQueue(); computeQueue = getComputeQueue(); @@ -181,21 +338,21 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub pc.inputBufAddress = gpuinputDataBuffer->getDeviceAddress(); pc.outputAddressBufAddress = gpuOutputAddressesBuffer->getDeviceAddress(); - // create dummy image - dummyImg = m_device->createImage({ - { - .type = IGPUImage::ET_2D, - .samples = asset::ICPUImage::ESCF_1_BIT, - .format = asset::EF_R16G16B16A16_SFLOAT, - .extent = {WIN_W, WIN_H, 1}, - .mipLevels = 1, - .arrayLayers = 1, - .flags = IImage::ECF_NONE, - .usage = core::bitflag(asset::IImage::EUF_STORAGE_BIT) | asset::IImage::EUF_TRANSFER_SRC_BIT - } - }); - if (!dummyImg || !m_device->allocate(dummyImg->getMemoryReqs(), dummyImg.get()).isValid()) - return logFail("Could not create HDR Image"); + // create image views for swapchain images + for (uint32_t i = 0; i < ISwapchain::MaxImages; i++) + { + IGPUImage* scImg = m_surface->getSwapchainResources()->getImage(i); + if (scImg == nullptr) + continue; + IGPUImageView::SCreationParams viewParams = { + .flags = IGPUImageView::ECF_NONE, + .subUsages = IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT, + .image = smart_refctd_ptr(scImg), + .viewType = IGPUImageView::ET_2D, + .format = scImg->getCreationParameters().format + }; + swapchainImageViews[i] = m_device->createImageView(std::move(viewParams)); + } // create Descriptor Sets and Pipeline Layouts smart_refctd_ptr benchPplnLayout; @@ -322,7 +479,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub .dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS } }; - imageBarriers[0].image = dummyImg.get(); + imageBarriers[0].image = m_surface->getSwapchainResources()->getImage(m_currentImageAcquire.imageIndex); imageBarriers[0].subresourceRange = { .aspectMask = 
IImage::EAF_COLOR_BIT, .baseMipLevel = 0u, @@ -336,19 +493,9 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers }); } - // bind dummy image - IGPUImageView::SCreationParams viewParams = { - .flags = IGPUImageView::ECF_NONE, - .subUsages = IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT, - .image = dummyImg, - .viewType = IGPUImageView::ET_2D, - .format = dummyImg->getCreationParameters().format - }; - auto dummyImgView = m_device->createImageView(std::move(viewParams)); - video::IGPUDescriptorSet::SDescriptorInfo dsInfo; dsInfo.info.image.imageLayout = IImage::LAYOUT::GENERAL; - dsInfo.desc = dummyImgView; + dsInfo.desc = swapchainImageViews[m_currentImageAcquire.imageIndex]; IGPUDescriptorSet::SWriteDescriptorSet dsWrites[1u] = { @@ -366,7 +513,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize; const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize; - const auto SubgroupSizeLog2 = hlsl::findMSB(MinSubgroupSize); + const auto SubgroupSizeLog2 = hlsl::findMSB(MaxSubgroupSize); cmdbuf->bindDescriptorSets(EPBP_COMPUTE, benchSets[0].pipeline->getLayout(), 0u, 1u, &benchDs.get()); cmdbuf->pushConstants(benchSets[0].pipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(PushConstantData), &pc); @@ -374,72 +521,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub for (uint32_t i = 0; i < benchSets.size(); i++) runBenchmark(cmdbuf, benchSets[i], elementCount, SubgroupSizeLog2); - - // blit - { - IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[2]; - imageBarriers[0].barrier = { - .dep = { - .srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, - .srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS, - .dstStageMask = PIPELINE_STAGE_FLAGS::BLIT_BIT, - 
.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT - } - }; - imageBarriers[0].image = dummyImg.get(); - imageBarriers[0].subresourceRange = { - .aspectMask = IImage::EAF_COLOR_BIT, - .baseMipLevel = 0u, - .levelCount = 1u, - .baseArrayLayer = 0u, - .layerCount = 1u - }; - imageBarriers[0].oldLayout = IImage::LAYOUT::UNDEFINED; - imageBarriers[0].newLayout = IImage::LAYOUT::TRANSFER_SRC_OPTIMAL; - - imageBarriers[1].barrier = { - .dep = { - .srcStageMask = PIPELINE_STAGE_FLAGS::NONE, - .srcAccessMask = ACCESS_FLAGS::NONE, - .dstStageMask = PIPELINE_STAGE_FLAGS::BLIT_BIT, - .dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT - } - }; - imageBarriers[1].image = m_surface->getSwapchainResources()->getImage(m_currentImageAcquire.imageIndex); - imageBarriers[1].subresourceRange = { - .aspectMask = IImage::EAF_COLOR_BIT, - .baseMipLevel = 0u, - .levelCount = 1u, - .baseArrayLayer = 0u, - .layerCount = 1u - }; - imageBarriers[1].oldLayout = IImage::LAYOUT::UNDEFINED; - imageBarriers[1].newLayout = IImage::LAYOUT::TRANSFER_DST_OPTIMAL; - - cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers }); - } - - { - IGPUCommandBuffer::SImageBlit regions[] = { { - .srcMinCoord = {0,0,0}, - .srcMaxCoord = {WIN_W,WIN_H,1}, - .dstMinCoord = {0,0,0}, - .dstMaxCoord = {WIN_W,WIN_H,1}, - .layerCount = 1, - .srcBaseLayer = 0, - .dstBaseLayer = 0, - .srcMipLevel = 0, - .dstMipLevel = 0, - .aspectMask = IGPUImage::E_ASPECT_FLAGS::EAF_COLOR_BIT - } }; - - auto srcImg = dummyImg.get(); - auto scRes = static_cast(m_surface->getSwapchainResources()); - auto dstImg = scRes->getImage(m_currentImageAcquire.imageIndex); - - cmdbuf->blitImage(srcImg, IImage::LAYOUT::TRANSFER_SRC_OPTIMAL, dstImg, IImage::LAYOUT::TRANSFER_DST_OPTIMAL, regions, ISampler::ETF_NEAREST); - } - // barrier transition to PRESENT { IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1]; @@ -459,7 +540,7 @@ class ArithmeticBenchApp final : public 
examples::SimpleWindowedApplication, pub .baseArrayLayer = 0u, .layerCount = 1u }; - imageBarriers[0].oldLayout = IImage::LAYOUT::TRANSFER_DST_OPTIMAL; + imageBarriers[0].oldLayout = IImage::LAYOUT::GENERAL; imageBarriers[0].newLayout = IImage::LAYOUT::PRESENT_SRC; cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers }); @@ -517,11 +598,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub } } - std::string caption = "[Nabla Engine] Geometry Creator"; - { - caption += ", displaying [all objects]"; - m_window->setCaption(caption); - } m_surface->present(m_currentImageAcquire.imageIndex, rendered); } @@ -696,7 +772,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub IQueue* computeQueue; smart_refctd_ptr m_window; - smart_refctd_ptr> m_surface; + smart_refctd_ptr> m_surface; smart_refctd_ptr m_semaphore; uint64_t m_realFrameIx = 0; std::array, MaxFramesInFlight> m_cmdBufs; @@ -704,7 +780,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub smart_refctd_ptr m_inputSystem; - smart_refctd_ptr dummyImg; + std::array, ISwapchain::MaxImages> swapchainImageViews; constexpr static inline uint32_t MaxNumSubmits = 30; uint32_t numSubmits = 0; From 32dc78f065a8414dacafc216f31b7d333e301083 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 6 Jun 2025 14:48:03 +0700 Subject: [PATCH 223/296] shared data accessor header between test and bench, same shader adjustments as test --- .../app_resources/testWorkgroup.comp.hlsl | 109 +----------- .../app_resources/benchmarkSubgroup.comp.hlsl | 31 ++-- .../benchmarkWorkgroup.comp.hlsl | 160 ++++-------------- 29_Arithmetic2Bench/app_resources/common.hlsl | 12 +- .../app_resources/shaderCommon.hlsl | 16 +- 29_Arithmetic2Bench/main.cpp | 57 ++----- common/include/WorkgroupDataAccessors.hlsl | 119 +++++++++++++ 7 files changed, 191 insertions(+), 313 deletions(-) create mode 100644 
common/include/WorkgroupDataAccessors.hlsl diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl index cdd4af4b2..a38124b0c 100644 --- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl @@ -16,114 +16,7 @@ typedef vector type_t; // final (level 1/2) scan needs to fit in one subgroup exactly groupshared uint32_t scratch[mpl::max_v]; -struct ScratchProxy -{ - template - void get(const uint32_t ix, NBL_REF_ARG(AccessType) value) - { - value = scratch[ix]; - } - template - void set(const uint32_t ix, const AccessType value) - { - scratch[ix] = value; - } - - uint32_t atomicOr(const uint32_t ix, const uint32_t value) - { - return glsl::atomicOr(scratch[ix],value); - } - - void workgroupExecutionAndMemoryBarrier() - { - glsl::barrier(); - //glsl::memoryBarrierShared(); implied by the above - } -}; - -template -struct DataProxy -{ - using dtype_t = vector; - static_assert(is_same_v); - - static DataProxy create() - { - DataProxy retval; - retval.workgroupOffset = glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize; - retval.outputBufAddr = sizeof(uint32_t) + vk::RawBufferLoad(pc.ppOutputBuf + Binop::BindingIndex * sizeof(uint64_t)); - return retval; - } - - template - void get(const IndexType ix, NBL_REF_ARG(AccessType) value) - { - value = vk::RawBufferLoad(pc.pInputBuf + (workgroupOffset + ix) * sizeof(AccessType)); - } - template - void set(const IndexType ix, const AccessType value) - { - vk::RawBufferStore(outputBufAddr + sizeof(AccessType) * (workgroupOffset+ix), value, sizeof(uint32_t)); - } - - void workgroupExecutionAndMemoryBarrier() - { - glsl::barrier(); - //glsl::memoryBarrierShared(); implied by the above - } - - uint32_t workgroupOffset; - uint64_t outputBufAddr; -}; - -template -struct PreloadedDataProxy -{ - using dtype_t = vector; - static_assert(is_same_v); - - NBL_CONSTEXPR_STATIC_INLINE 
uint32_t PreloadedDataCount = Config::VirtualWorkgroupSize / Config::WorkgroupSize; - - static PreloadedDataProxy create() - { - PreloadedDataProxy retval; - retval.data = DataProxy::create(); - return retval; - } - - template - void get(const IndexType ix, NBL_REF_ARG(AccessType) value) - { - value = preloaded[ix>>Config::WorkgroupSizeLog2]; - } - template - void set(const IndexType ix, const AccessType value) - { - preloaded[ix>>Config::WorkgroupSizeLog2] = value; - } - - void preload() - { - [unroll] - for (uint16_t idx = 0; idx < PreloadedDataCount; idx++) - data.template get(idx * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex(), preloaded[idx]); - } - void unload() - { - [unroll] - for (uint16_t idx = 0; idx < PreloadedDataCount; idx++) - data.template set(idx * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex(), preloaded[idx]); - } - - void workgroupExecutionAndMemoryBarrier() - { - glsl::barrier(); - //glsl::memoryBarrierShared(); implied by the above - } - - DataProxy data; - dtype_t preloaded[PreloadedDataCount]; -}; +#include "../../common/include/WorkgroupDataAccessors.hlsl" static ScratchProxy arithmeticAccessor; diff --git a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl index 113ec2bae..553103bef 100644 --- a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl +++ b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl @@ -13,41 +13,38 @@ typedef vector type_t; uint32_t globalIndex() { - return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex(); + return glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+workgroup::SubgroupContiguousIndex(); } -bool canStore() {return true;} - template static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal) { - using config_t = nbl::hlsl::subgroup2::Configuration; - using params_t = nbl::hlsl::subgroup2::ArithmeticParams; + using config_t = 
subgroup2::Configuration; + using params_t = subgroup2::ArithmeticParams; type_t value = sourceVal; - const uint64_t outputBufAddr = vk::RawBufferLoad(pc.outputAddressBufAddress + Binop::BindingIndex * sizeof(uint64_t), sizeof(uint64_t)); + const uint64_t outputBufAddr = vk::RawBufferLoad(pc.ppOutputBuf + Binop::BindingIndex * sizeof(uint64_t), sizeof(uint64_t)); operation_t func; // [unroll] for (uint32_t i = 0; i < NUM_LOOPS; i++) value = func(value); - if (canStore()) - vk::RawBufferStore(outputBufAddr + sizeof(uint32_t) + sizeof(type_t) * globalIndex(), value, sizeof(uint32_t)); + vk::RawBufferStore(outputBufAddr + sizeof(uint32_t) + sizeof(type_t) * globalIndex(), value, sizeof(uint32_t)); } void benchmark() { const uint32_t idx = globalIndex(); - type_t sourceVal = vk::RawBufferLoad(pc.inputBufAddress + idx * sizeof(type_t)); - - subbench, ITEMS_PER_INVOCATION>(sourceVal); - subbench, ITEMS_PER_INVOCATION>(sourceVal); - subbench, ITEMS_PER_INVOCATION>(sourceVal); - subbench, ITEMS_PER_INVOCATION>(sourceVal); - subbench, ITEMS_PER_INVOCATION>(sourceVal); - subbench, ITEMS_PER_INVOCATION>(sourceVal); - subbench, ITEMS_PER_INVOCATION>(sourceVal); + type_t sourceVal = vk::RawBufferLoad(pc.pInputBuf + idx * sizeof(type_t)); + + subbench, ITEMS_PER_INVOCATION>(sourceVal); + subbench, ITEMS_PER_INVOCATION>(sourceVal); + subbench, ITEMS_PER_INVOCATION>(sourceVal); + subbench, ITEMS_PER_INVOCATION>(sourceVal); + subbench, ITEMS_PER_INVOCATION>(sourceVal); + subbench, ITEMS_PER_INVOCATION>(sourceVal); + subbench, ITEMS_PER_INVOCATION>(sourceVal); } [numthreads(WORKGROUP_SIZE,1,1)] diff --git a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl index e44bf4f06..504cc36de 100644 --- a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl +++ b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl @@ -9,108 +9,14 @@ static const uint32_t WORKGROUP_SIZE = 1u << 
WORKGROUP_SIZE_LOG2; #include "shaderCommon.hlsl" -using config_t = nbl::hlsl::workgroup2::ArithmeticConfiguration; +using config_t = workgroup2::ArithmeticConfiguration; typedef vector type_t; // final (level 1/2) scan needs to fit in one subgroup exactly groupshared uint32_t scratch[config_t::SharedScratchElementCount]; -struct ScratchProxy -{ - template - void get(const IndexType ix, NBL_REF_ARG(AccessType) value) - { - value = scratch[ix]; - } - template - void set(const IndexType ix, const AccessType value) - { - scratch[ix] = value; - } - - uint32_t atomicOr(const uint32_t ix, const uint32_t value) - { - return nbl::hlsl::glsl::atomicOr(scratch[ix],value); - } - - void workgroupExecutionAndMemoryBarrier() - { - nbl::hlsl::glsl::barrier(); - //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above - } -}; - - -template -struct DataProxy -{ - using dtype_t = vector; - static_assert(nbl::hlsl::is_same_v); - - // we don't want to write/read storage multiple times in loop; doesn't seem optimized out in generated spirv - template - void get(const IndexType ix, NBL_REF_ARG(dtype_t) value) - { - // value = inputValue[ix]; - value = nbl::hlsl::promote(globalIndex()); - } - template - void set(const IndexType ix, const dtype_t value) - { - // output[Binop::BindingIndex].template Store(sizeof(uint32_t) + sizeof(type_t) * ix, value); - } - - void workgroupExecutionAndMemoryBarrier() - { - nbl::hlsl::glsl::barrier(); - //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above - } -}; - -template -struct PreloadedDataProxy -{ - using dtype_t = vector; - static_assert(nbl::hlsl::is_same_v); - - NBL_CONSTEXPR_STATIC_INLINE uint32_t PreloadedDataCount = Config::VirtualWorkgroupSize / Config::WorkgroupSize; - - template - void get(const IndexType ix, NBL_REF_ARG(AccessType) value) - { - value = preloaded[(ix-nbl::hlsl::workgroup::SubgroupContiguousIndex())>>Config::WorkgroupSizeLog2]; - } - template - void set(const IndexType ix, const AccessType value) - { - 
preloaded[(ix-nbl::hlsl::workgroup::SubgroupContiguousIndex())>>Config::WorkgroupSizeLog2] = value; - } - - void preload() - { - const uint32_t workgroupOffset = nbl::hlsl::glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize; - [unroll] - for (uint32_t idx = 0; idx < PreloadedDataCount; idx++) - preloaded[idx] = vk::RawBufferLoad(pc.inputBufAddress + (workgroupOffset + idx * Config::WorkgroupSize + nbl::hlsl::workgroup::SubgroupContiguousIndex()) * sizeof(dtype_t)); - } - void unload() - { - const uint32_t workgroupOffset = nbl::hlsl::glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize; - uint64_t outputBufAddr = vk::RawBufferLoad(pc.outputAddressBufAddress + Binop::BindingIndex * sizeof(uint64_t)); - [unroll] - for (uint32_t idx = 0; idx < PreloadedDataCount; idx++) - vk::RawBufferStore(outputBufAddr + sizeof(uint32_t) + sizeof(dtype_t) * (workgroupOffset + idx * Config::WorkgroupSize + nbl::hlsl::workgroup::SubgroupContiguousIndex()), preloaded[idx], sizeof(uint32_t)); - } - - void workgroupExecutionAndMemoryBarrier() - { - nbl::hlsl::glsl::barrier(); - //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above - } - - dtype_t preloaded[PreloadedDataCount]; -}; +#include "../../common/include/WorkgroupDataAccessors.hlsl" static ScratchProxy arithmeticAccessor; @@ -120,74 +26,70 @@ struct operation_t using binop_base_t = typename Binop::base_t; using otype_t = typename Binop::type_t; -#if IS_REDUCTION void operator()(PreloadedDataProxy dataAccessor) { - otype_t value = nbl::hlsl::OPERATION::template __call, ScratchProxy>(dataAccessor,arithmeticAccessor); +#if IS_REDUCTION + otype_t value = +#endif + OPERATION::template __call, ScratchProxy>(dataAccessor,arithmeticAccessor); // we barrier before because we alias the accessors for Binop arithmeticAccessor.workgroupExecutionAndMemoryBarrier(); - +#if IS_REDUCTION [unroll] for (uint32_t i = 0; i < PreloadedDataProxy::PreloadedDataCount; i++) dataAccessor.preloaded[i] = value; - } -#else - void 
operator()(PreloadedDataProxy dataAccessor) - { - nbl::hlsl::OPERATION::template __call, ScratchProxy>(dataAccessor,arithmeticAccessor); - // we barrier before because we alias the accessors for Binop - arithmeticAccessor.workgroupExecutionAndMemoryBarrier(); - } #endif + } +// #else +// void operator()(PreloadedDataProxy dataAccessor) +// { +// OPERATION::template __call, ScratchProxy>(dataAccessor,arithmeticAccessor); +// // we barrier before because we alias the accessors for Binop +// arithmeticAccessor.workgroupExecutionAndMemoryBarrier(); +// } +// #endif }; template -static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal) +static void subbench() { - const uint64_t outputBufAddr = vk::RawBufferLoad(pc.outputAddressBufAddress + Binop::BindingIndex * sizeof(uint64_t), sizeof(uint64_t)); - - if (globalIndex()==0u) - vk::RawBufferStore(outputBufAddr, nbl::hlsl::glsl::gl_SubgroupSize()); + const uint64_t outputBufAddr = vk::RawBufferLoad(pc.ppOutputBuf + Binop::BindingIndex * sizeof(uint64_t), sizeof(uint64_t)); - PreloadedDataProxy dataAccessor; + PreloadedDataProxy dataAccessor = PreloadedDataProxy::create(); dataAccessor.preload(); - operation_t func; + operation_t func; for (uint32_t i = 0; i < NUM_LOOPS; i++) func(dataAccessor); dataAccessor.unload(); } - -type_t benchmark() +void benchmark() { - const type_t sourceVal = vk::RawBufferLoad(pc.inputBufAddress + globalIndex() * sizeof(type_t)); - - subbench >(sourceVal); - subbench >(sourceVal); - subbench >(sourceVal); - subbench >(sourceVal); - subbench >(sourceVal); - subbench >(sourceVal); - subbench >(sourceVal); - return sourceVal; + subbench >(); + subbench >(); + subbench >(); + subbench >(); + subbench >(); + subbench >(); + subbench >(); } uint32_t globalIndex() { - return nbl::hlsl::glsl::gl_WorkGroupID().x*ITEMS_PER_WG+nbl::hlsl::workgroup::SubgroupContiguousIndex(); + return glsl::gl_WorkGroupID().x*ITEMS_PER_WG+workgroup::SubgroupContiguousIndex(); } bool canStore() { - return 
nbl::hlsl::workgroup::SubgroupContiguousIndex() -struct Output +struct PushConstantData { - NBL_CONSTEXPR_STATIC_INLINE uint32_t ScanElementCount = kScanElementCount; - - uint32_t subgroupSize; - uint32_t data[ScanElementCount]; + uint64_t pInputBuf; + uint64_t ppOutputBuf; }; +namespace arithmetic +{ template struct bit_and : nbl::hlsl::bit_and { @@ -91,5 +90,6 @@ struct ballot : nbl::hlsl::plus static inline constexpr const char* name = "bitcount"; #endif }; +} #include "nbl/builtin/hlsl/subgroup/basic.hlsl" diff --git a/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl b/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl index a14986e0d..8659fd054 100644 --- a/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl +++ b/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl @@ -1,6 +1,7 @@ #include "common.hlsl" -#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl" +using namespace nbl; +using namespace hlsl; // https://github.com/microsoft/DirectXShaderCompiler/issues/6144 uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);} @@ -9,14 +10,17 @@ uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1 #error "Define ITEMS_PER_INVOCATION!" 
#endif -struct PushConstantData +[[vk::push_constant]] PushConstantData pc; + +struct device_capabilities { - uint64_t inputBufAddress; - uint64_t outputAddressBufAddress; +#ifdef TEST_NATIVE + NBL_CONSTEXPR_STATIC_INLINE bool shaderSubgroupArithmetic = true; +#else + NBL_CONSTEXPR_STATIC_INLINE bool shaderSubgroupArithmetic = true; +#endif }; -[[vk::push_constant]] PushConstantData pc; - // because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way uint32_t globalIndex(); // since we test ITEMS_PER_WG requires std::is_base_of_v class CExplicitSurfaceFormatResizeSurface final : public ISimpleManagedSurface { @@ -289,8 +283,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub transferDownQueue = getTransferDownQueue(); computeQueue = getComputeQueue(); - // TODO: get the element count from argv - const uint32_t elementCount = Output<>::ScanElementCount; // populate our random data buffer on the CPU and create a GPU copy inputData = new uint32_t[elementCount]; { @@ -299,7 +291,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub inputData[i] = randGenerator(); // TODO: change to using xoroshiro, then we can skip having the input buffer at all IGPUBuffer::SCreationParams inputDataBufferCreationParams = {}; - inputDataBufferCreationParams.size = sizeof(Output<>::data[0]) * elementCount; + inputDataBufferCreationParams.size = sizeof(uint32_t) * elementCount; inputDataBufferCreationParams.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; m_utils->createFilledDeviceLocalBufferOnDedMem( SIntendedSubmitInfo{.queue=getTransferUpQueue()}, @@ -335,8 +327,8 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub params.size = OutputBufferCount * sizeof(uint64_t); m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = getTransferUpQueue() 
}, std::move(params), outputAddresses.data()).move_into(gpuOutputAddressesBuffer); } - pc.inputBufAddress = gpuinputDataBuffer->getDeviceAddress(); - pc.outputAddressBufAddress = gpuOutputAddressesBuffer->getDeviceAddress(); + pc.pInputBuf = gpuinputDataBuffer->getDeviceAddress(); + pc.ppOutputBuf = gpuOutputAddressesBuffer->getDeviceAddress(); // create image views for swapchain images for (uint32_t i = 0; i < ISwapchain::MaxImages; i++) @@ -357,16 +349,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub // create Descriptor Sets and Pipeline Layouts smart_refctd_ptr benchPplnLayout; { - // create Descriptor Set Layout - smart_refctd_ptr dsLayout; - { - IGPUDescriptorSetLayout::SBinding binding[2]; - for (uint32_t i = 0u; i < 2; i++) - binding[i] = {{},i,IDescriptor::E_TYPE::ET_STORAGE_BUFFER,IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,IShader::E_SHADER_STAGE::ESS_COMPUTE,1u,nullptr }; - binding[1].count = OutputBufferCount; - dsLayout = m_device->createDescriptorSetLayout(binding); - } - // set and transient pool smart_refctd_ptr benchLayout; { @@ -402,7 +384,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub auto workgroupBenchSource = getShaderSource("app_resources/benchmarkWorkgroup.comp.hlsl"); // now create or retrieve final resources to run our tests sema = m_device->createSemaphore(timelineValue); - resultsBuffer = ICPUBuffer::create({ outputBuffers[0]->getSize() }); smart_refctd_ptr cmdbuf; { smart_refctd_ptr cmdpool = m_device->createCommandPool(computeQueue->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); @@ -413,20 +394,17 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub } } - // const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations; - const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize; const auto MaxSubgroupSize = 
m_physicalDevice->getLimits().maxSubgroupSize; - // for each workgroup size (manually adjust items per invoc, operation else uses up a lot of ram) if constexpr (DoWorkgroupBenchmarks) { for (uint32_t i = 0; i < workgroupSizes.size(); i++) - benchSets[i] = createBenchmarkPipelines(workgroupBenchSource, benchPplnLayout.get(), elementCount, hlsl::findMSB(MinSubgroupSize), workgroupSizes[i], ItemsPerInvocation, NumLoops); + benchSets[i] = createBenchmarkPipelines(workgroupBenchSource, benchPplnLayout.get(), elementCount, hlsl::findMSB(MaxSubgroupSize), workgroupSizes[i], ItemsPerInvocation, NumLoops); } else { for (uint32_t i = 0; i < workgroupSizes.size(); i++) - benchSets[i] = createBenchmarkPipelines(subgroupBenchSource, benchPplnLayout.get(), elementCount, hlsl::findMSB(MinSubgroupSize), workgroupSizes[i], ItemsPerInvocation, NumLoops); + benchSets[i] = createBenchmarkPipelines(subgroupBenchSource, benchPplnLayout.get(), elementCount, hlsl::findMSB(MaxSubgroupSize), workgroupSizes[i], ItemsPerInvocation, NumLoops); } m_winMgr->show(m_window.get()); @@ -509,10 +487,8 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub }; m_device->updateDescriptorSets(1u, dsWrites, 0u, nullptr); - const uint32_t elementCount = Output<>::ScanElementCount; - const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize; + const uint32_t elementCount = 1024*1024; const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize; - const auto SubgroupSizeLog2 = hlsl::findMSB(MaxSubgroupSize); cmdbuf->bindDescriptorSets(EPBP_COMPUTE, benchSets[0].pipeline->getLayout(), 0u, 1u, &benchDs.get()); @@ -608,17 +584,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub bool keepRunning() override { return numSubmits < MaxNumSubmits; } private: - void logTestOutcome(bool passed, uint32_t workgroupSize) - { - if (passed) - m_logger->log("Passed test #%u", ILogger::ELL_INFO, workgroupSize); - else - { - 
totalFailCount++; - m_logger->log("Failed test #%u", ILogger::ELL_ERROR, workgroupSize); - } - } - // create pipeline (specialized every test) [TODO: turn into a future/async] smart_refctd_ptr createPipeline(const ICPUShader* overridenUnspecialized, const IGPUPipelineLayout* layout, const uint8_t subgroupSizeLog2) { @@ -648,7 +613,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub template class Arithmetic, bool WorkgroupBench> BenchmarkSet createBenchmarkPipelines(const smart_refctd_ptr&source, const IGPUPipelineLayout* layout, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerInvoc = 1u, uint32_t numLoops = 8u) { - std::string arith_name = Arithmetic>::name; + std::string arith_name = Arithmetic>::name; auto compiler = make_smart_refctd_ptr(smart_refctd_ptr(m_system)); CHLSLCompiler::SOptions options = {}; @@ -784,13 +749,14 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub constexpr static inline uint32_t MaxNumSubmits = 30; uint32_t numSubmits = 0; + uint32_t elementCount = 1024 * 1024; /* PARAMETERS TO CHANGE FOR DIFFERENT BENCHMARKS */ constexpr static inline bool DoWorkgroupBenchmarks = true; uint32_t ItemsPerInvocation = 4u; constexpr static inline uint32_t NumLoops = 1000u; - constexpr static inline uint32_t NumBenchmarks = 6u; - constexpr static inline std::array workgroupSizes = { 32, 64, 128, 256, 512, 1024 }; + constexpr static inline uint32_t NumBenchmarks = 2u; + constexpr static inline std::array workgroupSizes = { 32, 64 };// 128, 256, 512, 1024}; template using ArithmeticOp = emulatedReduction; // change this to test other arithmetic ops @@ -807,9 +773,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub smart_refctd_ptr sema; uint64_t timelineValue = 0; - smart_refctd_ptr resultsBuffer; - - uint32_t totalFailCount = 0; }; NBL_MAIN_FUNC(ArithmeticBenchApp) \ No newline at end of file diff 
--git a/common/include/WorkgroupDataAccessors.hlsl b/common/include/WorkgroupDataAccessors.hlsl new file mode 100644 index 000000000..267c81a73 --- /dev/null +++ b/common/include/WorkgroupDataAccessors.hlsl @@ -0,0 +1,119 @@ +#ifndef _WORKGROUP_DATA_ACCESSORS_HLSL_ +#define _WORKGROUP_DATA_ACCESSORS_HLSL_ + +namespace nbl +{ +namespace hlsl +{ + +struct ScratchProxy +{ + template + void get(const uint32_t ix, NBL_REF_ARG(AccessType) value) + { + value = scratch[ix]; + } + template + void set(const uint32_t ix, const AccessType value) + { + scratch[ix] = value; + } + + uint32_t atomicOr(const uint32_t ix, const uint32_t value) + { + return glsl::atomicOr(scratch[ix],value); + } + + void workgroupExecutionAndMemoryBarrier() + { + glsl::barrier(); + //glsl::memoryBarrierShared(); implied by the above + } +}; + +template +struct DataProxy +{ + using dtype_t = vector; + + static DataProxy create() + { + DataProxy retval; + retval.workgroupOffset = glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize; + retval.outputBufAddr = sizeof(uint32_t) + vk::RawBufferLoad(pc.ppOutputBuf + Binop::BindingIndex * sizeof(uint64_t)); + return retval; + } + + template + void get(const IndexType ix, NBL_REF_ARG(AccessType) value) + { + value = vk::RawBufferLoad(pc.pInputBuf + (workgroupOffset + ix) * sizeof(AccessType)); + } + template + void set(const IndexType ix, const AccessType value) + { + vk::RawBufferStore(outputBufAddr + sizeof(AccessType) * (workgroupOffset+ix), value, sizeof(uint32_t)); + } + + void workgroupExecutionAndMemoryBarrier() + { + glsl::barrier(); + //glsl::memoryBarrierShared(); implied by the above + } + + uint32_t workgroupOffset; + uint64_t outputBufAddr; +}; + +template +struct PreloadedDataProxy +{ + using dtype_t = vector; + + NBL_CONSTEXPR_STATIC_INLINE uint32_t PreloadedDataCount = Config::VirtualWorkgroupSize / Config::WorkgroupSize; + + static PreloadedDataProxy create() + { + PreloadedDataProxy retval; + retval.data = DataProxy::create(); + return 
retval; + } + + template + void get(const IndexType ix, NBL_REF_ARG(AccessType) value) + { + value = preloaded[ix>>Config::WorkgroupSizeLog2]; + } + template + void set(const IndexType ix, const AccessType value) + { + preloaded[ix>>Config::WorkgroupSizeLog2] = value; + } + + void preload() + { + [unroll] + for (uint16_t idx = 0; idx < PreloadedDataCount; idx++) + data.template get(idx * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex(), preloaded[idx]); + } + void unload() + { + [unroll] + for (uint16_t idx = 0; idx < PreloadedDataCount; idx++) + data.template set(idx * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex(), preloaded[idx]); + } + + void workgroupExecutionAndMemoryBarrier() + { + glsl::barrier(); + //glsl::memoryBarrierShared(); implied by the above + } + + DataProxy data; + dtype_t preloaded[PreloadedDataCount]; +}; + +} +} + +#endif From 2aef6d343f68dbf9db15505ae50ed6ce2a249d4c Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 6 Jun 2025 15:42:05 +0700 Subject: [PATCH 224/296] generate benchmark inputs with xoroshiro --- .../app_resources/benchmarkSubgroup.comp.hlsl | 8 ++- .../benchmarkWorkgroup.comp.hlsl | 71 +++++++++++++++---- .../app_resources/shaderCommon.hlsl | 2 +- 29_Arithmetic2Bench/main.cpp | 54 +++++++------- common/include/WorkgroupDataAccessors.hlsl | 6 +- 5 files changed, 93 insertions(+), 48 deletions(-) diff --git a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl index 553103bef..2e5d3e146 100644 --- a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl +++ b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl @@ -5,6 +5,7 @@ #include "nbl/builtin/hlsl/glsl_compat/core.hlsl" #include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl" #include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" +#include "nbl/builtin/hlsl/random/xoroshiro.hlsl" #include "shaderCommon.hlsl" #include 
"nbl/builtin/hlsl/workgroup/basic.hlsl" @@ -35,8 +36,11 @@ static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal) void benchmark() { - const uint32_t idx = globalIndex(); - type_t sourceVal = vk::RawBufferLoad(pc.pInputBuf + idx * sizeof(type_t)); + type_t sourceVal; + Xoroshiro64Star xoroshiro = Xoroshiro64Star::construct(uint32_t2(invocationIndex,invocationIndex+1)); + [unroll] + for (uint16_t i = 0; i < Config::ItemsPerInvocation_0; i++) + sourceVal[i] = xoroshiro(); subbench, ITEMS_PER_INVOCATION>(sourceVal); subbench, ITEMS_PER_INVOCATION>(sourceVal); diff --git a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl index 504cc36de..4e611476a 100644 --- a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl +++ b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl @@ -4,6 +4,7 @@ #include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl" #include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" #include "nbl/builtin/hlsl/workgroup2/arithmetic.hlsl" +#include "nbl/builtin/hlsl/random/xoroshiro.hlsl" static const uint32_t WORKGROUP_SIZE = 1u << WORKGROUP_SIZE_LOG2; @@ -18,6 +19,59 @@ groupshared uint32_t scratch[config_t::SharedScratchElementCount]; #include "../../common/include/WorkgroupDataAccessors.hlsl" +template +struct RandomizedInputDataProxy +{ + using dtype_t = vector; + + NBL_CONSTEXPR_STATIC_INLINE uint32_t PreloadedDataCount = Config::VirtualWorkgroupSize / Config::WorkgroupSize; + + static RandomizedInputDataProxy create() + { + RandomizedInputDataProxy retval; + retval.data = DataProxy::create(); + return retval; + } + + template + void get(const IndexType ix, NBL_REF_ARG(AccessType) value) + { + value = preloaded[ix>>Config::WorkgroupSizeLog2]; + } + template + void set(const IndexType ix, const AccessType value) + { + preloaded[ix>>Config::WorkgroupSizeLog2] = value; + } + + void preload() + { + const uint16_t invocationIndex = 
workgroup::SubgroupContiguousIndex(); + Xoroshiro64Star xoroshiro = Xoroshiro64Star::construct(uint32_t2(invocationIndex,invocationIndex+1)); + [unroll] + for (uint16_t idx = 0; idx < PreloadedDataCount; idx++) + [unroll] + for (uint16_t i = 0; i < Config::ItemsPerInvocation_0; i++) + preloaded[idx][i] = xoroshiro(); + } + void unload() + { + const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); + [unroll] + for (uint16_t idx = 0; idx < PreloadedDataCount; idx++) + data.template set(idx * Config::WorkgroupSize + invocationIndex, preloaded[idx]); + } + + void workgroupExecutionAndMemoryBarrier() + { + glsl::barrier(); + //glsl::memoryBarrierShared(); implied by the above + } + + DataProxy data; + dtype_t preloaded[PreloadedDataCount]; +}; + static ScratchProxy arithmeticAccessor; template @@ -26,29 +80,20 @@ struct operation_t using binop_base_t = typename Binop::base_t; using otype_t = typename Binop::type_t; - void operator()(PreloadedDataProxy dataAccessor) + void operator()(RandomizedInputDataProxy dataAccessor) { #if IS_REDUCTION otype_t value = #endif - OPERATION::template __call, ScratchProxy>(dataAccessor,arithmeticAccessor); + OPERATION::template __call, ScratchProxy>(dataAccessor,arithmeticAccessor); // we barrier before because we alias the accessors for Binop arithmeticAccessor.workgroupExecutionAndMemoryBarrier(); #if IS_REDUCTION [unroll] - for (uint32_t i = 0; i < PreloadedDataProxy::PreloadedDataCount; i++) + for (uint32_t i = 0; i < RandomizedInputDataProxy::PreloadedDataCount; i++) dataAccessor.preloaded[i] = value; #endif } -// #else -// void operator()(PreloadedDataProxy dataAccessor) -// { -// OPERATION::template __call, ScratchProxy>(dataAccessor,arithmeticAccessor); -// // we barrier before because we alias the accessors for Binop -// arithmeticAccessor.workgroupExecutionAndMemoryBarrier(); -// } -// #endif - }; template @@ -56,7 +101,7 @@ static void subbench() { const uint64_t outputBufAddr = vk::RawBufferLoad(pc.ppOutputBuf 
+ Binop::BindingIndex * sizeof(uint64_t), sizeof(uint64_t)); - PreloadedDataProxy dataAccessor = PreloadedDataProxy::create(); + RandomizedInputDataProxy dataAccessor = RandomizedInputDataProxy::create(); dataAccessor.preload(); operation_t func; diff --git a/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl b/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl index 8659fd054..bf66de500 100644 --- a/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl +++ b/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl @@ -17,7 +17,7 @@ struct device_capabilities #ifdef TEST_NATIVE NBL_CONSTEXPR_STATIC_INLINE bool shaderSubgroupArithmetic = true; #else - NBL_CONSTEXPR_STATIC_INLINE bool shaderSubgroupArithmetic = true; + NBL_CONSTEXPR_STATIC_INLINE bool shaderSubgroupArithmetic = false; #endif }; diff --git a/29_Arithmetic2Bench/main.cpp b/29_Arithmetic2Bench/main.cpp index 0b6639bec..002471b22 100644 --- a/29_Arithmetic2Bench/main.cpp +++ b/29_Arithmetic2Bench/main.cpp @@ -283,28 +283,11 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub transferDownQueue = getTransferDownQueue(); computeQueue = getComputeQueue(); - // populate our random data buffer on the CPU and create a GPU copy - inputData = new uint32_t[elementCount]; - { - std::mt19937 randGenerator(0xdeadbeefu); - for (uint32_t i = 0u; i < elementCount; i++) - inputData[i] = randGenerator(); // TODO: change to using xoroshiro, then we can skip having the input buffer at all - - IGPUBuffer::SCreationParams inputDataBufferCreationParams = {}; - inputDataBufferCreationParams.size = sizeof(uint32_t) * elementCount; - inputDataBufferCreationParams.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - m_utils->createFilledDeviceLocalBufferOnDedMem( - SIntendedSubmitInfo{.queue=getTransferUpQueue()}, - std::move(inputDataBufferCreationParams), - inputData - ).move_into(gpuinputDataBuffer); - } - // create 8 buffers for 
8 operations for (auto i=0u; igetSize(); + params.size = sizeof(uint32_t) * (elementCount+1); params.usage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_SRC_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; outputBuffers[i] = m_device->createBuffer(std::move(params)); @@ -327,7 +310,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub params.size = OutputBufferCount * sizeof(uint64_t); m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = getTransferUpQueue() }, std::move(params), outputAddresses.data()).move_into(gpuOutputAddressesBuffer); } - pc.pInputBuf = gpuinputDataBuffer->getDeviceAddress(); pc.ppOutputBuf = gpuOutputAddressesBuffer->getDeviceAddress(); // create image views for swapchain images @@ -363,6 +345,12 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, .offset = 0,.size = sizeof(PushConstantData) }; benchPplnLayout = m_device->createPipelineLayout({ &pcRange, 1 }, std::move(benchLayout)); } + if (UseNativeArithmetic && !m_physicalDevice->getProperties().limits.shaderSubgroupArithmetic) + { + m_logger->log("UseNativeArithmetic is true but device does not support shaderSubgroupArithmetic!", ILogger::ELL_ERROR); + exit(-1); + } + // load shader source from file auto getShaderSource = [&](const char* filePath) -> auto @@ -414,7 +402,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub virtual bool onAppTerminated() override { - delete[] inputData; return true; } @@ -650,16 +637,20 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub std::to_string(arith_name=="reduction") }; - const IShaderCompiler::SMacroDefinition defines[7] = { + const IShaderCompiler::SMacroDefinition defines[8] = { { "OPERATION", definitions[0] }, { "WORKGROUP_SIZE_LOG2", definitions[1] }, { "ITEMS_PER_WG", definitions[2] 
}, { "ITEMS_PER_INVOCATION", definitions[3] }, { "SUBGROUP_SIZE_LOG2", definitions[4] }, { "NUM_LOOPS", definitions[5] }, - { "IS_REDUCTION", definitions[6] } + { "IS_REDUCTION", definitions[6] }, + { "TEST_NATIVE", "1" } }; - options.preprocessorOptions.extraDefines = { defines, defines + 7 }; + if (UseNativeArithmetic) + options.preprocessorOptions.extraDefines = { defines, defines + 8 }; + else + options.preprocessorOptions.extraDefines = { defines, defines + 7 }; overriddenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options); } @@ -673,14 +664,18 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub std::to_string(numLoops) }; - const IShaderCompiler::SMacroDefinition defines[5] = { + const IShaderCompiler::SMacroDefinition defines[6] = { { "OPERATION", definitions[0] }, { "WORKGROUP_SIZE", definitions[1] }, { "ITEMS_PER_INVOCATION", definitions[2] }, { "SUBGROUP_SIZE_LOG2", definitions[3] }, - { "NUM_LOOPS", definitions[4] } + { "NUM_LOOPS", definitions[4] }, + { "TEST_NATIVE", "1" } }; - options.preprocessorOptions.extraDefines = { defines, defines + 5 }; + if (UseNativeArithmetic) + options.preprocessorOptions.extraDefines = { defines, defines + 6 }; + else + options.preprocessorOptions.extraDefines = { defines, defines + 5 }; overriddenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options); } @@ -753,10 +748,11 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub /* PARAMETERS TO CHANGE FOR DIFFERENT BENCHMARKS */ constexpr static inline bool DoWorkgroupBenchmarks = true; + constexpr static inline bool UseNativeArithmetic = true; uint32_t ItemsPerInvocation = 4u; constexpr static inline uint32_t NumLoops = 1000u; - constexpr static inline uint32_t NumBenchmarks = 2u; - constexpr static inline std::array workgroupSizes = { 32, 64 };// 128, 256, 512, 1024}; + constexpr static inline uint32_t NumBenchmarks = 6u; 
+ constexpr static inline std::array workgroupSizes = { 32, 64, 128, 256, 512, 1024 }; template using ArithmeticOp = emulatedReduction; // change this to test other arithmetic ops @@ -764,8 +760,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub smart_refctd_ptr benchPool; smart_refctd_ptr benchDs; - uint32_t* inputData = nullptr; - smart_refctd_ptr gpuinputDataBuffer; constexpr static inline uint32_t OutputBufferCount = 8u; smart_refctd_ptr outputBuffers[OutputBufferCount]; smart_refctd_ptr gpuOutputAddressesBuffer; diff --git a/common/include/WorkgroupDataAccessors.hlsl b/common/include/WorkgroupDataAccessors.hlsl index 267c81a73..e6112f797 100644 --- a/common/include/WorkgroupDataAccessors.hlsl +++ b/common/include/WorkgroupDataAccessors.hlsl @@ -92,15 +92,17 @@ struct PreloadedDataProxy void preload() { + const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); [unroll] for (uint16_t idx = 0; idx < PreloadedDataCount; idx++) - data.template get(idx * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex(), preloaded[idx]); + data.template get(idx * Config::WorkgroupSize + invocationIndex, preloaded[idx]); } void unload() { + const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); [unroll] for (uint16_t idx = 0; idx < PreloadedDataCount; idx++) - data.template set(idx * Config::WorkgroupSize + workgroup::SubgroupContiguousIndex(), preloaded[idx]); + data.template set(idx * Config::WorkgroupSize + invocationIndex, preloaded[idx]); } void workgroupExecutionAndMemoryBarrier() From 149a2375c3ffb43fa4c3e403c6d6eae056828fb3 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 6 Jun 2025 16:20:40 +0700 Subject: [PATCH 225/296] only have to benchmark plus op --- .../app_resources/benchmarkSubgroup.comp.hlsl | 6 -- .../benchmarkWorkgroup.comp.hlsl | 17 +---- 29_Arithmetic2Bench/app_resources/common.hlsl | 64 +------------------ .../app_resources/shaderCommon.hlsl | 5 -- 29_Arithmetic2Bench/main.cpp | 15 
+---- 5 files changed, 4 insertions(+), 103 deletions(-) diff --git a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl index 2e5d3e146..ba11890d1 100644 --- a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl +++ b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl @@ -42,13 +42,7 @@ void benchmark() for (uint16_t i = 0; i < Config::ItemsPerInvocation_0; i++) sourceVal[i] = xoroshiro(); - subbench, ITEMS_PER_INVOCATION>(sourceVal); - subbench, ITEMS_PER_INVOCATION>(sourceVal); - subbench, ITEMS_PER_INVOCATION>(sourceVal); subbench, ITEMS_PER_INVOCATION>(sourceVal); - subbench, ITEMS_PER_INVOCATION>(sourceVal); - subbench, ITEMS_PER_INVOCATION>(sourceVal); - subbench, ITEMS_PER_INVOCATION>(sourceVal); } [numthreads(WORKGROUP_SIZE,1,1)] diff --git a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl index 4e611476a..58a3624cd 100644 --- a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl +++ b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl @@ -113,26 +113,11 @@ static void subbench() void benchmark() { - subbench >(); - subbench >(); - subbench >(); + // only benchmark plus op subbench >(); - subbench >(); - subbench >(); - subbench >(); } -uint32_t globalIndex() -{ - return glsl::gl_WorkGroupID().x*ITEMS_PER_WG+workgroup::SubgroupContiguousIndex(); -} - -bool canStore() -{ - return workgroup::SubgroupContiguousIndex() -struct bit_and : nbl::hlsl::bit_and -{ - using base_t = nbl::hlsl::bit_and; - - NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 0; -#ifndef __HLSL_VERSION - static inline constexpr const char* name = "bit_and"; -#endif -}; -template -struct bit_or : nbl::hlsl::bit_or -{ - using base_t = nbl::hlsl::bit_or; - - NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 1; -#ifndef __HLSL_VERSION - static inline constexpr const char* name = "bit_xor"; 
-#endif -}; -template -struct bit_xor : nbl::hlsl::bit_xor -{ - using base_t = nbl::hlsl::bit_xor; - - NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 2; -#ifndef __HLSL_VERSION - static inline constexpr const char* name = "bit_or"; -#endif -}; -template struct plus : nbl::hlsl::plus { using base_t = nbl::hlsl::plus; - NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 3; + NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 0; #ifndef __HLSL_VERSION static inline constexpr const char* name = "plus"; #endif }; -template -struct multiplies : nbl::hlsl::multiplies -{ - using base_t = nbl::hlsl::multiplies; - - NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 4; -#ifndef __HLSL_VERSION - static inline constexpr const char* name = "multiplies"; -#endif -}; -template -struct minimum : nbl::hlsl::minimum -{ - using base_t = nbl::hlsl::minimum; - - NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 5; -#ifndef __HLSL_VERSION - static inline constexpr const char* name = "minimum"; -#endif -}; -template -struct maximum : nbl::hlsl::maximum -{ - using base_t = nbl::hlsl::maximum; - - NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 6; -#ifndef __HLSL_VERSION - static inline constexpr const char* name = "maximum"; -#endif -}; template struct ballot : nbl::hlsl::plus { using base_t = nbl::hlsl::plus; - NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 7; + NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 1; #ifndef __HLSL_VERSION static inline constexpr const char* name = "bitcount"; #endif diff --git a/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl b/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl index bf66de500..4866efe81 100644 --- a/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl +++ b/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl @@ -21,11 +21,6 @@ struct device_capabilities #endif }; -// because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way -uint32_t globalIndex(); -// since 
we test ITEMS_PER_WGcreateSemaphore(timelineValue); - smart_refctd_ptr cmdbuf; - { - smart_refctd_ptr cmdpool = m_device->createCommandPool(computeQueue->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{&cmdbuf,1})) - { - logFail("Failed to create Command Buffers!\n"); - return false; - } - } - const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize; // for each workgroup size (manually adjust items per invoc, operation else uses up a lot of ram) if constexpr (DoWorkgroupBenchmarks) @@ -760,12 +748,11 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub smart_refctd_ptr benchPool; smart_refctd_ptr benchDs; - constexpr static inline uint32_t OutputBufferCount = 8u; + constexpr static inline uint32_t OutputBufferCount = 2u; smart_refctd_ptr outputBuffers[OutputBufferCount]; smart_refctd_ptr gpuOutputAddressesBuffer; PushConstantData pc; - smart_refctd_ptr sema; uint64_t timelineValue = 0; }; From 00ed9beaddced9d0bd01e18d510ea7d58e48cfb5 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 6 Jun 2025 16:55:15 +0700 Subject: [PATCH 226/296] benchmark all reduce/scan in one run (lots of shaders) --- 29_Arithmetic2Bench/main.cpp | 96 ++++++++++++++---------------------- 1 file changed, 37 insertions(+), 59 deletions(-) diff --git a/29_Arithmetic2Bench/main.cpp b/29_Arithmetic2Bench/main.cpp index c91cbe4aa..38f995264 100644 --- a/29_Arithmetic2Bench/main.cpp +++ b/29_Arithmetic2Bench/main.cpp @@ -10,43 +10,6 @@ using namespace asset; using namespace ui; using namespace video; -// method emulations on the CPU, to verify the results of the GPU methods -template -struct emulatedReduction -{ - using type_t = typename Binop::type_t; - - static inline void impl(type_t* out, const type_t* in, const uint32_t itemCount) - { - const type_t red = std::reduce(in,in+itemCount,Binop::identity,Binop()); - 
std::fill(out,out+itemCount,red); - } - - static inline constexpr const char* name = "reduction"; -}; -template -struct emulatedScanInclusive -{ - using type_t = typename Binop::type_t; - - static inline void impl(type_t* out, const type_t* in, const uint32_t itemCount) - { - std::inclusive_scan(in,in+itemCount,out,Binop()); - } - static inline constexpr const char* name = "inclusive_scan"; -}; -template -struct emulatedScanExclusive -{ - using type_t = typename Binop::type_t; - - static inline void impl(type_t* out, const type_t* in, const uint32_t itemCount) - { - std::exclusive_scan(in,in+itemCount,out,Binop::identity,Binop()); - } - static inline constexpr const char* name = "exclusive_scan"; -}; - template requires std::is_base_of_v class CExplicitSurfaceFormatResizeSurface final : public ISimpleManagedSurface { @@ -287,7 +250,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub for (auto i=0u; icreateBuffer(std::move(params)); @@ -368,20 +331,17 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub return smart_refctd_ptr_static_cast(firstAssetInBundle); }; - auto subgroupBenchSource = getShaderSource("app_resources/benchmarkSubgroup.comp.hlsl"); - auto workgroupBenchSource = getShaderSource("app_resources/benchmarkWorkgroup.comp.hlsl"); - const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize; // for each workgroup size (manually adjust items per invoc, operation else uses up a lot of ram) + const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize; + smart_refctd_ptr shaderSource; if constexpr (DoWorkgroupBenchmarks) - { - for (uint32_t i = 0; i < workgroupSizes.size(); i++) - benchSets[i] = createBenchmarkPipelines(workgroupBenchSource, benchPplnLayout.get(), elementCount, hlsl::findMSB(MaxSubgroupSize), workgroupSizes[i], ItemsPerInvocation, NumLoops); - } + shaderSource = getShaderSource("app_resources/benchmarkWorkgroup.comp.hlsl"); else - { + 
shaderSource = getShaderSource("app_resources/benchmarkSubgroup.comp.hlsl"); + + for (uint32_t op = 0; op < arithmeticOperations.size(); op++) for (uint32_t i = 0; i < workgroupSizes.size(); i++) - benchSets[i] = createBenchmarkPipelines(subgroupBenchSource, benchPplnLayout.get(), elementCount, hlsl::findMSB(MaxSubgroupSize), workgroupSizes[i], ItemsPerInvocation, NumLoops); - } + benchSets[op*workgroupSizes.size()+i] = createBenchmarkPipelines(shaderSource, benchPplnLayout.get(), ElementCount, arithmeticOperations[op], hlsl::findMSB(MaxSubgroupSize), workgroupSizes[i], ItemsPerInvocation, NumLoops); m_winMgr->show(m_window.get()); @@ -559,6 +519,27 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub bool keepRunning() override { return numSubmits < MaxNumSubmits; } private: + // reflects calculations in workgroup2::ArithmeticConfiguration + uint32_t calculateItemsPerWorkgroup(const uint32_t workgroupSize, const uint32_t subgroupSize, const uint32_t itemsPerInvocation) + { + if (workgroupSize <= subgroupSize) + return workgroupSize * itemsPerInvocation; + + const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize); + const uint8_t workgroupSizeLog2 = hlsl::findMSB(workgroupSize); + + const uint16_t levels = (workgroupSizeLog2 == subgroupSizeLog2) ? 1 : + (workgroupSizeLog2 > subgroupSizeLog2 * 2 + 2) ? 3 : 2; + + const uint16_t itemsPerInvocationProductLog2 = max(workgroupSizeLog2 - subgroupSizeLog2 * levels, 0); + uint16_t itemsPerInvocation1 = (levels == 3) ? 
min(itemsPerInvocationProductLog2, 2) : itemsPerInvocationProductLog2; + itemsPerInvocation1 = uint16_t(1u) << itemsPerInvocation1; + + uint32_t virtualWorkgroupSize = 1u << max(subgroupSizeLog2 * levels, workgroupSizeLog2); + + return itemsPerInvocation * virtualWorkgroupSize; + } + // create pipeline (specialized every test) [TODO: turn into a future/async] smart_refctd_ptr createPipeline(const ICPUShader* overridenUnspecialized, const IGPUPipelineLayout* layout, const uint8_t subgroupSizeLog2) { @@ -585,11 +566,9 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub uint32_t itemsPerInvocation; }; - template class Arithmetic, bool WorkgroupBench> - BenchmarkSet createBenchmarkPipelines(const smart_refctd_ptr&source, const IGPUPipelineLayout* layout, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerInvoc = 1u, uint32_t numLoops = 8u) + template + BenchmarkSet createBenchmarkPipelines(const smart_refctd_ptr&source, const IGPUPipelineLayout* layout, const uint32_t elementCount, const std::string& arith_name, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerInvoc = 1u, uint32_t numLoops = 8u) { - std::string arith_name = Arithmetic>::name; - auto compiler = make_smart_refctd_ptr(smart_refctd_ptr(m_system)); CHLSLCompiler::SOptions options = {}; options.stage = IShader::E_SHADER_STAGE::ESS_COMPUTE; @@ -606,11 +585,10 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub options.preprocessorOptions.logger = m_logger.get(); auto* includeFinder = compiler->getDefaultIncludeFinder(); - includeFinder->addSearchPath("nbl/builtin/hlsl/jit", core::make_smart_refctd_ptr(m_physicalDevice->getLimits(), m_device->getEnabledFeatures())); options.preprocessorOptions.includeFinder = includeFinder; const uint32_t subgroupSize = 0x1u << subgroupSizeLog2; - const uint32_t itemsPerWG = workgroupSize <= subgroupSize ? 
workgroupSize * itemsPerInvoc : itemsPerInvoc * max(workgroupSize >> subgroupSizeLog2, subgroupSize) << subgroupSizeLog2; // TODO use Config somehow + const uint32_t itemsPerWG = calculateItemsPerWorkgroup(workgroupSize, subgroupSize, itemsPerInvoc); smart_refctd_ptr overriddenUnspecialized; if constexpr (WorkgroupBench) { @@ -732,7 +710,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub constexpr static inline uint32_t MaxNumSubmits = 30; uint32_t numSubmits = 0; - uint32_t elementCount = 1024 * 1024; + constexpr static inline uint32_t ElementCount = 1024 * 1024; /* PARAMETERS TO CHANGE FOR DIFFERENT BENCHMARKS */ constexpr static inline bool DoWorkgroupBenchmarks = true; @@ -740,11 +718,11 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub uint32_t ItemsPerInvocation = 4u; constexpr static inline uint32_t NumLoops = 1000u; constexpr static inline uint32_t NumBenchmarks = 6u; - constexpr static inline std::array workgroupSizes = { 32, 64, 128, 256, 512, 1024 }; - template - using ArithmeticOp = emulatedReduction; // change this to test other arithmetic ops + std::array workgroupSizes = { 32, 64, 128, 256, 512, 1024 }; + std::array arithmeticOperations = { "reduction", "inclusive_scan", "exclusive_scan" }; + - std::array benchSets; + std::array benchSets; smart_refctd_ptr benchPool; smart_refctd_ptr benchDs; From a5a21fd577fab8dae00995e5f1cefbc63f48cd5e Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 9 Jun 2025 11:09:53 +0700 Subject: [PATCH 227/296] minor changes to passing subgroup size and items per wg --- .../app_resources/testSubgroup.comp.hlsl | 8 +-- .../app_resources/testWorkgroup.comp.hlsl | 7 +-- 23_Arithmetic2UnitTest/main.cpp | 60 ++++++++----------- .../benchmarkWorkgroup.comp.hlsl | 2 +- 4 files changed, 30 insertions(+), 47 deletions(-) diff --git a/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl index 
8d8557ccd..2adb4dc81 100644 --- a/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl @@ -16,8 +16,6 @@ uint32_t globalIndex() return glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+workgroup::SubgroupContiguousIndex(); } -bool canStore() { return true; } - template static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) { @@ -26,13 +24,13 @@ static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) const uint64_t outputBufAddr = vk::RawBufferLoad(pc.ppOutputBuf + Binop::BindingIndex * sizeof(uint64_t), sizeof(uint64_t)); - if (globalIndex()==0u) + if (glsl::gl_SubgroupSize()!=1u<(outputBufAddr, glsl::gl_SubgroupSize()); operation_t func; type_t val = func(sourceVal); - if (canStore()) - vk::RawBufferStore(outputBufAddr + sizeof(uint32_t) + sizeof(type_t) * globalIndex(), val, sizeof(uint32_t)); + + vk::RawBufferStore(outputBufAddr + sizeof(uint32_t) + sizeof(type_t) * globalIndex(), val, sizeof(uint32_t)); } type_t test() diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl index a38124b0c..efaa25874 100644 --- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl @@ -48,16 +48,11 @@ struct operation_t }; -uint32_t globalIndex() -{ - return glsl::gl_WorkGroupID().x*ITEMS_PER_WG+workgroup::SubgroupContiguousIndex(); -} - template static void subtest() { uint64_t outputBufAddr = vk::RawBufferLoad(pc.ppOutputBuf + Binop::BindingIndex * sizeof(uint64_t)); - if (globalIndex()==0u) + if (glsl::gl_SubgroupSize()!=1u<(outputBufAddr, glsl::gl_SubgroupSize()); operation_t func; diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp index 326c9e57f..ad867bc92 100644 --- a/23_Arithmetic2UnitTest/main.cpp +++ b/23_Arithmetic2UnitTest/main.cpp @@ -216,16 +216,17 @@ class Workgroup2ScanTestApp final : public 
application_templates::BasicMultiQueu for (uint32_t j = 0; j < ItemsPerInvocations.size(); j++) { const uint32_t itemsPerInvocation = ItemsPerInvocations[j]; + uint32_t itemsPerWG = workgroupSize * itemsPerInvocation; m_logger->log("Testing Items per Invocation %u", ILogger::ELL_INFO, itemsPerInvocation); bool passed = true; - passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, b_useNative, ~0u, itemsPerInvocation) && passed; - logTestOutcome(passed, workgroupSize); - passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, b_useNative, ~0u, itemsPerInvocation) && passed; - logTestOutcome(passed, workgroupSize); - passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, b_useNative, ~0u, itemsPerInvocation) && passed; - logTestOutcome(passed, workgroupSize); - - const uint32_t itemsPerWG = calculateItemsPerWorkgroup(workgroupSize, subgroupSize, itemsPerInvocation); + passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, b_useNative, itemsPerWG, itemsPerInvocation) && passed; + logTestOutcome(passed, itemsPerWG); + passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, b_useNative, itemsPerWG, itemsPerInvocation) && passed; + logTestOutcome(passed, itemsPerWG); + passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, b_useNative, itemsPerWG, itemsPerInvocation) && passed; + logTestOutcome(passed, itemsPerWG); + + itemsPerWG = calculateItemsPerWorkgroup(workgroupSize, subgroupSize, itemsPerInvocation); m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG); passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, b_useNative, itemsPerWG, itemsPerInvocation) && passed; logTestOutcome(passed, itemsPerWG); @@ -320,7 +321,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu } template class Arithmetic, bool WorkgroupTest> - 
bool runTest(const smart_refctd_ptr& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, bool useNative, uint32_t itemsPerWG = ~0u, uint32_t itemsPerInvoc = 1u) + bool runTest(const smart_refctd_ptr& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, bool useNative, uint32_t itemsPerWG, uint32_t itemsPerInvoc = 1u) { std::string arith_name = Arithmetic>::name; const uint32_t workgroupSizeLog2 = hlsl::findMSB(workgroupSize); @@ -398,15 +399,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu auto pipeline = createPipeline(overriddenUnspecialized.get(),subgroupSizeLog2); // TODO: overlap dispatches with memory readbacks (requires multiple copies of `buffers`) - uint32_t workgroupCount; - if constexpr (WorkgroupTest) - workgroupCount = elementCount / itemsPerWG; - else - { - itemsPerWG = workgroupSize; - workgroupCount = elementCount / (itemsPerWG * itemsPerInvoc); - } - workgroupCount = min(workgroupCount, m_physicalDevice->getLimits().maxComputeWorkGroupCount[0]); + uint32_t workgroupCount = min(elementCount / itemsPerWG, m_physicalDevice->getLimits().maxComputeWorkGroupCount[0]); cmdbuf->begin(IGPUCommandBuffer::USAGE::NONE); cmdbuf->bindComputePipeline(pipeline.get()); @@ -441,21 +434,22 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu const ISemaphore::SWaitInfo wait[1] = {{.semaphore=sema.get(),.value=timelineValue}}; m_device->blockForSemaphores(wait); + const uint32_t subgroupSize = 1u << subgroupSizeLog2; // check results - bool passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc); - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed; - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed; - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed; 
- passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed; - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed; - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed; + bool passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, subgroupSize, itemsPerInvoc); + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, subgroupSize, itemsPerInvoc) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, subgroupSize, itemsPerInvoc) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, subgroupSize, itemsPerInvoc) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, subgroupSize, itemsPerInvoc) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, subgroupSize, itemsPerInvoc) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, subgroupSize, itemsPerInvoc) && passed; return passed; } //returns true if result matches template class Arithmetic, class Binop, bool WorkgroupTest> - bool validateResults(const uint32_t itemsPerWG, const uint32_t workgroupCount, const uint32_t itemsPerInvoc) + bool validateResults(const uint32_t itemsPerWG, const uint32_t workgroupCount, const uint32_t subgroupSize, const uint32_t itemsPerInvoc) { bool success = true; @@ -465,7 +459,6 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu using type_t = typename Binop::type_t; const auto dataFromBuffer = reinterpret_cast(resultsBuffer->getPointer()); - const auto subgroupSize = dataFromBuffer[0]; if (subgroupSizenbl::hlsl::subgroup::MaxSubgroupSize) { m_logger->log("Unexpected Subgroup Size %u", ILogger::ELL_ERROR, subgroupSize); @@ -475,11 +468,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu const auto testData = 
reinterpret_cast(dataFromBuffer + 1); // TODO: parallel for (the temporary values need to be threadlocal or what?) // now check if the data obtained has valid values - type_t* tmp; - if constexpr (WorkgroupTest) - tmp = new type_t[itemsPerWG]; - else - tmp = new type_t[itemsPerWG * itemsPerInvoc]; + type_t* tmp = new type_t[itemsPerWG]; for (uint32_t workgroupID = 0u; success && workgroupID < workgroupCount; workgroupID++) { if constexpr (WorkgroupTest) @@ -506,11 +495,12 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu } else { - const auto workgroupOffset = workgroupID * itemsPerWG * itemsPerInvoc; - for (uint32_t pseudoSubgroupID = 0u; pseudoSubgroupID < itemsPerWG; pseudoSubgroupID += subgroupSize) + const auto workgroupOffset = workgroupID * itemsPerWG; + const auto workgroupSize = itemsPerWG / itemsPerInvoc; + for (uint32_t pseudoSubgroupID = 0u; pseudoSubgroupID < workgroupSize; pseudoSubgroupID += subgroupSize) Arithmetic::impl(tmp + pseudoSubgroupID * itemsPerInvoc, inputData + workgroupOffset + pseudoSubgroupID * itemsPerInvoc, subgroupSize * itemsPerInvoc); - for (uint32_t localInvocationIndex = 0u; localInvocationIndex < itemsPerWG; localInvocationIndex++) + for (uint32_t localInvocationIndex = 0u; localInvocationIndex < workgroupSize; localInvocationIndex++) { const auto localOffset = localInvocationIndex * itemsPerInvoc; const auto globalInvocationIndex = workgroupOffset + localOffset; diff --git a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl index 58a3624cd..72a42f9a1 100644 --- a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl +++ b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl @@ -15,7 +15,7 @@ using config_t = workgroup2::ArithmeticConfiguration type_t; // final (level 1/2) scan needs to fit in one subgroup exactly -groupshared uint32_t scratch[config_t::SharedScratchElementCount]; +groupshared 
uint32_t scratch[mpl::max_v]; #include "../../common/include/WorkgroupDataAccessors.hlsl" From 1710b698621796aa767edf7bc940e55e6758c2a8 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 9 Jun 2025 12:20:43 +0700 Subject: [PATCH 228/296] push constant stores array of output addresses directly because static addressing --- .../app_resources/common.hlsl | 4 ++-- .../app_resources/testSubgroup.comp.hlsl | 2 +- .../app_resources/testWorkgroup.comp.hlsl | 3 +-- 23_Arithmetic2UnitTest/main.cpp | 21 ++----------------- .../app_resources/benchmarkSubgroup.comp.hlsl | 2 +- .../benchmarkWorkgroup.comp.hlsl | 2 -- 29_Arithmetic2Bench/app_resources/common.hlsl | 4 ++-- 29_Arithmetic2Bench/main.cpp | 17 +++------------ common/include/WorkgroupDataAccessors.hlsl | 2 +- 9 files changed, 13 insertions(+), 44 deletions(-) diff --git a/23_Arithmetic2UnitTest/app_resources/common.hlsl b/23_Arithmetic2UnitTest/app_resources/common.hlsl index ddf5dc00f..6654645cf 100644 --- a/23_Arithmetic2UnitTest/app_resources/common.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/common.hlsl @@ -4,7 +4,7 @@ struct PushConstantData { uint64_t pInputBuf; - uint64_t ppOutputBuf; + uint64_t pOutputBuf[8]; }; namespace arithmetic @@ -93,4 +93,4 @@ struct ballot : nbl::hlsl::plus }; } -#include "nbl/builtin/hlsl/subgroup/basic.hlsl" \ No newline at end of file +#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl" diff --git a/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl index 2adb4dc81..6cd496648 100644 --- a/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl @@ -22,7 +22,7 @@ static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) using config_t = subgroup2::Configuration; using params_t = subgroup2::ArithmeticParams; - const uint64_t outputBufAddr = vk::RawBufferLoad(pc.ppOutputBuf + Binop::BindingIndex * sizeof(uint64_t), sizeof(uint64_t)); + 
const uint64_t outputBufAddr = pc.pOutputBuf[Binop::BindingIndex]; if (glsl::gl_SubgroupSize()!=1u<(outputBufAddr, glsl::gl_SubgroupSize()); diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl index efaa25874..97ff31481 100644 --- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl @@ -51,9 +51,8 @@ struct operation_t template static void subtest() { - uint64_t outputBufAddr = vk::RawBufferLoad(pc.ppOutputBuf + Binop::BindingIndex * sizeof(uint64_t)); if (glsl::gl_SubgroupSize()!=1u<(outputBufAddr, glsl::gl_SubgroupSize()); + vk::RawBufferStore(pc.pOutputBuf[Binop::BindingIndex], glsl::gl_SubgroupSize()); operation_t func; func(); diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp index ad867bc92..85d6e610f 100644 --- a/23_Arithmetic2UnitTest/main.cpp +++ b/23_Arithmetic2UnitTest/main.cpp @@ -99,21 +99,9 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu auto bufferMem = m_device->allocate(mreq, outputBuffers[i].get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); assert(bufferMem.isValid()); } - - // create buffer to store BDA of output buffers - smart_refctd_ptr gpuOutputAddressesBuffer; - { - std::array outputAddresses; - for (uint32_t i = 0; i < OutputBufferCount; i++) - outputAddresses[i] = outputBuffers[i]->getDeviceAddress(); - - IGPUBuffer::SCreationParams params; - params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - params.size = OutputBufferCount * sizeof(uint64_t); - m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = getTransferUpQueue() }, std::move(params), outputAddresses.data()).move_into(gpuOutputAddressesBuffer); - } pc.pInputBuf = gpuinputDataBuffer->getDeviceAddress(); - 
pc.ppOutputBuf = gpuOutputAddressesBuffer->getDeviceAddress(); + for (uint32_t i = 0; i < OutputBufferCount; i++) + pc.pOutputBuf[i] = outputBuffers[i]->getDeviceAddress(); // create Pipeline Layout { @@ -459,11 +447,6 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu using type_t = typename Binop::type_t; const auto dataFromBuffer = reinterpret_cast(resultsBuffer->getPointer()); - if (subgroupSizenbl::hlsl::subgroup::MaxSubgroupSize) - { - m_logger->log("Unexpected Subgroup Size %u", ILogger::ELL_ERROR, subgroupSize); - return false; - } const auto testData = reinterpret_cast(dataFromBuffer + 1); // TODO: parallel for (the temporary values need to be threadlocal or what?) diff --git a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl index ba11890d1..2da7de38f 100644 --- a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl +++ b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl @@ -24,7 +24,7 @@ static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal) using params_t = subgroup2::ArithmeticParams; type_t value = sourceVal; - const uint64_t outputBufAddr = vk::RawBufferLoad(pc.ppOutputBuf + Binop::BindingIndex * sizeof(uint64_t), sizeof(uint64_t)); + const uint64_t outputBufAddr = pc.pOutputBuf[Binop::BindingIndex]; operation_t func; // [unroll] diff --git a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl index 72a42f9a1..ad861a30d 100644 --- a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl +++ b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl @@ -99,8 +99,6 @@ struct operation_t template static void subbench() { - const uint64_t outputBufAddr = vk::RawBufferLoad(pc.ppOutputBuf + Binop::BindingIndex * sizeof(uint64_t), sizeof(uint64_t)); - RandomizedInputDataProxy dataAccessor = RandomizedInputDataProxy::create(); 
dataAccessor.preload(); diff --git a/29_Arithmetic2Bench/app_resources/common.hlsl b/29_Arithmetic2Bench/app_resources/common.hlsl index 388be324f..0cdcd7dad 100644 --- a/29_Arithmetic2Bench/app_resources/common.hlsl +++ b/29_Arithmetic2Bench/app_resources/common.hlsl @@ -4,7 +4,7 @@ struct PushConstantData { uint64_t pInputBuf; - uint64_t ppOutputBuf; + uint64_t pOutputBuf[2]; }; namespace arithmetic @@ -32,4 +32,4 @@ struct ballot : nbl::hlsl::plus }; } -#include "nbl/builtin/hlsl/subgroup/basic.hlsl" +#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl" diff --git a/29_Arithmetic2Bench/main.cpp b/29_Arithmetic2Bench/main.cpp index 38f995264..d317f07df 100644 --- a/29_Arithmetic2Bench/main.cpp +++ b/29_Arithmetic2Bench/main.cpp @@ -246,7 +246,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub transferDownQueue = getTransferDownQueue(); computeQueue = getComputeQueue(); - // create 8 buffers for 8 operations + // create 2 buffers for 2 operations for (auto i=0u; iallocate(mreq, outputBuffers[i].get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); assert(bufferMem.isValid()); } - - // create buffer to store BDA of output buffers - { - std::array outputAddresses; - for (uint32_t i = 0; i < OutputBufferCount; i++) - outputAddresses[i] = outputBuffers[i]->getDeviceAddress(); - - IGPUBuffer::SCreationParams params; - params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - params.size = OutputBufferCount * sizeof(uint64_t); - m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = getTransferUpQueue() }, std::move(params), outputAddresses.data()).move_into(gpuOutputAddressesBuffer); - } - pc.ppOutputBuf = gpuOutputAddressesBuffer->getDeviceAddress(); + for (auto i = 0u; i < OutputBufferCount; i++) + pc.pOutputBuf[i] = outputBuffers[i]->getDeviceAddress(); // create image views for 
swapchain images for (uint32_t i = 0; i < ISwapchain::MaxImages; i++) diff --git a/common/include/WorkgroupDataAccessors.hlsl b/common/include/WorkgroupDataAccessors.hlsl index e6112f797..6beadfbc9 100644 --- a/common/include/WorkgroupDataAccessors.hlsl +++ b/common/include/WorkgroupDataAccessors.hlsl @@ -40,7 +40,7 @@ struct DataProxy { DataProxy retval; retval.workgroupOffset = glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize; - retval.outputBufAddr = sizeof(uint32_t) + vk::RawBufferLoad(pc.ppOutputBuf + Binop::BindingIndex * sizeof(uint64_t)); + retval.outputBufAddr = sizeof(uint32_t) + pc.pOutputBuf[Binop::BindingIndex]; return retval; } From 6c78e29707f3af7cba1ca67781bd71f8d7e35189 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 11 Jun 2025 16:01:03 +0200 Subject: [PATCH 229/296] rework PCH for examples, save work and test with 01 - successfully --- 01_HelloCoreSystemAsset/main.cpp | 4 +-- CMakeLists.txt | 33 ++++++++-------------- common/CMakeLists.txt | 18 +++--------- common/include/nbl/examples/PCH.hpp | 20 ++++++------- common/include/nbl/examples/api.hpp | 31 ++++++++++++++++++++ common/src/nbl/examples/CMakeLists.txt | 3 ++ common/src/nbl/examples/pch.cpp | 1 + common/src/nbl/examples/pch/CMakeLists.txt | 18 ------------ common/src/nbl/examples/pch/main.cpp | 9 ------ 9 files changed, 60 insertions(+), 77 deletions(-) create mode 100644 common/include/nbl/examples/api.hpp create mode 100644 common/src/nbl/examples/pch.cpp delete mode 100644 common/src/nbl/examples/pch/CMakeLists.txt delete mode 100644 common/src/nbl/examples/pch/main.cpp diff --git a/01_HelloCoreSystemAsset/main.cpp b/01_HelloCoreSystemAsset/main.cpp index 6a9188344..96e4a0d4e 100644 --- a/01_HelloCoreSystemAsset/main.cpp +++ b/01_HelloCoreSystemAsset/main.cpp @@ -2,8 +2,8 @@ // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h -// always include nabla first before std:: headers -#include "nabla.h" +// public interface and common examples API, always include first before std:: headers +#include "nbl/examples/api.hpp" #include "nbl/system/IApplicationFramework.h" diff --git a/CMakeLists.txt b/CMakeLists.txt index 789e96937..a9d9d046c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,26 +2,6 @@ # This file is part of the "Nabla Engine". # For conditions of distribution and use, see copyright notice in nabla.h -function(NBL_HOOK_COMMON_API NBL_EXCLUDE_TARGETS_LIST) - if(NOT TARGET nblExamplesAPI) - message(FATAL_ERROR "nblExamplesAPI not defined!") - endif() - - NBL_GET_ALL_TARGETS(NBL_TARGETS) - - foreach(NBL_TARGET IN LISTS NBL_TARGETS) - # TODO: exclude builtin targets created by examples as well - doesn't impact anything at all now - if(NOT ${NBL_TARGET} IN_LIST NBL_EXCLUDE_TARGETS_LIST) - - target_include_directories(${NBL_TARGET} PRIVATE $) - target_link_libraries(${NBL_TARGET} PRIVATE nblExamplesAPI) - endif() - endforeach() -endfunction() - -# PCH & CommonAPI library for Nabla framework examples -add_subdirectory(common EXCLUDE_FROM_ALL) - if(NBL_BUILD_EXAMPLES) if(NBL_BUILD_ANDROID) nbl_android_create_media_storage_apk() @@ -44,7 +24,7 @@ if(NBL_BUILD_EXAMPLES) # showcase the set-up of a swapchain and picking of a matching device add_subdirectory(08_HelloSwapchain EXCLUDE_FROM_ALL) add_subdirectory(09_GeometryCreator EXCLUDE_FROM_ALL) - # demonstrate the counting sort utility + # demonstrate the counting sort utility add_subdirectory(10_CountingSort EXCLUDE_FROM_ALL) # showcase use of FFT for post-FX Bloom effect add_subdirectory(11_FFT EXCLUDE_FROM_ALL) @@ -92,5 +72,14 @@ if(NBL_BUILD_EXAMPLES) add_subdirectory(70_FLIPFluids EXCLUDE_FROM_ALL) add_subdirectory(71_RayTracingPipeline EXCLUDE_FROM_ALL) - NBL_HOOK_COMMON_API("${NBL_EXAMPLES_API_TARGETS}") + NBL_GET_ALL_TARGETS(TARGETS) + + # PCH & CommonAPI 
library for Nabla framework examples + add_subdirectory(common EXCLUDE_FROM_ALL) + + foreach(T IN LISTS TARGETS) + target_link_libraries(${T} PUBLIC ${NBL_EXAMPLES_API_TARGET}) + target_include_directories(${T} PUBLIC $) + target_precompile_headers(${T} REUSE_FROM "${NBL_EXAMPLES_API_TARGET}") + endforeach() endif() diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 9560a8f42..3cdcce82d 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -1,17 +1,7 @@ -########################################### -# TODO: the way it should work is following (remove the comment once all done!) -# - one top PCH which includes -> currently not done -# - sources used only within examples splitted into "common libraries" (optional -> with options to toggle if include them to build tree), each common library should reuse the above top PCH -# - examples_tests CMake loop over example targets and hook the interface library with NBL_HOOK_COMMON_API [done] -# - each common library should declare ONLY interface and never expose source definition into headers nor any 3rdparty stuff! -## +nbl_create_ext_library_project(ExamplesAPI "" "${CMAKE_CURRENT_SOURCE_DIR}/src/nbl/examples/pch.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/include" "" "") -# interface libraries don't have build rules (except custom commands however it doesn't matter here) but properties -add_library(nblExamplesAPI INTERFACE) -set(NBL_EXAMPLES_API_INCLUDE_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/include") -target_include_directories(nblExamplesAPI INTERFACE "${NBL_EXAMPLES_API_INCLUDE_DIRECTORY}") +set_target_properties(${LIB_NAME} PROPERTIES DISABLE_PRECOMPILE_HEADERS OFF) +target_precompile_headers(${LIB_NAME} PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include/nbl/examples/PCH.hpp") add_subdirectory("src/nbl/examples" EXCLUDE_FROM_ALL) - -# TODO: Arek what was `NBL_EXECUTABLE_COMMON_API_TARGET` ? I removed it. 
-set(NBL_EXAMPLES_API_TARGETS nblExamplesAPI ${NBL_EXAMPLES_API_TARGETS} PARENT_SCOPE) +set(NBL_EXAMPLES_API_TARGET ${LIB_NAME} PARENT_SCOPE) \ No newline at end of file diff --git a/common/include/nbl/examples/PCH.hpp b/common/include/nbl/examples/PCH.hpp index 7a1b6bdc6..de686b26f 100644 --- a/common/include/nbl/examples/PCH.hpp +++ b/common/include/nbl/examples/PCH.hpp @@ -4,19 +4,15 @@ #ifndef _NBL_EXAMPLES_PCH_HPP_ #define _NBL_EXAMPLES_PCH_HPP_ - +//! public declarations +/* + NOTE: currently our whole public and private interface is broken + and private headers leak to public includes +*/ #include -// #include "nbl/ui/CGraphicalApplicationAndroid.h" -// #include "nbl/ui/CWindowManagerAndroid.h" - -#include "nbl/examples/common/SimpleWindowedApplication.hpp" -#include "nbl/examples/common/InputSystem.hpp" -#include "nbl/examples/common/CEventCallback.hpp" - -#include "nbl/examples/cameras/CCamera.hpp" - -#include "nbl/examples/geometry/CGeometryCreatorScene.hpp" - +//! note: one can add common std headers here not present in nabla.h or +//! any headers shared between examples, you cannot put there include +//! files which require extra preprocessor definitions #endif // _NBL_EXAMPLES_COMMON_PCH_HPP_ \ No newline at end of file diff --git a/common/include/nbl/examples/api.hpp b/common/include/nbl/examples/api.hpp new file mode 100644 index 000000000..0cb2278cb --- /dev/null +++ b/common/include/nbl/examples/api.hpp @@ -0,0 +1,31 @@ +// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_EXAMPLES_API_HPP_ +#define _NBL_EXAMPLES_API_HPP_ + +//! PCH for examples +/* + PCH is compiled only once *if* an example can be promoted to use it, it is + when its compile options & definitions set is the same as nblExamplesAPI's + each example links to, otherwise it compiles its own PCH +*/ +#include "nbl/examples/PCH.hpp" + +//! 
common headers used across examples +/* + NOTE: those *cannot* be used in PCH since they use unique + preprocessor definitions per example which would change + content of PCH +*/ +#include "nbl/examples/common/SimpleWindowedApplication.hpp" +#include "nbl/examples/common/InputSystem.hpp" +#include "nbl/examples/common/CEventCallback.hpp" + +#include "nbl/examples/cameras/CCamera.hpp" + +// broken? probably to refactor or even remove? +// #include "nbl/examples/geometry/CGeometryCreatorScene.hpp" + + +#endif // _NBL_EXAMPLES_API_HPP_ \ No newline at end of file diff --git a/common/src/nbl/examples/CMakeLists.txt b/common/src/nbl/examples/CMakeLists.txt index 96ccaabea..4fceed571 100644 --- a/common/src/nbl/examples/CMakeLists.txt +++ b/common/src/nbl/examples/CMakeLists.txt @@ -1,3 +1,6 @@ + + + # TODO: @AnastaZluk redo the PCH # add_subdirectory(pch EXCLUDE_FROM_ALL) diff --git a/common/src/nbl/examples/pch.cpp b/common/src/nbl/examples/pch.cpp new file mode 100644 index 000000000..39a146f1d --- /dev/null +++ b/common/src/nbl/examples/pch.cpp @@ -0,0 +1 @@ +#include "nbl/examples/PCH.hpp" \ No newline at end of file diff --git a/common/src/nbl/examples/pch/CMakeLists.txt b/common/src/nbl/examples/pch/CMakeLists.txt deleted file mode 100644 index 34f16c2d2..000000000 --- a/common/src/nbl/examples/pch/CMakeLists.txt +++ /dev/null @@ -1,18 +0,0 @@ -# TODO: let arek figure out how to redo the PCH -#[===[ -include(common RESULT_VARIABLE RES) -if(NOT RES) - message(FATAL_ERROR "common.cmake not found. 
Should be in '${NBL_ROOT_PATH}/cmake' directory") -endif() - -nbl_create_executable_project("" "" "" "" "") - -set(NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET "${EXECUTABLE_NAME}" CACHE INTERNAL "") -get_target_property(NBL_NABLA_TARGET_SOURCE_DIR Nabla SOURCE_DIR) -set_target_properties("${EXECUTABLE_NAME}" PROPERTIES DISABLE_PRECOMPILE_HEADERS OFF) -target_precompile_headers("${EXECUTABLE_NAME}" PUBLIC - "${CMAKE_CURRENT_SOURCE_DIR}/PCH.hpp" # Common PCH for examples - "${NBL_NABLA_TARGET_SOURCE_DIR}/pch.h" # Nabla's PCH -) -unset(NBL_NABLA_TARGET_SOURCE_DIR) -]===] \ No newline at end of file diff --git a/common/src/nbl/examples/pch/main.cpp b/common/src/nbl/examples/pch/main.cpp deleted file mode 100644 index c19ee3c45..000000000 --- a/common/src/nbl/examples/pch/main.cpp +++ /dev/null @@ -1,9 +0,0 @@ -// Copyright (C) 2018-2022 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". -// For conditions of distribution and use, see copyright notice in nabla.h - -int main(int argc, char** argv) -{ - return 0; -} - From fdae6f916057d539a02f1329bb90e687f5ca70d0 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 11 Jun 2025 16:34:37 +0200 Subject: [PATCH 230/296] actually some example headers I removed can be used in PCH with small tmp trick #include "nbl/examples/common/SimpleWindowedApplication.hpp" #include "nbl/examples/common/InputSystem.hpp" #include "nbl/examples/common/CEventCallback.hpp" #include "nbl/examples/cameras/CCamera.hpp" --- common/include/nbl/examples/PCH.hpp | 6 ++++++ common/include/nbl/examples/api.hpp | 11 ++--------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/common/include/nbl/examples/PCH.hpp b/common/include/nbl/examples/PCH.hpp index de686b26f..f4fbe377c 100644 --- a/common/include/nbl/examples/PCH.hpp +++ b/common/include/nbl/examples/PCH.hpp @@ -11,6 +11,12 @@ */ #include +#include "nbl/examples/common/SimpleWindowedApplication.hpp" +#include 
"nbl/examples/common/InputSystem.hpp" +#include "nbl/examples/common/CEventCallback.hpp" + +#include "nbl/examples/cameras/CCamera.hpp" + //! note: one can add common std headers here not present in nabla.h or //! any headers shared between examples, you cannot put there include //! files which require extra preprocessor definitions diff --git a/common/include/nbl/examples/api.hpp b/common/include/nbl/examples/api.hpp index 0cb2278cb..9b809b8ea 100644 --- a/common/include/nbl/examples/api.hpp +++ b/common/include/nbl/examples/api.hpp @@ -12,17 +12,10 @@ */ #include "nbl/examples/PCH.hpp" -//! common headers used across examples +//! common headers used across examples which cannot be part of PCH /* - NOTE: those *cannot* be used in PCH since they use unique - preprocessor definitions per example which would change - content of PCH + NOTE: put here if a header requires defines which may be differ */ -#include "nbl/examples/common/SimpleWindowedApplication.hpp" -#include "nbl/examples/common/InputSystem.hpp" -#include "nbl/examples/common/CEventCallback.hpp" - -#include "nbl/examples/cameras/CCamera.hpp" // broken? probably to refactor or even remove? 
// #include "nbl/examples/geometry/CGeometryCreatorScene.hpp" From ed51dee2d394663dad3e0d0adfad10fd6df120ca Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 11 Jun 2025 16:49:45 +0200 Subject: [PATCH 231/296] remove old header references from 29_MeshLoaders/main.cpp which are included in PCH now though the example doesn't compile anyway (misses imgui link and more) --- 29_MeshLoaders/main.cpp | 7 ------- common/include/nbl/examples/PCH.hpp | 2 +- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/29_MeshLoaders/main.cpp b/29_MeshLoaders/main.cpp index feb52936a..6afb74a5c 100644 --- a/29_MeshLoaders/main.cpp +++ b/29_MeshLoaders/main.cpp @@ -6,13 +6,6 @@ #include "nbl/asset/utils/CGeometryCreator.h" #include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" -#include "SimpleWindowedApplication.hpp" - -#include "InputSystem.hpp" -#include "CEventCallback.hpp" - -#include "CCamera.hpp" - #include #include diff --git a/common/include/nbl/examples/PCH.hpp b/common/include/nbl/examples/PCH.hpp index f4fbe377c..58269d652 100644 --- a/common/include/nbl/examples/PCH.hpp +++ b/common/include/nbl/examples/PCH.hpp @@ -19,6 +19,6 @@ //! note: one can add common std headers here not present in nabla.h or //! any headers shared between examples, you cannot put there include -//! files which require extra preprocessor definitions +//! 
files which require unique preprocessor definitions for each example #endif // _NBL_EXAMPLES_COMMON_PCH_HPP_ \ No newline at end of file From 971ee343256efa218a76297887d17d471ef6d414 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 11 Jun 2025 17:08:34 +0200 Subject: [PATCH 232/296] wipe all EXCLUDE_FROM_ALL except common directory, group all examples into project to allow build all at once --- CMakeLists.txt | 84 ++++++++++++++++++++++++++------------------------ 1 file changed, 43 insertions(+), 41 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a9d9d046c..a6164dbfd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,74 +3,76 @@ # For conditions of distribution and use, see copyright notice in nabla.h if(NBL_BUILD_EXAMPLES) + project(NablaExamples) + if(NBL_BUILD_ANDROID) nbl_android_create_media_storage_apk() endif() # showcase the use of `nbl::core`,`nbl::system` and `nbl::asset` - add_subdirectory(01_HelloCoreSystemAsset EXCLUDE_FROM_ALL) + add_subdirectory(01_HelloCoreSystemAsset) # showcase the use of `system::IApplicationFramework` and `nbl::video` - add_subdirectory(02_HelloCompute EXCLUDE_FROM_ALL) + add_subdirectory(02_HelloCompute) # showcase physical device selection, resource embedding and the use of identical headers in HLSL and C++ - add_subdirectory(03_DeviceSelectionAndSharedSources EXCLUDE_FROM_ALL) + add_subdirectory(03_DeviceSelectionAndSharedSources) # showcase the creation of windows and polling for input - add_subdirectory(04_HelloUI EXCLUDE_FROM_ALL) + add_subdirectory(04_HelloUI) # showcase the semi-advanced use of Nabla's Streaming Buffers and BDA - add_subdirectory(05_StreamingAndBufferDeviceAddressApp EXCLUDE_FROM_ALL) + add_subdirectory(05_StreamingAndBufferDeviceAddressApp) # showcase the use of a graphics queue - add_subdirectory(06_HelloGraphicsQueue EXCLUDE_FROM_ALL) + add_subdirectory(06_HelloGraphicsQueue) # showcase the set-up of multiple queues - add_subdirectory(07_StagingAndMultipleQueues 
EXCLUDE_FROM_ALL) + add_subdirectory(07_StagingAndMultipleQueues) # showcase the set-up of a swapchain and picking of a matching device - add_subdirectory(08_HelloSwapchain EXCLUDE_FROM_ALL) - add_subdirectory(09_GeometryCreator EXCLUDE_FROM_ALL) + add_subdirectory(08_HelloSwapchain) + add_subdirectory(09_GeometryCreator) # demonstrate the counting sort utility - add_subdirectory(10_CountingSort EXCLUDE_FROM_ALL) + add_subdirectory(10_CountingSort) # showcase use of FFT for post-FX Bloom effect - add_subdirectory(11_FFT EXCLUDE_FROM_ALL) + add_subdirectory(11_FFT) # Waiting for a refactor - #add_subdirectory(27_PLYSTLDemo EXCLUDE_FROM_ALL) - #add_subdirectory(33_Draw3DLine EXCLUDE_FROM_ALL) + #add_subdirectory(27_PLYSTLDemo) + #add_subdirectory(33_Draw3DLine) # Unit Test Examples - add_subdirectory(20_AllocatorTest EXCLUDE_FROM_ALL) - add_subdirectory(21_LRUCacheUnitTest EXCLUDE_FROM_ALL) - add_subdirectory(22_CppCompat EXCLUDE_FROM_ALL) - add_subdirectory(23_ArithmeticUnitTest EXCLUDE_FROM_ALL) - add_subdirectory(24_ColorSpaceTest EXCLUDE_FROM_ALL) - add_subdirectory(25_FilterTest EXCLUDE_FROM_ALL) - add_subdirectory(26_Blur EXCLUDE_FROM_ALL) - add_subdirectory(27_MPMCScheduler EXCLUDE_FROM_ALL) - add_subdirectory(28_FFTBloom EXCLUDE_FROM_ALL) - add_subdirectory(29_MeshLoaders EXCLUDE_FROM_ALL) - # add_subdirectory(36_CUDAInterop EXCLUDE_FROM_ALL) + add_subdirectory(20_AllocatorTest) + add_subdirectory(21_LRUCacheUnitTest) + add_subdirectory(22_CppCompat) + add_subdirectory(23_ArithmeticUnitTest) + add_subdirectory(24_ColorSpaceTest) + add_subdirectory(25_FilterTest) + add_subdirectory(26_Blur) + add_subdirectory(27_MPMCScheduler) + add_subdirectory(28_FFTBloom) + add_subdirectory(29_MeshLoaders) + # add_subdirectory(36_CUDAInterop) # Showcase compute pathtracing - add_subdirectory(30_ComputeShaderPathTracer EXCLUDE_FROM_ALL) + add_subdirectory(30_ComputeShaderPathTracer) - add_subdirectory(38_EXRSplit EXCLUDE_FROM_ALL) + add_subdirectory(38_EXRSplit) # if 
(NBL_BUILD_MITSUBA_LOADER AND NBL_BUILD_OPTIX) - # add_subdirectory(39_DenoiserTonemapper EXCLUDE_FROM_ALL) + # add_subdirectory(39_DenoiserTonemapper) # endif() - #add_subdirectory(43_SumAndCDFFilters EXCLUDE_FROM_ALL) - add_subdirectory(47_DerivMapTest EXCLUDE_FROM_ALL) - add_subdirectory(54_Transformations EXCLUDE_FROM_ALL) - add_subdirectory(55_RGB18E7S3 EXCLUDE_FROM_ALL) - add_subdirectory(61_UI EXCLUDE_FROM_ALL) - add_subdirectory(62_CAD EXCLUDE_FROM_ALL) - add_subdirectory(62_SchusslerTest EXCLUDE_FROM_ALL) - add_subdirectory(64_EmulatedFloatTest EXCLUDE_FROM_ALL) - add_subdirectory(0_ImportanceSamplingEnvMaps EXCLUDE_FROM_ALL) #TODO: integrate back into 42 + #add_subdirectory(43_SumAndCDFFilters) + add_subdirectory(47_DerivMapTest) + add_subdirectory(54_Transformations) + add_subdirectory(55_RGB18E7S3) + add_subdirectory(61_UI) + add_subdirectory(62_CAD) + add_subdirectory(62_SchusslerTest) + add_subdirectory(64_EmulatedFloatTest) + add_subdirectory(0_ImportanceSamplingEnvMaps) #TODO: integrate back into 42 - add_subdirectory(66_HLSLBxDFTests EXCLUDE_FROM_ALL) - add_subdirectory(67_RayQueryGeometry EXCLUDE_FROM_ALL) - add_subdirectory(68_JpegLoading EXCLUDE_FROM_ALL) + add_subdirectory(66_HLSLBxDFTests) + add_subdirectory(67_RayQueryGeometry) + add_subdirectory(68_JpegLoading) - add_subdirectory(70_FLIPFluids EXCLUDE_FROM_ALL) - add_subdirectory(71_RayTracingPipeline EXCLUDE_FROM_ALL) + add_subdirectory(70_FLIPFluids) + add_subdirectory(71_RayTracingPipeline) NBL_GET_ALL_TARGETS(TARGETS) From c9f610f7adbed4f572cf834c79ffd6d09b7c47bf Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 11 Jun 2025 17:51:41 +0200 Subject: [PATCH 233/296] use NBL_ADJUST_FOLDERS with examples namespace --- CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a6164dbfd..82daaccb3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -84,4 +84,6 @@ if(NBL_BUILD_EXAMPLES) target_include_directories(${T} 
PUBLIC $) target_precompile_headers(${T} REUSE_FROM "${NBL_EXAMPLES_API_TARGET}") endforeach() -endif() + + NBL_ADJUST_FOLDERS(examples) +endif() \ No newline at end of file From b9f3f13bc42167243bdab2edb624ab0d76b8a878 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 13 Jun 2025 17:16:05 +0700 Subject: [PATCH 234/296] Fix example to use reworked shader spec interface --- 02_HelloCompute/main.cpp | 1 - 03_DeviceSelectionAndSharedSources/Testers.h | 5 ++- 03_DeviceSelectionAndSharedSources/main.cpp | 5 ++- .../main.cpp | 1 - 07_StagingAndMultipleQueues/main.cpp | 1 - 10_CountingSort/main.cpp | 5 ++- 11_FFT/main.cpp | 5 ++- 22_CppCompat/ITester.h | 1 - 22_CppCompat/main.cpp | 1 - 23_ArithmeticUnitTest/main.cpp | 5 ++- 24_ColorSpaceTest/main.cpp | 3 +- 26_Blur/main.cpp | 3 +- 27_MPMCScheduler/main.cpp | 3 +- 30_ComputeShaderPathTracer/main.cpp | 8 ++--- 67_RayQueryGeometry/main.cpp | 1 - common/include/CGeomtryCreatorScene.hpp | 34 +++++++++++-------- 16 files changed, 36 insertions(+), 46 deletions(-) diff --git a/02_HelloCompute/main.cpp b/02_HelloCompute/main.cpp index 63a9f8832..32812fb1a 100644 --- a/02_HelloCompute/main.cpp +++ b/02_HelloCompute/main.cpp @@ -169,7 +169,6 @@ class HelloComputeApp final : public nbl::application_templates::MonoSystemMonoL // Theoretically a blob of SPIR-V can contain multiple named entry points and one has to be chosen, in practice most compilers only support outputting one (and glslang used to require it be called "main") params.shader.entryPoint = "main"; params.shader.shader = shader.get(); - params.shader.stage = hlsl::ESS_COMPUTE; // we'll cover the specialization constant API in another example if (!device->createComputePipelines(nullptr,{&params,1},&pipeline)) return logFail("Failed to create pipelines (compile & link shaders)!\n"); diff --git a/03_DeviceSelectionAndSharedSources/Testers.h b/03_DeviceSelectionAndSharedSources/Testers.h index b21da71c4..9a4016d20 100644 --- a/03_DeviceSelectionAndSharedSources/Testers.h +++
b/03_DeviceSelectionAndSharedSources/Testers.h @@ -254,10 +254,9 @@ class PredefinedLayoutTester final : public IntrospectionTesterBase bool pplnCreationSuccess[MERGE_TEST_SHADERS_CNT]; for (uint32_t i = 0u; i < MERGE_TEST_SHADERS_CNT; ++i) { - IPipelineBase::SShaderSpecInfo specInfo; + ICPUPipelineBase::SShaderSpecInfo specInfo; specInfo.entryPoint = "main"; - specInfo.shader = sources[i].get(); - specInfo.stage = hlsl::ShaderStage::ESS_COMPUTE; + specInfo.shader = sources[i]; pplnCreationSuccess[i] = static_cast(introspector[i].createApproximateComputePipelineFromIntrospection(specInfo, core::smart_refctd_ptr(predefinedPplnLayout))); } diff --git a/03_DeviceSelectionAndSharedSources/main.cpp b/03_DeviceSelectionAndSharedSources/main.cpp index 3712b5719..5fb584e4d 100644 --- a/03_DeviceSelectionAndSharedSources/main.cpp +++ b/03_DeviceSelectionAndSharedSources/main.cpp @@ -61,10 +61,9 @@ class DeviceSelectionAndSharedSourcesApp final : public application_templates::M //shaderIntrospection->debugPrint(m_logger.get()); // We've now skipped the manual creation of a descriptor set layout, pipeline layout - IPipelineBase::SShaderSpecInfo specInfo; + ICPUPipelineBase::SShaderSpecInfo specInfo; specInfo.entryPoint = "main"; - specInfo.shader = source.get(); - specInfo.stage = hlsl::ShaderStage::ESS_COMPUTE; + specInfo.shader = source; smart_refctd_ptr cpuPipeline = introspector.createApproximateComputePipelineFromIntrospection(specInfo); diff --git a/05_StreamingAndBufferDeviceAddressApp/main.cpp b/05_StreamingAndBufferDeviceAddressApp/main.cpp index 96ccce9f5..c6c537363 100644 --- a/05_StreamingAndBufferDeviceAddressApp/main.cpp +++ b/05_StreamingAndBufferDeviceAddressApp/main.cpp @@ -135,7 +135,6 @@ class StreamingAndBufferDeviceAddressApp final : public application_templates::M params.layout = layout.get(); params.shader.shader = shader.get(); params.shader.entryPoint = "main"; - params.shader.stage = hlsl::ShaderStage::ESS_COMPUTE; if 
(!m_device->createComputePipelines(nullptr,{&params,1},&m_pipeline)) return logFail("Failed to create compute pipeline!\n"); } diff --git a/07_StagingAndMultipleQueues/main.cpp b/07_StagingAndMultipleQueues/main.cpp index 23f2246bc..3e79bdfed 100644 --- a/07_StagingAndMultipleQueues/main.cpp +++ b/07_StagingAndMultipleQueues/main.cpp @@ -311,7 +311,6 @@ class StagingAndMultipleQueuesApp final : public application_templates::BasicMul // Theoretically a blob of SPIR-V can contain multiple named entry points and one has to be chosen, in practice most compilers only support outputting one (and glslang used to require it be called "main") params.shader.entryPoint = "main"; params.shader.shader = shader.get(); - params.shader.stage = hlsl::ShaderStage::ESS_COMPUTE; // we'll cover the specialization constant API in another example if (!m_device->createComputePipelines(nullptr,{&params,1},&pipeline)) logFailAndTerminate("Failed to create pipelines (compile & link shaders)!\n"); diff --git a/10_CountingSort/main.cpp b/10_CountingSort/main.cpp index 1fd789ad1..de2ffca8b 100644 --- a/10_CountingSort/main.cpp +++ b/10_CountingSort/main.cpp @@ -92,10 +92,9 @@ class CountingSortApp final : public application_templates::MonoDeviceApplicatio params.layout = layout.get(); params.shader.shader = prefixSumShader.get(); params.shader.entryPoint = "main"; - params.shader.stage = hlsl::ShaderStage::ESS_COMPUTE; params.shader.entries = nullptr; - params.shader.requireFullSubgroups = true; - params.shader.requiredSubgroupSize = static_cast(5); + params.shader.requiredSubgroupSize = static_cast(5); + params.cached.requireFullSubgroups = true; if (!m_device->createComputePipelines(nullptr, { &params,1 }, &prefixSumPipeline)) return logFail("Failed to create compute pipeline!\n"); params.shader.shader = scatterShader.get(); diff --git a/11_FFT/main.cpp b/11_FFT/main.cpp index 1cac98b1f..ad9bbfd47 100644 --- a/11_FFT/main.cpp +++ b/11_FFT/main.cpp @@ -133,9 +133,8 @@ class FFT_Test final : public
application_templates::MonoDeviceApplication, publ params.layout = layout.get(); params.shader.shader = shader.get(); params.shader.entryPoint = "main"; - params.shader.stage = hlsl::ESS_COMPUTE; - params.shader.requiredSubgroupSize = static_cast(hlsl::findMSB(m_physicalDevice->getLimits().maxSubgroupSize)); - params.shader.requireFullSubgroups = true; + params.shader.requiredSubgroupSize = static_cast(hlsl::findMSB(m_physicalDevice->getLimits().maxSubgroupSize)); + params.cached.requireFullSubgroups = true; if (!m_device->createComputePipelines(nullptr, { &params,1 }, &m_pipeline)) return logFail("Failed to create compute pipeline!\n"); } diff --git a/22_CppCompat/ITester.h b/22_CppCompat/ITester.h index 273f51663..32138f198 100644 --- a/22_CppCompat/ITester.h +++ b/22_CppCompat/ITester.h @@ -113,7 +113,6 @@ class ITester params.layout = m_pplnLayout.get(); params.shader.entryPoint = "main"; params.shader.shader = shader.get(); - params.shader.stage = shaderStage; if (!m_device->createComputePipelines(nullptr, { &params,1 }, &m_pipeline)) logFail("Failed to create pipelines (compile & link shaders)!\n"); } diff --git a/22_CppCompat/main.cpp b/22_CppCompat/main.cpp index 877831c55..a5a819d49 100644 --- a/22_CppCompat/main.cpp +++ b/22_CppCompat/main.cpp @@ -128,7 +128,6 @@ class CompatibilityTest final : public MonoDeviceApplication, public MonoAssetMa params.layout = layout.get(); params.shader.shader = shader.get(); params.shader.entryPoint = "main"; - params.shader.stage = hlsl::ShaderStage::ESS_COMPUTE; if (!m_device->createComputePipelines(nullptr, { &params,1 }, &m_pipeline)) return logFail("Failed to create compute pipeline!\n"); } diff --git a/23_ArithmeticUnitTest/main.cpp b/23_ArithmeticUnitTest/main.cpp index e2d7d3cfe..12b9d3c2d 100644 --- a/23_ArithmeticUnitTest/main.cpp +++ b/23_ArithmeticUnitTest/main.cpp @@ -284,11 +284,10 @@ class ArithmeticUnitTestApp final : public application_templates::BasicMultiQueu params.shader = { .shader = shader.get(), .entryPoint =
"main", - .stage = hlsl::ESS_COMPUTE, - .requiredSubgroupSize = static_cast(subgroupSizeLog2), - .requireFullSubgroups = true, + .requiredSubgroupSize = static_cast(subgroupSizeLog2), .entries = nullptr, }; + params.cached.requireFullSubgroups = true; core::smart_refctd_ptr pipeline; if (!m_device->createComputePipelines(m_spirv_isa_cache.get(),{&params,1},&pipeline)) return nullptr; diff --git a/24_ColorSpaceTest/main.cpp b/24_ColorSpaceTest/main.cpp index 1c23a3f2f..fae93cf45 100644 --- a/24_ColorSpaceTest/main.cpp +++ b/24_ColorSpaceTest/main.cpp @@ -260,10 +260,9 @@ class ColorSpaceTestSampleApp final : public examples::SimpleWindowedApplication .size = sizeof(push_constants_t) }; auto layout = m_device->createPipelineLayout({ &range,1 }, nullptr, nullptr, nullptr, core::smart_refctd_ptr(dsLayout)); - const IPipelineBase::SShaderSpecInfo fragSpec = { + const IGPUPipelineBase::SShaderSpecInfo fragSpec = { .shader = fragmentShader.get(), .entryPoint = "main", - .stage = ESS_FRAGMENT, }; m_pipeline = fsTriProtoPPln.createPipeline(fragSpec, layout.get(), scResources->getRenderpass()/*,default is subpass 0*/); if (!m_pipeline) diff --git a/26_Blur/main.cpp b/26_Blur/main.cpp index 4910ba5f0..bd4b6dedc 100644 --- a/26_Blur/main.cpp +++ b/26_Blur/main.cpp @@ -282,8 +282,7 @@ class BlurApp final : public examples::SimpleWindowedApplication, public applica params.layout = layout.get(); params.shader.shader = shader.get(); params.shader.entryPoint = "main"; - params.shader.stage = hlsl::ShaderStage::ESS_COMPUTE; - params.shader.requireFullSubgroups = true; + params.cached.requireFullSubgroups = true; if (!m_device->createComputePipelines(nullptr, { &params, 1 }, &m_ppln)) return logFail("Failed to create Pipeline"); } diff --git a/27_MPMCScheduler/main.cpp b/27_MPMCScheduler/main.cpp index 03275d114..33768c981 100644 --- a/27_MPMCScheduler/main.cpp +++ b/27_MPMCScheduler/main.cpp @@ -115,8 +115,7 @@ class MPMCSchedulerApp final : public examples::SimpleWindowedApplication, publi
params.layout = layout.get(); params.shader.shader = shader.get(); params.shader.entryPoint = "main"; - params.shader.stage = hlsl::ShaderStage::ESS_COMPUTE; - params.shader.requireFullSubgroups = true; + params.cached.requireFullSubgroups = true; if (!m_device->createComputePipelines(nullptr, { &params, 1 }, &m_ppln)) return logFail("Failed to create Pipeline"); } diff --git a/30_ComputeShaderPathTracer/main.cpp b/30_ComputeShaderPathTracer/main.cpp index 44a4dd6ef..2fa7bfc0b 100644 --- a/30_ComputeShaderPathTracer/main.cpp +++ b/30_ComputeShaderPathTracer/main.cpp @@ -351,10 +351,9 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication params.layout = ptPipelineLayout.get(); params.shader.shader = ptShader.get(); params.shader.entryPoint = "main"; - params.shader.stage = ESS_COMPUTE; params.shader.entries = nullptr; - params.shader.requireFullSubgroups = true; - params.shader.requiredSubgroupSize = static_cast(5); + params.cached.requireFullSubgroups = true; + params.shader.requiredSubgroupSize = static_cast(5); if (!m_device->createComputePipelines(nullptr, { &params, 1 }, m_PTPipelines.data() + index)) { return logFail("Failed to create compute pipeline!\n"); } @@ -373,10 +372,9 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication if (!fragmentShader) return logFail("Failed to Load and Compile Fragment Shader: lumaMeterShader!"); - const IPipelineBase::SShaderSpecInfo fragSpec = { + const IGPUPipelineBase::SShaderSpecInfo fragSpec = { .shader = fragmentShader.get(), .entryPoint = "main", - .stage = ESS_FRAGMENT, }; auto presentLayout = m_device->createPipelineLayout( diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp index 4c09da5da..f4fa38aa1 100644 --- a/67_RayQueryGeometry/main.cpp +++ b/67_RayQueryGeometry/main.cpp @@ -203,7 +203,6 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu params.layout = pipelineLayout.get(); params.shader.shader =
shader.get(); params.shader.entryPoint = "main"; - params.shader.stage = ESS_COMPUTE; if (!m_device->createComputePipelines(nullptr, { &params, 1 }, &renderPipeline)) return logFail("Failed to create compute pipeline"); } diff --git a/common/include/CGeomtryCreatorScene.hpp b/common/include/CGeomtryCreatorScene.hpp index 6ffad2c73..8cbce35c4 100644 --- a/common/include/CGeomtryCreatorScene.hpp +++ b/common/include/CGeomtryCreatorScene.hpp @@ -824,7 +824,6 @@ class ResourceBuilder { SBlendParams blend; SRasterizationParams rasterization; - typename Types::graphics_pipeline_t::SCreationParams pipeline; } params; { @@ -842,16 +841,6 @@ class ResourceBuilder params.rasterization.faceCullingMode = EFCM_NONE; { - const IPipelineBase::SShaderSpecInfo info [] = - { - {.shader = scratch.shaders[inGeometry.shadersType].vertex.get(), .entryPoint = "VSMain", .stage = hlsl::ShaderStage::ESS_VERTEX}, - {.shader = scratch.shaders[inGeometry.shadersType].fragment.get(), .entryPoint = "PSMain", .stage = hlsl::ShaderStage::ESS_FRAGMENT}, - }; - - params.pipeline.layout = scratch.pipelineLayout.get(); - params.pipeline.shaders = info; - params.pipeline.renderpass = scratch.renderpass.get(); - params.pipeline.cached = { .vertexInput = inGeometry.data.inputParams, .primitiveAssembly = inGeometry.data.assemblyParams, .rasterization = params.rasterization, .blend = params.blend, .subpassIx = 0u }; obj.indexCount = inGeometry.data.indexCount; obj.indexType = inGeometry.data.indexType; @@ -859,11 +848,28 @@ class ResourceBuilder // TODO: cache pipeline & try lookup for existing one first maybe // similar issue like with shaders again, in this case gpu contructor allows for extra cache parameters + there is no constructor you can use to fire make_smart_refctd_ptr yourself for cpu - if constexpr (withAssetConverter) - obj.pipeline = ICPUGraphicsPipeline::create(params.pipeline); + if constexpr (withAssetConverter) { + + obj.pipeline = ICPUGraphicsPipeline::create(scratch.pipelineLayout.get(),
scratch.renderpass.get()); + obj.pipeline->getCachedCreationParams() = { + .vertexInput = inGeometry.data.inputParams, + .primitiveAssembly = inGeometry.data.assemblyParams, + .rasterization = params.rasterization, + .blend = params.blend, + .subpassIx = 0u + }; + *obj.pipeline->getSpecInfo(hlsl::ESS_VERTEX) = { .shader = scratch.shaders[inGeometry.shadersType].vertex, .entryPoint = "VSMain" }; + *obj.pipeline->getSpecInfo(hlsl::ESS_FRAGMENT) = { .shader = scratch.shaders[inGeometry.shadersType].fragment, .entryPoint = "PSMain" }; + } else { - const std::array info = { { params.pipeline } }; + IGPUGraphicsPipeline::SCreationParams createParams = {}; + createParams.layout = scratch.pipelineLayout.get(); + createParams.vertexShader = {.shader = scratch.shaders[inGeometry.shadersType].vertex.get(), .entryPoint = "VSMain" }; + createParams.fragmentShader = { .shader = scratch.shaders[inGeometry.shadersType].fragment.get(), .entryPoint = "PSMain" }; + createParams.renderpass = scratch.renderpass.get(); + createParams.cached = { .vertexInput = inGeometry.data.inputParams, .primitiveAssembly = inGeometry.data.assemblyParams, .rasterization = params.rasterization, .blend = params.blend, .subpassIx = 0u }; + const std::array info = { { createParams } }; utilities->getLogicalDevice()->createGraphicsPipelines(nullptr, info, &obj.pipeline); } From 60319f442bacf210404035cc5daef042f169d2ff Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 13 Jun 2025 17:28:21 +0700 Subject: [PATCH 235/296] Fix picking the wrong diff when merging with master --- 67_RayQueryGeometry/main.cpp | 830 ++++++++++++++--------------------- 1 file changed, 330 insertions(+), 500 deletions(-) diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp index f4fa38aa1..7371cf1ea 100644 --- a/67_RayQueryGeometry/main.cpp +++ b/67_RayQueryGeometry/main.cpp @@ -1,7 +1,6 @@ // Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h - #include "common.hpp" class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication @@ -126,11 +125,8 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu auto cQueue = getComputeQueue(); - // create geometry objects - if (!createGeometries(gQueue, geometryCreator)) - return logFail("Could not create geometries from geometry creator"); - // create blas/tlas + renderDs = //#define TRY_BUILD_FOR_NGFX // Validation errors on the fake Acquire-Presents, TODO fix #ifdef TRY_BUILD_FOR_NGFX // Nsight is special and can't do debugger delay so you can debug your CPU stuff during a capture @@ -142,11 +138,12 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu std::this_thread::yield(); } // Nsight is special and can't capture anything not on the queue that performs the swapchain acquire/release - if (!createAccelerationStructures(gQueue)) + createAccelerationStructureDS(gQueue,geometryCreator); #else - if (!createAccelerationStructures(cQueue)) + createAccelerationStructureDS(cQueue,geometryCreator); #endif - return logFail("Could not create acceleration structures"); + if (!renderDs) + return logFail("Could not create acceleration structures and descriptor set"); // create pipelines { @@ -164,67 +161,38 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu const auto assets = bundle.getContents(); assert(assets.size() == 1); - smart_refctd_ptr shaderSrc = IAsset::castDown(assets[0]); - auto shader = m_device->compileShader({ shaderSrc.get() }); + smart_refctd_ptr shaderSrc = IAsset::castDown(assets[0]); + shaderSrc->setShaderStage(IShader::E_SHADER_STAGE::ESS_COMPUTE); + auto shader = m_device->createShader(shaderSrc.get()); if (!shader) return logFail("Failed to create shader!"); - // descriptors - 
IGPUDescriptorSetLayout::SBinding bindings[] = { - { - .binding = 0, - .type = asset::IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE, - .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, - .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, - .count = 1, - }, - { - .binding = 1, - .type = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE, - .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, - .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, - .count = 1, - } - }; - auto descriptorSetLayout = m_device->createDescriptorSetLayout(bindings); - - const std::array dsLayoutPtrs = { descriptorSetLayout.get() }; - renderPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, std::span(dsLayoutPtrs.begin(), dsLayoutPtrs.end())); - if (!renderPool) - return logFail("Could not create descriptor pool"); - renderDs = renderPool->createDescriptorSet(descriptorSetLayout); - if (!renderDs) - return logFail("Could not create descriptor set"); - SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, .offset = 0u, .size = sizeof(SPushConstants)}; - auto pipelineLayout = m_device->createPipelineLayout({ &pcRange, 1 }, smart_refctd_ptr(descriptorSetLayout), nullptr, nullptr, nullptr); + auto pipelineLayout = m_device->createPipelineLayout({ &pcRange, 1 }, smart_refctd_ptr(renderDs->getLayout()), nullptr, nullptr, nullptr); IGPUComputePipeline::SCreationParams params = {}; params.layout = pipelineLayout.get(); params.shader.shader = shader.get(); - params.shader.entryPoint = "main"; if (!m_device->createComputePipelines(nullptr, { ¶ms, 1 }, &renderPipeline)) return logFail("Failed to create compute pipeline"); } // write descriptors - IGPUDescriptorSet::SDescriptorInfo infos[2]; - infos[0].desc = gpuTlas; - infos[1].desc = m_device->createImageView({ - .flags = IGPUImageView::ECF_NONE, - .subUsages = IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT, - .image = 
outHDRImage, - .viewType = IGPUImageView::E_TYPE::ET_2D, - .format = asset::EF_R16G16B16A16_SFLOAT - }); - if (!infos[1].desc) - return logFail("Failed to create image view"); - infos[1].info.image.imageLayout = IImage::LAYOUT::GENERAL; - IGPUDescriptorSet::SWriteDescriptorSet writes[3] = { - {.dstSet = renderDs.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &infos[0]}, - {.dstSet = renderDs.get(), .binding = 1, .arrayElement = 0, .count = 1, .info = &infos[1]} - }; - m_device->updateDescriptorSets(std::span(writes, 2), {}); + { + IGPUDescriptorSet::SDescriptorInfo info = {}; + info.desc = m_device->createImageView({ + .flags = IGPUImageView::ECF_NONE, + .subUsages = IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT, + .image = outHDRImage, + .viewType = IGPUImageView::E_TYPE::ET_2D, + .format = asset::EF_R16G16B16A16_SFLOAT + }); + if (!info.desc) + return logFail("Failed to create image view"); + info.info.image.imageLayout = IImage::LAYOUT::GENERAL; + const IGPUDescriptorSet::SWriteDescriptorSet write = {.dstSet=renderDs.get(), .binding=1, .arrayElement=0, .count=1, .info=&info}; + m_device->updateDescriptorSets({&write,1}, {}); + } // camera { @@ -281,7 +249,6 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu static bool first = true; if (first) { - m_api->startCapture(); first = false; } @@ -520,82 +487,9 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu return (dim + size - 1) / size; } - smart_refctd_ptr createBuffer(IGPUBuffer::SCreationParams& params) - { - smart_refctd_ptr buffer; - buffer = m_device->createBuffer(std::move(params)); - auto bufReqs = buffer->getMemoryReqs(); - bufReqs.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits(); - m_device->allocate(bufReqs, buffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); - - return buffer; - } - - smart_refctd_ptr getSingleUseCommandBufferAndBegin(smart_refctd_ptr pool) - { - smart_refctd_ptr cmdbuf; - if 
(!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdbuf)) - return nullptr; - - cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); - cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - - return cmdbuf; - } - - void cmdbufSubmitAndWait(smart_refctd_ptr cmdbuf, CThreadSafeQueueAdapter* queue, uint64_t startValue) - { - cmdbuf->end(); - - uint64_t finishedValue = startValue + 1; - - // submit builds - { - auto completed = m_device->createSemaphore(startValue); - - std::array signals; - { - auto& signal = signals.front(); - signal.value = finishedValue; - signal.stageMask = bitflag(PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS); - signal.semaphore = completed.get(); - } - - const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[1] = { { - .cmdbuf = cmdbuf.get() - } }; - - const IQueue::SSubmitInfo infos[] = - { - { - .waitSemaphores = {}, - .commandBuffers = commandBuffers, - .signalSemaphores = signals - } - }; - - if (queue->submit(infos) != IQueue::RESULT::SUCCESS) - { - m_logger->log("Failed to submit geometry transfer upload operations!", ILogger::ELL_ERROR); - return; - } - - const ISemaphore::SWaitInfo info[] = - { { - .semaphore = completed.get(), - .value = finishedValue - } }; - - m_device->blockForSemaphores(info); - } - } - - bool createGeometries(video::CThreadSafeQueueAdapter* queue, const IGeometryCreator* gc) + smart_refctd_ptr createAccelerationStructureDS(video::CThreadSafeQueueAdapter* queue, const IGeometryCreator* gc) { - auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - if (!pool) - return logFail("Couldn't create Command Pool for geometry creation!"); - + // get geometries in ICPUBuffers std::array objectsCpu; objectsCpu[OT_CUBE] = ReferenceObjectCpu{ .meta = {.type = OT_CUBE, .name = "Cube Mesh" }, .shadersType = GP_BASIC, .data = gc->createCubeMesh(nbl::core::vector3df(1.f, 1.f, 1.f)) }; objectsCpu[OT_SPHERE] = 
ReferenceObjectCpu{ .meta = {.type = OT_SPHERE, .name = "Sphere Mesh" }, .shadersType = GP_BASIC, .data = gc->createSphereMesh(2, 16, 16) }; @@ -606,163 +500,213 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu objectsCpu[OT_CONE] = ReferenceObjectCpu{ .meta = {.type = OT_CONE, .name = "Cone Mesh" }, .shadersType = GP_CONE, .data = gc->createConeMesh(2, 3, 10) }; objectsCpu[OT_ICOSPHERE] = ReferenceObjectCpu{ .meta = {.type = OT_ICOSPHERE, .name = "Icosphere Mesh" }, .shadersType = GP_ICO, .data = gc->createIcoSphere(1, 3, true) }; - struct ScratchVIBindings - { - nbl::asset::SBufferBinding vertex, index; - }; - std::array scratchBuffers; - //std::array geomInfos; auto geomInfoBuffer = ICPUBuffer::create({ OT_COUNT * sizeof(SGeomInfo) }); - + SGeomInfo* geomInfos = reinterpret_cast(geomInfoBuffer->getPointer()); const uint32_t byteOffsets[OT_COUNT] = { 18, 24, 24, 20, 20, 24, 16, 12 }; // based on normals data position const uint32_t smoothNormals[OT_COUNT] = { 0, 1, 1, 0, 0, 1, 1, 1 }; - for (uint32_t i = 0; i < objectsCpu.size(); i++) + // get ICPUBuffers into ICPUBottomLevelAccelerationStructures + std::array, OT_COUNT> cpuBlas; + for (uint32_t i = 0; i < cpuBlas.size(); i++) { + auto triangles = make_refctd_dynamic_array>>(1u); + auto primitiveCounts = make_refctd_dynamic_array>(1u); + + auto& tri = triangles->front(); + auto& primCount = primitiveCounts->front(); const auto& geom = objectsCpu[i]; - auto& obj = objectsGpu[i]; - auto& scratchObj = scratchBuffers[i]; - obj.meta.name = geom.meta.name; - obj.meta.type = geom.meta.type; + const bool useIndex = geom.data.indexType != EIT_UNKNOWN; + const uint32_t vertexStride = geom.data.inputParams.bindings[0].stride; + const uint32_t numVertices = (geom.data.bindings[0].buffer->getSize()-geom.data.bindings[0].offset) / vertexStride; - obj.indexCount = geom.data.indexCount; - obj.indexType = geom.data.indexType; - obj.vertexStride = geom.data.inputParams.bindings[0].stride; + if 
(useIndex) + primCount = geom.data.indexCount / 3; + else + primCount = numVertices / 3; - geomInfos[i].indexType = obj.indexType; - geomInfos[i].vertexStride = obj.vertexStride; + geomInfos[i].indexType = geom.data.indexType; + geomInfos[i].vertexStride = vertexStride; geomInfos[i].smoothNormals = smoothNormals[i]; - auto vBuffer = smart_refctd_ptr(geom.data.bindings[0].buffer); // no offset - auto vUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | - IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - obj.bindings.vertex.offset = 0u; - - auto iBuffer = smart_refctd_ptr(geom.data.indexBuffer.buffer); // no offset - auto iUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | - IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - obj.bindings.index.offset = 0u; + geom.data.bindings[0].buffer->setContentHash(geom.data.bindings[0].buffer->computeContentHash()); + tri.vertexData[0] = geom.data.bindings[0]; + if (useIndex) + { + geom.data.indexBuffer.buffer->setContentHash(geom.data.indexBuffer.buffer->computeContentHash()); + tri.indexData = geom.data.indexBuffer; + } + tri.maxVertex = numVertices - 1; + tri.vertexStride = vertexStride; + tri.vertexFormat = static_cast(geom.data.inputParams.attributes[0].format); + tri.indexType = geom.data.indexType; + tri.geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; + + auto& blas = cpuBlas[i]; + blas = make_smart_refctd_ptr(); + blas->setGeometries(std::move(triangles), std::move(primitiveCounts)); + + auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT; + if 
(m_physicalDevice->getProperties().limits.rayTracingPositionFetch) + blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_DATA_ACCESS; + + blas->setBuildFlags(blasFlags); + blas->setContentHash(blas->computeContentHash()); + } - vBuffer->addUsageFlags(vUsage); - vBuffer->setContentHash(vBuffer->computeContentHash()); - scratchObj.vertex = { .offset = 0, .buffer = vBuffer }; + // get ICPUBottomLevelAccelerationStructure into ICPUTopLevelAccelerationStructure + auto geomInstances = make_refctd_dynamic_array>(OT_COUNT); + { + uint32_t i = 0; + for (auto instance = geomInstances->begin(); instance != geomInstances->end(); instance++, i++) + { + ICPUTopLevelAccelerationStructure::StaticInstance inst; + inst.base.blas = cpuBlas[i]; + inst.base.flags = static_cast(IGPUTopLevelAccelerationStructure::INSTANCE_FLAGS::TRIANGLE_FACING_CULL_DISABLE_BIT); + inst.base.instanceCustomIndex = i; + inst.base.instanceShaderBindingTableRecordOffset = 0; + inst.base.mask = 0xFF; - if (geom.data.indexType != EIT_UNKNOWN) - if (iBuffer) - { - iBuffer->addUsageFlags(iUsage); - iBuffer->setContentHash(iBuffer->computeContentHash()); - } - scratchObj.index = { .offset = 0, .buffer = iBuffer }; + core::matrix3x4SIMD transform; + transform.setTranslation(nbl::core::vectorSIMDf(5.f * i, 0, 0, 0)); + inst.transform = transform; + + instance->instance = inst; + } } - auto cmdbuf = getSingleUseCommandBufferAndBegin(pool); - cmdbuf->beginDebugMarker("Build geometry vertex and index buffers"); + auto cpuTlas = make_smart_refctd_ptr(); + cpuTlas->setInstances(std::move(geomInstances)); + cpuTlas->setBuildFlags(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT); + + // descriptor set and layout + ICPUDescriptorSetLayout::SBinding bindings[] = { + { + .binding = 0, + .type = asset::IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE, + .createFlags = IDescriptorSetLayoutBase::SBindingBase::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = 
asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, + .count = 1, + }, + { + .binding = 1, + .type = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE, + .createFlags = IDescriptorSetLayoutBase::SBindingBase::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, + .count = 1, + } + }; + auto descriptorSet = core::make_smart_refctd_ptr(core::make_smart_refctd_ptr(bindings)); + descriptorSet->getDescriptorInfos(IDescriptorSetLayoutBase::CBindingRedirect::binding_number_t{0},IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE).front().desc = cpuTlas; +//#define TEST_REBAR_FALLBACK + // convert with asset converter smart_refctd_ptr converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} }); - CAssetConverter::SInputs inputs = {}; + struct MyInputs : CAssetConverter::SInputs + { +#ifndef TEST_REBAR_FALLBACK + inline uint32_t constrainMemoryTypeBits(const size_t groupCopyID, const IAsset* canonicalAsset, const blake3_hash_t& contentHash, const IDeviceMemoryBacked* memoryBacked) const override + { + assert(memoryBacked); + return memoryBacked->getObjectType()!=IDeviceMemoryBacked::EOT_BUFFER ? 
(~0u):rebarMemoryTypes; + } +#endif + uint32_t rebarMemoryTypes; + } inputs = {}; inputs.logger = m_logger.get(); + inputs.rebarMemoryTypes = m_physicalDevice->getDirectVRAMAccessMemoryTypeBits(); +#ifndef TEST_REBAR_FALLBACK + struct MyAllocator final : public IDeviceMemoryAllocator + { + ILogicalDevice* getDeviceForAllocations() const override {return device;} - std::array tmpBuffers; + SAllocation allocate(const SAllocateInfo& info) override + { + auto retval = device->allocate(info); + // map what is mappable by default so ReBAR checks succeed + if (retval.isValid() && retval.memory->isMappable()) + retval.memory->map({.offset=0,.length=info.size}); + return retval; + } + + ILogicalDevice* device; + } myalloc; + myalloc.device = m_device.get(); + inputs.allocator = &myalloc; +#endif + + CAssetConverter::patch_t tlasPatch = {}; + tlasPatch.compactAfterBuild = true; + std::array,OT_COUNT> tmpBLASPatches = {}; + std::array tmpBuffers; + std::array, OT_COUNT * 2u> tmpBufferPatches; { + tmpBLASPatches.front().compactAfterBuild = true; + std::fill(tmpBLASPatches.begin(),tmpBLASPatches.end(),tmpBLASPatches.front()); + // for (uint32_t i = 0; i < objectsCpu.size(); i++) { - tmpBuffers[2 * i + 0] = scratchBuffers[i].vertex.buffer.get(); - tmpBuffers[2 * i + 1] = scratchBuffers[i].index.buffer.get(); + tmpBuffers[2 * i + 0] = cpuBlas[i]->getTriangleGeometries().front().vertexData[0].buffer.get(); + tmpBuffers[2 * i + 1] = cpuBlas[i]->getTriangleGeometries().front().indexData.buffer.get(); } - + // make sure all buffers are BDA-readable + for (auto& patch : tmpBufferPatches) + patch.usage |= asset::IBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT; + + std::get>(inputs.assets) = {&descriptorSet.get(),1}; + std::get>(inputs.assets) = {&cpuTlas.get(),1}; + std::get>(inputs.patches) = {&tlasPatch,1}; + std::get>(inputs.assets) = {&cpuBlas.data()->get(),cpuBlas.size()}; + std::get>(inputs.patches) = tmpBLASPatches; std::get>(inputs.assets) = tmpBuffers; + 
std::get>(inputs.patches) = tmpBufferPatches; } auto reservation = converter->reserve(inputs); - { - auto prepass = [&](const auto & references) -> bool - { - auto objects = reservation.getGPUObjects(); - uint32_t counter = {}; - for (auto& object : objects) - { - auto gpu = object.value; - auto* reference = references[counter]; - if (reference) - { - if (!gpu) - { - m_logger->log("Failed to convert a CPU object to GPU!", ILogger::ELL_ERROR); - return false; - } - } - counter++; - } - return true; - }; - - prepass.template operator() < ICPUBuffer > (tmpBuffers); + constexpr auto XferBufferCount = 2; + std::array,XferBufferCount> xferBufs = {}; + std::array xferBufInfos = {}; + { + auto pool = m_device->createCommandPool(getTransferUpQueue()->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT); + pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,xferBufs); + xferBufs.front()->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + for (auto i=0; icreateSemaphore(0u); - - std::array cmdbufs = {}; - cmdbufs.front().cmdbuf = cmdbuf.get(); - + auto xferSema = m_device->createSemaphore(0u); + xferSema->setObjectDebugName("Transfer Semaphore"); SIntendedSubmitInfo transfer = {}; - transfer.queue = queue; - transfer.scratchCommandBuffers = cmdbufs; + transfer.queue = getTransferUpQueue(); + transfer.scratchCommandBuffers = xferBufInfos; transfer.scratchSemaphore = { - .semaphore = semaphore.get(), + .semaphore = xferSema.get(), .value = 0u, .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS }; - // convert - { - CAssetConverter::SConvertParams params = {}; - params.utilities = m_utils.get(); - params.transfer = &transfer; - - auto future = reservation.convert(params); - if (future.copy() != IQueue::RESULT::SUCCESS) - { - m_logger->log("Failed to await submission feature!", ILogger::ELL_ERROR); - return false; - } - - // assign gpu objects to output - auto&& buffers = 
reservation.getGPUObjects(); - for (uint32_t i = 0; i < objectsCpu.size(); i++) - { - auto& obj = objectsGpu[i]; - obj.bindings.vertex = { .offset = 0, .buffer = buffers[2 * i + 0].value }; - obj.bindings.index = { .offset = 0, .buffer = buffers[2 * i + 1].value }; - - geomInfos[i].vertexBufferAddress = obj.bindings.vertex.buffer->getDeviceAddress() + byteOffsets[i]; - geomInfos[i].indexBufferAddress = obj.useIndex() ? obj.bindings.index.buffer->getDeviceAddress() : geomInfos[i].vertexBufferAddress; - } - } - + + constexpr auto CompBufferCount = 2; + std::array,CompBufferCount> compBufs = {}; + std::array compBufInfos = {}; { - IGPUBuffer::SCreationParams params; - params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - params.size = OT_COUNT * sizeof(SGeomInfo); - m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{.queue = queue}, std::move(params), geomInfos).move_into(geometryInfoBuffer); + auto pool = m_device->createCommandPool(getComputeQueue()->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT|IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT); + pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,compBufs); + compBufs.front()->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + for (auto i=0; i queryPool = m_device->createQueryPool(std::move(qParams)); - - auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT); - if (!pool) - return logFail("Couldn't create Command Pool for blas/tlas creation!"); - - m_api->startCapture(); + auto compSema = m_device->createSemaphore(0u); + compSema->setObjectDebugName("Compute Semaphore"); + SIntendedSubmitInfo compute = {}; + compute.queue = getComputeQueue(); + compute.scratchCommandBuffers = compBufInfos; + compute.scratchSemaphore = { + 
.semaphore = compSema.get(), + .value = 0u, + .stageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT|PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_COPY_BIT + }; + // convert #ifdef TRY_BUILD_FOR_NGFX // NSight is "debugger-challenged" it can't capture anything not happenning "during a frame", so we need to trick it m_currentImageAcquire = m_surface->acquireNextImage(); { @@ -775,274 +719,166 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu } m_currentImageAcquire = m_surface->acquireNextImage(); #endif - size_t totalScratchSize = 0; - const auto scratchOffsetAlignment = m_device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment; - - // build bottom level ASes + m_api->startCapture(); + auto gQueue = getGraphicsQueue(); { - IGPUBottomLevelAccelerationStructure::DeviceBuildInfo blasBuildInfos[OT_COUNT]; - uint32_t primitiveCounts[OT_COUNT]; - using Geometry = IGPUBottomLevelAccelerationStructure::Triangles; - Geometry triangles[OT_COUNT]; - uint32_t scratchSizes[OT_COUNT]; - - for (uint32_t i = 0; i < objectsGpu.size(); i++) + smart_refctd_ptr scratchAlloc; { - const auto& obj = objectsGpu[i]; - - const uint32_t vertexStride = obj.vertexStride; - const uint32_t numVertices = obj.bindings.vertex.buffer->getSize() / vertexStride; - if (obj.useIndex()) - primitiveCounts[i] = obj.indexCount / 3; - else - primitiveCounts[i] = numVertices / 3; - - triangles[i].vertexData[0] = obj.bindings.vertex; - triangles[i].indexData = obj.useIndex() ? 
obj.bindings.index : obj.bindings.vertex; - triangles[i].maxVertex = numVertices - 1; - triangles[i].vertexStride = vertexStride; - triangles[i].vertexFormat = EF_R32G32B32_SFLOAT; - triangles[i].indexType = obj.indexType; - triangles[i].geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; - - auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT; - if (m_physicalDevice->getProperties().limits.rayTracingPositionFetch) - blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_DATA_ACCESS_KHR; - - blasBuildInfos[i].buildFlags = blasFlags; - blasBuildInfos[i].geometryCount = 1; // only 1 geometry object per blas - blasBuildInfos[i].srcAS = nullptr; - blasBuildInfos[i].dstAS = nullptr; - blasBuildInfos[i].triangles = &triangles[i]; - blasBuildInfos[i].scratch = {}; - - ILogicalDevice::AccelerationStructureBuildSizes buildSizes; - { - const uint32_t maxPrimCount[1] = { primitiveCounts[i] }; - buildSizes = m_device->getAccelerationStructureBuildSizes(blasFlags, false, std::span{&triangles[i], 1}, maxPrimCount); - if (!buildSizes) - return logFail("Failed to get BLAS build sizes"); - } - - scratchSizes[i] = buildSizes.buildScratchSize; - totalScratchSize = core::alignUp(totalScratchSize, scratchOffsetAlignment); - totalScratchSize += buildSizes.buildScratchSize; - - { - IGPUBuffer::SCreationParams params; - params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT; - params.size = buildSizes.accelerationStructureSize; - smart_refctd_ptr asBuffer = createBuffer(params); - - IGPUBottomLevelAccelerationStructure::SCreationParams blasParams; - blasParams.bufferRange.buffer = asBuffer; - blasParams.bufferRange.offset = 0u; - blasParams.bufferRange.size = buildSizes.accelerationStructureSize; - blasParams.flags = 
IGPUBottomLevelAccelerationStructure::SCreationParams::FLAGS::NONE; - gpuBlas[i] = m_device->createBottomLevelAccelerationStructure(std::move(blasParams)); - if (!gpuBlas[i]) - return logFail("Could not create BLAS"); - } - } - - auto cmdbufBlas = getSingleUseCommandBufferAndBegin(pool); - cmdbufBlas->beginDebugMarker("Build BLAS"); + constexpr auto MaxAlignment = 256; + constexpr auto MinAllocationSize = 1024; + const auto scratchSize = core::alignUp(reservation.getMinASBuildScratchSize(false),MaxAlignment); + + + IGPUBuffer::SCreationParams creationParams = {}; + creationParams.size = scratchSize; + creationParams.usage = IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT|IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT|IGPUBuffer::EUF_STORAGE_BUFFER_BIT; +#ifdef TEST_REBAR_FALLBACK + creationParams.usage |= IGPUBuffer::EUF_TRANSFER_DST_BIT; + core::unordered_set sharingSet = {compute.queue->getFamilyIndex(),transfer.queue->getFamilyIndex()}; + core::vector sharingIndices(sharingSet.begin(),sharingSet.end()); + if (sharingIndices.size()>1) + creationParams.queueFamilyIndexCount = sharingIndices.size(); + creationParams.queueFamilyIndices = sharingIndices.data(); +#endif + auto scratchBuffer = m_device->createBuffer(std::move(creationParams)); - cmdbufBlas->resetQueryPool(queryPool.get(), 0, objectsGpu.size()); + auto reqs = scratchBuffer->getMemoryReqs(); +#ifndef TEST_REBAR_FALLBACK + reqs.memoryTypeBits &= m_physicalDevice->getDirectVRAMAccessMemoryTypeBits(); +#endif + auto allocation = m_device->allocate(reqs,scratchBuffer.get(),IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); +#ifndef TEST_REBAR_FALLBACK + allocation.memory->map({.offset=0,.length=reqs.size}); +#endif - smart_refctd_ptr scratchBuffer; - { - IGPUBuffer::SCreationParams params; - params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT; - params.size = totalScratchSize; - scratchBuffer = createBuffer(params); + scratchAlloc = 
make_smart_refctd_ptr( + SBufferRange{0ull,scratchSize,std::move(scratchBuffer)}, + core::allocator(),MaxAlignment,MinAllocationSize + ); } - uint32_t queryCount = 0; - IGPUBottomLevelAccelerationStructure::BuildRangeInfo buildRangeInfos[OT_COUNT]; - IGPUBottomLevelAccelerationStructure::BuildRangeInfo* pRangeInfos[OT_COUNT]; - for (uint32_t i = 0; i < objectsGpu.size(); i++) + struct MyParams final : CAssetConverter::SConvertParams { - blasBuildInfos[i].dstAS = gpuBlas[i].get(); - blasBuildInfos[i].scratch.buffer = scratchBuffer; - if (i == 0) + inline uint32_t getFinalOwnerQueueFamily(const IGPUBuffer* buffer, const core::blake3_hash_t& createdFrom) override { - blasBuildInfos[i].scratch.offset = 0u; + return finalUser; } - else + inline uint32_t getFinalOwnerQueueFamily(const IGPUAccelerationStructure* image, const core::blake3_hash_t& createdFrom) override { - const auto unalignedOffset = blasBuildInfos[i - 1].scratch.offset + scratchSizes[i - 1]; - blasBuildInfos[i].scratch.offset = core::alignUp(unalignedOffset, scratchOffsetAlignment); + return finalUser; } - buildRangeInfos[i].primitiveCount = primitiveCounts[i]; - buildRangeInfos[i].primitiveByteOffset = 0u; - buildRangeInfos[i].firstVertex = 0u; - buildRangeInfos[i].transformByteOffset = 0u; - - pRangeInfos[i] = &buildRangeInfos[i]; - } - - if (!cmdbufBlas->buildAccelerationStructures({ blasBuildInfos, OT_COUNT }, pRangeInfos)) - return logFail("Failed to build BLAS"); + uint8_t finalUser; + } params = {}; + params.utilities = m_utils.get(); + params.transfer = &transfer; + params.compute = &compute; + params.scratchForDeviceASBuild = scratchAlloc.get(); + params.finalUser = gQueue->getFamilyIndex(); + auto future = reservation.convert(params); + if (future.copy() != IQueue::RESULT::SUCCESS) { - SMemoryBarrier memBarrier; - memBarrier.srcStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT; - memBarrier.srcAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT; - memBarrier.dstStageMask 
= PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT; - memBarrier.dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_READ_BIT; - cmdbufBlas->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} }); + m_logger->log("Failed to await submission feature!", ILogger::ELL_ERROR); + return {}; } - const IGPUAccelerationStructure* ases[OT_COUNT]; - for (uint32_t i = 0; i < objectsGpu.size(); i++) - ases[i] = gpuBlas[i].get(); - if (!cmdbufBlas->writeAccelerationStructureProperties({ ases, OT_COUNT }, IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE, - queryPool.get(), queryCount++)) - return logFail("Failed to write acceleration structure properties!"); - - cmdbufBlas->endDebugMarker(); - cmdbufSubmitAndWait(cmdbufBlas, queue, 39); - } - - auto cmdbufCompact = getSingleUseCommandBufferAndBegin(pool); - cmdbufCompact->beginDebugMarker("Compact BLAS"); - - // compact blas - { - std::array asSizes{ 0 }; - if (!m_device->getQueryPoolResults(queryPool.get(), 0, objectsGpu.size(), asSizes.data(), sizeof(size_t), IQueryPool::WAIT_BIT)) - return logFail("Could not get query pool results for AS sizes"); - - std::array, OT_COUNT> cleanupBlas; - for (uint32_t i = 0; i < objectsGpu.size(); i++) + // assign gpu objects to output + for (const auto& buffer : reservation.getGPUObjects()) + retainedBuffers.push_back(buffer.value); + for (uint32_t i = 0; i < objectsCpu.size(); i++) { - cleanupBlas[i] = gpuBlas[i]; - { - IGPUBuffer::SCreationParams params; - params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT; - params.size = asSizes[i]; - smart_refctd_ptr asBuffer = createBuffer(params); - - IGPUBottomLevelAccelerationStructure::SCreationParams blasParams; - blasParams.bufferRange.buffer = asBuffer; - blasParams.bufferRange.offset = 0u; - blasParams.bufferRange.size = asSizes[i]; - blasParams.flags = IGPUBottomLevelAccelerationStructure::SCreationParams::FLAGS::NONE; - gpuBlas[i] = 
m_device->createBottomLevelAccelerationStructure(std::move(blasParams)); - if (!gpuBlas[i]) - return logFail("Could not create compacted BLAS"); - } + auto vBuffer = retainedBuffers[2 * i + 0].get(); + auto iBuffer = retainedBuffers[2 * i + 1].get(); + const auto& geom = objectsCpu[i]; + const bool useIndex = geom.data.indexType != EIT_UNKNOWN; - IGPUBottomLevelAccelerationStructure::CopyInfo copyInfo; - copyInfo.src = cleanupBlas[i].get(); - copyInfo.dst = gpuBlas[i].get(); - copyInfo.mode = IGPUBottomLevelAccelerationStructure::COPY_MODE::COMPACT; - if (!cmdbufCompact->copyAccelerationStructure(copyInfo)) - return logFail("Failed to copy AS to compact"); + geomInfos[i].vertexBufferAddress = vBuffer->getDeviceAddress() + byteOffsets[i]; + geomInfos[i].indexBufferAddress = useIndex ? iBuffer->getDeviceAddress():0x0ull; } } - cmdbufCompact->endDebugMarker(); - cmdbufSubmitAndWait(cmdbufCompact, queue, 40); - - auto cmdbufTlas = getSingleUseCommandBufferAndBegin(pool); - cmdbufTlas->beginDebugMarker("Build TLAS"); - - // build top level AS + // { - const uint32_t instancesCount = objectsGpu.size(); - IGPUTopLevelAccelerationStructure::DeviceStaticInstance instances[OT_COUNT]; - for (uint32_t i = 0; i < instancesCount; i++) - { - core::matrix3x4SIMD transform; - transform.setTranslation(nbl::core::vectorSIMDf(5.f * i, 0, 0, 0)); - instances[i].base.blas.deviceAddress = gpuBlas[i]->getReferenceForDeviceOperations().deviceAddress; - instances[i].base.mask = 0xFF; - instances[i].base.instanceCustomIndex = i; - instances[i].base.instanceShaderBindingTableRecordOffset = 0; - instances[i].base.flags = static_cast(IGPUTopLevelAccelerationStructure::INSTANCE_FLAGS::TRIANGLE_FACING_CULL_DISABLE_BIT); - instances[i].transform = transform; - } - - { - size_t bufSize = instancesCount * sizeof(IGPUTopLevelAccelerationStructure::DeviceStaticInstance); - IGPUBuffer::SCreationParams params; - params.usage = bitflag(IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT) | 
IGPUBuffer::EUF_STORAGE_BUFFER_BIT | - IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - params.size = bufSize; - instancesBuffer = createBuffer(params); - - SBufferRange range = { .offset = 0u, .size = bufSize, .buffer = instancesBuffer }; - cmdbufTlas->updateBuffer(range, instances); - } - - // make sure instances upload complete first - { - SMemoryBarrier memBarrier; - memBarrier.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS; - memBarrier.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT; - memBarrier.dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT; - memBarrier.dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT; - cmdbufTlas->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} }); - } - - auto tlasFlags = bitflag(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT); - - IGPUTopLevelAccelerationStructure::DeviceBuildInfo tlasBuildInfo; - tlasBuildInfo.buildFlags = tlasFlags; - tlasBuildInfo.srcAS = nullptr; - tlasBuildInfo.dstAS = nullptr; - tlasBuildInfo.instanceData.buffer = instancesBuffer; - tlasBuildInfo.instanceData.offset = 0u; - tlasBuildInfo.scratch = {}; - - auto buildSizes = m_device->getAccelerationStructureBuildSizes(tlasFlags, false, instancesCount); - if (!buildSizes) - return logFail("Failed to get TLAS build sizes"); + IGPUBuffer::SCreationParams params; + params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + params.size = OT_COUNT * sizeof(SGeomInfo); + m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = gQueue }, std::move(params), geomInfos).move_into(geometryInfoBuffer); + } + // acquire ownership + { + smart_refctd_ptr cmdbuf; { - IGPUBuffer::SCreationParams params; - params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | 
IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT; - params.size = buildSizes.accelerationStructureSize; - smart_refctd_ptr asBuffer = createBuffer(params); - - IGPUTopLevelAccelerationStructure::SCreationParams tlasParams; - tlasParams.bufferRange.buffer = asBuffer; - tlasParams.bufferRange.offset = 0u; - tlasParams.bufferRange.size = buildSizes.accelerationStructureSize; - tlasParams.flags = IGPUTopLevelAccelerationStructure::SCreationParams::FLAGS::NONE; - gpuTlas = m_device->createTopLevelAccelerationStructure(std::move(tlasParams)); - if (!gpuTlas) - return logFail("Could not create TLAS"); + const auto gQFI = gQueue->getFamilyIndex(); + m_device->createCommandPool(gQFI,IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT)->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{&cmdbuf,1}); + cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + { + core::vector> bufBarriers; + auto acquireBufferRange = [&bufBarriers](const uint8_t otherQueueFamilyIndex, const SBufferRange& bufferRange) + { + bufBarriers.push_back({ + .barrier = { + .dep = { + .srcStageMask = PIPELINE_STAGE_FLAGS::NONE, + .srcAccessMask = ACCESS_FLAGS::NONE, + .dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, + // we don't care what exactly, uncomplex our code + .dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS + }, + .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE, + .otherQueueFamilyIndex = otherQueueFamilyIndex + }, + .range = bufferRange + }); + }; +#ifdef TEST_REBAR_FALLBACK + if (const auto otherQueueFamilyIndex=transfer.queue->getFamilyIndex(); gQFI!=otherQueueFamilyIndex) + for (const auto& buffer : reservation.getGPUObjects()) + { + const auto& buff = buffer.value; + if (buff) + acquireBufferRange(otherQueueFamilyIndex,{.offset=0,.size=buff->getSize(),.buffer=buff}); + } +#endif + if (const auto otherQueueFamilyIndex=compute.queue->getFamilyIndex(); gQFI!=otherQueueFamilyIndex) + { + auto acquireAS = 
[&acquireBufferRange,otherQueueFamilyIndex](const IGPUAccelerationStructure* as) + { + acquireBufferRange(otherQueueFamilyIndex,as->getCreationParams().bufferRange); + }; + for (const auto& blas : reservation.getGPUObjects()) + acquireAS(blas.value.get()); + acquireAS(reservation.getGPUObjects().front().value.get()); + } + if (!bufBarriers.empty()) + cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,{.memBarriers={},.bufBarriers=bufBarriers}); + } + cmdbuf->end(); } - - smart_refctd_ptr scratchBuffer; + if (!cmdbuf->empty()) { - IGPUBuffer::SCreationParams params; - params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT; - params.size = buildSizes.buildScratchSize; - scratchBuffer = createBuffer(params); + const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufInfo = { + .cmdbuf = cmdbuf.get() + }; + const IQueue::SSubmitInfo::SSemaphoreInfo signal = { + .semaphore = compute.scratchSemaphore.semaphore, + .value = compute.getFutureScratchSemaphore().value, + .stageMask = asset::PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS + }; + auto wait = signal; + wait.value--; + const IQueue::SSubmitInfo info = { + .waitSemaphores = {&wait,1}, // we already waited with the host on the AS build + .commandBuffers = {&cmdbufInfo,1}, + .signalSemaphores = {&signal,1} + }; + if (const auto retval=gQueue->submit({&info,1}); retval!=IQueue::RESULT::SUCCESS) + m_logger->log("Failed to transfer ownership with code %d!",system::ILogger::ELL_ERROR,retval); } - - tlasBuildInfo.dstAS = gpuTlas.get(); - tlasBuildInfo.scratch.buffer = scratchBuffer; - tlasBuildInfo.scratch.offset = 0u; - - IGPUTopLevelAccelerationStructure::BuildRangeInfo buildRangeInfo[1u]; - buildRangeInfo[0].instanceCount = instancesCount; - buildRangeInfo[0].instanceByteOffset = 0u; - IGPUTopLevelAccelerationStructure::BuildRangeInfo* pRangeInfos; - pRangeInfos = &buildRangeInfo[0]; - - if (!cmdbufTlas->buildAccelerationStructures({ &tlasBuildInfo, 1 }, pRangeInfos)) - 
return logFail("Failed to build TLAS"); } - - cmdbufTlas->endDebugMarker(); - cmdbufSubmitAndWait(cmdbufTlas, queue, 45); - +#undef TEST_REBAR_FALLBACK + #ifdef TRY_BUILD_FOR_NGFX { const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { { @@ -1055,7 +891,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu #endif m_api->endCapture(); - return true; + return reservation.getGPUObjects().front().value; } @@ -1073,18 +909,12 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu Camera camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD()); video::CDumbPresentationOracle oracle; - std::array objectsGpu; - - std::array, OT_COUNT> gpuBlas; - smart_refctd_ptr gpuTlas; - smart_refctd_ptr instancesBuffer; - smart_refctd_ptr geometryInfoBuffer; + core::vector> retainedBuffers; smart_refctd_ptr outHDRImage; smart_refctd_ptr renderPipeline; smart_refctd_ptr renderDs; - smart_refctd_ptr renderPool; uint16_t gcIndex = {}; From bb0e4fd1a0064ee8e50c65051fbf3bac8e50b460 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 14 Jun 2025 09:28:48 +0700 Subject: [PATCH 236/296] Fix merge by using master code --- 71_RayTracingPipeline/main.cpp | 3442 +++++++++++++++----------------- 1 file changed, 1602 insertions(+), 1840 deletions(-) diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index c9ee0eafb..42aaa2233 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -6,787 +6,778 @@ #include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h" #include "nbl/builtin/hlsl/indirect_commands.hlsl" + class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication { - using device_base_t = examples::SimpleWindowedApplication; - using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; - using clock_t = 
std::chrono::steady_clock; - - constexpr static inline uint32_t WIN_W = 1280, WIN_H = 720; - constexpr static inline uint32_t MaxFramesInFlight = 3u; - constexpr static inline uint8_t MaxUITextureCount = 1u; - constexpr static inline uint32_t NumberOfProceduralGeometries = 5; - - static constexpr const char* s_lightTypeNames[E_LIGHT_TYPE::ELT_COUNT] = { - "Directional", - "Point", - "Spot" - }; - - constexpr static inline clock_t::duration DisplayImageDuration = std::chrono::milliseconds(900); - - struct ShaderBindingTable - { - SBufferRange raygenGroupRange; - SBufferRange hitGroupsRange; - uint32_t hitGroupsStride; - SBufferRange missGroupsRange; - uint32_t missGroupsStride; - SBufferRange callableGroupsRange; - uint32_t callableGroupsStride; - }; + using device_base_t = examples::SimpleWindowedApplication; + using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; + using clock_t = std::chrono::steady_clock; + + constexpr static inline uint32_t WIN_W = 1280, WIN_H = 720; + constexpr static inline uint32_t MaxFramesInFlight = 3u; + constexpr static inline uint8_t MaxUITextureCount = 1u; + constexpr static inline uint32_t NumberOfProceduralGeometries = 5; + + static constexpr const char* s_lightTypeNames[E_LIGHT_TYPE::ELT_COUNT] = { + "Directional", + "Point", + "Spot" + }; + + struct ShaderBindingTable + { + SBufferRange raygenGroupRange; + SBufferRange hitGroupsRange; + uint32_t hitGroupsStride; + SBufferRange missGroupsRange; + uint32_t missGroupsStride; + SBufferRange callableGroupsRange; + uint32_t callableGroupsStride; + }; public: - inline RaytracingPipelineApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) - : IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) - { - } - - inline SPhysicalDeviceFeatures getRequiredDeviceFeatures() const override - { - auto retval = device_base_t::getRequiredDeviceFeatures(); - 
retval.rayTracingPipeline = true; - retval.accelerationStructure = true; - retval.rayQuery = true; - return retval; - } - - inline SPhysicalDeviceFeatures getPreferredDeviceFeatures() const override - { - auto retval = device_base_t::getPreferredDeviceFeatures(); - retval.accelerationStructureHostCommands = true; - return retval; - } - - inline core::vector getSurfaces() const override - { - if (!m_surface) - { - { - auto windowCallback = core::make_smart_refctd_ptr(smart_refctd_ptr(m_inputSystem), smart_refctd_ptr(m_logger)); - IWindow::SCreationParams params = {}; - params.callback = core::make_smart_refctd_ptr(); - params.width = WIN_W; - params.height = WIN_H; - params.x = 32; - params.y = 32; - params.flags = ui::IWindow::ECF_HIDDEN | IWindow::ECF_BORDERLESS | IWindow::ECF_RESIZABLE; - params.windowCaption = "RaytracingPipelineApp"; - params.callback = windowCallback; - const_cast&>(m_window) = m_winMgr->createWindow(std::move(params)); - } - - auto surface = CSurfaceVulkanWin32::create(smart_refctd_ptr(m_api), smart_refctd_ptr_static_cast(m_window)); - const_cast&>(m_surface) = CSimpleResizeSurface::create(std::move(surface)); - } - - if (m_surface) - return { {m_surface->getSurface()/*,EQF_NONE*/} }; - - return {}; - } - - // so that we can use the same queue for asset converter and rendering - inline core::vector getQueueRequirements() const override - { - auto reqs = device_base_t::getQueueRequirements(); - reqs.front().requiredFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; - return reqs; - } - - inline bool onAppInitialized(smart_refctd_ptr&& system) override - { - m_inputSystem = make_smart_refctd_ptr(logger_opt_smart_ptr(smart_refctd_ptr(m_logger))); - - if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) - return false; - - if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system))) - return false; - - smart_refctd_ptr shaderReadCache = nullptr; - smart_refctd_ptr shaderWriteCache = core::make_smart_refctd_ptr(); - auto shaderCachePath = 
localOutputCWD / "main_pipeline_shader_cache.bin"; - - { - core::smart_refctd_ptr shaderReadCacheFile; - { - system::ISystem::future_t> future; - m_system->createFile(future, shaderCachePath.c_str(), system::IFile::ECF_READ); - if (future.wait()) - { - future.acquire().move_into(shaderReadCacheFile); - if (shaderReadCacheFile) - { - const size_t size = shaderReadCacheFile->getSize(); - if (size > 0ull) - { - std::vector contents(size); - system::IFile::success_t succ; - shaderReadCacheFile->read(succ, contents.data(), 0, size); - if (succ) - shaderReadCache = IShaderCompiler::CCache::deserialize(contents); - } - } - } - else - m_logger->log("Failed Openning Shader Cache File.", ILogger::ELL_ERROR); - } - - } - - // Load Custom Shader - auto loadCompileAndCreateShader = [&](const std::string& relPath) -> smart_refctd_ptr - { - IAssetLoader::SAssetLoadParams lp = {}; - lp.logger = m_logger.get(); - lp.workingDirectory = ""; // virtual root - auto assetBundle = m_assetMgr->getAsset(relPath, lp); - const auto assets = assetBundle.getContents(); - if (assets.empty()) - { - assert(false); - return nullptr; - } - - // lets go straight from ICPUSpecializedShader to IGPUSpecializedShader - auto sourceRaw = IAsset::castDown(assets[0]); - if (!sourceRaw) - { - assert(false); - return nullptr; - } - - return m_device->compileShader({ sourceRaw.get(), nullptr, shaderReadCache.get(), shaderWriteCache.get() }); - }; - - // load shaders - const auto raygenShader = loadCompileAndCreateShader("app_resources/raytrace.rgen.hlsl"); - const auto closestHitShader = loadCompileAndCreateShader("app_resources/raytrace.rchit.hlsl"); - const auto proceduralClosestHitShader = loadCompileAndCreateShader("app_resources/raytrace_procedural.rchit.hlsl"); - const auto intersectionHitShader = loadCompileAndCreateShader("app_resources/raytrace.rint.hlsl"); - const auto anyHitShaderColorPayload = loadCompileAndCreateShader("app_resources/raytrace.rahit.hlsl"); - const auto anyHitShaderShadowPayload = 
loadCompileAndCreateShader("app_resources/raytrace_shadow.rahit.hlsl"); - const auto missShader = loadCompileAndCreateShader("app_resources/raytrace.rmiss.hlsl"); - const auto missShadowShader = loadCompileAndCreateShader("app_resources/raytrace_shadow.rmiss.hlsl"); - const auto directionalLightCallShader = loadCompileAndCreateShader("app_resources/light_directional.rcall.hlsl"); - const auto pointLightCallShader = loadCompileAndCreateShader("app_resources/light_point.rcall.hlsl"); - const auto spotLightCallShader = loadCompileAndCreateShader("app_resources/light_spot.rcall.hlsl"); - const auto fragmentShader = loadCompileAndCreateShader("app_resources/present.frag.hlsl"); - - core::smart_refctd_ptr shaderWriteCacheFile; - { - system::ISystem::future_t> future; - m_system->deleteFile(shaderCachePath); // temp solution instead of trimming, to make sure we won't have corrupted json - m_system->createFile(future, shaderCachePath.c_str(), system::IFile::ECF_WRITE); - if (future.wait()) - { - future.acquire().move_into(shaderWriteCacheFile); - if (shaderWriteCacheFile) - { - auto serializedCache = shaderWriteCache->serialize(); - if (shaderWriteCacheFile) - { - system::IFile::success_t succ; - shaderWriteCacheFile->write(succ, serializedCache->getPointer(), 0, serializedCache->getSize()); - if (!succ) - m_logger->log("Failed Writing To Shader Cache File.", ILogger::ELL_ERROR); - } - } - else - m_logger->log("Failed Creating Shader Cache File.", ILogger::ELL_ERROR); - } - else - m_logger->log("Failed Creating Shader Cache File.", ILogger::ELL_ERROR); - } - - m_semaphore = m_device->createSemaphore(m_realFrameIx); - if (!m_semaphore) - return logFail("Failed to Create a Semaphore!"); - - auto gQueue = getGraphicsQueue(); - - // Create renderpass and init surface - nbl::video::IGPURenderpass* renderpass; - { - ISwapchain::SCreationParams swapchainParams = { .surface = smart_refctd_ptr(m_surface->getSurface()) }; - if (!swapchainParams.deduceFormat(m_physicalDevice)) - 
return logFail("Could not choose a Surface Format for the Swapchain!"); - - const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] = - { - { - .srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, - .dstSubpass = 0, - .memoryBarrier = - { - .srcStageMask = asset::PIPELINE_STAGE_FLAGS::COPY_BIT, - .srcAccessMask = asset::ACCESS_FLAGS::TRANSFER_WRITE_BIT, - .dstStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, - .dstAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT - } - }, - { - .srcSubpass = 0, - .dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, - .memoryBarrier = - { - .srcStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, - .srcAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT - } - }, - IGPURenderpass::SCreationParams::DependenciesEnd - }; - - auto scResources = std::make_unique(m_device.get(), swapchainParams.surfaceFormat.format, dependencies); - renderpass = scResources->getRenderpass(); - - if (!renderpass) - return logFail("Failed to create Renderpass!"); - - if (!m_surface || !m_surface->init(gQueue, std::move(scResources), swapchainParams.sharedParams)) - return logFail("Could not create Window & Surface or initialize the Surface!"); - } - - auto pool = m_device->createCommandPool(gQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - - m_converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} }); - - for (auto i = 0u; i < MaxFramesInFlight; i++) - { - if (!pool) - return logFail("Couldn't create Command Pool!"); - if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { m_cmdBufs.data() + i, 1 })) - return logFail("Couldn't create Command Buffer!"); - } - - m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H); - m_surface->recreateSwapchain(); - - - // create output images - m_hdrImage = m_device->createImage({ - { - .type = IGPUImage::ET_2D, - 
.samples = ICPUImage::ESCF_1_BIT, - .format = EF_R16G16B16A16_SFLOAT, - .extent = {WIN_W, WIN_H, 1}, - .mipLevels = 1, - .arrayLayers = 1, - .flags = IImage::ECF_NONE, - .usage = bitflag(IImage::EUF_STORAGE_BIT) | IImage::EUF_TRANSFER_SRC_BIT | IImage::EUF_SAMPLED_BIT - } - }); - - if (!m_hdrImage || !m_device->allocate(m_hdrImage->getMemoryReqs(), m_hdrImage.get()).isValid()) - return logFail("Could not create HDR Image"); - - m_hdrImageView = m_device->createImageView({ - .flags = IGPUImageView::ECF_NONE, - .subUsages = IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT | IGPUImage::E_USAGE_FLAGS::EUF_SAMPLED_BIT, - .image = m_hdrImage, - .viewType = IGPUImageView::E_TYPE::ET_2D, - .format = asset::EF_R16G16B16A16_SFLOAT - }); - - - - // ray trace pipeline and descriptor set layout setup - { - const IGPUDescriptorSetLayout::SBinding bindings[] = { - { - .binding = 0, - .type = asset::IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE, - .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, - .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_RAYGEN, - .count = 1, - }, - { - .binding = 1, - .type = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE, - .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, - .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_RAYGEN, - .count = 1, - } - }; - const auto descriptorSetLayout = m_device->createDescriptorSetLayout(bindings); - - const std::array dsLayoutPtrs = { descriptorSetLayout.get() }; - m_rayTracingDsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, std::span(dsLayoutPtrs.begin(), dsLayoutPtrs.end())); - m_rayTracingDs = m_rayTracingDsPool->createDescriptorSet(descriptorSetLayout); - - const SPushConstantRange pcRange = { - .stageFlags = IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING, - .offset = 0u, - .size = sizeof(SPushConstants), - }; - const auto pipelineLayout = m_device->createPipelineLayout({ &pcRange, 1 }, 
smart_refctd_ptr(descriptorSetLayout), nullptr, nullptr, nullptr); - - IGPURayTracingPipeline::SCreationParams params = {}; - - enum RtDemoShader - { - RTDS_RAYGEN, - RTDS_MISS, - RTDS_MISS_SHADOW, - RTDS_CLOSEST_HIT, - RTDS_SPHERE_CLOSEST_HIT, - RTDS_ANYHIT_PRIMARY, - RTDS_ANYHIT_SHADOW, - RTDS_INTERSECTION, - RTDS_DIRECTIONAL_CALL, - RTDS_POINT_CALL, - RTDS_SPOT_CALL, - RTDS_COUNT - }; - - IPipelineBase::SShaderSpecInfo shaders[RTDS_COUNT]; - shaders[RTDS_RAYGEN] = {.shader = raygenShader.get(), .entryPoint = "main", .stage = ESS_RAYGEN}; - shaders[RTDS_MISS] = {.shader = missShader.get(), .entryPoint = "main", .stage = ESS_MISS}; - shaders[RTDS_MISS_SHADOW] = { .shader = missShadowShader.get(), .entryPoint = "main", .stage = ESS_MISS}; - shaders[RTDS_CLOSEST_HIT] = {.shader = closestHitShader.get(), .entryPoint = "main", .stage = ESS_CLOSEST_HIT}; - shaders[RTDS_SPHERE_CLOSEST_HIT] = {.shader = proceduralClosestHitShader.get(), .entryPoint = "main", .stage = ESS_CLOSEST_HIT}; - shaders[RTDS_ANYHIT_PRIMARY] = {.shader = anyHitShaderColorPayload.get(), .entryPoint = "main", .stage = ESS_ANY_HIT}; - shaders[RTDS_ANYHIT_SHADOW] = {.shader = anyHitShaderShadowPayload.get(), .entryPoint = "main", .stage = ESS_ANY_HIT}; - shaders[RTDS_INTERSECTION] = {.shader = intersectionHitShader.get(), .entryPoint = "main", .stage = ESS_INTERSECTION }; - shaders[RTDS_DIRECTIONAL_CALL] = {.shader = directionalLightCallShader.get(), .entryPoint = "main", .stage = ESS_CALLABLE}; - shaders[RTDS_POINT_CALL] = {.shader = pointLightCallShader.get(), .entryPoint = "main", .stage = ESS_CALLABLE}; - shaders[RTDS_SPOT_CALL] = {.shader = spotLightCallShader.get(), .entryPoint = "main", .stage = ESS_CALLABLE}; - - params.layout = pipelineLayout.get(); - params.shaders = std::span(shaders); - using RayTracingFlags = IGPURayTracingPipeline::SCreationParams::FLAGS; - params.flags = core::bitflag(RayTracingFlags::NO_NULL_MISS_SHADERS) | - RayTracingFlags::NO_NULL_INTERSECTION_SHADERS | - 
RayTracingFlags::NO_NULL_ANY_HIT_SHADERS; - - auto& shaderGroups = params.shaderGroups; - - shaderGroups.raygen = { .index = RTDS_RAYGEN }; - - IRayTracingPipelineBase::SGeneralShaderGroup missGroups[EMT_COUNT]; - missGroups[EMT_PRIMARY] = { .index = RTDS_MISS }; - missGroups[EMT_OCCLUSION] = { .index = RTDS_MISS_SHADOW }; - shaderGroups.misses = missGroups; - - auto getHitGroupIndex = [](E_GEOM_TYPE geomType, E_RAY_TYPE rayType) - { - return geomType * ERT_COUNT + rayType; - }; - IRayTracingPipelineBase::SHitShaderGroup hitGroups[E_RAY_TYPE::ERT_COUNT * E_GEOM_TYPE::EGT_COUNT]; - hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_PRIMARY)] = { - .closestHit = RTDS_CLOSEST_HIT, - .anyHit = RTDS_ANYHIT_PRIMARY, - }; - hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_OCCLUSION)] = { - .closestHit = IGPURayTracingPipeline::SGeneralShaderGroup::Unused, - .anyHit = RTDS_ANYHIT_SHADOW, - }; - hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_PRIMARY)] = { - .closestHit = RTDS_SPHERE_CLOSEST_HIT, - .anyHit = RTDS_ANYHIT_PRIMARY, - .intersection = RTDS_INTERSECTION, - }; - hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_OCCLUSION)] = { - .closestHit = IGPURayTracingPipeline::SGeneralShaderGroup::Unused, - .anyHit = RTDS_ANYHIT_SHADOW, - .intersection = RTDS_INTERSECTION, - }; - shaderGroups.hits = hitGroups; - - IRayTracingPipelineBase::SGeneralShaderGroup callableGroups[ELT_COUNT]; - callableGroups[ELT_DIRECTIONAL] = { .index = RTDS_DIRECTIONAL_CALL }; - callableGroups[ELT_POINT] = { .index = RTDS_POINT_CALL }; - callableGroups[ELT_SPOT] = { .index = RTDS_SPOT_CALL }; - shaderGroups.callables = callableGroups; - - params.cached.maxRecursionDepth = 1; - params.cached.dynamicStackSize = true; - - if (!m_device->createRayTracingPipelines(nullptr, { ¶ms, 1 }, &m_rayTracingPipeline)) - return logFail("Failed to create ray tracing pipeline"); - - calculateRayTracingStackSize(m_rayTracingPipeline); - - if (!createShaderBindingTable(gQueue, m_rayTracingPipeline)) - return logFail("Could not 
create shader binding table"); - - } - - auto assetManager = make_smart_refctd_ptr(smart_refctd_ptr(system)); - auto* geometryCreator = assetManager->getGeometryCreator(); - - if (!createIndirectBuffer(gQueue)) - return logFail("Could not create indirect buffer"); - - // create geometry objects - if (!createGeometries(gQueue, geometryCreator)) - return logFail("Could not create geometries from geometry creator"); - - if (!createAccelerationStructures(getComputeQueue())) - return logFail("Could not create acceleration structures"); - - ISampler::SParams samplerParams = { - .AnisotropicFilter = 0 - }; - auto defaultSampler = m_device->createSampler(samplerParams); - - { - const IGPUDescriptorSetLayout::SBinding bindings[] = { - { - .binding = 0u, - .type = nbl::asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER, - .createFlags = ICPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, - .stageFlags = IShader::E_SHADER_STAGE::ESS_FRAGMENT, - .count = 1u, - .immutableSamplers = &defaultSampler - } - }; - auto gpuPresentDescriptorSetLayout = m_device->createDescriptorSetLayout(bindings); - const video::IGPUDescriptorSetLayout* const layouts[] = { gpuPresentDescriptorSetLayout.get() }; - const uint32_t setCounts[] = { 1u }; - m_presentDsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_NONE, layouts, setCounts); - m_presentDs = m_presentDsPool->createDescriptorSet(gpuPresentDescriptorSetLayout); - - auto scRes = static_cast(m_surface->getSwapchainResources()); - ext::FullScreenTriangle::ProtoPipeline fsTriProtoPPln(m_assetMgr.get(), m_device.get(), m_logger.get()); - if (!fsTriProtoPPln) - return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!"); - - const IPipelineBase::SShaderSpecInfo fragSpec = { - .shader = fragmentShader.get(), - .entryPoint = "main", - .stage = ESS_FRAGMENT, - }; - - auto presentLayout = m_device->createPipelineLayout( - {}, - 
core::smart_refctd_ptr(gpuPresentDescriptorSetLayout), - nullptr, - nullptr, - nullptr - ); - m_presentPipeline = fsTriProtoPPln.createPipeline(fragSpec, presentLayout.get(), scRes->getRenderpass()); - if (!m_presentPipeline) - return logFail("Could not create Graphics Pipeline!"); - } - - // write descriptors - IGPUDescriptorSet::SDescriptorInfo infos[3]; - infos[0].desc = m_gpuTlas; - - infos[1].desc = m_hdrImageView; - if (!infos[1].desc) - return logFail("Failed to create image view"); - infos[1].info.image.imageLayout = IImage::LAYOUT::GENERAL; - - infos[2].desc = m_hdrImageView; - infos[2].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL; - - IGPUDescriptorSet::SWriteDescriptorSet writes[] = { - {.dstSet = m_rayTracingDs.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &infos[0]}, - {.dstSet = m_rayTracingDs.get(), .binding = 1, .arrayElement = 0, .count = 1, .info = &infos[1]}, - {.dstSet = m_presentDs.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &infos[2] }, - }; - m_device->updateDescriptorSets(std::span(writes), {}); - - // gui descriptor setup - { - using binding_flags_t = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS; - { - IGPUSampler::SParams params; - params.AnisotropicFilter = 1u; - params.TextureWrapU = ETC_REPEAT; - params.TextureWrapV = ETC_REPEAT; - params.TextureWrapW = ETC_REPEAT; - - m_ui.samplers.gui = m_device->createSampler(params); - m_ui.samplers.gui->setObjectDebugName("Nabla IMGUI UI Sampler"); - } - - std::array, 69u> immutableSamplers; - for (auto& it : immutableSamplers) - it = smart_refctd_ptr(m_ui.samplers.scene); - - immutableSamplers[nbl::ext::imgui::UI::FontAtlasTexId] = smart_refctd_ptr(m_ui.samplers.gui); - - nbl::ext::imgui::UI::SCreationParameters params; - - params.resources.texturesInfo = { .setIx = 0u, .bindingIx = 0u }; - params.resources.samplersInfo = { .setIx = 0u, .bindingIx = 1u }; - params.assetManager = m_assetMgr; - params.pipelineCache = nullptr; - params.pipelineLayout = 
nbl::ext::imgui::UI::createDefaultPipelineLayout(m_utils->getLogicalDevice(), params.resources.texturesInfo, params.resources.samplersInfo, MaxUITextureCount); - params.renderpass = smart_refctd_ptr(renderpass); - params.streamingBuffer = nullptr; - params.subpassIx = 0u; - params.transfer = getTransferUpQueue(); - params.utilities = m_utils; - { - m_ui.manager = ext::imgui::UI::create(std::move(params)); - - // note that we use default layout provided by our extension, but you are free to create your own by filling nbl::ext::imgui::UI::S_CREATION_PARAMETERS::resources - const auto* descriptorSetLayout = m_ui.manager->getPipeline()->getLayout()->getDescriptorSetLayout(0u); - const auto& params = m_ui.manager->getCreationParameters(); - - IDescriptorPool::SCreateInfo descriptorPoolInfo = {}; - descriptorPoolInfo.maxDescriptorCount[static_cast(asset::IDescriptor::E_TYPE::ET_SAMPLER)] = (uint32_t)nbl::ext::imgui::UI::DefaultSamplerIx::COUNT; - descriptorPoolInfo.maxDescriptorCount[static_cast(asset::IDescriptor::E_TYPE::ET_SAMPLED_IMAGE)] = MaxUITextureCount; - descriptorPoolInfo.maxSets = 1u; - descriptorPoolInfo.flags = IDescriptorPool::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT; - - m_guiDescriptorSetPool = m_device->createDescriptorPool(std::move(descriptorPoolInfo)); - assert(m_guiDescriptorSetPool); - - m_guiDescriptorSetPool->createDescriptorSets(1u, &descriptorSetLayout, &m_ui.descriptorSet); - assert(m_ui.descriptorSet); - } - } - - m_ui.manager->registerListener( - [this]() -> void { - ImGuiIO& io = ImGui::GetIO(); - - m_camera.setProjectionMatrix([&]() - { - static matrix4SIMD projection; - - projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH( - core::radians(m_cameraSetting.fov), - io.DisplaySize.x / io.DisplaySize.y, - m_cameraSetting.zNear, - m_cameraSetting.zFar); - - return projection; - }()); - - ImGui::SetNextWindowPos(ImVec2(1024, 100), ImGuiCond_Appearing); - ImGui::SetNextWindowSize(ImVec2(256, 256), ImGuiCond_Appearing); - - // create 
a window and insert the inspector - ImGui::SetNextWindowPos(ImVec2(10, 10), ImGuiCond_Appearing); - ImGui::SetNextWindowSize(ImVec2(320, 340), ImGuiCond_Appearing); - ImGui::Begin("Controls"); - - ImGui::SameLine(); - - ImGui::Text("Camera"); - - ImGui::SliderFloat("Move speed", &m_cameraSetting.moveSpeed, 0.1f, 10.f); - ImGui::SliderFloat("Rotate speed", &m_cameraSetting.rotateSpeed, 0.1f, 10.f); - ImGui::SliderFloat("Fov", &m_cameraSetting.fov, 20.f, 150.f); - ImGui::SliderFloat("zNear", &m_cameraSetting.zNear, 0.1f, 100.f); - ImGui::SliderFloat("zFar", &m_cameraSetting.zFar, 110.f, 10000.f); - Light m_oldLight = m_light; - int light_type = m_light.type; - ImGui::ListBox("LightType", &light_type, s_lightTypeNames, ELT_COUNT); - m_light.type = static_cast(light_type); - if (m_light.type == ELT_DIRECTIONAL) - { - ImGui::SliderFloat3("Light Direction", &m_light.direction.x, -1.f, 1.f); - } else if (m_light.type == ELT_POINT) - { - ImGui::SliderFloat3("Light Position", &m_light.position.x, -20.f, 20.f); - } else if (m_light.type == ELT_SPOT) - { - ImGui::SliderFloat3("Light Direction", &m_light.direction.x, -1.f, 1.f); - ImGui::SliderFloat3("Light Position", &m_light.position.x, -20.f, 20.f); - - float32_t dOuterCutoff = hlsl::degrees(acos(m_light.outerCutoff)); - if (ImGui::SliderFloat("Light Outer Cutoff", &dOuterCutoff, 0.0f, 45.0f)) - { - m_light.outerCutoff = cos(hlsl::radians(dOuterCutoff)); - } - } - ImGui::Checkbox("Use Indirect Command", &m_useIndirectCommand); - if (m_light != m_oldLight) - { - m_frameAccumulationCounter = 0; - } - - ImGui::Text("X: %f Y: %f", io.MousePos.x, io.MousePos.y); - - ImGui::End(); - } - ); - - // Set Camera - { - core::vectorSIMDf cameraPosition(0, 5, -10); - matrix4SIMD proj = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH( - core::radians(60.0f), - WIN_W / WIN_H, - 0.01f, - 500.0f - ); - m_camera = Camera(cameraPosition, core::vectorSIMDf(0, 0, 0), proj); - } - - m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H); - 
m_surface->recreateSwapchain(); - m_winMgr->show(m_window.get()); - m_oracle.reportBeginFrameRecord(); - m_camera.mapKeysToWASD(); - - return true; - } - - bool updateGUIDescriptorSet() - { - // texture atlas, note we don't create info & write pair for the font sampler because UI extension's is immutable and baked into DS layout - static std::array descriptorInfo; - static IGPUDescriptorSet::SWriteDescriptorSet writes[MaxUITextureCount]; - - descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL; - descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].desc = smart_refctd_ptr(m_ui.manager->getFontAtlasView()); - - for (uint32_t i = 0; i < descriptorInfo.size(); ++i) - { - writes[i].dstSet = m_ui.descriptorSet.get(); - writes[i].binding = 0u; - writes[i].arrayElement = i; - writes[i].count = 1u; - } - writes[nbl::ext::imgui::UI::FontAtlasTexId].info = descriptorInfo.data() + nbl::ext::imgui::UI::FontAtlasTexId; - - return m_device->updateDescriptorSets(writes, {}); - } - - inline void workLoopBody() override - { - // framesInFlight: ensuring safe execution of command buffers and acquires, `framesInFlight` only affect semaphore waits, don't use this to index your resources because it can change with swapchain recreation. - const uint32_t framesInFlight = core::min(MaxFramesInFlight, m_surface->getMaxAcquiresInFlight()); - // We block for semaphores for 2 reasons here: - // A) Resource: Can't use resource like a command buffer BEFORE previous use is finished! [MaxFramesInFlight] - // B) Acquire: Can't have more acquires in flight than a certain threshold returned by swapchain or your surface helper class. 
[MaxAcquiresInFlight] - if (m_realFrameIx >= framesInFlight) - { - const ISemaphore::SWaitInfo cbDonePending[] = - { - { - .semaphore = m_semaphore.get(), - .value = m_realFrameIx + 1 - framesInFlight - } - }; - if (m_device->blockForSemaphores(cbDonePending) != ISemaphore::WAIT_RESULT::SUCCESS) - return; - } - const auto resourceIx = m_realFrameIx % MaxFramesInFlight; - - m_api->startCapture(); - - update(); - - auto queue = getGraphicsQueue(); - auto cmdbuf = m_cmdBufs[resourceIx].get(); - - if (!keepRunning()) - return; - - cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); - cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - cmdbuf->beginDebugMarker("RaytracingPipelineApp Frame"); - - const auto viewMatrix = m_camera.getViewMatrix(); - const auto projectionMatrix = m_camera.getProjectionMatrix(); - const auto viewProjectionMatrix = m_camera.getConcatenatedMatrix(); - - core::matrix3x4SIMD modelMatrix; - modelMatrix.setTranslation(nbl::core::vectorSIMDf(0, 0, 0, 0)); - modelMatrix.setRotation(quaternion(0, 0, 0)); - - core::matrix4SIMD modelViewProjectionMatrix = core::concatenateBFollowedByA(viewProjectionMatrix, modelMatrix); - if (m_cachedModelViewProjectionMatrix != modelViewProjectionMatrix) - { - m_frameAccumulationCounter = 0; - m_cachedModelViewProjectionMatrix = modelViewProjectionMatrix; - } - core::matrix4SIMD invModelViewProjectionMatrix; - modelViewProjectionMatrix.getInverseTransform(invModelViewProjectionMatrix); - - { - IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1]; - imageBarriers[0].barrier = { - .dep = { - .srcStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT, // previous frame read from framgent shader - .srcAccessMask = ACCESS_FLAGS::SHADER_READ_BITS, - .dstStageMask = PIPELINE_STAGE_FLAGS::RAY_TRACING_SHADER_BIT, - .dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS - } - }; - imageBarriers[0].image = m_hdrImage.get(); - imageBarriers[0].subresourceRange = { - 
.aspectMask = IImage::EAF_COLOR_BIT, - .baseMipLevel = 0u, - .levelCount = 1u, - .baseArrayLayer = 0u, - .layerCount = 1u - }; - imageBarriers[0].oldLayout = m_frameAccumulationCounter == 0 ? IImage::LAYOUT::UNDEFINED : IImage::LAYOUT::READ_ONLY_OPTIMAL; - imageBarriers[0].newLayout = IImage::LAYOUT::GENERAL; - cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers }); - } - - // Trace Rays Pass - { - SPushConstants pc; - pc.light = m_light; - pc.proceduralGeomInfoBuffer = m_proceduralGeomInfoBuffer->getDeviceAddress(); - pc.triangleGeomInfoBuffer = m_triangleGeomInfoBuffer->getDeviceAddress(); - pc.frameCounter = m_frameAccumulationCounter; - const core::vector3df camPos = m_camera.getPosition().getAsVector3df(); - pc.camPos = { camPos.X, camPos.Y, camPos.Z }; - memcpy(&pc.invMVP, invModelViewProjectionMatrix.pointer(), sizeof(pc.invMVP)); - - cmdbuf->bindRayTracingPipeline(m_rayTracingPipeline.get()); - cmdbuf->setRayTracingPipelineStackSize(m_rayTracingStackSize); - cmdbuf->pushConstants(m_rayTracingPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING, 0, sizeof(SPushConstants), &pc); - cmdbuf->bindDescriptorSets(EPBP_RAY_TRACING, m_rayTracingPipeline->getLayout(), 0, 1, &m_rayTracingDs.get()); - if (m_useIndirectCommand) - { - cmdbuf->traceRaysIndirect( - SBufferBinding{ - .offset = 0, - .buffer = m_indirectBuffer, - }); - }else - { - cmdbuf->traceRays( - m_shaderBindingTable.raygenGroupRange, - m_shaderBindingTable.missGroupsRange, m_shaderBindingTable.missGroupsStride, - m_shaderBindingTable.hitGroupsRange, m_shaderBindingTable.hitGroupsStride, - m_shaderBindingTable.callableGroupsRange, m_shaderBindingTable.callableGroupsStride, - WIN_W, WIN_H, 1); - } - } - - // pipeline barrier - { - IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1]; - imageBarriers[0].barrier = { - .dep = { - .srcStageMask = PIPELINE_STAGE_FLAGS::RAY_TRACING_SHADER_BIT, - .srcAccessMask = 
ACCESS_FLAGS::SHADER_WRITE_BITS, - .dstStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, - .dstAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT - } - }; - imageBarriers[0].image = m_hdrImage.get(); - imageBarriers[0].subresourceRange = { - .aspectMask = IImage::EAF_COLOR_BIT, - .baseMipLevel = 0u, - .levelCount = 1u, - .baseArrayLayer = 0u, - .layerCount = 1u - }; - imageBarriers[0].oldLayout = IImage::LAYOUT::GENERAL; - imageBarriers[0].newLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL; - - cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers }); - } - - { + inline RaytracingPipelineApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) + : IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) + { + } + + inline SPhysicalDeviceFeatures getRequiredDeviceFeatures() const override + { + auto retval = device_base_t::getRequiredDeviceFeatures(); + retval.rayTracingPipeline = true; + retval.accelerationStructure = true; + retval.rayQuery = true; + return retval; + } + + inline SPhysicalDeviceFeatures getPreferredDeviceFeatures() const override + { + auto retval = device_base_t::getPreferredDeviceFeatures(); + retval.accelerationStructureHostCommands = true; + return retval; + } + + inline core::vector getSurfaces() const override + { + if (!m_surface) + { + { + auto windowCallback = core::make_smart_refctd_ptr(smart_refctd_ptr(m_inputSystem), smart_refctd_ptr(m_logger)); + IWindow::SCreationParams params = {}; + params.callback = core::make_smart_refctd_ptr(); + params.width = WIN_W; + params.height = WIN_H; + params.x = 32; + params.y = 32; + params.flags = ui::IWindow::ECF_HIDDEN | IWindow::ECF_BORDERLESS | IWindow::ECF_RESIZABLE; + params.windowCaption = "RaytracingPipelineApp"; + params.callback = windowCallback; + const_cast&>(m_window) = m_winMgr->createWindow(std::move(params)); + } + + auto surface = 
CSurfaceVulkanWin32::create(smart_refctd_ptr(m_api), smart_refctd_ptr_static_cast(m_window)); + const_cast&>(m_surface) = CSimpleResizeSurface::create(std::move(surface)); + } + + if (m_surface) + return { {m_surface->getSurface()/*,EQF_NONE*/} }; + + return {}; + } + + // so that we can use the same queue for asset converter and rendering + inline core::vector getQueueRequirements() const override + { + auto reqs = device_base_t::getQueueRequirements(); + reqs.front().requiredFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; + return reqs; + } + + inline bool onAppInitialized(smart_refctd_ptr&& system) override + { + m_inputSystem = make_smart_refctd_ptr(logger_opt_smart_ptr(smart_refctd_ptr(m_logger))); + + if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + + if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + + smart_refctd_ptr shaderReadCache = nullptr; + smart_refctd_ptr shaderWriteCache = core::make_smart_refctd_ptr(); + auto shaderCachePath = localOutputCWD / "main_pipeline_shader_cache.bin"; + + { + core::smart_refctd_ptr shaderReadCacheFile; + { + system::ISystem::future_t> future; + m_system->createFile(future, shaderCachePath.c_str(), system::IFile::ECF_READ); + if (future.wait()) + { + future.acquire().move_into(shaderReadCacheFile); + if (shaderReadCacheFile) + { + const size_t size = shaderReadCacheFile->getSize(); + if (size > 0ull) + { + std::vector contents(size); + system::IFile::success_t succ; + shaderReadCacheFile->read(succ, contents.data(), 0, size); + if (succ) + shaderReadCache = IShaderCompiler::CCache::deserialize(contents); + } + } + } + else + m_logger->log("Failed Openning Shader Cache File.", ILogger::ELL_ERROR); + } + + } + + // Load Custom Shader + auto loadCompileAndCreateShader = [&](const std::string& relPath) -> smart_refctd_ptr + { + IAssetLoader::SAssetLoadParams lp = {}; + lp.logger = m_logger.get(); + lp.workingDirectory = ""; // virtual root + auto assetBundle = 
m_assetMgr->getAsset(relPath, lp); + const auto assets = assetBundle.getContents(); + if (assets.empty()) + return nullptr; + + // lets go straight from ICPUSpecializedShader to IGPUSpecializedShader + auto sourceRaw = IAsset::castDown(assets[0]); + if (!sourceRaw) + return nullptr; + + return m_device->createShader({ sourceRaw.get(), nullptr, shaderReadCache.get(), shaderWriteCache.get() }); + }; + + // load shaders + const auto raygenShader = loadCompileAndCreateShader("app_resources/raytrace.rgen.hlsl"); + const auto closestHitShader = loadCompileAndCreateShader("app_resources/raytrace.rchit.hlsl"); + const auto proceduralClosestHitShader = loadCompileAndCreateShader("app_resources/raytrace_procedural.rchit.hlsl"); + const auto intersectionHitShader = loadCompileAndCreateShader("app_resources/raytrace.rint.hlsl"); + const auto anyHitShaderColorPayload = loadCompileAndCreateShader("app_resources/raytrace.rahit.hlsl"); + const auto anyHitShaderShadowPayload = loadCompileAndCreateShader("app_resources/raytrace_shadow.rahit.hlsl"); + const auto missShader = loadCompileAndCreateShader("app_resources/raytrace.rmiss.hlsl"); + const auto missShadowShader = loadCompileAndCreateShader("app_resources/raytrace_shadow.rmiss.hlsl"); + const auto directionalLightCallShader = loadCompileAndCreateShader("app_resources/light_directional.rcall.hlsl"); + const auto pointLightCallShader = loadCompileAndCreateShader("app_resources/light_point.rcall.hlsl"); + const auto spotLightCallShader = loadCompileAndCreateShader("app_resources/light_spot.rcall.hlsl"); + const auto fragmentShader = loadCompileAndCreateShader("app_resources/present.frag.hlsl"); + + core::smart_refctd_ptr shaderWriteCacheFile; + { + system::ISystem::future_t> future; + m_system->deleteFile(shaderCachePath); // temp solution instead of trimming, to make sure we won't have corrupted json + m_system->createFile(future, shaderCachePath.c_str(), system::IFile::ECF_WRITE); + if (future.wait()) + { + 
future.acquire().move_into(shaderWriteCacheFile); + if (shaderWriteCacheFile) + { + auto serializedCache = shaderWriteCache->serialize(); + if (shaderWriteCacheFile) + { + system::IFile::success_t succ; + shaderWriteCacheFile->write(succ, serializedCache->getPointer(), 0, serializedCache->getSize()); + if (!succ) + m_logger->log("Failed Writing To Shader Cache File.", ILogger::ELL_ERROR); + } + } + else + m_logger->log("Failed Creating Shader Cache File.", ILogger::ELL_ERROR); + } + else + m_logger->log("Failed Creating Shader Cache File.", ILogger::ELL_ERROR); + } + + m_semaphore = m_device->createSemaphore(m_realFrameIx); + if (!m_semaphore) + return logFail("Failed to Create a Semaphore!"); + + auto gQueue = getGraphicsQueue(); + + // Create renderpass and init surface + nbl::video::IGPURenderpass* renderpass; + { + ISwapchain::SCreationParams swapchainParams = { .surface = smart_refctd_ptr(m_surface->getSurface()) }; + if (!swapchainParams.deduceFormat(m_physicalDevice)) + return logFail("Could not choose a Surface Format for the Swapchain!"); + + const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] = + { + { + .srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, + .dstSubpass = 0, + .memoryBarrier = + { + .srcStageMask = asset::PIPELINE_STAGE_FLAGS::COPY_BIT, + .srcAccessMask = asset::ACCESS_FLAGS::TRANSFER_WRITE_BIT, + .dstStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, + .dstAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT + } + }, + { + .srcSubpass = 0, + .dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, + .memoryBarrier = + { + .srcStageMask = asset::PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, + .srcAccessMask = asset::ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT + } + }, + IGPURenderpass::SCreationParams::DependenciesEnd + }; + + auto scResources = std::make_unique(m_device.get(), swapchainParams.surfaceFormat.format, dependencies); + 
renderpass = scResources->getRenderpass(); + + if (!renderpass) + return logFail("Failed to create Renderpass!"); + + if (!m_surface || !m_surface->init(gQueue, std::move(scResources), swapchainParams.sharedParams)) + return logFail("Could not create Window & Surface or initialize the Surface!"); + } + + auto pool = m_device->createCommandPool(gQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + + m_converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} }); + + for (auto i = 0u; i < MaxFramesInFlight; i++) + { + if (!pool) + return logFail("Couldn't create Command Pool!"); + if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { m_cmdBufs.data() + i, 1 })) + return logFail("Couldn't create Command Buffer!"); + } + + m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H); + m_surface->recreateSwapchain(); + + + // create output images + m_hdrImage = m_device->createImage({ + { + .type = IGPUImage::ET_2D, + .samples = ICPUImage::ESCF_1_BIT, + .format = EF_R16G16B16A16_SFLOAT, + .extent = {WIN_W, WIN_H, 1}, + .mipLevels = 1, + .arrayLayers = 1, + .flags = IImage::ECF_NONE, + .usage = bitflag(IImage::EUF_STORAGE_BIT) | IImage::EUF_TRANSFER_SRC_BIT | IImage::EUF_SAMPLED_BIT + } + }); + + if (!m_hdrImage || !m_device->allocate(m_hdrImage->getMemoryReqs(), m_hdrImage.get()).isValid()) + return logFail("Could not create HDR Image"); + + m_hdrImageView = m_device->createImageView({ + .flags = IGPUImageView::ECF_NONE, + .subUsages = IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT | IGPUImage::E_USAGE_FLAGS::EUF_SAMPLED_BIT, + .image = m_hdrImage, + .viewType = IGPUImageView::E_TYPE::ET_2D, + .format = asset::EF_R16G16B16A16_SFLOAT + }); + + + + // ray trace pipeline and descriptor set layout setup + { + const IGPUDescriptorSetLayout::SBinding bindings[] = { + { + .binding = 0, + .type = asset::IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE, + .createFlags = 
IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_RAYGEN, + .count = 1, + }, + { + .binding = 1, + .type = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE, + .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_RAYGEN, + .count = 1, + } + }; + const auto descriptorSetLayout = m_device->createDescriptorSetLayout(bindings); + + const std::array dsLayoutPtrs = { descriptorSetLayout.get() }; + m_rayTracingDsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, std::span(dsLayoutPtrs.begin(), dsLayoutPtrs.end())); + m_rayTracingDs = m_rayTracingDsPool->createDescriptorSet(descriptorSetLayout); + + const SPushConstantRange pcRange = { + .stageFlags = IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING, + .offset = 0u, + .size = sizeof(SPushConstants), + }; + const auto pipelineLayout = m_device->createPipelineLayout({ &pcRange, 1 }, smart_refctd_ptr(descriptorSetLayout), nullptr, nullptr, nullptr); + + IGPURayTracingPipeline::SCreationParams params = {}; + + enum RtDemoShader + { + RTDS_RAYGEN, + RTDS_MISS, + RTDS_MISS_SHADOW, + RTDS_CLOSEST_HIT, + RTDS_SPHERE_CLOSEST_HIT, + RTDS_ANYHIT_PRIMARY, + RTDS_ANYHIT_SHADOW, + RTDS_INTERSECTION, + RTDS_DIRECTIONAL_CALL, + RTDS_POINT_CALL, + RTDS_SPOT_CALL, + RTDS_COUNT + }; + + IGPUShader::SSpecInfo shaders[RTDS_COUNT]; + shaders[RTDS_RAYGEN] = { .shader = raygenShader.get() }; + shaders[RTDS_MISS] = { .shader = missShader.get() }; + shaders[RTDS_MISS_SHADOW] = { .shader = missShadowShader.get() }; + shaders[RTDS_CLOSEST_HIT] = { .shader = closestHitShader.get() }; + shaders[RTDS_SPHERE_CLOSEST_HIT] = { .shader = proceduralClosestHitShader.get() }; + shaders[RTDS_ANYHIT_PRIMARY] = { .shader = anyHitShaderColorPayload.get() }; + shaders[RTDS_ANYHIT_SHADOW] = { .shader = anyHitShaderShadowPayload.get() }; + shaders[RTDS_INTERSECTION] = { .shader = 
intersectionHitShader.get() }; + shaders[RTDS_DIRECTIONAL_CALL] = { .shader = directionalLightCallShader.get() }; + shaders[RTDS_POINT_CALL] = { .shader = pointLightCallShader.get() }; + shaders[RTDS_SPOT_CALL] = { .shader = spotLightCallShader.get() }; + + params.layout = pipelineLayout.get(); + params.shaders = std::span(shaders); + using RayTracingFlags = IGPURayTracingPipeline::SCreationParams::FLAGS; + params.flags = core::bitflag(RayTracingFlags::NO_NULL_MISS_SHADERS) | + RayTracingFlags::NO_NULL_INTERSECTION_SHADERS | + RayTracingFlags::NO_NULL_ANY_HIT_SHADERS; + + auto& shaderGroups = params.shaderGroups; + + shaderGroups.raygen = { .index = RTDS_RAYGEN }; + + IRayTracingPipelineBase::SGeneralShaderGroup missGroups[EMT_COUNT]; + missGroups[EMT_PRIMARY] = { .index = RTDS_MISS }; + missGroups[EMT_OCCLUSION] = { .index = RTDS_MISS_SHADOW }; + shaderGroups.misses = missGroups; + + auto getHitGroupIndex = [](E_GEOM_TYPE geomType, E_RAY_TYPE rayType) + { + return geomType * ERT_COUNT + rayType; + }; + IRayTracingPipelineBase::SHitShaderGroup hitGroups[E_RAY_TYPE::ERT_COUNT * E_GEOM_TYPE::EGT_COUNT]; + hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_PRIMARY)] = { + .closestHit = RTDS_CLOSEST_HIT, + .anyHit = RTDS_ANYHIT_PRIMARY, + }; + hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_OCCLUSION)] = { + .closestHit = IGPURayTracingPipeline::SGeneralShaderGroup::Unused, + .anyHit = RTDS_ANYHIT_SHADOW, + }; + hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_PRIMARY)] = { + .closestHit = RTDS_SPHERE_CLOSEST_HIT, + .anyHit = RTDS_ANYHIT_PRIMARY, + .intersection = RTDS_INTERSECTION, + }; + hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_OCCLUSION)] = { + .closestHit = IGPURayTracingPipeline::SGeneralShaderGroup::Unused, + .anyHit = RTDS_ANYHIT_SHADOW, + .intersection = RTDS_INTERSECTION, + }; + shaderGroups.hits = hitGroups; + + IRayTracingPipelineBase::SGeneralShaderGroup callableGroups[ELT_COUNT]; + callableGroups[ELT_DIRECTIONAL] = { .index = RTDS_DIRECTIONAL_CALL }; + 
callableGroups[ELT_POINT] = { .index = RTDS_POINT_CALL }; + callableGroups[ELT_SPOT] = { .index = RTDS_SPOT_CALL }; + shaderGroups.callables = callableGroups; + + params.cached.maxRecursionDepth = 1; + params.cached.dynamicStackSize = true; + + if (!m_device->createRayTracingPipelines(nullptr, { ¶ms, 1 }, &m_rayTracingPipeline)) + return logFail("Failed to create ray tracing pipeline"); + + calculateRayTracingStackSize(m_rayTracingPipeline); + + if (!createShaderBindingTable(m_rayTracingPipeline)) + return logFail("Could not create shader binding table"); + + } + + auto assetManager = make_smart_refctd_ptr(smart_refctd_ptr(system)); + auto* geometryCreator = assetManager->getGeometryCreator(); + + if (!createIndirectBuffer()) + return logFail("Could not create indirect buffer"); + + if (!createAccelerationStructuresFromGeometry(geometryCreator)) + return logFail("Could not create acceleration structures from geometry creator"); + + ISampler::SParams samplerParams = { + .AnisotropicFilter = 0 + }; + auto defaultSampler = m_device->createSampler(samplerParams); + + { + const IGPUDescriptorSetLayout::SBinding bindings[] = { + { + .binding = 0u, + .type = nbl::asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER, + .createFlags = ICPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = IShader::E_SHADER_STAGE::ESS_FRAGMENT, + .count = 1u, + .immutableSamplers = &defaultSampler + } + }; + auto gpuPresentDescriptorSetLayout = m_device->createDescriptorSetLayout(bindings); + const video::IGPUDescriptorSetLayout* const layouts[] = { gpuPresentDescriptorSetLayout.get() }; + const uint32_t setCounts[] = { 1u }; + m_presentDsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_NONE, layouts, setCounts); + m_presentDs = m_presentDsPool->createDescriptorSet(gpuPresentDescriptorSetLayout); + + auto scRes = static_cast(m_surface->getSwapchainResources()); + ext::FullScreenTriangle::ProtoPipeline 
fsTriProtoPPln(m_assetMgr.get(), m_device.get(), m_logger.get()); + if (!fsTriProtoPPln) + return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!"); + + const IGPUShader::SSpecInfo fragSpec = { + .entryPoint = "main", + .shader = fragmentShader.get() + }; + + auto presentLayout = m_device->createPipelineLayout( + {}, + core::smart_refctd_ptr(gpuPresentDescriptorSetLayout), + nullptr, + nullptr, + nullptr + ); + m_presentPipeline = fsTriProtoPPln.createPipeline(fragSpec, presentLayout.get(), scRes->getRenderpass()); + if (!m_presentPipeline) + return logFail("Could not create Graphics Pipeline!"); + } + + // write descriptors + IGPUDescriptorSet::SDescriptorInfo infos[3]; + infos[0].desc = m_gpuTlas; + + infos[1].desc = m_hdrImageView; + if (!infos[1].desc) + return logFail("Failed to create image view"); + infos[1].info.image.imageLayout = IImage::LAYOUT::GENERAL; + + infos[2].desc = m_hdrImageView; + infos[2].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL; + + IGPUDescriptorSet::SWriteDescriptorSet writes[] = { + {.dstSet = m_rayTracingDs.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &infos[0]}, + {.dstSet = m_rayTracingDs.get(), .binding = 1, .arrayElement = 0, .count = 1, .info = &infos[1]}, + {.dstSet = m_presentDs.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &infos[2] }, + }; + m_device->updateDescriptorSets(std::span(writes), {}); + + // gui descriptor setup + { + using binding_flags_t = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS; + { + IGPUSampler::SParams params; + params.AnisotropicFilter = 1u; + params.TextureWrapU = ETC_REPEAT; + params.TextureWrapV = ETC_REPEAT; + params.TextureWrapW = ETC_REPEAT; + + m_ui.samplers.gui = m_device->createSampler(params); + m_ui.samplers.gui->setObjectDebugName("Nabla IMGUI UI Sampler"); + } + + std::array, 69u> immutableSamplers; + for (auto& it : immutableSamplers) + it = smart_refctd_ptr(m_ui.samplers.scene); + + 
immutableSamplers[nbl::ext::imgui::UI::FontAtlasTexId] = smart_refctd_ptr(m_ui.samplers.gui); + + nbl::ext::imgui::UI::SCreationParameters params; + + params.resources.texturesInfo = { .setIx = 0u, .bindingIx = 0u }; + params.resources.samplersInfo = { .setIx = 0u, .bindingIx = 1u }; + params.assetManager = m_assetMgr; + params.pipelineCache = nullptr; + params.pipelineLayout = nbl::ext::imgui::UI::createDefaultPipelineLayout(m_utils->getLogicalDevice(), params.resources.texturesInfo, params.resources.samplersInfo, MaxUITextureCount); + params.renderpass = smart_refctd_ptr(renderpass); + params.streamingBuffer = nullptr; + params.subpassIx = 0u; + params.transfer = getGraphicsQueue(); + params.utilities = m_utils; + { + m_ui.manager = ext::imgui::UI::create(std::move(params)); + + // note that we use default layout provided by our extension, but you are free to create your own by filling nbl::ext::imgui::UI::S_CREATION_PARAMETERS::resources + const auto* descriptorSetLayout = m_ui.manager->getPipeline()->getLayout()->getDescriptorSetLayout(0u); + const auto& params = m_ui.manager->getCreationParameters(); + + IDescriptorPool::SCreateInfo descriptorPoolInfo = {}; + descriptorPoolInfo.maxDescriptorCount[static_cast(asset::IDescriptor::E_TYPE::ET_SAMPLER)] = (uint32_t)nbl::ext::imgui::UI::DefaultSamplerIx::COUNT; + descriptorPoolInfo.maxDescriptorCount[static_cast(asset::IDescriptor::E_TYPE::ET_SAMPLED_IMAGE)] = MaxUITextureCount; + descriptorPoolInfo.maxSets = 1u; + descriptorPoolInfo.flags = IDescriptorPool::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT; + + m_guiDescriptorSetPool = m_device->createDescriptorPool(std::move(descriptorPoolInfo)); + assert(m_guiDescriptorSetPool); + + m_guiDescriptorSetPool->createDescriptorSets(1u, &descriptorSetLayout, &m_ui.descriptorSet); + assert(m_ui.descriptorSet); + } + } + + m_ui.manager->registerListener( + [this]() -> void { + ImGuiIO& io = ImGui::GetIO(); + + m_camera.setProjectionMatrix([&]() + { + static matrix4SIMD 
projection; + + projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH( + core::radians(m_cameraSetting.fov), + io.DisplaySize.x / io.DisplaySize.y, + m_cameraSetting.zNear, + m_cameraSetting.zFar); + + return projection; + }()); + + ImGui::SetNextWindowPos(ImVec2(1024, 100), ImGuiCond_Appearing); + ImGui::SetNextWindowSize(ImVec2(256, 256), ImGuiCond_Appearing); + + // create a window and insert the inspector + ImGui::SetNextWindowPos(ImVec2(10, 10), ImGuiCond_Appearing); + ImGui::SetNextWindowSize(ImVec2(320, 340), ImGuiCond_Appearing); + ImGui::Begin("Controls"); + + ImGui::SameLine(); + + ImGui::Text("Camera"); + + ImGui::SliderFloat("Move speed", &m_cameraSetting.moveSpeed, 0.1f, 10.f); + ImGui::SliderFloat("Rotate speed", &m_cameraSetting.rotateSpeed, 0.1f, 10.f); + ImGui::SliderFloat("Fov", &m_cameraSetting.fov, 20.f, 150.f); + ImGui::SliderFloat("zNear", &m_cameraSetting.zNear, 0.1f, 100.f); + ImGui::SliderFloat("zFar", &m_cameraSetting.zFar, 110.f, 10000.f); + Light m_oldLight = m_light; + int light_type = m_light.type; + ImGui::ListBox("LightType", &light_type, s_lightTypeNames, ELT_COUNT); + m_light.type = static_cast(light_type); + if (m_light.type == ELT_DIRECTIONAL) + { + ImGui::SliderFloat3("Light Direction", &m_light.direction.x, -1.f, 1.f); + } + else if (m_light.type == ELT_POINT) + { + ImGui::SliderFloat3("Light Position", &m_light.position.x, -20.f, 20.f); + } + else if (m_light.type == ELT_SPOT) + { + ImGui::SliderFloat3("Light Direction", &m_light.direction.x, -1.f, 1.f); + ImGui::SliderFloat3("Light Position", &m_light.position.x, -20.f, 20.f); + + float32_t dOuterCutoff = hlsl::degrees(acos(m_light.outerCutoff)); + if (ImGui::SliderFloat("Light Outer Cutoff", &dOuterCutoff, 0.0f, 45.0f)) + { + m_light.outerCutoff = cos(hlsl::radians(dOuterCutoff)); + } + } + ImGui::Checkbox("Use Indirect Command", &m_useIndirectCommand); + if (m_light != m_oldLight) + { + m_frameAccumulationCounter = 0; + } + + ImGui::Text("X: %f Y: %f", 
io.MousePos.x, io.MousePos.y); + + ImGui::End(); + } + ); + + // Set Camera + { + core::vectorSIMDf cameraPosition(0, 5, -10); + matrix4SIMD proj = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH( + core::radians(60.0f), + WIN_W / WIN_H, + 0.01f, + 500.0f + ); + m_camera = Camera(cameraPosition, core::vectorSIMDf(0, 0, 0), proj); + } + + m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H); + m_surface->recreateSwapchain(); + m_winMgr->show(m_window.get()); + m_oracle.reportBeginFrameRecord(); + m_camera.mapKeysToWASD(); + + return true; + } + + bool updateGUIDescriptorSet() + { + // texture atlas, note we don't create info & write pair for the font sampler because UI extension's is immutable and baked into DS layout + static std::array descriptorInfo; + static IGPUDescriptorSet::SWriteDescriptorSet writes[MaxUITextureCount]; + + descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL; + descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].desc = smart_refctd_ptr(m_ui.manager->getFontAtlasView()); + + for (uint32_t i = 0; i < descriptorInfo.size(); ++i) + { + writes[i].dstSet = m_ui.descriptorSet.get(); + writes[i].binding = 0u; + writes[i].arrayElement = i; + writes[i].count = 1u; + } + writes[nbl::ext::imgui::UI::FontAtlasTexId].info = descriptorInfo.data() + nbl::ext::imgui::UI::FontAtlasTexId; + + return m_device->updateDescriptorSets(writes, {}); + } + + inline void workLoopBody() override + { + // framesInFlight: ensuring safe execution of command buffers and acquires, `framesInFlight` only affect semaphore waits, don't use this to index your resources because it can change with swapchain recreation. + const uint32_t framesInFlight = core::min(MaxFramesInFlight, m_surface->getMaxAcquiresInFlight()); + // We block for semaphores for 2 reasons here: + // A) Resource: Can't use resource like a command buffer BEFORE previous use is finished! 
[MaxFramesInFlight] + // B) Acquire: Can't have more acquires in flight than a certain threshold returned by swapchain or your surface helper class. [MaxAcquiresInFlight] + if (m_realFrameIx >= framesInFlight) + { + const ISemaphore::SWaitInfo cbDonePending[] = + { + { + .semaphore = m_semaphore.get(), + .value = m_realFrameIx + 1 - framesInFlight + } + }; + if (m_device->blockForSemaphores(cbDonePending) != ISemaphore::WAIT_RESULT::SUCCESS) + return; + } + const auto resourceIx = m_realFrameIx % MaxFramesInFlight; + + m_api->startCapture(); + + update(); + + auto queue = getGraphicsQueue(); + auto cmdbuf = m_cmdBufs[resourceIx].get(); + + if (!keepRunning()) + return; + + cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); + cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + cmdbuf->beginDebugMarker("RaytracingPipelineApp Frame"); + + const auto viewMatrix = m_camera.getViewMatrix(); + const auto projectionMatrix = m_camera.getProjectionMatrix(); + const auto viewProjectionMatrix = m_camera.getConcatenatedMatrix(); + + core::matrix3x4SIMD modelMatrix; + modelMatrix.setTranslation(nbl::core::vectorSIMDf(0, 0, 0, 0)); + modelMatrix.setRotation(quaternion(0, 0, 0)); + + core::matrix4SIMD modelViewProjectionMatrix = core::concatenateBFollowedByA(viewProjectionMatrix, modelMatrix); + if (m_cachedModelViewProjectionMatrix != modelViewProjectionMatrix) + { + m_frameAccumulationCounter = 0; + m_cachedModelViewProjectionMatrix = modelViewProjectionMatrix; + } + core::matrix4SIMD invModelViewProjectionMatrix; + modelViewProjectionMatrix.getInverseTransform(invModelViewProjectionMatrix); + + { + IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1]; + imageBarriers[0].barrier = { + .dep = { + .srcStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT, // previous frame read from framgent shader + .srcAccessMask = ACCESS_FLAGS::SHADER_READ_BITS, + .dstStageMask = PIPELINE_STAGE_FLAGS::RAY_TRACING_SHADER_BIT, + 
.dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS + } + }; + imageBarriers[0].image = m_hdrImage.get(); + imageBarriers[0].subresourceRange = { + .aspectMask = IImage::EAF_COLOR_BIT, + .baseMipLevel = 0u, + .levelCount = 1u, + .baseArrayLayer = 0u, + .layerCount = 1u + }; + imageBarriers[0].oldLayout = m_frameAccumulationCounter == 0 ? IImage::LAYOUT::UNDEFINED : IImage::LAYOUT::READ_ONLY_OPTIMAL; + imageBarriers[0].newLayout = IImage::LAYOUT::GENERAL; + cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers }); + } + + // Trace Rays Pass + { + SPushConstants pc; + pc.light = m_light; + pc.proceduralGeomInfoBuffer = m_proceduralGeomInfoBuffer->getDeviceAddress(); + pc.triangleGeomInfoBuffer = m_triangleGeomInfoBuffer->getDeviceAddress(); + pc.frameCounter = m_frameAccumulationCounter; + const core::vector3df camPos = m_camera.getPosition().getAsVector3df(); + pc.camPos = { camPos.X, camPos.Y, camPos.Z }; + memcpy(&pc.invMVP, invModelViewProjectionMatrix.pointer(), sizeof(pc.invMVP)); + + cmdbuf->bindRayTracingPipeline(m_rayTracingPipeline.get()); + cmdbuf->setRayTracingPipelineStackSize(m_rayTracingStackSize); + cmdbuf->pushConstants(m_rayTracingPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING, 0, sizeof(SPushConstants), &pc); + cmdbuf->bindDescriptorSets(EPBP_RAY_TRACING, m_rayTracingPipeline->getLayout(), 0, 1, &m_rayTracingDs.get()); + if (m_useIndirectCommand) + { + cmdbuf->traceRaysIndirect( + SBufferBinding{ + .offset = 0, + .buffer = m_indirectBuffer, + }); + } + else + { + cmdbuf->traceRays( + m_shaderBindingTable.raygenGroupRange, + m_shaderBindingTable.missGroupsRange, m_shaderBindingTable.missGroupsStride, + m_shaderBindingTable.hitGroupsRange, m_shaderBindingTable.hitGroupsStride, + m_shaderBindingTable.callableGroupsRange, m_shaderBindingTable.callableGroupsStride, + WIN_W, WIN_H, 1); + } + } + + // pipeline barrier + { + IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1]; 
+ imageBarriers[0].barrier = { + .dep = { + .srcStageMask = PIPELINE_STAGE_FLAGS::RAY_TRACING_SHADER_BIT, + .srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS, + .dstStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, + .dstAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT + } + }; + imageBarriers[0].image = m_hdrImage.get(); + imageBarriers[0].subresourceRange = { + .aspectMask = IImage::EAF_COLOR_BIT, + .baseMipLevel = 0u, + .levelCount = 1u, + .baseArrayLayer = 0u, + .layerCount = 1u + }; + imageBarriers[0].oldLayout = IImage::LAYOUT::GENERAL; + imageBarriers[0].newLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL; + + cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers }); + } + + { asset::SViewport viewport; { viewport.minDepth = 1.f; @@ -802,1071 +793,842 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, VkRect2D defaultScisors[] = { {.offset = {(int32_t)viewport.x, (int32_t)viewport.y}, .extent = {(uint32_t)viewport.width, (uint32_t)viewport.height}} }; cmdbuf->setScissor(defaultScisors); - auto scRes = static_cast(m_surface->getSwapchainResources()); - const VkRect2D currentRenderArea = - { - .offset = {0,0}, - .extent = {m_window->getWidth(),m_window->getHeight()} - }; - const IGPUCommandBuffer::SClearColorValue clearColor = { .float32 = {0.f,0.f,0.f,1.f} }; - const IGPUCommandBuffer::SRenderpassBeginInfo info = - { - .framebuffer = scRes->getFramebuffer(m_currentImageAcquire.imageIndex), - .colorClearValues = &clearColor, - .depthStencilClearValues = nullptr, - .renderArea = currentRenderArea - }; - nbl::video::ISemaphore::SWaitInfo waitInfo = { .semaphore = m_semaphore.get(), .value = m_realFrameIx + 1u }; - - cmdbuf->beginRenderPass(info, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE); - - cmdbuf->bindGraphicsPipeline(m_presentPipeline.get()); - cmdbuf->bindDescriptorSets(EPBP_GRAPHICS, m_presentPipeline->getLayout(), 0, 1u, &m_presentDs.get()); - 
ext::FullScreenTriangle::recordDrawCall(cmdbuf); - - const auto uiParams = m_ui.manager->getCreationParameters(); - auto* uiPipeline = m_ui.manager->getPipeline(); - cmdbuf->bindGraphicsPipeline(uiPipeline); - cmdbuf->bindDescriptorSets(EPBP_GRAPHICS, uiPipeline->getLayout(), uiParams.resources.texturesInfo.setIx, 1u, &m_ui.descriptorSet.get()); - m_ui.manager->render(cmdbuf, waitInfo); - - cmdbuf->endRenderPass(); - - } - - cmdbuf->endDebugMarker(); - cmdbuf->end(); - - { - const IQueue::SSubmitInfo::SSemaphoreInfo rendered[] = - { - { - .semaphore = m_semaphore.get(), - .value = ++m_realFrameIx, - .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS - } - }; - { - { - const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] = - { - {.cmdbuf = cmdbuf } - }; - - const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = - { - { - .semaphore = m_currentImageAcquire.semaphore, - .value = m_currentImageAcquire.acquireCount, - .stageMask = PIPELINE_STAGE_FLAGS::NONE - } - }; - const IQueue::SSubmitInfo infos[] = - { - { - .waitSemaphores = acquired, - .commandBuffers = commandBuffers, - .signalSemaphores = rendered - } - }; - - updateGUIDescriptorSet(); - - if (queue->submit(infos) != IQueue::RESULT::SUCCESS) - m_realFrameIx--; - } - } - - m_window->setCaption("[Nabla Engine] Ray Tracing Pipeline"); - m_surface->present(m_currentImageAcquire.imageIndex, rendered); - } - m_api->endCapture(); - m_frameAccumulationCounter++; - } - - inline void update() - { - m_camera.setMoveSpeed(m_cameraSetting.moveSpeed); - m_camera.setRotateSpeed(m_cameraSetting.rotateSpeed); - - static std::chrono::microseconds previousEventTimestamp{}; - - m_inputSystem->getDefaultMouse(&m_mouse); - m_inputSystem->getDefaultKeyboard(&m_keyboard); - - auto updatePresentationTimestamp = [&]() - { - m_currentImageAcquire = m_surface->acquireNextImage(); - - m_oracle.reportEndFrameRecord(); - const auto timestamp = m_oracle.getNextPresentationTimeStamp(); - m_oracle.reportBeginFrameRecord(); - - 
return timestamp; - }; - - const auto nextPresentationTimestamp = updatePresentationTimestamp(); - - struct - { - std::vector mouse{}; - std::vector keyboard{}; - } capturedEvents; - - m_camera.beginInputProcessing(nextPresentationTimestamp); - { - m_mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void - { - m_camera.mouseProcess(events); // don't capture the events, only let camera handle them with its impl - - for (const auto& e : events) // here capture - { - if (e.timeStamp < previousEventTimestamp) - continue; - - previousEventTimestamp = e.timeStamp; - capturedEvents.mouse.emplace_back(e); - - } - }, m_logger.get()); - - m_keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void - { - m_camera.keyboardProcess(events); // don't capture the events, only let camera handle them with its impl - - for (const auto& e : events) // here capture - { - if (e.timeStamp < previousEventTimestamp) - continue; - - previousEventTimestamp = e.timeStamp; - capturedEvents.keyboard.emplace_back(e); - } - }, m_logger.get()); - - } - m_camera.endInputProcessing(nextPresentationTimestamp); - - const core::SRange mouseEvents(capturedEvents.mouse.data(), capturedEvents.mouse.data() + capturedEvents.mouse.size()); - const core::SRange keyboardEvents(capturedEvents.keyboard.data(), capturedEvents.keyboard.data() + capturedEvents.keyboard.size()); - const auto cursorPosition = m_window->getCursorControl()->getPosition(); - const auto mousePosition = float32_t2(cursorPosition.x, cursorPosition.y) - float32_t2(m_window->getX(), m_window->getY()); - - const ext::imgui::UI::SUpdateParameters params = - { - .mousePosition = mousePosition, - .displaySize = { m_window->getWidth(), m_window->getHeight() }, - .mouseEvents = mouseEvents, - .keyboardEvents = keyboardEvents - }; - - m_ui.manager->update(params); - } - - inline bool keepRunning() override - { - if (m_surface->irrecoverable()) - return false; - - return true; - } - - inline bool 
onAppTerminated() override - { - return device_base_t::onAppTerminated(); - } + auto scRes = static_cast(m_surface->getSwapchainResources()); + const VkRect2D currentRenderArea = + { + .offset = {0,0}, + .extent = {m_window->getWidth(),m_window->getHeight()} + }; + const IGPUCommandBuffer::SClearColorValue clearColor = { .float32 = {0.f,0.f,0.f,1.f} }; + const IGPUCommandBuffer::SRenderpassBeginInfo info = + { + .framebuffer = scRes->getFramebuffer(m_currentImageAcquire.imageIndex), + .colorClearValues = &clearColor, + .depthStencilClearValues = nullptr, + .renderArea = currentRenderArea + }; + nbl::video::ISemaphore::SWaitInfo waitInfo = { .semaphore = m_semaphore.get(), .value = m_realFrameIx + 1u }; + + cmdbuf->beginRenderPass(info, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE); + + cmdbuf->bindGraphicsPipeline(m_presentPipeline.get()); + cmdbuf->bindDescriptorSets(EPBP_GRAPHICS, m_presentPipeline->getLayout(), 0, 1u, &m_presentDs.get()); + ext::FullScreenTriangle::recordDrawCall(cmdbuf); + + const auto uiParams = m_ui.manager->getCreationParameters(); + auto* uiPipeline = m_ui.manager->getPipeline(); + cmdbuf->bindGraphicsPipeline(uiPipeline); + cmdbuf->bindDescriptorSets(EPBP_GRAPHICS, uiPipeline->getLayout(), uiParams.resources.texturesInfo.setIx, 1u, &m_ui.descriptorSet.get()); + m_ui.manager->render(cmdbuf, waitInfo); + + cmdbuf->endRenderPass(); + + } + + cmdbuf->endDebugMarker(); + cmdbuf->end(); + + { + const IQueue::SSubmitInfo::SSemaphoreInfo rendered[] = + { + { + .semaphore = m_semaphore.get(), + .value = ++m_realFrameIx, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS + } + }; + { + { + const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] = + { + {.cmdbuf = cmdbuf } + }; + + const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = + { + { + .semaphore = m_currentImageAcquire.semaphore, + .value = m_currentImageAcquire.acquireCount, + .stageMask = PIPELINE_STAGE_FLAGS::NONE + } + }; + const IQueue::SSubmitInfo infos[] = + { + { + 
.waitSemaphores = acquired, + .commandBuffers = commandBuffers, + .signalSemaphores = rendered + } + }; + + updateGUIDescriptorSet(); + + if (queue->submit(infos) != IQueue::RESULT::SUCCESS) + m_realFrameIx--; + } + } + + m_window->setCaption("[Nabla Engine] Ray Tracing Pipeline"); + m_surface->present(m_currentImageAcquire.imageIndex, rendered); + } + m_api->endCapture(); + m_frameAccumulationCounter++; + } + + inline void update() + { + m_camera.setMoveSpeed(m_cameraSetting.moveSpeed); + m_camera.setRotateSpeed(m_cameraSetting.rotateSpeed); + + static std::chrono::microseconds previousEventTimestamp{}; + + m_inputSystem->getDefaultMouse(&m_mouse); + m_inputSystem->getDefaultKeyboard(&m_keyboard); + + auto updatePresentationTimestamp = [&]() + { + m_currentImageAcquire = m_surface->acquireNextImage(); + + m_oracle.reportEndFrameRecord(); + const auto timestamp = m_oracle.getNextPresentationTimeStamp(); + m_oracle.reportBeginFrameRecord(); + + return timestamp; + }; + + const auto nextPresentationTimestamp = updatePresentationTimestamp(); + + struct + { + std::vector mouse{}; + std::vector keyboard{}; + } capturedEvents; + + m_camera.beginInputProcessing(nextPresentationTimestamp); + { + const auto& io = ImGui::GetIO(); + m_mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void + { + if (!io.WantCaptureMouse) + m_camera.mouseProcess(events); // don't capture the events, only let camera handle them with its impl + + for (const auto& e : events) // here capture + { + if (e.timeStamp < previousEventTimestamp) + continue; + + previousEventTimestamp = e.timeStamp; + capturedEvents.mouse.emplace_back(e); + + } + }, m_logger.get()); + + m_keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void + { + if (!io.WantCaptureKeyboard) + m_camera.keyboardProcess(events); // don't capture the events, only let camera handle them with its impl + + for (const auto& e : events) // here capture + { + if (e.timeStamp < previousEventTimestamp) 
+ continue; + + previousEventTimestamp = e.timeStamp; + capturedEvents.keyboard.emplace_back(e); + } + }, m_logger.get()); + + } + m_camera.endInputProcessing(nextPresentationTimestamp); + + const core::SRange mouseEvents(capturedEvents.mouse.data(), capturedEvents.mouse.data() + capturedEvents.mouse.size()); + const core::SRange keyboardEvents(capturedEvents.keyboard.data(), capturedEvents.keyboard.data() + capturedEvents.keyboard.size()); + const auto cursorPosition = m_window->getCursorControl()->getPosition(); + const auto mousePosition = float32_t2(cursorPosition.x, cursorPosition.y) - float32_t2(m_window->getX(), m_window->getY()); + + const ext::imgui::UI::SUpdateParameters params = + { + .mousePosition = mousePosition, + .displaySize = { m_window->getWidth(), m_window->getHeight() }, + .mouseEvents = mouseEvents, + .keyboardEvents = keyboardEvents + }; + + m_ui.manager->update(params); + } + + inline bool keepRunning() override + { + if (m_surface->irrecoverable()) + return false; + + return true; + } + + inline bool onAppTerminated() override + { + return device_base_t::onAppTerminated(); + } private: - uint32_t getWorkgroupCount(uint32_t dim, uint32_t size) - { - return (dim + size - 1) / size; - } - - smart_refctd_ptr createBuffer(IGPUBuffer::SCreationParams& params) - { - smart_refctd_ptr buffer; - buffer = m_device->createBuffer(std::move(params)); - auto bufReqs = buffer->getMemoryReqs(); - bufReqs.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits(); - m_device->allocate(bufReqs, buffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); - - return buffer; - } - - smart_refctd_ptr getSingleUseCommandBufferAndBegin(smart_refctd_ptr pool) - { - smart_refctd_ptr cmdbuf; - if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdbuf)) - return nullptr; - - cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); - cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - - return cmdbuf; - } - 
- void cmdbufSubmitAndWait(smart_refctd_ptr cmdbuf, CThreadSafeQueueAdapter* queue, uint64_t startValue) - { - cmdbuf->end(); - - uint64_t finishedValue = startValue + 1; - - // submit builds - { - auto completed = m_device->createSemaphore(startValue); - - std::array signals; - { - auto& signal = signals.front(); - signal.value = finishedValue; - signal.stageMask = bitflag(PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS); - signal.semaphore = completed.get(); - } - - const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[1] = { { - .cmdbuf = cmdbuf.get() - } }; - - const IQueue::SSubmitInfo infos[] = - { - { - .waitSemaphores = {}, - .commandBuffers = commandBuffers, - .signalSemaphores = signals - } - }; - - if (queue->submit(infos) != IQueue::RESULT::SUCCESS) - { - m_logger->log("Failed to submit geometry transfer upload operations!", ILogger::ELL_ERROR); - return; - } - - const ISemaphore::SWaitInfo info[] = - { { - .semaphore = completed.get(), - .value = finishedValue - } }; - - m_device->blockForSemaphores(info); - } - } - - bool createIndirectBuffer(video::CThreadSafeQueueAdapter* queue) - { - const auto getBufferRangeAddress = [](const SBufferRange& range) - { - return range.buffer->getDeviceAddress() + range.offset; - }; - const auto command = TraceRaysIndirectCommand_t{ - .raygenShaderRecordAddress = getBufferRangeAddress(m_shaderBindingTable.raygenGroupRange), - .raygenShaderRecordSize = m_shaderBindingTable.raygenGroupRange.size, - .missShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.missGroupsRange), - .missShaderBindingTableSize = m_shaderBindingTable.missGroupsRange.size, - .missShaderBindingTableStride = m_shaderBindingTable.missGroupsStride, - .hitShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.hitGroupsRange), - .hitShaderBindingTableSize = m_shaderBindingTable.hitGroupsRange.size, - .hitShaderBindingTableStride = m_shaderBindingTable.hitGroupsStride, - .callableShaderBindingTableAddress = 
getBufferRangeAddress(m_shaderBindingTable.callableGroupsRange), - .callableShaderBindingTableSize = m_shaderBindingTable.callableGroupsRange.size, - .callableShaderBindingTableStride = m_shaderBindingTable.callableGroupsStride, - .width = WIN_W, - .height = WIN_H, - .depth = 1, - }; - IGPUBuffer::SCreationParams params; - params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INDIRECT_BUFFER_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - params.size = sizeof(TraceRaysIndirectCommand_t); - m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), &command).move_into(m_indirectBuffer); - return true; - } - - bool createGeometries(video::CThreadSafeQueueAdapter* queue, const IGeometryCreator* gc) - { - auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - if (!pool) - return logFail("Couldn't create Command Pool for geometry creation!"); - - const auto defaultMaterial = Material{ - .ambient = {0.2, 0.1, 0.1}, - .diffuse = {0.8, 0.3, 0.3}, - .specular = {0.8, 0.8, 0.8}, - .shininess = 1.0f, - .alpha = 1.0f, - }; - - auto getTranslationMatrix = [](float32_t x, float32_t y, float32_t z) - { - core::matrix3x4SIMD transform; - transform.setTranslation(nbl::core::vectorSIMDf(x, y, z, 0)); - return transform; - }; - - core::matrix3x4SIMD planeTransform; - planeTransform.setRotation(quaternion::fromAngleAxis(core::radians(-90.0f), vector3df_SIMD{ 1, 0, 0 })); - - const auto cpuObjects = std::array{ - ReferenceObjectCpu { - .meta = {.type = OT_RECTANGLE, .name = "Plane Mesh"}, - .data = gc->createRectangleMesh(nbl::core::vector2df_SIMD(10, 10)), - .material = defaultMaterial, - .transform = planeTransform, - }, - ReferenceObjectCpu { - .meta = {.type = OT_CUBE, .name = "Cube Mesh"}, - .data = gc->createCubeMesh(nbl::core::vector3df(1, 1, 1)), - .material = defaultMaterial, - .transform = getTranslationMatrix(0, 0.5f, 0), - }, - 
ReferenceObjectCpu { - .meta = {.type = OT_CUBE, .name = "Cube Mesh 2"}, - .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)), - .material = Material{ - .ambient = {0.1, 0.1, 0.2}, - .diffuse = {0.2, 0.2, 0.8}, - .specular = {0.8, 0.8, 0.8}, - .shininess = 1.0f, - }, - .transform = getTranslationMatrix(-5.0f, 1.0f, 0), - }, - ReferenceObjectCpu { - .meta = {.type = OT_CUBE, .name = "Transparent Cube Mesh"}, - .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)), - .material = Material{ - .ambient = {0.1, 0.2, 0.1}, - .diffuse = {0.2, 0.8, 0.2}, - .specular = {0.8, 0.8, 0.8}, - .shininess = 1.0f, - .alpha = 0.2, - }, - .transform = getTranslationMatrix(5.0f, 1.0f, 0), - }, - }; - - struct ScratchVIBindings - { - nbl::asset::SBufferBinding vertex, index; - }; - std::array scratchBuffers; - - for (uint32_t i = 0; i < cpuObjects.size(); i++) - { - const auto& cpuObject = cpuObjects[i]; - - auto vBuffer = smart_refctd_ptr(cpuObject.data.bindings[0].buffer); // no offset - auto vUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | - IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - vBuffer->addUsageFlags(vUsage); - vBuffer->setContentHash(vBuffer->computeContentHash()); - - auto iBuffer = smart_refctd_ptr(cpuObject.data.indexBuffer.buffer); // no offset - auto iUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | - IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - - if (cpuObject.data.indexType != EIT_UNKNOWN) - if (iBuffer) - { - iBuffer->addUsageFlags(iUsage); - iBuffer->setContentHash(iBuffer->computeContentHash()); - } - - scratchBuffers[i] = { - .vertex = {.offset = 0, .buffer = vBuffer}, - .index = {.offset = 0, .buffer = iBuffer}, - }; - - } - - auto cmdbuf = 
getSingleUseCommandBufferAndBegin(pool); - cmdbuf->beginDebugMarker("Build geometry vertex and index buffers"); - - CAssetConverter::SInputs inputs = {}; - inputs.logger = m_logger.get(); - std::array tmpBuffers; - { - for (uint32_t i = 0; i < cpuObjects.size(); i++) - { - tmpBuffers[2 * i + 0] = scratchBuffers[i].vertex.buffer.get(); - tmpBuffers[2 * i + 1] = scratchBuffers[i].index.buffer.get(); - } - - std::get>(inputs.assets) = tmpBuffers; - } - - auto reservation = m_converter->reserve(inputs); - { - auto prepass = [&](const auto & references) -> bool - { - auto objects = reservation.getGPUObjects(); - uint32_t counter = {}; - for (auto& object : objects) - { - auto gpu = object.value; - auto* reference = references[counter]; - - if (reference) - { - if (!gpu) - { - m_logger->log("Failed to convert a CPU object to GPU!", ILogger::ELL_ERROR); - return false; - } - } - counter++; - } - return true; - }; - - prepass.template operator() < ICPUBuffer > (tmpBuffers); - } - - auto geomInfoBuffer = ICPUBuffer::create({ std::size(cpuObjects) * sizeof(STriangleGeomInfo) }); - STriangleGeomInfo* geomInfos = reinterpret_cast(geomInfoBuffer->getPointer()); - - m_gpuTriangleGeometries.reserve(std::size(cpuObjects)); - // convert - { - // not sure if need this (probably not, originally for transition img view) - auto semaphore = m_device->createSemaphore(0u); - - std::array cmdbufs = {}; - cmdbufs.front().cmdbuf = cmdbuf.get(); - - SIntendedSubmitInfo transfer = {}; - transfer.queue = queue; - transfer.scratchCommandBuffers = cmdbufs; - transfer.scratchSemaphore = { - .semaphore = semaphore.get(), - .value = 0u, - .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS - }; - - CAssetConverter::SConvertParams params = {}; - params.utilities = m_utils.get(); - params.transfer = &transfer; - - auto future = reservation.convert(params); - if (future.copy() != IQueue::RESULT::SUCCESS) - { - m_logger->log("Failed to await submission feature!", ILogger::ELL_ERROR); - return false; - 
} - - auto&& buffers = reservation.getGPUObjects(); - for (uint32_t i = 0; i < cpuObjects.size(); i++) - { - auto& cpuObject = cpuObjects[i]; - - m_gpuTriangleGeometries.push_back(ReferenceObjectGpu{ - .meta = cpuObject.meta, - .bindings = { - .vertex = {.offset = 0, .buffer = buffers[2 * i + 0].value }, - .index = {.offset = 0, .buffer = buffers[2 * i + 1].value }, - }, - .vertexStride = cpuObject.data.inputParams.bindings[0].stride, - .indexType = cpuObject.data.indexType, - .indexCount = cpuObject.data.indexCount, - .material = hlsl::_static_cast(cpuObject.material), - .transform = cpuObject.transform, - }); - } - - for (uint32_t i = 0; i < m_gpuTriangleGeometries.size(); i++) - { - const auto& gpuObject = m_gpuTriangleGeometries[i]; - const uint64_t vertexBufferAddress = gpuObject.bindings.vertex.buffer->getDeviceAddress(); - geomInfos[i] = { - .material = gpuObject.material, - .vertexBufferAddress = vertexBufferAddress, - .indexBufferAddress = gpuObject.useIndex() ? gpuObject.bindings.index.buffer->getDeviceAddress() : vertexBufferAddress, - .vertexStride = gpuObject.vertexStride, - .objType = gpuObject.meta.type, - .indexType = gpuObject.indexType, - .smoothNormals = s_smoothNormals[gpuObject.meta.type], - }; - } - } - - { - IGPUBuffer::SCreationParams params; - params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - params.size = geomInfoBuffer->getSize(); - m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), geomInfos).move_into(m_triangleGeomInfoBuffer); - } - - // intersection geometries setup - { - core::vector proceduralGeoms; - proceduralGeoms.reserve(NumberOfProceduralGeometries); - using Aabb = IGPUBottomLevelAccelerationStructure::AABB_t; - core::vector aabbs; - aabbs.reserve(NumberOfProceduralGeometries); - for (int32_t i = 0; i < NumberOfProceduralGeometries; i++) - { - const 
auto middle_i = NumberOfProceduralGeometries / 2.0; - SProceduralGeomInfo sphere = { - .material = hlsl::_static_cast(Material{ - .ambient = {0.1, 0.05 * i, 0.1}, - .diffuse = {0.3, 0.2 * i, 0.3}, - .specular = {0.8, 0.8, 0.8}, - .shininess = 1.0f, - }), - .center = float32_t3((i - middle_i) * 4.0, 2, 5.0), - .radius = 1, - }; - - proceduralGeoms.push_back(sphere); - const auto sphereMin = sphere.center - sphere.radius; - const auto sphereMax = sphere.center + sphere.radius; - aabbs.emplace_back( - vector3d(sphereMin.x, sphereMin.y, sphereMin.z), - vector3d(sphereMax.x, sphereMax.y, sphereMax.z)); - } - - { - IGPUBuffer::SCreationParams params; - params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - params.size = proceduralGeoms.size() * sizeof(SProceduralGeomInfo); - m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), proceduralGeoms.data()).move_into(m_proceduralGeomInfoBuffer); - } - - { - IGPUBuffer::SCreationParams params; - params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT; - params.size = aabbs.size() * sizeof(Aabb); - m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), aabbs.data()).move_into(m_proceduralAabbBuffer); - } - } - - return true; - } - - void calculateRayTracingStackSize(const smart_refctd_ptr& pipeline) - { - const auto raygenStackSize = pipeline->getRaygenStackSize(); - auto getMaxSize = [&](auto ranges, auto valProj) -> uint16_t - { - auto maxValue = 0; - for (const auto& val : ranges) - { - maxValue = std::max(maxValue, std::invoke(valProj, val)); - } - return maxValue; - }; - - const auto closestHitStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::closestHit); - 
const auto anyHitStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::anyHit); - const auto intersectionStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::intersection); - const auto missStackMax = getMaxSize(pipeline->getMissStackSizes(), std::identity{}); - const auto callableStackMax = getMaxSize(pipeline->getCallableStackSizes(), std::identity{}); - auto firstDepthStackSizeMax = std::max(closestHitStackMax, missStackMax); - firstDepthStackSizeMax = std::max(firstDepthStackSizeMax, intersectionStackMax + anyHitStackMax); - m_rayTracingStackSize = raygenStackSize + std::max(firstDepthStackSizeMax, callableStackMax); - } - - bool createShaderBindingTable(video::CThreadSafeQueueAdapter* queue, const smart_refctd_ptr& pipeline) - { - const auto& limits = m_device->getPhysicalDevice()->getLimits(); - const auto handleSize = SPhysicalDeviceLimits::ShaderGroupHandleSize; - const auto handleSizeAligned = nbl::core::alignUp(handleSize, limits.shaderGroupHandleAlignment); - - auto& raygenRange = m_shaderBindingTable.raygenGroupRange; - - auto& hitRange = m_shaderBindingTable.hitGroupsRange; - const auto hitHandles = pipeline->getHitHandles(); - - auto& missRange = m_shaderBindingTable.missGroupsRange; - const auto missHandles = pipeline->getMissHandles(); - - auto& callableRange = m_shaderBindingTable.callableGroupsRange; - const auto callableHandles = pipeline->getCallableHandles(); - - raygenRange = { - .offset = 0, - .size = core::alignUp(handleSizeAligned, limits.shaderGroupBaseAlignment) - }; - - missRange = { - .offset = raygenRange.size, - .size = core::alignUp(missHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment), - }; - m_shaderBindingTable.missGroupsStride = handleSizeAligned; - - hitRange = { - .offset = missRange.offset + missRange.size, - .size = core::alignUp(hitHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment), - }; - 
m_shaderBindingTable.hitGroupsStride = handleSizeAligned; - - callableRange = { - .offset = hitRange.offset + hitRange.size, - .size = core::alignUp(callableHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment), - }; - m_shaderBindingTable.callableGroupsStride = handleSizeAligned; - - const auto bufferSize = raygenRange.size + missRange.size + hitRange.size + callableRange.size; - - ICPUBuffer::SCreationParams cpuBufferParams; - cpuBufferParams.size = bufferSize; - auto cpuBuffer = ICPUBuffer::create(std::move(cpuBufferParams)); - uint8_t* pData = reinterpret_cast(cpuBuffer->getPointer()); - - // copy raygen region - memcpy(pData, &pipeline->getRaygen(), handleSize); - - // copy miss region - uint8_t* pMissData = pData + missRange.offset; - for (const auto& handle : missHandles) - { - memcpy(pMissData, &handle, handleSize); - pMissData += m_shaderBindingTable.missGroupsStride; - } - - // copy hit region - uint8_t* pHitData = pData + hitRange.offset; - for (const auto& handle : hitHandles) - { - memcpy(pHitData, &handle, handleSize); - pHitData += m_shaderBindingTable.hitGroupsStride; - } - - // copy callable region - uint8_t* pCallableData = pData + callableRange.offset; - for (const auto& handle : callableHandles) - { - memcpy(pCallableData, &handle, handleSize); - pCallableData += m_shaderBindingTable.callableGroupsStride; - } - - { - IGPUBuffer::SCreationParams params; - params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_SHADER_BINDING_TABLE_BIT; - params.size = bufferSize; - m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), pData).move_into(raygenRange.buffer); - missRange.buffer = core::smart_refctd_ptr(raygenRange.buffer); - hitRange.buffer = core::smart_refctd_ptr(raygenRange.buffer); - callableRange.buffer = core::smart_refctd_ptr(raygenRange.buffer); - } - - return true; - } - - bool 
createAccelerationStructures(video::CThreadSafeQueueAdapter* queue) - { - // plus 1 blas for procedural geometry contains {{var::NumberOfProcedural}} - // spheres. Each sphere is a primitive instead one instance or geometry - const auto blasCount = m_gpuTriangleGeometries.size() + 1; - const auto proceduralBlasIdx = m_gpuTriangleGeometries.size(); - - IQueryPool::SCreationParams qParams{ .queryCount = static_cast(blasCount), .queryType = IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE }; - smart_refctd_ptr queryPool = m_device->createQueryPool(std::move(qParams)); - - auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT); - if (!pool) - return logFail("Couldn't create Command Pool for blas/tlas creation!"); - - m_api->startCapture(); -#ifdef TRY_BUILD_FOR_NGFX // NSight is "debugger-challenged" it can't capture anything not happenning "during a frame", so we need to trick it - m_currentImageAcquire = m_surface->acquireNextImage(); - { - const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { { - .semaphore = m_currentImageAcquire.semaphore, - .value = m_currentImageAcquire.acquireCount, - .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS - } }; - m_surface->present(m_currentImageAcquire.imageIndex, acquired); - } - m_currentImageAcquire = m_surface->acquireNextImage(); -#endif - size_t totalScratchSize = 0; - const auto scratchOffsetAlignment = m_device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment; - - // build bottom level ASes - { - core::vector primitiveCounts(blasCount); - core::vector> triangles(m_gpuTriangleGeometries.size()); - core::vector scratchSizes(blasCount); - IGPUBottomLevelAccelerationStructure::AABBs aabbs; - - auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT; - if 
(m_physicalDevice->getProperties().limits.rayTracingPositionFetch) - blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_DATA_ACCESS_KHR; - - IGPUBottomLevelAccelerationStructure::DeviceBuildInfo initBuildInfo; - initBuildInfo.buildFlags = blasFlags; - initBuildInfo.geometryCount = 1; // only 1 geometry object per blas - initBuildInfo.srcAS = nullptr; - initBuildInfo.dstAS = nullptr; - initBuildInfo.scratch = {}; - - auto blasBuildInfos = core::vector(blasCount, initBuildInfo); - - m_gpuBlasList.resize(blasCount); - // setup blas info for triangle geometries - for (uint32_t i = 0; i < blasCount; i++) - { - const auto isProcedural = i == proceduralBlasIdx; - if (isProcedural) - { - aabbs.data.buffer = smart_refctd_ptr(m_proceduralAabbBuffer); - aabbs.data.offset = 0; - aabbs.stride = sizeof(IGPUBottomLevelAccelerationStructure::AABB_t); - aabbs.geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; // only allow opaque for now - - primitiveCounts[proceduralBlasIdx] = NumberOfProceduralGeometries; - blasBuildInfos[proceduralBlasIdx].aabbs = &aabbs; - blasBuildInfos[proceduralBlasIdx].buildFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT; - } else - { - const auto& gpuObject = m_gpuTriangleGeometries[i]; - - const uint32_t vertexStride = gpuObject.vertexStride; - const uint32_t numVertices = gpuObject.bindings.vertex.buffer->getSize() / vertexStride; - if (gpuObject.useIndex()) - primitiveCounts[i] = gpuObject.indexCount / 3; - else - primitiveCounts[i] = numVertices / 3; - - triangles[i].vertexData[0] = gpuObject.bindings.vertex; - triangles[i].indexData = gpuObject.useIndex() ? gpuObject.bindings.index : gpuObject.bindings.vertex; - triangles[i].maxVertex = numVertices - 1; - triangles[i].vertexStride = vertexStride; - triangles[i].vertexFormat = EF_R32G32B32_SFLOAT; - triangles[i].indexType = gpuObject.indexType; - triangles[i].geometryFlags = gpuObject.material.isTransparent() ? 
- IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::NO_DUPLICATE_ANY_HIT_INVOCATION_BIT : - IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; - - blasBuildInfos[i].triangles = &triangles[i]; - } - ILogicalDevice::AccelerationStructureBuildSizes buildSizes; - { - const uint32_t maxPrimCount[1] = { primitiveCounts[i] }; - if (isProcedural) - { - buildSizes = m_device->getAccelerationStructureBuildSizes(blasBuildInfos[i].buildFlags, false, std::span{&aabbs, 1}, maxPrimCount); - } else - { - buildSizes = m_device->getAccelerationStructureBuildSizes(blasBuildInfos[i].buildFlags, false, std::span{&triangles[i], 1}, maxPrimCount); - } - if (!buildSizes) - return logFail("Failed to get BLAS build sizes"); - } - - scratchSizes[i] = buildSizes.buildScratchSize; - totalScratchSize = core::alignUp(totalScratchSize, scratchOffsetAlignment); - totalScratchSize += buildSizes.buildScratchSize; - - { - IGPUBuffer::SCreationParams params; - params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT; - params.size = buildSizes.accelerationStructureSize; - smart_refctd_ptr asBuffer = createBuffer(params); - - IGPUBottomLevelAccelerationStructure::SCreationParams blasParams; - blasParams.bufferRange.buffer = asBuffer; - blasParams.bufferRange.offset = 0u; - blasParams.bufferRange.size = buildSizes.accelerationStructureSize; - blasParams.flags = IGPUBottomLevelAccelerationStructure::SCreationParams::FLAGS::NONE; - m_gpuBlasList[i] = m_device->createBottomLevelAccelerationStructure(std::move(blasParams)); - if (!m_gpuBlasList[i]) - return logFail("Could not create BLAS"); - } - } - - - auto cmdbufBlas = getSingleUseCommandBufferAndBegin(pool); - cmdbufBlas->beginDebugMarker("Build BLAS"); - - cmdbufBlas->resetQueryPool(queryPool.get(), 0, blasCount); - - smart_refctd_ptr scratchBuffer; - { - IGPUBuffer::SCreationParams params; - params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | 
IGPUBuffer::EUF_STORAGE_BUFFER_BIT; - params.size = totalScratchSize; - scratchBuffer = createBuffer(params); - } - - core::vector buildRangeInfos(blasCount); - core::vector pRangeInfos(blasCount); - for (uint32_t i = 0; i < blasCount; i++) - { - blasBuildInfos[i].dstAS = m_gpuBlasList[i].get(); - blasBuildInfos[i].scratch.buffer = scratchBuffer; - if (i == 0) - { - blasBuildInfos[i].scratch.offset = 0u; - } else - { - const auto unalignedOffset = blasBuildInfos[i - 1].scratch.offset + scratchSizes[i - 1]; - blasBuildInfos[i].scratch.offset = core::alignUp(unalignedOffset, scratchOffsetAlignment); - } - - buildRangeInfos[i].primitiveCount = primitiveCounts[i]; - buildRangeInfos[i].primitiveByteOffset = 0u; - buildRangeInfos[i].firstVertex = 0u; - buildRangeInfos[i].transformByteOffset = 0u; - - pRangeInfos[i] = &buildRangeInfos[i]; - } - - if (!cmdbufBlas->buildAccelerationStructures(std::span(blasBuildInfos), pRangeInfos.data())) - return logFail("Failed to build BLAS"); - - { - SMemoryBarrier memBarrier; - memBarrier.srcStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT; - memBarrier.srcAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT; - memBarrier.dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT; - memBarrier.dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_READ_BIT; - cmdbufBlas->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} }); - } - - - core::vector ases(blasCount); - for (uint32_t i = 0; i < blasCount; i++) - ases[i] = m_gpuBlasList[i].get(); - if (!cmdbufBlas->writeAccelerationStructureProperties(std::span(ases), IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE, - queryPool.get(), 0)) - return logFail("Failed to write acceleration structure properties!"); - - cmdbufBlas->endDebugMarker(); - cmdbufSubmitAndWait(cmdbufBlas, queue, 39); - } - - auto cmdbufCompact = getSingleUseCommandBufferAndBegin(pool); - cmdbufCompact->beginDebugMarker("Compact BLAS"); - - // compact blas 
- { - core::vector asSizes(blasCount); - if (!m_device->getQueryPoolResults(queryPool.get(), 0, blasCount, asSizes.data(), sizeof(size_t), bitflag(IQueryPool::WAIT_BIT) | IQueryPool::_64_BIT)) - return logFail("Could not get query pool results for AS sizes"); - - core::vector> cleanupBlas(blasCount); - for (uint32_t i = 0; i < blasCount; i++) - { - if (asSizes[i] == 0) continue; - cleanupBlas[i] = m_gpuBlasList[i]; - { - IGPUBuffer::SCreationParams params; - params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT; - params.size = asSizes[i]; - smart_refctd_ptr asBuffer = createBuffer(params); - - IGPUBottomLevelAccelerationStructure::SCreationParams blasParams; - blasParams.bufferRange.buffer = asBuffer; - blasParams.bufferRange.offset = 0u; - blasParams.bufferRange.size = asSizes[i]; - blasParams.flags = IGPUBottomLevelAccelerationStructure::SCreationParams::FLAGS::NONE; - m_gpuBlasList[i] = m_device->createBottomLevelAccelerationStructure(std::move(blasParams)); - if (!m_gpuBlasList[i]) - return logFail("Could not create compacted BLAS"); - } - - IGPUBottomLevelAccelerationStructure::CopyInfo copyInfo; - copyInfo.src = cleanupBlas[i].get(); - copyInfo.dst = m_gpuBlasList[i].get(); - copyInfo.mode = IGPUBottomLevelAccelerationStructure::COPY_MODE::COMPACT; - if (!cmdbufCompact->copyAccelerationStructure(copyInfo)) - return logFail("Failed to copy AS to compact"); - } - } - - cmdbufCompact->endDebugMarker(); - cmdbufSubmitAndWait(cmdbufCompact, queue, 40); - - auto cmdbufTlas = getSingleUseCommandBufferAndBegin(pool); - cmdbufTlas->beginDebugMarker("Build TLAS"); - - // build top level AS - { - const uint32_t instancesCount = blasCount; - core::vector instances(instancesCount); - for (uint32_t i = 0; i < instancesCount; i++) - { - const auto isProceduralInstance = i == proceduralBlasIdx; - instances[i].base.blas.deviceAddress = m_gpuBlasList[i]->getReferenceForDeviceOperations().deviceAddress; - 
instances[i].base.mask = 0xFF; - instances[i].base.instanceCustomIndex = i; - instances[i].base.instanceShaderBindingTableRecordOffset = isProceduralInstance ? 2 : 0; - instances[i].base.flags = static_cast(IGPUTopLevelAccelerationStructure::INSTANCE_FLAGS::TRIANGLE_FACING_CULL_DISABLE_BIT); - instances[i].transform = isProceduralInstance ? matrix3x4SIMD() : m_gpuTriangleGeometries[i].transform; - } - - { - size_t bufSize = instancesCount * sizeof(IGPUTopLevelAccelerationStructure::DeviceStaticInstance); - IGPUBuffer::SCreationParams params; - params.usage = bitflag(IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT | - IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - params.size = bufSize; - m_instanceBuffer = createBuffer(params); - - SBufferRange range = { .offset = 0u, .size = bufSize, .buffer = m_instanceBuffer }; - cmdbufTlas->updateBuffer(range, instances.data()); - } - - // make sure instances upload complete first - { - SMemoryBarrier memBarrier; - memBarrier.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS; - memBarrier.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT; - memBarrier.dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT; - memBarrier.dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT; - cmdbufTlas->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} }); - } - - auto tlasFlags = bitflag(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT); - - IGPUTopLevelAccelerationStructure::DeviceBuildInfo tlasBuildInfo; - tlasBuildInfo.buildFlags = tlasFlags; - tlasBuildInfo.srcAS = nullptr; - tlasBuildInfo.dstAS = nullptr; - tlasBuildInfo.instanceData.buffer = m_instanceBuffer; - tlasBuildInfo.instanceData.offset = 0u; - tlasBuildInfo.scratch = {}; - - auto buildSizes = m_device->getAccelerationStructureBuildSizes(tlasFlags, false, instancesCount); - 
if (!buildSizes) - return logFail("Failed to get TLAS build sizes"); - - { - IGPUBuffer::SCreationParams params; - params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT; - params.size = buildSizes.accelerationStructureSize; - smart_refctd_ptr asBuffer = createBuffer(params); - - IGPUTopLevelAccelerationStructure::SCreationParams tlasParams; - tlasParams.bufferRange.buffer = asBuffer; - tlasParams.bufferRange.offset = 0u; - tlasParams.bufferRange.size = buildSizes.accelerationStructureSize; - tlasParams.flags = IGPUTopLevelAccelerationStructure::SCreationParams::FLAGS::NONE; - m_gpuTlas = m_device->createTopLevelAccelerationStructure(std::move(tlasParams)); - if (!m_gpuTlas) - return logFail("Could not create TLAS"); - } - - smart_refctd_ptr scratchBuffer; - { - IGPUBuffer::SCreationParams params; - params.usage = bitflag(IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | IGPUBuffer::EUF_STORAGE_BUFFER_BIT; - params.size = buildSizes.buildScratchSize; - scratchBuffer = createBuffer(params); - } - - tlasBuildInfo.dstAS = m_gpuTlas.get(); - tlasBuildInfo.scratch.buffer = scratchBuffer; - tlasBuildInfo.scratch.offset = 0u; - - IGPUTopLevelAccelerationStructure::BuildRangeInfo buildRangeInfo[1u]; - buildRangeInfo[0].instanceCount = instancesCount; - buildRangeInfo[0].instanceByteOffset = 0u; - IGPUTopLevelAccelerationStructure::BuildRangeInfo* pRangeInfos; - pRangeInfos = &buildRangeInfo[0]; - - if (!cmdbufTlas->buildAccelerationStructures({ &tlasBuildInfo, 1 }, pRangeInfos)) - return logFail("Failed to build TLAS"); - } - - cmdbufTlas->endDebugMarker(); - cmdbufSubmitAndWait(cmdbufTlas, queue, 45); - -#ifdef TRY_BUILD_FOR_NGFX - { - const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { { - .semaphore = m_currentImageAcquire.semaphore, - .value = m_currentImageAcquire.acquireCount, - .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS - } }; - m_surface->present(m_currentImageAcquire.imageIndex, acquired); - 
} -#endif - m_api->endCapture(); - - return true; - } - - - smart_refctd_ptr m_window; - smart_refctd_ptr> m_surface; - smart_refctd_ptr m_semaphore; - uint64_t m_realFrameIx = 0; - uint32_t m_frameAccumulationCounter = 0; - std::array, MaxFramesInFlight> m_cmdBufs; - ISimpleManagedSurface::SAcquireResult m_currentImageAcquire = {}; - - core::smart_refctd_ptr m_inputSystem; - InputSystem::ChannelReader m_mouse; - InputSystem::ChannelReader m_keyboard; - - struct CameraSetting - { - float fov = 60.f; - float zNear = 0.1f; - float zFar = 10000.f; - float moveSpeed = 1.f; - float rotateSpeed = 1.f; - float viewWidth = 10.f; - float camYAngle = 165.f / 180.f * 3.14159f; - float camXAngle = 32.f / 180.f * 3.14159f; - - } m_cameraSetting; - Camera m_camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD()); - - Light m_light = { - .direction = {-1.0f, -1.0f, -0.4f}, - .position = {10.0f, 15.0f, 8.0f}, - .outerCutoff = 0.866025404f, // {cos(radians(30.0f))}, - .type = ELT_DIRECTIONAL - }; - - video::CDumbPresentationOracle m_oracle; - - struct C_UI - { - nbl::core::smart_refctd_ptr manager; - - struct - { - core::smart_refctd_ptr gui, scene; - } samplers; - - core::smart_refctd_ptr descriptorSet; - } m_ui; - core::smart_refctd_ptr m_guiDescriptorSetPool; - - core::vector m_gpuTriangleGeometries; - core::vector m_gpuIntersectionSpheres; - uint32_t m_intersectionHitGroupIdx; - - std::vector> m_gpuBlasList; - smart_refctd_ptr m_gpuTlas; - smart_refctd_ptr m_instanceBuffer; - - smart_refctd_ptr m_triangleGeomInfoBuffer; - smart_refctd_ptr m_proceduralGeomInfoBuffer; - smart_refctd_ptr m_proceduralAabbBuffer; - smart_refctd_ptr m_indirectBuffer; - - smart_refctd_ptr m_hdrImage; - smart_refctd_ptr m_hdrImageView; - - smart_refctd_ptr m_rayTracingDsPool; - smart_refctd_ptr m_rayTracingDs; - smart_refctd_ptr m_rayTracingPipeline; - uint64_t m_rayTracingStackSize; - ShaderBindingTable m_shaderBindingTable; - - smart_refctd_ptr m_presentDs; - 
smart_refctd_ptr m_presentDsPool; - smart_refctd_ptr m_presentPipeline; - - smart_refctd_ptr m_converter; - - - core::matrix4SIMD m_cachedModelViewProjectionMatrix; - bool m_useIndirectCommand = false; + uint32_t getWorkgroupCount(uint32_t dim, uint32_t size) + { + return (dim + size - 1) / size; + } + + bool createIndirectBuffer() + { + const auto getBufferRangeAddress = [](const SBufferRange& range) + { + return range.buffer->getDeviceAddress() + range.offset; + }; + const auto command = TraceRaysIndirectCommand_t{ + .raygenShaderRecordAddress = getBufferRangeAddress(m_shaderBindingTable.raygenGroupRange), + .raygenShaderRecordSize = m_shaderBindingTable.raygenGroupRange.size, + .missShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.missGroupsRange), + .missShaderBindingTableSize = m_shaderBindingTable.missGroupsRange.size, + .missShaderBindingTableStride = m_shaderBindingTable.missGroupsStride, + .hitShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.hitGroupsRange), + .hitShaderBindingTableSize = m_shaderBindingTable.hitGroupsRange.size, + .hitShaderBindingTableStride = m_shaderBindingTable.hitGroupsStride, + .callableShaderBindingTableAddress = getBufferRangeAddress(m_shaderBindingTable.callableGroupsRange), + .callableShaderBindingTableSize = m_shaderBindingTable.callableGroupsRange.size, + .callableShaderBindingTableStride = m_shaderBindingTable.callableGroupsStride, + .width = WIN_W, + .height = WIN_H, + .depth = 1, + }; + IGPUBuffer::SCreationParams params; + params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INDIRECT_BUFFER_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + params.size = sizeof(TraceRaysIndirectCommand_t); + m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = getGraphicsQueue() }, std::move(params), &command).move_into(m_indirectBuffer); + return true; + } + + void calculateRayTracingStackSize(const smart_refctd_ptr& pipeline) + { + const auto raygenStackSize 
= pipeline->getRaygenStackSize(); + auto getMaxSize = [&](auto ranges, auto valProj) -> uint16_t + { + auto maxValue = 0; + for (const auto& val : ranges) + { + maxValue = std::max(maxValue, std::invoke(valProj, val)); + } + return maxValue; + }; + + const auto closestHitStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::closestHit); + const auto anyHitStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::anyHit); + const auto intersectionStackMax = getMaxSize(pipeline->getHitStackSizes(), &IGPURayTracingPipeline::SHitGroupStackSize::intersection); + const auto missStackMax = getMaxSize(pipeline->getMissStackSizes(), std::identity{}); + const auto callableStackMax = getMaxSize(pipeline->getCallableStackSizes(), std::identity{}); + auto firstDepthStackSizeMax = std::max(closestHitStackMax, missStackMax); + firstDepthStackSizeMax = std::max(firstDepthStackSizeMax, intersectionStackMax + anyHitStackMax); + m_rayTracingStackSize = raygenStackSize + std::max(firstDepthStackSizeMax, callableStackMax); + } + + bool createShaderBindingTable(const smart_refctd_ptr& pipeline) + { + const auto& limits = m_device->getPhysicalDevice()->getLimits(); + const auto handleSize = SPhysicalDeviceLimits::ShaderGroupHandleSize; + const auto handleSizeAligned = nbl::core::alignUp(handleSize, limits.shaderGroupHandleAlignment); + + auto& raygenRange = m_shaderBindingTable.raygenGroupRange; + + auto& hitRange = m_shaderBindingTable.hitGroupsRange; + const auto hitHandles = pipeline->getHitHandles(); + + auto& missRange = m_shaderBindingTable.missGroupsRange; + const auto missHandles = pipeline->getMissHandles(); + + auto& callableRange = m_shaderBindingTable.callableGroupsRange; + const auto callableHandles = pipeline->getCallableHandles(); + + raygenRange = { + .offset = 0, + .size = core::alignUp(handleSizeAligned, limits.shaderGroupBaseAlignment) + }; + + missRange = { + .offset = raygenRange.size, 
+ .size = core::alignUp(missHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment), + }; + m_shaderBindingTable.missGroupsStride = handleSizeAligned; + + hitRange = { + .offset = missRange.offset + missRange.size, + .size = core::alignUp(hitHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment), + }; + m_shaderBindingTable.hitGroupsStride = handleSizeAligned; + + callableRange = { + .offset = hitRange.offset + hitRange.size, + .size = core::alignUp(callableHandles.size() * handleSizeAligned, limits.shaderGroupBaseAlignment), + }; + m_shaderBindingTable.callableGroupsStride = handleSizeAligned; + + const auto bufferSize = raygenRange.size + missRange.size + hitRange.size + callableRange.size; + + ICPUBuffer::SCreationParams cpuBufferParams; + cpuBufferParams.size = bufferSize; + auto cpuBuffer = ICPUBuffer::create(std::move(cpuBufferParams)); + uint8_t* pData = reinterpret_cast(cpuBuffer->getPointer()); + + // copy raygen region + memcpy(pData, &pipeline->getRaygen(), handleSize); + + // copy miss region + uint8_t* pMissData = pData + missRange.offset; + for (const auto& handle : missHandles) + { + memcpy(pMissData, &handle, handleSize); + pMissData += m_shaderBindingTable.missGroupsStride; + } + + // copy hit region + uint8_t* pHitData = pData + hitRange.offset; + for (const auto& handle : hitHandles) + { + memcpy(pHitData, &handle, handleSize); + pHitData += m_shaderBindingTable.hitGroupsStride; + } + + // copy callable region + uint8_t* pCallableData = pData + callableRange.offset; + for (const auto& handle : callableHandles) + { + memcpy(pCallableData, &handle, handleSize); + pCallableData += m_shaderBindingTable.callableGroupsStride; + } + + { + IGPUBuffer::SCreationParams params; + params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_SHADER_BINDING_TABLE_BIT; + params.size = bufferSize; + 
m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = getGraphicsQueue() }, std::move(params), pData).move_into(raygenRange.buffer); + missRange.buffer = core::smart_refctd_ptr(raygenRange.buffer); + hitRange.buffer = core::smart_refctd_ptr(raygenRange.buffer); + callableRange.buffer = core::smart_refctd_ptr(raygenRange.buffer); + } + + return true; + } + + bool createAccelerationStructuresFromGeometry(const IGeometryCreator* gc) + { + auto queue = getGraphicsQueue(); + // get geometries into ICPUBuffers + auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + if (!pool) + return logFail("Couldn't create Command Pool for geometry creation!"); + + const auto defaultMaterial = Material{ + .ambient = {0.2, 0.1, 0.1}, + .diffuse = {0.8, 0.3, 0.3}, + .specular = {0.8, 0.8, 0.8}, + .shininess = 1.0f, + .alpha = 1.0f, + }; + + auto getTranslationMatrix = [](float32_t x, float32_t y, float32_t z) + { + core::matrix3x4SIMD transform; + transform.setTranslation(nbl::core::vectorSIMDf(x, y, z, 0)); + return transform; + }; + + core::matrix3x4SIMD planeTransform; + planeTransform.setRotation(quaternion::fromAngleAxis(core::radians(-90.0f), vector3df_SIMD{ 1, 0, 0 })); + + // triangles geometries + const auto cpuObjects = std::array{ + ReferenceObjectCpu { + .meta = {.type = OT_RECTANGLE, .name = "Plane Mesh"}, + .data = gc->createRectangleMesh(nbl::core::vector2df_SIMD(10, 10)), + .material = defaultMaterial, + .transform = planeTransform, + }, + ReferenceObjectCpu { + .meta = {.type = OT_CUBE, .name = "Cube Mesh"}, + .data = gc->createCubeMesh(nbl::core::vector3df(1, 1, 1)), + .material = defaultMaterial, + .transform = getTranslationMatrix(0, 0.5f, 0), + }, + ReferenceObjectCpu { + .meta = {.type = OT_CUBE, .name = "Cube Mesh 2"}, + .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)), + .material = Material{ + .ambient = {0.1, 0.1, 0.2}, + .diffuse = {0.2, 0.2, 0.8}, + 
.specular = {0.8, 0.8, 0.8}, + .shininess = 1.0f, + }, + .transform = getTranslationMatrix(-5.0f, 1.0f, 0), + }, + ReferenceObjectCpu { + .meta = {.type = OT_CUBE, .name = "Transparent Cube Mesh"}, + .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)), + .material = Material{ + .ambient = {0.1, 0.2, 0.1}, + .diffuse = {0.2, 0.8, 0.2}, + .specular = {0.8, 0.8, 0.8}, + .shininess = 1.0f, + .alpha = 0.2, + }, + .transform = getTranslationMatrix(5.0f, 1.0f, 0), + }, + }; + + struct CPUTriBufferBindings + { + nbl::asset::SBufferBinding vertex, index; + }; + std::array cpuTriBuffers; + + for (uint32_t i = 0; i < cpuObjects.size(); i++) + { + const auto& cpuObject = cpuObjects[i]; + + auto vBuffer = smart_refctd_ptr(cpuObject.data.bindings[0].buffer); // no offset + auto vUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | + IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + vBuffer->addUsageFlags(vUsage); + vBuffer->setContentHash(vBuffer->computeContentHash()); + + auto iBuffer = smart_refctd_ptr(cpuObject.data.indexBuffer.buffer); // no offset + auto iUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | + IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + + if (cpuObject.data.indexType != EIT_UNKNOWN) + if (iBuffer) + { + iBuffer->addUsageFlags(iUsage); + iBuffer->setContentHash(iBuffer->computeContentHash()); + } + + cpuTriBuffers[i] = { + .vertex = {.offset = 0, .buffer = vBuffer}, + .index = {.offset = 0, .buffer = iBuffer}, + }; + + } + + // procedural geometries + using Aabb = IGPUBottomLevelAccelerationStructure::AABB_t; + + smart_refctd_ptr cpuProcBuffer; + { + ICPUBuffer::SCreationParams params; + params.size = NumberOfProceduralGeometries * sizeof(Aabb); + cpuProcBuffer = 
ICPUBuffer::create(std::move(params)); + } + + core::vector proceduralGeoms; + proceduralGeoms.reserve(NumberOfProceduralGeometries); + auto proceduralGeometries = reinterpret_cast(cpuProcBuffer->getPointer()); + for (int32_t i = 0; i < NumberOfProceduralGeometries; i++) + { + const auto middle_i = NumberOfProceduralGeometries / 2.0; + SProceduralGeomInfo sphere = { + .material = hlsl::_static_cast(Material{ + .ambient = {0.1, 0.05 * i, 0.1}, + .diffuse = {0.3, 0.2 * i, 0.3}, + .specular = {0.8, 0.8, 0.8}, + .shininess = 1.0f, + }), + .center = float32_t3((i - middle_i) * 4.0, 2, 5.0), + .radius = 1, + }; + + proceduralGeoms.push_back(sphere); + const auto sphereMin = sphere.center - sphere.radius; + const auto sphereMax = sphere.center + sphere.radius; + proceduralGeometries[i] = { + vector3d(sphereMin.x, sphereMin.y, sphereMin.z), + vector3d(sphereMax.x, sphereMax.y, sphereMax.z) + }; + } + + { + IGPUBuffer::SCreationParams params; + params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + params.size = proceduralGeoms.size() * sizeof(SProceduralGeomInfo); + m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), proceduralGeoms.data()).move_into(m_proceduralGeomInfoBuffer); + } + + // get ICPUBuffers into ICPUBLAS + // TODO use one BLAS and multiple triangles/aabbs in one + const auto blasCount = std::size(cpuObjects) + 1; + const auto proceduralBlasIdx = std::size(cpuObjects); + + std::array, std::size(cpuObjects)+1u> cpuBlas; + for (uint32_t i = 0; i < blasCount; i++) + { + auto& blas = cpuBlas[i]; + blas = make_smart_refctd_ptr(); + + if (i == proceduralBlasIdx) + { + auto aabbs = make_refctd_dynamic_array>>(1u); + auto primitiveCounts = make_refctd_dynamic_array>(1u); + + auto& aabb = aabbs->front(); + auto& primCount = primitiveCounts->front(); + + primCount = NumberOfProceduralGeometries; + 
aabb.data = { .offset = 0, .buffer = cpuProcBuffer }; + aabb.stride = sizeof(IGPUBottomLevelAccelerationStructure::AABB_t); + aabb.geometryFlags = IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; // only allow opaque for now + + blas->setGeometries(std::move(aabbs), std::move(primitiveCounts)); + } + else + { + auto triangles = make_refctd_dynamic_array>>(1u); + auto primitiveCounts = make_refctd_dynamic_array>(1u); + + auto& tri = triangles->front(); + auto& primCount = primitiveCounts->front(); + const auto& geom = cpuObjects[i]; + const auto& cpuBuf = cpuTriBuffers[i]; + + const bool useIndex = geom.data.indexType != EIT_UNKNOWN; + const uint32_t vertexStride = geom.data.inputParams.bindings[0].stride; + const uint32_t numVertices = cpuBuf.vertex.buffer->getSize() / vertexStride; + + if (useIndex) + primCount = geom.data.indexCount / 3; + else + primCount = numVertices / 3; + + tri.vertexData[0] = cpuBuf.vertex; + tri.indexData = useIndex ? cpuBuf.index : cpuBuf.vertex; + tri.maxVertex = numVertices - 1; + tri.vertexStride = vertexStride; + tri.vertexFormat = EF_R32G32B32_SFLOAT; + tri.indexType = geom.data.indexType; + tri.geometryFlags = geom.material.isTransparent() ? 
+ IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::NO_DUPLICATE_ANY_HIT_INVOCATION_BIT : + IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; + + blas->setGeometries(std::move(triangles), std::move(primitiveCounts)); + } + + auto blasFlags = bitflag(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT) | IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::ALLOW_COMPACTION_BIT; + if (i == proceduralBlasIdx) + blasFlags |= IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT; + + blas->setBuildFlags(blasFlags); + blas->setContentHash(blas->computeContentHash()); + } + + auto geomInfoBuffer = ICPUBuffer::create({ std::size(cpuObjects) * sizeof(STriangleGeomInfo) }); + STriangleGeomInfo* geomInfos = reinterpret_cast(geomInfoBuffer->getPointer()); + + // get ICPUBLAS into ICPUTLAS + auto geomInstances = make_refctd_dynamic_array>(blasCount); + { + uint32_t i = 0; + for (auto instance = geomInstances->begin(); instance != geomInstances->end(); instance++, i++) + { + const auto isProceduralInstance = i == proceduralBlasIdx; + ICPUTopLevelAccelerationStructure::StaticInstance inst; + inst.base.blas = cpuBlas[i]; + inst.base.flags = static_cast(IGPUTopLevelAccelerationStructure::INSTANCE_FLAGS::TRIANGLE_FACING_CULL_DISABLE_BIT); + inst.base.instanceCustomIndex = i; + inst.base.instanceShaderBindingTableRecordOffset = isProceduralInstance ? 2 : 0;; + inst.base.mask = 0xFF; + inst.transform = isProceduralInstance ? 
matrix3x4SIMD() : cpuObjects[i].transform; + + instance->instance = inst; + } + } + + auto cpuTlas = make_smart_refctd_ptr(); + cpuTlas->setInstances(std::move(geomInstances)); + cpuTlas->setBuildFlags(IGPUTopLevelAccelerationStructure::BUILD_FLAGS::PREFER_FAST_TRACE_BIT); + + // convert with asset converter + smart_refctd_ptr converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} }); + struct MyInputs : CAssetConverter::SInputs + { + // For the GPU Buffers to be directly writeable and so that we don't need a Transfer Queue submit at all + inline uint32_t constrainMemoryTypeBits(const size_t groupCopyID, const IAsset* canonicalAsset, const blake3_hash_t& contentHash, const IDeviceMemoryBacked* memoryBacked) const override + { + assert(memoryBacked); + return memoryBacked->getObjectType() != IDeviceMemoryBacked::EOT_BUFFER ? (~0u) : rebarMemoryTypes; + } + + uint32_t rebarMemoryTypes; + } inputs = {}; + inputs.logger = m_logger.get(); + inputs.rebarMemoryTypes = m_physicalDevice->getDirectVRAMAccessMemoryTypeBits(); + // the allocator needs to be overriden to hand out memory ranges which have already been mapped so that the ReBAR fast-path can kick in + // (multiple buffers can be bound to same memory, but memory can only be mapped once at one place, so Asset Converter can't do it) + struct MyAllocator final : public IDeviceMemoryAllocator + { + ILogicalDevice* getDeviceForAllocations() const override { return device; } + + SAllocation allocate(const SAllocateInfo& info) override + { + auto retval = device->allocate(info); + // map what is mappable by default so ReBAR checks succeed + if (retval.isValid() && retval.memory->isMappable()) + retval.memory->map({ .offset = 0,.length = info.size }); + return retval; + } + + ILogicalDevice* device; + } myalloc; + myalloc.device = m_device.get(); + inputs.allocator = &myalloc; + + std::array tmpTlas; + std::array tmpBuffers; + { + tmpTlas[0] = cpuTlas.get(); + for (uint32_t i = 0; i < 
cpuObjects.size(); i++) + { + tmpBuffers[2 * i + 0] = cpuTriBuffers[i].vertex.buffer.get(); + tmpBuffers[2 * i + 1] = cpuTriBuffers[i].index.buffer.get(); + } + tmpBuffers[2 * proceduralBlasIdx] = cpuProcBuffer.get(); + + std::get>(inputs.assets) = tmpTlas; + std::get>(inputs.assets) = tmpBuffers; + } + + auto reservation = converter->reserve(inputs); + { + auto prepass = [&](const auto & references) -> bool + { + auto objects = reservation.getGPUObjects(); + uint32_t counter = {}; + for (auto& object : objects) + { + auto gpu = object.value; + auto* reference = references[counter]; + + if (reference) + { + if (!gpu) + { + m_logger->log("Failed to convert a CPU object to GPU!", ILogger::ELL_ERROR); + return false; + } + } + counter++; + } + return true; + }; + + prepass.template operator() < ICPUTopLevelAccelerationStructure > (tmpTlas); + prepass.template operator() < ICPUBuffer > (tmpBuffers); + } + + constexpr auto CompBufferCount = 2; + std::array, CompBufferCount> compBufs = {}; + std::array compBufInfos = {}; + { + auto pool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT); + pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, compBufs); + compBufs.front()->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + for (auto i = 0; i < CompBufferCount; i++) + compBufInfos[i].cmdbuf = compBufs[i].get(); + } + auto compSema = m_device->createSemaphore(0u); + SIntendedSubmitInfo compute = {}; + compute.queue = queue; + compute.scratchCommandBuffers = compBufInfos; + compute.scratchSemaphore = { + .semaphore = compSema.get(), + .value = 0u, + .stageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT | PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_COPY_BIT + }; + // convert + { + smart_refctd_ptr scratchAlloc; + { + constexpr auto MaxAlignment = 256; + constexpr auto MinAllocationSize = 1024; + const auto scratchSize = 
core::alignUp(reservation.getMaxASBuildScratchSize(false), MaxAlignment); + + + IGPUBuffer::SCreationParams creationParams = {}; + creationParams.size = scratchSize; + creationParams.usage = IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | IGPUBuffer::EUF_STORAGE_BUFFER_BIT; + auto scratchBuffer = m_device->createBuffer(std::move(creationParams)); + + auto reqs = scratchBuffer->getMemoryReqs(); + reqs.memoryTypeBits &= m_physicalDevice->getDirectVRAMAccessMemoryTypeBits(); + + auto allocation = m_device->allocate(reqs, scratchBuffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); + allocation.memory->map({ .offset = 0,.length = reqs.size }); + + scratchAlloc = make_smart_refctd_ptr( + SBufferRange{0ull, scratchSize, std::move(scratchBuffer)}, + core::allocator(), MaxAlignment, MinAllocationSize + ); + } + + struct MyParams final : CAssetConverter::SConvertParams + { + inline uint32_t getFinalOwnerQueueFamily(const IGPUBuffer* buffer, const core::blake3_hash_t& createdFrom) override + { + return finalUser; + } + inline uint32_t getFinalOwnerQueueFamily(const IGPUAccelerationStructure* image, const core::blake3_hash_t& createdFrom) override + { + return finalUser; + } + + uint8_t finalUser; + } params = {}; + params.utilities = m_utils.get(); + params.compute = &compute; + params.scratchForDeviceASBuild = scratchAlloc.get(); + params.finalUser = queue->getFamilyIndex(); + + auto future = reservation.convert(params); + if (future.copy() != IQueue::RESULT::SUCCESS) + { + m_logger->log("Failed to await submission feature!", ILogger::ELL_ERROR); + return false; + } + // 2 submits, BLAS build, TLAS build, DO NOT ADD COMPACTIONS IN THIS EXAMPLE! 
+ if (compute.getFutureScratchSemaphore().value>3) + m_logger->log("Overflow submitted on Compute Queue despite using ReBAR (no transfer submits or usage of staging buffer) and providing a AS Build Scratch Buffer of correctly queried max size!",system::ILogger::ELL_ERROR); + + // assign gpu objects to output + auto&& tlases = reservation.getGPUObjects(); + m_gpuTlas = tlases[0].value; + auto&& buffers = reservation.getGPUObjects(); + for (uint32_t i = 0; i < cpuObjects.size(); i++) + { + auto& cpuObject = cpuObjects[i]; + + m_gpuTriangleGeometries.push_back(ReferenceObjectGpu{ + .meta = cpuObject.meta, + .bindings = { + .vertex = {.offset = 0, .buffer = buffers[2 * i + 0].value }, + .index = {.offset = 0, .buffer = buffers[2 * i + 1].value }, + }, + .vertexStride = cpuObject.data.inputParams.bindings[0].stride, + .indexType = cpuObject.data.indexType, + .indexCount = cpuObject.data.indexCount, + .material = hlsl::_static_cast(cpuObject.material), + .transform = cpuObject.transform, + }); + } + m_proceduralAabbBuffer = buffers[2 * proceduralBlasIdx].value; + + for (uint32_t i = 0; i < m_gpuTriangleGeometries.size(); i++) + { + const auto& gpuObject = m_gpuTriangleGeometries[i]; + const uint64_t vertexBufferAddress = gpuObject.bindings.vertex.buffer->getDeviceAddress(); + geomInfos[i] = { + .material = gpuObject.material, + .vertexBufferAddress = vertexBufferAddress, + .indexBufferAddress = gpuObject.useIndex() ? 
gpuObject.bindings.index.buffer->getDeviceAddress() : vertexBufferAddress, + .vertexStride = gpuObject.vertexStride, + .objType = gpuObject.meta.type, + .indexType = gpuObject.indexType, + .smoothNormals = s_smoothNormals[gpuObject.meta.type], + }; + } + } + + { + IGPUBuffer::SCreationParams params; + params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + params.size = geomInfoBuffer->getSize(); + m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), geomInfos).move_into(m_triangleGeomInfoBuffer); + } + + return true; + } + + + + smart_refctd_ptr m_window; + smart_refctd_ptr> m_surface; + smart_refctd_ptr m_semaphore; + uint64_t m_realFrameIx = 0; + uint32_t m_frameAccumulationCounter = 0; + std::array, MaxFramesInFlight> m_cmdBufs; + ISimpleManagedSurface::SAcquireResult m_currentImageAcquire = {}; + + core::smart_refctd_ptr m_inputSystem; + InputSystem::ChannelReader m_mouse; + InputSystem::ChannelReader m_keyboard; + + struct CameraSetting + { + float fov = 60.f; + float zNear = 0.1f; + float zFar = 10000.f; + float moveSpeed = 1.f; + float rotateSpeed = 1.f; + float viewWidth = 10.f; + float camYAngle = 165.f / 180.f * 3.14159f; + float camXAngle = 32.f / 180.f * 3.14159f; + + } m_cameraSetting; + Camera m_camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD()); + + Light m_light = { + .direction = {-1.0f, -1.0f, -0.4f}, + .position = {10.0f, 15.0f, 8.0f}, + .outerCutoff = 0.866025404f, // {cos(radians(30.0f))}, + .type = ELT_DIRECTIONAL + }; + + video::CDumbPresentationOracle m_oracle; + + struct C_UI + { + nbl::core::smart_refctd_ptr manager; + + struct + { + core::smart_refctd_ptr gui, scene; + } samplers; + + core::smart_refctd_ptr descriptorSet; + } m_ui; + core::smart_refctd_ptr m_guiDescriptorSetPool; + + core::vector m_gpuTriangleGeometries; + 
core::vector m_gpuIntersectionSpheres; + uint32_t m_intersectionHitGroupIdx; + + smart_refctd_ptr m_gpuTlas; + smart_refctd_ptr m_instanceBuffer; + + smart_refctd_ptr m_triangleGeomInfoBuffer; + smart_refctd_ptr m_proceduralGeomInfoBuffer; + smart_refctd_ptr m_proceduralAabbBuffer; + smart_refctd_ptr m_indirectBuffer; + + smart_refctd_ptr m_hdrImage; + smart_refctd_ptr m_hdrImageView; + + smart_refctd_ptr m_rayTracingDsPool; + smart_refctd_ptr m_rayTracingDs; + smart_refctd_ptr m_rayTracingPipeline; + uint64_t m_rayTracingStackSize; + ShaderBindingTable m_shaderBindingTable; + + smart_refctd_ptr m_presentDs; + smart_refctd_ptr m_presentDsPool; + smart_refctd_ptr m_presentPipeline; + + smart_refctd_ptr m_converter; + + + core::matrix4SIMD m_cachedModelViewProjectionMatrix; + bool m_useIndirectCommand = false; }; -NBL_MAIN_FUNC(RaytracingPipelineApp) +NBL_MAIN_FUNC(RaytracingPipelineApp) \ No newline at end of file From 04e32adc077f87f2fe854e9cf03172ed7da7a35e Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 14 Jun 2025 09:47:52 +0700 Subject: [PATCH 237/296] Fix example 64,67,70,71 --- 64_EmulatedFloatTest/main.cpp | 20 ++++----- 67_RayQueryGeometry/main.cpp | 6 +-- 70_FLIPFluids/main.cpp | 11 +---- 71_RayTracingPipeline/main.cpp | 78 ++++++++++------------------------ 4 files changed, 36 insertions(+), 79 deletions(-) diff --git a/64_EmulatedFloatTest/main.cpp b/64_EmulatedFloatTest/main.cpp index a9ff5fde6..b44cb2b4e 100644 --- a/64_EmulatedFloatTest/main.cpp +++ b/64_EmulatedFloatTest/main.cpp @@ -255,7 +255,7 @@ class CompatibilityTest final : public MonoDeviceApplication, public MonoAssetMa // Load shaders, set up pipeline { - smart_refctd_ptr shader; + smart_refctd_ptr shader; { IAssetLoader::SAssetLoadParams lp = {}; lp.logger = base.m_logger.get(); @@ -271,12 +271,12 @@ class CompatibilityTest final : public MonoDeviceApplication, public MonoAssetMa // It would be super weird if loading a shader from a file produced more than 1 asset assert(assets.size() == 
1); - smart_refctd_ptr source = IAsset::castDown(assets[0]); + smart_refctd_ptr source = IAsset::castDown(assets[0]); auto* compilerSet = base.m_assetMgr->getCompilerSet(); nbl::asset::IShaderCompiler::SCompilerOptions options = {}; - options.stage = source->getStage(); + options.stage = ESS_COMPUTE; options.targetSpirvVersion = base.m_device->getPhysicalDevice()->getLimits().spirvVersion; options.spirvOptimizer = nullptr; options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_SOURCE_BIT; @@ -286,9 +286,7 @@ class CompatibilityTest final : public MonoDeviceApplication, public MonoAssetMa auto spirv = compilerSet->compileToSPIRV(source.get(), options); - ILogicalDevice::SShaderCreationParameters params{}; - params.cpushader = spirv.get(); - shader = base.m_device->createShader(params); + shader = base.m_device->compileShader({spirv.get()}); } if (!shader) @@ -923,7 +921,7 @@ class CompatibilityTest final : public MonoDeviceApplication, public MonoAssetMa // Load shaders, set up pipeline { - smart_refctd_ptr shader; + smart_refctd_ptr shader; { IAssetLoader::SAssetLoadParams lp = {}; lp.logger = base.m_logger.get(); @@ -939,12 +937,12 @@ class CompatibilityTest final : public MonoDeviceApplication, public MonoAssetMa // It would be super weird if loading a shader from a file produced more than 1 asset assert(assets.size() == 1); - smart_refctd_ptr source = IAsset::castDown(assets[0]); + smart_refctd_ptr source = IAsset::castDown(assets[0]); auto* compilerSet = base.m_assetMgr->getCompilerSet(); IShaderCompiler::SCompilerOptions options = {}; - options.stage = source->getStage(); + options.stage = ESS_COMPUTE; options.targetSpirvVersion = base.m_device->getPhysicalDevice()->getLimits().spirvVersion; options.spirvOptimizer = nullptr; options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_SOURCE_BIT; @@ -954,9 +952,7 @@ class CompatibilityTest final : public MonoDeviceApplication, public MonoAssetMa auto spirv = 
compilerSet->compileToSPIRV(source.get(), options); - ILogicalDevice::SShaderCreationParameters params{}; - params.cpushader = spirv.get(); - shader = base.m_device->createShader(params); + shader = base.m_device->compileShader({spirv.get()}); } if (!shader) diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp index 7371cf1ea..fdee5c5a1 100644 --- a/67_RayQueryGeometry/main.cpp +++ b/67_RayQueryGeometry/main.cpp @@ -161,9 +161,8 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu const auto assets = bundle.getContents(); assert(assets.size() == 1); - smart_refctd_ptr shaderSrc = IAsset::castDown(assets[0]); - shaderSrc->setShaderStage(IShader::E_SHADER_STAGE::ESS_COMPUTE); - auto shader = m_device->createShader(shaderSrc.get()); + smart_refctd_ptr shaderSrc = IAsset::castDown(assets[0]); + auto shader = m_device->compileShader({shaderSrc.get()}); if (!shader) return logFail("Failed to create shader!"); @@ -173,6 +172,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu IGPUComputePipeline::SCreationParams params = {}; params.layout = pipelineLayout.get(); params.shader.shader = shader.get(); + params.shader.entryPoint = "main"; if (!m_device->createComputePipelines(nullptr, { ¶ms, 1 }, &renderPipeline)) return logFail("Failed to create compute pipeline"); } diff --git a/70_FLIPFluids/main.cpp b/70_FLIPFluids/main.cpp index a0d2ad95d..c0f68ca49 100644 --- a/70_FLIPFluids/main.cpp +++ b/70_FLIPFluids/main.cpp @@ -374,7 +374,6 @@ class FLIPFluidsApp final : public examples::SimpleWindowedApplication, public a params.layout = pipelineLayout.get(); params.shader.entryPoint = entryPoint; params.shader.shader = shader.get(); - params.shader.stage = ESS_COMPUTE; m_device->createComputePipelines(nullptr, { ¶ms,1 }, &pipeline); }; @@ -631,7 +630,6 @@ class FLIPFluidsApp final : public examples::SimpleWindowedApplication, public a params.layout = pipelineLayout.get(); params.shader.entryPoint 
= iterateKernel; params.shader.shader = iterateShader.get(); - params.shader.stage = ESS_COMPUTE; m_device->createComputePipelines(nullptr, { ¶ms,1 }, &m_iterateDiffusionPipeline); } @@ -640,7 +638,6 @@ class FLIPFluidsApp final : public examples::SimpleWindowedApplication, public a params.layout = pipelineLayout.get(); params.shader.entryPoint = applyKernel; params.shader.shader = applyShader.get(); - params.shader.stage = ESS_COMPUTE; m_device->createComputePipelines(nullptr, { ¶ms,1 }, &m_diffusionPipeline); } @@ -1635,11 +1632,6 @@ class FLIPFluidsApp final : public examples::SimpleWindowedApplication, public a blendParams.blendParams[0u].colorWriteMask = (1u << 0u) | (1u << 1u) | (1u << 2u) | (1u << 3u); { - IPipelineBase::SShaderSpecInfo specInfo[] = { - {.shader = vs.get(), .entryPoint = "main", .stage = ESS_VERTEX, }, - {.shader = fs.get(), .entryPoint = "main", .stage = ESS_FRAGMENT, }, - }; - const asset::SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_VERTEX, .offset = 0, .size = sizeof(uint64_t) }; const auto pipelineLayout = m_device->createPipelineLayout({ &pcRange , 1 }, nullptr, smart_refctd_ptr(descriptorSetLayout1), nullptr, nullptr); @@ -1649,7 +1641,8 @@ class FLIPFluidsApp final : public examples::SimpleWindowedApplication, public a IGPUGraphicsPipeline::SCreationParams params[1] = {}; params[0].layout = pipelineLayout.get(); - params[0].shaders = specInfo; + params[0].vertexShader = { .shader = vs.get(), .entryPoint = "main", }; + params[0].fragmentShader = { .shader = fs.get(), .entryPoint = "main", }; params[0].cached = { .vertexInput = { }, diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index 42aaa2233..0642220ba 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -136,7 +136,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, } // Load Custom Shader - auto loadCompileAndCreateShader = [&](const std::string& relPath) -> 
smart_refctd_ptr + auto loadCompileAndCreateShader = [&](const std::string& relPath) -> smart_refctd_ptr { IAssetLoader::SAssetLoadParams lp = {}; lp.logger = m_logger.get(); @@ -147,11 +147,11 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, return nullptr; // lets go straight from ICPUSpecializedShader to IGPUSpecializedShader - auto sourceRaw = IAsset::castDown(assets[0]); + auto sourceRaw = IAsset::castDown(assets[0]); if (!sourceRaw) return nullptr; - return m_device->createShader({ sourceRaw.get(), nullptr, shaderReadCache.get(), shaderWriteCache.get() }); + return m_device->compileShader({ sourceRaw.get(), nullptr, shaderReadCache.get(), shaderWriteCache.get() }); }; // load shaders @@ -317,38 +317,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, const auto pipelineLayout = m_device->createPipelineLayout({ &pcRange, 1 }, smart_refctd_ptr(descriptorSetLayout), nullptr, nullptr, nullptr); IGPURayTracingPipeline::SCreationParams params = {}; - - enum RtDemoShader - { - RTDS_RAYGEN, - RTDS_MISS, - RTDS_MISS_SHADOW, - RTDS_CLOSEST_HIT, - RTDS_SPHERE_CLOSEST_HIT, - RTDS_ANYHIT_PRIMARY, - RTDS_ANYHIT_SHADOW, - RTDS_INTERSECTION, - RTDS_DIRECTIONAL_CALL, - RTDS_POINT_CALL, - RTDS_SPOT_CALL, - RTDS_COUNT - }; - - IGPUShader::SSpecInfo shaders[RTDS_COUNT]; - shaders[RTDS_RAYGEN] = { .shader = raygenShader.get() }; - shaders[RTDS_MISS] = { .shader = missShader.get() }; - shaders[RTDS_MISS_SHADOW] = { .shader = missShadowShader.get() }; - shaders[RTDS_CLOSEST_HIT] = { .shader = closestHitShader.get() }; - shaders[RTDS_SPHERE_CLOSEST_HIT] = { .shader = proceduralClosestHitShader.get() }; - shaders[RTDS_ANYHIT_PRIMARY] = { .shader = anyHitShaderColorPayload.get() }; - shaders[RTDS_ANYHIT_SHADOW] = { .shader = anyHitShaderShadowPayload.get() }; - shaders[RTDS_INTERSECTION] = { .shader = intersectionHitShader.get() }; - shaders[RTDS_DIRECTIONAL_CALL] = { .shader = directionalLightCallShader.get() }; - 
shaders[RTDS_POINT_CALL] = { .shader = pointLightCallShader.get() }; - shaders[RTDS_SPOT_CALL] = { .shader = spotLightCallShader.get() }; - params.layout = pipelineLayout.get(); - params.shaders = std::span(shaders); using RayTracingFlags = IGPURayTracingPipeline::SCreationParams::FLAGS; params.flags = core::bitflag(RayTracingFlags::NO_NULL_MISS_SHADERS) | RayTracingFlags::NO_NULL_INTERSECTION_SHADERS | @@ -356,42 +325,40 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, auto& shaderGroups = params.shaderGroups; - shaderGroups.raygen = { .index = RTDS_RAYGEN }; + shaderGroups.raygen = { .shader = raygenShader.get(), .entryPoint = "main" }; - IRayTracingPipelineBase::SGeneralShaderGroup missGroups[EMT_COUNT]; - missGroups[EMT_PRIMARY] = { .index = RTDS_MISS }; - missGroups[EMT_OCCLUSION] = { .index = RTDS_MISS_SHADOW }; + IGPUPipelineBase::SShaderSpecInfo missGroups[EMT_COUNT]; + missGroups[EMT_PRIMARY] = { .shader = missShader.get(), .entryPoint = "main" }; + missGroups[EMT_OCCLUSION] = { .shader = missShadowShader.get(), .entryPoint = "main" }; shaderGroups.misses = missGroups; auto getHitGroupIndex = [](E_GEOM_TYPE geomType, E_RAY_TYPE rayType) { return geomType * ERT_COUNT + rayType; }; - IRayTracingPipelineBase::SHitShaderGroup hitGroups[E_RAY_TYPE::ERT_COUNT * E_GEOM_TYPE::EGT_COUNT]; + IGPURayTracingPipeline::SHitGroup hitGroups[E_RAY_TYPE::ERT_COUNT * E_GEOM_TYPE::EGT_COUNT]; hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_PRIMARY)] = { - .closestHit = RTDS_CLOSEST_HIT, - .anyHit = RTDS_ANYHIT_PRIMARY, + .closestHit = {.shader = closestHitShader.get(), .entryPoint = "main" }, + .anyHit = { .shader = anyHitShaderColorPayload.get(), .entryPoint = "main" }, }; hitGroups[getHitGroupIndex(EGT_TRIANGLES, ERT_OCCLUSION)] = { - .closestHit = IGPURayTracingPipeline::SGeneralShaderGroup::Unused, - .anyHit = RTDS_ANYHIT_SHADOW, + .anyHit = { .shader = anyHitShaderShadowPayload.get(), .entryPoint = "main" }, }; 
hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_PRIMARY)] = { - .closestHit = RTDS_SPHERE_CLOSEST_HIT, - .anyHit = RTDS_ANYHIT_PRIMARY, - .intersection = RTDS_INTERSECTION, + .closestHit = { .shader = proceduralClosestHitShader.get(), .entryPoint = "main" }, + .anyHit = { .shader = anyHitShaderColorPayload.get(), .entryPoint = "main" }, + .intersection = { .shader = intersectionHitShader.get(), .entryPoint = "main" }, }; hitGroups[getHitGroupIndex(EGT_PROCEDURAL, ERT_OCCLUSION)] = { - .closestHit = IGPURayTracingPipeline::SGeneralShaderGroup::Unused, - .anyHit = RTDS_ANYHIT_SHADOW, - .intersection = RTDS_INTERSECTION, + .anyHit = { .shader = anyHitShaderShadowPayload.get(), .entryPoint = "main" }, + .intersection = { .shader = intersectionHitShader.get(), .entryPoint = "main" }, }; shaderGroups.hits = hitGroups; - IRayTracingPipelineBase::SGeneralShaderGroup callableGroups[ELT_COUNT]; - callableGroups[ELT_DIRECTIONAL] = { .index = RTDS_DIRECTIONAL_CALL }; - callableGroups[ELT_POINT] = { .index = RTDS_POINT_CALL }; - callableGroups[ELT_SPOT] = { .index = RTDS_SPOT_CALL }; + IGPUPipelineBase::SShaderSpecInfo callableGroups[ELT_COUNT]; + callableGroups[ELT_DIRECTIONAL] = { .shader = directionalLightCallShader.get(), .entryPoint = "main" }; + callableGroups[ELT_POINT] = { .shader = pointLightCallShader.get(), .entryPoint = "main" }; + callableGroups[ELT_SPOT] = { .shader = spotLightCallShader.get(), .entryPoint = "main" }; shaderGroups.callables = callableGroups; params.cached.maxRecursionDepth = 1; @@ -443,9 +410,9 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, if (!fsTriProtoPPln) return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!"); - const IGPUShader::SSpecInfo fragSpec = { + const IGPUPipelineBase::SShaderSpecInfo fragSpec = { + .shader = fragmentShader.get(), .entryPoint = "main", - .shader = fragmentShader.get() }; auto presentLayout = m_device->createPipelineLayout( @@ -1163,6 
+1130,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, .diffuse = {0.2, 0.2, 0.8}, .specular = {0.8, 0.8, 0.8}, .shininess = 1.0f, + .alpha = 1.0f, }, .transform = getTranslationMatrix(-5.0f, 1.0f, 0), }, From 4c10dc1cdba4ab12dfedef97768aa4a10e606213 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 16 Jun 2025 15:09:09 +0700 Subject: [PATCH 238/296] use config header file for workgroup sizes --- 23_Arithmetic2UnitTest/main.cpp | 25 +++---------------------- 29_Arithmetic2Bench/main.cpp | 27 ++++----------------------- 2 files changed, 7 insertions(+), 45 deletions(-) diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp index 85d6e610f..158fc5c4c 100644 --- a/23_Arithmetic2UnitTest/main.cpp +++ b/23_Arithmetic2UnitTest/main.cpp @@ -1,6 +1,7 @@ #include "nbl/application_templates/BasicMultiQueueApplication.hpp" #include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" #include "app_resources/common.hlsl" +#include "nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl" using namespace nbl; using namespace core; @@ -214,7 +215,8 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, b_useNative, itemsPerWG, itemsPerInvocation) && passed; logTestOutcome(passed, itemsPerWG); - itemsPerWG = calculateItemsPerWorkgroup(workgroupSize, subgroupSize, itemsPerInvocation); + hlsl::workgroup2::SArithmeticConfiguration wgConfig = hlsl::workgroup2::SArithmeticConfiguration::create(hlsl::findMSB(workgroupSize), subgroupSizeLog2, itemsPerInvocation); + itemsPerWG = wgConfig.VirtualWorkgroupSize * wgConfig.ItemsPerInvocation_0; m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG); passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, b_useNative, itemsPerWG, itemsPerInvocation) && passed; logTestOutcome(passed, itemsPerWG); @@ -268,27 
+270,6 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu } } - // reflects calculations in workgroup2::ArithmeticConfiguration - uint32_t calculateItemsPerWorkgroup(const uint32_t workgroupSize, const uint32_t subgroupSize, const uint32_t itemsPerInvocation) - { - if (workgroupSize <= subgroupSize) - return workgroupSize * itemsPerInvocation; - - const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize); - const uint8_t workgroupSizeLog2 = hlsl::findMSB(workgroupSize); - - const uint16_t levels = (workgroupSizeLog2 == subgroupSizeLog2) ? 1 : - (workgroupSizeLog2 > subgroupSizeLog2 * 2 + 2) ? 3 : 2; - - const uint16_t itemsPerInvocationProductLog2 = max(workgroupSizeLog2 - subgroupSizeLog2 * levels, 0); - uint16_t itemsPerInvocation1 = (levels == 3) ? min(itemsPerInvocationProductLog2, 2) : itemsPerInvocationProductLog2; - itemsPerInvocation1 = uint16_t(1u) << itemsPerInvocation1; - - uint32_t virtualWorkgroupSize = 1u << max(subgroupSizeLog2 * levels, workgroupSizeLog2); - - return itemsPerInvocation * virtualWorkgroupSize; - } - // create pipeline (specialized every test) [TODO: turn into a future/async] smart_refctd_ptr createPipeline(const ICPUShader* overridenUnspecialized, const uint8_t subgroupSizeLog2) { diff --git a/29_Arithmetic2Bench/main.cpp b/29_Arithmetic2Bench/main.cpp index d317f07df..98ff65e05 100644 --- a/29_Arithmetic2Bench/main.cpp +++ b/29_Arithmetic2Bench/main.cpp @@ -2,6 +2,7 @@ #include "CEventCallback.hpp" #include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" #include "app_resources/common.hlsl" +#include "nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl" using namespace nbl; using namespace core; @@ -508,27 +509,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub bool keepRunning() override { return numSubmits < MaxNumSubmits; } private: - // reflects calculations in workgroup2::ArithmeticConfiguration - uint32_t 
calculateItemsPerWorkgroup(const uint32_t workgroupSize, const uint32_t subgroupSize, const uint32_t itemsPerInvocation) - { - if (workgroupSize <= subgroupSize) - return workgroupSize * itemsPerInvocation; - - const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize); - const uint8_t workgroupSizeLog2 = hlsl::findMSB(workgroupSize); - - const uint16_t levels = (workgroupSizeLog2 == subgroupSizeLog2) ? 1 : - (workgroupSizeLog2 > subgroupSizeLog2 * 2 + 2) ? 3 : 2; - - const uint16_t itemsPerInvocationProductLog2 = max(workgroupSizeLog2 - subgroupSizeLog2 * levels, 0); - uint16_t itemsPerInvocation1 = (levels == 3) ? min(itemsPerInvocationProductLog2, 2) : itemsPerInvocationProductLog2; - itemsPerInvocation1 = uint16_t(1u) << itemsPerInvocation1; - - uint32_t virtualWorkgroupSize = 1u << max(subgroupSizeLog2 * levels, workgroupSizeLog2); - - return itemsPerInvocation * virtualWorkgroupSize; - } - // create pipeline (specialized every test) [TODO: turn into a future/async] smart_refctd_ptr createPipeline(const ICPUShader* overridenUnspecialized, const IGPUPipelineLayout* layout, const uint8_t subgroupSizeLog2) { @@ -577,11 +557,12 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub options.preprocessorOptions.includeFinder = includeFinder; const uint32_t subgroupSize = 0x1u << subgroupSizeLog2; - const uint32_t itemsPerWG = calculateItemsPerWorkgroup(workgroupSize, subgroupSize, itemsPerInvoc); + const uint32_t workgroupSizeLog2 = hlsl::findMSB(workgroupSize); + hlsl::workgroup2::SArithmeticConfiguration wgConfig = hlsl::workgroup2::SArithmeticConfiguration::create(workgroupSizeLog2, subgroupSizeLog2, itemsPerInvoc); + const uint32_t itemsPerWG = wgConfig.VirtualWorkgroupSize * wgConfig.ItemsPerInvocation_0; smart_refctd_ptr overriddenUnspecialized; if constexpr (WorkgroupBench) { - const uint32_t workgroupSizeLog2 = hlsl::findMSB(workgroupSize); const std::string definitions[7] = { "workgroup2::" + arith_name, 
std::to_string(workgroupSizeLog2), From 6c6c6451fd31bbd4debaef22158fd4e0e9d819f2 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 16 Jun 2025 16:04:55 +0700 Subject: [PATCH 239/296] simplified some cpp code, write all benchmark descriptors at beginning --- 23_Arithmetic2UnitTest/main.cpp | 24 ++++--------- 29_Arithmetic2Bench/main.cpp | 62 +++++++++++++++++++-------------- 2 files changed, 43 insertions(+), 43 deletions(-) diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp index 158fc5c4c..35983ef08 100644 --- a/23_Arithmetic2UnitTest/main.cpp +++ b/23_Arithmetic2UnitTest/main.cpp @@ -176,22 +176,12 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations; const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize; const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize; - for (uint32_t useNative = 0; useNative < 2; useNative++) + for (uint32_t useNative = 0; useNative <= uint32_t(m_physicalDevice->getProperties().limits.shaderSubgroupArithmetic); useNative++) { - bool b_useNative = false; - if (!m_physicalDevice->getProperties().limits.shaderSubgroupArithmetic && useNative == 0) - { - m_logger->log("Device property shaderSubgroupArithmetic is false! 
Skipping to emulated arithmetic...", ILogger::ELL_INFO); - continue; - } - if (useNative) m_logger->log("Testing with emulated subgroup arithmetic", ILogger::ELL_INFO); else - { m_logger->log("Testing with native subgroup arithmetic", ILogger::ELL_INFO); - b_useNative = true; - } for (auto subgroupSize = MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u) { @@ -208,21 +198,21 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu uint32_t itemsPerWG = workgroupSize * itemsPerInvocation; m_logger->log("Testing Items per Invocation %u", ILogger::ELL_INFO, itemsPerInvocation); bool passed = true; - passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, b_useNative, itemsPerWG, itemsPerInvocation) && passed; + passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed; logTestOutcome(passed, itemsPerWG); - passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, b_useNative, itemsPerWG, itemsPerInvocation) && passed; + passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed; logTestOutcome(passed, itemsPerWG); - passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, b_useNative, itemsPerWG, itemsPerInvocation) && passed; + passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed; logTestOutcome(passed, itemsPerWG); hlsl::workgroup2::SArithmeticConfiguration wgConfig = hlsl::workgroup2::SArithmeticConfiguration::create(hlsl::findMSB(workgroupSize), subgroupSizeLog2, itemsPerInvocation); itemsPerWG = wgConfig.VirtualWorkgroupSize * wgConfig.ItemsPerInvocation_0; m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG); - passed = runTest(workgroupTestSource, elementCount, 
subgroupSizeLog2, workgroupSize, b_useNative, itemsPerWG, itemsPerInvocation) && passed; + passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed; logTestOutcome(passed, itemsPerWG); - passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, b_useNative, itemsPerWG, itemsPerInvocation) && passed; + passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed; logTestOutcome(passed, itemsPerWG); - passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, b_useNative, itemsPerWG, itemsPerInvocation) && passed; + passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed; logTestOutcome(passed, itemsPerWG); } m_api->endCapture(); diff --git a/29_Arithmetic2Bench/main.cpp b/29_Arithmetic2Bench/main.cpp index 98ff65e05..9e98cfe5b 100644 --- a/29_Arithmetic2Bench/main.cpp +++ b/29_Arithmetic2Bench/main.cpp @@ -292,18 +292,45 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub benchLayout = m_device->createDescriptorSetLayout(binding); } - benchPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, { &benchLayout.get(),1 }); - benchDs = benchPool->createDescriptorSet(smart_refctd_ptr(benchLayout)); + const uint32_t setCount = ISwapchain::MaxImages; + benchPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, { &benchLayout.get(),1 }, &setCount); + for (auto i = 0u; i < ISwapchain::MaxImages; i++) + { + benchDs[i] = benchPool->createDescriptorSet(smart_refctd_ptr(benchLayout)); + if (!benchDs[i]) + return logFail("Could not create Descriptor Set!"); + } SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, .offset = 0,.size = 
sizeof(PushConstantData) }; benchPplnLayout = m_device->createPipelineLayout({ &pcRange, 1 }, std::move(benchLayout)); } if (UseNativeArithmetic && !m_physicalDevice->getProperties().limits.shaderSubgroupArithmetic) { - m_logger->log("UseNativeArithmetic is true but device does not support shaderSubgroupArithmetic!", ILogger::ELL_ERROR); - exit(-1); + logFail("UseNativeArithmetic is true but device does not support shaderSubgroupArithmetic!"); + return false; + } + + IGPUDescriptorSet::SWriteDescriptorSet dsWrites[ISwapchain::MaxImages]; + for (auto i = 0u; i < ISwapchain::MaxImages; i++) + { + if (swapchainImageViews[i].get() == nullptr) + continue; + + video::IGPUDescriptorSet::SDescriptorInfo dsInfo; + dsInfo.info.image.imageLayout = IImage::LAYOUT::GENERAL; + dsInfo.desc = swapchainImageViews[i]; + + dsWrites[i] = + { + .dstSet = benchDs[i].get(), + .binding = 2u, + .arrayElement = 0u, + .count = 1u, + .info = &dsInfo, + }; + m_device->updateDescriptorSets(1u, &dsWrites[i], 0u, nullptr); } - + // load shader source from file auto getShaderSource = [&](const char* filePath) -> auto @@ -396,31 +423,14 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers }); } - video::IGPUDescriptorSet::SDescriptorInfo dsInfo; - dsInfo.info.image.imageLayout = IImage::LAYOUT::GENERAL; - dsInfo.desc = swapchainImageViews[m_currentImageAcquire.imageIndex]; - - IGPUDescriptorSet::SWriteDescriptorSet dsWrites[1u] = - { - { - .dstSet = benchDs.get(), - .binding = 2u, - .arrayElement = 0u, - .count = 1u, - .info = &dsInfo, - } - }; - m_device->updateDescriptorSets(1u, dsWrites, 0u, nullptr); - - const uint32_t elementCount = 1024*1024; const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize; const auto SubgroupSizeLog2 = hlsl::findMSB(MaxSubgroupSize); - cmdbuf->bindDescriptorSets(EPBP_COMPUTE, benchSets[0].pipeline->getLayout(), 0u, 1u, 
&benchDs.get()); + cmdbuf->bindDescriptorSets(EPBP_COMPUTE, benchSets[0].pipeline->getLayout(), 0u, 1u, &benchDs[m_currentImageAcquire.imageIndex].get()); cmdbuf->pushConstants(benchSets[0].pipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(PushConstantData), &pc); for (uint32_t i = 0; i < benchSets.size(); i++) - runBenchmark(cmdbuf, benchSets[i], elementCount, SubgroupSizeLog2); + runBenchmark(cmdbuf, benchSets[i], ElementCount, SubgroupSizeLog2); // barrier transition to PRESENT { @@ -688,13 +698,13 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub uint32_t ItemsPerInvocation = 4u; constexpr static inline uint32_t NumLoops = 1000u; constexpr static inline uint32_t NumBenchmarks = 6u; - std::array workgroupSizes = { 32, 64, 128, 256, 512, 1024 }; + std::array workgroupSizes = { 32, 64, 128, 256, 512, 1024 }; std::array arithmeticOperations = { "reduction", "inclusive_scan", "exclusive_scan" }; std::array benchSets; smart_refctd_ptr benchPool; - smart_refctd_ptr benchDs; + std::array, ISwapchain::MaxImages> benchDs; constexpr static inline uint32_t OutputBufferCount = 2u; smart_refctd_ptr outputBuffers[OutputBufferCount]; From 638846ead247d596a7bbf75fe014e0a38001671d Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 16 Jun 2025 16:09:41 +0700 Subject: [PATCH 240/296] Fix ray tracing pipeline demo alpha --- 71_RayTracingPipeline/main.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index 18f15a488..5ee6789ae 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -1131,6 +1131,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, .diffuse = {0.2, 0.2, 0.8}, .specular = {0.8, 0.8, 0.8}, .shininess = 1.0f, + .alpha = 1.0f, }, .transform = getTranslationMatrix(-5.0f, 1.0f, 0), }, From 6c251d10bb54af6cafa09c53ba9c95fa61ae0115 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 16 Jun 2025 
16:12:05 +0700 Subject: [PATCH 241/296] Remove test code in ray query geometry shaders --- .../app_resources/render.comp.hlsl | 135 ------------------ 1 file changed, 135 deletions(-) diff --git a/67_RayQueryGeometry/app_resources/render.comp.hlsl b/67_RayQueryGeometry/app_resources/render.comp.hlsl index 7e8a7dad6..657d0bbf0 100644 --- a/67_RayQueryGeometry/app_resources/render.comp.hlsl +++ b/67_RayQueryGeometry/app_resources/render.comp.hlsl @@ -16,16 +16,6 @@ using namespace nbl::hlsl; [[vk::binding(1, 0)]] RWTexture2D outImage; [[vk::constant_id(0)]] const float shader_variant = 1.0; -struct SGeomInfo2 -{ - uint64_t vertexBufferAddress; - uint64_t indexBufferAddress; - - uint32_t vertexStride : 29; - uint32_t indexType : 2; // 16 bit, 32 bit or none - uint32_t smoothNormals : 1; // flat for cube, rectangle, disk -}; - float3 unpackNormals3x10(uint32_t v) { // host side changes float32_t3 to EF_A2B10G10R10_SNORM_PACK32 @@ -35,77 +25,6 @@ float3 unpackNormals3x10(uint32_t v) return clamp(float3(pn) / 511.0, -1.0, 1.0); } -float3 calculateSmoothNormals2(int instID, int primID, SGeomInfo2 geom, float2 bary) -{ - const uint indexType = geom.indexType; - const uint vertexStride = geom.vertexStride; - - const uint64_t vertexBufferAddress = geom.vertexBufferAddress; - const uint64_t indexBufferAddress = geom.indexBufferAddress; - - uint32_t3 indices; - switch (indexType) - { - case 0: // EIT_16BIT - indices = uint32_t3((nbl::hlsl::bda::__ptr::create(indexBufferAddress)+primID).deref().load()); - break; - case 1: // EIT_32BIT - indices = uint32_t3((nbl::hlsl::bda::__ptr::create(indexBufferAddress)+primID).deref().load()); - break; - default: // EIT_NONE - { - indices[0] = primID * 3; - indices[1] = indices[0] + 1; - indices[2] = indices[0] + 2; - } - } - - float3 n0, n1, n2; - switch (instID) - { - case OT_CUBE: - { - // TODO: document why the alignment is 2 here and nowhere else? isnt the `vertexStride` aligned to more than 2 anyway? 
- uint32_t v0 = vk::RawBufferLoad(vertexBufferAddress + indices[0] * vertexStride, 2u); - uint32_t v1 = vk::RawBufferLoad(vertexBufferAddress + indices[1] * vertexStride, 2u); - uint32_t v2 = vk::RawBufferLoad(vertexBufferAddress + indices[2] * vertexStride, 2u); - - n0 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v0).xyz); - n1 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v1).xyz); - n2 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v2).xyz); - } - break; - case OT_SPHERE: - case OT_CYLINDER: - case OT_ARROW: - case OT_CONE: - { - uint32_t v0 = vk::RawBufferLoad(vertexBufferAddress + indices[0] * vertexStride); - uint32_t v1 = vk::RawBufferLoad(vertexBufferAddress + indices[1] * vertexStride); - uint32_t v2 = vk::RawBufferLoad(vertexBufferAddress + indices[2] * vertexStride); - - n0 = normalize(unpackNormals3x10(v0)); - n1 = normalize(unpackNormals3x10(v1)); - n2 = normalize(unpackNormals3x10(v2)); - } - break; - case OT_RECTANGLE: - case OT_DISK: - case OT_ICOSPHERE: - default: - { - n0 = normalize(vk::RawBufferLoad(vertexBufferAddress + indices[0] * vertexStride)); - n1 = normalize(vk::RawBufferLoad(vertexBufferAddress + indices[1] * vertexStride)); - n2 = normalize(vk::RawBufferLoad(vertexBufferAddress + indices[2] * vertexStride)); - } - } - - float3 barycentrics = float3(0.0, bary); - barycentrics.x = 1.0 - barycentrics.y - barycentrics.z; - - return barycentrics.x * n0 + barycentrics.y * n1 + barycentrics.z * n2; -} - float3 calculateSmoothNormals(int instID, int primID, SGeomInfo geom, float2 bary) { const uint indexType = geom.indexType; @@ -221,57 +140,3 @@ void main(uint32_t3 threadID : SV_DispatchThreadID) outImage[threadID.xy] = color; } - -[numthreads(WorkgroupSize, WorkgroupSize, 1)] -[shader("compute")] -void main2(uint32_t3 threadID : SV_DispatchThreadID) -{ - uint2 coords = threadID.xy; - coords.y = nbl::hlsl::glsl::gl_NumWorkGroups().y * WorkgroupSize - coords.y; // need to invert it - - - float4 NDC; - NDC.xy = float2(coords) * pc.scaleNDC; - 
NDC.xy += pc.offsetNDC; - NDC.zw = float2(0, 1.0); - float3 targetPos; - { - float4 tmp = mul(pc.invMVP, NDC); - targetPos = tmp.xyz / tmp.w; - } - - float3 direction = normalize(targetPos - pc.camPos); - - spirv::RayQueryKHR query; - spirv::rayQueryInitializeKHR(query, topLevelAS, spv::RayFlagsOpaqueKHRMask, 0xFF, pc.camPos, 0.01, direction, 1000.0); - - while (spirv::rayQueryProceedKHR(query)) {} - - float4 color = float4(0, 0, 0, 1); - - if (spirv::rayQueryGetIntersectionTypeKHR(query, true) == spv::RayQueryCommittedIntersectionTypeRayQueryCommittedIntersectionTriangleKHR) - { - const int instID = spirv::rayQueryGetIntersectionInstanceIdKHR(query, true); - const int primID = spirv::rayQueryGetIntersectionPrimitiveIndexKHR(query, true); - - // TODO: candidate for `bda::__ptr` - const SGeomInfo2 geom = vk::RawBufferLoad(pc.geometryInfoBuffer + instID * sizeof(SGeomInfo2)); - - float3 normals; - if (geom.smoothNormals) - { - float2 barycentrics = spirv::rayQueryGetIntersectionBarycentricsKHR(query, true); - normals = calculateSmoothNormals2(instID, primID, geom, barycentrics); - } - else - { - float3 pos[3] = spirv::rayQueryGetIntersectionTriangleVertexPositionsKHR(query, true); - normals = cross(pos[1] - pos[0], pos[2] - pos[0]); - } - - normals = normalize(normals) * 0.5 + 0.5; - color = float4(normals, shader_variant); - } - - outImage[threadID.xy] = color; -} From 2076b666c2fb8a86390b0e49e16290b6c1ed7483 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 16 Jun 2025 17:15:30 +0700 Subject: [PATCH 242/296] simplified data accessor template + usage --- .../app_resources/testSubgroup.comp.hlsl | 5 +-- .../app_resources/testWorkgroup.comp.hlsl | 10 ++--- 23_Arithmetic2UnitTest/main.cpp | 9 ++-- .../app_resources/benchmarkSubgroup.comp.hlsl | 2 +- .../benchmarkWorkgroup.comp.hlsl | 33 ++++++++------- 29_Arithmetic2Bench/app_resources/common.hlsl | 1 - 29_Arithmetic2Bench/main.cpp | 2 +- common/include/WorkgroupDataAccessors.hlsl | 41 ++++++++++--------- 8 files 
changed, 53 insertions(+), 50 deletions(-) diff --git a/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl index 6cd496648..e079e5e63 100644 --- a/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl @@ -24,13 +24,12 @@ static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) const uint64_t outputBufAddr = pc.pOutputBuf[Binop::BindingIndex]; - if (glsl::gl_SubgroupSize()!=1u<(outputBufAddr, glsl::gl_SubgroupSize()); + assert(glsl::gl_SubgroupSize() == 1u< func; type_t val = func(sourceVal); - vk::RawBufferStore(outputBufAddr + sizeof(uint32_t) + sizeof(type_t) * globalIndex(), val, sizeof(uint32_t)); + vk::RawBufferStore(outputBufAddr + sizeof(type_t) * globalIndex(), val, sizeof(uint32_t)); } type_t test() diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl index 97ff31481..4b30526a6 100644 --- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl @@ -30,17 +30,18 @@ struct operation_t // workgroup scans do no return anything, but use the data accessor to do the storing directly void operator()() { - PreloadedDataProxy dataAccessor = PreloadedDataProxy::create(); + using data_proxy_t = PreloadedDataProxy; + data_proxy_t dataAccessor = data_proxy_t::create(pc.pInputBuf, pc.pOutputBuf[Binop::BindingIndex]); dataAccessor.preload(); #if IS_REDUCTION otype_t value = #endif - OPERATION::template __call, ScratchProxy>(dataAccessor,arithmeticAccessor); + OPERATION::template __call(dataAccessor,arithmeticAccessor); // we barrier before because we alias the accessors for Binop arithmeticAccessor.workgroupExecutionAndMemoryBarrier(); #if IS_REDUCTION [unroll] - for (uint32_t i = 0; i < PreloadedDataProxy::PreloadedDataCount; i++) + for (uint32_t i = 0; i < 
data_proxy_t::PreloadedDataCount; i++) dataAccessor.preloaded[i] = value; #endif dataAccessor.unload(); @@ -51,8 +52,7 @@ struct operation_t template static void subtest() { - if (glsl::gl_SubgroupSize()!=1u<(pc.pOutputBuf[Binop::BindingIndex], glsl::gl_SubgroupSize()); + assert(glsl::gl_SubgroupSize() == 1u< func; func(); diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp index 35983ef08..6c979d7e5 100644 --- a/23_Arithmetic2UnitTest/main.cpp +++ b/23_Arithmetic2UnitTest/main.cpp @@ -89,7 +89,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu for (auto i=0u; igetSize(); + params.size = gpuinputDataBuffer->getSize(); params.usage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_SRC_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; outputBuffers[i] = m_device->createBuffer(std::move(params)); @@ -179,9 +179,9 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu for (uint32_t useNative = 0; useNative <= uint32_t(m_physicalDevice->getProperties().limits.shaderSubgroupArithmetic); useNative++) { if (useNative) - m_logger->log("Testing with emulated subgroup arithmetic", ILogger::ELL_INFO); - else m_logger->log("Testing with native subgroup arithmetic", ILogger::ELL_INFO); + else + m_logger->log("Testing with emulated subgroup arithmetic", ILogger::ELL_INFO); for (auto subgroupSize = MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u) { @@ -417,9 +417,8 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu m_utils->downloadBufferRangeViaStagingBufferAutoSubmit(SIntendedSubmitInfo{.queue=transferDownQueue},bufferRange,resultsBuffer->getPointer()); using type_t = typename Binop::type_t; - const auto dataFromBuffer = reinterpret_cast(resultsBuffer->getPointer()); + const auto testData = reinterpret_cast(resultsBuffer->getPointer()); - const auto testData = reinterpret_cast(dataFromBuffer + 1); // TODO: parallel 
for (the temporary values need to be threadlocal or what?) // now check if the data obtained has valid values type_t* tmp = new type_t[itemsPerWG]; diff --git a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl index 2da7de38f..9141ade55 100644 --- a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl +++ b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl @@ -31,7 +31,7 @@ static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal) for (uint32_t i = 0; i < NUM_LOOPS; i++) value = func(value); - vk::RawBufferStore(outputBufAddr + sizeof(uint32_t) + sizeof(type_t) * globalIndex(), value, sizeof(uint32_t)); + vk::RawBufferStore(outputBufAddr + sizeof(type_t) * globalIndex(), value, sizeof(uint32_t)); } void benchmark() diff --git a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl index ad861a30d..561aadc56 100644 --- a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl +++ b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl @@ -19,29 +19,30 @@ groupshared uint32_t scratch[mpl::max_v +template struct RandomizedInputDataProxy { - using dtype_t = vector; + using dtype_t = vector; - NBL_CONSTEXPR_STATIC_INLINE uint32_t PreloadedDataCount = Config::VirtualWorkgroupSize / Config::WorkgroupSize; + NBL_CONSTEXPR_STATIC_INLINE uint16_t PreloadedDataCount = _PreloadedDataCount; + NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(1u) << WorkgroupSizeLog2; - static RandomizedInputDataProxy create() + static RandomizedInputDataProxy create(uint64_t inputBuf, uint64_t outputBuf) { - RandomizedInputDataProxy retval; - retval.data = DataProxy::create(); + RandomizedInputDataProxy retval; + retval.data = DataProxy::create(inputBuf, outputBuf); return retval; } template void get(const IndexType ix, NBL_REF_ARG(AccessType) value) { - value = 
preloaded[ix>>Config::WorkgroupSizeLog2]; + value = preloaded[ix>>WorkgroupSizeLog2]; } template void set(const IndexType ix, const AccessType value) { - preloaded[ix>>Config::WorkgroupSizeLog2] = value; + preloaded[ix>>WorkgroupSizeLog2] = value; } void preload() @@ -51,7 +52,7 @@ struct RandomizedInputDataProxy [unroll] for (uint16_t idx = 0; idx < PreloadedDataCount; idx++) [unroll] - for (uint16_t i = 0; i < Config::ItemsPerInvocation_0; i++) + for (uint16_t i = 0; i < ItemsPerInvocation; i++) preloaded[idx][i] = xoroshiro(); } void unload() @@ -59,7 +60,7 @@ struct RandomizedInputDataProxy const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); [unroll] for (uint16_t idx = 0; idx < PreloadedDataCount; idx++) - data.template set(idx * Config::WorkgroupSize + invocationIndex, preloaded[idx]); + data.template set(idx * WorkgroupSize + invocationIndex, preloaded[idx]); } void workgroupExecutionAndMemoryBarrier() @@ -68,29 +69,31 @@ struct RandomizedInputDataProxy //glsl::memoryBarrierShared(); implied by the above } - DataProxy data; + DataProxy data; dtype_t preloaded[PreloadedDataCount]; }; static ScratchProxy arithmeticAccessor; +using data_proxy_t = RandomizedInputDataProxy; + template struct operation_t { using binop_base_t = typename Binop::base_t; using otype_t = typename Binop::type_t; - void operator()(RandomizedInputDataProxy dataAccessor) + void operator()(data_proxy_t dataAccessor) { #if IS_REDUCTION otype_t value = #endif - OPERATION::template __call, ScratchProxy>(dataAccessor,arithmeticAccessor); + OPERATION::template __call(dataAccessor,arithmeticAccessor); // we barrier before because we alias the accessors for Binop arithmeticAccessor.workgroupExecutionAndMemoryBarrier(); #if IS_REDUCTION [unroll] - for (uint32_t i = 0; i < RandomizedInputDataProxy::PreloadedDataCount; i++) + for (uint32_t i = 0; i < data_proxy_t::PreloadedDataCount; i++) dataAccessor.preloaded[i] = value; #endif } @@ -99,7 +102,7 @@ struct operation_t template 
static void subbench() { - RandomizedInputDataProxy dataAccessor = RandomizedInputDataProxy::create(); + data_proxy_t dataAccessor = data_proxy_t::create(0, pc.pOutputBuf[Binop::BindingIndex]); dataAccessor.preload(); operation_t func; diff --git a/29_Arithmetic2Bench/app_resources/common.hlsl b/29_Arithmetic2Bench/app_resources/common.hlsl index 0cdcd7dad..cca5af987 100644 --- a/29_Arithmetic2Bench/app_resources/common.hlsl +++ b/29_Arithmetic2Bench/app_resources/common.hlsl @@ -3,7 +3,6 @@ struct PushConstantData { - uint64_t pInputBuf; uint64_t pOutputBuf[2]; }; diff --git a/29_Arithmetic2Bench/main.cpp b/29_Arithmetic2Bench/main.cpp index 9e98cfe5b..945749320 100644 --- a/29_Arithmetic2Bench/main.cpp +++ b/29_Arithmetic2Bench/main.cpp @@ -698,7 +698,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub uint32_t ItemsPerInvocation = 4u; constexpr static inline uint32_t NumLoops = 1000u; constexpr static inline uint32_t NumBenchmarks = 6u; - std::array workgroupSizes = { 32, 64, 128, 256, 512, 1024 }; + std::array workgroupSizes = { 32, 64, 128, 256, 512, 1024 }; std::array arithmeticOperations = { "reduction", "inclusive_scan", "exclusive_scan" }; diff --git a/common/include/WorkgroupDataAccessors.hlsl b/common/include/WorkgroupDataAccessors.hlsl index 6beadfbc9..e1774fad6 100644 --- a/common/include/WorkgroupDataAccessors.hlsl +++ b/common/include/WorkgroupDataAccessors.hlsl @@ -31,28 +31,29 @@ struct ScratchProxy } }; -template +template struct DataProxy { - using dtype_t = vector; + using dtype_t = vector; - static DataProxy create() + static DataProxy create(uint64_t inputBuf, uint64_t outputBuf) { - DataProxy retval; - retval.workgroupOffset = glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize; - retval.outputBufAddr = sizeof(uint32_t) + pc.pOutputBuf[Binop::BindingIndex]; + DataProxy retval; + retval.workgroupOffset = glsl::gl_WorkGroupID().x * WorkgroupSize; + retval.inputBufAddr = inputBuf; + retval.outputBufAddr = 
outputBuf; return retval; } template void get(const IndexType ix, NBL_REF_ARG(AccessType) value) { - value = vk::RawBufferLoad(pc.pInputBuf + (workgroupOffset + ix) * sizeof(AccessType)); + value = vk::RawBufferLoad(inputBufAddr + (workgroupOffset + ix) * sizeof(AccessType)); } template void set(const IndexType ix, const AccessType value) { - vk::RawBufferStore(outputBufAddr + sizeof(AccessType) * (workgroupOffset+ix), value, sizeof(uint32_t)); + vk::RawBufferStore(outputBufAddr + (workgroupOffset + ix) * sizeof(AccessType), value, sizeof(uint32_t)); } void workgroupExecutionAndMemoryBarrier() @@ -62,32 +63,34 @@ struct DataProxy } uint32_t workgroupOffset; + uint64_t inputBufAddr; uint64_t outputBufAddr; }; -template +template struct PreloadedDataProxy { - using dtype_t = vector; + using dtype_t = vector; - NBL_CONSTEXPR_STATIC_INLINE uint32_t PreloadedDataCount = Config::VirtualWorkgroupSize / Config::WorkgroupSize; + NBL_CONSTEXPR_STATIC_INLINE uint16_t PreloadedDataCount = _PreloadedDataCount; + NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(1u) << WorkgroupSizeLog2; - static PreloadedDataProxy create() + static PreloadedDataProxy create(uint64_t inputBuf, uint64_t outputBuf) { - PreloadedDataProxy retval; - retval.data = DataProxy::create(); + PreloadedDataProxy retval; + retval.data = DataProxy::create(inputBuf, outputBuf); return retval; } template void get(const IndexType ix, NBL_REF_ARG(AccessType) value) { - value = preloaded[ix>>Config::WorkgroupSizeLog2]; + value = preloaded[ix>>WorkgroupSizeLog2]; } template void set(const IndexType ix, const AccessType value) { - preloaded[ix>>Config::WorkgroupSizeLog2] = value; + preloaded[ix>>WorkgroupSizeLog2] = value; } void preload() @@ -95,14 +98,14 @@ struct PreloadedDataProxy const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); [unroll] for (uint16_t idx = 0; idx < PreloadedDataCount; idx++) - data.template get(idx * Config::WorkgroupSize + invocationIndex, preloaded[idx]); + 
data.template get(idx * WorkgroupSize + invocationIndex, preloaded[idx]); } void unload() { const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); [unroll] for (uint16_t idx = 0; idx < PreloadedDataCount; idx++) - data.template set(idx * Config::WorkgroupSize + invocationIndex, preloaded[idx]); + data.template set(idx * WorkgroupSize + invocationIndex, preloaded[idx]); } void workgroupExecutionAndMemoryBarrier() @@ -111,7 +114,7 @@ struct PreloadedDataProxy //glsl::memoryBarrierShared(); implied by the above } - DataProxy data; + DataProxy data; dtype_t preloaded[PreloadedDataCount]; }; From d1a8113db65c3cbd2be2d7ccf804c054f4aff1e2 Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 16 Jun 2025 17:35:42 +0200 Subject: [PATCH 243/296] prep the push constants a little and move onto scene conversion --- 09_GeometryCreator/main.cpp | 9 +- .../examples/common/SBasicViewParameters.hlsl | 28 +- .../geometry/CGeometryCreatorScene.hpp | 411 ++++++------------ .../nbl/examples/geometry/SPushConstants.hlsl | 33 ++ 4 files changed, 185 insertions(+), 296 deletions(-) create mode 100644 common/include/nbl/examples/geometry/SPushConstants.hlsl diff --git a/09_GeometryCreator/main.cpp b/09_GeometryCreator/main.cpp index 2a3a1553e..f246b5c79 100644 --- a/09_GeometryCreator/main.cpp +++ b/09_GeometryCreator/main.cpp @@ -37,6 +37,12 @@ class GeometryCreatorApp final : public examples::MonoWindowApplication return logFail("Couldn't create Command Buffer!"); } +// auto scRes = static_cast(m_surface->getSwapchainResources()); +// .renderpass = core::smart_refctd_ptr(scRes->getRenderpass()) + auto scene = CGeometryCreatorScene::create({ + .utilities = m_utils, + .logger = m_logger + }); #if 0 //using Builder = typename CScene::CreateResourcesDirectlyWithDevice::Builder; using Builder = typename CScene::CreateResourcesWithAssetConverter::Builder; @@ -92,9 +98,6 @@ class GeometryCreatorApp final : public examples::MonoWindowApplication const auto viewMatrix = 
camera.getViewMatrix(); const auto viewProjectionMatrix = camera.getConcatenatedMatrix(); - core::matrix3x4SIMD modelMatrix; - modelMatrix.setTranslation(nbl::core::vectorSIMDf(0, 0, 0, 0)); - modelMatrix.setRotation(quaternion(0, 0, 0)); core::matrix3x4SIMD modelViewMatrix = core::concatenateBFollowedByA(viewMatrix, modelMatrix); core::matrix4SIMD modelViewProjectionMatrix = core::concatenateBFollowedByA(viewProjectionMatrix, modelMatrix); diff --git a/common/include/nbl/examples/common/SBasicViewParameters.hlsl b/common/include/nbl/examples/common/SBasicViewParameters.hlsl index 0d0990186..b7ad31cb6 100644 --- a/common/include/nbl/examples/common/SBasicViewParameters.hlsl +++ b/common/include/nbl/examples/common/SBasicViewParameters.hlsl @@ -1,15 +1,27 @@ -#ifndef _S_BASIC_VIEW_PARAMETERS_COMMON_HLSL_ -#define _S_BASIC_VIEW_PARAMETERS_COMMON_HLSL_ +#ifndef _NBL_EXAMPLES_S_BASIC_VIEW_PARAMETERS_HLSL_ +#define _NBL_EXAMPLES_S_BASIC_VIEW_PARAMETERS_HLSL_ -#ifdef __HLSL_VERSION -struct SBasicViewParameters //! matches CPU version size & alignment (160, 4) + +#include "nbl/builtin/hlsl/cpp_compat/matrix.hlsl" + + +namespace nbl +{ +namespace hlsl +{ +namespace examples +{ + +struct SBasicViewParameters { - float4x4 MVP; - float3x4 MV; - float3x3 normalMat; + float32_t4x4 MVP; + float32_t3x4 MV; + float32_t3x3 normalMat; }; -#endif // _S_BASIC_VIEW_PARAMETERS_COMMON_HLSL_ +} +} +} #endif /* diff --git a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp index e68441ffe..e39e536b0 100644 --- a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp +++ b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp @@ -3,11 +3,10 @@ #include - +#include "nbl/builtin/hlsl/math/linalg/fast_affine.hlsl" #include "nbl/asset/utils/CGeometryCreator.h" -// soon to be deprecated! 
-#include "nbl/examples/common/SBasicViewParameters.hlsl" +#include "nbl/examples/geometry/SPushConstants.hlsl" // TODO: Arek bring back //#include "nbl/examples/geometry/spirv/builtin/CArchive.h" @@ -17,9 +16,11 @@ namespace nbl::examples { -class CGeometryCreatorScene +class CGeometryCreatorScene : public core::IReferenceCounted { public: + using SPushConstants = hlsl::geometry_creator_scene::SPushConstants; + // enum ObjectType : uint8_t { OT_CUBE, @@ -32,144 +33,145 @@ class CGeometryCreatorScene OT_ICOSPHERE, OT_COUNT, - OT_UNKNOWN = std::numeric_limits::max() + OT_UNKNOWN = OT_COUNT }; -}; -#if 0 -struct ObjectMeta -{ - ObjectType type = OT_UNKNOWN; - std::string_view name = "Unknown"; -}; +#define EXPOSE_NABLA_NAMESPACES using namespace nbl::core; \ +using namespace nbl::system; \ +using namespace nbl::asset; \ +using namespace nbl::video -constexpr static inline struct ClearValues -{ - nbl::video::IGPUCommandBuffer::SClearColorValue color = { .float32 = {0.f,0.f,0.f,1.f} }; - nbl::video::IGPUCommandBuffer::SClearDepthStencilValue depth = { .depth = 0.f }; -} clear; - -#define TYPES_IMPL_BOILERPLATE(WithConverter) struct Types \ -{ \ - using descriptor_set_layout_t = std::conditional_t; \ - using pipeline_layout_t = std::conditional_t; \ - using renderpass_t = std::conditional_t; \ - using image_view_t = std::conditional_t; \ - using image_t = std::conditional_t; \ - using buffer_t = std::conditional_t; \ - using shader_t = std::conditional_t; \ - using graphics_pipeline_t = std::conditional_t; \ - using descriptor_set = std::conditional_t; \ -} - -template -struct ResourcesBundleBase -{ - TYPES_IMPL_BOILERPLATE(withAssetConverter); - - struct ReferenceObject - { - struct Bindings + // + struct SCreateParams { - nbl::asset::SBufferBinding vertex, index; + core::smart_refctd_ptr utilities; + core::smart_refctd_ptr logger; }; + static inline core::smart_refctd_ptr create(SCreateParams&& params) + { + EXPOSE_NABLA_NAMESPACES; + auto* logger = 
params.logger.get(); + assert(logger); + if (!params.utilities) + { + logger->log("Pass a non-null `IUtilities`!",ILogger::ELL_ERROR); + return nullptr; + } + auto device = params.utilities->getLogicalDevice(); - nbl::core::smart_refctd_ptr pipeline = nullptr; - - Bindings bindings; - nbl::asset::E_INDEX_TYPE indexType = nbl::asset::E_INDEX_TYPE::EIT_UNKNOWN; - uint32_t indexCount = {}; - }; - - using ReferenceDrawHook = std::pair; - - nbl::core::smart_refctd_ptr renderpass; - std::array objects; - nbl::asset::SBufferBinding ubo; - - struct - { - nbl::core::smart_refctd_ptr color, depth; - } attachments; - - nbl::core::smart_refctd_ptr descriptorSet; -}; - -struct ResourcesBundle : public ResourcesBundleBase -{ - using base_t = ResourcesBundleBase; -}; + constexpr auto DescriptorCount = 255; + smart_refctd_ptr cpuDS; + { + // create Descriptor Set Layout + smart_refctd_ptr dsLayout; + { + const ICPUDescriptorSetLayout::SBinding bindings[] = + { + { + .binding = 0, + .type = IDescriptor::E_TYPE::ET_UNIFORM_TEXEL_BUFFER, + // some geometries may not have particular attributes + .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_PARTIALLY_BOUND_BIT, + .stageFlags = IShader::E_SHADER_STAGE::ESS_VERTEX|IShader::E_SHADER_STAGE::ESS_FRAGMENT, + .count = DescriptorCount + } + }; + dsLayout = core::make_smart_refctd_ptr(bindings); + if (!dsLayout) + { + logger->log("Could not create descriptor set layout!", ILogger::ELL_ERROR); + return nullptr; + } + } -#define EXPOSE_NABLA_NAMESPACES() using namespace nbl; \ -using namespace core; \ -using namespace asset; \ -using namespace video; \ -using namespace scene; \ -using namespace system + // create Descriptor Set + cpuDS = core::make_smart_refctd_ptr(std::move(dsLayout)); + if (!cpuDS) + { + logger->log("Could not descriptor set!", ILogger::ELL_ERROR); + return nullptr; + } + } -template -class ResourceBuilder -{ -public: - TYPES_IMPL_BOILERPLATE(withAssetConverter); + SInitParams init; + // create out 
geometries + { + auto* const outDescs = cpuDS->getDescriptorInfoStorage(IDescriptor::E_TYPE::ET_UNIFORM_TEXEL_BUFFER).data(); + uint8_t nextDesc = 0; + auto allocateUTB = [DescriptorCount,outDescs,&nextDesc](const IGeometry::SDataView& view)->uint8_t + { + if (!view) + return DescriptorCount; + outDescs[nextDesc].desc = core::make_smart_refctd_ptr(view.src,view.composed.format); + return nextDesc++; + }; - using this_t = ResourceBuilder; + auto addGeometry = [&allocateUTB,&init](const ICPUPolygonGeometry* geom)->void + { + auto& out = init.geoms.emplace_back(); + out.elementCount = geom->getPrimitiveCount()*geom->getIndexingCallback()->degree(); + out.positionView = allocateUTB(geom->getPositionView()); + out.normalView = allocateUTB(geom->getNormalView()); + // the first view is usually the UV + if (const auto& auxViews = geom->getAuxAttributeViews(); !auxViews.empty()) + out.uvView = allocateUTB(auxViews.front()); + }; - ResourceBuilder(nbl::video::IUtilities* const _utilities, nbl::video::IGPUCommandBuffer* const _commandBuffer, nbl::system::ILogger* const _logger, const nbl::asset::IGeometryCreator* const _geometryCreator) - : utilities(_utilities), commandBuffer(_commandBuffer), logger(_logger), geometries(_geometryCreator) - { - assert(utilities); - assert(logger); - } + auto creator = core::make_smart_refctd_ptr(); + addGeometry(creator->createCube().get()); + } - /* - if (withAssetConverter) then - -> .build cpu objects - else - -> .build gpu objects & record any resource update upload transfers into command buffer - */ + // convert the geometries + { + init.ds = nullptr; + } - inline bool build() - { - EXPOSE_NABLA_NAMESPACES(); + return smart_refctd_ptr(new CGeometryCreatorScene(std::move(init)),dont_grab); + } - if constexpr (!withAssetConverter) + // + struct SPackedGeometry { - commandBuffer->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); - commandBuffer->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - 
commandBuffer->beginDebugMarker("Resources builder's buffers upload [manual]"); - } + inline SPushConstants convert(const hlsl::float32_t3x4& model, const hlsl::float32_t3x4& view, const hlsl::float32_t4x4& viewProj) + { + return { + .basic = { + .MVP = hlsl::math::linalg::promoted_mul(viewProj,model), + .MV = hlsl::math::linalg::promoted_mul(view,model), + .normalMat = hlsl::inverse(hlsl::transpose(hlsl::float32_t3x3(view))) + }, + .positionView = positionView, + .normalView = normalView, + .uvView = uvView + }; + } - using functor_t = std::function; - - auto work = std::to_array - ({ - functor_t(std::bind(&this_t::createDescriptorSetLayout, this)), - functor_t(std::bind(&this_t::createPipelineLayout, this)), - functor_t(std::bind(&this_t::createRenderpass, this)), - functor_t(std::bind(&this_t::createFramebufferAttachments, this)), - functor_t(std::bind(&this_t::createShaders, this)), - functor_t(std::bind(&this_t::createGeometries, this)), - functor_t(std::bind(&this_t::createViewParametersUboBuffer, this)), - functor_t(std::bind(&this_t::createDescriptorSet, this)) - }); - - for (auto& task : work) - if (!task()) - return false; + core::smart_refctd_ptr indexBuffer = nullptr; + uint32_t elementCount = 0; + // indices into the descriptor set + uint8_t positionView = 0; + uint8_t normalView = 0; + uint8_t uvView = 0; + uint8_t indexType = EIT_UNKNOWN; + ObjectType type : 6 = ObjectType::OT_UNKNOWN; + }; + std::span getGeometries() const {return m_params.geoms;} - if constexpr (!withAssetConverter) - commandBuffer->end(); + protected: + struct SInitParams + { + core::smart_refctd_ptr ds; + core::vector geoms; + } m_params; + inline CGeometryCreatorScene(SInitParams&& _params) : m_params(std::move(_params)) {} - return true; - } +#undef EXPOSE_NABLA_NAMESPACES +}; - /* - if (withAssetConverter) then - -> .convert cpu objects to gpu & update gpu buffers - else - -> update gpu buffers - */ +#if 0 +class ResourceBuilder +{ +public: inline bool 
finalize(ResourcesBundle& output, nbl::video::CThreadSafeQueueAdapter* transferCapableQueue) { @@ -181,7 +183,6 @@ class ResourceBuilder commandBuffers.front().cmdbuf = commandBuffer; } - if constexpr (withAssetConverter) { // note that asset converter records basic transfer uploads itself, we only begin the recording with ONE_TIME_SUBMIT_BIT commandBuffer->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); @@ -401,43 +402,6 @@ class ResourceBuilder } } } - else - { - auto completed = utilities->getLogicalDevice()->createSemaphore(0u); - - std::array signals; - { - auto& signal = signals.front(); - signal.value = 1; - signal.stageMask = bitflag(PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS); - signal.semaphore = completed.get(); - } - - const IQueue::SSubmitInfo infos [] = - { - { - .waitSemaphores = {}, - .commandBuffers = commandBuffers, // note that here our command buffer is already recorded! - .signalSemaphores = signals - } - }; - - if (transferCapableQueue->submit(infos) != IQueue::RESULT::SUCCESS) - { - logger->log("Failed to submit transfer upload operations!", ILogger::ELL_ERROR); - return false; - } - - const ISemaphore::SWaitInfo info [] = - { { - .semaphore = completed.get(), - .value = 1 - } }; - - utilities->getLogicalDevice()->blockForSemaphores(info); - - static_cast(output) = static_cast(scratch); // scratch has all ready to use allocated gpu resources with uploaded memory so now just assign resources to base output - } // write the descriptor set { @@ -468,86 +432,7 @@ class ResourceBuilder } private: - bool createDescriptorSetLayout() - { - EXPOSE_NABLA_NAMESPACES(); - - typename Types::descriptor_set_layout_t::SBinding bindings[] = - { - { - .binding = 0u, - .type = IDescriptor::E_TYPE::ET_UNIFORM_BUFFER, - .createFlags = Types::descriptor_set_layout_t::SBinding::E_CREATE_FLAGS::ECF_NONE, - .stageFlags = IShader::E_SHADER_STAGE::ESS_VERTEX | IShader::E_SHADER_STAGE::ESS_FRAGMENT, - .count = 1u, - } - }; - - if constexpr 
(withAssetConverter) - scratch.descriptorSetLayout = make_smart_refctd_ptr(bindings); - else - scratch.descriptorSetLayout = utilities->getLogicalDevice()->createDescriptorSetLayout(bindings); - - if (!scratch.descriptorSetLayout) - { - logger->log("Could not descriptor set layout!", ILogger::ELL_ERROR); - return false; - } - return true; - } - - bool createDescriptorSet() - { - EXPOSE_NABLA_NAMESPACES(); - - if constexpr (withAssetConverter) - scratch.descriptorSet = make_smart_refctd_ptr(smart_refctd_ptr(scratch.descriptorSetLayout)); - else - { - const IGPUDescriptorSetLayout* const layouts[] = { scratch.descriptorSetLayout.get()}; - const uint32_t setCounts[] = { 1u }; - - // note descriptor set has back smart pointer to its pool, so we dont need to keep it explicitly - auto pool = utilities->getLogicalDevice()->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_NONE, layouts, setCounts); - - if (!pool) - { - logger->log("Could not create Descriptor Pool!", ILogger::ELL_ERROR); - return false; - } - - pool->createDescriptorSets(layouts, &scratch.descriptorSet); - } - - if (!scratch.descriptorSet) - { - logger->log("Could not create Descriptor Set!", ILogger::ELL_ERROR); - return false; - } - - return true; - } - - bool createPipelineLayout() - { - EXPOSE_NABLA_NAMESPACES(); - - const std::span range = {}; - - if constexpr (withAssetConverter) - scratch.pipelineLayout = make_smart_refctd_ptr(range, nullptr, smart_refctd_ptr(scratch.descriptorSetLayout), nullptr, nullptr); - else - scratch.pipelineLayout = utilities->getLogicalDevice()->createPipelineLayout(range, nullptr, smart_refctd_ptr(scratch.descriptorSetLayout), nullptr, nullptr); - - if (!scratch.pipelineLayout) - { - logger->log("Could not create pipeline layout!", ILogger::ELL_ERROR); - return false; - } - - return true; - } bool createRenderpass() { @@ -646,8 +531,6 @@ class ResourceBuilder if constexpr (withAssetConverter) scratch.renderpass = ICPURenderpass::create(params); - else 
- scratch.renderpass = utilities->getLogicalDevice()->createRenderpass(params); if (!scratch.renderpass) { @@ -747,8 +630,6 @@ class ResourceBuilder if constexpr (withAssetConverter) outView = make_smart_refctd_ptr(std::move(params)); - else - outView = utilities->getLogicalDevice()->createImageView(std::move(params)); if (!outView) { @@ -788,8 +669,6 @@ class ResourceBuilder buffer->setContentHash(buffer->computeContentHash()); outShader = std::move(shader); } - else - outShader = utilities->getLogicalDevice()->createShader(shader.get()); return outShader; }; @@ -995,41 +874,6 @@ class ResourceBuilder return true; } - bool createViewParametersUboBuffer() - { - EXPOSE_NABLA_NAMESPACES(); - - using ibuffer_t = ::nbl::asset::IBuffer; // seems to be ambigous, both asset & core namespaces has IBuffer - constexpr static auto UboUsage = bitflag(ibuffer_t::EUF_UNIFORM_BUFFER_BIT) | ibuffer_t::EUF_TRANSFER_DST_BIT | ibuffer_t::EUF_INLINE_UPDATE_VIA_CMDBUF; - - if constexpr (withAssetConverter) - { - auto uboBuffer = ICPUBuffer::create({ sizeof(SBasicViewParameters) }); - uboBuffer->addUsageFlags(UboUsage); - uboBuffer->setContentHash(uboBuffer->computeContentHash()); - scratch.ubo = { .offset = 0u, .buffer = std::move(uboBuffer) }; - } - else - { - const auto mask = utilities->getLogicalDevice()->getPhysicalDevice()->getUpStreamingMemoryTypeBits(); - auto uboBuffer = utilities->getLogicalDevice()->createBuffer(IGPUBuffer::SCreationParams({ .size = sizeof(SBasicViewParameters), .usage = UboUsage })); - - if (!uboBuffer) - return false; - - for (auto it : { uboBuffer }) - { - IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = it->getMemoryReqs(); - reqs.memoryTypeBits &= mask; - - utilities->getLogicalDevice()->allocate(reqs, it.get()); - } - - scratch.ubo = { .offset = 0u, .buffer = std::move(uboBuffer) }; - } - - return true; - } struct GeometriesCpu { @@ -1099,9 +943,6 @@ class ResourceBuilder ResourcesBundleScratch scratch; - nbl::video::IUtilities* const utilities; - 
nbl::video::IGPUCommandBuffer* const commandBuffer; - nbl::system::ILogger* const logger; GeometriesCpu geometries; }; diff --git a/common/include/nbl/examples/geometry/SPushConstants.hlsl b/common/include/nbl/examples/geometry/SPushConstants.hlsl new file mode 100644 index 000000000..f02ddea12 --- /dev/null +++ b/common/include/nbl/examples/geometry/SPushConstants.hlsl @@ -0,0 +1,33 @@ +#ifndef _NBL_EXAMPLES_S_PUSH_CONSTANTS_HLSL_ +#define _NBL_EXAMPLES_S_PUSH_CONSTANTS_HLSL_ + + +#include "nbl/examples/common/SBasicViewParameters.hlsl" + + +namespace nbl +{ +namespace hlsl +{ +namespace examples +{ +namespace geometry_creator_scene +{ + +struct SPushConstants +{ + SBasicViewParameters basic; + uint32_t positionView : 11; + uint32_t normalView : 10; + uint32_t uvView : 11; +}; + +} +} +} +} +#endif + +/* + do not remove this text, WAVE is so bad that you can get errors if no proper ending xD +*/ \ No newline at end of file From 629a0acb6110386e46a3a21fc7d0ece294c3c6d1 Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 16 Jun 2025 21:28:10 +0200 Subject: [PATCH 244/296] correct small typos, get stuff to compile --- 09_GeometryCreator/include/common.hpp | 2 +- 09_GeometryCreator/main.cpp | 11 ++--------- .../nbl/examples/geometry/CGeometryCreatorScene.hpp | 13 +++++++------ 3 files changed, 10 insertions(+), 16 deletions(-) diff --git a/09_GeometryCreator/include/common.hpp b/09_GeometryCreator/include/common.hpp index 02197171d..d172e1959 100644 --- a/09_GeometryCreator/include/common.hpp +++ b/09_GeometryCreator/include/common.hpp @@ -12,6 +12,6 @@ using namespace asset; using namespace ui; using namespace video; using namespace scene; -using namespace examples; +using namespace nbl::examples; #endif // __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__ \ No newline at end of file diff --git a/09_GeometryCreator/main.cpp b/09_GeometryCreator/main.cpp index f246b5c79..af2c0ed93 100644 --- a/09_GeometryCreator/main.cpp +++ b/09_GeometryCreator/main.cpp @@ -4,9 +4,9 @@ #include 
"common.hpp" -class GeometryCreatorApp final : public examples::MonoWindowApplication +class GeometryCreatorApp final : public MonoWindowApplication { - using base_t = examples::MonoWindowApplication; + using base_t = MonoWindowApplication; public: GeometryCreatorApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) @@ -97,13 +97,6 @@ class GeometryCreatorApp final : public examples::MonoWindowApplication const auto viewMatrix = camera.getViewMatrix(); const auto viewProjectionMatrix = camera.getConcatenatedMatrix(); - - - core::matrix3x4SIMD modelViewMatrix = core::concatenateBFollowedByA(viewMatrix, modelMatrix); - core::matrix4SIMD modelViewProjectionMatrix = core::concatenateBFollowedByA(viewProjectionMatrix, modelMatrix); - - core::matrix3x4SIMD normalMatrix; - modelViewMatrix.getSub3x3InverseTranspose(normalMatrix); #if 0 SBasicViewParameters uboData; memcpy(uboData.MVP, modelViewProjectionMatrix.pointer(), sizeof(uboData.MVP)); diff --git a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp index e39e536b0..dbe3933d7 100644 --- a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp +++ b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp @@ -19,7 +19,7 @@ namespace nbl::examples class CGeometryCreatorScene : public core::IReferenceCounted { public: - using SPushConstants = hlsl::geometry_creator_scene::SPushConstants; + using SPushConstants = hlsl::examples::geometry_creator_scene::SPushConstants; // enum ObjectType : uint8_t { @@ -134,11 +134,12 @@ using namespace nbl::video { inline SPushConstants convert(const hlsl::float32_t3x4& model, const hlsl::float32_t3x4& view, const hlsl::float32_t4x4& viewProj) { + using namespace hlsl; return { .basic = { - .MVP = hlsl::math::linalg::promoted_mul(viewProj,model), - .MV = hlsl::math::linalg::promoted_mul(view,model), - .normalMat = 
hlsl::inverse(hlsl::transpose(hlsl::float32_t3x3(view))) + .MVP = math::linalg::promoted_mul(viewProj,model), + .MV = math::linalg::promoted_mul(view,model), + .normalMat = inverse(transpose(float32_t3x3(view))) }, .positionView = positionView, .normalView = normalView, @@ -152,7 +153,7 @@ using namespace nbl::video uint8_t positionView = 0; uint8_t normalView = 0; uint8_t uvView = 0; - uint8_t indexType = EIT_UNKNOWN; + uint8_t indexType = asset::EIT_UNKNOWN; ObjectType type : 6 = ObjectType::OT_UNKNOWN; }; std::span getGeometries() const {return m_params.geoms;} @@ -160,7 +161,7 @@ using namespace nbl::video protected: struct SInitParams { - core::smart_refctd_ptr ds; + core::smart_refctd_ptr ds; core::vector geoms; } m_params; inline CGeometryCreatorScene(SInitParams&& _params) : m_params(std::move(_params)) {} From 37330ab2c7c0b4c2e49bdf4d8b4c64c724dd6f74 Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 16 Jun 2025 22:40:22 +0200 Subject: [PATCH 245/296] add more test geometries --- .../examples/geometry/CGeometryCreatorScene.hpp | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp index dbe3933d7..187d97768 100644 --- a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp +++ b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp @@ -109,7 +109,7 @@ using namespace nbl::video auto addGeometry = [&allocateUTB,&init](const ICPUPolygonGeometry* geom)->void { auto& out = init.geoms.emplace_back(); - out.elementCount = geom->getPrimitiveCount()*geom->getIndexingCallback()->degree(); + out.elementCount = geom->getVertexReferenceCount(); out.positionView = allocateUTB(geom->getPositionView()); out.normalView = allocateUTB(geom->getNormalView()); // the first view is usually the UV @@ -118,7 +118,19 @@ using namespace nbl::video }; auto creator = core::make_smart_refctd_ptr(); - 
addGeometry(creator->createCube().get()); + /* TODO: others + ReferenceObjectCpu {.meta = {.type = OT_CUBE, .name = "Cube Mesh" }, .shadersType = GP_BASIC, .data = gc->createCubeMesh(nbl::core::vector3df(1.f, 1.f, 1.f)) }, + ReferenceObjectCpu {.meta = {.type = OT_SPHERE, .name = "Sphere Mesh" }, .shadersType = GP_BASIC, .data = gc->createSphereMesh(2, 16, 16) }, + ReferenceObjectCpu {.meta = {.type = OT_CYLINDER, .name = "Cylinder Mesh" }, .shadersType = GP_BASIC, .data = gc->createCylinderMesh(2, 2, 20) }, + ReferenceObjectCpu {.meta = {.type = OT_RECTANGLE, .name = "Rectangle Mesh" }, .shadersType = GP_BASIC, .data = gc->createRectangleMesh(nbl::core::vector2df_SIMD(1.5, 3)) }, + ReferenceObjectCpu {.meta = {.type = OT_DISK, .name = "Disk Mesh" }, .shadersType = GP_BASIC, .data = gc->createDiskMesh(2, 30) }, + ReferenceObjectCpu {.meta = {.type = OT_ARROW, .name = "Arrow Mesh" }, .shadersType = GP_BASIC, .data = gc->createArrowMesh() }, + ReferenceObjectCpu {.meta = {.type = OT_CONE, .name = "Cone Mesh" }, .shadersType = GP_CONE, .data = gc->createConeMesh(2, 3, 10) }, + ReferenceObjectCpu {.meta = {.type = OT_ICOSPHERE, .name = "Icoshpere Mesh" }, .shadersType = GP_ICO, .data = gc->createIcoSphere(1, 3, true) } + */ + addGeometry(creator->createCube({1.f,1.f,1.f}).get()); + addGeometry(creator->createRectangle({1.5f,3.f}).get()); + addGeometry(creator->createDisk(2.f,30).get()); } // convert the geometries From 5200ea1cc42f94848f7daef4e70aaa4014743bce Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 16 Jun 2025 23:39:45 +0200 Subject: [PATCH 246/296] fire up the converter and handle ownership between queues --- 09_GeometryCreator/main.cpp | 8 +- .../geometry/CGeometryCreatorScene.hpp | 470 +++++------------- 2 files changed, 124 insertions(+), 354 deletions(-) diff --git a/09_GeometryCreator/main.cpp b/09_GeometryCreator/main.cpp index af2c0ed93..2e31c90dd 100644 --- a/09_GeometryCreator/main.cpp +++ b/09_GeometryCreator/main.cpp @@ -39,14 +39,16 @@ class 
GeometryCreatorApp final : public MonoWindowApplication // auto scRes = static_cast(m_surface->getSwapchainResources()); // .renderpass = core::smart_refctd_ptr(scRes->getRenderpass()) + const uint32_t addtionalBufferOwnershipFamilies[] = {getGraphicsQueue()->getFamilyIndex()}; auto scene = CGeometryCreatorScene::create({ - .utilities = m_utils, - .logger = m_logger + .transferQueue = getTransferUpQueue(), + .utilities = m_utils.get(), + .logger = m_logger.get(), + .addtionalBufferOwnershipFamilies = addtionalBufferOwnershipFamilies }); #if 0 //using Builder = typename CScene::CreateResourcesDirectlyWithDevice::Builder; using Builder = typename CScene::CreateResourcesWithAssetConverter::Builder; - auto oneRunCmd = CScene::createCommandBuffer(m_utils->getLogicalDevice(), m_utils->getLogger(), gQueue->getFamilyIndex()); Builder builder(m_utils.get(), oneRunCmd.get(), m_logger.get(), geometry); // gpu resources diff --git a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp index 187d97768..dd462c03c 100644 --- a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp +++ b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp @@ -44,20 +44,26 @@ using namespace nbl::video // struct SCreateParams { - core::smart_refctd_ptr utilities; - core::smart_refctd_ptr logger; + video::IQueue* transferQueue; + video::IUtilities* utilities; + system::ILogger* logger; + std::span addtionalBufferOwnershipFamilies = {}; }; static inline core::smart_refctd_ptr create(SCreateParams&& params) { EXPOSE_NABLA_NAMESPACES; - auto* logger = params.logger.get(); + auto* logger = params.logger; assert(logger); + if (!params.transferQueue) + { + logger->log("Pass a non-null `IQueue* transferQueue`!",ILogger::ELL_ERROR); + return nullptr; + } if (!params.utilities) { - logger->log("Pass a non-null `IUtilities`!",ILogger::ELL_ERROR); + logger->log("Pass a non-null `IUtilities* 
utilities`!",ILogger::ELL_ERROR); return nullptr; } - auto device = params.utilities->getLogicalDevice(); constexpr auto DescriptorCount = 255; smart_refctd_ptr cpuDS; @@ -94,6 +100,8 @@ using namespace nbl::video } SInitParams init; + constexpr size_t NoIndexBufferMarker = 0xdeadbeefBADC0FFEull; + core::vector> indexBuffers; // create out geometries { auto* const outDescs = cpuDS->getDescriptorInfoStorage(IDescriptor::E_TYPE::ET_UNIFORM_TEXEL_BUFFER).data(); @@ -106,9 +114,16 @@ using namespace nbl::video return nextDesc++; }; - auto addGeometry = [&allocateUTB,&init](const ICPUPolygonGeometry* geom)->void + auto addGeometry = [&allocateUTB,&indexBuffers,&init](const ICPUPolygonGeometry* geom)->void { auto& out = init.geoms.emplace_back(); + if (const auto& view=geom->getIndexView(); view) + { + out.indexBuffer.offset = view.src.offset; + indexBuffers.push_back(view.src.buffer); + } + else + out.indexBuffer.offset = NoIndexBufferMarker; out.elementCount = geom->getVertexReferenceCount(); out.positionView = allocateUTB(geom->getPositionView()); out.normalView = allocateUTB(geom->getNormalView()); @@ -135,7 +150,103 @@ using namespace nbl::video // convert the geometries { - init.ds = nullptr; + auto device = params.utilities->getLogicalDevice(); + smart_refctd_ptr converter = CAssetConverter::create({.device=device}); + + + const auto transferFamily = params.transferQueue->getFamilyIndex(); + + struct SInputs : CAssetConverter::SInputs + { + virtual inline std::span getSharedOwnershipQueueFamilies(const size_t groupCopyID, const asset::ICPUBuffer* buffer, const CAssetConverter::patch_t& patch) const + { + return sharedBufferOwnership; + } + + core::vector sharedBufferOwnership; + } inputs = {}; + { + inputs.logger = logger; + // descriptor set should convert everthing downstream + std::get>(inputs.assets) = {&cpuDS.get(),1}; + // except index buffers + if (!indexBuffers.empty()) + std::get>(inputs.assets) = {&indexBuffers.front().get(),indexBuffers.size()}; + // 
set up shared ownership so we don't have to + core::unordered_set families; + families.insert(transferFamily); + families.insert(params.addtionalBufferOwnershipFamilies.begin(),params.addtionalBufferOwnershipFamilies.end()); + if (families.size()>1) + for (const auto fam : families) + inputs.sharedBufferOwnership.push_back(fam); + } + + // reserve + auto reservation = converter->reserve(inputs); + if (!reservation) + { + logger->log("Failed to reserve GPU objects for CPU->GPU conversion!",ILogger::ELL_ERROR); + return nullptr; + } + + // convert + { + auto semaphore = device->createSemaphore(0u); + + constexpr auto MultiBuffering = 2; + std::array,MultiBuffering> commandBuffers = {}; + { + auto pool = device->createCommandPool(transferFamily,IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT|IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT); + pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,commandBuffers,smart_refctd_ptr(logger)); + } + commandBuffers.front()->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + + std::array commandBufferSubmits; + for (auto i=0; ilog("Failed to await submission feature!", ILogger::ELL_ERROR); + return nullptr; + } + } + + // assign outputs + { + auto assign = [logger](auto& out, const auto& in)->bool + { + if (!in.value) + { + logger->log("Failed to convert CPU object to GPU!",ILogger::ELL_ERROR); + return false; + } + out = in.value; + return true; + }; + if (!assign(init.ds,reservation.getGPUObjects().front())) + return nullptr; + auto indexBufIt = reservation.getGPUObjects().data(); + for (auto& entry : init.geoms) + if (entry.indexBuffer.offset!=NoIndexBufferMarker) + if (!assign(entry.indexBuffer.buffer,*(indexBufIt++))) + return nullptr; + } } return smart_refctd_ptr(new CGeometryCreatorScene(std::move(init)),dont_grab); @@ -159,7 +270,7 @@ using namespace nbl::video }; } - core::smart_refctd_ptr indexBuffer = nullptr; + asset::SBufferBinding indexBuffer = {}; uint32_t elementCount = 0; // indices into 
the descriptor set uint8_t positionView = 0; @@ -184,266 +295,6 @@ using namespace nbl::video #if 0 class ResourceBuilder { -public: - - inline bool finalize(ResourcesBundle& output, nbl::video::CThreadSafeQueueAdapter* transferCapableQueue) - { - EXPOSE_NABLA_NAMESPACES(); - - // TODO: use multiple command buffers - std::array commandBuffers = {}; - { - commandBuffers.front().cmdbuf = commandBuffer; - } - - { - // note that asset converter records basic transfer uploads itself, we only begin the recording with ONE_TIME_SUBMIT_BIT - commandBuffer->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); - commandBuffer->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - commandBuffer->beginDebugMarker("Resources builder's buffers upload [asset converter]"); - - // asset converter - scratch at this point has ready to convert cpu resources - smart_refctd_ptr converter = CAssetConverter::create({ .device = utilities->getLogicalDevice(),.optimizer = {} }); - CAssetConverter::SInputs inputs = {}; - inputs.logger = logger; - - struct ProxyCpuHooks - { - using object_size_t = std::tuple_size; - - std::array renderpass; - std::array pipelines; - std::array buffers; - std::array attachments; - std::array descriptorSet; - } hooks; - - enum AttachmentIx - { - AI_COLOR = 0u, - AI_DEPTH = 1u, - - AI_COUNT - }; - - // gather CPU assets into span memory views - { - hooks.renderpass.front() = scratch.renderpass.get(); - for (uint32_t i = 0u; i < hooks.pipelines.size(); ++i) - { - auto& [reference, meta] = scratch.objects[static_cast(i)]; - hooks.pipelines[i] = reference.pipeline.get(); - - // [[ [vertex, index] [vertex, index] [vertex, index] ... 
[ubo] ]] - hooks.buffers[2u * i + 0u] = reference.bindings.vertex.buffer.get(); - hooks.buffers[2u * i + 1u] = reference.bindings.index.buffer.get(); - } - hooks.buffers.back() = scratch.ubo.buffer.get(); - hooks.attachments[AI_COLOR] = scratch.attachments.color.get(); - hooks.attachments[AI_DEPTH] = scratch.attachments.depth.get(); - hooks.descriptorSet.front() = scratch.descriptorSet.get(); - } - - // assign the CPU hooks to converter's inputs - { - std::get>(inputs.assets) = hooks.renderpass; - std::get>(inputs.assets) = hooks.pipelines; - std::get>(inputs.assets) = hooks.buffers; - // std::get>(inputs.assets) = hooks.attachments; // NOTE: THIS IS NOT IMPLEMENTED YET IN CONVERTER! - std::get>(inputs.assets) = hooks.descriptorSet; - } - - // reserve and create the GPU object handles - auto reservation = converter->reserve(inputs); - { - auto prepass = [&](const auto& references) -> bool - { - // retrieve the reserved handles - auto objects = reservation.getGPUObjects(); - - uint32_t counter = {}; - for (auto& object : objects) - { - // anything that fails to be reserved is a nullptr in the span of GPU Objects - auto gpu = object.value; - auto* reference = references[counter]; - - if (reference) - { - // validate - if (!gpu) // throw errors only if corresponding cpu hook was VALID (eg. 
we may have nullptr for some index buffers in the span for converter but it's OK, I'm too lazy to filter them before passing to the converter inputs and don't want to deal with dynamic alloc) - { - logger->log("Failed to convert a CPU object to GPU!", ILogger::ELL_ERROR); - return false; - } - } - - ++counter; - } - - return true; - }; - - prepass.template operator() < ICPURenderpass > (hooks.renderpass); - prepass.template operator() < ICPUGraphicsPipeline > (hooks.pipelines); - prepass.template operator() < ICPUBuffer > (hooks.buffers); - // validate.template operator() < ICPUImageView > (hooks.attachments); - prepass.template operator() < ICPUDescriptorSet > (hooks.descriptorSet); - } - - auto semaphore = utilities->getLogicalDevice()->createSemaphore(0u); - - // TODO: compute submit as well for the images' mipmaps - SIntendedSubmitInfo transfer = {}; - transfer.queue = transferCapableQueue; - transfer.scratchCommandBuffers = commandBuffers; - transfer.scratchSemaphore = { - .semaphore = semaphore.get(), - .value = 0u, - .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS - }; - // issue the convert call - { - CAssetConverter::SConvertParams params = {}; - params.utilities = utilities; - params.transfer = &transfer; - - // basically it records all data uploads and submits them right away - auto future = reservation.convert(params); - if (future.copy()!=IQueue::RESULT::SUCCESS) - { - logger->log("Failed to await submission feature!", ILogger::ELL_ERROR); - return false; - } - - // assign gpu objects to output - auto& base = static_cast(output); - { - auto&& [renderpass, pipelines, buffers, descriptorSet] = std::make_tuple(reservation.getGPUObjects().front().value, reservation.getGPUObjects(), reservation.getGPUObjects(), reservation.getGPUObjects().front().value); - { - base.renderpass = renderpass; - for (uint32_t i = 0u; i < pipelines.size(); ++i) - { - const auto type = static_cast(i); - const auto& [rcpu, rmeta] = scratch.objects[type]; - auto& [gpu, meta] = 
base.objects[type]; - - gpu.pipeline = pipelines[i].value; - // [[ [vertex, index] [vertex, index] [vertex, index] ... [ubo] ]] - gpu.bindings.vertex = {.offset = 0u, .buffer = buffers[2u * i + 0u].value}; - gpu.bindings.index = {.offset = 0u, .buffer = buffers[2u * i + 1u].value}; - - gpu.indexCount = rcpu.indexCount; - gpu.indexType = rcpu.indexType; - meta.name = rmeta.name; - meta.type = rmeta.type; - } - base.ubo = {.offset = 0u, .buffer = buffers.back().value}; - base.descriptorSet = descriptorSet; - - /* - // base.attachments.color = attachments[AI_COLOR].value; - // base.attachments.depth = attachments[AI_DEPTH].value; - - note conversion of image views is not yet supported by the asset converter - - it's complicated, we have to kinda temporary ignore DRY a bit here to not break the design which is correct - - TEMPORARY: we patch attachments by allocating them ourselves here given cpu instances & parameters - TODO: remove following code once asset converter works with image views & update stuff - */ - - for (uint32_t i = 0u; i < AI_COUNT; ++i) - { - const auto* reference = hooks.attachments[i]; - auto& out = (i == AI_COLOR ? base.attachments.color : base.attachments.depth); - - const auto& viewParams = reference->getCreationParameters(); - const auto& imageParams = viewParams.image->getCreationParameters(); - - auto image = utilities->getLogicalDevice()->createImage - ( - IGPUImage::SCreationParams - ({ - .type = imageParams.type, - .samples = imageParams.samples, - .format = imageParams.format, - .extent = imageParams.extent, - .mipLevels = imageParams.mipLevels, - .arrayLayers = imageParams.arrayLayers, - .usage = imageParams.usage - }) - ); - - if (!image) - { - logger->log("Could not create image!", ILogger::ELL_ERROR); - return false; - } - - bool IS_DEPTH = isDepthOrStencilFormat(imageParams.format); - std::string_view DEBUG_NAME = IS_DEPTH ? 
"UI Scene Depth Attachment Image" : "UI Scene Color Attachment Image"; - image->setObjectDebugName(DEBUG_NAME.data()); - - if (!utilities->getLogicalDevice()->allocate(image->getMemoryReqs(), image.get()).isValid()) - { - logger->log("Could not allocate memory for an image!", ILogger::ELL_ERROR); - return false; - } - - out = utilities->getLogicalDevice()->createImageView - ( - IGPUImageView::SCreationParams - ({ - .flags = viewParams.flags, - .subUsages = viewParams.subUsages, - .image = std::move(image), - .viewType = viewParams.viewType, - .format = viewParams.format, - .subresourceRange = viewParams.subresourceRange - }) - ); - - if (!out) - { - logger->log("Could not create image view!", ILogger::ELL_ERROR); - return false; - } - } - - logger->log("Image View attachments has been allocated by hand after asset converter successful submit becasuse it doesn't support converting them yet!", ILogger::ELL_WARNING); - } - } - } - } - - // write the descriptor set - { - // descriptor write ubo - IGPUDescriptorSet::SWriteDescriptorSet write; - write.dstSet = output.descriptorSet.get(); - write.binding = 0; - write.arrayElement = 0u; - write.count = 1u; - - IGPUDescriptorSet::SDescriptorInfo info; - { - info.desc = smart_refctd_ptr(output.ubo.buffer); - info.info.buffer.offset = output.ubo.offset; - info.info.buffer.size = output.ubo.buffer->getSize(); - } - - write.info = &info; - - if(!utilities->getLogicalDevice()->updateDescriptorSets(1u, &write, 0u, nullptr)) - { - logger->log("Could not write descriptor set!", ILogger::ELL_ERROR); - return false; - } - } - - return true; - } - private: @@ -988,49 +839,6 @@ class CScene final : public nbl::core::IReferenceCounted nbl::core::smart_refctd_ptr progress; } semaphore; - struct CreateResourcesDirectlyWithDevice { using Builder = ResourceBuilder; }; - struct CreateResourcesWithAssetConverter { using Builder = ResourceBuilder; }; - - ~CScene() {} - - static inline nbl::core::smart_refctd_ptr 
createCommandBuffer(nbl::video::ILogicalDevice* const device, nbl::system::ILogger* const logger, const uint32_t familyIx) - { - EXPOSE_NABLA_NAMESPACES(); - auto pool = device->createCommandPool(familyIx, IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - - if (!pool) - { - logger->log("Couldn't create Command Pool!", ILogger::ELL_ERROR); - return nullptr; - } - - nbl::core::smart_refctd_ptr cmd; - - if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { &cmd , 1 })) - { - logger->log("Couldn't create Command Buffer!", ILogger::ELL_ERROR); - return nullptr; - } - - return cmd; - } - - template - static auto create(Args&&... args) -> decltype(auto) - { - EXPOSE_NABLA_NAMESPACES(); - - /* - user should call the constructor's args without last argument explicitly, this is a trick to make constructor templated, - eg.create(smart_refctd_ptr(device), smart_refctd_ptr(logger), queuePointer, geometryPointer) - */ - - auto* scene = new CScene(std::forward(args)..., CreateWith {}); - smart_refctd_ptr smart(scene, dont_grab); - - return smart; - } - inline void begin() { EXPOSE_NABLA_NAMESPACES(); @@ -1109,46 +917,6 @@ class CScene final : public nbl::core::IReferenceCounted m_commandBuffer->end(); } - inline bool submit() - { - EXPOSE_NABLA_NAMESPACES(); - - const IQueue::SSubmitInfo::SCommandBufferInfo buffers[] = - { - { .cmdbuf = m_commandBuffer.get() } - }; - - const IQueue::SSubmitInfo::SSemaphoreInfo signals[] = { {.semaphore = semaphore.progress.get(),.value = semaphore.finishedValue,.stageMask = PIPELINE_STAGE_FLAGS::FRAMEBUFFER_SPACE_BITS} }; - - const IQueue::SSubmitInfo infos[] = - { - { - .waitSemaphores = {}, - .commandBuffers = buffers, - .signalSemaphores = signals - } - }; - - return queue->submit(infos) == IQueue::RESULT::SUCCESS; - } - - // note: must be updated outside render pass - inline void update() - { - EXPOSE_NABLA_NAMESPACES(); - - SBufferRange range; - range.buffer = smart_refctd_ptr(resources.ubo.buffer); - range.size 
= resources.ubo.buffer->getSize(); - - m_commandBuffer->updateBuffer(range, &object.viewParameters); - } - - inline decltype(auto) getResources() - { - return (resources); // note: do not remove "()" - it makes the return type lvalue reference instead of copy - } - private: template // TODO: enforce constraints, only those 2 above are valid CScene(nbl::core::smart_refctd_ptr _utilities, nbl::core::smart_refctd_ptr _logger, nbl::video::CThreadSafeQueueAdapter* _graphicsQueue, const nbl::asset::IGeometryCreator* _geometryCreator, CreateWith createWith = {}) From a89ffbef02366cb06931ac3cbc9a4b05b8c15155 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 17 Jun 2025 15:43:04 +0700 Subject: [PATCH 247/296] make accessor template nicer to read --- .../app_resources/testWorkgroup.comp.hlsl | 2 +- .../benchmarkWorkgroup.comp.hlsl | 14 ++++++------- common/include/WorkgroupDataAccessors.hlsl | 20 +++++++++---------- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl index 4b30526a6..38e8b250f 100644 --- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl @@ -30,7 +30,7 @@ struct operation_t // workgroup scans do no return anything, but use the data accessor to do the storing directly void operator()() { - using data_proxy_t = PreloadedDataProxy; + using data_proxy_t = PreloadedDataProxy; data_proxy_t dataAccessor = data_proxy_t::create(pc.pInputBuf, pc.pOutputBuf[Binop::BindingIndex]); dataAccessor.preload(); #if IS_REDUCTION diff --git a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl index 561aadc56..50a9d912b 100644 --- a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl +++ b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl @@ -19,18 +19,18 @@ 
groupshared uint32_t scratch[mpl::max_v +template struct RandomizedInputDataProxy { using dtype_t = vector; - NBL_CONSTEXPR_STATIC_INLINE uint16_t PreloadedDataCount = _PreloadedDataCount; NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(1u) << WorkgroupSizeLog2; + NBL_CONSTEXPR_STATIC_INLINE uint16_t PreloadedDataCount = VirtualWorkgroupSize / WorkgroupSize; - static RandomizedInputDataProxy create(uint64_t inputBuf, uint64_t outputBuf) + static RandomizedInputDataProxy create(uint64_t inputBuf, uint64_t outputBuf) { - RandomizedInputDataProxy retval; - retval.data = DataProxy::create(inputBuf, outputBuf); + RandomizedInputDataProxy retval; + retval.data = DataProxy::create(inputBuf, outputBuf); return retval; } @@ -69,13 +69,13 @@ struct RandomizedInputDataProxy //glsl::memoryBarrierShared(); implied by the above } - DataProxy data; + DataProxy data; dtype_t preloaded[PreloadedDataCount]; }; static ScratchProxy arithmeticAccessor; -using data_proxy_t = RandomizedInputDataProxy; +using data_proxy_t = RandomizedInputDataProxy; template struct operation_t diff --git a/common/include/WorkgroupDataAccessors.hlsl b/common/include/WorkgroupDataAccessors.hlsl index e1774fad6..a274f5c08 100644 --- a/common/include/WorkgroupDataAccessors.hlsl +++ b/common/include/WorkgroupDataAccessors.hlsl @@ -31,15 +31,15 @@ struct ScratchProxy } }; -template +template struct DataProxy { using dtype_t = vector; - static DataProxy create(uint64_t inputBuf, uint64_t outputBuf) + static DataProxy create(const uint64_t inputBuf, const uint64_t outputBuf) { - DataProxy retval; - retval.workgroupOffset = glsl::gl_WorkGroupID().x * WorkgroupSize; + DataProxy retval; + retval.workgroupOffset = glsl::gl_WorkGroupID().x * VirtualWorkgroupSize; retval.inputBufAddr = inputBuf; retval.outputBufAddr = outputBuf; return retval; @@ -67,18 +67,18 @@ struct DataProxy uint64_t outputBufAddr; }; -template +template struct PreloadedDataProxy { using dtype_t = vector; - 
NBL_CONSTEXPR_STATIC_INLINE uint16_t PreloadedDataCount = _PreloadedDataCount; NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(1u) << WorkgroupSizeLog2; + NBL_CONSTEXPR_STATIC_INLINE uint16_t PreloadedDataCount = VirtualWorkgroupSize / WorkgroupSize; - static PreloadedDataProxy create(uint64_t inputBuf, uint64_t outputBuf) + static PreloadedDataProxy create(const uint64_t inputBuf, const uint64_t outputBuf) { - PreloadedDataProxy retval; - retval.data = DataProxy::create(inputBuf, outputBuf); + PreloadedDataProxy retval; + retval.data = DataProxy::create(inputBuf, outputBuf); return retval; } @@ -114,7 +114,7 @@ struct PreloadedDataProxy //glsl::memoryBarrierShared(); implied by the above } - DataProxy data; + DataProxy data; dtype_t preloaded[PreloadedDataCount]; }; From 272a26918cc32f15c7ec77acdde63fed7fdff921 Mon Sep 17 00:00:00 2001 From: devsh Date: Tue, 17 Jun 2025 17:18:12 +0200 Subject: [PATCH 248/296] use Asset Converter for ICPUPolygonGeometry, and split into Scene and Renderer --- 09_GeometryCreator/main.cpp | 57 +-- .../common/CSwapchainFramebuffersAndDepth.hpp | 2 +- .../geometry/CGeometryCreatorScene.hpp | 480 +++++------------- 3 files changed, 163 insertions(+), 376 deletions(-) diff --git a/09_GeometryCreator/main.cpp b/09_GeometryCreator/main.cpp index 2e31c90dd..a98dcee5b 100644 --- a/09_GeometryCreator/main.cpp +++ b/09_GeometryCreator/main.cpp @@ -37,29 +37,25 @@ class GeometryCreatorApp final : public MonoWindowApplication return logFail("Couldn't create Command Buffer!"); } -// auto scRes = static_cast(m_surface->getSwapchainResources()); -// .renderpass = core::smart_refctd_ptr(scRes->getRenderpass()) const uint32_t addtionalBufferOwnershipFamilies[] = {getGraphicsQueue()->getFamilyIndex()}; - auto scene = CGeometryCreatorScene::create({ - .transferQueue = getTransferUpQueue(), - .utilities = m_utils.get(), - .logger = m_logger.get(), - .addtionalBufferOwnershipFamilies = addtionalBufferOwnershipFamilies - }); -#if 0 - //using 
Builder = typename CScene::CreateResourcesDirectlyWithDevice::Builder; - using Builder = typename CScene::CreateResourcesWithAssetConverter::Builder; - Builder builder(m_utils.get(), oneRunCmd.get(), m_logger.get(), geometry); + // we want to use the vertex data through UTBs + using usage_f = IGPUBuffer::E_USAGE_FLAGS; + CAssetConverter::patch_t patch = {}; + patch.positionBufferUsages = usage_f::EUF_UNIFORM_TEXEL_BUFFER_BIT; + patch.indexBufferUsages = usage_f::EUF_INDEX_BUFFER_BIT; + patch.otherBufferUsages = usage_f::EUF_UNIFORM_TEXEL_BUFFER_BIT; + auto scene = CGeometryCreatorScene::create( + { + .transferQueue = getTransferUpQueue(), + .utilities = m_utils.get(), + .logger = m_logger.get(), + .addtionalBufferOwnershipFamilies = addtionalBufferOwnershipFamilies + },patch + ); + + auto scRes = static_cast(m_surface->getSwapchainResources()); + auto renderer = CSimpleDebugRenderer::create(scRes->getRenderpass(),0,scene.get()); - // gpu resources - if (builder.build()) - { - if (!builder.finalize(resources, gQueue)) - m_logger->log("Could not finalize resource objects to gpu objects!", ILogger::ELL_ERROR); - } - else - m_logger->log("Could not build resource objects!", ILogger::ELL_ERROR); -#endif // camera { core::vectorSIMDf cameraPosition(-5.81655884, 2.58630896, -4.23974705); @@ -139,7 +135,7 @@ class GeometryCreatorApp final : public MonoWindowApplication .extent = {m_window->getWidth(),m_window->getHeight()} }; - const IGPUCommandBuffer::SClearColorValue clearValue = { .float32 = {0.f,0.f,0.f,1.f} }; + const IGPUCommandBuffer::SClearColorValue clearValue = { .float32 = {1.f,0.f,1.f,1.f} }; const IGPUCommandBuffer::SClearDepthStencilValue depthValue = { .depth = 0.f }; auto scRes = static_cast(m_surface->getSwapchainResources()); const IGPUCommandBuffer::SRenderpassBeginInfo info = @@ -214,19 +210,20 @@ class GeometryCreatorApp final : public MonoWindowApplication { // Subsequent submits don't wait for each other, hence its important to have External 
Dependencies which prevent users of the depth attachment overlapping. const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] = { - // wipe-transition of Color to ATTACHMENT_OPTIMAL + // wipe-transition of Color to ATTACHMENT_OPTIMAL and depth { .srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, .dstSubpass = 0, .memoryBarrier = { - // last place where the depth can get modified in previous frame + // last place where the depth can get modified in previous frame, `COLOR_ATTACHMENT_OUTPUT_BIT` is implicitly later .srcStageMask = PIPELINE_STAGE_FLAGS::LATE_FRAGMENT_TESTS_BIT, - // only write ops, reads can't be made available - .srcAccessMask = ACCESS_FLAGS::DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, + // don't want any writes to be available, we'll clear + .srcAccessMask = ACCESS_FLAGS::NONE, // destination needs to wait as early as possible - .dstStageMask = PIPELINE_STAGE_FLAGS::EARLY_FRAGMENT_TESTS_BIT, - // because of depth test needing a read and a write - .dstAccessMask = ACCESS_FLAGS::DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | ACCESS_FLAGS::DEPTH_STENCIL_ATTACHMENT_READ_BIT + // TODO: `COLOR_ATTACHMENT_OUTPUT_BIT` shouldn't be needed, because its a logically later stage, see TODO in `ECommonEnums.h` + .dstStageMask = PIPELINE_STAGE_FLAGS::EARLY_FRAGMENT_TESTS_BIT | PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, + // because depth and color get cleared first no read mask + .dstAccessMask = ACCESS_FLAGS::DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT } // leave view offsets and flags default }, @@ -235,7 +232,7 @@ class GeometryCreatorApp final : public MonoWindowApplication .srcSubpass = 0, .dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, .memoryBarrier = { - // last place where the depth can get modified + // last place where the color can get modified, depth is implicitly earlier .srcStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, // only write 
ops, reads can't be made available .srcAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT diff --git a/common/include/nbl/examples/common/CSwapchainFramebuffersAndDepth.hpp b/common/include/nbl/examples/common/CSwapchainFramebuffersAndDepth.hpp index a79d59730..ef88fb325 100644 --- a/common/include/nbl/examples/common/CSwapchainFramebuffersAndDepth.hpp +++ b/common/include/nbl/examples/common/CSwapchainFramebuffersAndDepth.hpp @@ -36,7 +36,7 @@ class CSwapchainFramebuffersAndDepth final : public video::CDefaultSwapchainFram /*.loadOp = */{IGPURenderpass::LOAD_OP::CLEAR}, /*.storeOp = */{IGPURenderpass::STORE_OP::STORE}, /*.initialLayout = */{IGPUImage::LAYOUT::UNDEFINED}, // because we clear we don't care about contents - /*.finalLayout = */{IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL} // transition to presentation right away so we can skip a barrier + /*.finalLayout = */{IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL} }}, IGPURenderpass::SCreationParams::DepthStencilAttachmentsEnd }; diff --git a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp index dd462c03c..1f8d1ac6a 100644 --- a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp +++ b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp @@ -16,10 +16,14 @@ namespace nbl::examples { +#define EXPOSE_NABLA_NAMESPACES using namespace nbl::core; \ +using namespace nbl::system; \ +using namespace nbl::asset; \ +using namespace nbl::video + class CGeometryCreatorScene : public core::IReferenceCounted { public: - using SPushConstants = hlsl::examples::geometry_creator_scene::SPushConstants; // enum ObjectType : uint8_t { @@ -36,11 +40,6 @@ class CGeometryCreatorScene : public core::IReferenceCounted OT_UNKNOWN = OT_COUNT }; -#define EXPOSE_NABLA_NAMESPACES using namespace nbl::core; \ -using namespace nbl::system; \ -using namespace nbl::asset; \ -using namespace nbl::video - // struct SCreateParams { @@ -49,7 +48,7 @@ using 
namespace nbl::video system::ILogger* logger; std::span addtionalBufferOwnershipFamilies = {}; }; - static inline core::smart_refctd_ptr create(SCreateParams&& params) + static inline core::smart_refctd_ptr create(SCreateParams&& params, const video::CAssetConverter::patch_t& geometryPatch) { EXPOSE_NABLA_NAMESPACES; auto* logger = params.logger; @@ -65,71 +64,15 @@ using namespace nbl::video return nullptr; } - constexpr auto DescriptorCount = 255; - smart_refctd_ptr cpuDS; - { - // create Descriptor Set Layout - smart_refctd_ptr dsLayout; - { - const ICPUDescriptorSetLayout::SBinding bindings[] = - { - { - .binding = 0, - .type = IDescriptor::E_TYPE::ET_UNIFORM_TEXEL_BUFFER, - // some geometries may not have particular attributes - .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_PARTIALLY_BOUND_BIT, - .stageFlags = IShader::E_SHADER_STAGE::ESS_VERTEX|IShader::E_SHADER_STAGE::ESS_FRAGMENT, - .count = DescriptorCount - } - }; - dsLayout = core::make_smart_refctd_ptr(bindings); - if (!dsLayout) - { - logger->log("Could not create descriptor set layout!", ILogger::ELL_ERROR); - return nullptr; - } - } - - // create Descriptor Set - cpuDS = core::make_smart_refctd_ptr(std::move(dsLayout)); - if (!cpuDS) - { - logger->log("Could not descriptor set!", ILogger::ELL_ERROR); - return nullptr; - } - } - SInitParams init; - constexpr size_t NoIndexBufferMarker = 0xdeadbeefBADC0FFEull; - core::vector> indexBuffers; + core::vector namedGeometries; + core::vector> geometries; // create out geometries { - auto* const outDescs = cpuDS->getDescriptorInfoStorage(IDescriptor::E_TYPE::ET_UNIFORM_TEXEL_BUFFER).data(); - uint8_t nextDesc = 0; - auto allocateUTB = [DescriptorCount,outDescs,&nextDesc](const IGeometry::SDataView& view)->uint8_t + auto addGeometry = [&namedGeometries,&geometries](const std::string_view name, smart_refctd_ptr&& geom)->void { - if (!view) - return DescriptorCount; - outDescs[nextDesc].desc = 
core::make_smart_refctd_ptr(view.src,view.composed.format); - return nextDesc++; - }; - - auto addGeometry = [&allocateUTB,&indexBuffers,&init](const ICPUPolygonGeometry* geom)->void - { - auto& out = init.geoms.emplace_back(); - if (const auto& view=geom->getIndexView(); view) - { - out.indexBuffer.offset = view.src.offset; - indexBuffers.push_back(view.src.buffer); - } - else - out.indexBuffer.offset = NoIndexBufferMarker; - out.elementCount = geom->getVertexReferenceCount(); - out.positionView = allocateUTB(geom->getPositionView()); - out.normalView = allocateUTB(geom->getNormalView()); - // the first view is usually the UV - if (const auto& auxViews = geom->getAuxAttributeViews(); !auxViews.empty()) - out.uvView = allocateUTB(auxViews.front()); + namedGeometries.emplace_back().name = name; + geometries.push_back(std::move(geom)); }; auto creator = core::make_smart_refctd_ptr(); @@ -143,9 +86,9 @@ using namespace nbl::video ReferenceObjectCpu {.meta = {.type = OT_CONE, .name = "Cone Mesh" }, .shadersType = GP_CONE, .data = gc->createConeMesh(2, 3, 10) }, ReferenceObjectCpu {.meta = {.type = OT_ICOSPHERE, .name = "Icoshpere Mesh" }, .shadersType = GP_ICO, .data = gc->createIcoSphere(1, 3, true) } */ - addGeometry(creator->createCube({1.f,1.f,1.f}).get()); - addGeometry(creator->createRectangle({1.5f,3.f}).get()); - addGeometry(creator->createDisk(2.f,30).get()); + addGeometry("Cube",creator->createCube({1.f,1.f,1.f})); + addGeometry("Rectangle",creator->createRectangle({1.5f,3.f})); + addGeometry("Disk",creator->createDisk(2.f,30)); } // convert the geometries @@ -165,13 +108,11 @@ using namespace nbl::video core::vector sharedBufferOwnership; } inputs = {}; + core::vector> patches(geometries.size(),geometryPatch); { inputs.logger = logger; - // descriptor set should convert everthing downstream - std::get>(inputs.assets) = {&cpuDS.get(),1}; - // except index buffers - if (!indexBuffers.empty()) - std::get>(inputs.assets) = 
{&indexBuffers.front().get(),indexBuffers.size()}; + std::get>(inputs.assets) = {&geometries.front().get(),geometries.size()}; + std::get>(inputs.patches) = patches; // set up shared ownership so we don't have to core::unordered_set families; families.insert(transferFamily); @@ -229,30 +170,44 @@ using namespace nbl::video // assign outputs { - auto assign = [logger](auto& out, const auto& in)->bool + auto inIt = reservation.getGPUObjects().data(); + for (auto outIt=namedGeometries.begin(); outIt!=namedGeometries.end(); inIt++) { - if (!in.value) + if (inIt->value) + (outIt++)->geom = inIt->value; + else { - logger->log("Failed to convert CPU object to GPU!",ILogger::ELL_ERROR); - return false; + logger->log("Failed to convert ICPUPolygonGeometry %s to GPU!",ILogger::ELL_ERROR,outIt->name.data()); + outIt = namedGeometries.erase(outIt); } - out = in.value; - return true; - }; - if (!assign(init.ds,reservation.getGPUObjects().front())) - return nullptr; - auto indexBufIt = reservation.getGPUObjects().data(); - for (auto& entry : init.geoms) - if (entry.indexBuffer.offset!=NoIndexBufferMarker) - if (!assign(entry.indexBuffer.buffer,*(indexBufIt++))) - return nullptr; + } } } - return smart_refctd_ptr(new CGeometryCreatorScene(std::move(init)),dont_grab); + return smart_refctd_ptr(new CGeometryCreatorScene(std::move(namedGeometries)),dont_grab); } // + struct SNamedGeometry + { + std::string_view name = {}; + core::smart_refctd_ptr geom; + }; + std::span getGeometries() const {return m_geometries;} + + protected: + inline CGeometryCreatorScene(core::vector&& _geometries) : m_geometries(std::move(_geometries)) {} + + core::vector m_geometries; +}; + +class CSimpleDebugRenderer final : public core::IReferenceCounted +{ + public: + // + constexpr static inline auto DescriptorCount = 255; + // + using SPushConstants = hlsl::examples::geometry_creator_scene::SPushConstants; struct SPackedGeometry { inline SPushConstants convert(const hlsl::float32_t3x4& model, const 
hlsl::float32_t3x4& view, const hlsl::float32_t4x4& viewProj) @@ -270,251 +225,132 @@ using namespace nbl::video }; } - asset::SBufferBinding indexBuffer = {}; + asset::SBufferBinding indexBuffer = {}; uint32_t elementCount = 0; // indices into the descriptor set uint8_t positionView = 0; uint8_t normalView = 0; uint8_t uvView = 0; uint8_t indexType = asset::EIT_UNKNOWN; - ObjectType type : 6 = ObjectType::OT_UNKNOWN; }; - std::span getGeometries() const {return m_params.geoms;} - protected: - struct SInitParams + static inline core::smart_refctd_ptr create(video::IGPURenderpass* renderpass, const uint32_t subpassIX, const CGeometryCreatorScene* scene) { - core::smart_refctd_ptr ds; - core::vector geoms; - } m_params; - inline CGeometryCreatorScene(SInitParams&& _params) : m_params(std::move(_params)) {} - -#undef EXPOSE_NABLA_NAMESPACES -}; + EXPOSE_NABLA_NAMESPACES; -#if 0 -class ResourceBuilder -{ -private: + if (!renderpass) + return nullptr; + auto device = const_cast(renderpass->getOriginDevice()); + auto logger = device->getLogger(); + if (!scene) + return nullptr; + const auto namedGeoms = scene->getGeometries(); + if (namedGeoms.empty()) + return nullptr; - bool createRenderpass() - { - EXPOSE_NABLA_NAMESPACES(); + // TODO: Load Shaders and Create Pipelines - static constexpr Types::renderpass_t::SCreationParams::SColorAttachmentDescription colorAttachments[] = - { - { - { - { - .format = ColorFboAttachmentFormat, - .samples = Samples, - .mayAlias = false - }, - /* .loadOp = */ Types::renderpass_t::LOAD_OP::CLEAR, - /* .storeOp = */ Types::renderpass_t::STORE_OP::STORE, - /* .initialLayout = */ Types::image_t::LAYOUT::UNDEFINED, - /* .finalLayout = */ Types::image_t::LAYOUT::READ_ONLY_OPTIMAL - } - }, - Types::renderpass_t::SCreationParams::ColorAttachmentsEnd - }; + SInitParams init; - static constexpr Types::renderpass_t::SCreationParams::SDepthStencilAttachmentDescription depthAttachments[] = - { + // create descriptor set { + // create Descriptor Set 
Layout + smart_refctd_ptr dsLayout; { + const IGPUDescriptorSetLayout::SBinding bindings[] = { - .format = DepthFboAttachmentFormat, - .samples = Samples, - .mayAlias = false - }, - /* .loadOp = */ {Types::renderpass_t::LOAD_OP::CLEAR}, - /* .storeOp = */ {Types::renderpass_t::STORE_OP::STORE}, - /* .initialLayout = */ {Types::image_t::LAYOUT::UNDEFINED}, - /* .finalLayout = */ {Types::image_t::LAYOUT::ATTACHMENT_OPTIMAL} - } - }, - Types::renderpass_t::SCreationParams::DepthStencilAttachmentsEnd - }; - - typename Types::renderpass_t::SCreationParams::SSubpassDescription subpasses[] = - { - {}, - Types::renderpass_t::SCreationParams::SubpassesEnd - }; - - subpasses[0].depthStencilAttachment.render = { .attachmentIndex = 0u,.layout = Types::image_t::LAYOUT::ATTACHMENT_OPTIMAL }; - subpasses[0].colorAttachments[0] = { .render = {.attachmentIndex = 0u, .layout = Types::image_t::LAYOUT::ATTACHMENT_OPTIMAL } }; - - static constexpr Types::renderpass_t::SCreationParams::SSubpassDependency dependencies[] = - { - // wipe-transition of Color to ATTACHMENT_OPTIMAL - { - .srcSubpass = Types::renderpass_t::SCreationParams::SSubpassDependency::External, - .dstSubpass = 0, - .memoryBarrier = - { - // - .srcStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT, - // only write ops, reads can't be made available - .srcAccessMask = ACCESS_FLAGS::SAMPLED_READ_BIT, - // destination needs to wait as early as possible - .dstStageMask = PIPELINE_STAGE_FLAGS::EARLY_FRAGMENT_TESTS_BIT | PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, - // because of depth test needing a read and a write - .dstAccessMask = ACCESS_FLAGS::DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | ACCESS_FLAGS::DEPTH_STENCIL_ATTACHMENT_READ_BIT | ACCESS_FLAGS::COLOR_ATTACHMENT_READ_BIT | ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT - } - // leave view offsets and flags default - }, - // color from ATTACHMENT_OPTIMAL to PRESENT_SRC - { - .srcSubpass = 0, - .dstSubpass = 
Types::renderpass_t::SCreationParams::SSubpassDependency::External, - .memoryBarrier = - { - // last place where the depth can get modified - .srcStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, - // only write ops, reads can't be made available - .srcAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT, - // - .dstStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT, - // - .dstAccessMask = ACCESS_FLAGS::SAMPLED_READ_BIT - // - } - // leave view offsets and flags default - }, - Types::renderpass_t::SCreationParams::DependenciesEnd - }; - - typename Types::renderpass_t::SCreationParams params = {}; - params.colorAttachments = colorAttachments; - params.depthStencilAttachments = depthAttachments; - params.subpasses = subpasses; - params.dependencies = dependencies; - - if constexpr (withAssetConverter) - scratch.renderpass = ICPURenderpass::create(params); - - if (!scratch.renderpass) - { - logger->log("Could not create render pass!", ILogger::ELL_ERROR); - return false; - } - - return true; - } - - bool createFramebufferAttachments() - { - EXPOSE_NABLA_NAMESPACES(); - - auto createImageView = [&](smart_refctd_ptr& outView) -> smart_refctd_ptr - { - constexpr bool IS_DEPTH = isDepthOrStencilFormat(); - constexpr auto USAGE = [](const bool isDepth) - { - bitflag usage = Types::image_t::EUF_RENDER_ATTACHMENT_BIT; - - if (!isDepth) - usage |= Types::image_t::EUF_SAMPLED_BIT; - - return usage; - }(IS_DEPTH); - constexpr auto ASPECT = IS_DEPTH ? IImage::E_ASPECT_FLAGS::EAF_DEPTH_BIT : IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; - constexpr std::string_view DEBUG_NAME = IS_DEPTH ? 
"UI Scene Depth Attachment Image" : "UI Scene Color Attachment Image"; - { - smart_refctd_ptr image; - { - auto params = typename Types::image_t::SCreationParams( + { + .binding = 0, + .type = IDescriptor::E_TYPE::ET_UNIFORM_TEXEL_BUFFER, + // some geometries may not have particular attributes + .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_PARTIALLY_BOUND_BIT, + .stageFlags = IShader::E_SHADER_STAGE::ESS_VERTEX|IShader::E_SHADER_STAGE::ESS_FRAGMENT, + .count = DescriptorCount + } + }; + dsLayout = device->createDescriptorSetLayout(bindings); + if (!dsLayout) { - .type = Types::image_t::ET_2D, - .samples = Samples, - .format = format, - .extent = { FramebufferW, FramebufferH, 1u }, - .mipLevels = 1u, - .arrayLayers = 1u, - .usage = USAGE - }); - - if constexpr (withAssetConverter) - image = ICPUImage::create(params); - else - image = utilities->getLogicalDevice()->createImage(std::move(params)); + logger->log("Could not create descriptor set layout!",ILogger::ELL_ERROR); + return nullptr; + } } - if (!image) + // create Descriptor Set + auto pool = device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT,{&dsLayout.get(),1}); + init.ds = pool->createDescriptorSet(std::move(dsLayout)); + if (!init.ds) { - logger->log("Could not create image!", ILogger::ELL_ERROR); + logger->log("Could not descriptor set!",ILogger::ELL_ERROR); return nullptr; } + } - if constexpr (withAssetConverter) + // write geometries' attributes to descriptor set + { + core::vector infos; + auto allocateUTB = [device,&infos](const IGeometry::SDataView& view)->uint8_t { - auto dummyBuffer = ICPUBuffer::create({ FramebufferW * FramebufferH * getTexelOrBlockBytesize() }); - dummyBuffer->setContentHash(dummyBuffer->computeContentHash()); - - auto regions = make_refctd_dynamic_array>(1u); - auto& region = regions->front(); - - region.imageSubresource = { .aspectMask = ASPECT, .mipLevel = 0u, .baseArrayLayer = 0u, .layerCount = 0u }; - 
region.bufferOffset = 0u; - region.bufferRowLength = IImageAssetHandlerBase::calcPitchInBlocks(FramebufferW, getTexelOrBlockBytesize()); - region.bufferImageHeight = 0u; - region.imageOffset = { 0u, 0u, 0u }; - region.imageExtent = { FramebufferW, FramebufferH, 1u }; + if (!view) + return DescriptorCount; + const auto retval = infos.size(); + infos.emplace_back().desc = device->createBufferView(view.src, view.composed.format); + return retval; + }; - if (!image->setBufferAndRegions(std::move(dummyBuffer), regions)) - { - logger->log("Could not set image's regions!", ILogger::ELL_ERROR); - return nullptr; - } - image->setContentHash(image->computeContentHash()); - } - else + for (const auto& entry : namedGeoms) { - image->setObjectDebugName(DEBUG_NAME.data()); - - if (!utilities->getLogicalDevice()->allocate(image->getMemoryReqs(), image.get()).isValid()) + const auto* geom = entry.geom.get(); + // could also check device origin on all buffers + if (!geom->valid()) + continue; + auto& out = init.geoms.emplace_back(); + if (const auto& view=geom->getIndexView(); view) { - logger->log("Could not allocate memory for an image!", ILogger::ELL_ERROR); - return nullptr; + out.indexBuffer.offset = view.src.offset; + out.indexBuffer.buffer = view.src.buffer; } + out.elementCount = geom->getVertexReferenceCount(); + out.positionView = allocateUTB(geom->getPositionView()); + out.normalView = allocateUTB(geom->getNormalView()); + // the first view is usually the UV + if (const auto& auxViews = geom->getAuxAttributeViews(); !auxViews.empty()) + out.uvView = allocateUTB(auxViews.front()); } - auto params = typename Types::image_view_t::SCreationParams - ({ - .flags = Types::image_view_t::ECF_NONE, - .subUsages = USAGE, - .image = std::move(image), - .viewType = Types::image_view_t::ET_2D, - .format = format, - .subresourceRange = { .aspectMask = ASPECT, .baseMipLevel = 0u, .levelCount = 1u, .baseArrayLayer = 0u, .layerCount = 1u } - }); - - if constexpr (withAssetConverter) - 
outView = make_smart_refctd_ptr(std::move(params)); - - if (!outView) - { - logger->log("Could not create image view!", ILogger::ELL_ERROR); + if (infos.empty()) + return nullptr; + const IGPUDescriptorSet::SWriteDescriptorSet write = { + .dstSet = init.ds.get(), + .binding = 0, + .arrayElement = 0, + .count = static_cast(infos.size()), + .info = infos.data() + }; + if (!device->updateDescriptorSets({&write,1},{})) return nullptr; - } - - return smart_refctd_ptr(outView); } - }; - const bool allocated = createImageView.template operator() < ColorFboAttachmentFormat > (scratch.attachments.color) && createImageView.template operator() < DepthFboAttachmentFormat > (scratch.attachments.depth); + return smart_refctd_ptr(new CSimpleDebugRenderer(std::move(init)),dont_grab); + } + - if (!allocated) + protected: + struct SInitParams { - logger->log("Could not allocate frame buffer's attachments!", ILogger::ELL_ERROR); - return false; - } + core::smart_refctd_ptr ds; + core::vector geoms; + } m_params; - return true; - } + inline CSimpleDebugRenderer(SInitParams&& _params) : m_params(std::move(_params)) {} +}; + +#undef EXPOSE_NABLA_NAMESPACES +#if 0 +class ResourceBuilder +{ +private: bool createShaders() { @@ -852,45 +688,6 @@ class CScene final : public nbl::core::IReferenceCounted inline void record() { - EXPOSE_NABLA_NAMESPACES(); - - const struct - { - const uint32_t width, height; - } fbo = { .width = m_frameBuffer->getCreationParameters().width, .height = m_frameBuffer->getCreationParameters().height }; - - SViewport viewport; - { - viewport.minDepth = 1.f; - viewport.maxDepth = 0.f; - viewport.x = 0u; - viewport.y = 0u; - viewport.width = fbo.width; - viewport.height = fbo.height; - } - - m_commandBuffer->setViewport(0u, 1u, &viewport); - - VkRect2D scissor = {}; - scissor.offset = { 0, 0 }; - scissor.extent = { fbo.width, fbo.height }; - m_commandBuffer->setScissor(0u, 1u, &scissor); - - const VkRect2D renderArea = - { - .offset = { 0,0 }, - .extent = { fbo.width, 
fbo.height } - }; - - const IGPUCommandBuffer::SRenderpassBeginInfo info = - { - .framebuffer = m_frameBuffer.get(), - .colorClearValues = &clear.color, - .depthStencilClearValues = &clear.depth, - .renderArea = renderArea - }; - - m_commandBuffer->beginRenderPass(info, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE); const auto& [hook, meta] = resources.objects[object.meta.type]; auto* rawPipeline = hook.pipeline.get(); @@ -908,13 +705,6 @@ class CScene final : public nbl::core::IReferenceCounted } else m_commandBuffer->draw(hook.indexCount, 1, 0, 0); - - m_commandBuffer->endRenderPass(); - } - - inline void end() - { - m_commandBuffer->end(); } private: From 3a487ac9cc933b0866707611e45eb7615813adf2 Mon Sep 17 00:00:00 2001 From: devsh Date: Tue, 17 Jun 2025 23:47:08 +0200 Subject: [PATCH 249/296] make example 30 run again --- 30_ComputeShaderPathTracer/main.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/30_ComputeShaderPathTracer/main.cpp b/30_ComputeShaderPathTracer/main.cpp index ed93cf81f..201eacaf3 100644 --- a/30_ComputeShaderPathTracer/main.cpp +++ b/30_ComputeShaderPathTracer/main.cpp @@ -534,6 +534,9 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication region.imageExtent = scrambleMapCPU->getCreationParameters().extent; scrambleMapCPU->setBufferAndRegions(std::move(texelBuffer), regions); + + // programmatically user-created IPreHashed need to have their hash computed (loaders do it while loading) + scrambleMapCPU->setContentHash(scrambleMapCPU->computeContentHash()); } std::array cpuImgs = { envMapCPU.get(), scrambleMapCPU.get()}; From 8fd7f5d9e8d75a14cbb17a623f60a916355a2e90 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 18 Jun 2025 09:52:41 +0700 Subject: [PATCH 250/296] fix ex 11 fft --- 11_FFT/app_resources/shader.comp.hlsl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/11_FFT/app_resources/shader.comp.hlsl b/11_FFT/app_resources/shader.comp.hlsl index ecbf4f092..63a85b0c4 
100644 --- a/11_FFT/app_resources/shader.comp.hlsl +++ b/11_FFT/app_resources/shader.comp.hlsl @@ -14,13 +14,13 @@ uint32_t3 glsl::gl_WorkGroupSize() { return uint32_t3(uint32_t(ConstevalParamete struct SharedMemoryAccessor { - template + template void set(IndexType idx, AccessType value) { sharedmem[idx] = value; } - template + template void get(IndexType idx, NBL_REF_ARG(AccessType) value) { value = sharedmem[idx]; @@ -44,14 +44,14 @@ struct Accessor } // TODO: can't use our own BDA yet, because it doesn't support the types `workgroup::FFT` will invoke these templates with - template - void get(const uint32_t index, NBL_REF_ARG(AccessType) value) + template + void get(const IndexType index, NBL_REF_ARG(AccessType) value) { value = vk::RawBufferLoad(address + index * sizeof(AccessType)); } - template - void set(const uint32_t index, const AccessType value) + template + void set(const IndexType index, const AccessType value) { vk::RawBufferStore(address + index * sizeof(AccessType), value); } From e5d4a354946afdb08db81cea3867b08ff2bd0a4b Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 18 Jun 2025 11:01:28 +0700 Subject: [PATCH 251/296] Fix fft bloom example to use the reworked shader spec info interface --- 28_FFTBloom/main.cpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/28_FFTBloom/main.cpp b/28_FFTBloom/main.cpp index 4718a4090..b528d3c41 100644 --- a/28_FFTBloom/main.cpp +++ b/28_FFTBloom/main.cpp @@ -723,10 +723,9 @@ class FFTBloomApp final : public examples::SimpleWindowedApplication, public app params[i].layout = pipelineLayout.get(); params[i].shader.shader = shaders[i].get(); params[i].shader.entryPoint = "main"; - params[i].shader.stage = hlsl::ShaderStage::ESS_COMPUTE; // Normalization doesn't require full subgroups - params[i].shader.requireFullSubgroups = bool(2-i); - params[i].shader.requiredSubgroupSize = static_cast(hlsl::findMSB(deviceLimits.maxSubgroupSize)); + params[i].cached.requireFullSubgroups = bool(2-i); + 
params[i].shader.requiredSubgroupSize = static_cast(hlsl::findMSB(deviceLimits.maxSubgroupSize)); } smart_refctd_ptr pipelines[3]; @@ -928,9 +927,8 @@ class FFTBloomApp final : public examples::SimpleWindowedApplication, public app params[i].layout = pipelineLayout.get(); params[i].shader.shader = shaders[i].get(); params[i].shader.entryPoint = "main"; - params[i].shader.stage = hlsl::ShaderStage::ESS_COMPUTE; - params[i].shader.requiredSubgroupSize = static_cast(hlsl::findMSB(deviceLimits.maxSubgroupSize)); - params[i].shader.requireFullSubgroups = true; + params[i].shader.requiredSubgroupSize = static_cast(hlsl::findMSB(deviceLimits.maxSubgroupSize)); + params[i].cached.requireFullSubgroups = true; } smart_refctd_ptr pipelines[3]; From 8ae32e1cbc991c31da7f75d55a9958f188ebba1d Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 18 Jun 2025 12:06:09 +0700 Subject: [PATCH 252/296] removed redundant barrier --- 29_Arithmetic2Bench/main.cpp | 30 +++--------------------------- 1 file changed, 3 insertions(+), 27 deletions(-) diff --git a/29_Arithmetic2Bench/main.cpp b/29_Arithmetic2Bench/main.cpp index 945749320..e88a59cae 100644 --- a/29_Arithmetic2Bench/main.cpp +++ b/29_Arithmetic2Bench/main.cpp @@ -398,31 +398,6 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - // barrier transition to GENERAL - { - IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1]; - imageBarriers[0].barrier = { - .dep = { - .srcStageMask = PIPELINE_STAGE_FLAGS::NONE, - .srcAccessMask = ACCESS_FLAGS::NONE, - .dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, - .dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS - } - }; - imageBarriers[0].image = m_surface->getSwapchainResources()->getImage(m_currentImageAcquire.imageIndex); - imageBarriers[0].subresourceRange = { - .aspectMask = 
IImage::EAF_COLOR_BIT, - .baseMipLevel = 0u, - .levelCount = 1u, - .baseArrayLayer = 0u, - .layerCount = 1u - }; - imageBarriers[0].oldLayout = IImage::LAYOUT::UNDEFINED; - imageBarriers[0].newLayout = IImage::LAYOUT::GENERAL; - - cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers }); - } - const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize; const auto SubgroupSizeLog2 = hlsl::findMSB(MaxSubgroupSize); @@ -451,7 +426,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub .baseArrayLayer = 0u, .layerCount = 1u }; - imageBarriers[0].oldLayout = IImage::LAYOUT::GENERAL; + imageBarriers[0].oldLayout = IImage::LAYOUT::UNDEFINED; imageBarriers[0].newLayout = IImage::LAYOUT::PRESENT_SRC; cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers }); @@ -568,7 +543,8 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub const uint32_t subgroupSize = 0x1u << subgroupSizeLog2; const uint32_t workgroupSizeLog2 = hlsl::findMSB(workgroupSize); - hlsl::workgroup2::SArithmeticConfiguration wgConfig = hlsl::workgroup2::SArithmeticConfiguration::create(workgroupSizeLog2, subgroupSizeLog2, itemsPerInvoc); + hlsl::workgroup2::SArithmeticConfiguration wgConfig; + wgConfig.init(workgroupSizeLog2, subgroupSizeLog2, itemsPerInvoc); const uint32_t itemsPerWG = wgConfig.VirtualWorkgroupSize * wgConfig.ItemsPerInvocation_0; smart_refctd_ptr overriddenUnspecialized; if constexpr (WorkgroupBench) From 7121e8b266f1cc59b3d0e56db6248a84e26e26ad Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 18 Jun 2025 13:04:29 +0700 Subject: [PATCH 253/296] Fix example 05 --- .../app_resources/shader.comp.hlsl | 1 + 05_StreamingAndBufferDeviceAddressApp/main.cpp | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/05_StreamingAndBufferDeviceAddressApp/app_resources/shader.comp.hlsl b/05_StreamingAndBufferDeviceAddressApp/app_resources/shader.comp.hlsl 
index 4aeef0e0f..af38ffada 100644 --- a/05_StreamingAndBufferDeviceAddressApp/app_resources/shader.comp.hlsl +++ b/05_StreamingAndBufferDeviceAddressApp/app_resources/shader.comp.hlsl @@ -10,6 +10,7 @@ template void dummyTraitTest() {} [numthreads(WorkgroupSize,1,1)] +[shader("compute")] void main(uint32_t3 ID : SV_DispatchThreadID) { dummyTraitTest(); diff --git a/05_StreamingAndBufferDeviceAddressApp/main.cpp b/05_StreamingAndBufferDeviceAddressApp/main.cpp index c6c537363..f98e38f66 100644 --- a/05_StreamingAndBufferDeviceAddressApp/main.cpp +++ b/05_StreamingAndBufferDeviceAddressApp/main.cpp @@ -102,7 +102,8 @@ class StreamingAndBufferDeviceAddressApp final : public application_templates::M return logFail("Could not load shader!"); // lets go straight from ICPUSpecializedShader to IGPUSpecializedShader - shader = IAsset::castDown(assets[0]); + const auto shaderSource = IAsset::castDown(assets[0]); + shader = m_device->compileShader({shaderSource.get()}); // The down-cast should not fail! 
assert(shader); } From 683aa878ae5c9f252226955e240ab477524339e7 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 18 Jun 2025 14:03:41 +0700 Subject: [PATCH 254/296] use builtin bda accessor --- .../app_resources/testSubgroup.comp.hlsl | 2 +- .../app_resources/benchmarkSubgroup.comp.hlsl | 2 +- common/include/WorkgroupDataAccessors.hlsl | 16 ++++++++-------- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl index e079e5e63..3b99e5a79 100644 --- a/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl @@ -7,7 +7,7 @@ #include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" #include "shaderCommon.hlsl" -#include "nbl/builtin/hlsl/workgroup/basic.hlsl" +#include "nbl/builtin/hlsl/workgroup2/basic.hlsl" typedef vector type_t; diff --git a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl index 9141ade55..2c102c13d 100644 --- a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl +++ b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl @@ -8,7 +8,7 @@ #include "nbl/builtin/hlsl/random/xoroshiro.hlsl" #include "shaderCommon.hlsl" -#include "nbl/builtin/hlsl/workgroup/basic.hlsl" +#include "nbl/builtin/hlsl/workgroup2/basic.hlsl" typedef vector type_t; diff --git a/common/include/WorkgroupDataAccessors.hlsl b/common/include/WorkgroupDataAccessors.hlsl index a274f5c08..7287a4135 100644 --- a/common/include/WorkgroupDataAccessors.hlsl +++ b/common/include/WorkgroupDataAccessors.hlsl @@ -1,6 +1,8 @@ #ifndef _WORKGROUP_DATA_ACCESSORS_HLSL_ #define _WORKGROUP_DATA_ACCESSORS_HLSL_ +#include "nbl/builtin/hlsl/bda/legacy_bda_accessor.hlsl" + namespace nbl { namespace hlsl @@ -35,25 +37,25 @@ template struct DataProxy { using dtype_t = vector; + // function template 
AccessType should be the same as dtype_t static DataProxy create(const uint64_t inputBuf, const uint64_t outputBuf) { DataProxy retval; - retval.workgroupOffset = glsl::gl_WorkGroupID().x * VirtualWorkgroupSize; - retval.inputBufAddr = inputBuf; - retval.outputBufAddr = outputBuf; + const uint32_t workgroupOffset = glsl::gl_WorkGroupID().x * VirtualWorkgroupSize * sizeof(dtype_t); + retval.accessor = DoubleLegacyBdaAccessor::create(inputBuf + workgroupOffset, outputBuf + workgroupOffset); return retval; } template void get(const IndexType ix, NBL_REF_ARG(AccessType) value) { - value = vk::RawBufferLoad(inputBufAddr + (workgroupOffset + ix) * sizeof(AccessType)); + accessor.get(ix, value); } template void set(const IndexType ix, const AccessType value) { - vk::RawBufferStore(outputBufAddr + (workgroupOffset + ix) * sizeof(AccessType), value, sizeof(uint32_t)); + accessor.set(ix, value); } void workgroupExecutionAndMemoryBarrier() @@ -62,9 +64,7 @@ struct DataProxy //glsl::memoryBarrierShared(); implied by the above } - uint32_t workgroupOffset; - uint64_t inputBufAddr; - uint64_t outputBufAddr; + DoubleLegacyBdaAccessor accessor; }; template From 1c56eb05453fb7c3ba3c03c7e8d130279c8f4873 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 18 Jun 2025 14:33:30 +0700 Subject: [PATCH 255/296] reduce workgroup macro definitions, use config string --- .../app_resources/shaderCommon.hlsl | 11 ----- .../app_resources/testWorkgroup.comp.hlsl | 8 ++-- 23_Arithmetic2UnitTest/main.cpp | 44 +++++++++---------- 3 files changed, 24 insertions(+), 39 deletions(-) diff --git a/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl b/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl index 6b9575ccd..3793b08f8 100644 --- a/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl @@ -3,13 +3,6 @@ using namespace nbl; using namespace hlsl; -// https://github.com/microsoft/DirectXShaderCompiler/issues/6144 -uint32_t3 
nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);} - -#ifndef ITEMS_PER_INVOCATION -#error "Define ITEMS_PER_INVOCATION!" -#endif - [[vk::push_constant]] PushConstantData pc; struct device_capabilities @@ -24,7 +17,3 @@ struct device_capabilities #ifndef OPERATION #error "Define OPERATION!" #endif - -#ifndef SUBGROUP_SIZE_LOG2 -#error "Define SUBGROUP_SIZE_LOG2!" -#endif diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl index 38e8b250f..2a32ed20e 100644 --- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl @@ -5,12 +5,10 @@ #include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" #include "nbl/builtin/hlsl/workgroup2/arithmetic.hlsl" -static const uint32_t WORKGROUP_SIZE = 1u << WORKGROUP_SIZE_LOG2; +using config_t = WORKGROUP_CONFIG_T; #include "shaderCommon.hlsl" -using config_t = workgroup2::ArithmeticConfiguration; - typedef vector type_t; // final (level 1/2) scan needs to fit in one subgroup exactly @@ -52,7 +50,7 @@ struct operation_t template static void subtest() { - assert(glsl::gl_SubgroupSize() == 1u< func; func(); @@ -69,7 +67,7 @@ void test() subtest >(); } -[numthreads(WORKGROUP_SIZE,1,1)] +[numthreads(config_t::WorkgroupSize,1,1)] void main() { test(); diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp index 6c979d7e5..51847e710 100644 --- a/23_Arithmetic2UnitTest/main.cpp +++ b/23_Arithmetic2UnitTest/main.cpp @@ -186,7 +186,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu for (auto subgroupSize = MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u) { const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize); - for (uint32_t workgroupSize = subgroupSize; workgroupSize <= MaxWorkgroupSize; workgroupSize *= 2u) + for (uint32_t workgroupSize = 64; workgroupSize <= 
MaxWorkgroupSize; workgroupSize *= 2u) { // make sure renderdoc captures everything for debugging m_api->startCapture(); @@ -198,14 +198,15 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu uint32_t itemsPerWG = workgroupSize * itemsPerInvocation; m_logger->log("Testing Items per Invocation %u", ILogger::ELL_INFO, itemsPerInvocation); bool passed = true; - passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed; - logTestOutcome(passed, itemsPerWG); - passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed; - logTestOutcome(passed, itemsPerWG); - passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed; - logTestOutcome(passed, itemsPerWG); - - hlsl::workgroup2::SArithmeticConfiguration wgConfig = hlsl::workgroup2::SArithmeticConfiguration::create(hlsl::findMSB(workgroupSize), subgroupSizeLog2, itemsPerInvocation); + //passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed; + //logTestOutcome(passed, itemsPerWG); + //passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed; + //logTestOutcome(passed, itemsPerWG); + //passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed; + //logTestOutcome(passed, itemsPerWG); + + hlsl::workgroup2::SArithmeticConfiguration wgConfig; + wgConfig.init(hlsl::findMSB(workgroupSize), subgroupSizeLog2, itemsPerInvocation); itemsPerWG = wgConfig.VirtualWorkgroupSize * wgConfig.ItemsPerInvocation_0; m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG); passed = 
runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed; @@ -306,28 +307,25 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu smart_refctd_ptr overriddenUnspecialized; if constexpr (WorkgroupTest) { - const std::string definitions[6] = { + hlsl::workgroup2::SArithmeticConfiguration wgConfig; + wgConfig.init(hlsl::findMSB(workgroupSize), subgroupSizeLog2, itemsPerInvoc); + + const std::string definitions[3] = { "workgroup2::" + arith_name, - std::to_string(workgroupSizeLog2), - std::to_string(itemsPerWG), - std::to_string(itemsPerInvoc), - std::to_string(subgroupSizeLog2), + wgConfig.getConfigTemplateStructString(), std::to_string(arith_name=="reduction") }; - const IShaderCompiler::SMacroDefinition defines[7] = { + const IShaderCompiler::SMacroDefinition defines[4] = { { "OPERATION", definitions[0] }, - { "WORKGROUP_SIZE_LOG2", definitions[1] }, - { "ITEMS_PER_WG", definitions[2] }, - { "ITEMS_PER_INVOCATION", definitions[3] }, - { "SUBGROUP_SIZE_LOG2", definitions[4] }, - { "IS_REDUCTION", definitions[5] }, + { "WORKGROUP_CONFIG_T", definitions[1] }, + { "IS_REDUCTION", definitions[2] }, { "TEST_NATIVE", "1" } }; if (useNative) - options.preprocessorOptions.extraDefines = { defines, defines + 7 }; + options.preprocessorOptions.extraDefines = { defines, defines + 4 }; else - options.preprocessorOptions.extraDefines = { defines, defines + 6 }; + options.preprocessorOptions.extraDefines = { defines, defines + 3 }; overriddenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options); } @@ -358,7 +356,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu auto pipeline = createPipeline(overriddenUnspecialized.get(),subgroupSizeLog2); // TODO: overlap dispatches with memory readbacks (requires multiple copies of `buffers`) - uint32_t workgroupCount = min(elementCount / itemsPerWG, 
m_physicalDevice->getLimits().maxComputeWorkGroupCount[0]); + uint32_t workgroupCount = 1;// min(elementCount / itemsPerWG, m_physicalDevice->getLimits().maxComputeWorkGroupCount[0]); cmdbuf->begin(IGPUCommandBuffer::USAGE::NONE); cmdbuf->bindComputePipeline(pipeline.get()); From 3e910b55890aac6ccc3fcd9e1c43b5f5ee84b0df Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 18 Jun 2025 15:16:51 +0700 Subject: [PATCH 256/296] similar config string thing but for subgroups --- .../app_resources/testSubgroup.comp.hlsl | 29 +++++++++-------- 23_Arithmetic2UnitTest/main.cpp | 32 ++++++++++--------- 2 files changed, 32 insertions(+), 29 deletions(-) diff --git a/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl index 3b99e5a79..3105aec56 100644 --- a/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl @@ -5,28 +5,29 @@ #include "nbl/builtin/hlsl/glsl_compat/core.hlsl" #include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl" #include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" +#include "nbl/builtin/hlsl/subgroup2/arithmetic_params.hlsl" #include "shaderCommon.hlsl" #include "nbl/builtin/hlsl/workgroup2/basic.hlsl" -typedef vector type_t; +template +using params_t = SUBGROUP_CONFIG_T; + +typedef vector::base_t, device_capabilities>::ItemsPerInvocation> type_t; uint32_t globalIndex() { return glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+workgroup::SubgroupContiguousIndex(); } -template +template static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) { - using config_t = subgroup2::Configuration; - using params_t = subgroup2::ArithmeticParams; - const uint64_t outputBufAddr = pc.pOutputBuf[Binop::BindingIndex]; - assert(glsl::gl_SubgroupSize() == 1u<::config_t::Size) - operation_t func; + operation_t > func; type_t val = func(sourceVal); vk::RawBufferStore(outputBufAddr + sizeof(type_t) * globalIndex(), val, 
sizeof(uint32_t)); @@ -37,13 +38,13 @@ type_t test() const uint32_t idx = globalIndex(); type_t sourceVal = vk::RawBufferLoad(pc.pInputBuf + idx * sizeof(type_t)); - subtest, ITEMS_PER_INVOCATION>(sourceVal); - subtest, ITEMS_PER_INVOCATION>(sourceVal); - subtest, ITEMS_PER_INVOCATION>(sourceVal); - subtest, ITEMS_PER_INVOCATION>(sourceVal); - subtest, ITEMS_PER_INVOCATION>(sourceVal); - subtest, ITEMS_PER_INVOCATION>(sourceVal); - subtest, ITEMS_PER_INVOCATION>(sourceVal); + subtest >(sourceVal); + subtest >(sourceVal); + subtest >(sourceVal); + subtest >(sourceVal); + subtest >(sourceVal); + subtest >(sourceVal); + subtest >(sourceVal); return sourceVal; } diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp index 51847e710..65ef126ad 100644 --- a/23_Arithmetic2UnitTest/main.cpp +++ b/23_Arithmetic2UnitTest/main.cpp @@ -2,6 +2,7 @@ #include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" #include "app_resources/common.hlsl" #include "nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl" +#include "nbl/builtin/hlsl/subgroup2/arithmetic_params.hlsl" using namespace nbl; using namespace core; @@ -186,7 +187,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu for (auto subgroupSize = MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u) { const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize); - for (uint32_t workgroupSize = 64; workgroupSize <= MaxWorkgroupSize; workgroupSize *= 2u) + for (uint32_t workgroupSize = subgroupSize; workgroupSize <= MaxWorkgroupSize; workgroupSize *= 2u) { // make sure renderdoc captures everything for debugging m_api->startCapture(); @@ -198,12 +199,12 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu uint32_t itemsPerWG = workgroupSize * itemsPerInvocation; m_logger->log("Testing Items per Invocation %u", ILogger::ELL_INFO, itemsPerInvocation); bool passed = true; - //passed = 
runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed; - //logTestOutcome(passed, itemsPerWG); - //passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed; - //logTestOutcome(passed, itemsPerWG); - //passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed; - //logTestOutcome(passed, itemsPerWG); + passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed; + logTestOutcome(passed, itemsPerWG); + passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed; + logTestOutcome(passed, itemsPerWG); + passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed; + logTestOutcome(passed, itemsPerWG); hlsl::workgroup2::SArithmeticConfiguration wgConfig; wgConfig.init(hlsl::findMSB(workgroupSize), subgroupSizeLog2, itemsPerInvocation); @@ -331,24 +332,25 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu } else { - const std::string definitions[4] = { + hlsl::subgroup2::SArithmeticParams sgParams; + sgParams.init(subgroupSizeLog2, itemsPerInvoc); + + const std::string definitions[3] = { "subgroup2::" + arith_name, std::to_string(workgroupSize), - std::to_string(itemsPerInvoc), - std::to_string(subgroupSizeLog2) + sgParams.getParamTemplateStructString() }; - const IShaderCompiler::SMacroDefinition defines[5] = { + const IShaderCompiler::SMacroDefinition defines[4] = { { "OPERATION", definitions[0] }, { "WORKGROUP_SIZE", definitions[1] }, - { "ITEMS_PER_INVOCATION", definitions[2] }, - { "SUBGROUP_SIZE_LOG2", definitions[3] }, + { "SUBGROUP_CONFIG_T", 
definitions[2] }, { "TEST_NATIVE", "1" } }; if (useNative) - options.preprocessorOptions.extraDefines = { defines, defines + 5 }; - else options.preprocessorOptions.extraDefines = { defines, defines + 4 }; + else + options.preprocessorOptions.extraDefines = { defines, defines + 3 }; overriddenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options); } From c446d7ef3cd2d6773fe2bf261d6d867341ebeff1 Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 18 Jun 2025 10:18:01 +0200 Subject: [PATCH 257/296] prep for the final stretch in ex 09 (shaders) --- 09_GeometryCreator/main.cpp | 71 ++--- .../geometry/CGeometryCreatorScene.hpp | 301 +++++++----------- .../nbl/examples/geometry/SPushConstants.hlsl | 10 +- 3 files changed, 142 insertions(+), 240 deletions(-) diff --git a/09_GeometryCreator/main.cpp b/09_GeometryCreator/main.cpp index a98dcee5b..5bbe40f37 100644 --- a/09_GeometryCreator/main.cpp +++ b/09_GeometryCreator/main.cpp @@ -44,7 +44,7 @@ class GeometryCreatorApp final : public MonoWindowApplication patch.positionBufferUsages = usage_f::EUF_UNIFORM_TEXEL_BUFFER_BIT; patch.indexBufferUsages = usage_f::EUF_INDEX_BUFFER_BIT; patch.otherBufferUsages = usage_f::EUF_UNIFORM_TEXEL_BUFFER_BIT; - auto scene = CGeometryCreatorScene::create( + m_scene = CGeometryCreatorScene::create( { .transferQueue = getTransferUpQueue(), .utilities = m_utils.get(), @@ -54,7 +54,9 @@ class GeometryCreatorApp final : public MonoWindowApplication ); auto scRes = static_cast(m_surface->getSwapchainResources()); - auto renderer = CSimpleDebugRenderer::create(scRes->getRenderpass(),0,scene.get()); + m_renderer = CSimpleDebugRenderer::create(scRes->getRenderpass(),0,m_scene.get()); + if (!m_renderer) + return logFail("Could not create Renderer!"); // camera { @@ -84,30 +86,8 @@ class GeometryCreatorApp final : public MonoWindowApplication mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void { camera.mouseProcess(events); 
mouseProcess(events); }, m_logger.get()); keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void { camera.keyboardProcess(events); }, m_logger.get()); camera.endInputProcessing(nextPresentationTimestamp); -#if 0 - const auto type = static_cast(gcIndex); - const auto& [gpu, meta] = resources.objects[type]; - - object.meta.type = type; - object.meta.name = meta.name; -#endif } - const auto viewMatrix = camera.getViewMatrix(); - const auto viewProjectionMatrix = camera.getConcatenatedMatrix(); -#if 0 - SBasicViewParameters uboData; - memcpy(uboData.MVP, modelViewProjectionMatrix.pointer(), sizeof(uboData.MVP)); - memcpy(uboData.MV, modelViewMatrix.pointer(), sizeof(uboData.MV)); - memcpy(uboData.NormalMat, normalMatrix.pointer(), sizeof(uboData.NormalMat)); - { - SBufferRange range; - range.buffer = core::smart_refctd_ptr(resources.ubo.buffer); - range.size = resources.ubo.buffer->getSize(); - - cb->updateBuffer(range, &uboData); - } -#endif auto* queue = getGraphicsQueue(); asset::SViewport viewport; @@ -148,24 +128,17 @@ class GeometryCreatorApp final : public MonoWindowApplication cb->beginRenderPass(info, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE); } -#if 0 - const auto& [hook, meta] = resources.objects[object.meta.type]; - auto* rawPipeline = hook.pipeline.get(); - - SBufferBinding vertex = hook.bindings.vertex, index = hook.bindings.index; - cb->bindGraphicsPipeline(rawPipeline); - cb->bindDescriptorSets(EPBP_GRAPHICS, rawPipeline->getLayout(), 1, 1, &resources.descriptorSet.get()); - cb->bindVertexBuffers(0, 1, &vertex); - - if (index.buffer && hook.indexType != EIT_UNKNOWN) + float32_t3x4 viewMatrix; + float32_t4x4 viewProjMatrix; + // TODO: get rid of legacy matrices { - cb->bindIndexBuffer(index, hook.indexType); - cb->drawIndexed(hook.indexCount, 1, 0, 0, 0); + memcpy(&viewMatrix,camera.getViewMatrix().pointer(),sizeof(viewMatrix)); + memcpy(&viewProjMatrix,camera.getConcatenatedMatrix().pointer(),sizeof(viewMatrix)); } - else - 
cb->draw(hook.indexCount, 1, 0, 0); -#endif + const auto viewParams = CSimpleDebugRenderer::SViewParams(viewMatrix,viewProjMatrix); + m_renderer->render(cb,viewParams); + cb->endRenderPass(); cb->end(); @@ -199,7 +172,9 @@ class GeometryCreatorApp final : public MonoWindowApplication std::string caption = "[Nabla Engine] Geometry Creator"; { -// caption += ", displaying [" + std::string(object.meta.name.data()) + "]"; + caption += ", displaying [" + + caption += m_scene->getGeometries()[gcIndex].name; + caption += "]"; m_window->setCaption(caption); } return retval; @@ -246,17 +221,20 @@ class GeometryCreatorApp final : public MonoWindowApplication } private: + // + smart_refctd_ptr m_scene; + smart_refctd_ptr m_renderer; + // smart_refctd_ptr m_semaphore; uint64_t m_realFrameIx = 0; std::array,base_t::MaxFramesInFlight> m_cmdBufs; - + // InputSystem::ChannelReader mouse; InputSystem::ChannelReader keyboard; + // Camera camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD()); -// ResourcesBundle resources; -// ObjectDrawHookCpu object; uint16_t gcIndex = {}; void mouseProcess(const nbl::ui::IMouseEventChannel::range_t& events) @@ -265,8 +243,11 @@ class GeometryCreatorApp final : public MonoWindowApplication { auto ev = *eventIt; - if (ev.type == nbl::ui::SMouseEvent::EET_SCROLL) - gcIndex = std::clamp(int16_t(gcIndex) + int16_t(core::sign(ev.scrollEvent.verticalScroll)), int64_t(0), int64_t(CGeometryCreatorScene::OT_COUNT - (uint8_t)1u)); + if (ev.type==nbl::ui::SMouseEvent::EET_SCROLL && m_renderer) + { + gcIndex += int16_t(core::sign(ev.scrollEvent.verticalScroll)); + gcIndex = core::clamp(gcIndex,0ull,m_renderer->getInitParams().geoms.size()); + } } } }; diff --git a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp index 1f8d1ac6a..74b5d02d8 100644 --- a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp +++ 
b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp @@ -3,15 +3,8 @@ #include -#include "nbl/builtin/hlsl/math/linalg/fast_affine.hlsl" #include "nbl/asset/utils/CGeometryCreator.h" -#include "nbl/examples/geometry/SPushConstants.hlsl" - -// TODO: Arek bring back -//#include "nbl/examples/geometry/spirv/builtin/CArchive.h" -//#include "nbl/examples/geometry/spirv/builtin/builtinResources.h" - namespace nbl::examples { @@ -201,39 +194,84 @@ class CGeometryCreatorScene : public core::IReferenceCounted core::vector m_geometries; }; +} +//! + + +#include "nbl/builtin/hlsl/math/linalg/fast_affine.hlsl" +#include "nbl/examples/geometry/SPushConstants.hlsl" + +// TODO: Arek bring back +//#include "nbl/examples/geometry/spirv/builtin/CArchive.h" +//#include "nbl/examples/geometry/spirv/builtin/builtinResources.h" + + +namespace nbl::examples +{ + class CSimpleDebugRenderer final : public core::IReferenceCounted { public: // constexpr static inline auto DescriptorCount = 255; // - using SPushConstants = hlsl::examples::geometry_creator_scene::SPushConstants; - struct SPackedGeometry + struct SViewParams { - inline SPushConstants convert(const hlsl::float32_t3x4& model, const hlsl::float32_t3x4& view, const hlsl::float32_t4x4& viewProj) + inline SViewParams(const hlsl::float32_t3x4& _view, const hlsl::float32_t4x4& _viewProj) { - using namespace hlsl; - return { - .basic = { - .MVP = math::linalg::promoted_mul(viewProj,model), - .MV = math::linalg::promoted_mul(view,model), - .normalMat = inverse(transpose(float32_t3x3(view))) - }, - .positionView = positionView, - .normalView = normalView, - .uvView = uvView + view = _view; + viewProj = _viewProj; + using namespace nbl::hlsl; + normal = transpose(inverse(float32_t3x3(view))); + } + + inline auto computeForInstance(hlsl::float32_t3x4 world) const + { + using namespace nbl::hlsl; + hlsl::examples::geometry_creator_scene::SInstanceMatrices retval = { + .worldViewProj = 
float32_t4x4(math::linalg::promoted_mul(float64_t4x4(viewProj),float64_t3x4(world))) }; + const auto sub3x3 = mul(float64_t3x3(viewProj),float64_t3x3(world)); + retval.normal = float32_t3x3(transpose(inverse(sub3x3))); + return retval; } + hlsl::float32_t3x4 view; + hlsl::float32_t4x4 viewProj; + hlsl::float32_t3x3 normal; + }; + // + struct SPackedGeometry + { + core::smart_refctd_ptr pipeline = {}; asset::SBufferBinding indexBuffer = {}; uint32_t elementCount = 0; // indices into the descriptor set uint8_t positionView = 0; uint8_t normalView = 0; uint8_t uvView = 0; - uint8_t indexType = asset::EIT_UNKNOWN; + asset::E_INDEX_TYPE indexType = asset::EIT_UNKNOWN; }; + // + struct SInstance + { + using SPushConstants = hlsl::examples::geometry_creator_scene::SPushConstants; + inline SPushConstants computePushConstants(const SViewParams& viewParams) const + { + using namespace hlsl; + return { + .matrices = viewParams.computeForInstance(world), + .positionView = packedGeo->positionView, + .normalView = packedGeo->normalView, + .uvView = packedGeo->uvView + }; + } + hlsl::float32_t3x4 world; + const SPackedGeometry* packedGeo; + }; + + // static inline core::smart_refctd_ptr create(video::IGPURenderpass* renderpass, const uint32_t subpassIX, const CGeometryCreatorScene* scene) { EXPOSE_NABLA_NAMESPACES; @@ -249,8 +287,6 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted if (namedGeoms.empty()) return nullptr; - // TODO: Load Shaders and Create Pipelines - SInitParams init; // create descriptor set @@ -287,6 +323,19 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted } } + // + const SPushConstantRange ranges[] = {{ + .stageFlags = hlsl::ShaderStage::ESS_VERTEX, + .offset = 0, + .size = sizeof(SInstance::SPushConstants), + }}; + init.layout = device->createPipelineLayout(ranges,smart_refctd_ptr(init.ds->getLayout())); + + // TODO: Load Shaders and Create Pipelines + { + // + } + // write geometries' attributes to descriptor set { 
core::vector infos; @@ -335,15 +384,48 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted return smart_refctd_ptr(new CSimpleDebugRenderer(std::move(init)),dont_grab); } - - protected: + // struct SInitParams { core::smart_refctd_ptr ds; + core::smart_refctd_ptr layout; core::vector geoms; - } m_params; + }; + inline const SInitParams& getInitParams() const {return m_params;} + // + inline void render(video::IGPUCommandBuffer* cmdbuf, const SViewParams& viewParams) const + { + EXPOSE_NABLA_NAMESPACES; + + cmdbuf->beginDebugMarker("CSimpleDebugRenderer::render"); + + const auto* layout = m_params.layout.get(); + cmdbuf->bindDescriptorSets(E_PIPELINE_BIND_POINT::EPBP_GRAPHICS,layout,0,1,&m_params.ds.get()); + + for (const auto& instance : m_instances) + { + const auto* geo = instance.packedGeo; + cmdbuf->bindGraphicsPipeline(geo->pipeline.get()); + const auto pc = instance.computePushConstants(viewParams); + cmdbuf->pushConstants(layout,hlsl::ShaderStage::ESS_VERTEX,0,sizeof(pc),&pc); + if (geo->indexBuffer) + { + cmdbuf->bindIndexBuffer(geo->indexBuffer,geo->indexType); + cmdbuf->drawIndexed(geo->elementCount,1,0,0,0); + } + else + cmdbuf->draw(geo->elementCount,1,0,0); + } + cmdbuf->endDebugMarker(); + } + + core::vector m_instances; + + protected: inline CSimpleDebugRenderer(SInitParams&& _params) : m_params(std::move(_params)) {} + + SInitParams m_params; }; #undef EXPOSE_NABLA_NAMESPACES @@ -586,182 +668,15 @@ class ResourceBuilder GP_COUNT }; - struct ReferenceObjectCpu - { - ObjectMeta meta; - GeometryShader shadersType; - nbl::asset::CGeometryCreator::return_type data; - }; - GeometriesCpu(const nbl::asset::IGeometryCreator* _gc) - : gc(_gc), - objects - ({ - ReferenceObjectCpu {.meta = {.type = OT_CUBE, .name = "Cube Mesh" }, .shadersType = GP_BASIC, .data = gc->createCubeMesh(nbl::core::vector3df(1.f, 1.f, 1.f)) }, - ReferenceObjectCpu {.meta = {.type = OT_SPHERE, .name = "Sphere Mesh" }, .shadersType = GP_BASIC, .data = 
gc->createSphereMesh(2, 16, 16) }, - ReferenceObjectCpu {.meta = {.type = OT_CYLINDER, .name = "Cylinder Mesh" }, .shadersType = GP_BASIC, .data = gc->createCylinderMesh(2, 2, 20) }, - ReferenceObjectCpu {.meta = {.type = OT_RECTANGLE, .name = "Rectangle Mesh" }, .shadersType = GP_BASIC, .data = gc->createRectangleMesh(nbl::core::vector2df_SIMD(1.5, 3)) }, - ReferenceObjectCpu {.meta = {.type = OT_DISK, .name = "Disk Mesh" }, .shadersType = GP_BASIC, .data = gc->createDiskMesh(2, 30) }, - ReferenceObjectCpu {.meta = {.type = OT_ARROW, .name = "Arrow Mesh" }, .shadersType = GP_BASIC, .data = gc->createArrowMesh() }, - ReferenceObjectCpu {.meta = {.type = OT_CONE, .name = "Cone Mesh" }, .shadersType = GP_CONE, .data = gc->createConeMesh(2, 3, 10) }, - ReferenceObjectCpu {.meta = {.type = OT_ICOSPHERE, .name = "Icoshpere Mesh" }, .shadersType = GP_ICO, .data = gc->createIcoSphere(1, 3, true) } - }) - { - gc = nullptr; // one shot - } - - private: - const nbl::asset::IGeometryCreator* gc; - - public: - const std::array objects; }; - using resources_bundle_base_t = ResourcesBundleBase; - - struct ResourcesBundleScratch : public resources_bundle_base_t - { - using Types = resources_bundle_base_t::Types; - - ResourcesBundleScratch() - : resources_bundle_base_t() {} - struct Shaders { nbl::core::smart_refctd_ptr vertex = nullptr, fragment = nullptr; }; - nbl::core::smart_refctd_ptr descriptorSetLayout; - nbl::core::smart_refctd_ptr pipelineLayout; std::array shaders; - }; - - // TODO: we could make those params templated with default values like below - static constexpr auto FramebufferW = 1280u, FramebufferH = 720u; - static constexpr auto ColorFboAttachmentFormat = nbl::asset::EF_R8G8B8A8_SRGB, DepthFboAttachmentFormat = nbl::asset::EF_D16_UNORM; - static constexpr auto Samples = nbl::video::IGPUImage::ESCF_1_BIT; - - ResourcesBundleScratch scratch; - - GeometriesCpu geometries; -}; - -#undef TYPES_IMPL_BOILERPLATE - -struct ObjectDrawHookCpu -{ - 
nbl::core::matrix3x4SIMD model; - nbl::asset::SBasicViewParameters viewParameters; - ObjectMeta meta; -}; - -/* - Rendering to offline framebuffer which we don't present, color - scene attachment texture we use for second UI renderpass - sampling it & rendering into desired GUI area. - - The scene can be created from simple geometry - using our Geomtry Creator class. -*/ - -class CScene final : public nbl::core::IReferenceCounted -{ -public: - ObjectDrawHookCpu object; // TODO: this could be a vector (to not complicate the example I leave it single object), we would need a better system for drawing then to make only 1 max 2 indirect draw calls (indexed and not indexed objects) - - struct - { - const uint32_t startedValue = 0, finishedValue = 0x45; - nbl::core::smart_refctd_ptr progress; - } semaphore; - - inline void begin() - { - EXPOSE_NABLA_NAMESPACES(); - - m_commandBuffer->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); - m_commandBuffer->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - m_commandBuffer->beginDebugMarker("UISampleApp Offline Scene Frame"); - - semaphore.progress = m_utilities->getLogicalDevice()->createSemaphore(semaphore.startedValue); - } - - inline void record() - { - - const auto& [hook, meta] = resources.objects[object.meta.type]; - auto* rawPipeline = hook.pipeline.get(); - - SBufferBinding vertex = hook.bindings.vertex, index = hook.bindings.index; - - m_commandBuffer->bindGraphicsPipeline(rawPipeline); - m_commandBuffer->bindDescriptorSets(EPBP_GRAPHICS, rawPipeline->getLayout(), 1, 1, &resources.descriptorSet.get()); - m_commandBuffer->bindVertexBuffers(0, 1, &vertex); - - if (index.buffer && hook.indexType != EIT_UNKNOWN) - { - m_commandBuffer->bindIndexBuffer(index, hook.indexType); - m_commandBuffer->drawIndexed(hook.indexCount, 1, 0, 0, 0); - } - else - m_commandBuffer->draw(hook.indexCount, 1, 0, 0); - } - -private: - template // TODO: enforce constraints, only those 2 above are valid - 
CScene(nbl::core::smart_refctd_ptr _utilities, nbl::core::smart_refctd_ptr _logger, nbl::video::CThreadSafeQueueAdapter* _graphicsQueue, const nbl::asset::IGeometryCreator* _geometryCreator, CreateWith createWith = {}) - : m_utilities(nbl::core::smart_refctd_ptr(_utilities)), m_logger(nbl::core::smart_refctd_ptr(_logger)), queue(_graphicsQueue) - { - EXPOSE_NABLA_NAMESPACES(); - using Builder = typename CreateWith::Builder; - - m_commandBuffer = createCommandBuffer(m_utilities->getLogicalDevice(), m_utilities->getLogger(), queue->getFamilyIndex()); - Builder builder(m_utilities.get(), m_commandBuffer.get(), m_logger.get(), _geometryCreator); - - // gpu resources - if (builder.build()) - { - if (!builder.finalize(resources, queue)) - m_logger->log("Could not finalize resource objects to gpu objects!", ILogger::ELL_ERROR); - } - else - m_logger->log("Could not build resource objects!", ILogger::ELL_ERROR); - - // frame buffer - { - const auto extent = resources.attachments.color->getCreationParameters().image->getCreationParameters().extent; - - IGPUFramebuffer::SCreationParams params = - { - { - .renderpass = smart_refctd_ptr(resources.renderpass), - .depthStencilAttachments = &resources.attachments.depth.get(), - .colorAttachments = &resources.attachments.color.get(), - .width = extent.width, - .height = extent.height, - .layers = 1u - } - }; - - m_frameBuffer = m_utilities->getLogicalDevice()->createFramebuffer(std::move(params)); - - if (!m_frameBuffer) - { - m_logger->log("Could not create frame buffer!", ILogger::ELL_ERROR); - return; - } - } - } - - nbl::core::smart_refctd_ptr m_utilities; - nbl::core::smart_refctd_ptr m_logger; - - nbl::video::CThreadSafeQueueAdapter* queue; - nbl::core::smart_refctd_ptr m_commandBuffer; - - nbl::core::smart_refctd_ptr m_frameBuffer; - - ResourcesBundle resources; }; #endif diff --git a/common/include/nbl/examples/geometry/SPushConstants.hlsl b/common/include/nbl/examples/geometry/SPushConstants.hlsl index 
f02ddea12..2048f1f3f 100644 --- a/common/include/nbl/examples/geometry/SPushConstants.hlsl +++ b/common/include/nbl/examples/geometry/SPushConstants.hlsl @@ -2,7 +2,7 @@ #define _NBL_EXAMPLES_S_PUSH_CONSTANTS_HLSL_ -#include "nbl/examples/common/SBasicViewParameters.hlsl" +#include "nbl/builtin/hlsl/cpp_compat.hlsl" namespace nbl @@ -14,9 +14,15 @@ namespace examples namespace geometry_creator_scene { +struct SInstanceMatrices +{ + float32_t4x4 worldViewProj; + float32_t3x3 normal; +}; + struct SPushConstants { - SBasicViewParameters basic; + SInstanceMatrices matrices; uint32_t positionView : 11; uint32_t normalView : 10; uint32_t uvView : 11; From e76bfcc4f642c3c5f01f5b6fecfaa737307f1ea1 Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 18 Jun 2025 10:22:52 +0200 Subject: [PATCH 258/296] move ex09 renderer to its own include --- common/include/nbl/examples/PCH.hpp | 1 + .../geometry/CGeometryCreatorScene.hpp | 511 +----------------- .../geometry/CSimpleDebugRenderer.hpp | 493 +++++++++++++++++ 3 files changed, 499 insertions(+), 506 deletions(-) create mode 100644 common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp diff --git a/common/include/nbl/examples/PCH.hpp b/common/include/nbl/examples/PCH.hpp index 179c9f037..ed5da666e 100644 --- a/common/include/nbl/examples/PCH.hpp +++ b/common/include/nbl/examples/PCH.hpp @@ -18,6 +18,7 @@ #include "nbl/examples/cameras/CCamera.hpp" #include "nbl/examples/geometry/CGeometryCreatorScene.hpp" +#include "nbl/examples/geometry/CSimpleDebugRenderer.hpp" #endif // _NBL_EXAMPLES_COMMON_PCH_HPP_ \ No newline at end of file diff --git a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp index 74b5d02d8..8a73f2e14 100644 --- a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp +++ b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp @@ -9,30 +9,14 @@ namespace nbl::examples { -#define EXPOSE_NABLA_NAMESPACES using 
namespace nbl::core; \ -using namespace nbl::system; \ -using namespace nbl::asset; \ -using namespace nbl::video - class CGeometryCreatorScene : public core::IReferenceCounted { +#define EXPOSE_NABLA_NAMESPACES \ + using namespace nbl::core; \ + using namespace nbl::system; \ + using namespace nbl::asset; \ + using namespace nbl::video public: - // - enum ObjectType : uint8_t - { - OT_CUBE, - OT_SPHERE, - OT_CYLINDER, - OT_RECTANGLE, - OT_DISK, - OT_ARROW, - OT_CONE, - OT_ICOSPHERE, - - OT_COUNT, - OT_UNKNOWN = OT_COUNT - }; - // struct SCreateParams { @@ -192,493 +176,8 @@ class CGeometryCreatorScene : public core::IReferenceCounted inline CGeometryCreatorScene(core::vector&& _geometries) : m_geometries(std::move(_geometries)) {} core::vector m_geometries; -}; - -} -//! - - -#include "nbl/builtin/hlsl/math/linalg/fast_affine.hlsl" -#include "nbl/examples/geometry/SPushConstants.hlsl" - -// TODO: Arek bring back -//#include "nbl/examples/geometry/spirv/builtin/CArchive.h" -//#include "nbl/examples/geometry/spirv/builtin/builtinResources.h" - - -namespace nbl::examples -{ - -class CSimpleDebugRenderer final : public core::IReferenceCounted -{ - public: - // - constexpr static inline auto DescriptorCount = 255; - // - struct SViewParams - { - inline SViewParams(const hlsl::float32_t3x4& _view, const hlsl::float32_t4x4& _viewProj) - { - view = _view; - viewProj = _viewProj; - using namespace nbl::hlsl; - normal = transpose(inverse(float32_t3x3(view))); - } - - inline auto computeForInstance(hlsl::float32_t3x4 world) const - { - using namespace nbl::hlsl; - hlsl::examples::geometry_creator_scene::SInstanceMatrices retval = { - .worldViewProj = float32_t4x4(math::linalg::promoted_mul(float64_t4x4(viewProj),float64_t3x4(world))) - }; - const auto sub3x3 = mul(float64_t3x3(viewProj),float64_t3x3(world)); - retval.normal = float32_t3x3(transpose(inverse(sub3x3))); - return retval; - } - - hlsl::float32_t3x4 view; - hlsl::float32_t4x4 viewProj; - hlsl::float32_t3x3 normal; 
- }; - // - struct SPackedGeometry - { - core::smart_refctd_ptr pipeline = {}; - asset::SBufferBinding indexBuffer = {}; - uint32_t elementCount = 0; - // indices into the descriptor set - uint8_t positionView = 0; - uint8_t normalView = 0; - uint8_t uvView = 0; - asset::E_INDEX_TYPE indexType = asset::EIT_UNKNOWN; - }; - // - struct SInstance - { - using SPushConstants = hlsl::examples::geometry_creator_scene::SPushConstants; - inline SPushConstants computePushConstants(const SViewParams& viewParams) const - { - using namespace hlsl; - return { - .matrices = viewParams.computeForInstance(world), - .positionView = packedGeo->positionView, - .normalView = packedGeo->normalView, - .uvView = packedGeo->uvView - }; - } - - hlsl::float32_t3x4 world; - const SPackedGeometry* packedGeo; - }; - - // - static inline core::smart_refctd_ptr create(video::IGPURenderpass* renderpass, const uint32_t subpassIX, const CGeometryCreatorScene* scene) - { - EXPOSE_NABLA_NAMESPACES; - - if (!renderpass) - return nullptr; - auto device = const_cast(renderpass->getOriginDevice()); - auto logger = device->getLogger(); - - if (!scene) - return nullptr; - const auto namedGeoms = scene->getGeometries(); - if (namedGeoms.empty()) - return nullptr; - - SInitParams init; - - // create descriptor set - { - // create Descriptor Set Layout - smart_refctd_ptr dsLayout; - { - const IGPUDescriptorSetLayout::SBinding bindings[] = - { - { - .binding = 0, - .type = IDescriptor::E_TYPE::ET_UNIFORM_TEXEL_BUFFER, - // some geometries may not have particular attributes - .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_PARTIALLY_BOUND_BIT, - .stageFlags = IShader::E_SHADER_STAGE::ESS_VERTEX|IShader::E_SHADER_STAGE::ESS_FRAGMENT, - .count = DescriptorCount - } - }; - dsLayout = device->createDescriptorSetLayout(bindings); - if (!dsLayout) - { - logger->log("Could not create descriptor set layout!",ILogger::ELL_ERROR); - return nullptr; - } - } - - // create Descriptor Set - auto pool = 
device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT,{&dsLayout.get(),1}); - init.ds = pool->createDescriptorSet(std::move(dsLayout)); - if (!init.ds) - { - logger->log("Could not descriptor set!",ILogger::ELL_ERROR); - return nullptr; - } - } - - // - const SPushConstantRange ranges[] = {{ - .stageFlags = hlsl::ShaderStage::ESS_VERTEX, - .offset = 0, - .size = sizeof(SInstance::SPushConstants), - }}; - init.layout = device->createPipelineLayout(ranges,smart_refctd_ptr(init.ds->getLayout())); - - // TODO: Load Shaders and Create Pipelines - { - // - } - - // write geometries' attributes to descriptor set - { - core::vector infos; - auto allocateUTB = [device,&infos](const IGeometry::SDataView& view)->uint8_t - { - if (!view) - return DescriptorCount; - const auto retval = infos.size(); - infos.emplace_back().desc = device->createBufferView(view.src, view.composed.format); - return retval; - }; - - for (const auto& entry : namedGeoms) - { - const auto* geom = entry.geom.get(); - // could also check device origin on all buffers - if (!geom->valid()) - continue; - auto& out = init.geoms.emplace_back(); - if (const auto& view=geom->getIndexView(); view) - { - out.indexBuffer.offset = view.src.offset; - out.indexBuffer.buffer = view.src.buffer; - } - out.elementCount = geom->getVertexReferenceCount(); - out.positionView = allocateUTB(geom->getPositionView()); - out.normalView = allocateUTB(geom->getNormalView()); - // the first view is usually the UV - if (const auto& auxViews = geom->getAuxAttributeViews(); !auxViews.empty()) - out.uvView = allocateUTB(auxViews.front()); - } - - if (infos.empty()) - return nullptr; - const IGPUDescriptorSet::SWriteDescriptorSet write = { - .dstSet = init.ds.get(), - .binding = 0, - .arrayElement = 0, - .count = static_cast(infos.size()), - .info = infos.data() - }; - if (!device->updateDescriptorSets({&write,1},{})) - return nullptr; - } - - return smart_refctd_ptr(new 
CSimpleDebugRenderer(std::move(init)),dont_grab); - } - - // - struct SInitParams - { - core::smart_refctd_ptr ds; - core::smart_refctd_ptr layout; - core::vector geoms; - }; - inline const SInitParams& getInitParams() const {return m_params;} - - // - inline void render(video::IGPUCommandBuffer* cmdbuf, const SViewParams& viewParams) const - { - EXPOSE_NABLA_NAMESPACES; - - cmdbuf->beginDebugMarker("CSimpleDebugRenderer::render"); - - const auto* layout = m_params.layout.get(); - cmdbuf->bindDescriptorSets(E_PIPELINE_BIND_POINT::EPBP_GRAPHICS,layout,0,1,&m_params.ds.get()); - - for (const auto& instance : m_instances) - { - const auto* geo = instance.packedGeo; - cmdbuf->bindGraphicsPipeline(geo->pipeline.get()); - const auto pc = instance.computePushConstants(viewParams); - cmdbuf->pushConstants(layout,hlsl::ShaderStage::ESS_VERTEX,0,sizeof(pc),&pc); - if (geo->indexBuffer) - { - cmdbuf->bindIndexBuffer(geo->indexBuffer,geo->indexType); - cmdbuf->drawIndexed(geo->elementCount,1,0,0,0); - } - else - cmdbuf->draw(geo->elementCount,1,0,0); - } - cmdbuf->endDebugMarker(); - } - - core::vector m_instances; - - protected: - inline CSimpleDebugRenderer(SInitParams&& _params) : m_params(std::move(_params)) {} - - SInitParams m_params; -}; - #undef EXPOSE_NABLA_NAMESPACES -#if 0 -class ResourceBuilder -{ -private: - - bool createShaders() - { - EXPOSE_NABLA_NAMESPACES(); - - auto createShader = [&](IShader::E_SHADER_STAGE stage, smart_refctd_ptr& outShader) -> smart_refctd_ptr - { - // TODO: use SPIRV loader & our ::system ns to get those cpu shaders, do not create myself (shit I forgot it exists) - - const SBuiltinFile& in = ::geometry::creator::spirv::builtin::get_resource(); - const auto buffer = ICPUBuffer::create({ { in.size }, (void*)in.contents, core::getNullMemoryResource() }, adopt_memory); - auto shader = make_smart_refctd_ptr(smart_refctd_ptr(buffer), stage, IShader::E_CONTENT_TYPE::ECT_SPIRV, ""); // must create cpu instance regardless underlying type - - if 
constexpr (withAssetConverter) - { - buffer->setContentHash(buffer->computeContentHash()); - outShader = std::move(shader); - } - - return outShader; - }; - - typename ResourcesBundleScratch::Shaders& basic = scratch.shaders[GeometriesCpu::GP_BASIC]; - createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, basic.vertex); - createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, basic.fragment); - - typename ResourcesBundleScratch::Shaders& cone = scratch.shaders[GeometriesCpu::GP_CONE]; - createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.cone.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, cone.vertex); - createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, cone.fragment); // note we reuse fragment from basic! - - typename ResourcesBundleScratch::Shaders& ico = scratch.shaders[GeometriesCpu::GP_ICO]; - createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.ico.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, ico.vertex); - createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, ico.fragment); // note we reuse fragment from basic! 
- - for (const auto& it : scratch.shaders) - { - if (!it.vertex || !it.fragment) - { - logger->log("Could not create shaders!", ILogger::ELL_ERROR); - return false; - } - } - - return true; - } - - bool createGeometries() - { - EXPOSE_NABLA_NAMESPACES(); - - for (uint32_t i = 0; i < geometries.objects.size(); ++i) - { - const auto& inGeometry = geometries.objects[i]; - auto& [obj, meta] = scratch.objects[i]; - - bool status = true; - - meta.name = inGeometry.meta.name; - meta.type = inGeometry.meta.type; - - struct - { - SBlendParams blend; - SRasterizationParams rasterization; - typename Types::graphics_pipeline_t::SCreationParams pipeline; - } params; - - { - params.blend.logicOp = ELO_NO_OP; - - auto& b = params.blend.blendParams[0]; - b.srcColorFactor = EBF_SRC_ALPHA; - b.dstColorFactor = EBF_ONE_MINUS_SRC_ALPHA; - b.colorBlendOp = EBO_ADD; - b.srcAlphaFactor = EBF_SRC_ALPHA; - b.dstAlphaFactor = EBF_SRC_ALPHA; - b.alphaBlendOp = EBO_ADD; - b.colorWriteMask = (1u << 0u) | (1u << 1u) | (1u << 2u) | (1u << 3u); - } - - params.rasterization.faceCullingMode = EFCM_NONE; - { - const typename Types::shader_t::SSpecInfo info [] = - { - {.entryPoint = "VSMain", .shader = scratch.shaders[inGeometry.shadersType].vertex.get() }, - {.entryPoint = "PSMain", .shader = scratch.shaders[inGeometry.shadersType].fragment.get() } - }; - - params.pipeline.layout = scratch.pipelineLayout.get(); - params.pipeline.shaders = info; - params.pipeline.renderpass = scratch.renderpass.get(); - params.pipeline.cached = { .vertexInput = inGeometry.data.inputParams, .primitiveAssembly = inGeometry.data.assemblyParams, .rasterization = params.rasterization, .blend = params.blend, .subpassIx = 0u }; - - obj.indexCount = inGeometry.data.indexCount; - obj.indexType = inGeometry.data.indexType; - - // TODO: cache pipeline & try lookup for existing one first maybe - - // similar issue like with shaders again, in this case gpu contructor allows for extra cache parameters + there is no constructor you 
can use to fire make_smart_refctd_ptr yourself for cpu - if constexpr (withAssetConverter) - obj.pipeline = ICPUGraphicsPipeline::create(params.pipeline); - else - { - const std::array info = { { params.pipeline } }; - utilities->getLogicalDevice()->createGraphicsPipelines(nullptr, info, &obj.pipeline); - } - - if (!obj.pipeline) - { - logger->log("Could not create graphics pipeline for [%s] object!", ILogger::ELL_ERROR, meta.name.data()); - status = false; - } - - // object buffers - auto createVIBuffers = [&]() -> bool - { - using ibuffer_t = ::nbl::asset::IBuffer; // seems to be ambigous, both asset & core namespaces has IBuffer - - // note: similar issue like with shaders, this time with cpu-gpu constructors differing in arguments - auto vBuffer = smart_refctd_ptr(inGeometry.data.bindings[0].buffer); // no offset - constexpr static auto VERTEX_USAGE = bitflag(ibuffer_t::EUF_VERTEX_BUFFER_BIT) | ibuffer_t::EUF_TRANSFER_DST_BIT | ibuffer_t::EUF_INLINE_UPDATE_VIA_CMDBUF; - obj.bindings.vertex.offset = 0u; - - auto iBuffer = smart_refctd_ptr(inGeometry.data.indexBuffer.buffer); // no offset - constexpr static auto INDEX_USAGE = bitflag(ibuffer_t::EUF_INDEX_BUFFER_BIT) | ibuffer_t::EUF_VERTEX_BUFFER_BIT | ibuffer_t::EUF_TRANSFER_DST_BIT | ibuffer_t::EUF_INLINE_UPDATE_VIA_CMDBUF; - obj.bindings.index.offset = 0u; - - if constexpr (withAssetConverter) - { - if (!vBuffer) - return false; - - vBuffer->addUsageFlags(VERTEX_USAGE); - vBuffer->setContentHash(vBuffer->computeContentHash()); - obj.bindings.vertex = { .offset = 0u, .buffer = vBuffer }; - - if (inGeometry.data.indexType != EIT_UNKNOWN) - if (iBuffer) - { - iBuffer->addUsageFlags(INDEX_USAGE); - iBuffer->setContentHash(iBuffer->computeContentHash()); - } - else - return false; - - obj.bindings.index = { .offset = 0u, .buffer = iBuffer }; - } - else - { - auto vertexBuffer = utilities->getLogicalDevice()->createBuffer(IGPUBuffer::SCreationParams({ .size = vBuffer->getSize(), .usage = VERTEX_USAGE })); - auto 
indexBuffer = iBuffer ? utilities->getLogicalDevice()->createBuffer(IGPUBuffer::SCreationParams({ .size = iBuffer->getSize(), .usage = INDEX_USAGE })) : nullptr; - - if (!vertexBuffer) - return false; - - if (inGeometry.data.indexType != EIT_UNKNOWN) - if (!indexBuffer) - return false; - - const auto mask = utilities->getLogicalDevice()->getPhysicalDevice()->getUpStreamingMemoryTypeBits(); - for (auto it : { vertexBuffer , indexBuffer }) - { - if (it) - { - auto reqs = it->getMemoryReqs(); - reqs.memoryTypeBits &= mask; - - utilities->getLogicalDevice()->allocate(reqs, it.get()); - } - } - - // record transfer uploads - obj.bindings.vertex = { .offset = 0u, .buffer = std::move(vertexBuffer) }; - { - const SBufferRange range = { .offset = obj.bindings.vertex.offset, .size = obj.bindings.vertex.buffer->getSize(), .buffer = obj.bindings.vertex.buffer }; - if (!commandBuffer->updateBuffer(range, vBuffer->getPointer())) - { - logger->log("Could not record vertex buffer transfer upload for [%s] object!", ILogger::ELL_ERROR, meta.name.data()); - status = false; - } - } - obj.bindings.index = { .offset = 0u, .buffer = std::move(indexBuffer) }; - { - if (iBuffer) - { - const SBufferRange range = { .offset = obj.bindings.index.offset, .size = obj.bindings.index.buffer->getSize(), .buffer = obj.bindings.index.buffer }; - - if (!commandBuffer->updateBuffer(range, iBuffer->getPointer())) - { - logger->log("Could not record index buffer transfer upload for [%s] object!", ILogger::ELL_ERROR, meta.name.data()); - status = false; - } - } - } - } - - return true; - }; - - if (!createVIBuffers()) - { - logger->log("Could not create buffers for [%s] object!", ILogger::ELL_ERROR, meta.name.data()); - status = false; - } - - if (!status) - { - logger->log("[%s] object will not be created!", ILogger::ELL_ERROR, meta.name.data()); - - obj.bindings.vertex = {}; - obj.bindings.index = {}; - obj.indexCount = 0u; - obj.indexType = E_INDEX_TYPE::EIT_UNKNOWN; - obj.pipeline = nullptr; - - 
continue; - } - } - } - - return true; - } - - - struct GeometriesCpu - { - enum GeometryShader - { - GP_BASIC = 0, - GP_CONE, - GP_ICO, - - GP_COUNT - }; - - - }; - - struct Shaders - { - nbl::core::smart_refctd_ptr vertex = nullptr, fragment = nullptr; - }; - - std::array shaders; }; -#endif } #endif \ No newline at end of file diff --git a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp new file mode 100644 index 000000000..b6f6b4aaf --- /dev/null +++ b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp @@ -0,0 +1,493 @@ +#ifndef _NBL_EXAMPLES_C_SIMPLE_DEBUG_RENDERER_H_INCLUDED_ +#define _NBL_EXAMPLES_C_SIMPLE_DEBUG_RENDERER_H_INCLUDED_ + + +#include "nbl/builtin/hlsl/math/linalg/fast_affine.hlsl" +#include "nbl/examples/geometry/SPushConstants.hlsl" + +// TODO: Arek bring back +//#include "nbl/examples/geometry/spirv/builtin/CArchive.h" +//#include "nbl/examples/geometry/spirv/builtin/builtinResources.h" + + +namespace nbl::examples +{ + +class CSimpleDebugRenderer final : public core::IReferenceCounted +{ +#define EXPOSE_NABLA_NAMESPACES \ + using namespace nbl::core; \ + using namespace nbl::system; \ + using namespace nbl::asset; \ + using namespace nbl::video + public: + // + constexpr static inline auto DescriptorCount = 255; + // + struct SViewParams + { + inline SViewParams(const hlsl::float32_t3x4& _view, const hlsl::float32_t4x4& _viewProj) + { + view = _view; + viewProj = _viewProj; + using namespace nbl::hlsl; + normal = transpose(inverse(float32_t3x3(view))); + } + + inline auto computeForInstance(hlsl::float32_t3x4 world) const + { + using namespace nbl::hlsl; + hlsl::examples::geometry_creator_scene::SInstanceMatrices retval = { + .worldViewProj = float32_t4x4(math::linalg::promoted_mul(float64_t4x4(viewProj),float64_t3x4(world))) + }; + const auto sub3x3 = mul(float64_t3x3(viewProj),float64_t3x3(world)); + retval.normal = 
float32_t3x3(transpose(inverse(sub3x3))); + return retval; + } + + hlsl::float32_t3x4 view; + hlsl::float32_t4x4 viewProj; + hlsl::float32_t3x3 normal; + }; + // + struct SPackedGeometry + { + core::smart_refctd_ptr pipeline = {}; + asset::SBufferBinding indexBuffer = {}; + uint32_t elementCount = 0; + // indices into the descriptor set + uint8_t positionView = 0; + uint8_t normalView = 0; + uint8_t uvView = 0; + asset::E_INDEX_TYPE indexType = asset::EIT_UNKNOWN; + }; + // + struct SInstance + { + using SPushConstants = hlsl::examples::geometry_creator_scene::SPushConstants; + inline SPushConstants computePushConstants(const SViewParams& viewParams) const + { + using namespace hlsl; + return { + .matrices = viewParams.computeForInstance(world), + .positionView = packedGeo->positionView, + .normalView = packedGeo->normalView, + .uvView = packedGeo->uvView + }; + } + + hlsl::float32_t3x4 world; + const SPackedGeometry* packedGeo; + }; + + // + static inline core::smart_refctd_ptr create(video::IGPURenderpass* renderpass, const uint32_t subpassIX, const CGeometryCreatorScene* scene) + { + EXPOSE_NABLA_NAMESPACES; + + if (!renderpass) + return nullptr; + auto device = const_cast(renderpass->getOriginDevice()); + auto logger = device->getLogger(); + + if (!scene) + return nullptr; + const auto namedGeoms = scene->getGeometries(); + if (namedGeoms.empty()) + return nullptr; + + SInitParams init; + + // create descriptor set + { + // create Descriptor Set Layout + smart_refctd_ptr dsLayout; + { + const IGPUDescriptorSetLayout::SBinding bindings[] = + { + { + .binding = 0, + .type = IDescriptor::E_TYPE::ET_UNIFORM_TEXEL_BUFFER, + // some geometries may not have particular attributes + .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_PARTIALLY_BOUND_BIT, + .stageFlags = IShader::E_SHADER_STAGE::ESS_VERTEX|IShader::E_SHADER_STAGE::ESS_FRAGMENT, + .count = DescriptorCount + } + }; + dsLayout = device->createDescriptorSetLayout(bindings); + if (!dsLayout) 
+ { + logger->log("Could not create descriptor set layout!",ILogger::ELL_ERROR); + return nullptr; + } + } + + // create Descriptor Set + auto pool = device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT,{&dsLayout.get(),1}); + init.ds = pool->createDescriptorSet(std::move(dsLayout)); + if (!init.ds) + { + logger->log("Could not descriptor set!",ILogger::ELL_ERROR); + return nullptr; + } + } + + // + const SPushConstantRange ranges[] = {{ + .stageFlags = hlsl::ShaderStage::ESS_VERTEX, + .offset = 0, + .size = sizeof(SInstance::SPushConstants), + }}; + init.layout = device->createPipelineLayout(ranges,smart_refctd_ptr(init.ds->getLayout())); + + // TODO: Load Shaders and Create Pipelines + { + // + } + + // write geometries' attributes to descriptor set + { + core::vector infos; + auto allocateUTB = [device,&infos](const IGeometry::SDataView& view)->uint8_t + { + if (!view) + return DescriptorCount; + const auto retval = infos.size(); + infos.emplace_back().desc = device->createBufferView(view.src, view.composed.format); + return retval; + }; + + for (const auto& entry : namedGeoms) + { + const auto* geom = entry.geom.get(); + // could also check device origin on all buffers + if (!geom->valid()) + continue; + auto& out = init.geoms.emplace_back(); + if (const auto& view=geom->getIndexView(); view) + { + out.indexBuffer.offset = view.src.offset; + out.indexBuffer.buffer = view.src.buffer; + } + out.elementCount = geom->getVertexReferenceCount(); + out.positionView = allocateUTB(geom->getPositionView()); + out.normalView = allocateUTB(geom->getNormalView()); + // the first view is usually the UV + if (const auto& auxViews = geom->getAuxAttributeViews(); !auxViews.empty()) + out.uvView = allocateUTB(auxViews.front()); + } + + if (infos.empty()) + return nullptr; + const IGPUDescriptorSet::SWriteDescriptorSet write = { + .dstSet = init.ds.get(), + .binding = 0, + .arrayElement = 0, + .count = static_cast(infos.size()), + .info = 
infos.data() + }; + if (!device->updateDescriptorSets({&write,1},{})) + return nullptr; + } + + return smart_refctd_ptr(new CSimpleDebugRenderer(std::move(init)),dont_grab); + } + + // + struct SInitParams + { + core::smart_refctd_ptr ds; + core::smart_refctd_ptr layout; + core::vector geoms; + }; + inline const SInitParams& getInitParams() const {return m_params;} + + // + inline void render(video::IGPUCommandBuffer* cmdbuf, const SViewParams& viewParams) const + { + EXPOSE_NABLA_NAMESPACES; + + cmdbuf->beginDebugMarker("CSimpleDebugRenderer::render"); + + const auto* layout = m_params.layout.get(); + cmdbuf->bindDescriptorSets(E_PIPELINE_BIND_POINT::EPBP_GRAPHICS,layout,0,1,&m_params.ds.get()); + + for (const auto& instance : m_instances) + { + const auto* geo = instance.packedGeo; + cmdbuf->bindGraphicsPipeline(geo->pipeline.get()); + const auto pc = instance.computePushConstants(viewParams); + cmdbuf->pushConstants(layout,hlsl::ShaderStage::ESS_VERTEX,0,sizeof(pc),&pc); + if (geo->indexBuffer) + { + cmdbuf->bindIndexBuffer(geo->indexBuffer,geo->indexType); + cmdbuf->drawIndexed(geo->elementCount,1,0,0,0); + } + else + cmdbuf->draw(geo->elementCount,1,0,0); + } + cmdbuf->endDebugMarker(); + } + + core::vector m_instances; + + protected: + inline CSimpleDebugRenderer(SInitParams&& _params) : m_params(std::move(_params)) {} + + SInitParams m_params; +#undef EXPOSE_NABLA_NAMESPACES +}; + +#if 0 +class ResourceBuilder +{ +private: + + bool createShaders() + { + EXPOSE_NABLA_NAMESPACES(); + + auto createShader = [&](IShader::E_SHADER_STAGE stage, smart_refctd_ptr& outShader) -> smart_refctd_ptr + { + // TODO: use SPIRV loader & our ::system ns to get those cpu shaders, do not create myself (shit I forgot it exists) + + const SBuiltinFile& in = ::geometry::creator::spirv::builtin::get_resource(); + const auto buffer = ICPUBuffer::create({ { in.size }, (void*)in.contents, core::getNullMemoryResource() }, adopt_memory); + auto shader = 
make_smart_refctd_ptr(smart_refctd_ptr(buffer), stage, IShader::E_CONTENT_TYPE::ECT_SPIRV, ""); // must create cpu instance regardless underlying type + + if constexpr (withAssetConverter) + { + buffer->setContentHash(buffer->computeContentHash()); + outShader = std::move(shader); + } + + return outShader; + }; + + typename ResourcesBundleScratch::Shaders& basic = scratch.shaders[GeometriesCpu::GP_BASIC]; + createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, basic.vertex); + createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, basic.fragment); + + typename ResourcesBundleScratch::Shaders& cone = scratch.shaders[GeometriesCpu::GP_CONE]; + createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.cone.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, cone.vertex); + createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, cone.fragment); // note we reuse fragment from basic! + + typename ResourcesBundleScratch::Shaders& ico = scratch.shaders[GeometriesCpu::GP_ICO]; + createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.ico.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, ico.vertex); + createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, ico.fragment); // note we reuse fragment from basic! 
+ + for (const auto& it : scratch.shaders) + { + if (!it.vertex || !it.fragment) + { + logger->log("Could not create shaders!", ILogger::ELL_ERROR); + return false; + } + } + + return true; + } + + bool createGeometries() + { + EXPOSE_NABLA_NAMESPACES(); + + for (uint32_t i = 0; i < geometries.objects.size(); ++i) + { + const auto& inGeometry = geometries.objects[i]; + auto& [obj, meta] = scratch.objects[i]; + + bool status = true; + + meta.name = inGeometry.meta.name; + meta.type = inGeometry.meta.type; + + struct + { + SBlendParams blend; + SRasterizationParams rasterization; + typename Types::graphics_pipeline_t::SCreationParams pipeline; + } params; + + { + params.blend.logicOp = ELO_NO_OP; + + auto& b = params.blend.blendParams[0]; + b.srcColorFactor = EBF_SRC_ALPHA; + b.dstColorFactor = EBF_ONE_MINUS_SRC_ALPHA; + b.colorBlendOp = EBO_ADD; + b.srcAlphaFactor = EBF_SRC_ALPHA; + b.dstAlphaFactor = EBF_SRC_ALPHA; + b.alphaBlendOp = EBO_ADD; + b.colorWriteMask = (1u << 0u) | (1u << 1u) | (1u << 2u) | (1u << 3u); + } + + params.rasterization.faceCullingMode = EFCM_NONE; + { + const typename Types::shader_t::SSpecInfo info [] = + { + {.entryPoint = "VSMain", .shader = scratch.shaders[inGeometry.shadersType].vertex.get() }, + {.entryPoint = "PSMain", .shader = scratch.shaders[inGeometry.shadersType].fragment.get() } + }; + + params.pipeline.layout = scratch.pipelineLayout.get(); + params.pipeline.shaders = info; + params.pipeline.renderpass = scratch.renderpass.get(); + params.pipeline.cached = { .vertexInput = inGeometry.data.inputParams, .primitiveAssembly = inGeometry.data.assemblyParams, .rasterization = params.rasterization, .blend = params.blend, .subpassIx = 0u }; + + obj.indexCount = inGeometry.data.indexCount; + obj.indexType = inGeometry.data.indexType; + + // TODO: cache pipeline & try lookup for existing one first maybe + + // similar issue like with shaders again, in this case gpu contructor allows for extra cache parameters + there is no constructor you 
can use to fire make_smart_refctd_ptr yourself for cpu + if constexpr (withAssetConverter) + obj.pipeline = ICPUGraphicsPipeline::create(params.pipeline); + else + { + const std::array info = { { params.pipeline } }; + utilities->getLogicalDevice()->createGraphicsPipelines(nullptr, info, &obj.pipeline); + } + + if (!obj.pipeline) + { + logger->log("Could not create graphics pipeline for [%s] object!", ILogger::ELL_ERROR, meta.name.data()); + status = false; + } + + // object buffers + auto createVIBuffers = [&]() -> bool + { + using ibuffer_t = ::nbl::asset::IBuffer; // seems to be ambigous, both asset & core namespaces has IBuffer + + // note: similar issue like with shaders, this time with cpu-gpu constructors differing in arguments + auto vBuffer = smart_refctd_ptr(inGeometry.data.bindings[0].buffer); // no offset + constexpr static auto VERTEX_USAGE = bitflag(ibuffer_t::EUF_VERTEX_BUFFER_BIT) | ibuffer_t::EUF_TRANSFER_DST_BIT | ibuffer_t::EUF_INLINE_UPDATE_VIA_CMDBUF; + obj.bindings.vertex.offset = 0u; + + auto iBuffer = smart_refctd_ptr(inGeometry.data.indexBuffer.buffer); // no offset + constexpr static auto INDEX_USAGE = bitflag(ibuffer_t::EUF_INDEX_BUFFER_BIT) | ibuffer_t::EUF_VERTEX_BUFFER_BIT | ibuffer_t::EUF_TRANSFER_DST_BIT | ibuffer_t::EUF_INLINE_UPDATE_VIA_CMDBUF; + obj.bindings.index.offset = 0u; + + if constexpr (withAssetConverter) + { + if (!vBuffer) + return false; + + vBuffer->addUsageFlags(VERTEX_USAGE); + vBuffer->setContentHash(vBuffer->computeContentHash()); + obj.bindings.vertex = { .offset = 0u, .buffer = vBuffer }; + + if (inGeometry.data.indexType != EIT_UNKNOWN) + if (iBuffer) + { + iBuffer->addUsageFlags(INDEX_USAGE); + iBuffer->setContentHash(iBuffer->computeContentHash()); + } + else + return false; + + obj.bindings.index = { .offset = 0u, .buffer = iBuffer }; + } + else + { + auto vertexBuffer = utilities->getLogicalDevice()->createBuffer(IGPUBuffer::SCreationParams({ .size = vBuffer->getSize(), .usage = VERTEX_USAGE })); + auto 
indexBuffer = iBuffer ? utilities->getLogicalDevice()->createBuffer(IGPUBuffer::SCreationParams({ .size = iBuffer->getSize(), .usage = INDEX_USAGE })) : nullptr; + + if (!vertexBuffer) + return false; + + if (inGeometry.data.indexType != EIT_UNKNOWN) + if (!indexBuffer) + return false; + + const auto mask = utilities->getLogicalDevice()->getPhysicalDevice()->getUpStreamingMemoryTypeBits(); + for (auto it : { vertexBuffer , indexBuffer }) + { + if (it) + { + auto reqs = it->getMemoryReqs(); + reqs.memoryTypeBits &= mask; + + utilities->getLogicalDevice()->allocate(reqs, it.get()); + } + } + + // record transfer uploads + obj.bindings.vertex = { .offset = 0u, .buffer = std::move(vertexBuffer) }; + { + const SBufferRange range = { .offset = obj.bindings.vertex.offset, .size = obj.bindings.vertex.buffer->getSize(), .buffer = obj.bindings.vertex.buffer }; + if (!commandBuffer->updateBuffer(range, vBuffer->getPointer())) + { + logger->log("Could not record vertex buffer transfer upload for [%s] object!", ILogger::ELL_ERROR, meta.name.data()); + status = false; + } + } + obj.bindings.index = { .offset = 0u, .buffer = std::move(indexBuffer) }; + { + if (iBuffer) + { + const SBufferRange range = { .offset = obj.bindings.index.offset, .size = obj.bindings.index.buffer->getSize(), .buffer = obj.bindings.index.buffer }; + + if (!commandBuffer->updateBuffer(range, iBuffer->getPointer())) + { + logger->log("Could not record index buffer transfer upload for [%s] object!", ILogger::ELL_ERROR, meta.name.data()); + status = false; + } + } + } + } + + return true; + }; + + if (!createVIBuffers()) + { + logger->log("Could not create buffers for [%s] object!", ILogger::ELL_ERROR, meta.name.data()); + status = false; + } + + if (!status) + { + logger->log("[%s] object will not be created!", ILogger::ELL_ERROR, meta.name.data()); + + obj.bindings.vertex = {}; + obj.bindings.index = {}; + obj.indexCount = 0u; + obj.indexType = E_INDEX_TYPE::EIT_UNKNOWN; + obj.pipeline = nullptr; + + 
continue; + } + } + } + + return true; + } + + + struct GeometriesCpu + { + enum GeometryShader + { + GP_BASIC = 0, + GP_CONE, + GP_ICO, + + GP_COUNT + }; + + + }; + + struct Shaders + { + nbl::core::smart_refctd_ptr vertex = nullptr, fragment = nullptr; + }; + + std::array shaders; +}; +#endif + +} +#endif \ No newline at end of file From 0ba8eed179bd6a4d86a63625cb9254a6f4a9714c Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 18 Jun 2025 15:53:44 +0700 Subject: [PATCH 259/296] apply reduced macro definitions to benchmark ex --- .../app_resources/benchmarkSubgroup.comp.hlsl | 19 +++++---- .../benchmarkWorkgroup.comp.hlsl | 6 +-- .../app_resources/shaderCommon.hlsl | 11 ----- 29_Arithmetic2Bench/main.cpp | 42 +++++++++---------- 4 files changed, 33 insertions(+), 45 deletions(-) diff --git a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl index 2c102c13d..f6ad3e678 100644 --- a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl +++ b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl @@ -4,29 +4,33 @@ #include "nbl/builtin/hlsl/glsl_compat/core.hlsl" #include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl" +#include "nbl/builtin/hlsl/subgroup2/arithmetic_params.hlsl" #include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" #include "nbl/builtin/hlsl/random/xoroshiro.hlsl" #include "shaderCommon.hlsl" #include "nbl/builtin/hlsl/workgroup2/basic.hlsl" -typedef vector type_t; +template +using params_t = SUBGROUP_CONFIG_T; + +NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation = params_t::base_t, device_capabilities>::ItemsPerInvocation; + +typedef vector type_t; uint32_t globalIndex() { return glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+workgroup::SubgroupContiguousIndex(); } -template +template static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal) { - using config_t = subgroup2::Configuration; - using params_t = subgroup2::ArithmeticParams; type_t 
value = sourceVal; const uint64_t outputBufAddr = pc.pOutputBuf[Binop::BindingIndex]; - operation_t func; + operation_t > func; // [unroll] for (uint32_t i = 0; i < NUM_LOOPS; i++) value = func(value); @@ -36,13 +40,14 @@ static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal) void benchmark() { + const uint32_t invocationIndex = globalIndex(); type_t sourceVal; Xoroshiro64Star xoroshiro = Xoroshiro64Star::construct(uint32_t2(invocationIndex,invocationIndex+1)); [unroll] - for (uint16_t i = 0; i < Config::ItemsPerInvocation_0; i++) + for (uint16_t i = 0; i < ItemsPerInvocation; i++) sourceVal[i] = xoroshiro(); - subbench, ITEMS_PER_INVOCATION>(sourceVal); + subbench >(sourceVal); } [numthreads(WORKGROUP_SIZE,1,1)] diff --git a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl index 50a9d912b..a56945467 100644 --- a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl +++ b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl @@ -6,12 +6,10 @@ #include "nbl/builtin/hlsl/workgroup2/arithmetic.hlsl" #include "nbl/builtin/hlsl/random/xoroshiro.hlsl" -static const uint32_t WORKGROUP_SIZE = 1u << WORKGROUP_SIZE_LOG2; +using config_t = WORKGROUP_CONFIG_T; #include "shaderCommon.hlsl" -using config_t = workgroup2::ArithmeticConfiguration; - typedef vector type_t; // final (level 1/2) scan needs to fit in one subgroup exactly @@ -119,7 +117,7 @@ void benchmark() } -[numthreads(WORKGROUP_SIZE,1,1)] +[numthreads(config_t::WorkgroupSize,1,1)] void main() { benchmark(); diff --git a/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl b/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl index 4866efe81..242ededd8 100644 --- a/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl +++ b/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl @@ -3,13 +3,6 @@ using namespace nbl; using namespace hlsl; -// https://github.com/microsoft/DirectXShaderCompiler/issues/6144 -uint32_t3 
nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);} - -#ifndef ITEMS_PER_INVOCATION -#error "Define ITEMS_PER_INVOCATION!" -#endif - [[vk::push_constant]] PushConstantData pc; struct device_capabilities @@ -25,10 +18,6 @@ struct device_capabilities #error "Define OPERATION!" #endif -#ifndef SUBGROUP_SIZE_LOG2 -#error "Define SUBGROUP_SIZE_LOG2!" -#endif - #ifndef NUM_LOOPS #error "Define NUM_LOOPS!" #endif diff --git a/29_Arithmetic2Bench/main.cpp b/29_Arithmetic2Bench/main.cpp index e88a59cae..2d5afeb4c 100644 --- a/29_Arithmetic2Bench/main.cpp +++ b/29_Arithmetic2Bench/main.cpp @@ -3,6 +3,7 @@ #include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" #include "app_resources/common.hlsl" #include "nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl" +#include "nbl/builtin/hlsl/subgroup2/arithmetic_params.hlsl" using namespace nbl; using namespace core; @@ -549,55 +550,50 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub smart_refctd_ptr overriddenUnspecialized; if constexpr (WorkgroupBench) { - const std::string definitions[7] = { + const std::string definitions[4] = { "workgroup2::" + arith_name, - std::to_string(workgroupSizeLog2), - std::to_string(itemsPerWG), - std::to_string(itemsPerInvoc), - std::to_string(subgroupSizeLog2), + wgConfig.getConfigTemplateStructString(), std::to_string(numLoops), std::to_string(arith_name=="reduction") }; - const IShaderCompiler::SMacroDefinition defines[8] = { + const IShaderCompiler::SMacroDefinition defines[5] = { { "OPERATION", definitions[0] }, - { "WORKGROUP_SIZE_LOG2", definitions[1] }, - { "ITEMS_PER_WG", definitions[2] }, - { "ITEMS_PER_INVOCATION", definitions[3] }, - { "SUBGROUP_SIZE_LOG2", definitions[4] }, - { "NUM_LOOPS", definitions[5] }, - { "IS_REDUCTION", definitions[6] }, + { "WORKGROUP_CONFIG_T", definitions[1] }, + { "NUM_LOOPS", definitions[2] }, + { "IS_REDUCTION", definitions[3] }, { "TEST_NATIVE", "1" } }; if 
(UseNativeArithmetic) - options.preprocessorOptions.extraDefines = { defines, defines + 8 }; + options.preprocessorOptions.extraDefines = { defines, defines + 5 }; else - options.preprocessorOptions.extraDefines = { defines, defines + 7 }; + options.preprocessorOptions.extraDefines = { defines, defines + 4 }; overriddenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options); } else { - const std::string definitions[5] = { + hlsl::subgroup2::SArithmeticParams sgParams; + sgParams.init(subgroupSizeLog2, itemsPerInvoc); + + const std::string definitions[4] = { "subgroup2::" + arith_name, std::to_string(workgroupSize), - std::to_string(itemsPerInvoc), - std::to_string(subgroupSizeLog2), + sgParams.getParamTemplateStructString(), std::to_string(numLoops) }; - const IShaderCompiler::SMacroDefinition defines[6] = { + const IShaderCompiler::SMacroDefinition defines[5] = { { "OPERATION", definitions[0] }, { "WORKGROUP_SIZE", definitions[1] }, - { "ITEMS_PER_INVOCATION", definitions[2] }, - { "SUBGROUP_SIZE_LOG2", definitions[3] }, - { "NUM_LOOPS", definitions[4] }, + { "SUBGROUP_CONFIG_T", definitions[2] }, + { "NUM_LOOPS", definitions[3] }, { "TEST_NATIVE", "1" } }; if (UseNativeArithmetic) - options.preprocessorOptions.extraDefines = { defines, defines + 6 }; - else options.preprocessorOptions.extraDefines = { defines, defines + 5 }; + else + options.preprocessorOptions.extraDefines = { defines, defines + 4 }; overriddenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options); } From c3786dfe24dd3f4d9ff2b60c63496f40bd3238b5 Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 18 Jun 2025 12:27:43 +0200 Subject: [PATCH 260/296] prep for shader loading --- 09_GeometryCreator/main.cpp | 27 +- common/include/nbl/examples/PCH.hpp | 2 + .../geometry/CGeometryCreatorScene.hpp | 16 + .../geometry/CSimpleDebugRenderer.hpp | 332 ++++-------------- 4 files changed, 110 insertions(+), 267 deletions(-) 
diff --git a/09_GeometryCreator/main.cpp b/09_GeometryCreator/main.cpp index 5bbe40f37..4c982e8f8 100644 --- a/09_GeometryCreator/main.cpp +++ b/09_GeometryCreator/main.cpp @@ -4,24 +4,27 @@ #include "common.hpp" -class GeometryCreatorApp final : public MonoWindowApplication +class GeometryCreatorApp final : public MonoWindowApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication { - using base_t = MonoWindowApplication; + using device_base_t = MonoWindowApplication; + using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; public: GeometryCreatorApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) - : base_t({1280,720}, EF_D16_UNORM, _localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} + : device_base_t({1280,720}, EF_D16_UNORM, _localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} SPhysicalDeviceFeatures getRequiredDeviceFeatures() const override { - auto retval = base_t::getRequiredDeviceFeatures(); + auto retval = device_base_t::getRequiredDeviceFeatures(); retval.geometryShader = true; return retval; } inline bool onAppInitialized(smart_refctd_ptr&& system) override { - if (!base_t::onAppInitialized(smart_refctd_ptr(system))) + if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) return false; m_semaphore = m_device->createSemaphore(m_realFrameIx); @@ -54,7 +57,7 @@ class GeometryCreatorApp final : public MonoWindowApplication ); auto scRes = static_cast(m_surface->getSwapchainResources()); - m_renderer = CSimpleDebugRenderer::create(scRes->getRenderpass(),0,m_scene.get()); + m_renderer = CSimpleDebugRenderer::create(m_assetMgr.get(),scRes->getRenderpass(),0,m_scene.get()); if (!m_renderer) return logFail("Could not create Renderer!"); @@ -75,7 +78,7 @@ class GeometryCreatorApp final : public 
MonoWindowApplication m_inputSystem->getDefaultMouse(&mouse); m_inputSystem->getDefaultKeyboard(&keyboard); - const auto resourceIx = m_realFrameIx % base_t::MaxFramesInFlight; + const auto resourceIx = m_realFrameIx % device_base_t::MaxFramesInFlight; auto* const cb = m_cmdBufs.data()[resourceIx].get(); cb->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); @@ -120,7 +123,7 @@ class GeometryCreatorApp final : public MonoWindowApplication auto scRes = static_cast(m_surface->getSwapchainResources()); const IGPUCommandBuffer::SRenderpassBeginInfo info = { - .framebuffer = scRes->getFramebuffer(base_t::getCurrentAcquire().imageIndex), + .framebuffer = scRes->getFramebuffer(device_base_t::getCurrentAcquire().imageIndex), .colorClearValues = &clearValue, .depthStencilClearValues = &depthValue, .renderArea = currentRenderArea @@ -153,7 +156,11 @@ class GeometryCreatorApp final : public MonoWindowApplication {.cmdbuf = cb } }; const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { - {.semaphore = base_t::getCurrentAcquire().semaphore, .value = base_t::getCurrentAcquire().acquireCount, .stageMask = PIPELINE_STAGE_FLAGS::NONE} + { + .semaphore = device_base_t::getCurrentAcquire().semaphore, + .value = device_base_t::getCurrentAcquire().acquireCount, + .stageMask = PIPELINE_STAGE_FLAGS::NONE + } }; const IQueue::SSubmitInfo infos[] = { @@ -227,7 +234,7 @@ class GeometryCreatorApp final : public MonoWindowApplication // smart_refctd_ptr m_semaphore; uint64_t m_realFrameIx = 0; - std::array,base_t::MaxFramesInFlight> m_cmdBufs; + std::array,device_base_t::MaxFramesInFlight> m_cmdBufs; // InputSystem::ChannelReader mouse; InputSystem::ChannelReader keyboard; diff --git a/common/include/nbl/examples/PCH.hpp b/common/include/nbl/examples/PCH.hpp index ed5da666e..2c08a2d84 100644 --- a/common/include/nbl/examples/PCH.hpp +++ b/common/include/nbl/examples/PCH.hpp @@ -6,6 +6,8 @@ #include +// why isnt this in `nabla.h` ? 
+#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" // #include "nbl/ui/CGraphicalApplicationAndroid.h" // #include "nbl/ui/CWindowManagerAndroid.h" diff --git a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp index 8a73f2e14..12c12e3f3 100644 --- a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp +++ b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp @@ -180,4 +180,20 @@ class CGeometryCreatorScene : public core::IReferenceCounted }; } +#endif + + + +#if 0 + typename ResourcesBundleScratch::Shaders& basic = scratch.shaders[GeometriesCpu::GP_BASIC]; + createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, basic.vertex); + createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, basic.fragment); + + typename ResourcesBundleScratch::Shaders& cone = scratch.shaders[GeometriesCpu::GP_CONE]; + createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.cone.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, cone.vertex); + createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, cone.fragment); // note we reuse fragment from basic! 
+ + typename ResourcesBundleScratch::Shaders& ico = scratch.shaders[GeometriesCpu::GP_ICO]; + createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.ico.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, ico.vertex); + createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, ico.fragment); // note we reuse fragment from basic! #endif \ No newline at end of file diff --git a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp index b6f6b4aaf..e18c6664a 100644 --- a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp +++ b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp @@ -81,21 +81,33 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted }; // - static inline core::smart_refctd_ptr create(video::IGPURenderpass* renderpass, const uint32_t subpassIX, const CGeometryCreatorScene* scene) + static inline core::smart_refctd_ptr create(asset::IAssetManager* assMan, video::IGPURenderpass* renderpass, const uint32_t subpassIX, const CGeometryCreatorScene* scene) { EXPOSE_NABLA_NAMESPACES; - if (!renderpass) + if (!renderpass) // the device is fetched through renderpass right below, so a null one must bail out here return nullptr; auto device = const_cast(renderpass->getOriginDevice()); auto logger = device->getLogger(); - if (!scene) + if (!assMan || !scene) return nullptr; const auto namedGeoms = scene->getGeometries(); if (namedGeoms.empty()) return nullptr; + // load shader + smart_refctd_ptr shader; + { + const auto bundle = assMan->getAsset("nbl/examples/geometry/spirv/unified.spv",{}); + const auto contents = bundle.getContents(); + if (bundle.getAssetType()!=IAsset::ET_SHADER || contents.empty()) + return nullptr; + shader = IAsset::castDown(contents[0]); + if (!shader) + return nullptr; + } + SInitParams init; // create descriptor set @@ -132,7 +144,7 @@ class CSimpleDebugRenderer final : public 
core::IReferenceCounted } } - // + // create pipeline layout const SPushConstantRange ranges[] = {{ .stageFlags = hlsl::ShaderStage::ESS_VERTEX, .offset = 0, @@ -140,9 +152,64 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted }}; init.layout = device->createPipelineLayout(ranges,smart_refctd_ptr(init.ds->getLayout())); - // TODO: Load Shaders and Create Pipelines + // create pipelines + enum PipelineType : uint8_t + { + BasicTriangleList, + BasicTriangleFan, + Cone, + Count + }; + smart_refctd_ptr pipelines[PipelineType::Count] = {}; { - // + IGPUGraphicsPipeline::SCreationParams params[PipelineType::Count] = {}; + for (auto i=0; i< PipelineType::Count; i++) + { + const auto type = static_cast(i); + // no vertex input + { + auto& primitiveAssembly = params[i].cached.primitiveAssembly; + switch (type) + { + case PipelineType::BasicTriangleFan: + primitiveAssembly.primitiveType = E_PRIMITIVE_TOPOLOGY::EPT_TRIANGLE_FAN; + break; + default: + primitiveAssembly.primitiveType = E_PRIMITIVE_TOPOLOGY::EPT_TRIANGLE_LIST; + break; + } + primitiveAssembly.primitiveRestartEnable = false; + primitiveAssembly.tessPatchVertCount = 3; + } + { + auto& rasterization = params[i].cached.rasterization; + rasterization.faceCullingMode = EFCM_NONE; + } + { + auto& blend = params[i].cached.blend; + // everything as default + } + params[i].cached.subpassIx = subpassIX; + params[i].renderpass = renderpass; + } + /* + typename ResourcesBundleScratch::Shaders& basic = scratch.shaders[GeometriesCpu::GP_BASIC]; + createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, basic.vertex); + createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, basic.fragment); + + typename ResourcesBundleScratch::Shaders& cone = scratch.shaders[GeometriesCpu::GP_CONE]; + createShader.template 
operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.cone.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, cone.vertex); + createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, cone.fragment); // note we reuse fragment from basic! + + typename ResourcesBundleScratch::Shaders& ico = scratch.shaders[GeometriesCpu::GP_ICO]; + createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.ico.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, ico.vertex); + createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, ico.fragment); // note we reuse fragment from basic! + */ + if (!device->createGraphicsPipelines(nullptr,params,pipelines)) + { + logger->log("Could not create Graphics Pipelines!",ILogger::ELL_ERROR); + return nullptr; + } } // write geometries' attributes to descriptor set @@ -164,6 +231,8 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted if (!geom->valid()) continue; auto& out = init.geoms.emplace_back(); +// TODO: handle special cases + out.pipeline = pipelines[PipelineType::BasicTriangleList]; if (const auto& view=geom->getIndexView(); view) { out.indexBuffer.offset = view.src.offset; @@ -238,256 +307,5 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted #undef EXPOSE_NABLA_NAMESPACES }; -#if 0 -class ResourceBuilder -{ -private: - - bool createShaders() - { - EXPOSE_NABLA_NAMESPACES(); - - auto createShader = [&](IShader::E_SHADER_STAGE stage, smart_refctd_ptr& outShader) -> smart_refctd_ptr - { - // TODO: use SPIRV loader & our ::system ns to get those cpu shaders, do not create myself (shit I forgot it exists) - - const SBuiltinFile& in = ::geometry::creator::spirv::builtin::get_resource(); - const auto buffer = ICPUBuffer::create({ { in.size }, 
(void*)in.contents, core::getNullMemoryResource() }, adopt_memory); - auto shader = make_smart_refctd_ptr(smart_refctd_ptr(buffer), stage, IShader::E_CONTENT_TYPE::ECT_SPIRV, ""); // must create cpu instance regardless underlying type - - if constexpr (withAssetConverter) - { - buffer->setContentHash(buffer->computeContentHash()); - outShader = std::move(shader); - } - - return outShader; - }; - - typename ResourcesBundleScratch::Shaders& basic = scratch.shaders[GeometriesCpu::GP_BASIC]; - createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, basic.vertex); - createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, basic.fragment); - - typename ResourcesBundleScratch::Shaders& cone = scratch.shaders[GeometriesCpu::GP_CONE]; - createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.cone.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, cone.vertex); - createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, cone.fragment); // note we reuse fragment from basic! - - typename ResourcesBundleScratch::Shaders& ico = scratch.shaders[GeometriesCpu::GP_ICO]; - createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.ico.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, ico.vertex); - createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, ico.fragment); // note we reuse fragment from basic! 
- - for (const auto& it : scratch.shaders) - { - if (!it.vertex || !it.fragment) - { - logger->log("Could not create shaders!", ILogger::ELL_ERROR); - return false; - } - } - - return true; - } - - bool createGeometries() - { - EXPOSE_NABLA_NAMESPACES(); - - for (uint32_t i = 0; i < geometries.objects.size(); ++i) - { - const auto& inGeometry = geometries.objects[i]; - auto& [obj, meta] = scratch.objects[i]; - - bool status = true; - - meta.name = inGeometry.meta.name; - meta.type = inGeometry.meta.type; - - struct - { - SBlendParams blend; - SRasterizationParams rasterization; - typename Types::graphics_pipeline_t::SCreationParams pipeline; - } params; - - { - params.blend.logicOp = ELO_NO_OP; - - auto& b = params.blend.blendParams[0]; - b.srcColorFactor = EBF_SRC_ALPHA; - b.dstColorFactor = EBF_ONE_MINUS_SRC_ALPHA; - b.colorBlendOp = EBO_ADD; - b.srcAlphaFactor = EBF_SRC_ALPHA; - b.dstAlphaFactor = EBF_SRC_ALPHA; - b.alphaBlendOp = EBO_ADD; - b.colorWriteMask = (1u << 0u) | (1u << 1u) | (1u << 2u) | (1u << 3u); - } - - params.rasterization.faceCullingMode = EFCM_NONE; - { - const typename Types::shader_t::SSpecInfo info [] = - { - {.entryPoint = "VSMain", .shader = scratch.shaders[inGeometry.shadersType].vertex.get() }, - {.entryPoint = "PSMain", .shader = scratch.shaders[inGeometry.shadersType].fragment.get() } - }; - - params.pipeline.layout = scratch.pipelineLayout.get(); - params.pipeline.shaders = info; - params.pipeline.renderpass = scratch.renderpass.get(); - params.pipeline.cached = { .vertexInput = inGeometry.data.inputParams, .primitiveAssembly = inGeometry.data.assemblyParams, .rasterization = params.rasterization, .blend = params.blend, .subpassIx = 0u }; - - obj.indexCount = inGeometry.data.indexCount; - obj.indexType = inGeometry.data.indexType; - - // TODO: cache pipeline & try lookup for existing one first maybe - - // similar issue like with shaders again, in this case gpu contructor allows for extra cache parameters + there is no constructor you 
can use to fire make_smart_refctd_ptr yourself for cpu - if constexpr (withAssetConverter) - obj.pipeline = ICPUGraphicsPipeline::create(params.pipeline); - else - { - const std::array info = { { params.pipeline } }; - utilities->getLogicalDevice()->createGraphicsPipelines(nullptr, info, &obj.pipeline); - } - - if (!obj.pipeline) - { - logger->log("Could not create graphics pipeline for [%s] object!", ILogger::ELL_ERROR, meta.name.data()); - status = false; - } - - // object buffers - auto createVIBuffers = [&]() -> bool - { - using ibuffer_t = ::nbl::asset::IBuffer; // seems to be ambigous, both asset & core namespaces has IBuffer - - // note: similar issue like with shaders, this time with cpu-gpu constructors differing in arguments - auto vBuffer = smart_refctd_ptr(inGeometry.data.bindings[0].buffer); // no offset - constexpr static auto VERTEX_USAGE = bitflag(ibuffer_t::EUF_VERTEX_BUFFER_BIT) | ibuffer_t::EUF_TRANSFER_DST_BIT | ibuffer_t::EUF_INLINE_UPDATE_VIA_CMDBUF; - obj.bindings.vertex.offset = 0u; - - auto iBuffer = smart_refctd_ptr(inGeometry.data.indexBuffer.buffer); // no offset - constexpr static auto INDEX_USAGE = bitflag(ibuffer_t::EUF_INDEX_BUFFER_BIT) | ibuffer_t::EUF_VERTEX_BUFFER_BIT | ibuffer_t::EUF_TRANSFER_DST_BIT | ibuffer_t::EUF_INLINE_UPDATE_VIA_CMDBUF; - obj.bindings.index.offset = 0u; - - if constexpr (withAssetConverter) - { - if (!vBuffer) - return false; - - vBuffer->addUsageFlags(VERTEX_USAGE); - vBuffer->setContentHash(vBuffer->computeContentHash()); - obj.bindings.vertex = { .offset = 0u, .buffer = vBuffer }; - - if (inGeometry.data.indexType != EIT_UNKNOWN) - if (iBuffer) - { - iBuffer->addUsageFlags(INDEX_USAGE); - iBuffer->setContentHash(iBuffer->computeContentHash()); - } - else - return false; - - obj.bindings.index = { .offset = 0u, .buffer = iBuffer }; - } - else - { - auto vertexBuffer = utilities->getLogicalDevice()->createBuffer(IGPUBuffer::SCreationParams({ .size = vBuffer->getSize(), .usage = VERTEX_USAGE })); - auto 
indexBuffer = iBuffer ? utilities->getLogicalDevice()->createBuffer(IGPUBuffer::SCreationParams({ .size = iBuffer->getSize(), .usage = INDEX_USAGE })) : nullptr; - - if (!vertexBuffer) - return false; - - if (inGeometry.data.indexType != EIT_UNKNOWN) - if (!indexBuffer) - return false; - - const auto mask = utilities->getLogicalDevice()->getPhysicalDevice()->getUpStreamingMemoryTypeBits(); - for (auto it : { vertexBuffer , indexBuffer }) - { - if (it) - { - auto reqs = it->getMemoryReqs(); - reqs.memoryTypeBits &= mask; - - utilities->getLogicalDevice()->allocate(reqs, it.get()); - } - } - - // record transfer uploads - obj.bindings.vertex = { .offset = 0u, .buffer = std::move(vertexBuffer) }; - { - const SBufferRange range = { .offset = obj.bindings.vertex.offset, .size = obj.bindings.vertex.buffer->getSize(), .buffer = obj.bindings.vertex.buffer }; - if (!commandBuffer->updateBuffer(range, vBuffer->getPointer())) - { - logger->log("Could not record vertex buffer transfer upload for [%s] object!", ILogger::ELL_ERROR, meta.name.data()); - status = false; - } - } - obj.bindings.index = { .offset = 0u, .buffer = std::move(indexBuffer) }; - { - if (iBuffer) - { - const SBufferRange range = { .offset = obj.bindings.index.offset, .size = obj.bindings.index.buffer->getSize(), .buffer = obj.bindings.index.buffer }; - - if (!commandBuffer->updateBuffer(range, iBuffer->getPointer())) - { - logger->log("Could not record index buffer transfer upload for [%s] object!", ILogger::ELL_ERROR, meta.name.data()); - status = false; - } - } - } - } - - return true; - }; - - if (!createVIBuffers()) - { - logger->log("Could not create buffers for [%s] object!", ILogger::ELL_ERROR, meta.name.data()); - status = false; - } - - if (!status) - { - logger->log("[%s] object will not be created!", ILogger::ELL_ERROR, meta.name.data()); - - obj.bindings.vertex = {}; - obj.bindings.index = {}; - obj.indexCount = 0u; - obj.indexType = E_INDEX_TYPE::EIT_UNKNOWN; - obj.pipeline = nullptr; - - 
continue; - } - } - } - - return true; - } - - - struct GeometriesCpu - { - enum GeometryShader - { - GP_BASIC = 0, - GP_CONE, - GP_ICO, - - GP_COUNT - }; - - - }; - - struct Shaders - { - nbl::core::smart_refctd_ptr vertex = nullptr, fragment = nullptr; - }; - - std::array shaders; -}; -#endif - } #endif \ No newline at end of file From 173a3c960bcf10bf7705e1a0c30f49727db87daa Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 18 Jun 2025 18:17:11 +0700 Subject: [PATCH 261/296] Fix compile error for example 23 and 29 --- 23_Arithmetic2UnitTest/main.cpp | 16 ++++++++-------- 29_Arithmetic2Bench/main.cpp | 18 +++++++++--------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp index 65ef126ad..da0d3de7d 100644 --- a/23_Arithmetic2UnitTest/main.cpp +++ b/23_Arithmetic2UnitTest/main.cpp @@ -157,7 +157,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu exit(-1); } auto firstAssetInBundle = bundle.getContents()[0]; - return smart_refctd_ptr_static_cast(firstAssetInBundle); + return smart_refctd_ptr_static_cast(firstAssetInBundle); }; auto subgroupTestSource = getShaderSource("app_resources/testSubgroup.comp.hlsl"); @@ -263,18 +263,18 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu } // create pipeline (specialized every test) [TODO: turn into a future/async] - smart_refctd_ptr createPipeline(const ICPUShader* overridenUnspecialized, const uint8_t subgroupSizeLog2) + smart_refctd_ptr createPipeline(const IShader* overridenUnspecialized, const uint8_t subgroupSizeLog2) { - auto shader = m_device->createShader(overridenUnspecialized); + auto shader = m_device->compileShader({ overridenUnspecialized }); IGPUComputePipeline::SCreationParams params = {}; params.layout = pipelineLayout.get(); params.shader = { - .entryPoint = "main", .shader = shader.get(), + .entryPoint = "main", + .requiredSubgroupSize = 
static_cast(subgroupSizeLog2), .entries = nullptr, - .requiredSubgroupSize = static_cast(subgroupSizeLog2), - .requireFullSubgroups = true }; + params.cached.requireFullSubgroups = true; core::smart_refctd_ptr pipeline; if (!m_device->createComputePipelines(m_spirv_isa_cache.get(),{&params,1},&pipeline)) return nullptr; @@ -282,7 +282,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu } template class Arithmetic, bool WorkgroupTest> - bool runTest(const smart_refctd_ptr& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, bool useNative, uint32_t itemsPerWG, uint32_t itemsPerInvoc = 1u) + bool runTest(const smart_refctd_ptr& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, bool useNative, uint32_t itemsPerWG, uint32_t itemsPerInvoc = 1u) { std::string arith_name = Arithmetic>::name; const uint32_t workgroupSizeLog2 = hlsl::findMSB(workgroupSize); @@ -305,7 +305,7 @@ class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueu auto* includeFinder = compiler->getDefaultIncludeFinder(); options.preprocessorOptions.includeFinder = includeFinder; - smart_refctd_ptr overriddenUnspecialized; + smart_refctd_ptr overriddenUnspecialized; if constexpr (WorkgroupTest) { hlsl::workgroup2::SArithmeticConfiguration wgConfig; diff --git a/29_Arithmetic2Bench/main.cpp b/29_Arithmetic2Bench/main.cpp index 2d5afeb4c..61e94607b 100644 --- a/29_Arithmetic2Bench/main.cpp +++ b/29_Arithmetic2Bench/main.cpp @@ -346,12 +346,12 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub exit(-1); } auto firstAssetInBundle = bundle.getContents()[0]; - return smart_refctd_ptr_static_cast(firstAssetInBundle); + return smart_refctd_ptr_static_cast(firstAssetInBundle); }; // for each workgroup size (manually adjust items per invoc, operation else uses up a lot of ram) const auto MaxSubgroupSize = 
m_physicalDevice->getLimits().maxSubgroupSize; - smart_refctd_ptr shaderSource; + smart_refctd_ptr shaderSource; if constexpr (DoWorkgroupBenchmarks) shaderSource = getShaderSource("app_resources/benchmarkWorkgroup.comp.hlsl"); else @@ -496,18 +496,18 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub private: // create pipeline (specialized every test) [TODO: turn into a future/async] - smart_refctd_ptr createPipeline(const ICPUShader* overridenUnspecialized, const IGPUPipelineLayout* layout, const uint8_t subgroupSizeLog2) + smart_refctd_ptr createPipeline(const IShader* overridenUnspecialized, const IGPUPipelineLayout* layout, const uint8_t subgroupSizeLog2) { - auto shader = m_device->createShader(overridenUnspecialized); + auto shader = m_device->compileShader({ overridenUnspecialized }); IGPUComputePipeline::SCreationParams params = {}; params.layout = layout; params.shader = { - .entryPoint = "main", .shader = shader.get(), + .entryPoint = "main", + .requiredSubgroupSize = static_cast(subgroupSizeLog2), .entries = nullptr, - .requiredSubgroupSize = static_cast(subgroupSizeLog2), - .requireFullSubgroups = true }; + params.cached.requireFullSubgroups = true; core::smart_refctd_ptr pipeline; if (!m_device->createComputePipelines(nullptr,{&params,1},&pipeline)) return nullptr; @@ -522,7 +522,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub }; template - BenchmarkSet createBenchmarkPipelines(const smart_refctd_ptr&source, const IGPUPipelineLayout* layout, const uint32_t elementCount, const std::string& arith_name, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerInvoc = 1u, uint32_t numLoops = 
8u) { auto compiler = make_smart_refctd_ptr(smart_refctd_ptr(m_system)); CHLSLCompiler::SOptions options = {}; @@ -547,7 +547,7 @@ class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, pub hlsl::workgroup2::SArithmeticConfiguration wgConfig; wgConfig.init(workgroupSizeLog2, subgroupSizeLog2, itemsPerInvoc); const uint32_t itemsPerWG = wgConfig.VirtualWorkgroupSize * wgConfig.ItemsPerInvocation_0; - smart_refctd_ptr overriddenUnspecialized; + smart_refctd_ptr overriddenUnspecialized; if constexpr (WorkgroupBench) { const std::string definitions[4] = { From d03f31b1c623618cbfa43855c23183d7b20afd07 Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 18 Jun 2025 23:00:49 +0200 Subject: [PATCH 262/296] kill a CMakeLists.txt not used in the example PCH branch --- .../geometry/CGeometryCreatorScene.hpp | 16 ---- .../src/nbl/examples/geometry/CMakeLists.txt | 73 ------------------- 2 files changed, 89 deletions(-) delete mode 100644 common/src/nbl/examples/geometry/CMakeLists.txt diff --git a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp index 12c12e3f3..8a73f2e14 100644 --- a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp +++ b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp @@ -180,20 +180,4 @@ class CGeometryCreatorScene : public core::IReferenceCounted }; } -#endif - - - -#if 0 - typename ResourcesBundleScratch::Shaders& basic = scratch.shaders[GeometriesCpu::GP_BASIC]; - createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, basic.vertex); - createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, basic.fragment); - - typename ResourcesBundleScratch::Shaders& cone = scratch.shaders[GeometriesCpu::GP_CONE]; - createShader.template 
operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.cone.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, cone.vertex); - createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, cone.fragment); // note we reuse fragment from basic! - - typename ResourcesBundleScratch::Shaders& ico = scratch.shaders[GeometriesCpu::GP_ICO]; - createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.ico.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, ico.vertex); - createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, ico.fragment); // note we reuse fragment from basic! #endif \ No newline at end of file diff --git a/common/src/nbl/examples/geometry/CMakeLists.txt b/common/src/nbl/examples/geometry/CMakeLists.txt deleted file mode 100644 index c402a2b8a..000000000 --- a/common/src/nbl/examples/geometry/CMakeLists.txt +++ /dev/null @@ -1,73 +0,0 @@ -# TODO: let arek figure out how to redo the shaders -#[===[ - -# shaders IO directories -set(NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/shaders") -get_filename_component(_EXAMPLES_GEOMETRY_SPIRV_BR_BUNDLE_SEARCH_DIRECTORY_ "${CMAKE_CURRENT_BINARY_DIR}/shaders/include" ABSOLUTE) -get_filename_component(_EXAMPLES_GEOMETRY_SPIRV_BR_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/builtin/include" ABSOLUTE) -get_filename_component(_EXAMPLES_GEOMETRY_SPIRV_BR_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/builtin/src" ABSOLUTE) -set(NBL_EXAMPLES_GEOMETRY_OUTPUT_SPIRV_DIRECTORY "${_EXAMPLES_GEOMETRY_SPIRV_BR_BUNDLE_SEARCH_DIRECTORY_}/nbl/examples/geometry/spirv") - -# list of input source shaders -set(NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS - # geometry creator - 
"${NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS_DIRECTORY}/gc.basic.fragment.hlsl" - "${NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS_DIRECTORY}/gc.basic.vertex.hlsl" - "${NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS_DIRECTORY}/gc.cone.vertex.hlsl" - "${NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS_DIRECTORY}/gc.ico.vertex.hlsl" - - # grid - "${NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS_DIRECTORY}/grid.vertex.hlsl" - "${NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS_DIRECTORY}/grid.fragment.hlsl" -) - -file(GLOB_RECURSE NBL_EXAMPLES_GEOMETRY_INPUT_COMMONS CONFIGURE_DEPENDS "${NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS_DIRECTORY}/template/*.hlsl") - -include("${NBL_ROOT_PATH}/src/nbl/builtin/utils.cmake") - -foreach(NBL_INPUT_SHADER IN LISTS NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS) - cmake_path(GET NBL_INPUT_SHADER FILENAME NBL_INPUT_SHADER_FILENAME) - cmake_path(GET NBL_INPUT_SHADER_FILENAME STEM LAST_ONLY NBL_SHADER_STEM) # filename without .hlsl extension - cmake_path(GET NBL_SHADER_STEM EXTENSION LAST_ONLY NBL_SHADER_TYPE) # . - - set(NBL_OUTPUT_SPIRV_FILENAME "${NBL_SHADER_STEM}.spv") - set(NBL_OUTPUT_SPIRV_PATH "${NBL_EXAMPLES_GEOMETRY_OUTPUT_SPIRV_DIRECTORY}/${NBL_OUTPUT_SPIRV_FILENAME}") - - if(NBL_SHADER_TYPE STREQUAL .vertex) - set(NBL_NSC_COMPILE_OPTIONS -T vs_6_8 -E VSMain) - elseif(NBL_SHADER_TYPE STREQUAL .geometry) - set(NBL_NSC_COMPILE_OPTIONS -T gs_6_8 -E GSMain) - elseif(NBL_SHADER_TYPE STREQUAL .fragment) - set(NBL_NSC_COMPILE_OPTIONS -T ps_6_8 -E PSMain) - else() - message(FATAL_ERROR "Input shader is supposed to be ..hlsl!") - endif() - - set(NBL_NSC_COMPILE_COMMAND - "$" - -Fc "${NBL_OUTPUT_SPIRV_PATH}" - -I "${NBL_EXAMPLES_API_INCLUDE_DIRECTORY}" - ${NBL_NSC_COMPILE_OPTIONS} # this should come from shader's [#pragma WAVE ] but our NSC doesn't seem to work properly currently - "${NBL_INPUT_SHADER}" - ) - - set(NBL_DEPENDS - "${NBL_INPUT_SHADER}" - ${NBL_EXAMPLES_GEOMETRY_INPUT_COMMONS} - ) - - add_custom_command(OUTPUT "${NBL_OUTPUT_SPIRV_PATH}" - COMMAND ${NBL_NSC_COMPILE_COMMAND} - DEPENDS ${NBL_DEPENDS} - 
WORKING_DIRECTORY "${NBL_EXAMPLES_GEOMETRY_INPUT_SHADERS_DIRECTORY}" - COMMENT "Generating \"${NBL_OUTPUT_SPIRV_PATH}\"" - VERBATIM - COMMAND_EXPAND_LISTS - ) - - list(APPEND NBL_EXAMPLES_GEOMETRY_OUTPUT_SPIRV_BUILTINS "${NBL_OUTPUT_SPIRV_PATH}") - LIST_BUILTIN_RESOURCE(GEOMETRY_CREATOR_SPIRV_RESOURCES_TO_EMBED "geometry/spirv/${NBL_OUTPUT_SPIRV_FILENAME}") -endforeach() - -ADD_CUSTOM_BUILTIN_RESOURCES(geometryCreatorSpirvBRD GEOMETRY_CREATOR_SPIRV_RESOURCES_TO_EMBED "${_EXAMPLES_GEOMETRY_SPIRV_BR_BUNDLE_SEARCH_DIRECTORY_}" "nbl" "geometry::spirv::builtin" "${_EXAMPLES_GEOMETRY_SPIRV_BR_OUTPUT_DIRECTORY_HEADER_}" "${_EXAMPLES_GEOMETRY_SPIRV_BR_OUTPUT_DIRECTORY_SOURCE_}" "STATIC" "INTERNAL") -]===] \ No newline at end of file From f6aebbf086db6151f43798a622ab427ba7463142 Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 18 Jun 2025 23:02:19 +0200 Subject: [PATCH 263/296] commit shaders somewhere for now --- common/src/nbl/examples/CMakeLists.txt | 7 +- .../geometry/shaders/grid.vertex.hlsl | 6 -- .../template/gc.basic.vertex.input.hlsl | 18 ++--- .../geometry/shaders/template/gc.common.hlsl | 22 ++++--- .../template/gc.cone.vertex.input.hlsl | 15 ++--- .../shaders/template/gc.ico.vertex.input.hlsl | 16 ++--- .../geometry/shaders/template/gc.vertex.hlsl | 9 +-- .../shaders/template/grid.common.hlsl | 65 ++++++++++--------- 8 files changed, 70 insertions(+), 88 deletions(-) diff --git a/common/src/nbl/examples/CMakeLists.txt b/common/src/nbl/examples/CMakeLists.txt index 96ccaabea..032c038b4 100644 --- a/common/src/nbl/examples/CMakeLists.txt +++ b/common/src/nbl/examples/CMakeLists.txt @@ -1,9 +1,8 @@ -# TODO: @AnastaZluk redo the PCH -# add_subdirectory(pch EXCLUDE_FROM_ALL) - # we add common libraries # add_subdirectory(cameras EXCLUDE_FROM_ALL) # header only currently -add_subdirectory(geometry EXCLUDE_FROM_ALL) + +# TODO builtin SPIR-V shaders +# add_subdirectory(geometry EXCLUDE_FROM_ALL) # we get all available targets inclusive & below this directory 
NBL_GET_ALL_TARGETS(NBL_SUBDIRECTORY_TARGETS) diff --git a/common/src/nbl/examples/geometry/shaders/grid.vertex.hlsl b/common/src/nbl/examples/geometry/shaders/grid.vertex.hlsl index 167b981d3..389c37bf2 100644 --- a/common/src/nbl/examples/geometry/shaders/grid.vertex.hlsl +++ b/common/src/nbl/examples/geometry/shaders/grid.vertex.hlsl @@ -1,11 +1,5 @@ #include "template/grid.common.hlsl" -// set 1, binding 0 -[[vk::binding(0, 1)]] -cbuffer CameraData -{ - SBasicViewParameters params; -}; PSInput VSMain(VSInput input) { diff --git a/common/src/nbl/examples/geometry/shaders/template/gc.basic.vertex.input.hlsl b/common/src/nbl/examples/geometry/shaders/template/gc.basic.vertex.input.hlsl index d9e2fa172..862d4508e 100644 --- a/common/src/nbl/examples/geometry/shaders/template/gc.basic.vertex.input.hlsl +++ b/common/src/nbl/examples/geometry/shaders/template/gc.basic.vertex.input.hlsl @@ -1,16 +1,12 @@ -#ifndef _THIS_EXAMPLE_GC_BASIC_VERTEX_INPUT_HLSL_ -#define _THIS_EXAMPLE_GC_BASIC_VERTEX_INPUT_HLSL_ +#ifndef _NBL_EXAMPLES_GC_BASIC_VERTEX_INPUT_HLSL_ +#define _NBL_EXAMPLES_GC_BASIC_VERTEX_INPUT_HLSL_ -struct VSInput -{ - [[vk::location(0)]] float3 position : POSITION; - [[vk::location(1)]] float4 color : COLOR; - [[vk::location(2)]] float2 uv : TEXCOORD; - [[vk::location(3)]] float3 normal : NORMAL; -}; - -#endif // _THIS_EXAMPLE_GC_BASIC_VERTEX_INPUT_HLSL_ +[[vk::binding(0)]] Buffer position; +[[vk::binding(1)]] Buffer normal; +[[vk::binding(2)]] Buffer uv; +[[vk::binding(3)]] Buffer color; +#endif // _NBL_EXAMPLES_GC_BASIC_VERTEX_INPUT_HLSL_ /* do not remove this text, WAVE is so bad that you can get errors if no proper ending xD */ diff --git a/common/src/nbl/examples/geometry/shaders/template/gc.common.hlsl b/common/src/nbl/examples/geometry/shaders/template/gc.common.hlsl index 26e2885f7..ff40fb3c8 100644 --- a/common/src/nbl/examples/geometry/shaders/template/gc.common.hlsl +++ b/common/src/nbl/examples/geometry/shaders/template/gc.common.hlsl @@ -1,17 +1,21 
@@ -#ifndef _THIS_EXAMPLE_GC_COMMON_HLSL_ -#define _THIS_EXAMPLE_GC_COMMON_HLSL_ +#ifndef _NBL_EXAMPLES_GC_COMMON_HLSL_ +#define _NBL_EXAMPLES_GC_COMMON_HLSL_ + + +#include "common/SBasicViewParameters.hlsl" #ifdef __HLSL_VERSION - struct PSInput - { - float4 position : SV_Position; - float4 color : COLOR0; - }; +[[vk::push_constant]] SBasicViewParameters params; + +struct PSInput +{ + float4 position : SV_Position; + float3 color : COLOR0; +}; #endif // __HLSL_VERSION -#include "common/SBasicViewParameters.hlsl" -#endif // _THIS_EXAMPLE_GC_COMMON_HLSL_ +#endif // _NBL_EXAMPLES_GC_COMMON_HLSL_ /* do not remove this text, WAVE is so bad that you can get errors if no proper ending xD diff --git a/common/src/nbl/examples/geometry/shaders/template/gc.cone.vertex.input.hlsl b/common/src/nbl/examples/geometry/shaders/template/gc.cone.vertex.input.hlsl index 66221fef1..7c40f54ab 100644 --- a/common/src/nbl/examples/geometry/shaders/template/gc.cone.vertex.input.hlsl +++ b/common/src/nbl/examples/geometry/shaders/template/gc.cone.vertex.input.hlsl @@ -1,14 +1,11 @@ -#ifndef _THIS_EXAMPLE_GC_CONE_VERTEX_INPUT_HLSL_ -#define _THIS_EXAMPLE_GC_CONE_VERTEX_INPUT_HLSL_ +#ifndef _NBL_EXAMPLES_GEOMETRY_CONE_VERTEX_INPUT_HLSL_ +#define _NBL_EXAMPLES_GEOMETRY_CONE_VERTEX_INPUT_HLSL_ -struct VSInput -{ - [[vk::location(0)]] float3 position : POSITION; - [[vk::location(1)]] float4 color : COLOR; - [[vk::location(2)]] float3 normal : NORMAL; -}; +[[vk::binding(0)]] Buffer position; +[[vk::binding(1)]] Buffer normal; +[[vk::binding(2)]] Buffer color; -#endif // _THIS_EXAMPLE_GC_CONE_VERTEX_INPUT_HLSL_ +#endif // _NBL_EXAMPLES_GEOMETRY_CONE_VERTEX_INPUT_HLSL_ /* do not remove this text, WAVE is so bad that you can get errors if no proper ending xD diff --git a/common/src/nbl/examples/geometry/shaders/template/gc.ico.vertex.input.hlsl b/common/src/nbl/examples/geometry/shaders/template/gc.ico.vertex.input.hlsl index 6b85486d9..67092ccf0 100644 --- 
a/common/src/nbl/examples/geometry/shaders/template/gc.ico.vertex.input.hlsl +++ b/common/src/nbl/examples/geometry/shaders/template/gc.ico.vertex.input.hlsl @@ -1,15 +1,11 @@ -#ifndef _THIS_EXAMPLE_GC_ICO_VERTEX_INPUT_HLSL_ -#define _THIS_EXAMPLE_GC_ICO_VERTEX_INPUT_HLSL_ +#ifndef _NBL_EXAMPLES_GEOMETRY_ICO_VERTEX_INPUT_HLSL_ +#define _NBL_EXAMPLES_GEOMETRY_ICO_VERTEX_INPUT_HLSL_ -struct VSInput -{ - [[vk::location(0)]] float3 position : POSITION; - [[vk::location(1)]] float3 normal : NORMAL; - [[vk::location(2)]] float2 uv : TEXCOORD; -}; - -#endif // _THIS_EXAMPLE_GC_ICO_VERTEX_INPUT_HLSL_ +[[vk::binding(0)]] Buffer position; +[[vk::binding(1)]] Buffer normal; +[[vk::binding(2)]] Buffer uv; +#endif // _NBL_EXAMPLES_GEOMETRY_ICO_VERTEX_INPUT_HLSL_ /* do not remove this text, WAVE is so bad that you can get errors if no proper ending xD */ diff --git a/common/src/nbl/examples/geometry/shaders/template/gc.vertex.hlsl b/common/src/nbl/examples/geometry/shaders/template/gc.vertex.hlsl index 5a8f26722..e878bf7d7 100644 --- a/common/src/nbl/examples/geometry/shaders/template/gc.vertex.hlsl +++ b/common/src/nbl/examples/geometry/shaders/template/gc.vertex.hlsl @@ -1,13 +1,6 @@ #include "gc.common.hlsl" -// set 1, binding 0 -[[vk::binding(0, 1)]] -cbuffer CameraData -{ - SBasicViewParameters params; -}; - -PSInput VSMain(VSInput input) +PSInput VSMain() { PSInput output; diff --git a/common/src/nbl/examples/geometry/shaders/template/grid.common.hlsl b/common/src/nbl/examples/geometry/shaders/template/grid.common.hlsl index 616412245..7ec9017e9 100644 --- a/common/src/nbl/examples/geometry/shaders/template/grid.common.hlsl +++ b/common/src/nbl/examples/geometry/shaders/template/grid.common.hlsl @@ -1,40 +1,43 @@ -#ifndef _THIS_EXAMPLE_GRID_COMMON_HLSL_ -#define _THIS_EXAMPLE_GRID_COMMON_HLSL_ +#ifndef _NBL_EXAMPLES_GRID_COMMON_HLSL_ +#define _NBL_EXAMPLES_GRID_COMMON_HLSL_ + +#include "common/SBasicViewParameters.hlsl" #ifdef __HLSL_VERSION - struct VSInput - { - 
[[vk::location(0)]] float3 position : POSITION; - [[vk::location(1)]] float4 color : COLOR; - [[vk::location(2)]] float2 uv : TEXCOORD; - [[vk::location(3)]] float3 normal : NORMAL; - }; - - struct PSInput - { - float4 position : SV_Position; - float2 uv : TEXCOORD0; - }; - - float gridTextureGradBox(float2 p, float2 ddx, float2 ddy) - { - float N = 30.0; // grid ratio - float2 w = max(abs(ddx), abs(ddy)) + 0.01; // filter kernel - - // analytic (box) filtering - float2 a = p + 0.5 * w; - float2 b = p - 0.5 * w; - float2 i = (floor(a) + min(frac(a) * N, 1.0) - floor(b) - min(frac(b) * N, 1.0)) / (N * w); - - // pattern - return (1.0 - i.x) * (1.0 - i.y); - } +// TODO: why is there even a mesh with HW vertices for this? +struct VSInput +{ + [[vk::location(0)]] float3 position : POSITION; + [[vk::location(1)]] float4 color : COLOR; + [[vk::location(2)]] float2 uv : TEXCOORD; + [[vk::location(3)]] float3 normal : NORMAL; +}; + +struct PSInput +{ + float4 position : SV_Position; + float2 uv : TEXCOORD0; +}; + +[[vk::push_constant]] SBasicViewParameters params; #endif // __HLSL_VERSION -#include "common/SBasicViewParameters.hlsl" -#endif // _THIS_EXAMPLE_GRID_COMMON_HLSL_ +float gridTextureGradBox(float2 p, float2 ddx, float2 ddy) +{ + float N = 30.0; // grid ratio + float2 w = max(abs(ddx), abs(ddy)) + 0.01; // filter kernel + + // analytic (box) filtering + float2 a = p + 0.5 * w; + float2 b = p - 0.5 * w; + float2 i = (floor(a) + min(frac(a) * N, 1.0) - floor(b) - min(frac(b) * N, 1.0)) / (N * w); + + // pattern + return (1.0 - i.x) * (1.0 - i.y); +} +#endif // _NBL_EXAMPLES_GRID_COMMON_HLSL_ /* do not remove this text, WAVE is so bad that you can get errors if no proper ending xD */ \ No newline at end of file From 8965fb33e3cff0b0cf1d05f5fc4124072c8898e3 Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 19 Jun 2025 00:14:23 +0200 Subject: [PATCH 264/296] get ex 09 in line with Stageless Shaders --- .../geometry/CSimpleDebugRenderer.hpp | 76 +++++++++---------- 1 file 
changed, 37 insertions(+), 39 deletions(-) diff --git a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp index e18c6664a..bd190c082 100644 --- a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp +++ b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp @@ -97,13 +97,14 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted return nullptr; // load shader - smart_refctd_ptr shader; + smart_refctd_ptr shader; { - const auto bundle = assMan->getAsset("nbl/examples/geometry/spirv/unified.spv",{}); + const auto bundle = assMan->getAsset("nbl/examples/geometry/shaders/unified.hlsl",{}); + //const auto bundle = assMan->getAsset("nbl/examples/geometry/shaders/unified.spv",{}); const auto contents = bundle.getContents(); if (bundle.getAssetType()!=IAsset::ET_SHADER || contents.empty()) return nullptr; - shader = IAsset::castDown(contents[0]); + shader = IAsset::castDown(contents[0]); if (!shader) return nullptr; } @@ -163,48 +164,35 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted smart_refctd_ptr pipelines[PipelineType::Count] = {}; { IGPUGraphicsPipeline::SCreationParams params[PipelineType::Count] = {}; + params[PipelineType::BasicTriangleList].vertexShader = {.shader=shader.get(),.entryPoint="BasicTriangleListVS"}; + params[PipelineType::BasicTriangleList].fragmentShader = {.shader=shader.get(),.entryPoint="BasicFS"}; + params[PipelineType::BasicTriangleFan].vertexShader = {.shader=shader.get(),.entryPoint="BasicTriangleFanVS"}; + params[PipelineType::BasicTriangleFan].fragmentShader = {.shader=shader.get(),.entryPoint="BasicFS"}; + params[PipelineType::Cone].vertexShader = {.shader=shader.get(),.entryPoint="ConeVS"}; + params[PipelineType::Cone].fragmentShader = {.shader=shader.get(),.entryPoint="ConeFS"}; for (auto i=0; i< PipelineType::Count; i++) { - const auto type = static_cast(i); + params[i].layout = init.layout.get(); // no vertex 
input + auto& primitiveAssembly = params[i].cached.primitiveAssembly; + auto& rasterization = params[i].cached.rasterization; + auto& blend = params[i].cached.blend; + const auto type = static_cast(i); + switch (type) { - auto& primitiveAssembly = params[i].cached.primitiveAssembly; - switch (type) - { - case PipelineType::BasicTriangleFan: - primitiveAssembly.primitiveType = E_PRIMITIVE_TOPOLOGY::EPT_TRIANGLE_FAN; - break; - default: - primitiveAssembly.primitiveType = E_PRIMITIVE_TOPOLOGY::EPT_TRIANGLE_LIST; - break; - } - primitiveAssembly.primitiveRestartEnable = false; - primitiveAssembly.tessPatchVertCount = 3; - } - { - auto& rasterization = params[i].cached.rasterization; - rasterization.faceCullingMode = EFCM_NONE; - } - { - auto& blend = params[i].cached.blend; - // everything as default + case PipelineType::BasicTriangleFan: + primitiveAssembly.primitiveType = E_PRIMITIVE_TOPOLOGY::EPT_TRIANGLE_FAN; + break; + default: + primitiveAssembly.primitiveType = E_PRIMITIVE_TOPOLOGY::EPT_TRIANGLE_LIST; + break; } + primitiveAssembly.primitiveRestartEnable = false; + primitiveAssembly.tessPatchVertCount = 3; + rasterization.faceCullingMode = EFCM_NONE; params[i].cached.subpassIx = subpassIX; params[i].renderpass = renderpass; } - /* - typename ResourcesBundleScratch::Shaders& basic = scratch.shaders[GeometriesCpu::GP_BASIC]; - createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, basic.vertex); - createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, basic.fragment); - - typename ResourcesBundleScratch::Shaders& cone = scratch.shaders[GeometriesCpu::GP_CONE]; - createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.cone.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, cone.vertex); - createShader.template 
operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, cone.fragment); // note we reuse fragment from basic! - - typename ResourcesBundleScratch::Shaders& ico = scratch.shaders[GeometriesCpu::GP_ICO]; - createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.ico.vertex.spv") > (IShader::E_SHADER_STAGE::ESS_VERTEX, ico.vertex); - createShader.template operator() < NBL_CORE_UNIQUE_STRING_LITERAL_TYPE("geometryCreator/spirv/gc.basic.fragment.spv") > (IShader::E_SHADER_STAGE::ESS_FRAGMENT, ico.fragment); // note we reuse fragment from basic! - */ if (!device->createGraphicsPipelines(nullptr,params,pipelines)) { logger->log("Could not create Graphics Pipelines!",ILogger::ELL_ERROR); @@ -231,8 +219,18 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted if (!geom->valid()) continue; auto& out = init.geoms.emplace_back(); -// TODO: handle special cases - out.pipeline = pipelines[PipelineType::BasicTriangleList]; + switch (geom->getIndexingCallback()->knownTopology()) + { + case E_PRIMITIVE_TOPOLOGY::EPT_TRIANGLE_FAN: + out.pipeline = pipelines[PipelineType::BasicTriangleFan]; + break; + default: + out.pipeline = pipelines[PipelineType::BasicTriangleList]; + break; + } + // special case + if (entry.name=="Cone") + out.pipeline = pipelines[PipelineType::Cone]; if (const auto& view=geom->getIndexView(); view) { out.indexBuffer.offset = view.src.offset; From a860c432a1a05f8fb114fd8574d3eae6f60c4204 Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 19 Jun 2025 01:18:24 +0200 Subject: [PATCH 265/296] fix up examples before the PCH PR --- 03_DeviceSelectionAndSharedSources/main.cpp | 5 ++- 07_StagingAndMultipleQueues/main.cpp | 7 +--- 08_HelloSwapchain/main.cpp | 2 +- 10_CountingSort/main.cpp | 4 +-- 24_ColorSpaceTest/main.cpp | 9 +++-- 26_Blur/main.cpp | 15 ++++---- 27_MPMCScheduler/main.cpp | 9 +++-- 28_FFTBloom/main.cpp | 11 +++--- 
.../include/nbl/this_example/common.hpp | 16 +++------ 30_ComputeShaderPathTracer/main.cpp | 10 +++--- 70_FLIPFluids/main.cpp | 34 ++++++++----------- common/include/nbl/examples/examples.hpp | 11 ++++++ 12 files changed, 60 insertions(+), 73 deletions(-) create mode 100644 common/include/nbl/examples/examples.hpp diff --git a/03_DeviceSelectionAndSharedSources/main.cpp b/03_DeviceSelectionAndSharedSources/main.cpp index 5fb584e4d..6c99aff7f 100644 --- a/03_DeviceSelectionAndSharedSources/main.cpp +++ b/03_DeviceSelectionAndSharedSources/main.cpp @@ -2,10 +2,9 @@ // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/application_templates/MonoDeviceApplication.hpp" -#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" +#include "nbl/examples/examples.hpp" +// TODO: why isn't this in `nabla.h` ? #include "nbl/asset/metadata/CHLSLMetadata.h" -#include "CommonPCH/PCH.hpp" using namespace nbl; using namespace core; diff --git a/07_StagingAndMultipleQueues/main.cpp b/07_StagingAndMultipleQueues/main.cpp index 17c64d30e..a1a06f4f4 100644 --- a/07_StagingAndMultipleQueues/main.cpp +++ b/07_StagingAndMultipleQueues/main.cpp @@ -3,12 +3,7 @@ // For conditions of distribution and use, see copyright notice in nabla.h // I've moved out a tiny part of this example into a shared header for reuse, please open and read it. - -#include "nbl/application_templates/BasicMultiQueueApplication.hpp" -#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" - -// get asset converter -#include "CommonPCH/PCH.hpp" +#include "nbl/examples/examples.hpp" using namespace nbl; using namespace core; diff --git a/08_HelloSwapchain/main.cpp b/08_HelloSwapchain/main.cpp index 9137fe77a..cd294b0d2 100644 --- a/08_HelloSwapchain/main.cpp +++ b/08_HelloSwapchain/main.cpp @@ -1,7 +1,7 @@ // Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. 
// This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#include "SimpleWindowedApplication.hpp" +#include "nbl/examples/examples.hpp" // #include "nbl/video/surface/CSurfaceVulkan.h" diff --git a/10_CountingSort/main.cpp b/10_CountingSort/main.cpp index de2ffca8b..0efc0518e 100644 --- a/10_CountingSort/main.cpp +++ b/10_CountingSort/main.cpp @@ -1,6 +1,4 @@ -#include "nbl/application_templates/MonoDeviceApplication.hpp" -#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" -#include "CommonPCH/PCH.hpp" +#include "nbl/examples/examples.hpp" using namespace nbl; using namespace core; diff --git a/24_ColorSpaceTest/main.cpp b/24_ColorSpaceTest/main.cpp index fae93cf45..56af4fc79 100644 --- a/24_ColorSpaceTest/main.cpp +++ b/24_ColorSpaceTest/main.cpp @@ -1,10 +1,8 @@ // Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" -#include "SimpleWindowedApplication.hpp" +#include "nbl/examples/examples.hpp" -#include "nbl/video/surface/CSurfaceVulkan.h" #include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h" #include "nlohmann/json.hpp" @@ -19,13 +17,14 @@ using namespace system; using namespace asset; using namespace ui; using namespace video; +using namespace nbl::examples; // defines for sampler tests can be found in the file below #include "app_resources/push_constants.hlsl" -class ColorSpaceTestSampleApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication +class ColorSpaceTestSampleApp final : public SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication { - using device_base_t = examples::SimpleWindowedApplication; + using 
device_base_t = SimpleWindowedApplication; using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; using clock_t = std::chrono::steady_clock; using perf_clock_resolution_t = std::chrono::milliseconds; diff --git a/26_Blur/main.cpp b/26_Blur/main.cpp index bd4b6dedc..e5105c778 100644 --- a/26_Blur/main.cpp +++ b/26_Blur/main.cpp @@ -1,27 +1,24 @@ // Copyright (C) 2024-2025 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h +#include "nbl/examples/examples.hpp" + #include #include -#include "nabla.h" -#include "SimpleWindowedApplication.hpp" -#include "InputSystem.hpp" -#include "CEventCallback.hpp" -#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" - using namespace nbl; using namespace nbl::core; using namespace nbl::system; using namespace nbl::asset; using namespace nbl::ui; using namespace nbl::video; +using namespace nbl::examples; #include "app_resources/common.hlsl" -class BlurApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication +class BlurApp final : public SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication { - using device_base_t = examples::SimpleWindowedApplication; + using device_base_t = SimpleWindowedApplication; using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; using clock_t = std::chrono::steady_clock; @@ -262,7 +259,7 @@ class BlurApp final : public examples::SimpleWindowedApplication, public applica ISPIRVOptimizer::EOP_LOCAL_MULTI_STORE_ELIM }; auto opt = make_smart_refctd_ptr(optPasses); - shader = m_device->createShader(source.get(), opt.get()); + shader = m_device->compileShader({ source.get(),opt.get() }); #else shader = m_device->compileShader({ source.get() }); #endif diff --git 
a/27_MPMCScheduler/main.cpp b/27_MPMCScheduler/main.cpp index 33768c981..18d396135 100644 --- a/27_MPMCScheduler/main.cpp +++ b/27_MPMCScheduler/main.cpp @@ -1,9 +1,7 @@ // Copyright (C) 2024-2025 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#include "nabla.h" -#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" -#include "SimpleWindowedApplication.hpp" +#include "nbl/examples/examples.hpp" using namespace nbl; using namespace nbl::core; @@ -11,12 +9,13 @@ using namespace nbl::system; using namespace nbl::asset; using namespace nbl::ui; using namespace nbl::video; +using namespace nbl::examples; #include "app_resources/common.hlsl" -class MPMCSchedulerApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication +class MPMCSchedulerApp final : public SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication { - using device_base_t = examples::SimpleWindowedApplication; + using device_base_t = SimpleWindowedApplication; using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; using clock_t = std::chrono::steady_clock; diff --git a/28_FFTBloom/main.cpp b/28_FFTBloom/main.cpp index b528d3c41..16835ecf6 100644 --- a/28_FFTBloom/main.cpp +++ b/28_FFTBloom/main.cpp @@ -1,9 +1,7 @@ // Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h - -#include "SimpleWindowedApplication.hpp" -#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" +#include "nbl/examples/examples.hpp" using namespace nbl; using namespace core; @@ -11,6 +9,7 @@ using namespace system; using namespace asset; using namespace video; using namespace ui; +using namespace nbl::examples; #include "app_resources/common.hlsl" #include "nbl/builtin/hlsl/bit.hlsl" @@ -19,9 +18,9 @@ using namespace ui; constexpr uint32_t WIN_W = 1280; constexpr uint32_t WIN_H = 720; -class FFTBloomApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication +class FFTBloomApp final : public SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication { - using device_base_t = examples::SimpleWindowedApplication; + using device_base_t = SimpleWindowedApplication; using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; using clock_t = std::chrono::steady_clock; @@ -212,7 +211,7 @@ class FFTBloomApp final : public examples::SimpleWindowedApplication, public app #ifndef _NBL_DEBUG ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO; auto opt = make_smart_refctd_ptr(std::span(&optPasses, 1)); - return m_device->createShader({ HLSLShader.get(), opt.get(), m_readCache.get(), m_writeCache.get()}); + return m_device->compileShader({ HLSLShader.get(), opt.get(), m_readCache.get(), m_writeCache.get()}); #else return m_device->compileShader({ HLSLShader.get(), nullptr, m_readCache.get(), m_writeCache.get() }); #endif diff --git a/30_ComputeShaderPathTracer/include/nbl/this_example/common.hpp b/30_ComputeShaderPathTracer/include/nbl/this_example/common.hpp index ff3dd8095..3745ca512 100644 --- a/30_ComputeShaderPathTracer/include/nbl/this_example/common.hpp +++ 
b/30_ComputeShaderPathTracer/include/nbl/this_example/common.hpp @@ -1,17 +1,11 @@ -#ifndef __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__ -#define __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__ +#ifndef _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_ +#define _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_ -#include - -// common api -#include "CCamera.hpp" -#include "SimpleWindowedApplication.hpp" -#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" -#include "CEventCallback.hpp" +#include "nbl/examples/examples.hpp" // example's own headers -#include "nbl/ui/ICursorControl.h" +#include "nbl/ui/ICursorControl.h" // TODO: why not in nabla.h ? #include "nbl/ext/ImGui/ImGui.h" #include "imgui/imgui_internal.h" -#endif // __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__ \ No newline at end of file +#endif // _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_ \ No newline at end of file diff --git a/30_ComputeShaderPathTracer/main.cpp b/30_ComputeShaderPathTracer/main.cpp index 62602c7f9..487388ea0 100644 --- a/30_ComputeShaderPathTracer/main.cpp +++ b/30_ComputeShaderPathTracer/main.cpp @@ -1,12 +1,13 @@ // Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h +#include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h" #include "nbl/this_example/common.hpp" -#include "nbl/asset/interchange/IImageAssetHandlerBase.h" -#include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h" + #include "nbl/builtin/hlsl/surface_transform.h" + using namespace nbl; using namespace core; using namespace hlsl; @@ -14,6 +15,7 @@ using namespace system; using namespace asset; using namespace ui; using namespace video; +using namespace nbl::examples; // TODO: share push constants struct PTPushConstant { @@ -24,9 +26,9 @@ struct PTPushConstant { // TODO: Add a QueryPool for timestamping once its ready (actually add IMGUI mspf plotter) // TODO: Do buffer creation using assConv -class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication +class ComputeShaderPathtracer final : public SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication { - using device_base_t = examples::SimpleWindowedApplication; + using device_base_t = SimpleWindowedApplication; using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; using clock_t = std::chrono::steady_clock; diff --git a/70_FLIPFluids/main.cpp b/70_FLIPFluids/main.cpp index c0f68ca49..d66b56811 100644 --- a/70_FLIPFluids/main.cpp +++ b/70_FLIPFluids/main.cpp @@ -1,30 +1,24 @@ -#include - -#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" -#include "SimpleWindowedApplication.hpp" -#include "InputSystem.hpp" -#include "CCamera.hpp" - -#include "glm/glm/glm.hpp" -#include -#include - +// Copyright (C) 2024-2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h +#include "nbl/examples/examples.hpp" +// TODO: why is it not in nabla.h ? #include "nbl/asset/metadata/CHLSLMetadata.h" +using namespace nbl::core; using namespace nbl::hlsl; -using namespace nbl; -using namespace core; -using namespace hlsl; -using namespace system; -using namespace asset; -using namespace ui; -using namespace video; +using namespace nbl::system; +using namespace nbl::asset; +using namespace nbl::ui; +using namespace nbl::video; +using namespace nbl::examples; #include "app_resources/common.hlsl" #include "app_resources/gridUtils.hlsl" #include "app_resources/render_common.hlsl" #include "app_resources/descriptor_bindings.hlsl" + enum SimPresets { CENTER_DROP, @@ -167,9 +161,9 @@ class CEventCallback : public ISimpleManagedSurface::ICallback nbl::system::logger_opt_smart_ptr m_logger = nullptr; }; -class FLIPFluidsApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication +class FLIPFluidsApp final : public SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication { - using device_base_t = examples::SimpleWindowedApplication; + using device_base_t = SimpleWindowedApplication; using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; using clock_t = std::chrono::steady_clock; diff --git a/common/include/nbl/examples/examples.hpp b/common/include/nbl/examples/examples.hpp new file mode 100644 index 000000000..a7d8f92e4 --- /dev/null +++ b/common/include/nbl/examples/examples.hpp @@ -0,0 +1,11 @@ +// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_EXAMPLES_HPP_ +#define _NBL_EXAMPLES_HPP_ + + +#include "nbl/examples/PCH.hpp" + + +#endif // _NBL_EXAMPLES_HPP_ \ No newline at end of file From 2dc268211efc70e8319114d32ba5749e9fcd9a4e Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 19 Jun 2025 01:21:02 +0200 Subject: [PATCH 266/296] prep example 71 for @kevinyu Use `ICPUPolygonGeometry::exportForBLAS` to make the Triangle Geometries https://github.com/Devsh-Graphics-Programming/Nabla/blob/1f52d2f48ff077cb430cc78285fb12dd7e093f74/include/nbl/asset/IPolygonGeometry.h#L209 --- 71_RayTracingPipeline/include/common.hpp | 99 +++--------------------- 71_RayTracingPipeline/main.cpp | 20 +++-- 2 files changed, 24 insertions(+), 95 deletions(-) diff --git a/71_RayTracingPipeline/include/common.hpp b/71_RayTracingPipeline/include/common.hpp index 3b66fd3e9..c60e0c3e5 100644 --- a/71_RayTracingPipeline/include/common.hpp +++ b/71_RayTracingPipeline/include/common.hpp @@ -1,97 +1,22 @@ -#ifndef __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__ -#define __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__ +#ifndef _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_ +#define _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_ -#include -#include "nbl/asset/utils/CGeometryCreator.h" -#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" +#include "nbl/examples/examples.hpp" -#include "SimpleWindowedApplication.hpp" - -#include "InputSystem.hpp" -#include "CEventCallback.hpp" - -#include "CCamera.hpp" - -#include -#include -#include +using namespace nbl; +using namespace nbl::core; +using namespace nbl::hlsl; +using namespace nbl::system; +using namespace nbl::asset; +using namespace nbl::ui; +using namespace nbl::video; +using namespace nbl::application_templates; +using namespace nbl::examples; #include "nbl/ui/ICursorControl.h" #include "nbl/ext/ImGui/ImGui.h" #include "imgui/imgui_internal.h" -using namespace nbl; -using namespace core; -using namespace 
hlsl; -using namespace system; -using namespace asset; -using namespace ui; -using namespace video; -using namespace scene; - #include "app_resources/common.hlsl" -namespace nbl::scene -{ - -enum ObjectType : uint8_t -{ - OT_CUBE, - OT_SPHERE, - OT_CYLINDER, - OT_RECTANGLE, - OT_DISK, - OT_ARROW, - OT_CONE, - OT_ICOSPHERE, - - OT_COUNT, - OT_UNKNOWN = std::numeric_limits::max() -}; - -static constexpr uint32_t s_smoothNormals[OT_COUNT] = { 0, 1, 1, 0, 0, 1, 1, 1 }; - -struct ObjectMeta -{ - ObjectType type = OT_UNKNOWN; - std::string_view name = "Unknown"; -}; - -struct ObjectDrawHookCpu -{ - nbl::core::matrix3x4SIMD model; - nbl::asset::SBasicViewParameters viewParameters; - ObjectMeta meta; -}; - -struct ReferenceObjectCpu -{ - ObjectMeta meta; - nbl::asset::CGeometryCreator::return_type data; - Material material; - core::matrix3x4SIMD transform; -}; - -struct ReferenceObjectGpu -{ - struct Bindings - { - nbl::asset::SBufferBinding vertex, index; - }; - - ObjectMeta meta; - Bindings bindings; - uint32_t vertexStride; - nbl::asset::E_INDEX_TYPE indexType = nbl::asset::E_INDEX_TYPE::EIT_UNKNOWN; - uint32_t indexCount = {}; - MaterialPacked material; - core::matrix3x4SIMD transform; - - const bool useIndex() const - { - return bindings.index.buffer && (indexType != E_INDEX_TYPE::EIT_UNKNOWN); - } -}; -} - #endif // __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__ diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index 519ff8473..453e9cf69 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -1,15 +1,15 @@ // Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h - #include "common.hpp" + #include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h" #include "nbl/builtin/hlsl/indirect_commands.hlsl" -class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication +class RaytracingPipelineApp final : public SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication { - using device_base_t = examples::SimpleWindowedApplication; + using device_base_t = SimpleWindowedApplication; using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; using clock_t = std::chrono::steady_clock; @@ -375,12 +375,11 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, } auto assetManager = make_smart_refctd_ptr(smart_refctd_ptr(system)); - auto* geometryCreator = assetManager->getGeometryCreator(); if (!createIndirectBuffer()) return logFail("Could not create indirect buffer"); - if (!createAccelerationStructuresFromGeometry(geometryCreator)) + if (!createAccelerationStructuresFromGeometry()) return logFail("Could not create acceleration structures from geometry creator"); ISampler::SParams samplerParams = { @@ -1082,7 +1081,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, return true; } - bool createAccelerationStructuresFromGeometry(const IGeometryCreator* gc) + bool createAccelerationStructuresFromGeometry() { auto queue = getGraphicsQueue(); // get geometries into ICPUBuffers @@ -1109,6 +1108,10 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, planeTransform.setRotation(quaternion::fromAngleAxis(core::radians(-90.0f), vector3df_SIMD{ 1, 0, 0 })); // triangles geometries + auto geometryCreator = make_smart_refctd_ptr(); +#if 1 + return false; +#else const auto cpuObjects = std::array{ ReferenceObjectCpu { 
.meta = {.type = OT_RECTANGLE, .name = "Plane Mesh"}, @@ -1513,7 +1516,7 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, params.size = geomInfoBuffer->getSize(); m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), geomInfos).move_into(m_triangleGeomInfoBuffer); } - +#endif return true; } @@ -1567,7 +1570,8 @@ class RaytracingPipelineApp final : public examples::SimpleWindowedApplication, } m_ui; core::smart_refctd_ptr m_guiDescriptorSetPool; - core::vector m_gpuTriangleGeometries; + // TODO: how much of this do we actually have to keep ? +// core::vector m_gpuTriangleGeometries; core::vector m_gpuIntersectionSpheres; uint32_t m_intersectionHitGroupIdx; From be46ec3d8ccb4dfca3768be03089f01912804c96 Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 19 Jun 2025 01:28:21 +0200 Subject: [PATCH 267/296] prep example 67 for further work --- 67_RayQueryGeometry/include/common.hpp | 101 +++---------------------- 67_RayQueryGeometry/main.cpp | 34 +++------ 2 files changed, 23 insertions(+), 112 deletions(-) diff --git a/67_RayQueryGeometry/include/common.hpp b/67_RayQueryGeometry/include/common.hpp index 0595c7203..bcf896f55 100644 --- a/67_RayQueryGeometry/include/common.hpp +++ b/67_RayQueryGeometry/include/common.hpp @@ -1,95 +1,18 @@ -#ifndef __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__ -#define __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__ +#ifndef _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_ +#define _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_ -#include -#include "nbl/asset/utils/CGeometryCreator.h" -#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" - -#include "SimpleWindowedApplication.hpp" - -#include "InputSystem.hpp" -#include "CEventCallback.hpp" - -#include "CCamera.hpp" - -#include -#include +#include "nbl/examples/examples.hpp" using namespace nbl; -using namespace core; -using namespace hlsl; -using namespace system; -using namespace asset; -using namespace ui; 
-using namespace video; -using namespace scene; +using namespace nbl::core; +using namespace nbl::hlsl; +using namespace nbl::system; +using namespace nbl::asset; +using namespace nbl::ui; +using namespace nbl::video; +using namespace nbl::application_templates; +using namespace nbl::examples; #include "app_resources/common.hlsl" -namespace nbl::scene -{ -enum ObjectType : uint8_t -{ - OT_CUBE, - OT_SPHERE, - OT_CYLINDER, - OT_RECTANGLE, - OT_DISK, - OT_ARROW, - OT_CONE, - OT_ICOSPHERE, - - OT_COUNT, - OT_UNKNOWN = std::numeric_limits::max() -}; - -struct ObjectMeta -{ - ObjectType type = OT_UNKNOWN; - std::string_view name = "Unknown"; -}; - -struct ObjectDrawHookCpu -{ - nbl::core::matrix3x4SIMD model; - nbl::asset::SBasicViewParameters viewParameters; - ObjectMeta meta; -}; - -enum GeometryShader -{ - GP_BASIC = 0, - GP_CONE, - GP_ICO, - - GP_COUNT -}; - -struct ReferenceObjectCpu -{ - ObjectMeta meta; - GeometryShader shadersType; - nbl::asset::CGeometryCreator::return_type data; -}; - -struct ReferenceObjectGpu -{ - struct Bindings - { - nbl::asset::SBufferBinding vertex, index; - }; - - ObjectMeta meta; - Bindings bindings; - uint32_t vertexStride; - nbl::asset::E_INDEX_TYPE indexType = nbl::asset::E_INDEX_TYPE::EIT_UNKNOWN; - uint32_t indexCount = {}; - - const bool useIndex() const - { - return bindings.index.buffer && (indexType != E_INDEX_TYPE::EIT_UNKNOWN); - } -}; -} - -#endif // __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__ \ No newline at end of file +#endif // _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_ \ No newline at end of file diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp index fdee5c5a1..495f3a3e2 100644 --- a/67_RayQueryGeometry/main.cpp +++ b/67_RayQueryGeometry/main.cpp @@ -3,10 +3,10 @@ // For conditions of distribution and use, see copyright notice in nabla.h #include "common.hpp" -class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, public 
application_templates::MonoAssetManagerAndBuiltinResourceApplication +class RayQueryGeometryApp final : public SimpleWindowedApplication, public MonoAssetManagerAndBuiltinResourceApplication { - using device_base_t = examples::SimpleWindowedApplication; - using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; + using device_base_t = SimpleWindowedApplication; + using asset_base_t = MonoAssetManagerAndBuiltinResourceApplication; using clock_t = std::chrono::steady_clock; constexpr static inline uint32_t WIN_W = 1280, WIN_H = 720; @@ -121,7 +121,6 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu return logFail("Could not create HDR Image"); auto assetManager = make_smart_refctd_ptr(smart_refctd_ptr(system)); - auto* geometryCreator = assetManager->getGeometryCreator(); auto cQueue = getComputeQueue(); @@ -138,9 +137,9 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu std::this_thread::yield(); } // Nsight is special and can't capture anything not on the queue that performs the swapchain acquire/release - createAccelerationStructureDS(gQueue,geometryCreator); + createAccelerationStructureDS(gQueue); #else - createAccelerationStructureDS(cQueue,geometryCreator); + createAccelerationStructureDS(cQueue); #endif if (!renderDs) return logFail("Could not create acceleration structures and descriptor set"); @@ -258,11 +257,9 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu cmdbuf->beginDebugMarker("RayQueryGeometryApp Frame"); { camera.beginInputProcessing(nextPresentationTimestamp); - mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void { camera.mouseProcess(events); mouseProcess(events); }, m_logger.get()); + mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void { camera.mouseProcess(events); }, m_logger.get()); keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void { 
camera.keyboardProcess(events); }, m_logger.get()); camera.endInputProcessing(nextPresentationTimestamp); - - const auto type = static_cast(gcIndex); } const auto viewMatrix = camera.getViewMatrix(); @@ -487,9 +484,12 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu return (dim + size - 1) / size; } - smart_refctd_ptr createAccelerationStructureDS(video::CThreadSafeQueueAdapter* queue, const IGeometryCreator* gc) + smart_refctd_ptr createAccelerationStructureDS(video::CThreadSafeQueueAdapter* queue) { // get geometries in ICPUBuffers +#if 1 + return nullptr; +#else std::array objectsCpu; objectsCpu[OT_CUBE] = ReferenceObjectCpu{ .meta = {.type = OT_CUBE, .name = "Cube Mesh" }, .shadersType = GP_BASIC, .data = gc->createCubeMesh(nbl::core::vector3df(1.f, 1.f, 1.f)) }; objectsCpu[OT_SPHERE] = ReferenceObjectCpu{ .meta = {.type = OT_SPHERE, .name = "Sphere Mesh" }, .shadersType = GP_BASIC, .data = gc->createSphereMesh(2, 16, 16) }; @@ -892,6 +892,7 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu m_api->endCapture(); return reservation.getGPUObjects().front().value; +#endif } @@ -915,19 +916,6 @@ class RayQueryGeometryApp final : public examples::SimpleWindowedApplication, pu smart_refctd_ptr renderPipeline; smart_refctd_ptr renderDs; - - uint16_t gcIndex = {}; - - void mouseProcess(const nbl::ui::IMouseEventChannel::range_t& events) - { - for (auto eventIt = events.begin(); eventIt != events.end(); eventIt++) - { - auto ev = *eventIt; - - if (ev.type == nbl::ui::SMouseEvent::EET_SCROLL) - gcIndex = std::clamp(int16_t(gcIndex) + int16_t(core::sign(ev.scrollEvent.verticalScroll)), int64_t(0), int64_t(OT_COUNT - (uint8_t)1u)); - } - } }; NBL_MAIN_FUNC(RayQueryGeometryApp) \ No newline at end of file From 9aa748347856f53e4847d0d17b4be521481e7857 Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 19 Jun 2025 01:33:27 +0200 Subject: [PATCH 268/296] @AnastaZIuk I need your help for getting the mounting of 
`common/include/nbl/examples` under `nbl/examples` either as dir or builtin archive --- .../app_resources/testWorkgroup.comp.hlsl | 3 ++- .../app_resources/benchmarkWorkgroup.comp.hlsl | 3 ++- 29_Arithmetic2Bench/main.cpp | 16 ++++++++-------- .../examples/workgroup/DataAccessors.hlsl} | 13 ++++++++++--- 4 files changed, 22 insertions(+), 13 deletions(-) rename common/include/{WorkgroupDataAccessors.hlsl => nbl/examples/workgroup/DataAccessors.hlsl} (96%) diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl index 2a32ed20e..a3e70b8ff 100644 --- a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl @@ -14,7 +14,8 @@ typedef vector type_t; // final (level 1/2) scan needs to fit in one subgroup exactly groupshared uint32_t scratch[mpl::max_v]; -#include "../../common/include/WorkgroupDataAccessors.hlsl" +#include "nbl/examples/workgroup/DataAccessors.hlsl" +using namespace nbl::hlsl::examples::workgroup; static ScratchProxy arithmeticAccessor; diff --git a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl index a56945467..58912691f 100644 --- a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl +++ b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl @@ -15,7 +15,8 @@ typedef vector type_t; // final (level 1/2) scan needs to fit in one subgroup exactly groupshared uint32_t scratch[mpl::max_v]; -#include "../../common/include/WorkgroupDataAccessors.hlsl" +#include "nbl/examples/workgroup/DataAccessors.hlsl" +using namespace nbl::hlsl::examples::workgroup; template struct RandomizedInputDataProxy diff --git a/29_Arithmetic2Bench/main.cpp b/29_Arithmetic2Bench/main.cpp index 61e94607b..0a0e3b35f 100644 --- a/29_Arithmetic2Bench/main.cpp +++ b/29_Arithmetic2Bench/main.cpp @@ -1,16 +1,16 @@ -#include 
"SimpleWindowedApplication.hpp" -#include "CEventCallback.hpp" -#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" +#include "nbl/examples/examples.hpp" #include "app_resources/common.hlsl" #include "nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl" #include "nbl/builtin/hlsl/subgroup2/arithmetic_params.hlsl" using namespace nbl; -using namespace core; -using namespace system; -using namespace asset; -using namespace ui; -using namespace video; +using namespace nbl::core; +using namespace nbl::system; +using namespace nbl::asset; +using namespace nbl::ui; +using namespace nbl::video; +using namespace nbl::examples; + template requires std::is_base_of_v class CExplicitSurfaceFormatResizeSurface final : public ISimpleManagedSurface diff --git a/common/include/WorkgroupDataAccessors.hlsl b/common/include/nbl/examples/workgroup/DataAccessors.hlsl similarity index 96% rename from common/include/WorkgroupDataAccessors.hlsl rename to common/include/nbl/examples/workgroup/DataAccessors.hlsl index 7287a4135..f94121ec0 100644 --- a/common/include/WorkgroupDataAccessors.hlsl +++ b/common/include/nbl/examples/workgroup/DataAccessors.hlsl @@ -1,12 +1,18 @@ -#ifndef _WORKGROUP_DATA_ACCESSORS_HLSL_ -#define _WORKGROUP_DATA_ACCESSORS_HLSL_ +#ifndef _NBL_EXAMPLES_WORKGROUP_DATA_ACCESSORS_HLSL_ +#define _NBL_EXAMPLES_WORKGROUP_DATA_ACCESSORS_HLSL_ + #include "nbl/builtin/hlsl/bda/legacy_bda_accessor.hlsl" + namespace nbl { namespace hlsl { +namespace examples +{ +namespace workgroup +{ struct ScratchProxy { @@ -120,5 +126,6 @@ struct PreloadedDataProxy } } - +} +} #endif From 846d2fda842a15014d38047f457ed96362927eed Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 19 Jun 2025 01:34:31 +0200 Subject: [PATCH 269/296] prep UI example for further work --- 61_UI/include/common.hpp | 28 +++++++++++----------------- 61_UI/main.cpp | 10 +++------- 2 files changed, 14 insertions(+), 24 deletions(-) diff --git a/61_UI/include/common.hpp 
b/61_UI/include/common.hpp index a5def7551..fe7d086dd 100644 --- a/61_UI/include/common.hpp +++ b/61_UI/include/common.hpp @@ -1,25 +1,19 @@ -#ifndef __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__ -#define __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__ +#ifndef _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_ +#define _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_ -#include -// common api -#include "CCamera.hpp" -#include "SimpleWindowedApplication.hpp" -#include "CEventCallback.hpp" +#include "nbl/examples/examples.hpp" // the example's headers #include "transform.hpp" -#include "CGeomtryCreatorScene.hpp" using namespace nbl; -using namespace core; -using namespace hlsl; -using namespace system; -using namespace asset; -using namespace ui; -using namespace video; -using namespace scene; -using namespace geometrycreator; +using namespace nbl::core; +using namespace nbl::hlsl; +using namespace nbl::system; +using namespace nbl::asset; +using namespace nbl::ui; +using namespace nbl::video; +using namespace nbl::examples; -#endif // __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__ \ No newline at end of file +#endif // _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_ \ No newline at end of file diff --git a/61_UI/main.cpp b/61_UI/main.cpp index 470d5e723..17d028f29 100644 --- a/61_UI/main.cpp +++ b/61_UI/main.cpp @@ -14,14 +14,11 @@ handle scene's object translations. 
*/ -class UISampleApp final : public examples::SimpleWindowedApplication +class UISampleApp final : public SimpleWindowedApplication { - using device_base_t = examples::SimpleWindowedApplication; - using clock_t = std::chrono::steady_clock; + using device_base_t = SimpleWindowedApplication; - _NBL_STATIC_INLINE_CONSTEXPR uint32_t WIN_W = 1280, WIN_H = 720; - - constexpr static inline clock_t::duration DisplayImageDuration = std::chrono::milliseconds(900); + _NBL_STATIC_INLINE_CONSTEXPR uint32_t WIN_W = 1280, WIN_H = 720; public: inline UISampleApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) @@ -63,7 +60,6 @@ class UISampleApp final : public examples::SimpleWindowedApplication return false; m_assetManager = make_smart_refctd_ptr(smart_refctd_ptr(m_system)); - auto* geometry = m_assetManager->getGeometryCreator(); m_semaphore = m_device->createSemaphore(m_realFrameIx); if (!m_semaphore) From a3475622a9bcea1751dd50c8d652136cfc38faa7 Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 19 Jun 2025 02:19:47 +0200 Subject: [PATCH 270/296] note down embedding TODOs for arek --- 09_GeometryCreator/main.cpp | 6 +++ .../geometry/CSimpleDebugRenderer.hpp | 15 +++--- .../nbl/examples/geometry/SPushConstants.hlsl | 5 ++ .../examples/geometry/shaders/unified.hlsl | 52 +++++++++++++++++++ 4 files changed, 70 insertions(+), 8 deletions(-) create mode 100644 common/src/nbl/examples/geometry/shaders/unified.hlsl diff --git a/09_GeometryCreator/main.cpp b/09_GeometryCreator/main.cpp index 4c982e8f8..a78c385ee 100644 --- a/09_GeometryCreator/main.cpp +++ b/09_GeometryCreator/main.cpp @@ -4,6 +4,8 @@ #include "common.hpp" +// TODO: Arek, we should have a `nbl::examples` class inheriting from `application_templates::MonoAssetManagerAndBuiltinResourceApplication` which +// during `onAppInitialized` also mounts correct `common/include/nbl/examples` and `common/src/nbl/examples` as folder or builtin class GeometryCreatorApp 
final : public MonoWindowApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication { using device_base_t = MonoWindowApplication; @@ -55,6 +57,10 @@ class GeometryCreatorApp final : public MonoWindowApplication, public applicatio .addtionalBufferOwnershipFamilies = addtionalBufferOwnershipFamilies },patch ); + + // TODO: this is plain wrong Arek + auto commonArchive = make_smart_refctd_ptr(localInputCWD/"app_resources",smart_refctd_ptr(m_logger),m_system.get()); + m_system->mount(make_smart_refctd_ptr(localInputCWD/"../common/src/nbl/examples",smart_refctd_ptr(m_logger),m_system.get()),"nbl/examples"); auto scRes = static_cast(m_surface->getSwapchainResources()); m_renderer = CSimpleDebugRenderer::create(m_assetMgr.get(),scRes->getRenderpass(),0,m_scene.get()); diff --git a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp index bd190c082..7db627050 100644 --- a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp +++ b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp @@ -21,8 +21,6 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted using namespace nbl::asset; \ using namespace nbl::video public: - // - constexpr static inline auto DescriptorCount = 255; // struct SViewParams { @@ -85,7 +83,7 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted { EXPOSE_NABLA_NAMESPACES; - if (!!renderpass) + if (!renderpass) return nullptr; auto device = const_cast(renderpass->getOriginDevice()); auto logger = device->getLogger(); @@ -100,9 +98,10 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted smart_refctd_ptr shader; { const auto bundle = assMan->getAsset("nbl/examples/geometry/shaders/unified.hlsl",{}); +// TODO: Arek //const auto bundle = assMan->getAsset("nbl/examples/geometry/shaders/unified.spv",{}); const auto contents = bundle.getContents(); - if (bundle.getAssetType()!=IAsset::ET_SHADER || 
contents.empty()) + if (contents.empty() || bundle.getAssetType()!=IAsset::ET_SHADER) return nullptr; shader = IAsset::castDown(contents[0]); if (!shader) @@ -124,7 +123,7 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted // some geometries may not have particular attributes .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_PARTIALLY_BOUND_BIT, .stageFlags = IShader::E_SHADER_STAGE::ESS_VERTEX|IShader::E_SHADER_STAGE::ESS_FRAGMENT, - .count = DescriptorCount + .count = SInstance::SPushConstants::DescriptorCount } }; dsLayout = device->createDescriptorSetLayout(bindings); @@ -164,9 +163,9 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted smart_refctd_ptr pipelines[PipelineType::Count] = {}; { IGPUGraphicsPipeline::SCreationParams params[PipelineType::Count] = {}; - params[PipelineType::BasicTriangleList].vertexShader = {.shader=shader.get(),.entryPoint="BasicTriangleListVS"}; + params[PipelineType::BasicTriangleList].vertexShader = {.shader=shader.get(),.entryPoint="BasicVS"}; params[PipelineType::BasicTriangleList].fragmentShader = {.shader=shader.get(),.entryPoint="BasicFS"}; - params[PipelineType::BasicTriangleFan].vertexShader = {.shader=shader.get(),.entryPoint="BasicTriangleFanVS"}; + params[PipelineType::BasicTriangleFan].vertexShader = {.shader=shader.get(),.entryPoint="BasicVS"}; params[PipelineType::BasicTriangleFan].fragmentShader = {.shader=shader.get(),.entryPoint="BasicFS"}; params[PipelineType::Cone].vertexShader = {.shader=shader.get(),.entryPoint="ConeVS"}; params[PipelineType::Cone].fragmentShader = {.shader=shader.get(),.entryPoint="ConeFS"}; @@ -206,7 +205,7 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted auto allocateUTB = [device,&infos](const IGeometry::SDataView& view)->uint8_t { if (!view) - return DescriptorCount; + return SInstance::SPushConstants::DescriptorCount; const auto retval = infos.size(); infos.emplace_back().desc = device->createBufferView(view.src, 
view.composed.format); return retval; diff --git a/common/include/nbl/examples/geometry/SPushConstants.hlsl b/common/include/nbl/examples/geometry/SPushConstants.hlsl index 2048f1f3f..932210d0d 100644 --- a/common/include/nbl/examples/geometry/SPushConstants.hlsl +++ b/common/include/nbl/examples/geometry/SPushConstants.hlsl @@ -22,6 +22,11 @@ struct SInstanceMatrices struct SPushConstants { + // no idea if DXC still has this bug with Push Constant static variables +#ifndef __HLSL_VERSiON + NBL_CONSTEXPR_STATIC_INLINE uint32_t DescriptorCount = 255; +#endif + SInstanceMatrices matrices; uint32_t positionView : 11; uint32_t normalView : 10; diff --git a/common/src/nbl/examples/geometry/shaders/unified.hlsl b/common/src/nbl/examples/geometry/shaders/unified.hlsl new file mode 100644 index 000000000..1c24ee870 --- /dev/null +++ b/common/src/nbl/examples/geometry/shaders/unified.hlsl @@ -0,0 +1,52 @@ +// +#include "nbl/examples/geometry/SPushConstants.hlsl" +using namespace nbl::hlsl; +using namespace nbl::hlsl::examples::geometry_creator_scene; + +// for dat sweet programmable pulling +[[vk::binding(0)]] Buffer utbs[/*SPushConstants::DescriptorCount*/255]; + +// +[[vk::push_constant]] SPushConstants pc; + +// +struct SInterpolants +{ + float32_t4 position : SV_Position; + float32_t3 color : COLOR0; +}; +#include "nbl/builtin/hlsl/math/linalg/fast_affine.hlsl" + +// +SInterpolants BasicVS() +{ + const float32_t3 position = utbs[pc.positionView].xyz; + + SInterpolants output; + output.position = math::linalg::promoted_mul(pc.matrices.worldViewProj,position); + output.color = mul(pc.matrices.normalMat,utbs[pc.normalView].xyz)*0.5+promote(0.5f); + return output; +} +float32_t4 BasicFS(SInterpolants input) : SV_Target0 +{ + return float32_t4(input.color,1.f); +} + +// TODO: do smooth normals on the cone +SInterpolants ConeVS() +{ + const float32_t3 position = utbs[pc.positionView].xyz; + + SInterpolants output; + output.position = 
math::linalg::promoted_mul(pc.matrices.worldViewProj,position); + output.color = mul(inverse(transpose(pc.matrices.normalMat)),position); + return output; +} +float32_t4 ConeFS(SInterpolants input) : SV_Target0 +{ + const float32_t2x3 dViewPos_dScreen = float32_t2x3( + ddx(input.color), + ddy(input.color) + ); + return float32_t4(normalize(cross(X,Y))*0.5f+promote(0.5f),1.f); +} \ No newline at end of file From cf4e27959c5e58c562df3f3602a1b8a77b0d4dc7 Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 19 Jun 2025 02:19:58 +0200 Subject: [PATCH 271/296] remove unused shaders --- .../geometry/shaders/gc.basic.fragment.hlsl | 6 ----- .../geometry/shaders/gc.basic.vertex.hlsl | 6 ----- .../geometry/shaders/gc.cone.vertex.hlsl | 6 ----- .../geometry/shaders/gc.ico.vertex.hlsl | 6 ----- .../geometry/shaders/grid.fragment.hlsl | 12 ---------- .../template/gc.basic.vertex.input.hlsl | 12 ---------- .../geometry/shaders/template/gc.common.hlsl | 22 ------------------- .../template/gc.cone.vertex.input.hlsl | 12 ---------- .../shaders/template/gc.ico.vertex.input.hlsl | 11 ---------- .../geometry/shaders/template/gc.vertex.hlsl | 15 ------------- 10 files changed, 108 deletions(-) delete mode 100644 common/src/nbl/examples/geometry/shaders/gc.basic.fragment.hlsl delete mode 100644 common/src/nbl/examples/geometry/shaders/gc.basic.vertex.hlsl delete mode 100644 common/src/nbl/examples/geometry/shaders/gc.cone.vertex.hlsl delete mode 100644 common/src/nbl/examples/geometry/shaders/gc.ico.vertex.hlsl delete mode 100644 common/src/nbl/examples/geometry/shaders/grid.fragment.hlsl delete mode 100644 common/src/nbl/examples/geometry/shaders/template/gc.basic.vertex.input.hlsl delete mode 100644 common/src/nbl/examples/geometry/shaders/template/gc.common.hlsl delete mode 100644 common/src/nbl/examples/geometry/shaders/template/gc.cone.vertex.input.hlsl delete mode 100644 common/src/nbl/examples/geometry/shaders/template/gc.ico.vertex.input.hlsl delete mode 100644 
common/src/nbl/examples/geometry/shaders/template/gc.vertex.hlsl diff --git a/common/src/nbl/examples/geometry/shaders/gc.basic.fragment.hlsl b/common/src/nbl/examples/geometry/shaders/gc.basic.fragment.hlsl deleted file mode 100644 index 3dc9b9f1d..000000000 --- a/common/src/nbl/examples/geometry/shaders/gc.basic.fragment.hlsl +++ /dev/null @@ -1,6 +0,0 @@ -#include "template/gc.common.hlsl" - -float4 PSMain(PSInput input) : SV_Target0 -{ - return input.color; -} \ No newline at end of file diff --git a/common/src/nbl/examples/geometry/shaders/gc.basic.vertex.hlsl b/common/src/nbl/examples/geometry/shaders/gc.basic.vertex.hlsl deleted file mode 100644 index 1afd468d9..000000000 --- a/common/src/nbl/examples/geometry/shaders/gc.basic.vertex.hlsl +++ /dev/null @@ -1,6 +0,0 @@ -#include "template/gc.basic.vertex.input.hlsl" -#include "template/gc.vertex.hlsl" - -/* - do not remove this text, WAVE is so bad that you can get errors if no proper ending xD -*/ diff --git a/common/src/nbl/examples/geometry/shaders/gc.cone.vertex.hlsl b/common/src/nbl/examples/geometry/shaders/gc.cone.vertex.hlsl deleted file mode 100644 index ee0c42431..000000000 --- a/common/src/nbl/examples/geometry/shaders/gc.cone.vertex.hlsl +++ /dev/null @@ -1,6 +0,0 @@ -#include "template/gc.cone.vertex.input.hlsl" -#include "template/gc.vertex.hlsl" - -/* - do not remove this text, WAVE is so bad that you can get errors if no proper ending xD -*/ diff --git a/common/src/nbl/examples/geometry/shaders/gc.ico.vertex.hlsl b/common/src/nbl/examples/geometry/shaders/gc.ico.vertex.hlsl deleted file mode 100644 index d63fdc809..000000000 --- a/common/src/nbl/examples/geometry/shaders/gc.ico.vertex.hlsl +++ /dev/null @@ -1,6 +0,0 @@ -#include "template/gc.ico.vertex.input.hlsl" -#include "template/gc.vertex.hlsl" - -/* - do not remove this text, WAVE is so bad that you can get errors if no proper ending xD -*/ diff --git a/common/src/nbl/examples/geometry/shaders/grid.fragment.hlsl 
b/common/src/nbl/examples/geometry/shaders/grid.fragment.hlsl deleted file mode 100644 index 4b4c1e691..000000000 --- a/common/src/nbl/examples/geometry/shaders/grid.fragment.hlsl +++ /dev/null @@ -1,12 +0,0 @@ -#include "template/grid.common.hlsl" - -float4 PSMain(PSInput input) : SV_Target0 -{ - float2 uv = (input.uv - float2(0.5, 0.5)) + 0.5 / 30.0; - float grid = gridTextureGradBox(uv, ddx(input.uv), ddy(input.uv)); - float4 fragColor = float4(1.0 - grid, 1.0 - grid, 1.0 - grid, 1.0); - fragColor *= 0.25; - fragColor *= 0.3 + 0.6 * smoothstep(0.0, 0.1, 1.0 - length(input.uv) / 5.5); - - return fragColor; -} \ No newline at end of file diff --git a/common/src/nbl/examples/geometry/shaders/template/gc.basic.vertex.input.hlsl b/common/src/nbl/examples/geometry/shaders/template/gc.basic.vertex.input.hlsl deleted file mode 100644 index 862d4508e..000000000 --- a/common/src/nbl/examples/geometry/shaders/template/gc.basic.vertex.input.hlsl +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef _NBL_EXAMPLES_GC_BASIC_VERTEX_INPUT_HLSL_ -#define _NBL_EXAMPLES_GC_BASIC_VERTEX_INPUT_HLSL_ - -[[vk::binding(0)]] Buffer position; -[[vk::binding(1)]] Buffer normal; -[[vk::binding(2)]] Buffer uv; -[[vk::binding(3)]] Buffer color; - -#endif // _NBL_EXAMPLES_GC_BASIC_VERTEX_INPUT_HLSL_ -/* - do not remove this text, WAVE is so bad that you can get errors if no proper ending xD -*/ diff --git a/common/src/nbl/examples/geometry/shaders/template/gc.common.hlsl b/common/src/nbl/examples/geometry/shaders/template/gc.common.hlsl deleted file mode 100644 index ff40fb3c8..000000000 --- a/common/src/nbl/examples/geometry/shaders/template/gc.common.hlsl +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef _NBL_EXAMPLES_GC_COMMON_HLSL_ -#define _NBL_EXAMPLES_GC_COMMON_HLSL_ - - -#include "common/SBasicViewParameters.hlsl" - -#ifdef __HLSL_VERSION -[[vk::push_constant]] SBasicViewParameters params; - -struct PSInput -{ - float4 position : SV_Position; - float3 color : COLOR0; -}; -#endif // __HLSL_VERSION - - -#endif 
// _NBL_EXAMPLES_GC_COMMON_HLSL_ - -/* - do not remove this text, WAVE is so bad that you can get errors if no proper ending xD -*/ \ No newline at end of file diff --git a/common/src/nbl/examples/geometry/shaders/template/gc.cone.vertex.input.hlsl b/common/src/nbl/examples/geometry/shaders/template/gc.cone.vertex.input.hlsl deleted file mode 100644 index 7c40f54ab..000000000 --- a/common/src/nbl/examples/geometry/shaders/template/gc.cone.vertex.input.hlsl +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef _NBL_EXAMPLES_GEOMETRY_CONE_VERTEX_INPUT_HLSL_ -#define _NBL_EXAMPLES_GEOMETRY_CONE_VERTEX_INPUT_HLSL_ - -[[vk::binding(0)]] Buffer position; -[[vk::binding(1)]] Buffer normal; -[[vk::binding(2)]] Buffer color; - -#endif // _NBL_EXAMPLES_GEOMETRY_CONE_VERTEX_INPUT_HLSL_ - -/* - do not remove this text, WAVE is so bad that you can get errors if no proper ending xD -*/ diff --git a/common/src/nbl/examples/geometry/shaders/template/gc.ico.vertex.input.hlsl b/common/src/nbl/examples/geometry/shaders/template/gc.ico.vertex.input.hlsl deleted file mode 100644 index 67092ccf0..000000000 --- a/common/src/nbl/examples/geometry/shaders/template/gc.ico.vertex.input.hlsl +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef _NBL_EXAMPLES_GEOMETRY_ICO_VERTEX_INPUT_HLSL_ -#define _NBL_EXAMPLES_GEOMETRY_ICO_VERTEX_INPUT_HLSL_ - -[[vk::binding(0)]] Buffer position; -[[vk::binding(1)]] Buffer normal; -[[vk::binding(2)]] Buffer uv; - -#endif // _NBL_EXAMPLES_GEOMETRY_ICO_VERTEX_INPUT_HLSL_ -/* - do not remove this text, WAVE is so bad that you can get errors if no proper ending xD -*/ diff --git a/common/src/nbl/examples/geometry/shaders/template/gc.vertex.hlsl b/common/src/nbl/examples/geometry/shaders/template/gc.vertex.hlsl deleted file mode 100644 index e878bf7d7..000000000 --- a/common/src/nbl/examples/geometry/shaders/template/gc.vertex.hlsl +++ /dev/null @@ -1,15 +0,0 @@ -#include "gc.common.hlsl" - -PSInput VSMain() -{ - PSInput output; - - output.position = mul(params.MVP, float4(input.position, 
1.0)); - output.color = float4(input.normal * 0.5 + 0.5, 1.0); - - return output; -} - -/* - do not remove this text, WAVE is so bad that you can get errors if no proper ending xD -*/ From 08c28efbe827edf85c564894c86d591841a938ed Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 19 Jun 2025 02:37:23 +0200 Subject: [PATCH 272/296] get the example to find shaders and create pipelines, but nothing on screen --- 09_GeometryCreator/main.cpp | 7 ++--- .../geometry/CSimpleDebugRenderer.hpp | 1 + .../examples/geometry/shaders/unified.hlsl | 27 +++++++++++-------- 3 files changed, 21 insertions(+), 14 deletions(-) diff --git a/09_GeometryCreator/main.cpp b/09_GeometryCreator/main.cpp index a78c385ee..422cc7285 100644 --- a/09_GeometryCreator/main.cpp +++ b/09_GeometryCreator/main.cpp @@ -13,7 +13,8 @@ class GeometryCreatorApp final : public MonoWindowApplication, public applicatio public: GeometryCreatorApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) - : device_base_t({1280,720}, EF_D16_UNORM, _localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} + : IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD), + device_base_t({1280,720}, EF_D16_UNORM, _localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} SPhysicalDeviceFeatures getRequiredDeviceFeatures() const override { @@ -59,7 +60,7 @@ class GeometryCreatorApp final : public MonoWindowApplication, public applicatio ); // TODO: this is plain wrong Arek - auto commonArchive = make_smart_refctd_ptr(localInputCWD/"app_resources",smart_refctd_ptr(m_logger),m_system.get()); + m_system->mount(make_smart_refctd_ptr(localInputCWD/"../common/include/nbl/examples",smart_refctd_ptr(m_logger),m_system.get()),"nbl/examples"); m_system->mount(make_smart_refctd_ptr(localInputCWD/"../common/src/nbl/examples",smart_refctd_ptr(m_logger),m_system.get()),"nbl/examples"); auto scRes = 
static_cast(m_surface->getSwapchainResources()); @@ -259,7 +260,7 @@ class GeometryCreatorApp final : public MonoWindowApplication, public applicatio if (ev.type==nbl::ui::SMouseEvent::EET_SCROLL && m_renderer) { gcIndex += int16_t(core::sign(ev.scrollEvent.verticalScroll)); - gcIndex = core::clamp(gcIndex,0ull,m_renderer->getInitParams().geoms.size()); + gcIndex = core::clamp(gcIndex,0ull,m_renderer->getInitParams().geoms.size()-1); } } } diff --git a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp index 7db627050..4308425a2 100644 --- a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp +++ b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp @@ -104,6 +104,7 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted if (contents.empty() || bundle.getAssetType()!=IAsset::ET_SHADER) return nullptr; shader = IAsset::castDown(contents[0]); + shader = device->compileShader({.source=shader.get()}); if (!shader) return nullptr; } diff --git a/common/src/nbl/examples/geometry/shaders/unified.hlsl b/common/src/nbl/examples/geometry/shaders/unified.hlsl index 1c24ee870..bc6b6e13a 100644 --- a/common/src/nbl/examples/geometry/shaders/unified.hlsl +++ b/common/src/nbl/examples/geometry/shaders/unified.hlsl @@ -13,40 +13,45 @@ using namespace nbl::hlsl::examples::geometry_creator_scene; struct SInterpolants { float32_t4 position : SV_Position; - float32_t3 color : COLOR0; + float32_t3 meta : COLOR0; }; #include "nbl/builtin/hlsl/math/linalg/fast_affine.hlsl" // -SInterpolants BasicVS() +[shader("vertex")] +SInterpolants BasicVS(uint32_t VertexIndex : SV_VertexID) { - const float32_t3 position = utbs[pc.positionView].xyz; + const float32_t3 position = utbs[pc.positionView][VertexIndex].xyz; SInterpolants output; output.position = math::linalg::promoted_mul(pc.matrices.worldViewProj,position); - output.color = 
mul(pc.matrices.normalMat,utbs[pc.normalView].xyz)*0.5+promote(0.5f); + output.meta = mul(pc.matrices.normal,utbs[pc.normalView][VertexIndex].xyz); return output; } +[shader("pixel")] float32_t4 BasicFS(SInterpolants input) : SV_Target0 { - return float32_t4(input.color,1.f); + return float32_t4(normalize(input.meta)*0.5f+promote(0.5f),1.f); } // TODO: do smooth normals on the cone -SInterpolants ConeVS() +[shader("vertex")] +SInterpolants ConeVS(uint32_t VertexIndex : SV_VertexID) { - const float32_t3 position = utbs[pc.positionView].xyz; + const float32_t3 position = utbs[pc.positionView][VertexIndex].xyz; SInterpolants output; output.position = math::linalg::promoted_mul(pc.matrices.worldViewProj,position); - output.color = mul(inverse(transpose(pc.matrices.normalMat)),position); + output.meta = mul(inverse(transpose(pc.matrices.normal)),position); return output; } +[shader("pixel")] float32_t4 ConeFS(SInterpolants input) : SV_Target0 { const float32_t2x3 dViewPos_dScreen = float32_t2x3( - ddx(input.color), - ddy(input.color) + ddx(input.meta), + ddy(input.meta) ); - return float32_t4(normalize(cross(X,Y))*0.5f+promote(0.5f),1.f); + const float32_t3 normal = cross(dViewPos_dScreen[0],dViewPos_dScreen[1]); + return float32_t4(normalize(normal)*0.5f+promote(0.5f),1.f); } \ No newline at end of file From 17b0579da795261b5f2c20b4f040e4bbe674ca10 Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 19 Jun 2025 02:47:37 +0200 Subject: [PATCH 273/296] scene was empty, no wonder nothing drew --- 09_GeometryCreator/main.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/09_GeometryCreator/main.cpp b/09_GeometryCreator/main.cpp index 422cc7285..c087eba07 100644 --- a/09_GeometryCreator/main.cpp +++ b/09_GeometryCreator/main.cpp @@ -67,6 +67,12 @@ class GeometryCreatorApp final : public MonoWindowApplication, public applicatio m_renderer = CSimpleDebugRenderer::create(m_assetMgr.get(),scRes->getRenderpass(),0,m_scene.get()); if (!m_renderer) return logFail("Could 
not create Renderer!"); + m_renderer->m_instances.resize(1); + m_renderer->m_instances[0].world = float32_t3x4( + float32_t4(1,0,0,0), + float32_t4(0,1,0,0), + float32_t4(0,0,1,0) + ); // camera { @@ -147,6 +153,9 @@ class GeometryCreatorApp final : public MonoWindowApplication, public applicatio memcpy(&viewProjMatrix,camera.getConcatenatedMatrix().pointer(),sizeof(viewMatrix)); } const auto viewParams = CSimpleDebugRenderer::SViewParams(viewMatrix,viewProjMatrix); + + // tear down scene every frame + m_renderer->m_instances[0].packedGeo = m_renderer->getInitParams().geoms.data()+gcIndex; m_renderer->render(cb,viewParams); cb->endRenderPass(); From 62f1a2684f6bc21f5c7f6bd84ed552f248f64657 Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 19 Jun 2025 02:58:03 +0200 Subject: [PATCH 274/296] index type was not being set --- .../nbl/examples/geometry/CGeometryCreatorScene.hpp | 2 +- .../nbl/examples/geometry/CSimpleDebugRenderer.hpp | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp index 8a73f2e14..63b3d7a8d 100644 --- a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp +++ b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp @@ -167,7 +167,7 @@ class CGeometryCreatorScene : public core::IReferenceCounted // struct SNamedGeometry { - std::string_view name = {}; + std::string name = {}; core::smart_refctd_ptr geom; }; std::span getGeometries() const {return m_geometries;} diff --git a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp index 4308425a2..474f1d350 100644 --- a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp +++ b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp @@ -235,6 +235,18 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted { out.indexBuffer.offset = 
view.src.offset; out.indexBuffer.buffer = view.src.buffer; + switch (view.composed.format) + { + case E_FORMAT::EF_R16_UINT: + out.indexType = EIT_16BIT; + break; + case E_FORMAT::EF_R32_UINT: + out.indexType = EIT_32BIT; + break; + default: + assert(false); + return nullptr; + } } out.elementCount = geom->getVertexReferenceCount(); out.positionView = allocateUTB(geom->getPositionView()); From fcae0b438805e33c2884f75ea28aaa16e273e11f Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 19 Jun 2025 19:14:49 +0700 Subject: [PATCH 275/296] Initial commit for example 71 to use ICPUPolygonGeometry --- 71_RayTracingPipeline/include/common.hpp | 62 ++++++++++ 71_RayTracingPipeline/main.cpp | 145 ++++++----------------- 2 files changed, 101 insertions(+), 106 deletions(-) diff --git a/71_RayTracingPipeline/include/common.hpp b/71_RayTracingPipeline/include/common.hpp index c60e0c3e5..184d424c7 100644 --- a/71_RayTracingPipeline/include/common.hpp +++ b/71_RayTracingPipeline/include/common.hpp @@ -19,4 +19,66 @@ using namespace nbl::examples; #include "app_resources/common.hlsl" +namespace nbl::scene +{ + +enum ObjectType : uint8_t +{ + OT_CUBE, + OT_SPHERE, + OT_CYLINDER, + OT_RECTANGLE, + OT_DISK, + OT_ARROW, + OT_CONE, + OT_ICOSPHERE, + + OT_COUNT, + OT_UNKNOWN = std::numeric_limits::max() +}; + +static constexpr uint32_t s_smoothNormals[OT_COUNT] = { 0, 1, 1, 0, 0, 1, 1, 1 }; + +struct ObjectMeta +{ + ObjectType type = OT_UNKNOWN; + std::string_view name = "Unknown"; +}; + +struct ObjectDrawHookCpu +{ + nbl::core::matrix3x4SIMD model; + ObjectMeta meta; +}; + +struct ReferenceObjectCpu +{ + ObjectMeta meta; + core::smart_refctd_ptr data; + Material material; + core::matrix3x4SIMD transform; +}; + +struct ReferenceObjectGpu +{ + struct Bindings + { + nbl::asset::SBufferBinding vertex, index; + }; + + ObjectMeta meta; + Bindings bindings; + uint32_t vertexStride; + nbl::asset::E_INDEX_TYPE indexType = nbl::asset::E_INDEX_TYPE::EIT_UNKNOWN; + uint32_t indexCount = {}; + 
MaterialPacked material; + core::matrix3x4SIMD transform; + + const bool useIndex() const + { + return bindings.index.buffer && (indexType != E_INDEX_TYPE::EIT_UNKNOWN); + } +}; +} + #endif // __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__ diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index 453e9cf69..382e5cccb 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -1109,25 +1109,23 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public app // triangles geometries auto geometryCreator = make_smart_refctd_ptr(); -#if 1 - return false; -#else + const auto cpuObjects = std::array{ - ReferenceObjectCpu { - .meta = {.type = OT_RECTANGLE, .name = "Plane Mesh"}, - .data = gc->createRectangleMesh(nbl::core::vector2df_SIMD(10, 10)), + scene::ReferenceObjectCpu { + .meta = {.type = scene::OT_RECTANGLE, .name = "Plane Mesh"}, + .data = geometryCreator->createRectangle({10, 10}), .material = defaultMaterial, .transform = planeTransform, }, - ReferenceObjectCpu { - .meta = {.type = OT_CUBE, .name = "Cube Mesh"}, - .data = gc->createCubeMesh(nbl::core::vector3df(1, 1, 1)), + scene::ReferenceObjectCpu { + .meta = {.type = scene::OT_CUBE, .name = "Cube Mesh"}, + .data = geometryCreator->createCube({1, 1, 1}), .material = defaultMaterial, .transform = getTranslationMatrix(0, 0.5f, 0), }, - ReferenceObjectCpu { - .meta = {.type = OT_CUBE, .name = "Cube Mesh 2"}, - .data = gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)), + scene::ReferenceObjectCpu { + .meta = {.type = scene::OT_CUBE, .name = "Cube Mesh 2"}, + .data = geometryCreator->createCube({1.5, 1.5, 1.5}), .material = Material{ .ambient = {0.1, 0.1, 0.2}, .diffuse = {0.2, 0.2, 0.8}, @@ -1137,9 +1135,9 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public app }, .transform = getTranslationMatrix(-5.0f, 1.0f, 0), }, - ReferenceObjectCpu { - .meta = {.type = OT_CUBE, .name = "Transparent Cube Mesh"}, - .data = 
gc->createCubeMesh(nbl::core::vector3df(1.5, 1.5, 1.5)), + scene::ReferenceObjectCpu { + .meta = {.type = scene::OT_CUBE, .name = "Transparent Cube Mesh"}, + .data = geometryCreator->createCube({1.5, 1.5, 1.5}), .material = Material{ .ambient = {0.1, 0.2, 0.1}, .diffuse = {0.2, 0.8, 0.2}, @@ -1151,40 +1149,6 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public app }, }; - struct CPUTriBufferBindings - { - nbl::asset::SBufferBinding vertex, index; - }; - std::array cpuTriBuffers; - - for (uint32_t i = 0; i < cpuObjects.size(); i++) - { - const auto& cpuObject = cpuObjects[i]; - - auto vBuffer = smart_refctd_ptr(cpuObject.data.bindings[0].buffer); // no offset - auto vUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | - IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - vBuffer->addUsageFlags(vUsage); - vBuffer->setContentHash(vBuffer->computeContentHash()); - - auto iBuffer = smart_refctd_ptr(cpuObject.data.indexBuffer.buffer); // no offset - auto iUsage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | - IGPUBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - - if (cpuObject.data.indexType != EIT_UNKNOWN) - if (iBuffer) - { - iBuffer->addUsageFlags(iUsage); - iBuffer->setContentHash(iBuffer->computeContentHash()); - } - - cpuTriBuffers[i] = { - .vertex = {.offset = 0, .buffer = vBuffer}, - .index = {.offset = 0, .buffer = iBuffer}, - }; - - } - // procedural geometries using Aabb = IGPUBottomLevelAccelerationStructure::AABB_t; @@ -1233,10 +1197,10 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public app const auto blasCount = std::size(cpuObjects) + 1; const auto proceduralBlasIdx = std::size(cpuObjects); - std::array, 
std::size(cpuObjects)+1u> cpuBlas; + std::array, std::size(cpuObjects)+1u> cpuBlasList; for (uint32_t i = 0; i < blasCount; i++) { - auto& blas = cpuBlas[i]; + auto& blas = cpuBlasList[i]; blas = make_smart_refctd_ptr(); if (i == proceduralBlasIdx) @@ -1256,30 +1220,15 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public app } else { - auto triangles = make_refctd_dynamic_array>>(1u); + auto triangles = make_refctd_dynamic_array>>(cpuObjects[i].data->exportForBLAS()); auto primitiveCounts = make_refctd_dynamic_array>(1u); auto& tri = triangles->front(); - auto& primCount = primitiveCounts->front(); - const auto& geom = cpuObjects[i]; - const auto& cpuBuf = cpuTriBuffers[i]; - const bool useIndex = geom.data.indexType != EIT_UNKNOWN; - const uint32_t vertexStride = geom.data.inputParams.bindings[0].stride; - const uint32_t numVertices = cpuBuf.vertex.buffer->getSize() / vertexStride; + auto& primCount = primitiveCounts->front(); + primCount = cpuObjects[i].data->getPrimitiveCount(); - if (useIndex) - primCount = geom.data.indexCount / 3; - else - primCount = numVertices / 3; - - tri.vertexData[0] = cpuBuf.vertex; - tri.indexData = useIndex ? cpuBuf.index : cpuBuf.vertex; - tri.maxVertex = numVertices - 1; - tri.vertexStride = vertexStride; - tri.vertexFormat = EF_R32G32B32_SFLOAT; - tri.indexType = geom.data.indexType; - tri.geometryFlags = geom.material.isTransparent() ? + tri.geometryFlags = cpuObjects[i].material.isTransparent() ? 
IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::NO_DUPLICATE_ANY_HIT_INVOCATION_BIT : IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS::OPAQUE_BIT; @@ -1305,7 +1254,7 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public app { const auto isProceduralInstance = i == proceduralBlasIdx; ICPUTopLevelAccelerationStructure::StaticInstance inst; - inst.base.blas = cpuBlas[i]; + inst.base.blas = cpuBlasList[i]; inst.base.flags = static_cast(IGPUTopLevelAccelerationStructure::INSTANCE_FLAGS::TRIANGLE_FACING_CULL_DISABLE_BIT); inst.base.instanceCustomIndex = i; inst.base.instanceShaderBindingTableRecordOffset = isProceduralInstance ? 2 : 0;; @@ -1356,18 +1305,19 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public app inputs.allocator = &myalloc; std::array tmpTlas; - std::array tmpBuffers; + std::array tmpGeometries; + std::array tmpBuffers; { tmpTlas[0] = cpuTlas.get(); + tmpBuffers[0] = cpuProcBuffer.get(); for (uint32_t i = 0; i < cpuObjects.size(); i++) { - tmpBuffers[2 * i + 0] = cpuTriBuffers[i].vertex.buffer.get(); - tmpBuffers[2 * i + 1] = cpuTriBuffers[i].index.buffer.get(); + tmpGeometries[i] = cpuObjects[i].data.get(); } - tmpBuffers[2 * proceduralBlasIdx] = cpuProcBuffer.get(); std::get>(inputs.assets) = tmpTlas; std::get>(inputs.assets) = tmpBuffers; + std::get>(inputs.assets) = tmpGeometries; } auto reservation = converter->reserve(inputs); @@ -1475,37 +1425,24 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public app auto&& tlases = reservation.getGPUObjects(); m_gpuTlas = tlases[0].value; auto&& buffers = reservation.getGPUObjects(); - for (uint32_t i = 0; i < cpuObjects.size(); i++) - { - auto& cpuObject = cpuObjects[i]; - - m_gpuTriangleGeometries.push_back(ReferenceObjectGpu{ - .meta = cpuObject.meta, - .bindings = { - .vertex = {.offset = 0, .buffer = buffers[2 * i + 0].value }, - .index = {.offset = 0, .buffer = buffers[2 * i + 1].value }, - }, - .vertexStride = 
cpuObject.data.inputParams.bindings[0].stride, - .indexType = cpuObject.data.indexType, - .indexCount = cpuObject.data.indexCount, - .material = hlsl::_static_cast(cpuObject.material), - .transform = cpuObject.transform, - }); - } + m_proceduralAabbBuffer = buffers[2 * proceduralBlasIdx].value; - for (uint32_t i = 0; i < m_gpuTriangleGeometries.size(); i++) + for (uint32_t i = 0; i < cpuObjects.size(); i++) { - const auto& gpuObject = m_gpuTriangleGeometries[i]; - const uint64_t vertexBufferAddress = gpuObject.bindings.vertex.buffer->getDeviceAddress(); + const auto& cpuObject = cpuObjects[i]; + const auto& cpuBlas = cpuBlasList[i]; + const auto& geometry = cpuBlas->getTriangleGeometries()[0]; + const uint64_t vertexBufferAddress = buffers[2 * i].value->getDeviceAddress(); + const uint64_t indexBufferAddress = buffers[(2 * i) + 1].value->getDeviceAddress(); geomInfos[i] = { - .material = gpuObject.material, + .material = hlsl::_static_cast(cpuObject.material), .vertexBufferAddress = vertexBufferAddress, - .indexBufferAddress = gpuObject.useIndex() ? gpuObject.bindings.index.buffer->getDeviceAddress() : vertexBufferAddress, - .vertexStride = gpuObject.vertexStride, - .objType = gpuObject.meta.type, - .indexType = gpuObject.indexType, - .smoothNormals = s_smoothNormals[gpuObject.meta.type], + .indexBufferAddress = geometry.indexData.buffer ? 
indexBufferAddress : vertexBufferAddress, + .vertexStride = geometry.vertexStride, + .objType = cpuObject.meta.type, + .indexType = geometry.indexType, + .smoothNormals = scene::s_smoothNormals[cpuObject.meta.type], }; } } @@ -1516,12 +1453,10 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public app params.size = geomInfoBuffer->getSize(); m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = queue }, std::move(params), geomInfos).move_into(m_triangleGeomInfoBuffer); } -#endif + return true; } - - smart_refctd_ptr m_window; smart_refctd_ptr> m_surface; smart_refctd_ptr m_semaphore; @@ -1570,8 +1505,6 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public app } m_ui; core::smart_refctd_ptr m_guiDescriptorSetPool; - // TODO: how much of this do we actually have to keep ? -// core::vector m_gpuTriangleGeometries; core::vector m_gpuIntersectionSpheres; uint32_t m_intersectionHitGroupIdx; From e419580318a44baaf7f2050bd988200f1ab00f08 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 19 Jun 2025 23:13:28 +0200 Subject: [PATCH 276/296] docs docs docs, adjust to comments and test on 01 after updates --- 01_HelloCoreSystemAsset/main.cpp | 2 +- CMakeLists.txt | 32 +++++++++----- common/CMakeLists.txt | 43 ++++++++++++++++++- common/include/nbl/examples/PCH.hpp | 18 ++++++-- common/include/nbl/examples/api.hpp | 24 ----------- common/include/nbl/examples/examples.hpp | 10 ++++- common/src/nbl/examples/CMakeLists.txt | 14 +----- .../src/nbl/examples/cameras/CMakeLists.txt | 7 --- 8 files changed, 90 insertions(+), 60 deletions(-) delete mode 100644 common/include/nbl/examples/api.hpp delete mode 100644 common/src/nbl/examples/cameras/CMakeLists.txt diff --git a/01_HelloCoreSystemAsset/main.cpp b/01_HelloCoreSystemAsset/main.cpp index 96e4a0d4e..7ca4badb4 100644 --- a/01_HelloCoreSystemAsset/main.cpp +++ b/01_HelloCoreSystemAsset/main.cpp @@ -3,7 +3,7 @@ // For conditions of distribution and 
use, see copyright notice in nabla.h // public interface and common examples API, always include first before std:: headers -#include "nbl/examples/api.hpp" +#include "nbl/examples/examples.hpp" #include "nbl/system/IApplicationFramework.h" diff --git a/CMakeLists.txt b/CMakeLists.txt index 3a168b061..41ed86b52 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,6 +9,17 @@ if(NBL_BUILD_EXAMPLES) nbl_android_create_media_storage_apk() endif() + #! Common api library & precompiled headers for Nabla framework examples + add_subdirectory(common EXCLUDE_FROM_ALL) + + #! use "EXCLUDE_FROM_ALL" to exclude an example from the NablaExamples project + #[[ + useful if we don't want the example to be tested by CI but still want + the example's project to be generated + + https://cmake.org/cmake/help/latest/prop_tgt/EXCLUDE_FROM_ALL.html + ]] + # showcase the use of `nbl::core`,`nbl::system` and `nbl::asset` add_subdirectory(01_HelloCoreSystemAsset) # showcase the use of `system::IApplicationFramework` and `nbl::video` @@ -31,7 +42,6 @@ if(NBL_BUILD_EXAMPLES) # showcase use of FFT for post-FX Bloom effect add_subdirectory(11_FFT) - # Waiting for a refactor #add_subdirectory(27_PLYSTLDemo) #add_subdirectory(33_Draw3DLine) @@ -42,7 +52,7 @@ if(NBL_BUILD_EXAMPLES) add_subdirectory(22_CppCompat) add_subdirectory(23_Arithmetic2UnitTest) add_subdirectory(24_ColorSpaceTest) - add_subdirectory(25_FilterTest) + add_subdirectory(25_FilterTest EXCLUDE_FROM_ALL) add_subdirectory(26_Blur) add_subdirectory(27_MPMCScheduler) add_subdirectory(28_FFTBloom) @@ -58,27 +68,29 @@ if(NBL_BUILD_EXAMPLES) # endif() #add_subdirectory(43_SumAndCDFFilters) - add_subdirectory(47_DerivMapTest) - add_subdirectory(54_Transformations) - add_subdirectory(55_RGB18E7S3) + add_subdirectory(47_DerivMapTest EXCLUDE_FROM_ALL) + add_subdirectory(54_Transformations EXCLUDE_FROM_ALL) + add_subdirectory(55_RGB18E7S3 EXCLUDE_FROM_ALL) add_subdirectory(61_UI) add_subdirectory(62_CAD) - 
add_subdirectory(62_SchusslerTest) + add_subdirectory(62_SchusslerTest EXCLUDE_FROM_ALL) add_subdirectory(64_EmulatedFloatTest) - add_subdirectory(0_ImportanceSamplingEnvMaps) #TODO: integrate back into 42 + add_subdirectory(0_ImportanceSamplingEnvMaps EXCLUDE_FROM_ALL) #TODO: integrate back into 42 - add_subdirectory(66_HLSLBxDFTests) + add_subdirectory(66_HLSLBxDFTests EXCLUDE_FROM_ALL) add_subdirectory(67_RayQueryGeometry) add_subdirectory(68_JpegLoading) add_subdirectory(70_FLIPFluids) add_subdirectory(71_RayTracingPipeline) + # add new examples *before* NBL_GET_ALL_TARGETS invocation, it gathers recursively all targets created so far in this subdirectory NBL_GET_ALL_TARGETS(TARGETS) - # PCH & CommonAPI library for Nabla framework examples - add_subdirectory(common EXCLUDE_FROM_ALL) + # we want to loop only over the examples so we exclude examples' interface libraries created in common subdirectory + list(REMOVE_ITEM TARGETS ${NBL_EXAMPLES_API_TARGET} ${NBL_EXAMPLES_API_LIBRARIES}) + # we link common example api library and force examples to reuse its PCH foreach(T IN LISTS TARGETS) target_link_libraries(${T} PUBLIC ${NBL_EXAMPLES_API_TARGET}) target_include_directories(${T} PUBLIC $) diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 3cdcce82d..3a55e7a26 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -1,7 +1,48 @@ +#! 
Examples API proxy library +#[[ + We create the Nabla Examples API as a static library extension, this + allows all examples to reuse a single precompiled header (PCH) + instead of generating their own + + The PCH includes Nabla.h + example common interface headers and takes + around 1 GB per configuration, so sharing it avoids significant disk space waste +]] + nbl_create_ext_library_project(ExamplesAPI "" "${CMAKE_CURRENT_SOURCE_DIR}/src/nbl/examples/pch.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/include" "" "") set_target_properties(${LIB_NAME} PROPERTIES DISABLE_PRECOMPILE_HEADERS OFF) target_precompile_headers(${LIB_NAME} PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include/nbl/examples/PCH.hpp") +#! Examples API common libraries +#[[ + The rule is to avoid creating additional libraries as part of the examples' common + interface in order to prevent generating another precompiled header (PCH) and wasting disk space + + If you have new utilities that could be shared across examples then try to implement them as header only + and include in the PCH or in `examples.h` *if you cannot* (open the header to see details) + + but If you have a good reason to create library because you cannot make it header only + AND you *can REUSE* the examples' PCH then go ahead anyway and put it under `src/nbl/examples`, + otherwise keep it header only - a good example would be to use our embedded-whatever-you-want tool + which does create library but can reuse example's PCH +]] + +#! NOTE: as I write it we don't have any targets there yet add_subdirectory("src/nbl/examples" EXCLUDE_FROM_ALL) -set(NBL_EXAMPLES_API_TARGET ${LIB_NAME} PARENT_SCOPE) \ No newline at end of file + +NBL_GET_ALL_TARGETS(TARGETS) +list(REMOVE_ITEM TARGETS ${LIB_NAME}) + +# the Examples API proxy library CMake target name +#[[ + this one gets linked to each executable automatically +]] +set(NBL_EXAMPLES_API_TARGET ${LIB_NAME} PARENT_SCOPE) + +#! 
names of CMake targets created in src/nbl/examples +#[[ + if your example wants to use anything from src/nbl/examples + then you must target_link_libraries() the lib you want as we + don't link all those libraries to each executable automatically +]] +set(NBL_EXAMPLES_API_LIBRARIES ${TARGETS} PARENT_SCOPE) \ No newline at end of file diff --git a/common/include/nbl/examples/PCH.hpp b/common/include/nbl/examples/PCH.hpp index 671f8b331..5316ce2e8 100644 --- a/common/include/nbl/examples/PCH.hpp +++ b/common/include/nbl/examples/PCH.hpp @@ -4,17 +4,29 @@ #ifndef _NBL_EXAMPLES_PCH_HPP_ #define _NBL_EXAMPLES_PCH_HPP_ -//! public Nabla declarations +//! Precompiled header (PCH) for Nabla Examples /* NOTE: currently our whole public and private interface is broken and private headers leak to public includes */ + +//! Nabla declarations #include "nabla.h" -//! common example headers +//! Common example interface headers // why isnt this in `nabla.h` ? -#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" +/* + because it does stuff like + + #ifdef NBL_EMBED_BUILTIN_RESOURCES + #include "nbl/this_example/builtin/CArchive.h" + #endif + + hence also cannot be there in PCH but rather in examples.h -> compile errors + but only *if* we decide each example handles builtins on NBL_EMBED_BUILTIN_RESOURCES +*/ +// #include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" #include "nbl/examples/common/SimpleWindowedApplication.hpp" #include "nbl/examples/common/MonoWindowApplication.hpp" diff --git a/common/include/nbl/examples/api.hpp b/common/include/nbl/examples/api.hpp deleted file mode 100644 index 9b809b8ea..000000000 --- a/common/include/nbl/examples/api.hpp +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". 
-// For conditions of distribution and use, see copyright notice in nabla.h -#ifndef _NBL_EXAMPLES_API_HPP_ -#define _NBL_EXAMPLES_API_HPP_ - -//! PCH for examples -/* - PCH is compiled only once *if* an example can be promoted to use it, it is - when its compile options & definitions set is the same as nblExamplesAPI's - each example links to, otherwise it compiles its own PCH -*/ -#include "nbl/examples/PCH.hpp" - -//! common headers used across examples which cannot be part of PCH -/* - NOTE: put here if a header requires defines which may be differ -*/ - -// broken? probably to refactor or even remove? -// #include "nbl/examples/geometry/CGeometryCreatorScene.hpp" - - -#endif // _NBL_EXAMPLES_API_HPP_ \ No newline at end of file diff --git a/common/include/nbl/examples/examples.hpp b/common/include/nbl/examples/examples.hpp index a7d8f92e4..985a3960a 100644 --- a/common/include/nbl/examples/examples.hpp +++ b/common/include/nbl/examples/examples.hpp @@ -4,8 +4,16 @@ #ifndef _NBL_EXAMPLES_HPP_ #define _NBL_EXAMPLES_HPP_ - +//! Precompiled header shared across all examples #include "nbl/examples/PCH.hpp" +//! Example specific headers that must not be included in the PCH +/* + NOTE: Add here if they depend on preprocessor definitions + or macros that are specific to individual example targets + (eg. defined in CMake) +*/ + +// #include "..." 
#endif // _NBL_EXAMPLES_HPP_ \ No newline at end of file diff --git a/common/src/nbl/examples/CMakeLists.txt b/common/src/nbl/examples/CMakeLists.txt index 65c312582..a95372eea 100644 --- a/common/src/nbl/examples/CMakeLists.txt +++ b/common/src/nbl/examples/CMakeLists.txt @@ -1,16 +1,4 @@ # TODO builtin SPIR-V shaders # add_subdirectory(geometry EXCLUDE_FROM_ALL) -# TODO: slightly redo and make docs once I get n4ce embed SPIRV tool to build system - -# we get all available targets inclusive & below this directory -# NBL_GET_ALL_TARGETS(NBL_SUBDIRECTORY_TARGETS) - -# then we expose common include search directories to all common libraries + create link interface -# foreach(NBL_TARGET IN LISTS NBL_SUBDIRECTORY_TARGETS) -# target_include_directories(${NBL_TARGET} PUBLIC $) -# target_link_libraries(nblExamplesAPI INTERFACE ${NBL_TARGET}) -#endforeach() - -# -# set(NBL_COMMON_API_TARGETS ${NBL_SUBDIRECTORY_TARGETS} PARENT_SCOPE) \ No newline at end of file +# TODO: make docs once I get n4ce embed SPIRV tool to build system and then use the tool with Matts new shader \ No newline at end of file diff --git a/common/src/nbl/examples/cameras/CMakeLists.txt b/common/src/nbl/examples/cameras/CMakeLists.txt deleted file mode 100644 index 0b0e59cdc..000000000 --- a/common/src/nbl/examples/cameras/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -# header only currently - -#set(NBL_EXAMPLES_CAMERA_LIB_SOURCES -# "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp" -#) - -#nbl_create_ext_library_project(ExampleCameras "" "${NBL_EXAMPLES_CAMERA_LIB_SOURCES}" "" "" "") \ No newline at end of file From cfd609c3f01819abd504416d2b52ae72191c15b1 Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 20 Jun 2025 16:54:44 +0200 Subject: [PATCH 277/296] matrix math in linalg was fine, I made a typo --- 09_GeometryCreator/main.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/09_GeometryCreator/main.cpp b/09_GeometryCreator/main.cpp index c087eba07..000bec369 100644 --- 
a/09_GeometryCreator/main.cpp +++ b/09_GeometryCreator/main.cpp @@ -150,13 +150,13 @@ class GeometryCreatorApp final : public MonoWindowApplication, public applicatio // TODO: get rid of legacy matrices { memcpy(&viewMatrix,camera.getViewMatrix().pointer(),sizeof(viewMatrix)); - memcpy(&viewProjMatrix,camera.getConcatenatedMatrix().pointer(),sizeof(viewMatrix)); + memcpy(&viewProjMatrix,camera.getConcatenatedMatrix().pointer(),sizeof(viewProjMatrix)); } const auto viewParams = CSimpleDebugRenderer::SViewParams(viewMatrix,viewProjMatrix); // tear down scene every frame m_renderer->m_instances[0].packedGeo = m_renderer->getInitParams().geoms.data()+gcIndex; - m_renderer->render(cb,viewParams); + m_renderer->render(cb,viewParams); cb->endRenderPass(); cb->end(); From 6242357bb8b98793da13885bdae4586c009e6984 Mon Sep 17 00:00:00 2001 From: devsh Date: Sat, 21 Jun 2025 00:00:47 +0200 Subject: [PATCH 278/296] fix a caption bug --- 09_GeometryCreator/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/09_GeometryCreator/main.cpp b/09_GeometryCreator/main.cpp index 000bec369..6fddd8282 100644 --- a/09_GeometryCreator/main.cpp +++ b/09_GeometryCreator/main.cpp @@ -195,7 +195,7 @@ class GeometryCreatorApp final : public MonoWindowApplication, public applicatio std::string caption = "[Nabla Engine] Geometry Creator"; { - caption += ", displaying [" + + caption += ", displaying ["; caption += m_scene->getGeometries()[gcIndex].name; caption += "]"; m_window->setCaption(caption); From 19167c57e6fb80b762263513b10873c85990d09e Mon Sep 17 00:00:00 2001 From: devsh Date: Sat, 21 Jun 2025 23:36:08 +0200 Subject: [PATCH 279/296] leave pointers on how to port example 62_CAD, also `EXCLUDE_FROM_ALL` the examples which are WIP on this branch --- 62_CAD/main.cpp | 24 ++++++++++++------------ CMakeLists.txt | 8 ++++---- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp index 637c88eda..f873914e2 100644 --- 
a/62_CAD/main.cpp +++ b/62_CAD/main.cpp @@ -1,17 +1,17 @@ - -using namespace nbl::hlsl; -using namespace nbl; -using namespace core; -using namespace system; -using namespace asset; -using namespace ui; -using namespace video; +// TODO: Copyright notice + +#include "nbl/examples/examples.hpp" + +using namespace nbl; +using namespace nbl::core; +using namespace nbl::hlsl; +using namespace nbl::system; +using namespace nbl::asset; +using namespace nbl::ui; +using namespace nbl::video; +// TODO: probably need to be `using namespace nbl::examples` as well, see other examples -#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" -#include "SimpleWindowedApplication.hpp" -#include "InputSystem.hpp" -#include "nbl/video/utilities/CSimpleResizeSurface.h" #include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h" #include "nbl/ext/TextRendering/TextRendering.h" diff --git a/CMakeLists.txt b/CMakeLists.txt index 41ed86b52..aa3880762 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -71,18 +71,18 @@ if(NBL_BUILD_EXAMPLES) add_subdirectory(47_DerivMapTest EXCLUDE_FROM_ALL) add_subdirectory(54_Transformations EXCLUDE_FROM_ALL) add_subdirectory(55_RGB18E7S3 EXCLUDE_FROM_ALL) - add_subdirectory(61_UI) - add_subdirectory(62_CAD) + add_subdirectory(61_UI EXCLUDE_FROM_ALL) # TODO: resurrect before `mesh_loaders` merge + add_subdirectory(62_CAD EXCLUDE_FROM_ALL) # TODO: Erfan, Przemek, Francisco and co. 
need to resurrect this add_subdirectory(62_SchusslerTest EXCLUDE_FROM_ALL) add_subdirectory(64_EmulatedFloatTest) add_subdirectory(0_ImportanceSamplingEnvMaps EXCLUDE_FROM_ALL) #TODO: integrate back into 42 add_subdirectory(66_HLSLBxDFTests EXCLUDE_FROM_ALL) - add_subdirectory(67_RayQueryGeometry) + add_subdirectory(67_RayQueryGeometry EXCLUDE_FROM_ALL) # TODO: resurrect before `mesh_loaders` merge add_subdirectory(68_JpegLoading) add_subdirectory(70_FLIPFluids) - add_subdirectory(71_RayTracingPipeline) + add_subdirectory(71_RayTracingPipeline EXCLUDE_FROM_ALL) # TODO: resurrect before `mesh_loaders` merge # add new examples *before* NBL_GET_ALL_TARGETS invocation, it gathers recursively all targets created so far in this subdirectory NBL_GET_ALL_TARGETS(TARGETS) From a0622f3c0a71fc8d2e9c4f0f426f3f5695dc89dd Mon Sep 17 00:00:00 2001 From: devsh Date: Sat, 21 Jun 2025 23:49:19 +0200 Subject: [PATCH 280/296] kill old meshloaders example that didn't work since forever --- 06_MeshLoaders/CMakeLists.txt | 6 - 06_MeshLoaders/config.json.template | 28 -- 06_MeshLoaders/main.cpp | 563 ---------------------------- 06_MeshLoaders/pipeline.groovy | 50 --- 4 files changed, 647 deletions(-) delete mode 100644 06_MeshLoaders/CMakeLists.txt delete mode 100644 06_MeshLoaders/config.json.template delete mode 100644 06_MeshLoaders/main.cpp delete mode 100644 06_MeshLoaders/pipeline.groovy diff --git a/06_MeshLoaders/CMakeLists.txt b/06_MeshLoaders/CMakeLists.txt deleted file mode 100644 index 2f9218f93..000000000 --- a/06_MeshLoaders/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -include(common RESULT_VARIABLE RES) -if(NOT RES) - message(FATAL_ERROR "common.cmake not found. 
Should be in {repo_root}/cmake directory") -endif() - -nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") \ No newline at end of file diff --git a/06_MeshLoaders/config.json.template b/06_MeshLoaders/config.json.template deleted file mode 100644 index f961745c1..000000000 --- a/06_MeshLoaders/config.json.template +++ /dev/null @@ -1,28 +0,0 @@ -{ - "enableParallelBuild": true, - "threadsPerBuildProcess" : 2, - "isExecuted": false, - "scriptPath": "", - "cmake": { - "configurations": [ "Release", "Debug", "RelWithDebInfo" ], - "buildModes": [], - "requiredOptions": [] - }, - "profiles": [ - { - "backend": "vulkan", - "platform": "windows", - "buildModes": [], - "runConfiguration": "Release", - "gpuArchitectures": [] - } - ], - "dependencies": [], - "data": [ - { - "dependencies": [], - "command": [""], - "outputs": [] - } - ] -} \ No newline at end of file diff --git a/06_MeshLoaders/main.cpp b/06_MeshLoaders/main.cpp deleted file mode 100644 index 75135c033..000000000 --- a/06_MeshLoaders/main.cpp +++ /dev/null @@ -1,563 +0,0 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". 
-// For conditions of distribution and use, see copyright notice in nabla.h - -#define _NBL_STATIC_LIB_ -#include -#include -#include - -#include "CCamera.hpp" -#include "../common/CommonAPI.h" -#include "nbl/ext/ScreenShot/ScreenShot.h" - -using namespace nbl; -using namespace core; -using namespace ui; -/* - Uncomment for more detailed logging -*/ - -// #define NBL_MORE_LOGS - -class MeshLoadersApp : public ApplicationBase -{ - constexpr static uint32_t WIN_W = 1280; - constexpr static uint32_t WIN_H = 720; - constexpr static uint32_t SC_IMG_COUNT = 3u; - constexpr static uint32_t FRAMES_IN_FLIGHT = 5u; - constexpr static uint64_t MAX_TIMEOUT = 99999999999999ull; - constexpr static size_t NBL_FRAMES_TO_AVERAGE = 100ull; - - static_assert(FRAMES_IN_FLIGHT > SC_IMG_COUNT); -public: - nbl::core::smart_refctd_ptr windowManager; - nbl::core::smart_refctd_ptr window; - nbl::core::smart_refctd_ptr windowCb; - nbl::core::smart_refctd_ptr apiConnection; - nbl::core::smart_refctd_ptr surface; - nbl::core::smart_refctd_ptr utilities; - nbl::core::smart_refctd_ptr logicalDevice; - nbl::video::IPhysicalDevice* physicalDevice; - std::array queues; - nbl::core::smart_refctd_ptr swapchain; - nbl::core::smart_refctd_ptr renderpass; - nbl::core::smart_refctd_dynamic_array> fbo; - std::array, CommonAPI::InitOutput::MaxFramesInFlight>, CommonAPI::InitOutput::MaxQueuesCount> commandPools; - nbl::core::smart_refctd_ptr system; - nbl::core::smart_refctd_ptr assetManager; - nbl::video::IGPUObjectFromAssetConverter::SParams cpu2gpuParams; - nbl::core::smart_refctd_ptr logger; - nbl::core::smart_refctd_ptr inputSystem; - - nbl::video::IGPUObjectFromAssetConverter cpu2gpu; - - video::IDeviceMemoryBacked::SDeviceMemoryRequirements ubomemreq; - core::smart_refctd_ptr gpuubo; - core::smart_refctd_ptr gpuds1; - - core::smart_refctd_ptr occlusionQueryPool; - core::smart_refctd_ptr timestampQueryPool; - - asset::ICPUMesh* meshRaw = nullptr; - const asset::COBJMetadata* metaOBJ = nullptr; - - 
core::smart_refctd_ptr frameComplete[FRAMES_IN_FLIGHT] = { nullptr }; - core::smart_refctd_ptr imageAcquire[FRAMES_IN_FLIGHT] = { nullptr }; - core::smart_refctd_ptr renderFinished[FRAMES_IN_FLIGHT] = { nullptr }; - core::smart_refctd_ptr commandBuffers[FRAMES_IN_FLIGHT]; - - CommonAPI::InputSystem::ChannelReader mouse; - CommonAPI::InputSystem::ChannelReader keyboard; - Camera camera = Camera(vectorSIMDf(0, 0, 0), vectorSIMDf(0, 0, 0), matrix4SIMD()); - - using RENDERPASS_INDEPENDENT_PIPELINE_ADRESS = size_t; - std::map> gpuPipelines; - core::smart_refctd_ptr gpumesh; - const asset::ICPUMeshBuffer* firstMeshBuffer; - const nbl::asset::COBJMetadata::CRenderpassIndependentPipeline* pipelineMetadata; - nbl::video::ISwapchain::SCreationParams m_swapchainCreationParams; - - uint32_t ds1UboBinding = 0; - int resourceIx; - uint32_t acquiredNextFBO = {}; - std::chrono::steady_clock::time_point lastTime; - bool frameDataFilled = false; - size_t frame_count = 0ull; - double time_sum = 0; - double dtList[NBL_FRAMES_TO_AVERAGE] = {}; - - video::CDumbPresentationOracle oracle; - - core::smart_refctd_ptr queryResultsBuffer; - - void setWindow(core::smart_refctd_ptr&& wnd) override - { - window = std::move(wnd); - } - void setSystem(core::smart_refctd_ptr&& s) override - { - system = std::move(s); - } - nbl::ui::IWindow* getWindow() override - { - return window.get(); - } - video::IAPIConnection* getAPIConnection() override - { - return apiConnection.get(); - } - video::ILogicalDevice* getLogicalDevice() override - { - return logicalDevice.get(); - } - video::IGPURenderpass* getRenderpass() override - { - return renderpass.get(); - } - void setSurface(core::smart_refctd_ptr&& s) override - { - surface = std::move(s); - } - void setFBOs(std::vector>& f) override - { - for (int i = 0; i < f.size(); i++) - { - fbo->begin()[i] = core::smart_refctd_ptr(f[i]); - } - } - void setSwapchain(core::smart_refctd_ptr&& s) override - { - swapchain = std::move(s); - } - uint32_t 
getSwapchainImageCount() override - { - return swapchain->getImageCount(); - } - virtual nbl::asset::E_FORMAT getDepthFormat() override - { - return nbl::asset::EF_D32_SFLOAT; - } - - void getAndLogQueryPoolResults() - { -#ifdef QUERY_POOL_LOGS - { - uint64_t samples_passed[4] = {}; - auto queryResultFlags = core::bitflag(video::IQueryPool::EQRF_WITH_AVAILABILITY_BIT) | video::IQueryPool::EQRF_64_BIT; - logicalDevice->getQueryPoolResults(occlusionQueryPool.get(), 0u, 2u, sizeof(samples_passed), samples_passed, sizeof(uint64_t) * 2, queryResultFlags); - logger->log("[AVAIL+64] SamplesPassed[0] = %d, SamplesPassed[1] = %d, Result Available = %d, %d", system::ILogger::ELL_INFO, samples_passed[0], samples_passed[2], samples_passed[1], samples_passed[3]); - } - { - uint64_t samples_passed[4] = {}; - auto queryResultFlags = core::bitflag(video::IQueryPool::EQRF_WITH_AVAILABILITY_BIT) | video::IQueryPool::EQRF_64_BIT | video::IQueryPool::EQRF_WAIT_BIT; - logicalDevice->getQueryPoolResults(occlusionQueryPool.get(), 0u, 2u, sizeof(samples_passed), samples_passed, sizeof(uint64_t) * 2, queryResultFlags); - logger->log("[WAIT+AVAIL+64] SamplesPassed[0] = %d, SamplesPassed[1] = %d, Result Available = %d, %d", system::ILogger::ELL_INFO, samples_passed[0], samples_passed[2], samples_passed[1], samples_passed[3]); - } - { - uint32_t samples_passed[2] = {}; - auto queryResultFlags = core::bitflag(video::IQueryPool::EQRF_WAIT_BIT); - logicalDevice->getQueryPoolResults(occlusionQueryPool.get(), 0u, 2u, sizeof(samples_passed), samples_passed, sizeof(uint32_t), queryResultFlags); - logger->log("[WAIT] SamplesPassed[0] = %d, SamplesPassed[1] = %d", system::ILogger::ELL_INFO, samples_passed[0], samples_passed[1]); - } - { - uint64_t timestamps[4] = {}; - auto queryResultFlags = core::bitflag(video::IQueryPool::EQRF_WAIT_BIT) | video::IQueryPool::EQRF_WITH_AVAILABILITY_BIT | video::IQueryPool::EQRF_64_BIT; - logicalDevice->getQueryPoolResults(timestampQueryPool.get(), 0u, 2u, 
sizeof(timestamps), timestamps, sizeof(uint64_t) * 2ull, queryResultFlags); - float timePassed = (timestamps[2] - timestamps[0]) * physicalDevice->getLimits().timestampPeriodInNanoSeconds; - logger->log("Time Passed (Seconds) = %f", system::ILogger::ELL_INFO, (timePassed * 1e-9)); - logger->log("Timestamps availablity: %d, %d", system::ILogger::ELL_INFO, timestamps[1], timestamps[3]); - } -#endif - } - - APP_CONSTRUCTOR(MeshLoadersApp) - void onAppInitialized_impl() override - { - const auto swapchainImageUsage = static_cast(asset::IImage::EUF_COLOR_ATTACHMENT_BIT | asset::IImage::EUF_TRANSFER_SRC_BIT); - CommonAPI::InitParams initParams; - initParams.window = core::smart_refctd_ptr(window); - initParams.apiType = video::EAT_VULKAN; - initParams.appName = { _NBL_APP_NAME_ }; - initParams.framesInFlight = FRAMES_IN_FLIGHT; - initParams.windowWidth = WIN_W; - initParams.windowHeight = WIN_H; - initParams.swapchainImageCount = SC_IMG_COUNT; - initParams.swapchainImageUsage = swapchainImageUsage; - initParams.depthFormat = nbl::asset::EF_D32_SFLOAT; - auto initOutput = CommonAPI::InitWithDefaultExt(std::move(initParams)); - - window = std::move(initParams.window); - windowCb = std::move(initParams.windowCb); - apiConnection = std::move(initOutput.apiConnection); - surface = std::move(initOutput.surface); - utilities = std::move(initOutput.utilities); - logicalDevice = std::move(initOutput.logicalDevice); - physicalDevice = initOutput.physicalDevice; - queues = std::move(initOutput.queues); - renderpass = std::move(initOutput.renderToSwapchainRenderpass); - commandPools = std::move(initOutput.commandPools); - system = std::move(initOutput.system); - assetManager = std::move(initOutput.assetManager); - cpu2gpuParams = std::move(initOutput.cpu2gpuParams); - logger = std::move(initOutput.logger); - inputSystem = std::move(initOutput.inputSystem); - m_swapchainCreationParams = std::move(initOutput.swapchainCreationParams); - - 
CommonAPI::createSwapchain(std::move(logicalDevice), m_swapchainCreationParams, WIN_W, WIN_H, swapchain); - assert(swapchain); - fbo = CommonAPI::createFBOWithSwapchainImages( - swapchain->getImageCount(), WIN_W, WIN_H, - logicalDevice, swapchain, renderpass, - nbl::asset::EF_D32_SFLOAT - ); - - // Occlusion Query - { - video::IQueryPool::SCreationParams queryPoolCreationParams = {}; - queryPoolCreationParams.queryType = video::IQueryPool::EQT_OCCLUSION; - queryPoolCreationParams.queryCount = 2u; - occlusionQueryPool = logicalDevice->createQueryPool(std::move(queryPoolCreationParams)); - } - - // Timestamp Query - video::IQueryPool::SCreationParams queryPoolCreationParams = {}; - { - video::IQueryPool::SCreationParams queryPoolCreationParams = {}; - queryPoolCreationParams.queryType = video::IQueryPool::EQT_TIMESTAMP; - queryPoolCreationParams.queryCount = 2u; - timestampQueryPool = logicalDevice->createQueryPool(std::move(queryPoolCreationParams)); - } - - { - // SAMPLES_PASSED_0 + AVAILABILIY_0 + SAMPLES_PASSED_1 + AVAILABILIY_1 (uint32_t) - const size_t queriesSize = sizeof(uint32_t) * 4; - video::IGPUBuffer::SCreationParams gpuuboCreationParams; - gpuuboCreationParams.size = queriesSize; - gpuuboCreationParams.usage = core::bitflag(asset::IBuffer::EUF_UNIFORM_BUFFER_BIT)|asset::IBuffer::EUF_TRANSFER_DST_BIT|asset::IBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF; - gpuuboCreationParams.queueFamilyIndexCount = 0u; - gpuuboCreationParams.queueFamilyIndices = nullptr; - - queryResultsBuffer = logicalDevice->createBuffer(std::move(gpuuboCreationParams)); - auto memReqs = queryResultsBuffer->getMemoryReqs(); - memReqs.memoryTypeBits &= physicalDevice->getDeviceLocalMemoryTypeBits(); - auto queriesMem = logicalDevice->allocate(memReqs, queryResultsBuffer.get()); - - queryResultsBuffer->setObjectDebugName("QueryResults"); - } - - nbl::video::IGPUObjectFromAssetConverter cpu2gpu; - { - auto* quantNormalCache = assetManager->getMeshManipulator()->getQuantNormalCache(); - 
quantNormalCache->loadCacheFromFile(system.get(), sharedOutputCWD / "normalCache101010.sse"); - - system::path archPath = sharedInputCWD / "sponza.zip"; - auto arch = system->openFileArchive(archPath); - // test no alias loading (TODO: fix loading from absolute paths) - system->mount(std::move(arch)); - asset::IAssetLoader::SAssetLoadParams loadParams; - loadParams.workingDirectory = sharedInputCWD; - loadParams.logger = logger.get(); - auto meshes_bundle = assetManager->getAsset((sharedInputCWD / "sponza.zip/sponza.obj").string(), loadParams); - assert(!meshes_bundle.getContents().empty()); - - metaOBJ = meshes_bundle.getMetadata()->selfCast(); - - auto cpuMesh = meshes_bundle.getContents().begin()[0]; - meshRaw = static_cast(cpuMesh.get()); - - quantNormalCache->saveCacheToFile(system.get(), sharedOutputCWD / "normalCache101010.sse"); - } - - // Fix FrontFace and BlendParams for meshBuffers - for (size_t i = 0ull; i < meshRaw->getMeshBuffers().size(); ++i) - { - auto& meshBuffer = meshRaw->getMeshBuffers().begin()[i]; - meshBuffer->getPipeline()->getRasterizationParams().frontFaceIsCCW = false; - } - - // we can safely assume that all meshbuffers within mesh loaded from OBJ has same DS1 layout (used for camera-specific data) - firstMeshBuffer = *meshRaw->getMeshBuffers().begin(); - pipelineMetadata = metaOBJ->getAssetSpecificMetadata(firstMeshBuffer->getPipeline()); - - // so we can create just one DS - const asset::ICPUDescriptorSetLayout* ds1layout = firstMeshBuffer->getPipeline()->getLayout()->getDescriptorSetLayout(1u); - ds1UboBinding = ds1layout->getDescriptorRedirect(asset::IDescriptor::E_TYPE::ET_UNIFORM_BUFFER).getBinding(asset::ICPUDescriptorSetLayout::CBindingRedirect::storage_range_index_t{ 0 }).data; - - size_t neededDS1UBOsz = 0ull; - { - for (const auto& shdrIn : pipelineMetadata->m_inputSemantics) - if (shdrIn.descriptorSection.type == asset::IRenderpassIndependentPipelineMetadata::ShaderInput::E_TYPE::ET_UNIFORM_BUFFER && 
shdrIn.descriptorSection.uniformBufferObject.set == 1u && shdrIn.descriptorSection.uniformBufferObject.binding == ds1UboBinding) - neededDS1UBOsz = std::max(neededDS1UBOsz, shdrIn.descriptorSection.uniformBufferObject.relByteoffset + shdrIn.descriptorSection.uniformBufferObject.bytesize); - } - - core::smart_refctd_ptr gpuds1layout; - { - auto gpu_array = cpu2gpu.getGPUObjectsFromAssets(&ds1layout, &ds1layout + 1, cpu2gpuParams); - if (!gpu_array || gpu_array->size() < 1u || !(*gpu_array)[0]) - assert(false); - - gpuds1layout = (*gpu_array)[0]; - } - - core::smart_refctd_ptr descriptorPool = nullptr; - { - video::IDescriptorPool::SCreateInfo createInfo = {}; - createInfo.maxSets = 1u; - createInfo.maxDescriptorCount[static_cast(asset::IDescriptor::E_TYPE::ET_UNIFORM_BUFFER)] = 1u; - descriptorPool = logicalDevice->createDescriptorPool(std::move(createInfo)); - } - - video::IGPUBuffer::SCreationParams gpuuboCreationParams; - gpuuboCreationParams.size = neededDS1UBOsz; - gpuuboCreationParams.usage = core::bitflag(asset::IBuffer::EUF_UNIFORM_BUFFER_BIT) | asset::IBuffer::EUF_TRANSFER_DST_BIT | asset::IBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF; - gpuuboCreationParams.queueFamilyIndexCount = 0u; - gpuuboCreationParams.queueFamilyIndices = nullptr; - - gpuubo = logicalDevice->createBuffer(std::move(gpuuboCreationParams)); - auto gpuuboMemReqs = gpuubo->getMemoryReqs(); - gpuuboMemReqs.memoryTypeBits &= physicalDevice->getDeviceLocalMemoryTypeBits(); - auto uboMemoryOffset = logicalDevice->allocate(gpuuboMemReqs, gpuubo.get(), video::IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE); - - gpuds1 = descriptorPool->createDescriptorSet(std::move(gpuds1layout)); - - { - video::IGPUDescriptorSet::SWriteDescriptorSet write; - write.dstSet = gpuds1.get(); - write.binding = ds1UboBinding; - write.count = 1u; - write.arrayElement = 0u; - write.descriptorType = asset::IDescriptor::E_TYPE::ET_UNIFORM_BUFFER; - video::IGPUDescriptorSet::SDescriptorInfo info; - { - info.desc = 
gpuubo; - info.info.buffer.offset = 0ull; - info.info.buffer.size = neededDS1UBOsz; - } - write.info = &info; - logicalDevice->updateDescriptorSets(1u, &write, 0u, nullptr); - } - { - cpu2gpuParams.beginCommandBuffers(); - - auto gpu_array = cpu2gpu.getGPUObjectsFromAssets(&meshRaw, &meshRaw + 1, cpu2gpuParams); - if (!gpu_array || gpu_array->size() < 1u || !(*gpu_array)[0]) - assert(false); - - cpu2gpuParams.waitForCreationToComplete(false); - - gpumesh = (*gpu_array)[0]; - } - - { - for (size_t i = 0; i < gpumesh->getMeshBuffers().size(); ++i) - { - auto gpuIndependentPipeline = gpumesh->getMeshBuffers().begin()[i]->getPipeline(); - - nbl::video::IGPUGraphicsPipeline::SCreationParams graphicsPipelineParams; - graphicsPipelineParams.renderpassIndependent = core::smart_refctd_ptr(const_cast(gpuIndependentPipeline)); - graphicsPipelineParams.renderpass = core::smart_refctd_ptr(renderpass); - - const RENDERPASS_INDEPENDENT_PIPELINE_ADRESS adress = reinterpret_cast(graphicsPipelineParams.renderpassIndependent.get()); - gpuPipelines[adress] = logicalDevice->createGraphicsPipeline(nullptr, std::move(graphicsPipelineParams)); - } - } - - core::vectorSIMDf cameraPosition(-250.0f,177.0f,1.69f); - core::vectorSIMDf cameraTarget(50.0f,125.0f,-3.0f); - matrix4SIMD projectionMatrix = matrix4SIMD::buildProjectionMatrixPerspectiveFovLH(core::radians(60.0f), video::ISurface::getTransformedAspectRatio(swapchain->getPreTransform(), WIN_W, WIN_H), 0.1, 10000); - camera = Camera(cameraPosition, cameraTarget, projectionMatrix, 10.f, 1.f); - lastTime = std::chrono::steady_clock::now(); - - for (size_t i = 0ull; i < NBL_FRAMES_TO_AVERAGE; ++i) - dtList[i] = 0.0; - - oracle.reportBeginFrameRecord(); - - - const auto& graphicsCommandPools = commandPools[CommonAPI::InitOutput::EQT_GRAPHICS]; - for (uint32_t i = 0u; i < FRAMES_IN_FLIGHT; i++) - { - logicalDevice->createCommandBuffers(graphicsCommandPools[i].get(), video::IGPUCommandBuffer::EL_PRIMARY, 1, commandBuffers+i); - imageAcquire[i] 
= logicalDevice->createSemaphore(); - renderFinished[i] = logicalDevice->createSemaphore(); - } - - constexpr uint64_t MAX_TIMEOUT = 99999999999999ull; - uint32_t acquiredNextFBO = {}; - resourceIx = -1; - } - void onAppTerminated_impl() override - { - const auto& fboCreationParams = fbo->begin()[acquiredNextFBO]->getCreationParameters(); - auto gpuSourceImageView = fboCreationParams.attachments[0]; - - bool status = ext::ScreenShot::createScreenShot( - logicalDevice.get(), - queues[CommonAPI::InitOutput::EQT_TRANSFER_DOWN], - renderFinished[resourceIx].get(), - gpuSourceImageView.get(), - assetManager.get(), - "ScreenShot.png", - asset::IImage::EL_PRESENT_SRC, - asset::EAF_NONE); - - assert(status); - logicalDevice->waitIdle(); - } - void workLoopBody() override - { - ++resourceIx; - if (resourceIx >= FRAMES_IN_FLIGHT) - resourceIx = 0; - - auto& commandBuffer = commandBuffers[resourceIx]; - auto& fence = frameComplete[resourceIx]; - if (fence) - logicalDevice->blockForFences(1u, &fence.get()); - else - fence = logicalDevice->createFence(static_cast(0)); - - commandBuffer->reset(nbl::video::IGPUCommandBuffer::ERF_RELEASE_RESOURCES_BIT); - commandBuffer->begin(nbl::video::IGPUCommandBuffer::EU_NONE); - - const auto nextPresentationTimestamp = oracle.acquireNextImage(swapchain.get(), imageAcquire[resourceIx].get(), nullptr, &acquiredNextFBO); - { - inputSystem->getDefaultMouse(&mouse); - inputSystem->getDefaultKeyboard(&keyboard); - - camera.beginInputProcessing(nextPresentationTimestamp); - mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void { camera.mouseProcess(events); }, logger.get()); - keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void { camera.keyboardProcess(events); }, logger.get()); - camera.endInputProcessing(nextPresentationTimestamp); - } - - const auto& viewMatrix = camera.getViewMatrix(); - const auto& viewProjectionMatrix = matrix4SIMD::concatenateBFollowedByAPrecisely( - 
video::ISurface::getSurfaceTransformationMatrix(swapchain->getPreTransform()), - camera.getConcatenatedMatrix() - ); - - asset::SViewport viewport; - viewport.minDepth = 1.f; - viewport.maxDepth = 0.f; - viewport.x = 0u; - viewport.y = 0u; - viewport.width = WIN_W; - viewport.height = WIN_H; - commandBuffer->setViewport(0u, 1u, &viewport); - - VkRect2D scissor = {}; - scissor.offset = { 0, 0 }; - scissor.extent = { WIN_W, WIN_H }; - commandBuffer->setScissor(0u, 1u, &scissor); - - core::matrix3x4SIMD modelMatrix; - modelMatrix.setTranslation(nbl::core::vectorSIMDf(0, 0, 0, 0)); - core::matrix4SIMD mvp = core::concatenateBFollowedByA(viewProjectionMatrix, modelMatrix); - - const size_t uboSize = gpuubo->getSize(); - core::vector uboData(uboSize); - for (const auto& shdrIn : pipelineMetadata->m_inputSemantics) - { - if (shdrIn.descriptorSection.type == asset::IRenderpassIndependentPipelineMetadata::ShaderInput::E_TYPE::ET_UNIFORM_BUFFER && shdrIn.descriptorSection.uniformBufferObject.set == 1u && shdrIn.descriptorSection.uniformBufferObject.binding == ds1UboBinding) - { - switch (shdrIn.type) - { - case asset::IRenderpassIndependentPipelineMetadata::ECSI_WORLD_VIEW_PROJ: - { - memcpy(uboData.data() + shdrIn.descriptorSection.uniformBufferObject.relByteoffset, mvp.pointer(), shdrIn.descriptorSection.uniformBufferObject.bytesize); - } break; - - case asset::IRenderpassIndependentPipelineMetadata::ECSI_WORLD_VIEW: - { - memcpy(uboData.data() + shdrIn.descriptorSection.uniformBufferObject.relByteoffset, viewMatrix.pointer(), shdrIn.descriptorSection.uniformBufferObject.bytesize); - } break; - - case asset::IRenderpassIndependentPipelineMetadata::ECSI_WORLD_VIEW_INVERSE_TRANSPOSE: - { - memcpy(uboData.data() + shdrIn.descriptorSection.uniformBufferObject.relByteoffset, viewMatrix.pointer(), shdrIn.descriptorSection.uniformBufferObject.bytesize); - } break; - } - } - } - commandBuffer->updateBuffer(gpuubo.get(), 0ull, uboSize, uboData.data()); - - 
nbl::video::IGPUCommandBuffer::SRenderpassBeginInfo beginInfo; - { - VkRect2D area; - area.offset = { 0,0 }; - area.extent = { WIN_W, WIN_H }; - asset::SClearValue clear[2] = {}; - clear[0].color.float32[0] = 1.f; - clear[0].color.float32[1] = 1.f; - clear[0].color.float32[2] = 1.f; - clear[0].color.float32[3] = 1.f; - clear[1].depthStencil.depth = 0.f; - - beginInfo.clearValueCount = 2u; - beginInfo.framebuffer = fbo->begin()[acquiredNextFBO]; - beginInfo.renderpass = renderpass; - beginInfo.renderArea = area; - beginInfo.clearValues = clear; - } - - commandBuffer->resetQueryPool(occlusionQueryPool.get(), 0u, 2u); - commandBuffer->resetQueryPool(timestampQueryPool.get(), 0u, 2u); - commandBuffer->beginRenderPass(&beginInfo, nbl::asset::ESC_INLINE); - - commandBuffer->writeTimestamp(asset::E_PIPELINE_STAGE_FLAGS::EPSF_TOP_OF_PIPE_BIT, timestampQueryPool.get(), 0u); - for (size_t i = 0; i < gpumesh->getMeshBuffers().size(); ++i) - { - if(i < 2) - commandBuffer->beginQuery(occlusionQueryPool.get(), i); - auto gpuMeshBuffer = gpumesh->getMeshBuffers().begin()[i]; - auto gpuGraphicsPipeline = gpuPipelines[reinterpret_cast(gpuMeshBuffer->getPipeline())]; - - const video::IGPURenderpassIndependentPipeline* gpuRenderpassIndependentPipeline = gpuMeshBuffer->getPipeline(); - const video::IGPUDescriptorSet* ds3 = gpuMeshBuffer->getAttachedDescriptorSet(); - - commandBuffer->bindGraphicsPipeline(gpuGraphicsPipeline.get()); - - const video::IGPUDescriptorSet* gpuds1_ptr = gpuds1.get(); - commandBuffer->bindDescriptorSets(asset::EPBP_GRAPHICS, gpuRenderpassIndependentPipeline->getLayout(), 1u, 1u, &gpuds1_ptr); - const video::IGPUDescriptorSet* gpuds3_ptr = gpuMeshBuffer->getAttachedDescriptorSet(); - if (gpuds3_ptr) - commandBuffer->bindDescriptorSets(asset::EPBP_GRAPHICS, gpuRenderpassIndependentPipeline->getLayout(), 3u, 1u, &gpuds3_ptr); - commandBuffer->pushConstants(gpuRenderpassIndependentPipeline->getLayout(), asset::IShader::ESS_FRAGMENT, 0u, 
gpuMeshBuffer->MAX_PUSH_CONSTANT_BYTESIZE, gpuMeshBuffer->getPushConstantsDataPtr()); - - commandBuffer->drawMeshBuffer(gpuMeshBuffer); - - if(i < 2) - commandBuffer->endQuery(occlusionQueryPool.get(), i); - } - commandBuffer->writeTimestamp(asset::E_PIPELINE_STAGE_FLAGS::EPSF_BOTTOM_OF_PIPE_BIT, timestampQueryPool.get(), 1u); - - commandBuffer->endRenderPass(); - - auto queryResultFlags = core::bitflag(video::IQueryPool::EQRF_WAIT_BIT) | video::IQueryPool::EQRF_WITH_AVAILABILITY_BIT; - commandBuffer->copyQueryPoolResults(occlusionQueryPool.get(), 0, 2, queryResultsBuffer.get(), 0u, sizeof(uint32_t) * 2, queryResultFlags); - - commandBuffer->end(); - - logicalDevice->resetFences(1, &fence.get()); - CommonAPI::Submit( - logicalDevice.get(), - commandBuffer.get(), - queues[CommonAPI::InitOutput::EQT_COMPUTE], - imageAcquire[resourceIx].get(), - renderFinished[resourceIx].get(), - fence.get()); - CommonAPI::Present(logicalDevice.get(), - swapchain.get(), - queues[CommonAPI::InitOutput::EQT_GRAPHICS], renderFinished[resourceIx].get(), acquiredNextFBO); - - getAndLogQueryPoolResults(); - } - bool keepRunning() override - { - return windowCb->isWindowOpen(); - } -}; - -NBL_COMMON_API_MAIN(MeshLoadersApp) diff --git a/06_MeshLoaders/pipeline.groovy b/06_MeshLoaders/pipeline.groovy deleted file mode 100644 index 0923d296f..000000000 --- a/06_MeshLoaders/pipeline.groovy +++ /dev/null @@ -1,50 +0,0 @@ -import org.DevshGraphicsProgramming.Agent -import org.DevshGraphicsProgramming.BuilderInfo -import org.DevshGraphicsProgramming.IBuilder - -class CMeshLoadersBuilder extends IBuilder -{ - public CMeshLoadersBuilder(Agent _agent, _info) - { - super(_agent, _info) - } - - @Override - public boolean prepare(Map axisMapping) - { - return true - } - - @Override - public boolean build(Map axisMapping) - { - IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION") - IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE") - - def nameOfBuildDirectory = 
getNameOfBuildDirectory(buildType) - def nameOfConfig = getNameOfConfig(config) - - agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v") - - return true - } - - @Override - public boolean test(Map axisMapping) - { - return true - } - - @Override - public boolean install(Map axisMapping) - { - return true - } -} - -def create(Agent _agent, _info) -{ - return new CMeshLoadersBuilder(_agent, _info) -} - -return this \ No newline at end of file From f494d859787936e0308e6661dffce9043361935e Mon Sep 17 00:00:00 2001 From: devsh Date: Sat, 21 Jun 2025 23:49:36 +0200 Subject: [PATCH 281/296] yes `NBL_EMBED_BUILTIN_RESOURCES` should control both Nabla and examples_tests w.r.t. the question of `MonoAssetManagerAndBuiltinResourceApplication.hpp` There really isn't a usecase for: - embedding Nabla resources but not embedding example (there's no harm in Nabla resources not being embedded) - even more strangely, not embedding Nabla resources, but embedding example. --- common/include/nbl/examples/PCH.hpp | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/common/include/nbl/examples/PCH.hpp b/common/include/nbl/examples/PCH.hpp index 5316ce2e8..3b1e6beaa 100644 --- a/common/include/nbl/examples/PCH.hpp +++ b/common/include/nbl/examples/PCH.hpp @@ -15,18 +15,8 @@ //! Common example interface headers -// why isnt this in `nabla.h` ? 
-/* - because it does stuff like - - #ifdef NBL_EMBED_BUILTIN_RESOURCES - #include "nbl/this_example/builtin/CArchive.h" - #endif - - hence also cannot be there in PCH but rather in examples.h -> compile errors - but only *if* we decide each example handles builtins on NBL_EMBED_BUILTIN_RESOURCES -*/ -// #include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" +// TODO: examine moving this header to `nbl/examples/common` +#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" #include "nbl/examples/common/SimpleWindowedApplication.hpp" #include "nbl/examples/common/MonoWindowApplication.hpp" From ce268fb50f481d94e3db8d5e3eee69ffa3e30af2 Mon Sep 17 00:00:00 2001 From: devsh Date: Sun, 22 Jun 2025 01:14:05 +0200 Subject: [PATCH 282/296] Adjust all examples after splitting `MonoAssetManagerAndBuiltinResourceApplication` in two --- 03_DeviceSelectionAndSharedSources/Testers.h | 3 +- 03_DeviceSelectionAndSharedSources/main.cpp | 17 +++-- .../main.cpp | 6 +- 06_HelloGraphicsQueue/main.cpp | 20 +++--- 07_StagingAndMultipleQueues/main.cpp | 15 +++-- 09_GeometryCreator/main.cpp | 12 ++-- 10_CountingSort/main.cpp | 15 +++-- 11_FFT/main.cpp | 21 +++---- 22_CppCompat/CIntrinsicsTester.h | 7 ++- 22_CppCompat/CTgmathTester.h | 7 ++- 22_CppCompat/ITester.h | 7 ++- 22_CppCompat/main.cpp | 22 +++---- 23_Arithmetic2UnitTest/main.cpp | 12 ++-- 24_ColorSpaceTest/main.cpp | 4 +- 26_Blur/main.cpp | 8 ++- 27_MPMCScheduler/main.cpp | 7 ++- 28_FFTBloom/main.cpp | 19 +++--- 29_Arithmetic2Bench/main.cpp | 4 +- 30_ComputeShaderPathTracer/main.cpp | 23 ++++--- 64_EmulatedFloatTest/main.cpp | 13 ++-- 68_JpegLoading/main.cpp | 22 ++++--- 70_FLIPFluids/main.cpp | 7 ++- common/include/nbl/examples/PCH.hpp | 4 +- .../common/BuiltinResourcesApplication.hpp | 63 +++++++++++++++++++ 24 files changed, 219 insertions(+), 119 deletions(-) create mode 100644 common/include/nbl/examples/common/BuiltinResourcesApplication.hpp diff --git 
a/03_DeviceSelectionAndSharedSources/Testers.h b/03_DeviceSelectionAndSharedSources/Testers.h index 9a4016d20..f957e50a0 100644 --- a/03_DeviceSelectionAndSharedSources/Testers.h +++ b/03_DeviceSelectionAndSharedSources/Testers.h @@ -4,8 +4,7 @@ #ifndef _NBL_TESTERS_H_INCLUDED_ #define _NBL_TESTERS_H_INCLUDED_ -#include "nbl/application_templates/MonoDeviceApplication.hpp" -#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" +#include "nbl/examples/examples.hpp" using namespace nbl; diff --git a/03_DeviceSelectionAndSharedSources/main.cpp b/03_DeviceSelectionAndSharedSources/main.cpp index 6c99aff7f..c09228ce5 100644 --- a/03_DeviceSelectionAndSharedSources/main.cpp +++ b/03_DeviceSelectionAndSharedSources/main.cpp @@ -2,15 +2,20 @@ // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h + #include "nbl/examples/examples.hpp" // TODO: why isn't this in `nabla.h` ? #include "nbl/asset/metadata/CHLSLMetadata.h" + using namespace nbl; -using namespace core; -using namespace system; -using namespace asset; -using namespace video; +using namespace nbl::core; +using namespace nbl::hlsl; +using namespace nbl::system; +using namespace nbl::asset; +using namespace nbl::ui; +using namespace nbl::video; +using namespace nbl::examples; // TODO[Przemek]: update comments @@ -21,10 +26,10 @@ using namespace video; constexpr bool ENABLE_TESTS = false; // This time we create the device in the base class and also use a base class to give us an Asset Manager and an already mounted built-in resource archive -class DeviceSelectionAndSharedSourcesApp final : public application_templates::MonoDeviceApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication +class DeviceSelectionAndSharedSourcesApp final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication { using device_base_t = application_templates::MonoDeviceApplication; - 
using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; + using asset_base_t = BuiltinResourcesApplication; public: // Yay thanks to multiple inheritance we cannot forward ctors anymore DeviceSelectionAndSharedSourcesApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : diff --git a/05_StreamingAndBufferDeviceAddressApp/main.cpp b/05_StreamingAndBufferDeviceAddressApp/main.cpp index f98e38f66..a648acefb 100644 --- a/05_StreamingAndBufferDeviceAddressApp/main.cpp +++ b/05_StreamingAndBufferDeviceAddressApp/main.cpp @@ -5,7 +5,7 @@ // I've moved out a tiny part of this example into a shared header for reuse, please open and read it. #include "nbl/application_templates/MonoDeviceApplication.hpp" -#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" +#include "nbl/examples/common/BuiltinResourcesApplication.hpp" using namespace nbl; @@ -20,10 +20,10 @@ using namespace video; // In this application we'll cover buffer streaming, Buffer Device Address (BDA) and push constants -class StreamingAndBufferDeviceAddressApp final : public application_templates::MonoDeviceApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication +class StreamingAndBufferDeviceAddressApp final : public application_templates::MonoDeviceApplication, public examples::BuiltinResourcesApplication { using device_base_t = application_templates::MonoDeviceApplication; - using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; + using asset_base_t = examples::BuiltinResourcesApplication; // This is the first example that submits multiple workloads in-flight. // What the shader does is it computes the minimum distance of each point against K other random input points. 
diff --git a/06_HelloGraphicsQueue/main.cpp b/06_HelloGraphicsQueue/main.cpp index dc2f3ebb4..07d6affd3 100644 --- a/06_HelloGraphicsQueue/main.cpp +++ b/06_HelloGraphicsQueue/main.cpp @@ -3,18 +3,20 @@ // For conditions of distribution and use, see copyright notice in nabla.h -// I've moved out a tiny part of this example into a shared header for reuse, please open and read it. -#include "nbl/application_templates/MonoDeviceApplication.hpp" -#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" +#include "nbl/examples/examples.hpp" #include "nbl/ext/ScreenShot/ScreenShot.h" using namespace nbl; -using namespace core; -using namespace system; -using namespace asset; -using namespace video; +using namespace nbl::core; +using namespace nbl::hlsl; +using namespace nbl::system; +using namespace nbl::asset; +using namespace nbl::ui; +using namespace nbl::video; +using namespace nbl::examples; + // Here we showcase the use of Graphics Queue only // Steps we take in this example: @@ -26,10 +28,10 @@ using namespace video; // - save the smallImg to disk // // all without using IUtilities. -class HelloGraphicsQueueApp final : public application_templates::MonoDeviceApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication +class HelloGraphicsQueueApp final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication { using device_base_t = application_templates::MonoDeviceApplication; - using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; + using asset_base_t = BuiltinResourcesApplication; public: // Yay thanks to multiple inheritance we cannot forward ctors anymore. 
diff --git a/07_StagingAndMultipleQueues/main.cpp b/07_StagingAndMultipleQueues/main.cpp index a1a06f4f4..fc6bf4551 100644 --- a/07_StagingAndMultipleQueues/main.cpp +++ b/07_StagingAndMultipleQueues/main.cpp @@ -6,18 +6,21 @@ #include "nbl/examples/examples.hpp" using namespace nbl; -using namespace core; -using namespace system; -using namespace asset; -using namespace video; +using namespace nbl::core; +using namespace nbl::hlsl; +using namespace nbl::system; +using namespace nbl::asset; +using namespace nbl::ui; +using namespace nbl::video; +using namespace nbl::examples; #include "app_resources/common.hlsl" // This time we let the new base class score and pick queue families, as well as initialize `nbl::video::IUtilities` for us -class StagingAndMultipleQueuesApp final : public application_templates::BasicMultiQueueApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication +class StagingAndMultipleQueuesApp final : public application_templates::BasicMultiQueueApplication, public BuiltinResourcesApplication { using device_base_t = application_templates::BasicMultiQueueApplication; - using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; + using asset_base_t = BuiltinResourcesApplication; // TODO: would be cool if we used `system::ISystem::listItemsInDirectory(sharedInputCWD/"GLI")` as our dataset static constexpr std::array imagesToLoad = { diff --git a/09_GeometryCreator/main.cpp b/09_GeometryCreator/main.cpp index 6fddd8282..38daebaa5 100644 --- a/09_GeometryCreator/main.cpp +++ b/09_GeometryCreator/main.cpp @@ -2,14 +2,14 @@ // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h + #include "common.hpp" -// TODO: Arek, we should have a `nbl::examples` class inheriting from `application_templates::MonoAssetManagerAndBuiltinResourceApplication` which -// during `onAppInitialized` also mounts correct `common/include/nbl/examples` and `common/src/nbl/examples` as folder or builtin -class GeometryCreatorApp final : public MonoWindowApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication + +class GeometryCreatorApp final : public MonoWindowApplication, public BuiltinResourcesApplication { using device_base_t = MonoWindowApplication; - using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; + using asset_base_t = BuiltinResourcesApplication; public: GeometryCreatorApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) @@ -58,10 +58,6 @@ class GeometryCreatorApp final : public MonoWindowApplication, public applicatio .addtionalBufferOwnershipFamilies = addtionalBufferOwnershipFamilies },patch ); - - // TODO: this is plain wrong Arek - m_system->mount(make_smart_refctd_ptr(localInputCWD/"../common/include/nbl/examples",smart_refctd_ptr(m_logger),m_system.get()),"nbl/examples"); - m_system->mount(make_smart_refctd_ptr(localInputCWD/"../common/src/nbl/examples",smart_refctd_ptr(m_logger),m_system.get()),"nbl/examples"); auto scRes = static_cast(m_surface->getSwapchainResources()); m_renderer = CSimpleDebugRenderer::create(m_assetMgr.get(),scRes->getRenderpass(),0,m_scene.get()); diff --git a/10_CountingSort/main.cpp b/10_CountingSort/main.cpp index 0efc0518e..d51650919 100644 --- a/10_CountingSort/main.cpp +++ b/10_CountingSort/main.cpp @@ -1,18 +1,21 @@ #include "nbl/examples/examples.hpp" using namespace nbl; -using namespace core; -using namespace system; -using namespace asset; -using namespace video; +using namespace nbl::core; +using 
namespace nbl::hlsl; +using namespace nbl::system; +using namespace nbl::asset; +using namespace nbl::ui; +using namespace nbl::video; +using namespace nbl::examples; #include "app_resources/common.hlsl" #include "nbl/builtin/hlsl/bit.hlsl" -class CountingSortApp final : public application_templates::MonoDeviceApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication +class CountingSortApp final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication { using device_base_t = application_templates::MonoDeviceApplication; - using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; + using asset_base_t = BuiltinResourcesApplication; public: // Yay thanks to multiple inheritance we cannot forward ctors anymore diff --git a/11_FFT/main.cpp b/11_FFT/main.cpp index ad9bbfd47..3829e8481 100644 --- a/11_FFT/main.cpp +++ b/11_FFT/main.cpp @@ -3,17 +3,16 @@ // For conditions of distribution and use, see copyright notice in nabla.h -// I've moved out a tiny part of this example into a shared header for reuse, please open and read it. 
-#include "nbl/application_templates/MonoDeviceApplication.hpp" -#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" - +#include "nbl/examples/examples.hpp" using namespace nbl; -using namespace core; -using namespace system; -using namespace asset; -using namespace video; - +using namespace nbl::core; +using namespace nbl::hlsl; +using namespace nbl::system; +using namespace nbl::asset; +using namespace nbl::ui; +using namespace nbl::video; +using namespace nbl::examples; #include "app_resources/common.hlsl" #include "nbl/builtin/hlsl/bit.hlsl" @@ -21,10 +20,10 @@ using namespace video; // Simple showcase of how to run FFT on a 1D array -class FFT_Test final : public application_templates::MonoDeviceApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication +class FFT_Test final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication { using device_base_t = application_templates::MonoDeviceApplication; - using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; + using asset_base_t = BuiltinResourcesApplication; smart_refctd_ptr m_pipeline; diff --git a/22_CppCompat/CIntrinsicsTester.h b/22_CppCompat/CIntrinsicsTester.h index 77aa2c1ca..d053977c0 100644 --- a/22_CppCompat/CIntrinsicsTester.h +++ b/22_CppCompat/CIntrinsicsTester.h @@ -1,12 +1,13 @@ #ifndef _NBL_EXAMPLES_TESTS_22_CPP_COMPAT_C_INTRINSICS_TESTER_INCLUDED_ #define _NBL_EXAMPLES_TESTS_22_CPP_COMPAT_C_INTRINSICS_TESTER_INCLUDED_ -#include + +#include "nbl/examples/examples.hpp" + #include "app_resources/common.hlsl" -#include "nbl/application_templates/MonoDeviceApplication.hpp" -#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" #include "ITester.h" + using namespace nbl; class CIntrinsicsTester final : public ITester diff --git a/22_CppCompat/CTgmathTester.h b/22_CppCompat/CTgmathTester.h index 6d2b23c73..63b0e483e 100644 --- 
a/22_CppCompat/CTgmathTester.h +++ b/22_CppCompat/CTgmathTester.h @@ -1,12 +1,13 @@ #ifndef _NBL_EXAMPLES_TESTS_22_CPP_COMPAT_C_TGMATH_TESTER_INCLUDED_ #define _NBL_EXAMPLES_TESTS_22_CPP_COMPAT_C_TGMATH_TESTER_INCLUDED_ -#include + +#include "nbl/examples/examples.hpp" + #include "app_resources/common.hlsl" -#include "nbl/application_templates/MonoDeviceApplication.hpp" -#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" #include "ITester.h" + using namespace nbl; class CTgmathTester final : public ITester diff --git a/22_CppCompat/ITester.h b/22_CppCompat/ITester.h index 32138f198..9f2353c95 100644 --- a/22_CppCompat/ITester.h +++ b/22_CppCompat/ITester.h @@ -1,12 +1,13 @@ #ifndef _NBL_EXAMPLES_TESTS_22_CPP_COMPAT_I_TESTER_INCLUDED_ #define _NBL_EXAMPLES_TESTS_22_CPP_COMPAT_I_TESTER_INCLUDED_ -#include + +#include "nbl/examples/examples.hpp" + #include "app_resources/common.hlsl" -#include "nbl/application_templates/MonoDeviceApplication.hpp" -#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" #include "nbl/asset/metadata/CHLSLMetadata.h" + using namespace nbl; class ITester diff --git a/22_CppCompat/main.cpp b/22_CppCompat/main.cpp index a5a819d49..70c8d7b3a 100644 --- a/22_CppCompat/main.cpp +++ b/22_CppCompat/main.cpp @@ -1,26 +1,26 @@ // Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h -#include -#include -#include -#include -#include "nbl/application_templates/MonoDeviceApplication.hpp" -#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" #include "app_resources/common.hlsl" #include "CTgmathTester.h" #include "CIntrinsicsTester.h" +#include +#include +#include + + +using namespace nbl; using namespace nbl::core; using namespace nbl::hlsl; using namespace nbl::system; using namespace nbl::asset; +using namespace nbl::ui; using namespace nbl::video; -using namespace nbl::application_templates; - +using namespace nbl::examples; //using namespace glm; @@ -43,10 +43,10 @@ struct T float32_t4 h; }; -class CompatibilityTest final : public MonoDeviceApplication, public MonoAssetManagerAndBuiltinResourceApplication +class CompatibilityTest final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication { - using device_base_t = MonoDeviceApplication; - using asset_base_t = MonoAssetManagerAndBuiltinResourceApplication; + using device_base_t = application_templates::MonoDeviceApplication; + using asset_base_t = BuiltinResourcesApplication; public: CompatibilityTest(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp index da0d3de7d..3939fd443 100644 --- a/23_Arithmetic2UnitTest/main.cpp +++ b/23_Arithmetic2UnitTest/main.cpp @@ -1,9 +1,13 @@ -#include "nbl/application_templates/BasicMultiQueueApplication.hpp" -#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" +// TODO: copyright notice + + +#include "nbl/examples/examples.hpp" + #include "app_resources/common.hlsl" #include "nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl" #include 
"nbl/builtin/hlsl/subgroup2/arithmetic_params.hlsl" + using namespace nbl; using namespace core; using namespace asset; @@ -47,10 +51,10 @@ struct emulatedScanExclusive static inline constexpr const char* name = "exclusive_scan"; }; -class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueueApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication +class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueueApplication, public examples::BuiltinResourcesApplication { using device_base_t = application_templates::BasicMultiQueueApplication; - using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; + using asset_base_t = examples::BuiltinResourcesApplication; public: Workgroup2ScanTestApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : diff --git a/24_ColorSpaceTest/main.cpp b/24_ColorSpaceTest/main.cpp index 56af4fc79..84c55ef3a 100644 --- a/24_ColorSpaceTest/main.cpp +++ b/24_ColorSpaceTest/main.cpp @@ -22,10 +22,10 @@ using namespace nbl::examples; // defines for sampler tests can be found in the file below #include "app_resources/push_constants.hlsl" -class ColorSpaceTestSampleApp final : public SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication +class ColorSpaceTestSampleApp final : public SimpleWindowedApplication, public BuiltinResourcesApplication { using device_base_t = SimpleWindowedApplication; - using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; + using asset_base_t = BuiltinResourcesApplication; using clock_t = std::chrono::steady_clock; using perf_clock_resolution_t = std::chrono::milliseconds; diff --git a/26_Blur/main.cpp b/26_Blur/main.cpp index e5105c778..83cf140d6 100644 --- a/26_Blur/main.cpp +++ b/26_Blur/main.cpp @@ -1,6 +1,8 @@ // Copyright (C) 2024-2025 - DevSH Graphics 
Programming Sp. z O.O. // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h + + #include "nbl/examples/examples.hpp" #include @@ -16,10 +18,12 @@ using namespace nbl::examples; #include "app_resources/common.hlsl" -class BlurApp final : public SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication + + +class BlurApp final : public SimpleWindowedApplication, public BuiltinResourcesApplication { using device_base_t = SimpleWindowedApplication; - using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; + using asset_base_t = BuiltinResourcesApplication; using clock_t = std::chrono::steady_clock; public: diff --git a/27_MPMCScheduler/main.cpp b/27_MPMCScheduler/main.cpp index 18d396135..580335a35 100644 --- a/27_MPMCScheduler/main.cpp +++ b/27_MPMCScheduler/main.cpp @@ -1,6 +1,8 @@ // Copyright (C) 2024-2025 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h + + #include "nbl/examples/examples.hpp" using namespace nbl; @@ -13,10 +15,11 @@ using namespace nbl::examples; #include "app_resources/common.hlsl" -class MPMCSchedulerApp final : public SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication + +class MPMCSchedulerApp final : public SimpleWindowedApplication, public BuiltinResourcesApplication { using device_base_t = SimpleWindowedApplication; - using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; + using asset_base_t = BuiltinResourcesApplication; using clock_t = std::chrono::steady_clock; constexpr static inline uint32_t WIN_W = 1280, WIN_H = 720; diff --git a/28_FFTBloom/main.cpp b/28_FFTBloom/main.cpp index 16835ecf6..049bbd581 100644 --- a/28_FFTBloom/main.cpp +++ b/28_FFTBloom/main.cpp @@ -1,27 +1,32 @@ // Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h + + #include "nbl/examples/examples.hpp" using namespace nbl; -using namespace core; -using namespace system; -using namespace asset; -using namespace video; -using namespace ui; +using namespace nbl::core; +using namespace nbl::hlsl; +using namespace nbl::system; +using namespace nbl::asset; +using namespace nbl::ui; +using namespace nbl::video; using namespace nbl::examples; #include "app_resources/common.hlsl" #include "nbl/builtin/hlsl/bit.hlsl" + + // Defaults that match this example's image constexpr uint32_t WIN_W = 1280; constexpr uint32_t WIN_H = 720; -class FFTBloomApp final : public SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication +class FFTBloomApp final : public SimpleWindowedApplication, public BuiltinResourcesApplication { using device_base_t = SimpleWindowedApplication; - using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; + using asset_base_t = BuiltinResourcesApplication; using clock_t = std::chrono::steady_clock; // Windowed App members diff --git a/29_Arithmetic2Bench/main.cpp b/29_Arithmetic2Bench/main.cpp index 0a0e3b35f..75f483db0 100644 --- a/29_Arithmetic2Bench/main.cpp +++ b/29_Arithmetic2Bench/main.cpp @@ -167,10 +167,10 @@ class CExplicitSurfaceFormatResizeSurface final : public ISimpleManagedSurface }; // NOTE added swapchain + drawing frames to be able to profile with Nsight, which still doesn't support profiling headless compute shaders -class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication +class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, public examples::BuiltinResourcesApplication { using device_base_t = examples::SimpleWindowedApplication; - using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; + using 
asset_base_t = examples::BuiltinResourcesApplication; constexpr static inline uint32_t WIN_W = 1280; constexpr static inline uint32_t WIN_H = 720; diff --git a/30_ComputeShaderPathTracer/main.cpp b/30_ComputeShaderPathTracer/main.cpp index 487388ea0..54bc64495 100644 --- a/30_ComputeShaderPathTracer/main.cpp +++ b/30_ComputeShaderPathTracer/main.cpp @@ -1,20 +1,23 @@ // Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h" -#include "nbl/this_example/common.hpp" +#include "nbl/examples/examples.hpp" + +#include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h" #include "nbl/builtin/hlsl/surface_transform.h" +#include "nbl/this_example/common.hpp" + using namespace nbl; -using namespace core; -using namespace hlsl; -using namespace system; -using namespace asset; -using namespace ui; -using namespace video; +using namespace nbl::core; +using namespace nbl::hlsl; +using namespace nbl::system; +using namespace nbl::asset; +using namespace nbl::ui; +using namespace nbl::video; using namespace nbl::examples; // TODO: share push constants @@ -26,10 +29,10 @@ struct PTPushConstant { // TODO: Add a QueryPool for timestamping once its ready (actually add IMGUI mspf plotter) // TODO: Do buffer creation using assConv -class ComputeShaderPathtracer final : public SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication +class ComputeShaderPathtracer final : public SimpleWindowedApplication, public BuiltinResourcesApplication { using device_base_t = SimpleWindowedApplication; - using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; + using asset_base_t = BuiltinResourcesApplication; using clock_t = std::chrono::steady_clock; enum E_LIGHT_GEOMETRY : uint8_t diff --git a/64_EmulatedFloatTest/main.cpp 
b/64_EmulatedFloatTest/main.cpp index b44cb2b4e..fd3e465e7 100644 --- a/64_EmulatedFloatTest/main.cpp +++ b/64_EmulatedFloatTest/main.cpp @@ -1,35 +1,38 @@ // Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h + + +#include "nbl/examples/examples.hpp" + #include #include #include #include #include -#include "nbl/application_templates/MonoDeviceApplication.hpp" -#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" - #include "app_resources/common.hlsl" #include "app_resources/benchmark/common.hlsl" #include "nbl/builtin/hlsl/ieee754.hlsl" #include + using namespace nbl::core; using namespace nbl::hlsl; using namespace nbl::system; using namespace nbl::asset; using namespace nbl::video; using namespace nbl::application_templates; +using namespace nbl::examples; constexpr bool DoTests = true; constexpr bool DoBenchmark = true; -class CompatibilityTest final : public MonoDeviceApplication, public MonoAssetManagerAndBuiltinResourceApplication +class CompatibilityTest final : public MonoDeviceApplication, public BuiltinResourcesApplication { using device_base_t = MonoDeviceApplication; - using asset_base_t = MonoAssetManagerAndBuiltinResourceApplication; + using asset_base_t = BuiltinResourcesApplication; public: CompatibilityTest(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} diff --git a/68_JpegLoading/main.cpp b/68_JpegLoading/main.cpp index 5ef9b637d..663b40759 100644 --- a/68_JpegLoading/main.cpp +++ b/68_JpegLoading/main.cpp @@ -1,22 +1,26 @@ // Copyright (C) 2018-2024 - DevSH GrapMonoAssetManagerAndBuiltinResourceApplicationhics Programming Sp. z O.O. // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" + + +#include "nbl/examples/examples.hpp" #include #include "nlohmann/json.hpp" #include "argparse/argparse.hpp" + using json = nlohmann::json; using namespace nbl; -using namespace core; -using namespace hlsl; -using namespace system; -using namespace asset; -using namespace ui; -using namespace video; +using namespace nbl::core; +using namespace nbl::hlsl; +using namespace nbl::system; +using namespace nbl::asset; +using namespace nbl::ui; +using namespace nbl::video; +using namespace nbl::examples; class ThreadPool { @@ -76,11 +80,11 @@ using task_t = std::function; std::atomic m_shouldStop = false; }; -class JpegLoaderApp final : public application_templates::MonoAssetManagerAndBuiltinResourceApplication +class JpegLoaderApp final : public BuiltinResourcesApplication { using clock_t = std::chrono::steady_clock; using clock_resolution_t = std::chrono::milliseconds; - using base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; + using base_t = BuiltinResourcesApplication; public: using base_t::base_t; diff --git a/70_FLIPFluids/main.cpp b/70_FLIPFluids/main.cpp index d66b56811..66596c526 100644 --- a/70_FLIPFluids/main.cpp +++ b/70_FLIPFluids/main.cpp @@ -1,10 +1,13 @@ // Copyright (C) 2024-2025 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h + + #include "nbl/examples/examples.hpp" // TODO: why is it not in nabla.h ? 
#include "nbl/asset/metadata/CHLSLMetadata.h" +using namespace nbl; using namespace nbl::core; using namespace nbl::hlsl; using namespace nbl::system; @@ -161,10 +164,10 @@ class CEventCallback : public ISimpleManagedSurface::ICallback nbl::system::logger_opt_smart_ptr m_logger = nullptr; }; -class FLIPFluidsApp final : public SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication +class FLIPFluidsApp final : public SimpleWindowedApplication, public BuiltinResourcesApplication { using device_base_t = SimpleWindowedApplication; - using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; + using asset_base_t = BuiltinResourcesApplication; using clock_t = std::chrono::steady_clock; constexpr static inline uint32_t WIN_WIDTH = 1280, WIN_HEIGHT = 720; diff --git a/common/include/nbl/examples/PCH.hpp b/common/include/nbl/examples/PCH.hpp index 3b1e6beaa..4d2025f5f 100644 --- a/common/include/nbl/examples/PCH.hpp +++ b/common/include/nbl/examples/PCH.hpp @@ -15,9 +15,7 @@ //! Common example interface headers -// TODO: examine moving this header to `nbl/examples/common` -#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" - +#include "nbl/examples/common/BuiltinResourcesApplication.hpp" #include "nbl/examples/common/SimpleWindowedApplication.hpp" #include "nbl/examples/common/MonoWindowApplication.hpp" #include "nbl/examples/common/InputSystem.hpp" diff --git a/common/include/nbl/examples/common/BuiltinResourcesApplication.hpp b/common/include/nbl/examples/common/BuiltinResourcesApplication.hpp new file mode 100644 index 000000000..2f1884470 --- /dev/null +++ b/common/include/nbl/examples/common/BuiltinResourcesApplication.hpp @@ -0,0 +1,63 @@ +// Copyright (C) 2023-2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_EXAMPLES_BUILTIN_RESOURCE_APPLICATION_HPP_INCLUDED_ +#define _NBL_EXAMPLES_BUILTIN_RESOURCE_APPLICATION_HPP_INCLUDED_ + + +// we need a system, logger and an asset manager +#include "nbl/application_templates/MonoAssetManagerApplication.hpp" + +#ifdef NBL_EMBED_BUILTIN_RESOURCES +// TODO: the include/header `nbl/examples` archive +// TODO: the source `nbl/examples` archive +// TODO: the build `nbl/examples` archive +#include "nbl/this_example/builtin/CArchive.h" +#endif + + +namespace nbl::examples +{ + +// Virtual Inheritance because apps might end up doing diamond inheritance +class BuiltinResourcesApplication : public virtual application_templates::MonoAssetManagerApplication +{ + using base_t = MonoAssetManagerApplication; + + public: + using base_t::base_t; + + protected: + // need this one for skipping passing all args into ApplicationFramework + BuiltinResourcesApplication() = default; + + virtual bool onAppInitialized(core::smart_refctd_ptr&& system) override + { + if (!base_t::onAppInitialized(std::move(system))) + return false; + + using namespace core; + + smart_refctd_ptr examplesHeaderArch,examplesSourceArch,examplesBuildArch,thisExampleArch; + #ifdef NBL_EMBED_BUILTIN_RESOURCES +// TODO: the 3 examples archives + thisExampleArch = make_smart_refctd_ptr(smart_refctd_ptr(m_logger)); + #else + examplesHeaderArch = make_smart_refctd_ptr(localInputCWD/"../common/include/nbl/examples",smart_refctd_ptr(m_logger),m_system.get()); + examplesSourceArch = make_smart_refctd_ptr(localInputCWD/"../common/src/nbl/examples",smart_refctd_ptr(m_logger),m_system.get()); +// TODO: examplesBuildArch = + thisExampleArch = make_smart_refctd_ptr(localInputCWD/"app_resources",smart_refctd_ptr(m_logger),m_system.get()); + #endif + // yes all 3 aliases are meant to be the same + m_system->mount(std::move(examplesHeaderArch),"nbl/examples"); + 
m_system->mount(std::move(examplesSourceArch),"nbl/examples"); +// m_system->mount(std::move(examplesBuildArch),"nbl/examples"); + m_system->mount(std::move(thisExampleArch),"app_resources"); + + return true; + } +}; + +} + +#endif // _CAMERA_IMPL_ \ No newline at end of file From 20a5438ef09fddbecb96e40c4180227bf443ffcd Mon Sep 17 00:00:00 2001 From: devsh Date: Sun, 22 Jun 2025 01:19:57 +0200 Subject: [PATCH 283/296] correct namespace ambiguities affecting example 23 and 29 --- common/include/nbl/examples/workgroup/DataAccessors.hlsl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/include/nbl/examples/workgroup/DataAccessors.hlsl b/common/include/nbl/examples/workgroup/DataAccessors.hlsl index f94121ec0..ca5915f2c 100644 --- a/common/include/nbl/examples/workgroup/DataAccessors.hlsl +++ b/common/include/nbl/examples/workgroup/DataAccessors.hlsl @@ -101,14 +101,14 @@ struct PreloadedDataProxy void preload() { - const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); + const uint16_t invocationIndex = hlsl::workgroup::SubgroupContiguousIndex(); [unroll] for (uint16_t idx = 0; idx < PreloadedDataCount; idx++) data.template get(idx * WorkgroupSize + invocationIndex, preloaded[idx]); } void unload() { - const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); + const uint16_t invocationIndex = hlsl::workgroup::SubgroupContiguousIndex(); [unroll] for (uint16_t idx = 0; idx < PreloadedDataCount; idx++) data.template set(idx * WorkgroupSize + invocationIndex, preloaded[idx]); From 21a88ff56a08673716648cba490fa9282ce8b065 Mon Sep 17 00:00:00 2001 From: devsh Date: Sun, 22 Jun 2025 02:00:58 +0200 Subject: [PATCH 284/296] aaah the `BuiltinResourcesApplication.hpp` header needs some special treatment to NOT include `this_example/CArchive` --- common/include/nbl/examples/PCH.hpp | 2 -- .../nbl/examples/common/BuiltinResourcesApplication.hpp | 5 ++++- common/include/nbl/examples/examples.hpp | 5 +++++ 3 files changed, 9 
insertions(+), 3 deletions(-) diff --git a/common/include/nbl/examples/PCH.hpp b/common/include/nbl/examples/PCH.hpp index 4d2025f5f..0905465c2 100644 --- a/common/include/nbl/examples/PCH.hpp +++ b/common/include/nbl/examples/PCH.hpp @@ -14,8 +14,6 @@ #include "nabla.h" //! Common example interface headers - -#include "nbl/examples/common/BuiltinResourcesApplication.hpp" #include "nbl/examples/common/SimpleWindowedApplication.hpp" #include "nbl/examples/common/MonoWindowApplication.hpp" #include "nbl/examples/common/InputSystem.hpp" diff --git a/common/include/nbl/examples/common/BuiltinResourcesApplication.hpp b/common/include/nbl/examples/common/BuiltinResourcesApplication.hpp index 2f1884470..d183a9f4b 100644 --- a/common/include/nbl/examples/common/BuiltinResourcesApplication.hpp +++ b/common/include/nbl/examples/common/BuiltinResourcesApplication.hpp @@ -12,6 +12,7 @@ // TODO: the include/header `nbl/examples` archive // TODO: the source `nbl/examples` archive // TODO: the build `nbl/examples` archive +// TODO: make the `this_example` optional, only if the example has builtins #include "nbl/this_example/builtin/CArchive.h" #endif @@ -41,7 +42,9 @@ class BuiltinResourcesApplication : public virtual application_templates::MonoAs smart_refctd_ptr examplesHeaderArch,examplesSourceArch,examplesBuildArch,thisExampleArch; #ifdef NBL_EMBED_BUILTIN_RESOURCES // TODO: the 3 examples archives - thisExampleArch = make_smart_refctd_ptr(smart_refctd_ptr(m_logger)); + #ifdef _NBL_THIS_EXAMPLE_BUILTIN_C_ARCHIVE_H_ + thisExampleArch = make_smart_refctd_ptr(smart_refctd_ptr(m_logger)); + #endif #else examplesHeaderArch = make_smart_refctd_ptr(localInputCWD/"../common/include/nbl/examples",smart_refctd_ptr(m_logger),m_system.get()); examplesSourceArch = make_smart_refctd_ptr(localInputCWD/"../common/src/nbl/examples",smart_refctd_ptr(m_logger),m_system.get()); diff --git a/common/include/nbl/examples/examples.hpp b/common/include/nbl/examples/examples.hpp index 
985a3960a..d82303514 100644 --- a/common/include/nbl/examples/examples.hpp +++ b/common/include/nbl/examples/examples.hpp @@ -4,6 +4,7 @@ #ifndef _NBL_EXAMPLES_HPP_ #define _NBL_EXAMPLES_HPP_ + //! Precompiled header shared across all examples #include "nbl/examples/PCH.hpp" @@ -16,4 +17,8 @@ // #include "..." +// Cannot be in PCH because depens on definition of `this_example` for Example's builtins +#include "nbl/examples/common/BuiltinResourcesApplication.hpp" + + #endif // _NBL_EXAMPLES_HPP_ \ No newline at end of file From 1b3c19cb84d618f20c77469c86a665544889aff7 Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 23 Jun 2025 09:46:26 +0200 Subject: [PATCH 285/296] make the `BuiltinResourcesApplication` work for examples without builtins too --- 09_GeometryCreator/include/common.hpp | 5 +++-- .../nbl/examples/common/BuiltinResourcesApplication.hpp | 8 +++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/09_GeometryCreator/include/common.hpp b/09_GeometryCreator/include/common.hpp index d172e1959..84cd8118a 100644 --- a/09_GeometryCreator/include/common.hpp +++ b/09_GeometryCreator/include/common.hpp @@ -1,8 +1,8 @@ #ifndef _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_ #define _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_ -// TODO: @AnastaZIuk do we even make that explicit? 
-#include "nbl/examples/PCH.hpp" + +#include "nbl/examples/examples.hpp" using namespace nbl; using namespace core; @@ -14,4 +14,5 @@ using namespace video; using namespace scene; using namespace nbl::examples; + #endif // __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__ \ No newline at end of file diff --git a/common/include/nbl/examples/common/BuiltinResourcesApplication.hpp b/common/include/nbl/examples/common/BuiltinResourcesApplication.hpp index d183a9f4b..aa1949ecd 100644 --- a/common/include/nbl/examples/common/BuiltinResourcesApplication.hpp +++ b/common/include/nbl/examples/common/BuiltinResourcesApplication.hpp @@ -12,8 +12,9 @@ // TODO: the include/header `nbl/examples` archive // TODO: the source `nbl/examples` archive // TODO: the build `nbl/examples` archive -// TODO: make the `this_example` optional, only if the example has builtins -#include "nbl/this_example/builtin/CArchive.h" +#if __has_include("nbl/this_example/builtin/CArchive.h") + #include "nbl/this_example/builtin/CArchive.h" +#endif #endif @@ -55,7 +56,8 @@ class BuiltinResourcesApplication : public virtual application_templates::MonoAs m_system->mount(std::move(examplesHeaderArch),"nbl/examples"); m_system->mount(std::move(examplesSourceArch),"nbl/examples"); // m_system->mount(std::move(examplesBuildArch),"nbl/examples"); - m_system->mount(std::move(thisExampleArch),"app_resources"); + if (thisExampleArch) + m_system->mount(std::move(thisExampleArch),"app_resources"); return true; } From 7ec6846d21be7893cf169703b0f7406e90bb8680 Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 23 Jun 2025 17:13:04 +0200 Subject: [PATCH 286/296] make CSwapchainFramebuffersAndDepth support runtime depth buffer resignation --- .../common/CSwapchainFramebuffersAndDepth.hpp | 57 +++++++++++-------- 1 file changed, 32 insertions(+), 25 deletions(-) diff --git a/common/include/nbl/examples/common/CSwapchainFramebuffersAndDepth.hpp b/common/include/nbl/examples/common/CSwapchainFramebuffersAndDepth.hpp index 
ef88fb325..c7d780fdf 100644 --- a/common/include/nbl/examples/common/CSwapchainFramebuffersAndDepth.hpp +++ b/common/include/nbl/examples/common/CSwapchainFramebuffersAndDepth.hpp @@ -18,6 +18,10 @@ class CSwapchainFramebuffersAndDepth final : public video::CDefaultSwapchainFram template inline CSwapchainFramebuffersAndDepth(video::ILogicalDevice* device, const asset::E_FORMAT _desiredDepthFormat, Args&&... args) : base_t(device,std::forward(args)...) { + // user didn't want any depth + if (_desiredDepthFormat==asset::EF_UNKNOWN) + return; + using namespace nbl::asset; using namespace nbl::video; const IPhysicalDevice::SImageFormatPromotionRequest req = { @@ -55,32 +59,34 @@ class CSwapchainFramebuffersAndDepth final : public video::CDefaultSwapchainFram { using namespace nbl::asset; using namespace nbl::video; - // DOCS: why are we not using `m_device` here? any particular reason? - auto device = const_cast(m_renderpass->getOriginDevice()); - - const auto depthFormat = m_renderpass->getCreationParameters().depthStencilAttachments[0].format; - const auto& sharedParams = getSwapchain()->getCreationParameters().sharedParams; - auto image = device->createImage({ IImage::SCreationParams{ - .type = IGPUImage::ET_2D, - .samples = IGPUImage::ESCF_1_BIT, - .format = depthFormat, - .extent = {sharedParams.width,sharedParams.height,1}, - .mipLevels = 1, - .arrayLayers = 1, - .depthUsage = IGPUImage::EUF_RENDER_ATTACHMENT_BIT - } }); + if (m_depthFormat!=asset::EF_UNKNOWN) + { + // DOCS: why are we not using `m_device` here? any particular reason? 
+ auto device = const_cast(m_renderpass->getOriginDevice()); - device->allocate(image->getMemoryReqs(), image.get()); + const auto depthFormat = m_renderpass->getCreationParameters().depthStencilAttachments[0].format; + const auto& sharedParams = getSwapchain()->getCreationParameters().sharedParams; + auto image = device->createImage({ IImage::SCreationParams{ + .type = IGPUImage::ET_2D, + .samples = IGPUImage::ESCF_1_BIT, + .format = depthFormat, + .extent = {sharedParams.width,sharedParams.height,1}, + .mipLevels = 1, + .arrayLayers = 1, + .depthUsage = IGPUImage::EUF_RENDER_ATTACHMENT_BIT + } }); - m_depthBuffer = device->createImageView({ - .flags = IGPUImageView::ECF_NONE, - .subUsages = IGPUImage::EUF_RENDER_ATTACHMENT_BIT, - .image = std::move(image), - .viewType = IGPUImageView::ET_2D, - .format = depthFormat, - .subresourceRange = {IGPUImage::EAF_DEPTH_BIT,0,1,0,1} - }); + device->allocate(image->getMemoryReqs(), image.get()); + m_depthBuffer = device->createImageView({ + .flags = IGPUImageView::ECF_NONE, + .subUsages = IGPUImage::EUF_RENDER_ATTACHMENT_BIT, + .image = std::move(image), + .viewType = IGPUImageView::ET_2D, + .format = depthFormat, + .subresourceRange = {IGPUImage::EAF_DEPTH_BIT,0,1,0,1} + }); + } const auto retval = base_t::onCreateSwapchain_impl(qFam); m_depthBuffer = nullptr; return retval; @@ -88,11 +94,12 @@ class CSwapchainFramebuffersAndDepth final : public video::CDefaultSwapchainFram inline core::smart_refctd_ptr createFramebuffer(video::IGPUFramebuffer::SCreationParams&& params) override { - params.depthStencilAttachments = &m_depthBuffer.get(); + if (m_depthBuffer) + params.depthStencilAttachments = &m_depthBuffer.get(); return m_device->createFramebuffer(std::move(params)); } - asset::E_FORMAT m_depthFormat; + asset::E_FORMAT m_depthFormat = asset::EF_UNKNOWN; // only used to pass a parameter from `onCreateSwapchain_impl` to `createFramebuffer` core::smart_refctd_ptr m_depthBuffer; }; From 64e7b26f196e7636d2b2aa9c7f09275042d1a82b 
Mon Sep 17 00:00:00 2001 From: devsh Date: Mon, 23 Jun 2025 17:34:41 +0200 Subject: [PATCH 287/296] remake example 61, not thoroughly tested, some TODOs remain for @AnastaZIuk --- 09_GeometryCreator/main.cpp | 11 +- 61_UI/include/transform.hpp | 22 +- 61_UI/main.cpp | 1350 ++++++++++++++++++----------------- 3 files changed, 721 insertions(+), 662 deletions(-) diff --git a/09_GeometryCreator/main.cpp b/09_GeometryCreator/main.cpp index 38daebaa5..1a959f7a0 100644 --- a/09_GeometryCreator/main.cpp +++ b/09_GeometryCreator/main.cpp @@ -16,13 +16,6 @@ class GeometryCreatorApp final : public MonoWindowApplication, public BuiltinRes : IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD), device_base_t({1280,720}, EF_D16_UNORM, _localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} - SPhysicalDeviceFeatures getRequiredDeviceFeatures() const override - { - auto retval = device_base_t::getRequiredDeviceFeatures(); - retval.geometryShader = true; - return retval; - } - inline bool onAppInitialized(smart_refctd_ptr&& system) override { if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system))) @@ -100,7 +93,6 @@ class GeometryCreatorApp final : public MonoWindowApplication, public BuiltinRes camera.endInputProcessing(nextPresentationTimestamp); } - auto* queue = getGraphicsQueue(); asset::SViewport viewport; { @@ -155,6 +147,7 @@ class GeometryCreatorApp final : public MonoWindowApplication, public BuiltinRes m_renderer->render(cb,viewParams); cb->endRenderPass(); + cb->endDebugMarker(); cb->end(); IQueue::SSubmitInfo::SSemaphoreInfo retval = @@ -183,7 +176,7 @@ class GeometryCreatorApp final : public MonoWindowApplication, public BuiltinRes } }; - if (queue->submit(infos) != IQueue::RESULT::SUCCESS) + if (getGraphicsQueue()->submit(infos) != IQueue::RESULT::SUCCESS) { retval.semaphore = nullptr; // so that we don't wait on semaphore that will never signal m_realFrameIx--; diff --git a/61_UI/include/transform.hpp 
b/61_UI/include/transform.hpp index 88a78f751..fb1672c2f 100644 --- a/61_UI/include/transform.hpp +++ b/61_UI/include/transform.hpp @@ -1,20 +1,23 @@ -#ifndef __NBL_THIS_EXAMPLE_TRANSFORM_H_INCLUDED__ -#define __NBL_THIS_EXAMPLE_TRANSFORM_H_INCLUDED__ +#ifndef _NBL_THIS_EXAMPLE_TRANSFORM_H_INCLUDED_ +#define _NBL_THIS_EXAMPLE_TRANSFORM_H_INCLUDED_ + #include "nbl/ui/ICursorControl.h" + #include "nbl/ext/ImGui/ImGui.h" + #include "imgui/imgui_internal.h" #include "imguizmo/ImGuizmo.h" -static constexpr inline auto OfflineSceneTextureIx = 1u; struct TransformRequestParams { - bool useWindow = true, editTransformDecomposition = false, enableViewManipulate = false; float camDistance = 8.f; + uint8_t sceneTexDescIx = ~0; + bool useWindow = true, editTransformDecomposition = false, enableViewManipulate = false; }; -void EditTransform(float* cameraView, const float* cameraProjection, float* matrix, const TransformRequestParams& params) +nbl::hlsl::uint16_t2 EditTransform(float* cameraView, const float* cameraProjection, float* matrix, const TransformRequestParams& params) { static ImGuizmo::OPERATION mCurrentGizmoOperation(ImGuizmo::TRANSLATE); static ImGuizmo::MODE mCurrentGizmoMode(ImGuizmo::LOCAL); @@ -99,11 +102,12 @@ void EditTransform(float* cameraView, const float* cameraProjection, float* matr rendered is aligned to our texture scene using imgui "cursor" screen positions */ - +// TODO: this shouldn't be handled here I think SImResourceInfo info; - info.textureID = OfflineSceneTextureIx; + info.textureID = params.sceneTexDescIx; info.samplerIx = (uint16_t)nbl::ext::imgui::UI::DefaultSamplerIx::USER; + nbl::hlsl::uint16_t2 retval; if (params.useWindow) { ImGui::SetNextWindowSize(ImVec2(800, 400), ImGuiCond_Appearing); @@ -118,6 +122,7 @@ void EditTransform(float* cameraView, const float* cameraProjection, float* matr ImGui::Image(info, contentRegionSize); ImGuizmo::SetRect(cursorPos.x, cursorPos.y, contentRegionSize.x, contentRegionSize.y); + retval = 
{contentRegionSize.x,contentRegionSize.y}; viewManipulateRight = cursorPos.x + contentRegionSize.x; viewManipulateTop = cursorPos.y; @@ -137,6 +142,7 @@ void EditTransform(float* cameraView, const float* cameraProjection, float* matr ImGui::Image(info, contentRegionSize); ImGuizmo::SetRect(cursorPos.x, cursorPos.y, contentRegionSize.x, contentRegionSize.y); + retval = {contentRegionSize.x,contentRegionSize.y}; viewManipulateRight = cursorPos.x + contentRegionSize.x; viewManipulateTop = cursorPos.y; @@ -149,6 +155,8 @@ void EditTransform(float* cameraView, const float* cameraProjection, float* matr ImGui::End(); ImGui::PopStyleColor(); + + return retval; } #endif // __NBL_THIS_EXAMPLE_TRANSFORM_H_INCLUDED__ \ No newline at end of file diff --git a/61_UI/main.cpp b/61_UI/main.cpp index 17d028f29..d4f21f2e8 100644 --- a/61_UI/main.cpp +++ b/61_UI/main.cpp @@ -5,790 +5,848 @@ #include "common.hpp" /* - Renders scene texture to an offline - framebuffer which color attachment - is then sampled into a imgui window. +Renders scene texture to an offscreen framebuffer whose color attachment is then sampled into a imgui window. - Written with Nabla, it's UI extension - and got integrated with ImGuizmo to - handle scene's object translations. +Written with Nabla's UI extension and got integrated with ImGuizmo to handle scene's object translations. 
*/ - -class UISampleApp final : public SimpleWindowedApplication +class UISampleApp final : public MonoWindowApplication, public BuiltinResourcesApplication { - using device_base_t = SimpleWindowedApplication; - - _NBL_STATIC_INLINE_CONSTEXPR uint32_t WIN_W = 1280, WIN_H = 720; + using device_base_t = MonoWindowApplication; + using asset_base_t = BuiltinResourcesApplication; public: inline UISampleApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) - : IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} - - inline core::vector getSurfaces() const override - { - if (!m_surface) - { - { - auto windowCallback = core::make_smart_refctd_ptr(smart_refctd_ptr(m_inputSystem), smart_refctd_ptr(m_logger)); - IWindow::SCreationParams params = {}; - params.callback = core::make_smart_refctd_ptr(); - params.width = WIN_W; - params.height = WIN_H; - params.x = 32; - params.y = 32; - params.flags = ui::IWindow::ECF_HIDDEN | IWindow::ECF_BORDERLESS | IWindow::ECF_RESIZABLE; - params.windowCaption = "UISampleApp"; - params.callback = windowCallback; - const_cast&>(m_window) = m_winMgr->createWindow(std::move(params)); - } - - auto surface = CSurfaceVulkanWin32::create(smart_refctd_ptr(m_api), smart_refctd_ptr_static_cast(m_window)); - const_cast&>(m_surface) = nbl::video::CSimpleResizeSurface::create(std::move(surface)); - } - - if (m_surface) - return { {m_surface->getSurface()/*,EQF_NONE*/} }; - - return {}; - } + : IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD), + device_base_t({1280,720}, EF_UNKNOWN, _localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} inline bool onAppInitialized(smart_refctd_ptr&& system) override { - m_inputSystem = make_smart_refctd_ptr(logger_opt_smart_ptr(smart_refctd_ptr(m_logger))); - + if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; if 
(!device_base_t::onAppInitialized(smart_refctd_ptr(system))) return false; - m_assetManager = make_smart_refctd_ptr(smart_refctd_ptr(m_system)); - m_semaphore = m_device->createSemaphore(m_realFrameIx); if (!m_semaphore) return logFail("Failed to Create a Semaphore!"); - ISwapchain::SCreationParams swapchainParams = { .surface = m_surface->getSurface() }; - if (!swapchainParams.deduceFormat(m_physicalDevice)) - return logFail("Could not choose a Surface Format for the Swapchain!"); - - const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] = + auto pool = m_device->createCommandPool(getGraphicsQueue()->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + for (auto i = 0u; i(m_device.get(), swapchainParams.surfaceFormat.format, dependencies); - auto* renderpass = scResources->getRenderpass(); - - if (!renderpass) - return logFail("Failed to create Renderpass!"); - - auto gQueue = getGraphicsQueue(); - if (!m_surface || !m_surface->init(gQueue, std::move(scResources), swapchainParams.sharedParams)) - return logFail("Could not create Window & Surface or initialize the Surface!"); - - m_cmdPool = m_device->createCommandPool(gQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - - for (auto i = 0u; i < MaxFramesInFlight; i++) - { - if (!m_cmdPool) + if (!pool) return logFail("Couldn't create Command Pool!"); - if (!m_cmdPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { m_cmdBufs.data() + i, 1 })) + if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{m_cmdBufs.data()+i,1})) return logFail("Couldn't create Command Buffer!"); } - //pass.scene = CScene::create(smart_refctd_ptr(m_utils), smart_refctd_ptr(m_logger), gQueue, geometry); - pass.scene = CScene::create(smart_refctd_ptr(m_utils), smart_refctd_ptr(m_logger), gQueue, geometry); - - nbl::ext::imgui::UI::SCreationParameters params; - - params.resources.texturesInfo = { .setIx = 0u, .bindingIx = 0u }; - 
params.resources.samplersInfo = { .setIx = 0u, .bindingIx = 1u }; - params.assetManager = m_assetManager; - params.pipelineCache = nullptr; - params.pipelineLayout = nbl::ext::imgui::UI::createDefaultPipelineLayout(m_utils->getLogicalDevice(), params.resources.texturesInfo, params.resources.samplersInfo, TexturesAmount); - params.renderpass = smart_refctd_ptr(renderpass); - params.streamingBuffer = nullptr; - params.subpassIx = 0u; - params.transfer = getTransferUpQueue(); - params.utilities = m_utils; - { - pass.ui.manager = nbl::ext::imgui::UI::create(std::move(params)); - - if (!pass.ui.manager) - return false; - - // note that we use default layout provided by our extension, but you are free to create your own by filling nbl::ext::imgui::UI::S_CREATION_PARAMETERS::resources - const auto* descriptorSetLayout = pass.ui.manager->getPipeline()->getLayout()->getDescriptorSetLayout(0u); - const auto& params = pass.ui.manager->getCreationParameters(); - - IDescriptorPool::SCreateInfo descriptorPoolInfo = {}; - descriptorPoolInfo.maxDescriptorCount[static_cast(asset::IDescriptor::E_TYPE::ET_SAMPLER)] = (uint32_t)nbl::ext::imgui::UI::DefaultSamplerIx::COUNT; - descriptorPoolInfo.maxDescriptorCount[static_cast(asset::IDescriptor::E_TYPE::ET_SAMPLED_IMAGE)] = TexturesAmount; - descriptorPoolInfo.maxSets = 1u; - descriptorPoolInfo.flags = IDescriptorPool::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT; - - m_descriptorSetPool = m_device->createDescriptorPool(std::move(descriptorPoolInfo)); - assert(m_descriptorSetPool); - - m_descriptorSetPool->createDescriptorSets(1u, &descriptorSetLayout, &pass.ui.descriptorSet); - assert(pass.ui.descriptorSet); - } - pass.ui.manager->registerListener([this]() -> void + const uint32_t addtionalBufferOwnershipFamilies[] = {getGraphicsQueue()->getFamilyIndex()}; + // we want to use the vertex data through UTBs + using usage_f = IGPUBuffer::E_USAGE_FLAGS; + CAssetConverter::patch_t patch = {}; + patch.positionBufferUsages = 
usage_f::EUF_UNIFORM_TEXEL_BUFFER_BIT; + patch.indexBufferUsages = usage_f::EUF_INDEX_BUFFER_BIT; + patch.otherBufferUsages = usage_f::EUF_UNIFORM_TEXEL_BUFFER_BIT; + m_scene = CGeometryCreatorScene::create( { - ImGuiIO& io = ImGui::GetIO(); - - camera.setProjectionMatrix([&]() - { - static matrix4SIMD projection; - - if (isPerspective) - if(isLH) - projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovLH(core::radians(fov), io.DisplaySize.x / io.DisplaySize.y, zNear, zFar); - else - projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(core::radians(fov), io.DisplaySize.x / io.DisplaySize.y, zNear, zFar); - else + .transferQueue = getTransferUpQueue(), + .utilities = m_utils.get(), + .logger = m_logger.get(), + .addtionalBufferOwnershipFamilies = addtionalBufferOwnershipFamilies + },patch + ); + + // for the scene drawing pass + { + IGPURenderpass::SCreationParams params = {}; + const IGPURenderpass::SCreationParams::SDepthStencilAttachmentDescription depthAttachments[] = { + {{ { - float viewHeight = viewWidth * io.DisplaySize.y / io.DisplaySize.x; - - if(isLH) - projection = matrix4SIMD::buildProjectionMatrixOrthoLH(viewWidth, viewHeight, zNear, zFar); - else - projection = matrix4SIMD::buildProjectionMatrixOrthoRH(viewWidth, viewHeight, zNear, zFar); + .format = sceneRenderDepthFormat, + .samples = IGPUImage::ESCF_1_BIT, + .mayAlias = false + }, + /*.loadOp = */{IGPURenderpass::LOAD_OP::CLEAR}, + /*.storeOp = */{IGPURenderpass::STORE_OP::STORE}, + /*.initialLayout = */{IGPUImage::LAYOUT::UNDEFINED}, + /*.finalLayout = */{IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL} + }}, + IGPURenderpass::SCreationParams::DepthStencilAttachmentsEnd + }; + params.depthStencilAttachments = depthAttachments; + const IGPURenderpass::SCreationParams::SColorAttachmentDescription colorAttachments[] = { + {{ + { + .format = finalSceneRenderFormat, + .samples = IGPUImage::E_SAMPLE_COUNT_FLAGS::ESCF_1_BIT, + .mayAlias = false + }, + /*.loadOp = 
*/IGPURenderpass::LOAD_OP::CLEAR, + /*.storeOp = */IGPURenderpass::STORE_OP::STORE, + /*.initialLayout = */IGPUImage::LAYOUT::UNDEFINED, + /*.finalLayout = */ IGPUImage::LAYOUT::READ_ONLY_OPTIMAL // ImGUI shall read + }}, + IGPURenderpass::SCreationParams::ColorAttachmentsEnd + }; + params.colorAttachments = colorAttachments; + IGPURenderpass::SCreationParams::SSubpassDescription subpasses[] = { + {}, + IGPURenderpass::SCreationParams::SubpassesEnd + }; + subpasses[0].colorAttachments[0] = {.render={.attachmentIndex=0,.layout=IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL}}; + params.subpasses = subpasses; + + const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] = { + // wipe-transition of Color to ATTACHMENT_OPTIMAL and depth + { + .srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, + .dstSubpass = 0, + .memoryBarrier = { + // last place where the depth can get modified in previous frame, `COLOR_ATTACHMENT_OUTPUT_BIT` is implicitly later + // while color is sampled by ImGUI + .srcStageMask = PIPELINE_STAGE_FLAGS::LATE_FRAGMENT_TESTS_BIT|PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT, + // don't want any writes to be available, as we are clearing both attachments + .srcAccessMask = ACCESS_FLAGS::NONE, + // destination needs to wait as early as possible + // TODO: `COLOR_ATTACHMENT_OUTPUT_BIT` shouldn't be needed, because its a logically later stage, see TODO in `ECommonEnums.h` + .dstStageMask = PIPELINE_STAGE_FLAGS::EARLY_FRAGMENT_TESTS_BIT|PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, + // because depth and color get cleared first no read mask + .dstAccessMask = ACCESS_FLAGS::DEPTH_STENCIL_ATTACHMENT_WRITE_BIT|ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT } + // leave view offsets and flags default + }, + { + .srcSubpass = 0, + .dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, + .memoryBarrier = { + // last place where the color can get modified, depth is implicitly earlier + .srcStageMask = 
PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, + // only write ops, reads can't be made available, also won't be using depth so don't care about it being visible to anyone else + .srcAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT, + // the ImGUI will sample the color, then next frame we overwrite both attachments + .dstStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT|PIPELINE_STAGE_FLAGS::EARLY_FRAGMENT_TESTS_BIT, + // but we only care about the availability-visibility chain between renderpass and imgui + .dstAccessMask = ACCESS_FLAGS::SAMPLED_READ_BIT + } + // leave view offsets and flags default + }, + IGPURenderpass::SCreationParams::DependenciesEnd + }; + params.dependencies = {}; + m_renderpass = m_device->createRenderpass(std::move(params)); + if (!m_renderpass) + return logFail("Failed to create Scene Renderpass!"); + } + m_renderer = CSimpleDebugRenderer::create(m_assetMgr.get(),m_renderpass.get(),0,m_scene.get()); - return projection; - }()); - - ImGuizmo::SetOrthographic(false); - ImGuizmo::BeginFrame(); - - ImGui::SetNextWindowPos(ImVec2(1024, 100), ImGuiCond_Appearing); - ImGui::SetNextWindowSize(ImVec2(256, 256), ImGuiCond_Appearing); - - // create a window and insert the inspector - ImGui::SetNextWindowPos(ImVec2(10, 10), ImGuiCond_Appearing); - ImGui::SetNextWindowSize(ImVec2(320, 340), ImGuiCond_Appearing); - ImGui::Begin("Editor"); - - if (ImGui::RadioButton("Full view", !transformParams.useWindow)) - transformParams.useWindow = false; - - ImGui::SameLine(); - - if (ImGui::RadioButton("Window", transformParams.useWindow)) - transformParams.useWindow = true; - - ImGui::Text("Camera"); - bool viewDirty = false; - - if (ImGui::RadioButton("LH", isLH)) - isLH = true; - - ImGui::SameLine(); - - if (ImGui::RadioButton("RH", !isLH)) - isLH = false; - - if (ImGui::RadioButton("Perspective", isPerspective)) - isPerspective = true; + // Create ImGUI + { + ext::imgui::UI::SCreationParameters params = {}; + params.resources.texturesInfo = 
{.setIx=0u,.bindingIx=TexturesImGUIBindingIndex}; + params.resources.samplersInfo = {.setIx=0u,.bindingIx=1u}; + params.utilities = m_utils; + params.transfer = getTransferUpQueue(); + params.pipelineLayout = ext::imgui::UI::createDefaultPipelineLayout(m_utils->getLogicalDevice(),params.resources.texturesInfo,params.resources.samplersInfo,MaxImGUITextures); + params.assetManager = make_smart_refctd_ptr(smart_refctd_ptr(m_system)); + params.renderpass = m_renderpass; + params.subpassIx = 0u; + params.pipelineCache = nullptr; + interface.imGUI = ext::imgui::UI::create(std::move(params)); + if (!interface.imGUI) + return logFail("Failed to create `nbl::ext::imgui::UI` class"); + } - ImGui::SameLine(); + // create rest of User Interface + { + auto* imgui = interface.imGUI.get(); + // create the suballocated descriptor set + { + // note that we use default layout provided by our extension, but you are free to create your own by filling ext::imgui::UI::S_CREATION_PARAMETERS::resources + const auto* layout = imgui->getPipeline()->getLayout()->getDescriptorSetLayout(0u); + auto pool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT,{&layout,1}); + auto ds = pool->createDescriptorSet(smart_refctd_ptr(layout)); + interface.subAllocDS = make_smart_refctd_ptr(std::move(ds)); + if (!interface.subAllocDS) + return logFail("Failed to create the descriptor set"); + // make sure Texture Atlas slot is taken for eternity + { + auto dummy = SubAllocatedDescriptorSet::invalid_value; + interface.subAllocDS->multi_allocate(0,1,&dummy); + assert(dummy==ext::imgui::UI::FontAtlasTexId); + } + // write constant descriptors, note we don't create info & write pair for the samplers because UI extension's are immutable and baked into DS layout + IGPUDescriptorSet::SDescriptorInfo info = {}; + info.desc = smart_refctd_ptr(interface.imGUI->getFontAtlasView()); + info.info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL; + const 
IGPUDescriptorSet::SWriteDescriptorSet write = { + .dstSet = interface.subAllocDS->getDescriptorSet(), + .binding = TexturesImGUIBindingIndex, + .arrayElement = ext::imgui::UI::FontAtlasTexId, + .count = 1, + .info = &info + }; + if (!m_device->updateDescriptorSets({&write,1},{})) + return logFail("Failed to write the descriptor set"); + } + imgui->registerListener([this](){interface();}); + } - if (ImGui::RadioButton("Orthographic", !isPerspective)) - isPerspective = false; + interface.camera.mapKeysToArrows(); - ImGui::Checkbox("Enable \"view manipulate\"", &transformParams.enableViewManipulate); - ImGui::Checkbox("Enable camera movement", &move); - ImGui::SliderFloat("Move speed", &moveSpeed, 0.1f, 10.f); - ImGui::SliderFloat("Rotate speed", &rotateSpeed, 0.1f, 10.f); + onAppInitializedFinish(); + return true; + } - // ImGui::Checkbox("Flip Gizmo's Y axis", &flipGizmoY); // let's not expose it to be changed in UI but keep the logic in case + inline void beginRenderpass(IGPUCommandBuffer* cb, const IGPUCommandBuffer::SRenderpassBeginInfo& info) + { + cb->beginRenderPass(info,IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE); + cb->setScissor(0,1,&info.renderArea); + const SViewport viewport = { + .x = 0, + .y = 0, + .width = static_cast(info.renderArea.extent.width), + .height = static_cast(info.renderArea.extent.height) + }; + cb->setViewport(0u,1u,&viewport); + } - if (isPerspective) - ImGui::SliderFloat("Fov", &fov, 20.f, 150.f); - else - ImGui::SliderFloat("Ortho width", &viewWidth, 1, 20); + inline IQueue::SSubmitInfo::SSemaphoreInfo renderFrame(const std::chrono::microseconds nextPresentationTimestamp) override + { + // CPU events + update(nextPresentationTimestamp); - ImGui::SliderFloat("zNear", &zNear, 0.1f, 100.f); - ImGui::SliderFloat("zFar", &zFar, 110.f, 10000.f); + const auto& virtualWindowRes = interface.sceneResolution; + if (!m_framebuffer || m_framebuffer->getCreationParameters().width!=virtualWindowRes[0] || 
m_framebuffer->getCreationParameters().height!=virtualWindowRes[1]) + recreateFramebuffer(virtualWindowRes); - viewDirty |= ImGui::SliderFloat("Distance", &transformParams.camDistance, 1.f, 69.f); + // + const auto resourceIx = m_realFrameIx % MaxFramesInFlight; - if (viewDirty || firstFrame) + auto* const cb = m_cmdBufs.data()[resourceIx].get(); + cb->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); + cb->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + // clear to black for both things + const IGPUCommandBuffer::SClearColorValue clearValue = { .float32 = {0.f,0.f,0.f,1.f} }; + { + cb->beginDebugMarker("UISampleApp Scene Frame"); + { + const IGPUCommandBuffer::SRenderpassBeginInfo renderpassInfo = { - core::vectorSIMDf cameraPosition(cosf(camYAngle)* cosf(camXAngle)* transformParams.camDistance, sinf(camXAngle)* transformParams.camDistance, sinf(camYAngle)* cosf(camXAngle)* transformParams.camDistance); - core::vectorSIMDf cameraTarget(0.f, 0.f, 0.f); - const static core::vectorSIMDf up(0.f, 1.f, 0.f); - - camera.setPosition(cameraPosition); - camera.setTarget(cameraTarget); - camera.setBackupUpVector(up); - - camera.recomputeViewMatrix(); - - firstFrame = false; + .framebuffer = m_framebuffer.get(), + .colorClearValues = &clearValue, + .depthStencilClearValues = nullptr, + .renderArea = { + .offset = {0,0}, + .extent = {virtualWindowRes[0],virtualWindowRes[1]} + } + }; + beginRenderpass(cb,renderpassInfo); + } + // draw scene + { + float32_t3x4 viewMatrix; + float32_t4x4 viewProjMatrix; + // TODO: get rid of legacy matrices + { + const auto& camera = interface.camera; + memcpy(&viewMatrix,camera.getViewMatrix().pointer(),sizeof(viewMatrix)); + memcpy(&viewProjMatrix,camera.getConcatenatedMatrix().pointer(),sizeof(viewProjMatrix)); } + const auto viewParams = CSimpleDebugRenderer::SViewParams(viewMatrix,viewProjMatrix); - ImGui::Text("X: %f Y: %f", io.MousePos.x, io.MousePos.y); - if (ImGuizmo::IsUsing()) + // tear down scene every frame + 
m_renderer->m_instances[0].packedGeo = m_renderer->getInitParams().geoms.data()+interface.gcIndex; + m_renderer->render(cb,viewParams); + } + cb->endRenderPass(); + cb->endDebugMarker(); + } + { + cb->beginDebugMarker("UISampleApp IMGUI Frame"); + { + auto scRes = static_cast(m_surface->getSwapchainResources()); + const IGPUCommandBuffer::SRenderpassBeginInfo renderpassInfo = { - ImGui::Text("Using gizmo"); - } - else + .framebuffer = scRes->getFramebuffer(device_base_t::getCurrentAcquire().imageIndex), + .colorClearValues = &clearValue, + .depthStencilClearValues = nullptr, + .renderArea = { + .offset = {0,0}, + .extent = {m_window->getWidth(),m_window->getHeight()} + } + }; + beginRenderpass(cb,renderpassInfo); + } + // draw ImGUI + { + auto* imgui = interface.imGUI.get(); + auto* pipeline = imgui->getPipeline(); + cb->bindGraphicsPipeline(pipeline); + // note that we use default UI pipeline layout where uiParams.resources.textures.setIx == uiParams.resources.samplers.setIx + const auto* ds = interface.subAllocDS->getDescriptorSet(); + cb->bindDescriptorSets(EPBP_GRAPHICS,pipeline->getLayout(),imgui->getCreationParameters().resources.texturesInfo.setIx,1u,&ds); + // a timepoint in the future to release streaming resources for geometry + const ISemaphore::SWaitInfo drawFinished = {.semaphore=m_semaphore.get(),.value=m_realFrameIx+1u}; + if (!imgui->render(cb,drawFinished)) { - ImGui::Text(ImGuizmo::IsOver() ? "Over gizmo" : ""); - ImGui::SameLine(); - ImGui::Text(ImGuizmo::IsOver(ImGuizmo::TRANSLATE) ? "Over translate gizmo" : ""); - ImGui::SameLine(); - ImGui::Text(ImGuizmo::IsOver(ImGuizmo::ROTATE) ? "Over rotate gizmo" : ""); - ImGui::SameLine(); - ImGui::Text(ImGuizmo::IsOver(ImGuizmo::SCALE) ? 
"Over scale gizmo" : ""); + m_logger->log("TODO: need to present acquired image before bailing because its already acquired.",ILogger::ELL_ERROR); + return {}; } - ImGui::Separator(); - - /* - * ImGuizmo expects view & perspective matrix to be column major both with 4x4 layout - * and Nabla uses row major matricies - 3x4 matrix for view & 4x4 for projection - - - VIEW: - - ImGuizmo - - | X[0] Y[0] Z[0] 0.0f | - | X[1] Y[1] Z[1] 0.0f | - | X[2] Y[2] Z[2] 0.0f | - | -Dot(X, eye) -Dot(Y, eye) -Dot(Z, eye) 1.0f | - - Nabla - - | X[0] X[1] X[2] -Dot(X, eye) | - | Y[0] Y[1] Y[2] -Dot(Y, eye) | - | Z[0] Z[1] Z[2] -Dot(Z, eye) | + } + cb->endRenderPass(); + cb->endDebugMarker(); + } + cb->end(); - = transpose(nbl::core::matrix4SIMD()) + //updateGUIDescriptorSet(); - - PERSPECTIVE [PROJECTION CASE]: + IQueue::SSubmitInfo::SSemaphoreInfo retval = + { + .semaphore = m_semaphore.get(), + .value = ++m_realFrameIx, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_GRAPHICS_BITS + }; + const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] = + { + {.cmdbuf = cb } + }; + const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { + { + .semaphore = device_base_t::getCurrentAcquire().semaphore, + .value = device_base_t::getCurrentAcquire().acquireCount, + .stageMask = PIPELINE_STAGE_FLAGS::NONE + } + }; + const IQueue::SSubmitInfo infos[] = + { + { + .waitSemaphores = acquired, + .commandBuffers = commandBuffers, + .signalSemaphores = {&retval,1} + } + }; + + if (getGraphicsQueue()->submit(infos) != IQueue::RESULT::SUCCESS) + { + retval.semaphore = nullptr; // so that we don't wait on semaphore that will never signal + m_realFrameIx--; + } - ImGuizmo - | (temp / temp2) (0.0) (0.0) (0.0) | - | (0.0) (temp / temp3) (0.0) (0.0) | - | ((right + left) / temp2) ((top + bottom) / temp3) ((-zfar - znear) / temp4) (-1.0f) | - | (0.0) (0.0) ((-temp * zfar) / temp4) (0.0) | + m_window->setCaption("[Nabla Engine] UI App Test Demo"); + return retval; + } - Nabla + protected: + const 
video::IGPURenderpass::SCreationParams::SSubpassDependency* getDefaultSubpassDependencies() const override + { + // Subsequent submits don't wait for each other, but they wait for acquire and get waited on by present + const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] = { + // don't want any writes to be available, we'll clear, only thing to worry about is the layout transition + { + .srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, + .dstSubpass = 0, + .memoryBarrier = { + .srcStageMask = PIPELINE_STAGE_FLAGS::NONE, // should sync against the semaphore wait anyway + .srcAccessMask = ACCESS_FLAGS::NONE, + // layout transition needs to finish before the color write + .dstStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, + .dstAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT + } + // leave view offsets and flags default + }, + // want layout transition to begin after all color output is done + { + .srcSubpass = 0, + .dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, + .memoryBarrier = { + // last place where the color can get modified, depth is implicitly earlier + .srcStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, + // only write ops, reads can't be made available + .srcAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT + // spec says nothing is needed when presentation is the destination + } + // leave view offsets and flags default + }, + IGPURenderpass::SCreationParams::DependenciesEnd + }; + return dependencies; + } - | w (0.0) (0.0) (0.0) | - | (0.0) -h (0.0) (0.0) | - | (0.0) (0.0) (-zFar/(zFar-zNear)) (-zNear*zFar/(zFar-zNear)) | - | (0.0) (0.0) (-1.0) (0.0) | + private: + inline void update(const std::chrono::microseconds nextPresentationTimestamp) + { + auto& camera = interface.camera; + camera.setMoveSpeed(interface.moveSpeed); + camera.setRotateSpeed(interface.rotateSpeed); - = transpose() - * - * the ViewManipulate final call (inside 
EditTransform) returns world space column major matrix for an object, - * note it also modifies input view matrix but projection matrix is immutable - */ + m_inputSystem->getDefaultMouse(&mouse); + m_inputSystem->getDefaultKeyboard(&keyboard); - static struct - { - core::matrix4SIMD view, projection, model; - } imguizmoM16InOut; + struct + { + std::vector mouse{}; + std::vector keyboard{}; + } uiEvents; - ImGuizmo::SetID(0u); + // TODO: should be a member really + static std::chrono::microseconds previousEventTimestamp{}; - imguizmoM16InOut.view = core::transpose(matrix4SIMD(camera.getViewMatrix())); - imguizmoM16InOut.projection = core::transpose(camera.getProjectionMatrix()); - imguizmoM16InOut.model = core::transpose(core::matrix4SIMD(pass.scene->object.model)); + // I think begin/end should always be called on camera, just events shouldn't be fed, why? + // If you stop begin/end, whatever keys were up/down get their up/down values frozen leading to + // `perActionDt` becoming obnoxiously large the first time the event processing resumes due to + // `timeDiff` being computed since `lastVirtualUpTimeStamp` + camera.beginInputProcessing(nextPresentationTimestamp); + { + mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void { - if (flipGizmoY) // note we allow to flip gizmo just to match our coordinates - imguizmoM16InOut.projection[1][1] *= -1.f; // https://johannesugb.github.io/gpu-programming/why-do-opengl-proj-matrices-fail-in-vulkan/ + if (interface.move) + camera.mouseProcess(events); // don't capture the events, only let camera handle them with its impl - transformParams.editTransformDecomposition = true; - EditTransform(imguizmoM16InOut.view.pointer(), imguizmoM16InOut.projection.pointer(), imguizmoM16InOut.model.pointer(), transformParams); - } + for (const auto& e : events) // here capture + { + if (e.timeStamp < previousEventTimestamp) + continue; - // to Nabla + update camera & model matrices - const auto& view =
camera.getViewMatrix(); - const auto& projection = camera.getProjectionMatrix(); + previousEventTimestamp = e.timeStamp; + uiEvents.mouse.emplace_back(e); - // TODO: make it more nicely - const_cast(view) = core::transpose(imguizmoM16InOut.view).extractSub3x4(); // a hack, correct way would be to use inverse matrix and get position + target because now it will bring you back to last position & target when switching from gizmo move to manual move (but from manual to gizmo is ok) - camera.setProjectionMatrix(projection); // update concatanated matrix + if (e.type==nbl::ui::SMouseEvent::EET_SCROLL && m_renderer) + { + interface.gcIndex += int16_t(core::sign(e.scrollEvent.verticalScroll)); + interface.gcIndex = core::clamp(interface.gcIndex,0ull,m_renderer->getInitParams().geoms.size()-1); + } + } + }, + m_logger.get() + ); + keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void { - static nbl::core::matrix3x4SIMD modelView, normal; - static nbl::core::matrix4SIMD modelViewProjection; + if (interface.move) + camera.keyboardProcess(events); // don't capture the events, only let camera handle them with its impl - auto& hook = pass.scene->object; - hook.model = core::transpose(imguizmoM16InOut.model).extractSub3x4(); + for (const auto& e : events) // here capture { - const auto& references = pass.scene->getResources().objects; - const auto type = static_cast(gcIndex); + if (e.timeStamp < previousEventTimestamp) + continue; - const auto& [gpu, meta] = references[type]; - hook.meta.type = type; - hook.meta.name = meta.name; + previousEventTimestamp = e.timeStamp; + uiEvents.keyboard.emplace_back(e); } + }, + m_logger.get() + ); + } + camera.endInputProcessing(nextPresentationTimestamp); - auto& ubo = hook.viewParameters; + const auto cursorPosition = m_window->getCursorControl()->getPosition(); - modelView = nbl::core::concatenateBFollowedByA(view, hook.model); - modelView.getSub3x3InverseTranspose(normal); - modelViewProjection = 
nbl::core::concatenateBFollowedByA(camera.getConcatenatedMatrix(), hook.model); + ext::imgui::UI::SUpdateParameters params = + { + .mousePosition = float32_t2(cursorPosition.x,cursorPosition.y) - float32_t2(m_window->getX(),m_window->getY()), + .displaySize = {m_window->getWidth(),m_window->getHeight()}, + .mouseEvents = uiEvents.mouse, + .keyboardEvents = uiEvents.keyboard + }; - memcpy(ubo.MVP, modelViewProjection.pointer(), sizeof(ubo.MVP)); - memcpy(ubo.MV, modelView.pointer(), sizeof(ubo.MV)); - memcpy(ubo.NormalMat, normal.pointer(), sizeof(ubo.NormalMat)); + interface.objectName = m_scene->getGeometries()[interface.gcIndex].name; + interface.imGUI->update(params); + } - // object meta display - { - ImGui::Begin("Object"); - ImGui::Text("type: \"%s\"", hook.meta.name.data()); - ImGui::End(); - } + void recreateFramebuffer(const uint16_t2 resolution) + { + auto createImageAndView = [&](E_FORMAT format)->smart_refctd_ptr + { + auto image = m_device->createImage({{ + .type = IGPUImage::ET_2D, + .samples = IGPUImage::ESCF_1_BIT, + .format = format, + .extent = {resolution.x,resolution.y,1}, + .mipLevels = 1, + .arrayLayers = 1, + .usage = IGPUImage::EUF_RENDER_ATTACHMENT_BIT|IGPUImage::EUF_SAMPLED_BIT + }}); + if (!m_device->allocate(image->getMemoryReqs(),image.get()).isValid()) + return nullptr; + return m_device->createImageView({ + .image = std::move(image), + .viewType = IGPUImageView::ET_2D, + .format = format, + .subresourceRange = { + .aspectMask = isDepthOrStencilFormat(format) ? 
IGPUImage::EAF_DEPTH_BIT:IGPUImage::EAF_COLOR_BIT, } - - // view matrices editor - { - ImGui::Begin("Matrices"); - - auto addMatrixTable = [&](const char* topText, const char* tableName, const int rows, const int columns, const float* pointer, const bool withSeparator = true) - { - ImGui::Text(topText); - if (ImGui::BeginTable(tableName, columns)) - { - for (int y = 0; y < rows; ++y) - { - ImGui::TableNextRow(); - for (int x = 0; x < columns; ++x) - { - ImGui::TableSetColumnIndex(x); - ImGui::Text("%.3f", *(pointer + (y * columns) + x)); - } - } - ImGui::EndTable(); - } + }); + }; - if (withSeparator) - ImGui::Separator(); - }; + m_renderColorView = createImageAndView(finalSceneRenderFormat); + auto depthView = createImageAndView(sceneRenderDepthFormat); + m_framebuffer = m_device->createFramebuffer({ { + .renderpass = m_renderpass, + .depthStencilAttachments = &depthView.get(), + .colorAttachments = &m_renderColorView.get(), + .width = resolution.x, + .height = resolution.y + }}); + + // release previous slot and its image + interface.subAllocDS->multi_deallocate(0,1,&interface.renderColorViewDescIndex,{.semaphore=m_semaphore.get(),.value=m_realFrameIx}); + // + interface.subAllocDS->multi_allocate(0,1,&interface.renderColorViewDescIndex); + // update descriptor set + IGPUDescriptorSet::SDescriptorInfo info = {}; + info.desc = m_renderColorView; + info.info.image.imageLayout = IGPUImage::LAYOUT::READ_ONLY_OPTIMAL; + const IGPUDescriptorSet::SWriteDescriptorSet write = { + .dstSet = interface.subAllocDS->getDescriptorSet(), + .binding = TexturesImGUIBindingIndex, + .arrayElement = interface.renderColorViewDescIndex, + .count = 1, + .info = &info + }; + m_device->updateDescriptorSets({&write,1},{}); + } - addMatrixTable("Model Matrix", "ModelMatrixTable", 3, 4, pass.scene->object.model.pointer()); - addMatrixTable("Camera View Matrix", "ViewMatrixTable", 3, 4, view.pointer()); - addMatrixTable("Camera View Projection Matrix", "ViewProjectionMatrixTable", 4, 4, 
projection.pointer(), false); + // Maximum frames which can be simultaneously submitted, used to cycle through our per-frame resources like command buffers + constexpr static inline uint32_t MaxFramesInFlight = 3u; + constexpr static inline auto sceneRenderDepthFormat = EF_D32_SFLOAT; + constexpr static inline auto finalSceneRenderFormat = EF_R8G8B8A8_SRGB; + constexpr static inline auto TexturesImGUIBindingIndex = 0u; + // we create the Descriptor Set with a few slots extra to spare, so we don't have to `waitIdle` the device whenever ImGUI virtual window resizes + constexpr static inline auto MaxImGUITextures = 2u+MaxFramesInFlight; + + // + smart_refctd_ptr m_scene; + smart_refctd_ptr m_renderpass; + smart_refctd_ptr m_renderer; + smart_refctd_ptr m_renderColorView; + smart_refctd_ptr m_framebuffer; + // + smart_refctd_ptr m_semaphore; + uint64_t m_realFrameIx = 0; + std::array,MaxFramesInFlight> m_cmdBufs; + // + InputSystem::ChannelReader mouse; + InputSystem::ChannelReader keyboard; + // UI stuff + struct CInterface + { + void operator()() + { + ImGuiIO& io = ImGui::GetIO(); - ImGui::End(); - } + // TODO: why is this a lambda and not just an assignment in a scope ? + camera.setProjectionMatrix([&]() + { + matrix4SIMD projection; - // Nabla Imgui backend MDI buffer info - // To be 100% accurate and not overly conservative we'd have to explicitly `cull_frees` and defragment each time, - // so unless you do that, don't use this basic info to optimize the size of your IMGUI buffer. 
+ if (isPerspective) + if(isLH) + projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovLH(core::radians(fov), io.DisplaySize.x / io.DisplaySize.y, zNear, zFar); + else + projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(core::radians(fov), io.DisplaySize.x / io.DisplaySize.y, zNear, zFar); + else { - auto* streaminingBuffer = pass.ui.manager->getStreamingBuffer(); - - const size_t total = streaminingBuffer->get_total_size(); // total memory range size for which allocation can be requested - const size_t freeSize = streaminingBuffer->getAddressAllocator().get_free_size(); // max total free bloock memory size we can still allocate from total memory available - const size_t consumedMemory = total - freeSize; // memory currently consumed by streaming buffer - - float freePercentage = 100.0f * (float)(freeSize) / (float)total; - float allocatedPercentage = (float)(consumedMemory) / (float)total; + float viewHeight = viewWidth * io.DisplaySize.y / io.DisplaySize.x; - ImVec2 barSize = ImVec2(400, 30); - float windowPadding = 10.0f; - float verticalPadding = ImGui::GetStyle().FramePadding.y; + if(isLH) + projection = matrix4SIMD::buildProjectionMatrixOrthoLH(viewWidth, viewHeight, zNear, zFar); + else + projection = matrix4SIMD::buildProjectionMatrixOrthoRH(viewWidth, viewHeight, zNear, zFar); + } - ImGui::SetNextWindowSize(ImVec2(barSize.x + 2 * windowPadding, 110 + verticalPadding), ImGuiCond_Always); - ImGui::Begin("Nabla Imgui MDI Buffer Info", nullptr, ImGuiWindowFlags_NoResize | ImGuiWindowFlags_NoScrollbar); + return projection; + }()); - ImGui::Text("Total Allocated Size: %zu bytes", total); - ImGui::Text("In use: %zu bytes", consumedMemory); - ImGui::Text("Buffer Usage:"); + ImGuizmo::SetOrthographic(false); + ImGuizmo::BeginFrame(); - ImGui::SetCursorPosX(windowPadding); + ImGui::SetNextWindowPos(ImVec2(1024, 100), ImGuiCond_Appearing); + ImGui::SetNextWindowSize(ImVec2(256, 256), ImGuiCond_Appearing); - if (freePercentage > 70.0f) - 
ImGui::PushStyleColor(ImGuiCol_PlotHistogram, ImVec4(0.0f, 1.0f, 0.0f, 0.4f)); // Green - else if (freePercentage > 30.0f) - ImGui::PushStyleColor(ImGuiCol_PlotHistogram, ImVec4(1.0f, 1.0f, 0.0f, 0.4f)); // Yellow - else - ImGui::PushStyleColor(ImGuiCol_PlotHistogram, ImVec4(1.0f, 0.0f, 0.0f, 0.4f)); // Red + // create a window and insert the inspector + ImGui::SetNextWindowPos(ImVec2(10, 10), ImGuiCond_Appearing); + ImGui::SetNextWindowSize(ImVec2(320, 340), ImGuiCond_Appearing); + ImGui::Begin("Editor"); - ImGui::ProgressBar(allocatedPercentage, barSize, ""); + if (ImGui::RadioButton("Full view", !transformParams.useWindow)) + transformParams.useWindow = false; - ImGui::PopStyleColor(); + ImGui::SameLine(); - ImDrawList* drawList = ImGui::GetWindowDrawList(); + if (ImGui::RadioButton("Window", transformParams.useWindow)) + transformParams.useWindow = true; - ImVec2 progressBarPos = ImGui::GetItemRectMin(); - ImVec2 progressBarSize = ImGui::GetItemRectSize(); + ImGui::Text("Camera"); + bool viewDirty = false; - const char* text = "%.2f%% free"; - char textBuffer[64]; - snprintf(textBuffer, sizeof(textBuffer), text, freePercentage); + if (ImGui::RadioButton("LH", isLH)) + isLH = true; - ImVec2 textSize = ImGui::CalcTextSize(textBuffer); - ImVec2 textPos = ImVec2 - ( - progressBarPos.x + (progressBarSize.x - textSize.x) * 0.5f, - progressBarPos.y + (progressBarSize.y - textSize.y) * 0.5f - ); + ImGui::SameLine(); - ImVec4 bgColor = ImGui::GetStyleColorVec4(ImGuiCol_WindowBg); - drawList->AddRectFilled - ( - ImVec2(textPos.x - 5, textPos.y - 2), - ImVec2(textPos.x + textSize.x + 5, textPos.y + textSize.y + 2), - ImGui::GetColorU32(bgColor) - ); + if (ImGui::RadioButton("RH", !isLH)) + isLH = false; - ImGui::SetCursorScreenPos(textPos); - ImGui::Text("%s", textBuffer); + if (ImGui::RadioButton("Perspective", isPerspective)) + isPerspective = true; - ImGui::Dummy(ImVec2(0.0f, verticalPadding)); + ImGui::SameLine(); - ImGui::End(); - } + if 
(ImGui::RadioButton("Orthographic", !isPerspective)) + isPerspective = false; - ImGui::End(); - } - ); + ImGui::Checkbox("Enable \"view manipulate\"", &transformParams.enableViewManipulate); + ImGui::Checkbox("Enable camera movement", &move); + ImGui::SliderFloat("Move speed", &moveSpeed, 0.1f, 10.f); + ImGui::SliderFloat("Rotate speed", &rotateSpeed, 0.1f, 10.f); - m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H); - m_surface->recreateSwapchain(); - m_winMgr->show(m_window.get()); - oracle.reportBeginFrameRecord(); - camera.mapKeysToArrows(); + // ImGui::Checkbox("Flip Gizmo's Y axis", &flipGizmoY); // let's not expose it to be changed in UI but keep the logic in case - return true; - } + if (isPerspective) + ImGui::SliderFloat("Fov", &fov, 20.f, 150.f); + else + ImGui::SliderFloat("Ortho width", &viewWidth, 1, 20); - bool updateGUIDescriptorSet() - { - // texture atlas + our scene texture, note we don't create info & write pair for the font sampler because UI extension's is immutable and baked into DS layout - static std::array descriptorInfo; - static IGPUDescriptorSet::SWriteDescriptorSet writes[TexturesAmount]; + ImGui::SliderFloat("zNear", &zNear, 0.1f, 100.f); + ImGui::SliderFloat("zFar", &zFar, 110.f, 10000.f); - descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL; - descriptorInfo[nbl::ext::imgui::UI::FontAtlasTexId].desc = core::smart_refctd_ptr(pass.ui.manager->getFontAtlasView()); + viewDirty |= ImGui::SliderFloat("Distance", &transformParams.camDistance, 1.f, 69.f); - descriptorInfo[OfflineSceneTextureIx].info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL; - descriptorInfo[OfflineSceneTextureIx].desc = pass.scene->getResources().attachments.color; + if (viewDirty || firstFrame) + { + core::vectorSIMDf cameraPosition(cosf(camYAngle)* cosf(camXAngle)* transformParams.camDistance, sinf(camXAngle)* transformParams.camDistance, sinf(camYAngle)* cosf(camXAngle)* 
transformParams.camDistance); + core::vectorSIMDf cameraTarget(0.f, 0.f, 0.f); + const static core::vectorSIMDf up(0.f, 1.f, 0.f); - for (uint32_t i = 0; i < descriptorInfo.size(); ++i) - { - writes[i].dstSet = pass.ui.descriptorSet.get(); - writes[i].binding = 0u; - writes[i].arrayElement = i; - writes[i].count = 1u; - } - writes[nbl::ext::imgui::UI::FontAtlasTexId].info = descriptorInfo.data() + nbl::ext::imgui::UI::FontAtlasTexId; - writes[OfflineSceneTextureIx].info = descriptorInfo.data() + OfflineSceneTextureIx; + camera.setPosition(cameraPosition); + camera.setTarget(cameraTarget); + camera.setBackupUpVector(up); - return m_device->updateDescriptorSets(writes, {}); - } + camera.recomputeViewMatrix(); + } + firstFrame = false; - inline void workLoopBody() override - { - // framesInFlight: ensuring safe execution of command buffers and acquires, `framesInFlight` only affect semaphore waits, don't use this to index your resources because it can change with swapchain recreation. - const uint32_t framesInFlight = core::min(MaxFramesInFlight, m_surface->getMaxAcquiresInFlight()); - // We block for semaphores for 2 reasons here: - // A) Resource: Can't use resource like a command buffer BEFORE previous use is finished! [MaxFramesInFlight] - // B) Acquire: Can't have more acquires in flight than a certain threshold returned by swapchain or your surface helper class. [MaxAcquiresInFlight] - if (m_realFrameIx >= framesInFlight) - { - const ISemaphore::SWaitInfo cbDonePending[] = + ImGui::Text("X: %f Y: %f", io.MousePos.x, io.MousePos.y); + if (ImGuizmo::IsUsing()) { - { - .semaphore = m_semaphore.get(), - .value = m_realFrameIx + 1 - framesInFlight - } - }; - if (m_device->blockForSemaphores(cbDonePending) != ISemaphore::WAIT_RESULT::SUCCESS) - return; - } + ImGui::Text("Using gizmo"); + } + else + { + ImGui::Text(ImGuizmo::IsOver() ? "Over gizmo" : ""); + ImGui::SameLine(); + ImGui::Text(ImGuizmo::IsOver(ImGuizmo::TRANSLATE) ? 
"Over translate gizmo" : ""); + ImGui::SameLine(); + ImGui::Text(ImGuizmo::IsOver(ImGuizmo::ROTATE) ? "Over rotate gizmo" : ""); + ImGui::SameLine(); + ImGui::Text(ImGuizmo::IsOver(ImGuizmo::SCALE) ? "Over scale gizmo" : ""); + } + ImGui::Separator(); - const auto resourceIx = m_realFrameIx % MaxFramesInFlight; + /* + * ImGuizmo expects view & perspective matrix to be column major both with 4x4 layout + * and Nabla uses row major matricies - 3x4 matrix for view & 4x4 for projection - // CPU events - update(); + - VIEW: - // render whole scene to offline frame buffer & submit - pass.scene->begin(); - { - pass.scene->update(); - pass.scene->record(); - pass.scene->end(); - } - pass.scene->submit(); + ImGuizmo - auto* const cb = m_cmdBufs.data()[resourceIx].get(); - cb->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); - cb->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - cb->beginDebugMarker("UISampleApp IMGUI Frame"); + | X[0] Y[0] Z[0] 0.0f | + | X[1] Y[1] Z[1] 0.0f | + | X[2] Y[2] Z[2] 0.0f | + | -Dot(X, eye) -Dot(Y, eye) -Dot(Z, eye) 1.0f | - auto* queue = getGraphicsQueue(); + Nabla - asset::SViewport viewport; - { - viewport.minDepth = 1.f; - viewport.maxDepth = 0.f; - viewport.x = 0u; - viewport.y = 0u; - viewport.width = WIN_W; - viewport.height = WIN_H; - } - cb->setViewport(0u, 1u, &viewport); + | X[0] X[1] X[2] -Dot(X, eye) | + | Y[0] Y[1] Y[2] -Dot(Y, eye) | + | Z[0] Z[1] Z[2] -Dot(Z, eye) | - const VkRect2D currentRenderArea = - { - .offset = {0,0}, - .extent = {m_window->getWidth(),m_window->getHeight()} - }; + = transpose(nbl::core::matrix4SIMD()) - IQueue::SSubmitInfo::SCommandBufferInfo commandBuffersInfo[] = {{.cmdbuf = cb }}; + - PERSPECTIVE [PROJECTION CASE]: - // UI render pass - { - auto scRes = static_cast(m_surface->getSwapchainResources()); - const IGPUCommandBuffer::SRenderpassBeginInfo renderpassInfo = - { - .framebuffer = scRes->getFramebuffer(m_currentImageAcquire.imageIndex), - .colorClearValues = &clear.color, - 
.depthStencilClearValues = nullptr, - .renderArea = currentRenderArea - }; - nbl::video::ISemaphore::SWaitInfo waitInfo = { .semaphore = m_semaphore.get(), .value = m_realFrameIx + 1u }; + ImGuizmo - cb->beginRenderPass(renderpassInfo, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE); - const auto uiParams = pass.ui.manager->getCreationParameters(); - auto* pipeline = pass.ui.manager->getPipeline(); - cb->bindGraphicsPipeline(pipeline); - cb->bindDescriptorSets(EPBP_GRAPHICS, pipeline->getLayout(), uiParams.resources.texturesInfo.setIx, 1u, &pass.ui.descriptorSet.get()); // note that we use default UI pipeline layout where uiParams.resources.textures.setIx == uiParams.resources.samplers.setIx - - if (!keepRunning()) - return; - - if (!pass.ui.manager->render(cb,waitInfo)) - { - // TODO: need to present acquired image before bailing because its already acquired - return; - } - cb->endRenderPass(); - } - cb->end(); - { - const IQueue::SSubmitInfo::SSemaphoreInfo rendered[] = - { - { - .semaphore = m_semaphore.get(), - .value = ++m_realFrameIx, - .stageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT - } - }; + | (temp / temp2) (0.0) (0.0) (0.0) | + | (0.0) (temp / temp3) (0.0) (0.0) | + | ((right + left) / temp2) ((top + bottom) / temp3) ((-zfar - znear) / temp4) (-1.0f) | + | (0.0) (0.0) ((-temp * zfar) / temp4) (0.0) | - { - { - const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = - { - { - .semaphore = m_currentImageAcquire.semaphore, - .value = m_currentImageAcquire.acquireCount, - .stageMask = PIPELINE_STAGE_FLAGS::NONE - } - }; - - const IQueue::SSubmitInfo infos[] = - { - { - .waitSemaphores = acquired, - .commandBuffers = commandBuffersInfo, - .signalSemaphores = rendered - } - }; - - const nbl::video::ISemaphore::SWaitInfo waitInfos[] = - { { - .semaphore = pass.scene->semaphore.progress.get(), - .value = pass.scene->semaphore.finishedValue - } }; - - m_device->blockForSemaphores(waitInfos); - - updateGUIDescriptorSet(); - - if (queue->submit(infos) != 
IQueue::RESULT::SUCCESS) - m_realFrameIx--; - } - } + Nabla - m_window->setCaption("[Nabla Engine] UI App Test Demo"); - m_surface->present(m_currentImageAcquire.imageIndex, rendered); - } - } + | w (0.0) (0.0) (0.0) | + | (0.0) -h (0.0) (0.0) | + | (0.0) (0.0) (-zFar/(zFar-zNear)) (-zNear*zFar/(zFar-zNear)) | + | (0.0) (0.0) (-1.0) (0.0) | - inline bool keepRunning() override - { - if (m_surface->irrecoverable()) - return false; + = transpose() - return true; - } - - inline bool onAppTerminated() override - { - return device_base_t::onAppTerminated(); - } - - inline void update() - { - camera.setMoveSpeed(moveSpeed); - camera.setRotateSpeed(rotateSpeed); - - static std::chrono::microseconds previousEventTimestamp{}; - - m_inputSystem->getDefaultMouse(&mouse); - m_inputSystem->getDefaultKeyboard(&keyboard); - - auto updatePresentationTimestamp = [&]() - { - m_currentImageAcquire = m_surface->acquireNextImage(); + * + * the ViewManipulate final call (inside EditTransform) returns world space column major matrix for an object, + * note it also modifies input view matrix but projection matrix is immutable + */ - oracle.reportEndFrameRecord(); - const auto timestamp = oracle.getNextPresentationTimeStamp(); - oracle.reportBeginFrameRecord(); +// TODO: do all computation using `hlsl::matrix` and its `hlsl::float32_tNxM` aliases + static struct + { + core::matrix4SIMD view, projection, model; + } imguizmoM16InOut; - return timestamp; - }; + ImGuizmo::SetID(0u); - const auto nextPresentationTimestamp = updatePresentationTimestamp(); + imguizmoM16InOut.view = core::transpose(matrix4SIMD(camera.getViewMatrix())); + imguizmoM16InOut.projection = core::transpose(camera.getProjectionMatrix()); + imguizmoM16InOut.model = core::transpose(matrix4SIMD(model)); + { + if (flipGizmoY) // note we allow to flip gizmo just to match our coordinates + imguizmoM16InOut.projection[1][1] *= -1.f; // https://johannesugb.github.io/gpu-programming/why-do-opengl-proj-matrices-fail-in-vulkan/ - 
struct - { - std::vector mouse{}; - std::vector keyboard{}; - } capturedEvents; + transformParams.editTransformDecomposition = true; + sceneResolution = EditTransform(imguizmoM16InOut.view.pointer(), imguizmoM16InOut.projection.pointer(), imguizmoM16InOut.model.pointer(), transformParams); + } - if (move) camera.beginInputProcessing(nextPresentationTimestamp); - { - mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void + // to Nabla + update camera & model matrices +// TODO: make it more nicely, extract: +// - Position by computing inverse of the view matrix and grabbing its translation +// - Target from 3rd row without W component of view matrix multiplied by some arbitrary distance value (can be the length of position from origin) and adding the position +// But then set the view matrix this way anyway, because up-vector may not be compatible + const auto& view = camera.getViewMatrix(); + const_cast(view) = core::transpose(imguizmoM16InOut.view).extractSub3x4(); // a hack, correct way would be to use inverse matrix and get position + target because now it will bring you back to last position & target when switching from gizmo move to manual move (but from manual to gizmo is ok) + // update concatanated matrix + const auto& projection = camera.getProjectionMatrix(); + camera.setProjectionMatrix(projection); + + // object meta display + { + ImGui::Begin("Object"); + ImGui::Text("type: \"%s\"", objectName.data()); + ImGui::End(); + } + + // view matrices editor { - if (move) - camera.mouseProcess(events); // don't capture the events, only let camera handle them with its impl + ImGui::Begin("Matrices"); - for (const auto& e : events) // here capture + auto addMatrixTable = [&](const char* topText, const char* tableName, const int rows, const int columns, const float* pointer, const bool withSeparator = true) { - if (e.timeStamp < previousEventTimestamp) - continue; + ImGui::Text(topText); + if (ImGui::BeginTable(tableName, columns)) + { + for 
(int y = 0; y < rows; ++y) + { + ImGui::TableNextRow(); + for (int x = 0; x < columns; ++x) + { + ImGui::TableSetColumnIndex(x); + ImGui::Text("%.3f", *(pointer + (y * columns) + x)); + } + } + ImGui::EndTable(); + } - previousEventTimestamp = e.timeStamp; - capturedEvents.mouse.emplace_back(e); + if (withSeparator) + ImGui::Separator(); + }; - if (e.type == nbl::ui::SMouseEvent::EET_SCROLL) - gcIndex = std::clamp(int16_t(gcIndex) + int16_t(core::sign(e.scrollEvent.verticalScroll)), int64_t(0), int64_t(OT_COUNT - (uint8_t)1u)); - } - }, m_logger.get()); + addMatrixTable("Model Matrix", "ModelMatrixTable", 3, 4, model.pointer()); + addMatrixTable("Camera View Matrix", "ViewMatrixTable", 3, 4, view.pointer()); + addMatrixTable("Camera View Projection Matrix", "ViewProjectionMatrixTable", 4, 4, projection.pointer(), false); + + ImGui::End(); + } - keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void + // Nabla Imgui backend MDI buffer info + // To be 100% accurate and not overly conservative we'd have to explicitly `cull_frees` and defragment each time, + // so unless you do that, don't use this basic info to optimize the size of your IMGUI buffer. 
{ - if (move) - camera.keyboardProcess(events); // don't capture the events, only let camera handle them with its impl + auto* streaminingBuffer = imGUI->getStreamingBuffer(); - for (const auto& e : events) // here capture - { - if (e.timeStamp < previousEventTimestamp) - continue; + const size_t total = streaminingBuffer->get_total_size(); // total memory range size for which allocation can be requested + const size_t freeSize = streaminingBuffer->getAddressAllocator().get_free_size(); // max total free bloock memory size we can still allocate from total memory available + const size_t consumedMemory = total - freeSize; // memory currently consumed by streaming buffer - previousEventTimestamp = e.timeStamp; - capturedEvents.keyboard.emplace_back(e); - } - }, m_logger.get()); - } - if (move) camera.endInputProcessing(nextPresentationTimestamp); + float freePercentage = 100.0f * (float)(freeSize) / (float)total; + float allocatedPercentage = (float)(consumedMemory) / (float)total; - const auto cursorPosition = m_window->getCursorControl()->getPosition(); + ImVec2 barSize = ImVec2(400, 30); + float windowPadding = 10.0f; + float verticalPadding = ImGui::GetStyle().FramePadding.y; - nbl::ext::imgui::UI::SUpdateParameters params = - { - .mousePosition = nbl::hlsl::float32_t2(cursorPosition.x, cursorPosition.y) - nbl::hlsl::float32_t2(m_window->getX(), m_window->getY()), - .displaySize = { m_window->getWidth(), m_window->getHeight() }, - .mouseEvents = { capturedEvents.mouse.data(), capturedEvents.mouse.size() }, - .keyboardEvents = { capturedEvents.keyboard.data(), capturedEvents.keyboard.size() } - }; + ImGui::SetNextWindowSize(ImVec2(barSize.x + 2 * windowPadding, 110 + verticalPadding), ImGuiCond_Always); + ImGui::Begin("Nabla Imgui MDI Buffer Info", nullptr, ImGuiWindowFlags_NoResize | ImGuiWindowFlags_NoScrollbar); - pass.ui.manager->update(params); - } + ImGui::Text("Total Allocated Size: %zu bytes", total); + ImGui::Text("In use: %zu bytes", consumedMemory); + 
ImGui::Text("Buffer Usage:"); - private: - // Maximum frames which can be simultaneously submitted, used to cycle through our per-frame resources like command buffers - constexpr static inline uint32_t MaxFramesInFlight = 3u; + ImGui::SetCursorPosX(windowPadding); - smart_refctd_ptr m_window; - smart_refctd_ptr> m_surface; - smart_refctd_ptr m_pipeline; - smart_refctd_ptr m_semaphore; - smart_refctd_ptr m_cmdPool; - uint64_t m_realFrameIx = 0; - std::array, MaxFramesInFlight> m_cmdBufs; - ISimpleManagedSurface::SAcquireResult m_currentImageAcquire = {}; + if (freePercentage > 70.0f) + ImGui::PushStyleColor(ImGuiCol_PlotHistogram, ImVec4(0.0f, 1.0f, 0.0f, 0.4f)); // Green + else if (freePercentage > 30.0f) + ImGui::PushStyleColor(ImGuiCol_PlotHistogram, ImVec4(1.0f, 1.0f, 0.0f, 0.4f)); // Yellow + else + ImGui::PushStyleColor(ImGuiCol_PlotHistogram, ImVec4(1.0f, 0.0f, 0.0f, 0.4f)); // Red - smart_refctd_ptr m_assetManager; - core::smart_refctd_ptr m_inputSystem; - InputSystem::ChannelReader mouse; - InputSystem::ChannelReader keyboard; + ImGui::ProgressBar(allocatedPercentage, barSize, ""); - constexpr static inline auto TexturesAmount = 2u; + ImGui::PopStyleColor(); - core::smart_refctd_ptr m_descriptorSetPool; + ImDrawList* drawList = ImGui::GetWindowDrawList(); - struct C_UI - { - nbl::core::smart_refctd_ptr manager; + ImVec2 progressBarPos = ImGui::GetItemRectMin(); + ImVec2 progressBarSize = ImGui::GetItemRectSize(); - struct - { - core::smart_refctd_ptr gui, scene; - } samplers; + const char* text = "%.2f%% free"; + char textBuffer[64]; + snprintf(textBuffer, sizeof(textBuffer), text, freePercentage); - core::smart_refctd_ptr descriptorSet; - }; + ImVec2 textSize = ImGui::CalcTextSize(textBuffer); + ImVec2 textPos = ImVec2 + ( + progressBarPos.x + (progressBarSize.x - textSize.x) * 0.5f, + progressBarPos.y + (progressBarSize.y - textSize.y) * 0.5f + ); - struct E_APP_PASS - { - nbl::core::smart_refctd_ptr scene; - C_UI ui; - } pass; + ImVec4 bgColor = 
ImGui::GetStyleColorVec4(ImGuiCol_WindowBg); + drawList->AddRectFilled + ( + ImVec2(textPos.x - 5, textPos.y - 2), + ImVec2(textPos.x + textSize.x + 5, textPos.y + textSize.y + 2), + ImGui::GetColorU32(bgColor) + ); - Camera camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD()); - video::CDumbPresentationOracle oracle; + ImGui::SetCursorScreenPos(textPos); + ImGui::Text("%s", textBuffer); - uint16_t gcIndex = {}; // note: this is dirty however since I assume only single object in scene I can leave it now, when this example is upgraded to support multiple objects this needs to be changed + ImGui::Dummy(ImVec2(0.0f, verticalPadding)); - TransformRequestParams transformParams; - bool isPerspective = true, isLH = true, flipGizmoY = true, move = false; - float fov = 60.f, zNear = 0.1f, zFar = 10000.f, moveSpeed = 1.f, rotateSpeed = 1.f; - float viewWidth = 10.f; - float camYAngle = 165.f / 180.f * 3.14159f; - float camXAngle = 32.f / 180.f * 3.14159f; + ImGui::End(); + } + + ImGui::End(); + } - bool firstFrame = true; + smart_refctd_ptr imGUI; + // descriptor set + smart_refctd_ptr subAllocDS; + SubAllocatedDescriptorSet::value_type renderColorViewDescIndex = SubAllocatedDescriptorSet::invalid_value; + // + Camera camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD()); + // mutables + std::string_view objectName; + core::matrix3x4SIMD model; + TransformRequestParams transformParams; + uint16_t2 sceneResolution = {1280,720}; + float fov = 60.f, zNear = 0.1f, zFar = 10000.f, moveSpeed = 1.f, rotateSpeed = 1.f; + float viewWidth = 10.f; + float camYAngle = 165.f / 180.f * 3.14159f; + float camXAngle = 32.f / 180.f * 3.14159f; + uint16_t gcIndex = {}; // note: this is dirty however since I assume only single object in scene I can leave it now, when this example is upgraded to support multiple objects this needs to be changed + bool isPerspective = true, isLH = true, flipGizmoY = true, move = false; 
+ bool firstFrame = true; + } interface; }; NBL_MAIN_FUNC(UISampleApp) \ No newline at end of file From 28726045367f9bbab5668e324af6a69bcfbb264c Mon Sep 17 00:00:00 2001 From: devsh Date: Tue, 24 Jun 2025 01:30:21 +0200 Subject: [PATCH 288/296] fix bugs in ex 61: - correct aspect masks on image views - wrong renderpass given to imgui - handle virtual window getting minimized - imguizmo not updating - imgui not drawing offscreen image --- 61_UI/main.cpp | 113 ++++++++++++++++++++++++++++++------------------- 1 file changed, 70 insertions(+), 43 deletions(-) diff --git a/61_UI/main.cpp b/61_UI/main.cpp index d4f21f2e8..830318e4e 100644 --- a/61_UI/main.cpp +++ b/61_UI/main.cpp @@ -92,6 +92,7 @@ class UISampleApp final : public MonoWindowApplication, public BuiltinResourcesA {}, IGPURenderpass::SCreationParams::SubpassesEnd }; + subpasses[0].depthStencilAttachment = {{.render={.attachmentIndex=0,.layout=IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL}}}; subpasses[0].colorAttachments[0] = {.render={.attachmentIndex=0,.layout=IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL}}; params.subpasses = subpasses; @@ -137,9 +138,12 @@ class UISampleApp final : public MonoWindowApplication, public BuiltinResourcesA return logFail("Failed to create Scene Renderpass!"); } m_renderer = CSimpleDebugRenderer::create(m_assetMgr.get(),m_renderpass.get(),0,m_scene.get()); + // we'll only display one thing at a time + m_renderer->m_instances.resize(1); // Create ImGUI { + auto scRes = static_cast(m_surface->getSwapchainResources()); ext::imgui::UI::SCreationParameters params = {}; params.resources.texturesInfo = {.setIx=0u,.bindingIx=TexturesImGUIBindingIndex}; params.resources.samplersInfo = {.setIx=0u,.bindingIx=1u}; @@ -147,7 +151,7 @@ class UISampleApp final : public MonoWindowApplication, public BuiltinResourcesA params.transfer = getTransferUpQueue(); params.pipelineLayout = 
ext::imgui::UI::createDefaultPipelineLayout(m_utils->getLogicalDevice(),params.resources.texturesInfo,params.resources.samplersInfo,MaxImGUITextures); params.assetManager = make_smart_refctd_ptr(smart_refctd_ptr(m_system)); - params.renderpass = m_renderpass; + params.renderpass = smart_refctd_ptr(scRes->getRenderpass()); params.subpassIx = 0u; params.pipelineCache = nullptr; interface.imGUI = ext::imgui::UI::create(std::move(params)); @@ -196,17 +200,13 @@ class UISampleApp final : public MonoWindowApplication, public BuiltinResourcesA return true; } - inline void beginRenderpass(IGPUCommandBuffer* cb, const IGPUCommandBuffer::SRenderpassBeginInfo& info) + // + virtual inline bool onAppTerminated() { - cb->beginRenderPass(info,IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE); - cb->setScissor(0,1,&info.renderArea); - const SViewport viewport = { - .x = 0, - .y = 0, - .width = static_cast(info.renderArea.extent.width), - .height = static_cast(info.renderArea.extent.height) - }; - cb->setViewport(0u,1u,&viewport); + SubAllocatedDescriptorSet::value_type fontAtlasDescIx = ext::imgui::UI::FontAtlasTexId; + IGPUDescriptorSet::SDropDescriptorSet dummy[1]; + interface.subAllocDS->multi_deallocate(dummy,TexturesImGUIBindingIndex,1,&fontAtlasDescIx); + return device_base_t::onAppTerminated(); } inline IQueue::SSubmitInfo::SSemaphoreInfo renderFrame(const std::chrono::microseconds nextPresentationTimestamp) override @@ -226,14 +226,16 @@ class UISampleApp final : public MonoWindowApplication, public BuiltinResourcesA cb->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); // clear to black for both things const IGPUCommandBuffer::SClearColorValue clearValue = { .float32 = {0.f,0.f,0.f,1.f} }; + if (m_framebuffer) { cb->beginDebugMarker("UISampleApp Scene Frame"); { + const IGPUCommandBuffer::SClearDepthStencilValue farValue = { .depth=0.f }; const IGPUCommandBuffer::SRenderpassBeginInfo renderpassInfo = { .framebuffer = m_framebuffer.get(), .colorClearValues = &clearValue, - 
.depthStencilClearValues = nullptr, + .depthStencilClearValues = &farValue, .renderArea = { .offset = {0,0}, .extent = {virtualWindowRes[0],virtualWindowRes[1]} @@ -254,7 +256,9 @@ class UISampleApp final : public MonoWindowApplication, public BuiltinResourcesA const auto viewParams = CSimpleDebugRenderer::SViewParams(viewMatrix,viewProjMatrix); // tear down scene every frame - m_renderer->m_instances[0].packedGeo = m_renderer->getInitParams().geoms.data()+interface.gcIndex; + auto& instance = m_renderer->m_instances[0]; + memcpy(&instance.world,&interface.model,sizeof(instance.world)); + instance.packedGeo = m_renderer->getInitParams().geoms.data()+interface.gcIndex; m_renderer->render(cb,viewParams); } cb->endRenderPass(); @@ -468,42 +472,65 @@ class UISampleApp final : public MonoWindowApplication, public BuiltinResourcesA }}); if (!m_device->allocate(image->getMemoryReqs(),image.get()).isValid()) return nullptr; - return m_device->createImageView({ + IGPUImageView::SCreationParams params = { .image = std::move(image), .viewType = IGPUImageView::ET_2D, - .format = format, - .subresourceRange = { - .aspectMask = isDepthOrStencilFormat(format) ? IGPUImage::EAF_DEPTH_BIT:IGPUImage::EAF_COLOR_BIT, - } - }); + .format = format + }; + params.subresourceRange.aspectMask = isDepthOrStencilFormat(format) ? 
IGPUImage::EAF_DEPTH_BIT:IGPUImage::EAF_COLOR_BIT; + return m_device->createImageView(std::move(params)); }; - - m_renderColorView = createImageAndView(finalSceneRenderFormat); - auto depthView = createImageAndView(sceneRenderDepthFormat); - m_framebuffer = m_device->createFramebuffer({ { - .renderpass = m_renderpass, - .depthStencilAttachments = &depthView.get(), - .colorAttachments = &m_renderColorView.get(), - .width = resolution.x, - .height = resolution.y - }}); + + smart_refctd_ptr colorView; + // detect window minimization + if (resolution.x<0x4000 && resolution.y<0x4000) + { + colorView = createImageAndView(finalSceneRenderFormat); + auto depthView = createImageAndView(sceneRenderDepthFormat); + m_framebuffer = m_device->createFramebuffer({ { + .renderpass = m_renderpass, + .depthStencilAttachments = &depthView.get(), + .colorAttachments = &colorView.get(), + .width = resolution.x, + .height = resolution.y + }}); + } + else + m_framebuffer = nullptr; // release previous slot and its image interface.subAllocDS->multi_deallocate(0,1,&interface.renderColorViewDescIndex,{.semaphore=m_semaphore.get(),.value=m_realFrameIx}); // - interface.subAllocDS->multi_allocate(0,1,&interface.renderColorViewDescIndex); - // update descriptor set - IGPUDescriptorSet::SDescriptorInfo info = {}; - info.desc = m_renderColorView; - info.info.image.imageLayout = IGPUImage::LAYOUT::READ_ONLY_OPTIMAL; - const IGPUDescriptorSet::SWriteDescriptorSet write = { - .dstSet = interface.subAllocDS->getDescriptorSet(), - .binding = TexturesImGUIBindingIndex, - .arrayElement = interface.renderColorViewDescIndex, - .count = 1, - .info = &info + if (colorView) + { + interface.subAllocDS->multi_allocate(0,1,&interface.renderColorViewDescIndex); + // update descriptor set + IGPUDescriptorSet::SDescriptorInfo info = {}; + info.desc = colorView; + info.info.image.imageLayout = IGPUImage::LAYOUT::READ_ONLY_OPTIMAL; + const IGPUDescriptorSet::SWriteDescriptorSet write = { + .dstSet = 
interface.subAllocDS->getDescriptorSet(), + .binding = TexturesImGUIBindingIndex, + .arrayElement = interface.renderColorViewDescIndex, + .count = 1, + .info = &info + }; + m_device->updateDescriptorSets({&write,1},{}); + } + interface.transformParams.sceneTexDescIx = interface.renderColorViewDescIndex; + } + + inline void beginRenderpass(IGPUCommandBuffer* cb, const IGPUCommandBuffer::SRenderpassBeginInfo& info) + { + cb->beginRenderPass(info,IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE); + cb->setScissor(0,1,&info.renderArea); + const SViewport viewport = { + .x = 0, + .y = 0, + .width = static_cast(info.renderArea.extent.width), + .height = static_cast(info.renderArea.extent.height) }; - m_device->updateDescriptorSets({&write,1},{}); + cb->setViewport(0u,1u,&viewport); } // Maximum frames which can be simultaneously submitted, used to cycle through our per-frame resources like command buffers @@ -518,7 +545,6 @@ class UISampleApp final : public MonoWindowApplication, public BuiltinResourcesA smart_refctd_ptr m_scene; smart_refctd_ptr m_renderpass; smart_refctd_ptr m_renderer; - smart_refctd_ptr m_renderColorView; smart_refctd_ptr m_framebuffer; // smart_refctd_ptr m_semaphore; @@ -706,6 +732,7 @@ class UISampleApp final : public MonoWindowApplication, public BuiltinResourcesA sceneResolution = EditTransform(imguizmoM16InOut.view.pointer(), imguizmoM16InOut.projection.pointer(), imguizmoM16InOut.model.pointer(), transformParams); } + model = core::transpose(imguizmoM16InOut.model).extractSub3x4(); // to Nabla + update camera & model matrices // TODO: make it more nicely, extract: // - Position by computing inverse of the view matrix and grabbing its translation @@ -835,8 +862,8 @@ class UISampleApp final : public MonoWindowApplication, public BuiltinResourcesA // Camera camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD()); // mutables - std::string_view objectName; core::matrix3x4SIMD model; + std::string_view objectName; 
TransformRequestParams transformParams; uint16_t2 sceneResolution = {1280,720}; float fov = 60.f, zNear = 0.1f, zFar = 10000.f, moveSpeed = 1.f, rotateSpeed = 1.f; From 20cc57eaea399d68da28f709d6b63878eba67a61 Mon Sep 17 00:00:00 2001 From: devsh Date: Tue, 24 Jun 2025 02:26:32 +0200 Subject: [PATCH 289/296] Push Mesh Loader example --- 12_MeshLoaders/CMakeLists.txt | 8 + 12_MeshLoaders/README.md | 2 + 12_MeshLoaders/config.json.template | 28 +++ 12_MeshLoaders/include/common.hpp | 18 ++ 12_MeshLoaders/main.cpp | 272 ++++++++++++++++++++++++++++ 12_MeshLoaders/pipeline.groovy | 50 +++++ CMakeLists.txt | 4 +- 7 files changed, 381 insertions(+), 1 deletion(-) create mode 100644 12_MeshLoaders/CMakeLists.txt create mode 100644 12_MeshLoaders/README.md create mode 100644 12_MeshLoaders/config.json.template create mode 100644 12_MeshLoaders/include/common.hpp create mode 100644 12_MeshLoaders/main.cpp create mode 100644 12_MeshLoaders/pipeline.groovy diff --git a/12_MeshLoaders/CMakeLists.txt b/12_MeshLoaders/CMakeLists.txt new file mode 100644 index 000000000..2dd253226 --- /dev/null +++ b/12_MeshLoaders/CMakeLists.txt @@ -0,0 +1,8 @@ +set(NBL_INCLUDE_SERACH_DIRECTORIES + "${CMAKE_CURRENT_SOURCE_DIR}/include" +) + + # TODO; Arek I removed `NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET` from the last parameter here, doesn't this macro have 4 arguments anyway !? 
+nbl_create_executable_project("" "" "${NBL_INCLUDE_SERACH_DIRECTORIES}" "" "") +# TODO: Arek temporarily disabled cause I haven't figured out how to make this target yet +# LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} nblExamplesGeometrySpirvBRD) \ No newline at end of file diff --git a/12_MeshLoaders/README.md b/12_MeshLoaders/README.md new file mode 100644 index 000000000..6330f4673 --- /dev/null +++ b/12_MeshLoaders/README.md @@ -0,0 +1,2 @@ +https://github.com/user-attachments/assets/6f779700-e6d4-4e11-95fb-7a7fddc47255 + diff --git a/12_MeshLoaders/config.json.template b/12_MeshLoaders/config.json.template new file mode 100644 index 000000000..f961745c1 --- /dev/null +++ b/12_MeshLoaders/config.json.template @@ -0,0 +1,28 @@ +{ + "enableParallelBuild": true, + "threadsPerBuildProcess" : 2, + "isExecuted": false, + "scriptPath": "", + "cmake": { + "configurations": [ "Release", "Debug", "RelWithDebInfo" ], + "buildModes": [], + "requiredOptions": [] + }, + "profiles": [ + { + "backend": "vulkan", + "platform": "windows", + "buildModes": [], + "runConfiguration": "Release", + "gpuArchitectures": [] + } + ], + "dependencies": [], + "data": [ + { + "dependencies": [], + "command": [""], + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/12_MeshLoaders/include/common.hpp b/12_MeshLoaders/include/common.hpp new file mode 100644 index 000000000..84cd8118a --- /dev/null +++ b/12_MeshLoaders/include/common.hpp @@ -0,0 +1,18 @@ +#ifndef _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_ +#define _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_ + + +#include "nbl/examples/examples.hpp" + +using namespace nbl; +using namespace core; +using namespace hlsl; +using namespace system; +using namespace asset; +using namespace ui; +using namespace video; +using namespace scene; +using namespace nbl::examples; + + +#endif // __NBL_THIS_EXAMPLE_COMMON_H_INCLUDED__ \ No newline at end of file diff --git a/12_MeshLoaders/main.cpp b/12_MeshLoaders/main.cpp new file mode 100644 index 
000000000..13868fa8c --- /dev/null +++ b/12_MeshLoaders/main.cpp @@ -0,0 +1,272 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#include "common.hpp" + +#include "../3rdparty/portable-file-dialogs/portable-file-dialogs.h" + + +class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourcesApplication +{ + using device_base_t = MonoWindowApplication; + using asset_base_t = BuiltinResourcesApplication; + + public: + inline MeshLoadersApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) + : IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD), + device_base_t({1280,720}, EF_UNKNOWN, _localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} + + inline bool onAppInitialized(smart_refctd_ptr&& system) override + { + if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + + m_semaphore = m_device->createSemaphore(m_realFrameIx); + if (!m_semaphore) + return logFail("Failed to Create a Semaphore!"); + + auto pool = m_device->createCommandPool(getGraphicsQueue()->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + for (auto i=0u; icreateCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{m_cmdBufs.data()+i,1})) + return logFail("Couldn't create Command Buffer!"); + } + + //! 
cache results -- speeds up mesh generation on second run + m_qnc->loadCacheFromFile(m_system.get(),sharedOutputCWD/"../../tmp/normalCache888.sse"); + + // + if (!reloadModel()) + return false; +#if 0 + const uint32_t addtionalBufferOwnershipFamilies[] = {getGraphicsQueue()->getFamilyIndex()}; + // we want to use the vertex data through UTBs + using usage_f = IGPUBuffer::E_USAGE_FLAGS; + CAssetConverter::patch_t patch = {}; + patch.positionBufferUsages = usage_f::EUF_UNIFORM_TEXEL_BUFFER_BIT; + patch.indexBufferUsages = usage_f::EUF_INDEX_BUFFER_BIT; + patch.otherBufferUsages = usage_f::EUF_UNIFORM_TEXEL_BUFFER_BIT; + m_scene = CGeometryCreatorScene::create( + { + .transferQueue = getTransferUpQueue(), + .utilities = m_utils.get(), + .logger = m_logger.get(), + .addtionalBufferOwnershipFamilies = addtionalBufferOwnershipFamilies + },patch + ); +#endif + + auto scRes = static_cast(m_surface->getSwapchainResources()); + m_renderer = CSimpleDebugRenderer::create(m_assetMgr.get(),scRes->getRenderpass(),0,nullptr); + + camera.mapKeysToArrows(); + + onAppInitializedFinish(); + return true; + } + + inline IQueue::SSubmitInfo::SSemaphoreInfo renderFrame(const std::chrono::microseconds nextPresentationTimestamp) override + { + m_inputSystem->getDefaultMouse(&mouse); + m_inputSystem->getDefaultKeyboard(&keyboard); + + // + const auto resourceIx = m_realFrameIx % MaxFramesInFlight; + + auto* const cb = m_cmdBufs.data()[resourceIx].get(); + cb->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); + cb->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + // clear to black for both things + { + // begin renderpass + { + auto scRes = static_cast(m_surface->getSwapchainResources()); + auto* framebuffer = scRes->getFramebuffer(device_base_t::getCurrentAcquire().imageIndex); + const IGPUCommandBuffer::SClearColorValue clearValue = { .float32 = {1.f,0.f,1.f,1.f} }; + const IGPUCommandBuffer::SClearDepthStencilValue depthValue = { .depth = 0.f }; + const VkRect2D 
currentRenderArea = + { + .offset = {0,0}, + .extent = {framebuffer->getCreationParameters().width,framebuffer->getCreationParameters().height} + }; + const IGPUCommandBuffer::SRenderpassBeginInfo info = + { + .framebuffer = framebuffer, + .colorClearValues = &clearValue, + .depthStencilClearValues = &depthValue, + .renderArea = currentRenderArea + }; + cb->beginRenderPass(info,IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE); + + const SViewport viewport = { + .x = static_cast(currentRenderArea.offset.x), + .y = static_cast(currentRenderArea.offset.y), + .width = static_cast(currentRenderArea.extent.width), + .height = static_cast(currentRenderArea.extent.height) + }; + cb->setViewport(0u,1u,&viewport); + + cb->setScissor(0u,1u,¤tRenderArea); + } + // late latch input + { + camera.beginInputProcessing(nextPresentationTimestamp); + mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void { camera.mouseProcess(events); }, m_logger.get()); + keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void + { + camera.keyboardProcess(events); + }, + m_logger.get() + ); + camera.endInputProcessing(nextPresentationTimestamp); + } + // draw scene + { + float32_t3x4 viewMatrix; + float32_t4x4 viewProjMatrix; + // TODO: get rid of legacy matrices + { + memcpy(&viewMatrix,camera.getViewMatrix().pointer(),sizeof(viewMatrix)); + memcpy(&viewProjMatrix,camera.getConcatenatedMatrix().pointer(),sizeof(viewProjMatrix)); + } + m_renderer->render(cb,CSimpleDebugRenderer::SViewParams(viewMatrix,viewProjMatrix)); + } + cb->endRenderPass(); + } + cb->end(); + + //updateGUIDescriptorSet(); + + IQueue::SSubmitInfo::SSemaphoreInfo retval = + { + .semaphore = m_semaphore.get(), + .value = ++m_realFrameIx, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_GRAPHICS_BITS + }; + const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] = + { + {.cmdbuf = cb } + }; + const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { + { + .semaphore = 
device_base_t::getCurrentAcquire().semaphore, + .value = device_base_t::getCurrentAcquire().acquireCount, + .stageMask = PIPELINE_STAGE_FLAGS::NONE + } + }; + const IQueue::SSubmitInfo infos[] = + { + { + .waitSemaphores = acquired, + .commandBuffers = commandBuffers, + .signalSemaphores = {&retval,1} + } + }; + + if (getGraphicsQueue()->submit(infos) != IQueue::RESULT::SUCCESS) + { + retval.semaphore = nullptr; // so that we don't wait on semaphore that will never signal + m_realFrameIx--; + } + + std::string caption = "[Nabla Engine] Mesh Loaders"; + { + caption += ", displaying ["; + caption += m_modelPath; + caption += "]"; + m_window->setCaption(caption); + } + return retval; + } + + protected: + const video::IGPURenderpass::SCreationParams::SSubpassDependency* getDefaultSubpassDependencies() const override + { + // Subsequent submits don't wait for each other, hence its important to have External Dependencies which prevent users of the depth attachment overlapping. + const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] = { + // wipe-transition of Color to ATTACHMENT_OPTIMAL and depth + { + .srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, + .dstSubpass = 0, + .memoryBarrier = { + // last place where the depth can get modified in previous frame, `COLOR_ATTACHMENT_OUTPUT_BIT` is implicitly later + .srcStageMask = PIPELINE_STAGE_FLAGS::LATE_FRAGMENT_TESTS_BIT, + // don't want any writes to be available, we'll clear + .srcAccessMask = ACCESS_FLAGS::NONE, + // destination needs to wait as early as possible + // TODO: `COLOR_ATTACHMENT_OUTPUT_BIT` shouldn't be needed, because its a logically later stage, see TODO in `ECommonEnums.h` + .dstStageMask = PIPELINE_STAGE_FLAGS::EARLY_FRAGMENT_TESTS_BIT | PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, + // because depth and color get cleared first no read mask + .dstAccessMask = ACCESS_FLAGS::DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | 
ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT + } + // leave view offsets and flags default + }, + // color from ATTACHMENT_OPTIMAL to PRESENT_SRC + { + .srcSubpass = 0, + .dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, + .memoryBarrier = { + // last place where the color can get modified, depth is implicitly earlier + .srcStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, + // only write ops, reads can't be made available + .srcAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT + // spec says nothing is needed when presentation is the destination + } + // leave view offsets and flags default + }, + IGPURenderpass::SCreationParams::DependenciesEnd + }; + return dependencies; + } + + private: + inline bool reloadModel() + { + pfd::open_file file("Choose a supported Model File", "../../media", { "All Supported Formats", "*.ply *.stl *.serialized *.obj", + "TODO (.ply)", "*.ply", + "TODO (.stl)", "*.stl", + "Mitsuba 0.6 Serialized (.serialized)", "*.serialized", + "Wavefront Object (.obj)", "*.obj" + }); + if (file.result().empty()) + return false; + m_modelPath = file.result()[0]; + + // free up + m_assetMgr->clearAllAssetCache(); + + //! load the geometry + IAssetLoader::SAssetLoadParams params = {}; + params.meshManipulatorOverride = nullptr; // TODO + auto bundle = m_assetMgr->getAsset(m_modelPath,params); + if (bundle.getContents().empty()) + return false; + //! 
cache results -- speeds up mesh generation on second run + m_qnc->saveCacheToFile(m_system.get(),sharedOutputCWD/"../../tmp/normalCache888.sse"); + + return true; + } + + // Maximum frames which can be simultaneously submitted, used to cycle through our per-frame resources like command buffers + constexpr static inline uint32_t MaxFramesInFlight = 3u; + // + smart_refctd_ptr m_qnc; + smart_refctd_ptr m_renderer; + // + smart_refctd_ptr m_semaphore; + uint64_t m_realFrameIx = 0; + std::array,MaxFramesInFlight> m_cmdBufs; + // + InputSystem::ChannelReader mouse; + InputSystem::ChannelReader keyboard; + // + Camera camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD()); + // mutables + std::string m_modelPath; +}; + +NBL_MAIN_FUNC(MeshLoadersApp) \ No newline at end of file diff --git a/12_MeshLoaders/pipeline.groovy b/12_MeshLoaders/pipeline.groovy new file mode 100644 index 000000000..7b7c9702a --- /dev/null +++ b/12_MeshLoaders/pipeline.groovy @@ -0,0 +1,50 @@ +import org.DevshGraphicsProgramming.Agent +import org.DevshGraphicsProgramming.BuilderInfo +import org.DevshGraphicsProgramming.IBuilder + +class CUIBuilder extends IBuilder +{ + public CUIBuilder(Agent _agent, _info) + { + super(_agent, _info) + } + + @Override + public boolean prepare(Map axisMapping) + { + return true + } + + @Override + public boolean build(Map axisMapping) + { + IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION") + IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE") + + def nameOfBuildDirectory = getNameOfBuildDirectory(buildType) + def nameOfConfig = getNameOfConfig(config) + + agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v") + + return true + } + + @Override + public boolean test(Map axisMapping) + { + return true + } + + @Override + public boolean install(Map axisMapping) + { + return true + 
} +} + +def create(Agent _agent, _info) +{ + return new CUIBuilder(_agent, _info) +} + +return this \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index aa3880762..66d6f682d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -41,6 +41,8 @@ if(NBL_BUILD_EXAMPLES) add_subdirectory(10_CountingSort) # showcase use of FFT for post-FX Bloom effect add_subdirectory(11_FFT) + # + add_subdirectory(12_MeshLoaders EXCLUDE_FROM_ALL) # Waiting for a refactor #add_subdirectory(27_PLYSTLDemo) @@ -71,7 +73,7 @@ if(NBL_BUILD_EXAMPLES) add_subdirectory(47_DerivMapTest EXCLUDE_FROM_ALL) add_subdirectory(54_Transformations EXCLUDE_FROM_ALL) add_subdirectory(55_RGB18E7S3 EXCLUDE_FROM_ALL) - add_subdirectory(61_UI EXCLUDE_FROM_ALL) # TODO: resurrect before `mesh_loaders` merge + add_subdirectory(61_UI) add_subdirectory(62_CAD EXCLUDE_FROM_ALL) # TODO: Erfan, Przemek, Francisco and co. need to resurrect this add_subdirectory(62_SchusslerTest EXCLUDE_FROM_ALL) add_subdirectory(64_EmulatedFloatTest) From 86cc7dda5858eb5dd83ee2fdaf9ae4ade485d7c7 Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 25 Jun 2025 14:10:16 +0200 Subject: [PATCH 290/296] decouple `CSimpleDebugRenderer` from `CGeometryCreatorScene` --- 12_MeshLoaders/main.cpp | 31 +++++++--- .../geometry/CGeometryCreatorScene.hpp | 32 +++++----- .../geometry/CSimpleDebugRenderer.hpp | 61 +++++++++---------- 3 files changed, 70 insertions(+), 54 deletions(-) diff --git a/12_MeshLoaders/main.cpp b/12_MeshLoaders/main.cpp index 13868fa8c..0a4e20141 100644 --- a/12_MeshLoaders/main.cpp +++ b/12_MeshLoaders/main.cpp @@ -37,6 +37,7 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc } //! 
cache results -- speeds up mesh generation on second run + m_qnc = make_smart_refctd_ptr(); m_qnc->loadCacheFromFile(m_system.get(),sharedOutputCWD/"../../tmp/normalCache888.sse"); // @@ -224,17 +225,29 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc } private: + // TODO: standardise this across examples, and take from `argv` + bool m_nonInteractiveTest = true; + inline bool reloadModel() { - pfd::open_file file("Choose a supported Model File", "../../media", { "All Supported Formats", "*.ply *.stl *.serialized *.obj", - "TODO (.ply)", "*.ply", - "TODO (.stl)", "*.stl", - "Mitsuba 0.6 Serialized (.serialized)", "*.serialized", - "Wavefront Object (.obj)", "*.obj" - }); - if (file.result().empty()) - return false; - m_modelPath = file.result()[0]; + if (m_nonInteractiveTest) // TODO: maybe also take from argv and argc + m_modelPath = (sharedInputCWD/"ply/Spanner-ply.ply").string(); + else + { + pfd::open_file file("Choose a supported Model File", sharedInputCWD.string(), + { + "All Supported Formats", "*.ply *.stl *.serialized *.obj", + "TODO (.ply)", "*.ply", + "TODO (.stl)", "*.stl", + "Mitsuba 0.6 Serialized (.serialized)", "*.serialized", + "Wavefront Object (.obj)", "*.obj" + }, + false + ); + if (file.result().empty()) + return false; + m_modelPath = file.result()[0]; + } // free up m_assetMgr->clearAllAssetCache(); diff --git a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp index 63b3d7a8d..2798cfed7 100644 --- a/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp +++ b/common/include/nbl/examples/geometry/CGeometryCreatorScene.hpp @@ -42,13 +42,13 @@ class CGeometryCreatorScene : public core::IReferenceCounted } - core::vector namedGeometries; + SInitParams init = {}; core::vector> geometries; // create out geometries { - auto addGeometry = [&namedGeometries,&geometries](const std::string_view name, smart_refctd_ptr&& geom)->void + 
auto addGeometry = [&init,&geometries](const std::string_view name, smart_refctd_ptr&& geom)->void { - namedGeometries.emplace_back().name = name; + init.geometryNames.emplace_back(name); geometries.push_back(std::move(geom)); }; @@ -67,6 +67,7 @@ class CGeometryCreatorScene : public core::IReferenceCounted addGeometry("Rectangle",creator->createRectangle({1.5f,3.f})); addGeometry("Disk",creator->createDisk(2.f,30)); } + init.geometries.reserve(init.geometryNames.size()); // convert the geometries { @@ -148,34 +149,37 @@ class CGeometryCreatorScene : public core::IReferenceCounted // assign outputs { auto inIt = reservation.getGPUObjects().data(); - for (auto outIt=namedGeometries.begin(); outIt!=namedGeometries.end(); inIt++) + for (auto outIt=init.geometryNames.begin(); outIt!=init.geometryNames.end(); inIt++) { if (inIt->value) - (outIt++)->geom = inIt->value; + { + init.geometries.push_back(inIt->value); + outIt++; + } else { - logger->log("Failed to convert ICPUPolygonGeometry %s to GPU!",ILogger::ELL_ERROR,outIt->name.data()); - outIt = namedGeometries.erase(outIt); + logger->log("Failed to convert ICPUPolygonGeometry %s to GPU!",ILogger::ELL_ERROR,outIt->c_str()); + outIt = init.geometryNames.erase(outIt); } } } } - return smart_refctd_ptr(new CGeometryCreatorScene(std::move(namedGeometries)),dont_grab); + return smart_refctd_ptr(new CGeometryCreatorScene(std::move(init)),dont_grab); } // - struct SNamedGeometry + struct SInitParams { - std::string name = {}; - core::smart_refctd_ptr geom; + core::vector> geometries; + core::vector geometryNames; }; - std::span getGeometries() const {return m_geometries;} + const SInitParams& getInitParams() const {return m_init;} protected: - inline CGeometryCreatorScene(core::vector&& _geometries) : m_geometries(std::move(_geometries)) {} + inline CGeometryCreatorScene(SInitParams&& _init) : m_init(std::move(_init)) {} - core::vector m_geometries; + SInitParams m_init; #undef EXPOSE_NABLA_NAMESPACES }; diff --git 
a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp index 474f1d350..325ae8eb7 100644 --- a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp +++ b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp @@ -79,7 +79,7 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted }; // - static inline core::smart_refctd_ptr create(asset::IAssetManager* assMan, video::IGPURenderpass* renderpass, const uint32_t subpassIX, const CGeometryCreatorScene* scene) + static inline core::smart_refctd_ptr create(asset::IAssetManager* assMan, video::IGPURenderpass* renderpass, const uint32_t subpassIX, const std::span geometries) { EXPOSE_NABLA_NAMESPACES; @@ -88,10 +88,7 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted auto device = const_cast(renderpass->getOriginDevice()); auto logger = device->getLogger(); - if (!assMan || !scene) - return nullptr; - const auto namedGeoms = scene->getGeometries(); - if (namedGeoms.empty()) + if (!assMan || geometries.empty()) return nullptr; // load shader @@ -154,33 +151,26 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted init.layout = device->createPipelineLayout(ranges,smart_refctd_ptr(init.ds->getLayout())); // create pipelines - enum PipelineType : uint8_t - { - BasicTriangleList, - BasicTriangleFan, - Cone, - Count - }; - smart_refctd_ptr pipelines[PipelineType::Count] = {}; + using pipeline_e = SInitParams::PipelineType; { - IGPUGraphicsPipeline::SCreationParams params[PipelineType::Count] = {}; - params[PipelineType::BasicTriangleList].vertexShader = {.shader=shader.get(),.entryPoint="BasicVS"}; - params[PipelineType::BasicTriangleList].fragmentShader = {.shader=shader.get(),.entryPoint="BasicFS"}; - params[PipelineType::BasicTriangleFan].vertexShader = {.shader=shader.get(),.entryPoint="BasicVS"}; - params[PipelineType::BasicTriangleFan].fragmentShader = 
{.shader=shader.get(),.entryPoint="BasicFS"}; - params[PipelineType::Cone].vertexShader = {.shader=shader.get(),.entryPoint="ConeVS"}; - params[PipelineType::Cone].fragmentShader = {.shader=shader.get(),.entryPoint="ConeFS"}; - for (auto i=0; i< PipelineType::Count; i++) + IGPUGraphicsPipeline::SCreationParams params[pipeline_e::Count] = {}; + params[pipeline_e::BasicTriangleList].vertexShader = {.shader=shader.get(),.entryPoint="BasicVS"}; + params[pipeline_e::BasicTriangleList].fragmentShader = {.shader=shader.get(),.entryPoint="BasicFS"}; + params[pipeline_e::BasicTriangleFan].vertexShader = {.shader=shader.get(),.entryPoint="BasicVS"}; + params[pipeline_e::BasicTriangleFan].fragmentShader = {.shader=shader.get(),.entryPoint="BasicFS"}; + params[pipeline_e::Cone].vertexShader = {.shader=shader.get(),.entryPoint="ConeVS"}; + params[pipeline_e::Cone].fragmentShader = {.shader=shader.get(),.entryPoint="ConeFS"}; + for (auto i=0; i(i); + const auto type = static_cast(i); switch (type) { - case PipelineType::BasicTriangleFan: + case pipeline_e::BasicTriangleFan: primitiveAssembly.primitiveType = E_PRIMITIVE_TOPOLOGY::EPT_TRIANGLE_FAN; break; default: @@ -193,7 +183,7 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted params[i].cached.subpassIx = subpassIX; params[i].renderpass = renderpass; } - if (!device->createGraphicsPipelines(nullptr,params,pipelines)) + if (!device->createGraphicsPipelines(nullptr,params,init.pipelines)) { logger->log("Could not create Graphics Pipelines!",ILogger::ELL_ERROR); return nullptr; @@ -212,9 +202,8 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted return retval; }; - for (const auto& entry : namedGeoms) + for (const auto geom : geometries) { - const auto* geom = entry.geom.get(); // could also check device origin on all buffers if (!geom->valid()) continue; @@ -222,15 +211,12 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted switch (geom->getIndexingCallback()->knownTopology()) 
{ case E_PRIMITIVE_TOPOLOGY::EPT_TRIANGLE_FAN: - out.pipeline = pipelines[PipelineType::BasicTriangleFan]; + out.pipeline = init.pipelines[pipeline_e::BasicTriangleFan]; break; default: - out.pipeline = pipelines[PipelineType::BasicTriangleList]; + out.pipeline = init.pipelines[pipeline_e::BasicTriangleList]; break; } - // special case - if (entry.name=="Cone") - out.pipeline = pipelines[PipelineType::Cone]; if (const auto& view=geom->getIndexView(); view) { out.indexBuffer.offset = view.src.offset; @@ -275,12 +261,25 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted // struct SInitParams { + enum PipelineType : uint8_t + { + BasicTriangleList, + BasicTriangleFan, + Cone, // special case + Count + }; + core::smart_refctd_ptr ds; core::smart_refctd_ptr layout; + core::smart_refctd_ptr pipelines[PipelineType::Count]; core::vector geoms; }; inline const SInitParams& getInitParams() const {return m_params;} + // + inline auto& getGeometry(const uint32_t ix) {return m_params.geoms[ix];} + inline const auto& getGeometry(const uint32_t ix) const {return m_params.geoms[ix];} + // inline void render(video::IGPUCommandBuffer* cmdbuf, const SViewParams& viewParams) const { From e790c841466747645638060ded7972de2ca0348b Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 25 Jun 2025 15:38:43 +0200 Subject: [PATCH 291/296] more improvements to make the Simple Debug Renderer more runtime friendly --- 09_GeometryCreator/main.cpp | 31 ++- 61_UI/main.cpp | 29 +- .../geometry/CSimpleDebugRenderer.hpp | 260 ++++++++++++------ 3 files changed, 219 insertions(+), 101 deletions(-) diff --git a/09_GeometryCreator/main.cpp b/09_GeometryCreator/main.cpp index 1a959f7a0..900d827b7 100644 --- a/09_GeometryCreator/main.cpp +++ b/09_GeometryCreator/main.cpp @@ -37,25 +37,32 @@ class GeometryCreatorApp final : public MonoWindowApplication, public BuiltinRes } const uint32_t addtionalBufferOwnershipFamilies[] = {getGraphicsQueue()->getFamilyIndex()}; - // we want to use the vertex 
data through UTBs - using usage_f = IGPUBuffer::E_USAGE_FLAGS; - CAssetConverter::patch_t patch = {}; - patch.positionBufferUsages = usage_f::EUF_UNIFORM_TEXEL_BUFFER_BIT; - patch.indexBufferUsages = usage_f::EUF_INDEX_BUFFER_BIT; - patch.otherBufferUsages = usage_f::EUF_UNIFORM_TEXEL_BUFFER_BIT; m_scene = CGeometryCreatorScene::create( { .transferQueue = getTransferUpQueue(), .utilities = m_utils.get(), .logger = m_logger.get(), .addtionalBufferOwnershipFamilies = addtionalBufferOwnershipFamilies - },patch + }, + CSimpleDebugRenderer::DefaultPolygonGeometryPatch // we want to use the vertex data through UTBs ); auto scRes = static_cast(m_surface->getSwapchainResources()); - m_renderer = CSimpleDebugRenderer::create(m_assetMgr.get(),scRes->getRenderpass(),0,m_scene.get()); - if (!m_renderer) + const auto& geometries = m_scene->getInitParams().geometries; + m_renderer = CSimpleDebugRenderer::create(m_assetMgr.get(),scRes->getRenderpass(),0,{&geometries.front().get(),geometries.size()}); + if (!m_renderer || m_renderer->getGeometries().size() != geometries.size()) return logFail("Could not create Renderer!"); + // special case + { + const auto& pipelines = m_renderer->getInitParams().pipelines; + auto ix = 0u; + for (const auto& name : m_scene->getInitParams().geometryNames) + { + if (name=="Cone") + m_renderer->getGeometry(ix).pipeline = pipelines[CSimpleDebugRenderer::SInitParams::PipelineType::Cone]; + ix++; + } + } m_renderer->m_instances.resize(1); m_renderer->m_instances[0].world = float32_t3x4( float32_t4(1,0,0,0), @@ -143,7 +150,7 @@ class GeometryCreatorApp final : public MonoWindowApplication, public BuiltinRes const auto viewParams = CSimpleDebugRenderer::SViewParams(viewMatrix,viewProjMatrix); // tear down scene every frame - m_renderer->m_instances[0].packedGeo = m_renderer->getInitParams().geoms.data()+gcIndex; + m_renderer->m_instances[0].packedGeo = m_renderer->getGeometries().data()+gcIndex; m_renderer->render(cb,viewParams); cb->endRenderPass(); @@ 
-185,7 +192,7 @@ class GeometryCreatorApp final : public MonoWindowApplication, public BuiltinRes std::string caption = "[Nabla Engine] Geometry Creator"; { caption += ", displaying ["; - caption += m_scene->getGeometries()[gcIndex].name; + caption += m_scene->getInitParams().geometryNames[gcIndex]; caption += "]"; m_window->setCaption(caption); } @@ -258,7 +265,7 @@ class GeometryCreatorApp final : public MonoWindowApplication, public BuiltinRes if (ev.type==nbl::ui::SMouseEvent::EET_SCROLL && m_renderer) { gcIndex += int16_t(core::sign(ev.scrollEvent.verticalScroll)); - gcIndex = core::clamp(gcIndex,0ull,m_renderer->getInitParams().geoms.size()-1); + gcIndex = core::clamp(gcIndex,0ull,m_renderer->getGeometries().size()-1); } } } diff --git a/61_UI/main.cpp b/61_UI/main.cpp index 830318e4e..643cab079 100644 --- a/61_UI/main.cpp +++ b/61_UI/main.cpp @@ -40,19 +40,14 @@ class UISampleApp final : public MonoWindowApplication, public BuiltinResourcesA } const uint32_t addtionalBufferOwnershipFamilies[] = {getGraphicsQueue()->getFamilyIndex()}; - // we want to use the vertex data through UTBs - using usage_f = IGPUBuffer::E_USAGE_FLAGS; - CAssetConverter::patch_t patch = {}; - patch.positionBufferUsages = usage_f::EUF_UNIFORM_TEXEL_BUFFER_BIT; - patch.indexBufferUsages = usage_f::EUF_INDEX_BUFFER_BIT; - patch.otherBufferUsages = usage_f::EUF_UNIFORM_TEXEL_BUFFER_BIT; m_scene = CGeometryCreatorScene::create( { .transferQueue = getTransferUpQueue(), .utilities = m_utils.get(), .logger = m_logger.get(), .addtionalBufferOwnershipFamilies = addtionalBufferOwnershipFamilies - },patch + }, + CSimpleDebugRenderer::DefaultPolygonGeometryPatch ); // for the scene drawing pass @@ -137,7 +132,19 @@ class UISampleApp final : public MonoWindowApplication, public BuiltinResourcesA if (!m_renderpass) return logFail("Failed to create Scene Renderpass!"); } - m_renderer = CSimpleDebugRenderer::create(m_assetMgr.get(),m_renderpass.get(),0,m_scene.get()); + const auto& geometries = 
m_scene->getInitParams().geometries; + m_renderer = CSimpleDebugRenderer::create(m_assetMgr.get(),m_renderpass.get(),0,{&geometries.front().get(),geometries.size()}); + // special case + { + const auto& pipelines = m_renderer->getInitParams().pipelines; + auto ix = 0u; + for (const auto& name : m_scene->getInitParams().geometryNames) + { + if (name=="Cone") + m_renderer->getGeometry(ix).pipeline = pipelines[CSimpleDebugRenderer::SInitParams::PipelineType::Cone]; + ix++; + } + } // we'll only display one thing at a time m_renderer->m_instances.resize(1); @@ -258,7 +265,7 @@ class UISampleApp final : public MonoWindowApplication, public BuiltinResourcesA // tear down scene every frame auto& instance = m_renderer->m_instances[0]; memcpy(&instance.world,&interface.model,sizeof(instance.world)); - instance.packedGeo = m_renderer->getInitParams().geoms.data()+interface.gcIndex; + instance.packedGeo = m_renderer->getGeometries().data() + interface.gcIndex; m_renderer->render(cb,viewParams); } cb->endRenderPass(); @@ -418,7 +425,7 @@ class UISampleApp final : public MonoWindowApplication, public BuiltinResourcesA if (e.type==nbl::ui::SMouseEvent::EET_SCROLL && m_renderer) { interface.gcIndex += int16_t(core::sign(e.scrollEvent.verticalScroll)); - interface.gcIndex = core::clamp(interface.gcIndex,0ull,m_renderer->getInitParams().geoms.size()-1); + interface.gcIndex = core::clamp(interface.gcIndex,0ull,m_renderer->getGeometries().size()-1); } } }, @@ -453,7 +460,7 @@ class UISampleApp final : public MonoWindowApplication, public BuiltinResourcesA .keyboardEvents = uiEvents.keyboard }; - interface.objectName = m_scene->getGeometries()[interface.gcIndex].name; + interface.objectName = m_scene->getInitParams().geometryNames[interface.gcIndex]; interface.imGUI->update(params); } diff --git a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp index 325ae8eb7..969b3afd8 100644 --- 
a/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp +++ b/common/include/nbl/examples/geometry/CSimpleDebugRenderer.hpp @@ -20,7 +20,10 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted using namespace nbl::system; \ using namespace nbl::asset; \ using namespace nbl::video + public: + // + constexpr static inline uint16_t VertexAttrubUTBDescBinding = 0; // struct SViewParams { @@ -79,7 +82,19 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted }; // - static inline core::smart_refctd_ptr create(asset::IAssetManager* assMan, video::IGPURenderpass* renderpass, const uint32_t subpassIX, const std::span geometries) + constexpr static inline auto DefaultPolygonGeometryPatch = []()->video::CAssetConverter::patch_t + { + // we want to use the vertex data through UTBs + using usage_f = video::IGPUBuffer::E_USAGE_FLAGS; + video::CAssetConverter::patch_t patch = {}; + patch.positionBufferUsages = usage_f::EUF_UNIFORM_TEXEL_BUFFER_BIT; + patch.indexBufferUsages = usage_f::EUF_INDEX_BUFFER_BIT; + patch.otherBufferUsages = usage_f::EUF_UNIFORM_TEXEL_BUFFER_BIT; + return patch; + }(); + + // + static inline core::smart_refctd_ptr create(asset::IAssetManager* assMan, video::IGPURenderpass* renderpass, const uint32_t subpassIX) { EXPOSE_NABLA_NAMESPACES; @@ -88,7 +103,7 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted auto device = const_cast(renderpass->getOriginDevice()); auto logger = device->getLogger(); - if (!assMan || geometries.empty()) + if (!assMan) return nullptr; // load shader @@ -113,13 +128,14 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted // create Descriptor Set Layout smart_refctd_ptr dsLayout; { + using binding_flags_t = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS; const IGPUDescriptorSetLayout::SBinding bindings[] = { { - .binding = 0, + .binding = VertexAttrubUTBDescBinding, .type = IDescriptor::E_TYPE::ET_UNIFORM_TEXEL_BUFFER, - // some geometries may not have 
particular attributes - .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_PARTIALLY_BOUND_BIT, + // need this trifecta of flags for `SubAllocatedDescriptorSet` to accept the binding as suballocatable + .createFlags = binding_flags_t::ECF_UPDATE_AFTER_BIND_BIT|binding_flags_t::ECF_UPDATE_UNUSED_WHILE_PENDING_BIT |binding_flags_t::ECF_PARTIALLY_BOUND_BIT, .stageFlags = IShader::E_SHADER_STAGE::ESS_VERTEX|IShader::E_SHADER_STAGE::ESS_FRAGMENT, .count = SInstance::SPushConstants::DescriptorCount } @@ -134,12 +150,13 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted // create Descriptor Set auto pool = device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT,{&dsLayout.get(),1}); - init.ds = pool->createDescriptorSet(std::move(dsLayout)); - if (!init.ds) + auto ds = pool->createDescriptorSet(std::move(dsLayout)); + if (!ds) { logger->log("Could not descriptor set!",ILogger::ELL_ERROR); return nullptr; } + init.subAllocDS = make_smart_refctd_ptr(std::move(ds)); } // create pipeline layout @@ -148,7 +165,7 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted .offset = 0, .size = sizeof(SInstance::SPushConstants), }}; - init.layout = device->createPipelineLayout(ranges,smart_refctd_ptr(init.ds->getLayout())); + init.layout = device->createPipelineLayout(ranges,smart_refctd_ptr(init.subAllocDS->getDescriptorSet()->getLayout())); // create pipelines using pipeline_e = SInitParams::PipelineType; @@ -190,74 +207,18 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted } } - // write geometries' attributes to descriptor set - { - core::vector infos; - auto allocateUTB = [device,&infos](const IGeometry::SDataView& view)->uint8_t - { - if (!view) - return SInstance::SPushConstants::DescriptorCount; - const auto retval = infos.size(); - infos.emplace_back().desc = device->createBufferView(view.src, view.composed.format); - return retval; - }; - - for (const auto geom : geometries) - { 
- // could also check device origin on all buffers - if (!geom->valid()) - continue; - auto& out = init.geoms.emplace_back(); - switch (geom->getIndexingCallback()->knownTopology()) - { - case E_PRIMITIVE_TOPOLOGY::EPT_TRIANGLE_FAN: - out.pipeline = init.pipelines[pipeline_e::BasicTriangleFan]; - break; - default: - out.pipeline = init.pipelines[pipeline_e::BasicTriangleList]; - break; - } - if (const auto& view=geom->getIndexView(); view) - { - out.indexBuffer.offset = view.src.offset; - out.indexBuffer.buffer = view.src.buffer; - switch (view.composed.format) - { - case E_FORMAT::EF_R16_UINT: - out.indexType = EIT_16BIT; - break; - case E_FORMAT::EF_R32_UINT: - out.indexType = EIT_32BIT; - break; - default: - assert(false); - return nullptr; - } - } - out.elementCount = geom->getVertexReferenceCount(); - out.positionView = allocateUTB(geom->getPositionView()); - out.normalView = allocateUTB(geom->getNormalView()); - // the first view is usually the UV - if (const auto& auxViews = geom->getAuxAttributeViews(); !auxViews.empty()) - out.uvView = allocateUTB(auxViews.front()); - } - - if (infos.empty()) - return nullptr; - const IGPUDescriptorSet::SWriteDescriptorSet write = { - .dstSet = init.ds.get(), - .binding = 0, - .arrayElement = 0, - .count = static_cast(infos.size()), - .info = infos.data() - }; - if (!device->updateDescriptorSets({&write,1},{})) - return nullptr; - } - return smart_refctd_ptr(new CSimpleDebugRenderer(std::move(init)),dont_grab); } + // + static inline core::smart_refctd_ptr create(asset::IAssetManager* assMan, video::IGPURenderpass* renderpass, const uint32_t subpassIX, const std::span geometries) + { + auto retval = create(assMan,renderpass,subpassIX); + if (retval) + retval->addGeometries(geometries); + return retval; + } + // struct SInitParams { @@ -269,16 +230,145 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted Count }; - core::smart_refctd_ptr ds; + core::smart_refctd_ptr subAllocDS; core::smart_refctd_ptr layout; 
core::smart_refctd_ptr pipelines[PipelineType::Count]; - core::vector geoms; }; inline const SInitParams& getInitParams() const {return m_params;} // - inline auto& getGeometry(const uint32_t ix) {return m_params.geoms[ix];} - inline const auto& getGeometry(const uint32_t ix) const {return m_params.geoms[ix];} + inline bool addGeometries(const std::span geometries) + { + EXPOSE_NABLA_NAMESPACES; + if (geometries.empty()) + return false; + auto device = const_cast(m_params.layout->getOriginDevice()); + + core::vector writes; + core::vector infos; + auto allocateUTB = [&](const IGeometry::SDataView& view)->uint8_t + { + if (!view) + return SInstance::SPushConstants::DescriptorCount; + auto index = SubAllocatedDescriptorSet::invalid_value; + if (m_params.subAllocDS->multi_allocate(VertexAttrubUTBDescBinding,1,&index)!=0) + return SInstance::SPushConstants::DescriptorCount; + const auto retval = infos.size(); + infos.emplace_back().desc = device->createBufferView(view.src,view.composed.format); + writes.emplace_back() = { + .dstSet = m_params.subAllocDS->getDescriptorSet(), + .binding = VertexAttrubUTBDescBinding, + .arrayElement = index, + .count = 1, + .info = reinterpret_cast(retval) + }; + return retval; + }; + + auto sizeToSet = m_geoms.size(); + auto resetGeoms = core::makeRAIIExiter([&]()->void + { + for (auto& write : writes) + immediateDealloc(write.arrayElement); + m_geoms.resize(sizeToSet); + } + ); + for (const auto geom : geometries) + { + // could also check device origin on all buffers + if (!geom->valid()) + return false; + auto& out = m_geoms.emplace_back(); + using pipeline_e = SInitParams::PipelineType; + switch (geom->getIndexingCallback()->knownTopology()) + { + case E_PRIMITIVE_TOPOLOGY::EPT_TRIANGLE_FAN: + out.pipeline = m_params.pipelines[pipeline_e::BasicTriangleFan]; + break; + default: + out.pipeline = m_params.pipelines[pipeline_e::BasicTriangleList]; + break; + } + if (const auto& view=geom->getIndexView(); view) + { + 
out.indexBuffer.offset = view.src.offset; + out.indexBuffer.buffer = view.src.buffer; + switch (view.composed.format) + { + case E_FORMAT::EF_R16_UINT: + out.indexType = EIT_16BIT; + break; + case E_FORMAT::EF_R32_UINT: + out.indexType = EIT_32BIT; + break; + default: + return false; + } + } + out.elementCount = geom->getVertexReferenceCount(); + out.positionView = allocateUTB(geom->getPositionView()); + out.normalView = allocateUTB(geom->getNormalView()); + // the first view is usually the UV + if (const auto& auxViews = geom->getAuxAttributeViews(); !auxViews.empty()) + out.uvView = allocateUTB(auxViews.front()); + } + + // no geometry + if (infos.empty()) + return false; + + // unbase our pointers + for (auto& write : writes) + write.info = infos.data()+reinterpret_cast(write.info); + if (!device->updateDescriptorSets(writes,{})) + return false; + + // retain + writes.clear(); + sizeToSet = m_geoms.size(); + return true; + } + + // + inline void removeGeometry(const uint32_t ix, const video::ISemaphore::SWaitInfo& info) + { + EXPOSE_NABLA_NAMESPACES; + if (ix>=m_geoms.size()) + return; + + core::vector deferredFree; + deferredFree.reserve(3); + auto deallocate = [&](SubAllocatedDescriptorSet::value_type index)->void + { + if (info.semaphore) + deferredFree.push_back(index); + else + immediateDealloc(index); + }; + auto geo = m_geoms.begin() + ix; + deallocate(geo->positionView); + deallocate(geo->normalView); + deallocate(geo->uvView); + m_geoms.erase(geo); + + if (deferredFree.empty()) + return; + + core::vector nullify(deferredFree.size()); + const_cast(m_params.layout->getOriginDevice())->nullifyDescriptors(nullify); + } + + // + inline void clearGeometries(const video::ISemaphore::SWaitInfo& info) + { + // back to front to avoid O(n^2) resize + while (!m_geoms.empty()) + removeGeometry(m_geoms.size()-1,info); + } + + // + inline const auto& getGeometries() const {return m_geoms;} + inline auto& getGeometry(const uint32_t ix) {return m_geoms[ix];} // inline 
void render(video::IGPUCommandBuffer* cmdbuf, const SViewParams& viewParams) const @@ -288,7 +378,8 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted cmdbuf->beginDebugMarker("CSimpleDebugRenderer::render"); const auto* layout = m_params.layout.get(); - cmdbuf->bindDescriptorSets(E_PIPELINE_BIND_POINT::EPBP_GRAPHICS,layout,0,1,&m_params.ds.get()); + const auto ds = m_params.subAllocDS->getDescriptorSet(); + cmdbuf->bindDescriptorSets(E_PIPELINE_BIND_POINT::EPBP_GRAPHICS,layout,0,1,&ds); for (const auto& instance : m_instances) { @@ -311,8 +402,21 @@ class CSimpleDebugRenderer final : public core::IReferenceCounted protected: inline CSimpleDebugRenderer(SInitParams&& _params) : m_params(std::move(_params)) {} + inline ~CSimpleDebugRenderer() + { + // clean shutdown, can also make SubAllocatedDescriptorSet resilient against that, and issue `device->waitIdle` if not everything is freed + const_cast(m_params.layout->getOriginDevice())->waitIdle(); + clearGeometries({}); + } + + inline void immediateDealloc(video::SubAllocatedDescriptorSet::value_type index) + { + video::IGPUDescriptorSet::SDropDescriptorSet dummy[1]; + m_params.subAllocDS->multi_deallocate(dummy,VertexAttrubUTBDescBinding,1,&index); + } SInitParams m_params; + core::vector m_geoms; #undef EXPOSE_NABLA_NAMESPACES }; From eaa132075c5c8564723b07d28ce58bb3040b4dba Mon Sep 17 00:00:00 2001 From: devsh Date: Wed, 25 Jun 2025 15:50:26 +0200 Subject: [PATCH 292/296] prep the conversion --- 12_MeshLoaders/main.cpp | 123 +++++++++++++++++++++++++++++++++------- 1 file changed, 102 insertions(+), 21 deletions(-) diff --git a/12_MeshLoaders/main.cpp b/12_MeshLoaders/main.cpp index 0a4e20141..8c97cb44a 100644 --- a/12_MeshLoaders/main.cpp +++ b/12_MeshLoaders/main.cpp @@ -40,29 +40,14 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc m_qnc = make_smart_refctd_ptr(); m_qnc->loadCacheFromFile(m_system.get(),sharedOutputCWD/"../../tmp/normalCache888.sse"); + auto
scRes = static_cast(m_surface->getSwapchainResources()); + m_renderer = CSimpleDebugRenderer::create(m_assetMgr.get(),scRes->getRenderpass(),0,{}); + if (!m_renderer) + return logFail("Failed to create renderer!"); + // if (!reloadModel()) return false; -#if 0 - const uint32_t addtionalBufferOwnershipFamilies[] = {getGraphicsQueue()->getFamilyIndex()}; - // we want to use the vertex data through UTBs - using usage_f = IGPUBuffer::E_USAGE_FLAGS; - CAssetConverter::patch_t patch = {}; - patch.positionBufferUsages = usage_f::EUF_UNIFORM_TEXEL_BUFFER_BIT; - patch.indexBufferUsages = usage_f::EUF_INDEX_BUFFER_BIT; - patch.otherBufferUsages = usage_f::EUF_UNIFORM_TEXEL_BUFFER_BIT; - m_scene = CGeometryCreatorScene::create( - { - .transferQueue = getTransferUpQueue(), - .utilities = m_utils.get(), - .logger = m_logger.get(), - .addtionalBufferOwnershipFamilies = addtionalBufferOwnershipFamilies - },patch - ); -#endif - - auto scRes = static_cast(m_surface->getSwapchainResources()); - m_renderer = CSimpleDebugRenderer::create(m_assetMgr.get(),scRes->getRenderpass(),0,nullptr); camera.mapKeysToArrows(); @@ -250,6 +235,8 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc } // free up + m_renderer->m_instances.clear(); + m_renderer->clearGeometries({.semaphore=m_semaphore.get(),.value=m_realFrameIx}); m_assetMgr->clearAllAssetCache(); //! 
load the geometry @@ -258,10 +245,104 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc auto bundle = m_assetMgr->getAsset(m_modelPath,params); if (bundle.getContents().empty()) return false; + + // + core::vector> geometries; + switch (bundle.getAssetType()) + { + case IAsset::E_TYPE::ET_GEOMETRY: + for (const auto& item : bundle.getContents()) + if (auto polyGeo=IAsset::castDown(item); polyGeo) + geometries.push_back(polyGeo); + break; + default: + m_logger->log("Asset loaded but not a supported type (ET_GEOMETRY,ET_GEOMETRY_COLLECTION)",ILogger::ELL_ERROR); + break; + } + if (geometries.empty()) + return false; + //! cache results -- speeds up mesh generation on second run m_qnc->saveCacheToFile(m_system.get(),sharedOutputCWD/"../../tmp/normalCache888.sse"); + + // convert the geometries + { + smart_refctd_ptr converter = CAssetConverter::create({.device=m_device.get()}); - return true; + const auto transferFamily = getTransferUpQueue()->getFamilyIndex(); + + struct SInputs : CAssetConverter::SInputs + { + virtual inline std::span getSharedOwnershipQueueFamilies(const size_t groupCopyID, const asset::ICPUBuffer* buffer, const CAssetConverter::patch_t& patch) const + { + return sharedBufferOwnership; + } + + core::vector sharedBufferOwnership; + } inputs = {}; + core::vector> patches(geometries.size(),CSimpleDebugRenderer::DefaultPolygonGeometryPatch); + { + inputs.logger = m_logger.get(); + std::get>(inputs.assets) = {&geometries.front().get(),geometries.size()}; + std::get>(inputs.patches) = patches; + // set up shared ownership so we don't have to + core::unordered_set families; + families.insert(transferFamily); + families.insert(getGraphicsQueue()->getFamilyIndex()); + if (families.size()>1) + for (const auto fam : families) + inputs.sharedBufferOwnership.push_back(fam); + } + + // reserve + auto reservation = converter->reserve(inputs); + if (!reservation) + { + m_logger->log("Failed to reserve GPU objects for CPU->GPU 
conversion!",ILogger::ELL_ERROR); + return false; + } + + // convert + { + auto semaphore = m_device->createSemaphore(0u); + + constexpr auto MultiBuffering = 2; + std::array,MultiBuffering> commandBuffers = {}; + { + auto pool = m_device->createCommandPool(transferFamily,IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT|IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT); + pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,commandBuffers,smart_refctd_ptr(m_logger)); + } + commandBuffers.front()->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + + std::array commandBufferSubmits; + for (auto i=0; ilog("Failed to await submission feature!", ILogger::ELL_ERROR); + return false; + } + } + + const auto& converted = reservation.getGPUObjects(); + return m_renderer->addGeometries({&converted.front().get(),converted.size()}); + } } // Maximum frames which can be simultaneously submitted, used to cycle through our per-frame resources like command buffers From 5929be13ea1bfbb1d04bbe6a39321d519a3cbf92 Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 26 Jun 2025 17:52:52 +0200 Subject: [PATCH 293/296] just some todo markup --- 12_MeshLoaders/main.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/12_MeshLoaders/main.cpp b/12_MeshLoaders/main.cpp index 8c97cb44a..14da9f0d6 100644 --- a/12_MeshLoaders/main.cpp +++ b/12_MeshLoaders/main.cpp @@ -125,8 +125,6 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc } cb->end(); - //updateGUIDescriptorSet(); - IQueue::SSubmitInfo::SSemaphoreInfo retval = { .semaphore = m_semaphore.get(), @@ -341,8 +339,15 @@ class MeshLoadersApp final : public MonoWindowApplication, public BuiltinResourc } const auto& converted = reservation.getGPUObjects(); - return m_renderer->addGeometries({&converted.front().get(),converted.size()}); + if (!m_renderer->addGeometries({ &converted.front().get(),converted.size() })) + return false; } + +// TODO: get scene bounds and 
reset camera + + // TODO: write out the geometry + + return true; } // Maximum frames which can be simultaneously submitted, used to cycle through our per-frame resources like command buffers From 52c1aa54cf859b63a8ff6df648f743003e5e13fe Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 27 Jun 2025 10:18:40 +0700 Subject: [PATCH 294/296] ready changes for mesh_loaders merge, requires examples.hpp from mesh_loaders --- .../include/nbl/this_example/common.hpp | 2 +- 31_HLSLPathTracer/main.cpp | 34 +++++++++---------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/31_HLSLPathTracer/include/nbl/this_example/common.hpp b/31_HLSLPathTracer/include/nbl/this_example/common.hpp index ff3dd8095..b08656eee 100644 --- a/31_HLSLPathTracer/include/nbl/this_example/common.hpp +++ b/31_HLSLPathTracer/include/nbl/this_example/common.hpp @@ -6,7 +6,7 @@ // common api #include "CCamera.hpp" #include "SimpleWindowedApplication.hpp" -#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" +#include "nbl/examples/examples.hpp" #include "CEventCallback.hpp" // example's own headers diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp index 0dc5fc053..6b4cad224 100644 --- a/31_HLSLPathTracer/main.cpp +++ b/31_HLSLPathTracer/main.cpp @@ -23,10 +23,10 @@ struct PTPushConstant { // TODO: Add a QueryPool for timestamping once its ready // TODO: Do buffer creation using assConv -class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication +class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, public examples::BuiltinResourcesApplication { using device_base_t = examples::SimpleWindowedApplication; - using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; + using asset_base_t = examples::BuiltinResourcesApplication; using clock_t = std::chrono::steady_clock; enum 
E_LIGHT_GEOMETRY : uint8_t @@ -323,7 +323,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, m_presentDescriptorSet = presentDSPool->createDescriptorSet(gpuPresentDescriptorSetLayout); // Create Shaders - auto loadAndCompileGLSLShader = [&](const std::string& pathToShader, bool persistentWorkGroups = false) -> smart_refctd_ptr + auto loadAndCompileGLSLShader = [&](const std::string& pathToShader, bool persistentWorkGroups = false) -> smart_refctd_ptr { IAssetLoader::SAssetLoadParams lp = {}; lp.workingDirectory = localInputCWD; @@ -335,7 +335,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, std::exit(-1); } - auto source = IAsset::castDown(assets[0]); + auto source = smart_refctd_ptr_static_cast(assets[0]); // The down-cast should not fail! assert(source); @@ -361,7 +361,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, source = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options); // this time we skip the use of the asset converter since the ICPUShader->IGPUShader path is quick and simple - auto shader = m_device->createShader(source.get()); + auto shader = m_device->compileShader({ source.get(), nullptr, nullptr, nullptr }); if (!shader) { m_logger->log("GLSL shader creationed failed: %s!", ILogger::ELL_ERROR, pathToShader); @@ -371,7 +371,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, return shader; }; - auto loadAndCompileHLSLShader = [&](const std::string& pathToShader, const std::string& defineMacro = "", bool persistentWorkGroups = false) -> smart_refctd_ptr + auto loadAndCompileHLSLShader = [&](const std::string& pathToShader, const std::string& defineMacro = "", bool persistentWorkGroups = false) -> smart_refctd_ptr { IAssetLoader::SAssetLoadParams lp = {}; lp.workingDirectory = localInputCWD; @@ -383,7 +383,7 @@ class HLSLComputePathtracer final : public 
examples::SimpleWindowedApplication, std::exit(-1); } - auto source = IAsset::castDown(assets[0]); + auto source = smart_refctd_ptr_static_cast(assets[0]); // The down-cast should not fail! assert(source); @@ -410,7 +410,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, source = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options); - auto shader = m_device->createShader(source.get()); + auto shader = m_device->compileShader({ source.get(), nullptr, nullptr, nullptr }); if (!shader) { m_logger->log("HLSL shader creationed failed: %s!", ILogger::ELL_ERROR, pathToShader); @@ -447,8 +447,8 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, params.shader.shader = ptShader.get(); params.shader.entryPoint = "main"; params.shader.entries = nullptr; - params.shader.requireFullSubgroups = true; - params.shader.requiredSubgroupSize = static_cast(5); + params.cached.requireFullSubgroups = true; + params.shader.requiredSubgroupSize = static_cast(5); if (!m_device->createComputePipelines(nullptr, { &params, 1 }, m_PTGLSLPipelines.data() + index)) return logFail("Failed to create GLSL compute pipeline!\n"); } @@ -460,8 +460,8 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, params.shader.shader = ptShader.get(); params.shader.entryPoint = "main"; params.shader.entries = nullptr; - params.shader.requireFullSubgroups = true; - params.shader.requiredSubgroupSize = static_cast(5); + params.cached.requireFullSubgroups = true; + params.shader.requiredSubgroupSize = static_cast(5); if (!m_device->createComputePipelines(nullptr, { &params, 1 }, m_PTHLSLPipelines.data() + index)) return logFail("Failed to create HLSL compute pipeline!\n"); } @@ -475,8 +475,8 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, params.shader.shader = ptShader.get(); params.shader.entryPoint = "main"; params.shader.entries = nullptr; -
params.shader.requireFullSubgroups = true; - params.shader.requiredSubgroupSize = static_cast(5); + params.cached.requireFullSubgroups = true; + params.shader.requiredSubgroupSize = static_cast(5); if (!m_device->createComputePipelines(nullptr, { &params, 1 }, m_PTGLSLPersistentWGPipelines.data() + index)) return logFail("Failed to create GLSL PersistentWG compute pipeline!\n"); } @@ -488,8 +488,8 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, params.shader.shader = ptShader.get(); params.shader.entryPoint = "main"; params.shader.entries = nullptr; - params.shader.requireFullSubgroups = true; - params.shader.requiredSubgroupSize = static_cast(5); + params.cached.requireFullSubgroups = true; + params.shader.requiredSubgroupSize = static_cast(5); if (!m_device->createComputePipelines(nullptr, { &params, 1 }, m_PTHLSLPersistentWGPipelines.data() + index)) return logFail("Failed to create HLSL PersistentWG compute pipeline!\n"); } @@ -508,7 +508,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, if (!fragmentShader) return logFail("Failed to Load and Compile Fragment Shader: lumaMeterShader!"); - const IGPUShader::SSpecInfo fragSpec = { + const IGPUPipelineBase::SShaderSpecInfo fragSpec = { .entryPoint = "main", .shader = fragmentShader.get() }; From c43c93b75a11870cacef0ad16bc2a8bdf40ae0e3 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 27 Jun 2025 11:26:29 +0700 Subject: [PATCH 295/296] cpp fixes so it compiles at least --- .../include/nbl/this_example/common.hpp | 6 +++--- 31_HLSLPathTracer/main.cpp | 17 +++++++++-------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/31_HLSLPathTracer/include/nbl/this_example/common.hpp b/31_HLSLPathTracer/include/nbl/this_example/common.hpp index b08656eee..db051bb3e 100644 --- a/31_HLSLPathTracer/include/nbl/this_example/common.hpp +++ b/31_HLSLPathTracer/include/nbl/this_example/common.hpp @@ -4,10 +4,10 @@ #include // common api -#include
"CCamera.hpp" -#include "SimpleWindowedApplication.hpp" +#include "nbl/examples/common/SimpleWindowedApplication.hpp" #include "nbl/examples/examples.hpp" -#include "CEventCallback.hpp" +#include "nbl/examples/cameras/CCamera.hpp" +#include "nbl/examples/common/CEventCallback.hpp" // example's own headers #include "nbl/ui/ICursorControl.h" diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp index 6b4cad224..576a4c7b0 100644 --- a/31_HLSLPathTracer/main.cpp +++ b/31_HLSLPathTracer/main.cpp @@ -14,6 +14,7 @@ using namespace system; using namespace asset; using namespace ui; using namespace video; +using namespace nbl::examples; struct PTPushConstant { matrix4SIMD invMVP; @@ -23,10 +24,10 @@ struct PTPushConstant { // TODO: Add a QueryPool for timestamping once its ready // TODO: Do buffer creation using assConv -class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, public examples::BuiltinResourcesApplication +class HLSLComputePathtracer final : public SimpleWindowedApplication, public BuiltinResourcesApplication { - using device_base_t = examples::SimpleWindowedApplication; - using asset_base_t = examples::BuiltinResourcesApplication; + using device_base_t = SimpleWindowedApplication; + using asset_base_t = BuiltinResourcesApplication; using clock_t = std::chrono::steady_clock; enum E_LIGHT_GEOMETRY : uint8_t @@ -91,7 +92,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, if (!m_surface) { { - auto windowCallback = core::make_smart_refctd_ptr(smart_refctd_ptr(m_inputSystem), smart_refctd_ptr(m_logger)); + auto windowCallback = core::make_smart_refctd_ptr(smart_refctd_ptr(m_inputSystem), smart_refctd_ptr(m_logger)); IWindow::SCreationParams params = {}; params.callback = core::make_smart_refctd_ptr(); params.width = WindowDimensions.x; @@ -118,7 +119,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, { // Init systems { - m_inputSystem = 
make_smart_refctd_ptr(logger_opt_smart_ptr(smart_refctd_ptr(m_logger))); + m_inputSystem = make_smart_refctd_ptr(logger_opt_smart_ptr(smart_refctd_ptr(m_logger))); // Remember to call the base class initialization! if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) @@ -509,8 +510,8 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, return logFail("Failed to Load and Compile Fragment Shader: lumaMeterShader!"); const IGPUPipelineBase::SShaderSpecInfo fragSpec = { - .entryPoint = "main", - .shader = fragmentShader.get() + .shader = fragmentShader.get(), + .entryPoint = "main" }; auto presentLayout = m_device->createPipelineLayout( @@ -1381,7 +1382,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication, // system resources core::smart_refctd_ptr m_inputSystem; - InputSystem::ChannelReader mouse; + InputSystem::ChannelReader mouse; InputSystem::ChannelReader keyboard; // pathtracer resources From 8b31859520069831b246d13270b43b97aea83141 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 27 Jun 2025 14:53:17 +0700 Subject: [PATCH 296/296] a bazillion fixes since last time bxdf usages changed --- .../app_resources/hlsl/material_system.hlsl | 84 +++++++++++++------ .../hlsl/next_event_estimator.hlsl | 6 +- .../app_resources/hlsl/pathtracer.hlsl | 41 ++++----- .../app_resources/hlsl/render.comp.hlsl | 11 ++- 31_HLSLPathTracer/main.cpp | 7 -- 5 files changed, 87 insertions(+), 62 deletions(-) diff --git a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl index feffee9ef..4e2fdc5a0 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/material_system.hlsl @@ -14,22 +14,6 @@ namespace ext namespace MaterialSystem { -// struct Material -// { -// enum Type : uint32_t // enum class? 
-// { -// DIFFUSE, -// CONDUCTOR, -// DIELECTRIC -// }; - -// NBL_CONSTEXPR_STATIC_INLINE uint32_t DataSize = 1; - -// uint32_t type : 2; -// uint32_t unused : 30; // possible space for flags -// uint32_t data[DataSize]; -// }; - enum MaterialType : uint32_t // enum class? { DIFFUSE, @@ -37,6 +21,52 @@ enum MaterialType : uint32_t // enum class? DIELECTRIC }; +template +struct MaterialParams +{ + using this_t = MaterialParams; + using sample_type = typename DiffuseBxDF::sample_type; + using anisotropic_interaction_type = typename DiffuseBxDF::anisotropic_interaction_type; + using isotropic_interaction_type = typename anisotropic_interaction_type::isotropic_interaction_type; + using anisocache_type = typename ConductorBxDF::anisocache_type; + using isocache_type = typename anisocache_type::isocache_type; + + using diffuse_params_type = typename DiffuseBxDF::params_isotropic_t; + using conductor_params_type = typename ConductorBxDF::params_isotropic_t; + using dielectric_params_type = typename DielectricBxDF::params_isotropic_t; + + // we're only doing isotropic for this example + static this_t create(sample_type _sample, isotropic_interaction_type _interaction, isocache_type _cache, bxdf::BxDFClampMode _clamp) + { + this_t retval; + retval._Sample = _sample; + retval.interaction = _interaction; + retval.cache = _cache; + retval.clampMode = _clamp; + return retval; + } + + diffuse_params_type getDiffuseParams() + { + return diffuse_params_type::create(_Sample, interaction, clampMode); + } + + conductor_params_type getConductorParams() + { + return conductor_params_type::create(_Sample, interaction, cache, clampMode); + } + + dielectric_params_type getDielectricParams() + { + return dielectric_params_type::create(_Sample, interaction, cache, clampMode); + } + + sample_type _Sample; + isotropic_interaction_type interaction; + isocache_type cache; + bxdf::BxDFClampMode clampMode; +}; + template // NOTE: these bxdfs should match the ones in Scene BxDFNode struct System { 
@@ -48,9 +78,11 @@ struct System using sample_type = typename DiffuseBxDF::sample_type; using ray_dir_info_type = typename sample_type::ray_dir_info_type; using quotient_pdf_type = typename DiffuseBxDF::quotient_pdf_type; - using anisotropic_type = typename DiffuseBxDF::anisotropic_type; + using anisotropic_interaction_type = typename DiffuseBxDF::anisotropic_interaction_type; + using isotropic_interaction_type = typename anisotropic_interaction_type::isotropic_interaction_type; using anisocache_type = typename ConductorBxDF::anisocache_type; - using params_t = bxdf::SBxDFParams; + using isocache_type = typename anisocache_type::isocache_type; + using params_t = MaterialParams; using create_params_t = bxdf::SBxDFCreationParams; using diffuse_op_type = DiffuseBxDF; @@ -73,19 +105,19 @@ struct System case MaterialType::DIFFUSE: { diffuseBxDF.init(cparams); - return (measure_type)diffuseBxDF.eval(params); + return (measure_type)diffuseBxDF.eval(params.getDiffuseParams()); } break; case MaterialType::CONDUCTOR: { conductorBxDF.init(cparams); - return conductorBxDF.eval(params); + return conductorBxDF.eval(params.getConductorParams()); } break; case MaterialType::DIELECTRIC: { dielectricBxDF.init(cparams); - return dielectricBxDF.eval(params); + return dielectricBxDF.eval(params.getDielectricParams()); } break; default: @@ -93,7 +125,7 @@ struct System } } - sample_type generate(uint32_t material, NBL_CONST_REF_ARG(create_params_t) cparams, NBL_CONST_REF_ARG(anisotropic_type) interaction, NBL_CONST_REF_ARG(vector3_type) u, NBL_REF_ARG(anisocache_type) _cache) + sample_type generate(uint32_t material, NBL_CONST_REF_ARG(create_params_t) cparams, NBL_CONST_REF_ARG(anisotropic_interaction_type) interaction, NBL_CONST_REF_ARG(vector3_type) u, NBL_REF_ARG(anisocache_type) _cache) { switch(material) { @@ -131,26 +163,26 @@ struct System quotient_pdf_type quotient_and_pdf(uint32_t material, NBL_CONST_REF_ARG(create_params_t) cparams, NBL_CONST_REF_ARG(params_t) params) { const 
float minimumProjVectorLen = 0.00000001; - if (params.NdotV > minimumProjVectorLen && params.NdotL > minimumProjVectorLen) + if (params.interaction.getNdotV() > minimumProjVectorLen && params._Sample.getNdotL() > minimumProjVectorLen) { switch(material) { case MaterialType::DIFFUSE: { diffuseBxDF.init(cparams); - return diffuseBxDF.quotient_and_pdf(params); + return diffuseBxDF.quotient_and_pdf(params.getDiffuseParams()); } break; case MaterialType::CONDUCTOR: { conductorBxDF.init(cparams); - return conductorBxDF.quotient_and_pdf(params); + return conductorBxDF.quotient_and_pdf(params.getConductorParams()); } break; case MaterialType::DIELECTRIC: { dielectricBxDF.init(cparams); - return dielectricBxDF.quotient_and_pdf(params); + return dielectricBxDF.quotient_and_pdf(params.getDielectricParams()); } break; default: diff --git a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl index 51c018ac5..ac74b1abf 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl @@ -294,7 +294,7 @@ struct Estimator using light_type = typename Scene::light_type; using spectral_type = typename light_type::spectral_type; using interaction_type = Aniso; - using quotient_pdf_type = bxdf::quotient_and_pdf; + using quotient_pdf_type = sampling::quotient_and_pdf; using sample_type = LightSample; using ray_dir_info_type = typename sample_type::ray_dir_info_type; @@ -346,7 +346,7 @@ struct Estimator; + using quotient_pdf_type = sampling::quotient_and_pdf; using sample_type = LightSample; using ray_dir_info_type = typename sample_type::ray_dir_info_type; @@ -397,7 +397,7 @@ struct Estimator; + using quotient_pdf_type = sampling::quotient_and_pdf; using sample_type = LightSample; using ray_dir_info_type = typename sample_type::ray_dir_info_type; diff --git a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl 
b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl index f5d5206dc..add1eb8a9 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/pathtracer.hlsl @@ -58,8 +58,8 @@ struct Unidirectional using ray_type = typename RayGen::ray_type; using light_type = Light; using bxdfnode_type = BxDFNode; - using anisotropic_type = typename MaterialSystem::anisotropic_type; - using isotropic_type = typename anisotropic_type::isotropic_type; + using anisotropic_interaction_type = typename MaterialSystem::anisotropic_interaction_type; + using isotropic_interaction_type = typename anisotropic_interaction_type::isotropic_interaction_type; using anisocache_type = typename MaterialSystem::anisocache_type; using isocache_type = typename anisocache_type::isocache_type; using quotient_pdf_type = typename NextEventEstimator::quotient_pdf_type; @@ -100,8 +100,8 @@ struct Unidirectional const vector3_type intersection = ray.origin + ray.direction * ray.intersectionT; uint32_t bsdfLightIDs; - anisotropic_type interaction; - isotropic_type iso_interaction; + anisotropic_interaction_type interaction; + isotropic_interaction_type iso_interaction; uint32_t mode = objectID.mode; switch (mode) { @@ -116,8 +116,8 @@ struct Unidirectional N = nbl::hlsl::normalize(N); ray_dir_info_type V; V.direction = -ray.direction; - isotropic_type iso_interaction = isotropic_type::create(V, N); - interaction = anisotropic_type::create(iso_interaction); + isotropic_interaction_type iso_interaction = isotropic_interaction_type::create(V, N); + interaction = anisotropic_interaction_type::create(iso_interaction); } break; default: @@ -142,9 +142,9 @@ struct Unidirectional // TODO: ifdef kill diffuse specular paths - const bool isBSDF = (bxdf.materialType == ext::MaterialSystem::MaterialType::DIFFUSE) ? bxdf_traits::type == BT_BSDF : - (bxdf.materialType == ext::MaterialSystem::MaterialType::CONDUCTOR) ? 
bxdf_traits::type == BT_BSDF : - bxdf_traits::type == BT_BSDF; + const bool isBSDF = (bxdf.materialType == ext::MaterialSystem::MaterialType::DIFFUSE) ? bxdf::traits::type == bxdf::BT_BSDF : + (bxdf.materialType == ext::MaterialSystem::MaterialType::CONDUCTOR) ? bxdf::traits::type == bxdf::BT_BSDF : + bxdf::traits::type == bxdf::BT_BSDF; vector3_type eps0 = rand3d(depth, _sample, 0u); vector3_type eps1 = rand3d(depth, _sample, 1u); @@ -171,24 +171,25 @@ struct Unidirectional ); // We don't allow non watertight transmitters in this renderer - bool validPath = nee_sample.NdotL > numeric_limits::min; + bool validPath = nee_sample.getNdotL() > numeric_limits::min; // but if we allowed non-watertight transmitters (single water surface), it would make sense just to apply this line by itself - anisocache_type _cache; - validPath = validPath && anisocache_type::template compute(_cache, interaction, nee_sample, monochromeEta); + bxdf::fresnel::OrientedEtas orientedEta = bxdf::fresnel::OrientedEtas::create(interaction.getNdotV(), monochromeEta); + anisocache_type _cache = anisocache_type::template create(interaction, nee_sample, orientedEta); + validPath = validPath && _cache.getNdotH() >= 0.0; bxdf.params.eta = monochromeEta; if (neeContrib_pdf.pdf < numeric_limits::max) { - if (nbl::hlsl::any(isnan(nee_sample.L.direction))) + if (nbl::hlsl::any(isnan(nee_sample.getL().getDirection()))) ray.payload.accumulation += vector3_type(1000.f, 0.f, 0.f); - else if (nbl::hlsl::all((vector3_type)69.f == nee_sample.L.direction)) + else if (nbl::hlsl::all((vector3_type)69.f == nee_sample.getL().getDirection())) ray.payload.accumulation += vector3_type(0.f, 1000.f, 0.f); else if (validPath) { bxdf::BxDFClampMode _clamp; _clamp = (bxdf.materialType == ext::MaterialSystem::MaterialType::DIELECTRIC) ? 
bxdf::BxDFClampMode::BCM_ABS : bxdf::BxDFClampMode::BCM_MAX; // example only uses isotropic bxdfs - params_type params = params_type::template create(nee_sample, interaction.isotropic, _cache.iso_cache, _clamp); + params_type params = params_type::create(nee_sample, interaction.isotropic, _cache.iso_cache, _clamp); quotient_pdf_type bsdf_quotient_pdf = materialSystem.quotient_and_pdf(bxdf.materialType, bxdf.params, params); neeContrib_pdf.quotient *= bxdf.albedo * throughput * bsdf_quotient_pdf.quotient; @@ -200,8 +201,8 @@ struct Unidirectional // neeContrib_pdf.quotient *= otherGenOverChoice; ray_type nee_ray; - nee_ray.origin = intersection + nee_sample.L.direction * t * Tolerance::getStart(depth); - nee_ray.direction = nee_sample.L.direction; + nee_ray.origin = intersection + nee_sample.getL().getDirection() * t * Tolerance::getStart(depth); + nee_ray.direction = nee_sample.getL().getDirection(); nee_ray.intersectionT = t; if (bsdf_quotient_pdf.pdf < numeric_limits::max && getLuma(neeContrib_pdf.quotient) > lumaContributionThreshold && intersector_type::traceRay(nee_ray, scene).id == -1) ray.payload.accumulation += neeContrib_pdf.quotient; @@ -221,13 +222,13 @@ struct Unidirectional bxdf::BxDFClampMode _clamp; _clamp = (bxdf.materialType == ext::MaterialSystem::MaterialType::DIELECTRIC) ? 
bxdf::BxDFClampMode::BCM_ABS : bxdf::BxDFClampMode::BCM_MAX; // example only uses isotropic bxdfs - params_type params = params_type::template create(bsdf_sample, interaction.isotropic, _cache.iso_cache, _clamp); + params_type params = params_type::create(bsdf_sample, interaction.isotropic, _cache.iso_cache, _clamp); // the value of the bsdf divided by the probability of the sample being generated quotient_pdf_type bsdf_quotient_pdf = materialSystem.quotient_and_pdf(bxdf.materialType, bxdf.params, params); throughput *= bxdf.albedo * bsdf_quotient_pdf.quotient; bxdfPdf = bsdf_quotient_pdf.pdf; - bxdfSample = bsdf_sample.L.direction; + bxdfSample = bsdf_sample.getL().getDirection(); } // additional threshold @@ -243,7 +244,7 @@ struct Unidirectional ray.direction = bxdfSample; if ((PTPolygonMethod)nee_type::PolygonMethod == PPM_APPROX_PROJECTED_SOLID_ANGLE) { - ray.normalAtOrigin = interaction.isotropic.N; + ray.normalAtOrigin = interaction.getN(); ray.wasBSDFAtOrigin = isBSDF; } return true; diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl index 81736f508..a40eb3dd0 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl @@ -74,18 +74,17 @@ float32_t2 getTexCoords() using ray_dir_info_t = bxdf::ray_dir_info::SBasic; using iso_interaction = bxdf::surface_interactions::SIsotropic; -using aniso_interaction = bxdf::surface_interactions::SAnisotropic; +using aniso_interaction = bxdf::surface_interactions::SAnisotropic; using sample_t = bxdf::SLightSample; using iso_cache = bxdf::SIsotropicMicrofacetCache; -using aniso_cache = bxdf::SAnisotropicMicrofacetCache; -using quotient_pdf_t = bxdf::quotient_and_pdf; +using aniso_cache = bxdf::SAnisotropicMicrofacetCache; +using quotient_pdf_t = sampling::quotient_and_pdf; using spectral_t = vector; -using params_t = bxdf::SBxDFParams; using create_params_t = bxdf::SBxDFCreationParams; 
using diffuse_bxdf_type = bxdf::reflection::SOrenNayarBxDF; -using conductor_bxdf_type = bxdf::reflection::SGGXBxDF; -using dielectric_bxdf_type = bxdf::transmission::SGGXDielectricBxDF; +using conductor_bxdf_type = bxdf::reflection::SGGXBxDF; +using dielectric_bxdf_type = bxdf::transmission::SGGXDielectricBxDF; using ray_type = ext::Ray; using light_type = ext::Light; diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp index 576a4c7b0..2e139af8d 100644 --- a/31_HLSLPathTracer/main.cpp +++ b/31_HLSLPathTracer/main.cpp @@ -80,13 +80,6 @@ class HLSLComputePathtracer final : public SimpleWindowedApplication, public Bui inline bool isComputeOnly() const override { return false; } - //inline video::IAPIConnection::SFeatures getAPIFeaturesToEnable() override - //{ - // auto retval = device_base_t::getAPIFeaturesToEnable(); - // retval.synchronizationValidation = true; - // return retval; - //} - inline core::vector getSurfaces() const override { if (!m_surface)