From f2ea51d0b3e3388c0f9bae03602ec3b1f658c124 Mon Sep 17 00:00:00 2001 From: Fletterio Date: Sun, 23 Mar 2025 19:29:49 -0300 Subject: [PATCH 1/9] Morton code tests --- CMakeLists.txt | 3 +- XX_Mortons/CMakeLists.txt | 24 ++++++++++ XX_Mortons/app_resources/shader.hlsl | 7 +++ XX_Mortons/config.json.template | 28 +++++++++++ XX_Mortons/main.cpp | 69 ++++++++++++++++++++++++++++ XX_Mortons/pipeline.groovy | 50 ++++++++++++++++++++ 6 files changed, 180 insertions(+), 1 deletion(-) create mode 100644 XX_Mortons/CMakeLists.txt create mode 100644 XX_Mortons/app_resources/shader.hlsl create mode 100644 XX_Mortons/config.json.template create mode 100644 XX_Mortons/main.cpp create mode 100644 XX_Mortons/pipeline.groovy diff --git a/CMakeLists.txt b/CMakeLists.txt index fb03f95a4..7fcddfc18 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -95,7 +95,8 @@ if(NBL_BUILD_EXAMPLES) add_subdirectory(67_RayQueryGeometry EXCLUDE_FROM_ALL) add_subdirectory(68_JpegLoading EXCLUDE_FROM_ALL) - add_subdirectory(70_FLIPFluids EXCLUDE_FROM_ALL) + add_subdirectory(70_FLIPFluids EXCLUDE_FROM_ALL) + add_subdirectory(XX_Mortons EXCLUDE_FROM_ALL) NBL_HOOK_COMMON_API("${NBL_COMMON_API_TARGETS}") endif() diff --git a/XX_Mortons/CMakeLists.txt b/XX_Mortons/CMakeLists.txt new file mode 100644 index 000000000..a434ff32a --- /dev/null +++ b/XX_Mortons/CMakeLists.txt @@ -0,0 +1,24 @@ +include(common RESULT_VARIABLE RES) +if(NOT RES) + message(FATAL_ERROR "common.cmake not found. 
Should be in {repo_root}/cmake directory") +endif() + +nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") + +if(NBL_EMBED_BUILTIN_RESOURCES) + set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData) + set(RESOURCE_DIR "app_resources") + + get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE) + + file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*") + foreach(RES_FILE ${BUILTIN_RESOURCE_FILES}) + LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}") + endforeach() + + ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}") + + LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_}) +endif() \ No newline at end of file diff --git a/XX_Mortons/app_resources/shader.hlsl b/XX_Mortons/app_resources/shader.hlsl new file mode 100644 index 000000000..a24a78191 --- /dev/null +++ b/XX_Mortons/app_resources/shader.hlsl @@ -0,0 +1,7 @@ +#include "nbl/builtin/hlsl/math/morton.hlsl" + +[numthreads(512, 1, 1)] +void main(uint32_t3 ID : SV_DispatchThreadID) +{ + printf("%d %d", nbl::hlsl::morton::impl::decode_masks_array::Masks[0], nbl::hlsl::morton::impl::decode_masks_array::Masks[1]); +} \ No newline at end of file diff --git a/XX_Mortons/config.json.template b/XX_Mortons/config.json.template new file mode 100644 index 000000000..717d05d53 --- /dev/null +++ b/XX_Mortons/config.json.template @@ -0,0 +1,28 @@ +{ + "enableParallelBuild": true, + "threadsPerBuildProcess" : 2, + "isExecuted": false, + "scriptPath": "", + "cmake": { + "configurations": [ "Release", 
"Debug", "RelWithDebInfo" ], + "buildModes": [], + "requiredOptions": [] + }, + "profiles": [ + { + "backend": "vulkan", // should be none + "platform": "windows", + "buildModes": [], + "runConfiguration": "Release", // we also need to run in Debug nad RWDI because foundational example + "gpuArchitectures": [] + } + ], + "dependencies": [], + "data": [ + { + "dependencies": [], + "command": [""], + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/XX_Mortons/main.cpp b/XX_Mortons/main.cpp new file mode 100644 index 000000000..881c84417 --- /dev/null +++ b/XX_Mortons/main.cpp @@ -0,0 +1,69 @@ +// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + + +// I've moved out a tiny part of this example into a shared header for reuse, please open and read it. +#include "nbl/application_templates/MonoDeviceApplication.hpp" +#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" + +#include "nbl/builtin/hlsl/math/morton.hlsl" +#include + +using namespace nbl; +using namespace core; +using namespace system; +using namespace asset; +using namespace video; + + +// this time instead of defining our own `int main()` we derive from `nbl::system::IApplicationFramework` to play "nice" wil all platforms +class MortonTestApp final : public application_templates::MonoDeviceApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication +{ + using device_base_t = application_templates::MonoDeviceApplication; + using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; + + inline core::smart_refctd_ptr createShader( + const char* includeMainName) + { + std::string prelude = "#include \""; + auto CPUShader = core::make_smart_refctd_ptr((prelude + includeMainName + "\"\n").c_str(), IShader::E_SHADER_STAGE::ESS_COMPUTE, IShader::E_CONTENT_TYPE::ECT_HLSL, 
includeMainName); + assert(CPUShader); + return m_device->createShader(CPUShader.get()); + } + public: + MortonTestApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : + system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} + + // we stuff all our work here because its a "single shot" app + bool onAppInitialized(smart_refctd_ptr&& system) override + { + // Remember to call the base class initialization! + if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + if (!asset_base_t::onAppInitialized(std::move(system))) + return false; + + createShader("app_resources/shader.hlsl"); + + const auto masksArray = hlsl::morton::impl::decode_masks_array::Masks; + for (auto i = 0u; i < 3; i++) + { + std::cout << std::bitset<32>(masksArray[i]) << std::endl; + } + + return true; + } + + // Platforms like WASM expect the main entry point to periodically return control, hence if you want a crossplatform app, you have to let the framework deal with your "game loop" + void workLoopBody() override {} + + // Whether to keep invoking the above. In this example because its headless GPU compute, we do all the work in the app initialization. 
+ bool keepRunning() override {return false;} + + private: + smart_refctd_ptr m_api; +}; + + +NBL_MAIN_FUNC(MortonTestApp) \ No newline at end of file diff --git a/XX_Mortons/pipeline.groovy b/XX_Mortons/pipeline.groovy new file mode 100644 index 000000000..1a7b043a4 --- /dev/null +++ b/XX_Mortons/pipeline.groovy @@ -0,0 +1,50 @@ +import org.DevshGraphicsProgramming.Agent +import org.DevshGraphicsProgramming.BuilderInfo +import org.DevshGraphicsProgramming.IBuilder + +class CStreamingAndBufferDeviceAddressBuilder extends IBuilder +{ + public CStreamingAndBufferDeviceAddressBuilder(Agent _agent, _info) + { + super(_agent, _info) + } + + @Override + public boolean prepare(Map axisMapping) + { + return true + } + + @Override + public boolean build(Map axisMapping) + { + IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION") + IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE") + + def nameOfBuildDirectory = getNameOfBuildDirectory(buildType) + def nameOfConfig = getNameOfConfig(config) + + agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v") + + return true + } + + @Override + public boolean test(Map axisMapping) + { + return true + } + + @Override + public boolean install(Map axisMapping) + { + return true + } +} + +def create(Agent _agent, _info) +{ + return new CStreamingAndBufferDeviceAddressBuilder(_agent, _info) +} + +return this \ No newline at end of file From 8f4e4529ca6f31ace6498cf9ac4284c14dbdf652 Mon Sep 17 00:00:00 2001 From: Fletterio Date: Tue, 25 Mar 2025 10:44:31 -0300 Subject: [PATCH 2/9] Morton codes creating properly --- XX_Mortons/app_resources/common.hlsl | 10 ++ XX_Mortons/app_resources/shader.hlsl | 15 +- XX_Mortons/main.cpp | 241 ++++++++++++++++++++++++++- 3 files changed, 259 insertions(+), 7 deletions(-) create mode 100644 XX_Mortons/app_resources/common.hlsl diff --git 
a/XX_Mortons/app_resources/common.hlsl b/XX_Mortons/app_resources/common.hlsl new file mode 100644 index 000000000..3a9fca3fa --- /dev/null +++ b/XX_Mortons/app_resources/common.hlsl @@ -0,0 +1,10 @@ +#include "nbl/builtin/hlsl/math/morton.hlsl" + +NBL_CONSTEXPR uint32_t bufferSize = 256; +using scalar_t = int32_t; +using unsigned_scalar_t = nbl::hlsl::make_unsigned_t; + +struct PushConstantData +{ + uint64_t deviceBufferAddress; +}; \ No newline at end of file diff --git a/XX_Mortons/app_resources/shader.hlsl b/XX_Mortons/app_resources/shader.hlsl index a24a78191..d1f7c967e 100644 --- a/XX_Mortons/app_resources/shader.hlsl +++ b/XX_Mortons/app_resources/shader.hlsl @@ -1,7 +1,16 @@ -#include "nbl/builtin/hlsl/math/morton.hlsl" +#include "app_resources/common.hlsl" +#include "nbl/builtin/hlsl/bda/legacy_bda_accessor.hlsl" -[numthreads(512, 1, 1)] +[[vk::push_constant]] PushConstantData pushConstants; + +using namespace nbl::hlsl; + +[numthreads(bufferSize, 1, 1)] void main(uint32_t3 ID : SV_DispatchThreadID) { - printf("%d %d", nbl::hlsl::morton::impl::decode_masks_array::Masks[0], nbl::hlsl::morton::impl::decode_masks_array::Masks[1]); + LegacyBdaAccessor accessor = LegacyBdaAccessor::create(pushConstants.deviceBufferAddress); + + morton::code foo = morton::code::create(vector(-32768, -1)); + + accessor.set(0, foo.value); } \ No newline at end of file diff --git a/XX_Mortons/main.cpp b/XX_Mortons/main.cpp index 881c84417..860b581d2 100644 --- a/XX_Mortons/main.cpp +++ b/XX_Mortons/main.cpp @@ -7,7 +7,7 @@ #include "nbl/application_templates/MonoDeviceApplication.hpp" #include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" -#include "nbl/builtin/hlsl/math/morton.hlsl" +#include "app_resources/common.hlsl" #include using namespace nbl; @@ -16,7 +16,6 @@ using namespace system; using namespace asset; using namespace video; - // this time instead of defining our own `int main()` we derive from `nbl::system::IApplicationFramework` to play 
"nice" wil all platforms class MortonTestApp final : public application_templates::MonoDeviceApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication { @@ -44,14 +43,221 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication, if (!asset_base_t::onAppInitialized(std::move(system))) return false; - createShader("app_resources/shader.hlsl"); + auto shader = createShader("app_resources/shader.hlsl"); + + // Create massive upload/download buffers + constexpr uint32_t DownstreamBufferSize = sizeof(unsigned_scalar_t) << 23; + constexpr uint32_t UpstreamBufferSize = sizeof(unsigned_scalar_t) << 23; + + m_utils = make_smart_refctd_ptr(smart_refctd_ptr(m_device), smart_refctd_ptr(m_logger), DownstreamBufferSize, UpstreamBufferSize); + if (!m_utils) + return logFail("Failed to create Utilities!"); + m_upStreamingBuffer = m_utils->getDefaultUpStreamingBuffer(); + m_downStreamingBuffer = m_utils->getDefaultDownStreamingBuffer(); + m_upStreamingBufferAddress = m_upStreamingBuffer->getBuffer()->getDeviceAddress(); + m_downStreamingBufferAddress = m_downStreamingBuffer->getBuffer()->getDeviceAddress(); + + // Create device-local buffer + { + IGPUBuffer::SCreationParams deviceLocalBufferParams = {}; + + IQueue* const queue = getComputeQueue(); + uint32_t queueFamilyIndex = queue->getFamilyIndex(); + + deviceLocalBufferParams.queueFamilyIndexCount = 1; + deviceLocalBufferParams.queueFamilyIndices = &queueFamilyIndex; + deviceLocalBufferParams.size = sizeof(unsigned_scalar_t) * bufferSize; + deviceLocalBufferParams.usage = nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_TRANSFER_SRC_BIT | nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT | nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT; + + m_deviceLocalBuffer = m_device->createBuffer(std::move(deviceLocalBufferParams)); + auto mreqs = m_deviceLocalBuffer->getMemoryReqs(); + mreqs.memoryTypeBits &= 
m_device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); + auto gpubufMem = m_device->allocate(mreqs, m_deviceLocalBuffer.get(), IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_DEVICE_ADDRESS_BIT); + + m_deviceLocalBufferAddress = m_deviceLocalBuffer.get()->getDeviceAddress(); + } + + const nbl::asset::SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,.offset = 0,.size = sizeof(PushConstantData) }; + + { + auto layout = m_device->createPipelineLayout({ &pcRange,1 }); + IGPUComputePipeline::SCreationParams params = {}; + params.layout = layout.get(); + params.shader.shader = shader.get(); + params.shader.requiredSubgroupSize = static_cast(hlsl::findMSB(m_physicalDevice->getLimits().maxSubgroupSize)); + params.shader.requireFullSubgroups = true; + if (!m_device->createComputePipelines(nullptr, { ¶ms,1 }, &m_pipeline)) + return logFail("Failed to create compute pipeline!\n"); + } + + const auto& deviceLimits = m_device->getPhysicalDevice()->getLimits(); + // The ranges of non-coherent mapped memory you flush or invalidate need to be aligned. You'll often see a value of 64 reported by devices + // which just happens to coincide with a CPU cache line size. So we ask our streaming buffers during allocation to give us properly aligned offsets. + // Sidenote: For SSBOs, UBOs, BufferViews, Vertex Buffer Bindings, Acceleration Structure BDAs, Shader Binding Tables, Descriptor Buffers, etc. + // there is also a requirement to bind buffers at offsets which have a certain alignment. Memory binding to Buffers and Images also has those. + // We'll align to max of coherent atom size even if the memory is coherent, + // and we also need to take into account BDA shader loads need to be aligned to the type being loaded. 
+ m_alignment = core::max(deviceLimits.nonCoherentAtomSize, alignof(float)); + + // Semaphor used here to know the FFT is done before download + m_timeline = m_device->createSemaphore(semaphorValue); + + IQueue* const queue = getComputeQueue(); + + const uint32_t inputSize = sizeof(unsigned_scalar_t) * bufferSize; + + // Just need a single suballocation in this example + const uint32_t AllocationCount = 1; + + // It comes with a certain drawback that you need to remember to initialize your "yet unallocated" offsets to the Invalid value + // this is to allow a set of allocations to fail, and you to re-try after doing something to free up space without repacking args. + auto inputOffset = m_upStreamingBuffer->invalid_value; + + // We always just wait till an allocation becomes possible (during allocation previous "latched" frees get their latch conditions polled) + // Freeing of Streaming Buffer Allocations can and should be deferred until an associated polled event signals done (more on that later). + std::chrono::steady_clock::time_point waitTill(std::chrono::years(45)); + // note that the API takes a time-point not a duration, because there are multiple waits and preemptions possible, so the durations wouldn't add up properly + m_upStreamingBuffer->multi_allocate(waitTill, AllocationCount, &inputOffset, &inputSize, &m_alignment); + + // Generate our data in-place on the allocated staging buffer. Packing is interleaved in this example! + { + auto* const inputPtr = reinterpret_cast(reinterpret_cast(m_upStreamingBuffer->getBufferPointer()) + inputOffset); + for (auto j = 0; j < bufferSize; j++) + { + unsigned_scalar_t x = j > 0 ? 0.f : 2.f; + unsigned_scalar_t y = 0; + + /* + unsigned_scalar_t x = 1.f; + unsigned_scalar_t y = 0.f; + */ + + inputPtr[2 * j] = x; + inputPtr[2 * j + 1] = y; + } + // Always remember to flush! 
+ if (m_upStreamingBuffer->needsManualFlushOrInvalidate()) + { + const auto bound = m_upStreamingBuffer->getBuffer()->getBoundMemory(); + const ILogicalDevice::MappedMemoryRange range(bound.memory, bound.offset + inputOffset, inputSize); + m_device->flushMappedMemoryRanges(1, &range); + } + } + + // finally allocate our output range + const uint32_t outputSize = inputSize; + auto outputOffset = m_downStreamingBuffer->invalid_value; + m_downStreamingBuffer->multi_allocate(waitTill, AllocationCount, &outputOffset, &outputSize, &m_alignment); + + smart_refctd_ptr cmdbuf; + { + smart_refctd_ptr cmdpool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT); + if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdbuf)) { + return logFail("Failed to create Command Buffers!\n"); + } + cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { &cmdbuf,1 }, core::smart_refctd_ptr(m_logger)); + cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + cmdbuf->bindComputePipeline(m_pipeline.get()); + // This is the new fun part, pushing constants + const PushConstantData pc = { .deviceBufferAddress = m_deviceLocalBufferAddress }; + IGPUCommandBuffer::SBufferCopy copyInfo = {}; + copyInfo.srcOffset = 0; + copyInfo.dstOffset = 0; + copyInfo.size = m_deviceLocalBuffer->getSize(); + cmdbuf->copyBuffer(m_upStreamingBuffer->getBuffer(), m_deviceLocalBuffer.get(), 1, ©Info); + cmdbuf->pushConstants(m_pipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc); + // Remember we do a single workgroup per 1D array in these parts + cmdbuf->dispatch(1, 1, 1); + + // Pipeline barrier: wait for FFT shader to be done before copying to downstream buffer + IGPUCommandBuffer::SPipelineBarrierDependencyInfo pipelineBarrierInfo = {}; + + decltype(pipelineBarrierInfo)::buffer_barrier_t barrier = {}; + pipelineBarrierInfo.bufBarriers = { &barrier, 1u }; + + barrier.range.buffer = 
m_deviceLocalBuffer; + + barrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + barrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::MEMORY_WRITE_BITS; + barrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT; + barrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS; + + cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS(0), pipelineBarrierInfo); + cmdbuf->copyBuffer(m_deviceLocalBuffer.get(), m_downStreamingBuffer->getBuffer(), 1, ©Info); + cmdbuf->end(); + } + + semaphorValue++; + { + const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufInfo = + { + .cmdbuf = cmdbuf.get() + }; + const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = + { + .semaphore = m_timeline.get(), + .value = semaphorValue, + .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT + }; + + const IQueue::SSubmitInfo submitInfo = { + .waitSemaphores = {}, + .commandBuffers = {&cmdbufInfo,1}, + .signalSemaphores = {&signalInfo,1} + }; + + m_api->startCapture(); + queue->submit({ &submitInfo,1 }); + m_api->endCapture(); + } + + // We let all latches know what semaphore and counter value has to be passed for the functors to execute + const ISemaphore::SWaitInfo futureWait = { m_timeline.get(),semaphorValue }; + + // As promised, we can defer an upstreaming buffer deallocation until a fence is signalled + // You can also attach an additional optional IReferenceCounted derived object to hold onto until deallocation. + m_upStreamingBuffer->multi_deallocate(AllocationCount, &inputOffset, &inputSize, futureWait); + + // Now a new and even more advanced usage of the latched events, we make our own refcounted object with a custom destructor and latch that like we did the commandbuffer. + // Instead of making our own and duplicating logic, we'll use one from IUtilities meant for down-staging memory. + // Its nice because it will also remember to invalidate our memory mapping if its not coherent. 
+ auto latchedConsumer = make_smart_refctd_ptr( + IDeviceMemoryAllocation::MemoryRange(outputOffset, outputSize), + // Note the use of capture by-value [=] and not by-reference [&] because this lambda will be called asynchronously whenever the event signals + [=](const size_t dstOffset, const void* bufSrc, const size_t size)->void + { + // The unused variable is used for letting the consumer know the subsection of the output we've managed to download + // But here we're sure we can get the whole thing in one go because we allocated the whole range ourselves. + assert(dstOffset == 0 && size == outputSize); + + std::cout << "Begin array GPU\n"; + unsigned_scalar_t* const data = reinterpret_cast(const_cast(bufSrc)); + std::cout << std::bitset<32>(data[0]) << "\n"; + /* + for (auto i = 0u; i < bufferSize; i++) { + std::cout << std::bitset<32>(data[i]) << "\n"; + } + */ + std::cout << "\nEnd array GPU\n"; + }, + // Its also necessary to hold onto the commandbuffer, even though we take care to not reset the parent pool, because if it + // hits its destructor, our automated reference counting will drop all references to objects used in the recorded commands. + // It could also be latched in the upstreaming deallocate, because its the same fence. 
+ std::move(cmdbuf), m_downStreamingBuffer + ); + // We put a function we want to execute + m_downStreamingBuffer->multi_deallocate(AllocationCount, &outputOffset, &outputSize, futureWait, &latchedConsumer.get()); + + // ------------------------------------------- CPP ------------------------------------------------------------------------------------------------------ const auto masksArray = hlsl::morton::impl::decode_masks_array::Masks; for (auto i = 0u; i < 3; i++) { std::cout << std::bitset<32>(masksArray[i]) << std::endl; } + const auto someCode = hlsl::morton::code::create(hlsl::vector(1, 1, 1, 1)); + return true; } @@ -61,8 +267,35 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication, // Whether to keep invoking the above. In this example because its headless GPU compute, we do all the work in the app initialization. bool keepRunning() override {return false;} + // Cleanup + bool onAppTerminated() override + { + // Need to make sure that there are no events outstanding if we want all lambdas to eventually execute before `onAppTerminated` + // (the destructors of the Command Pool Cache and Streaming buffers will still wait for all lambda events to drain) + while (m_downStreamingBuffer->cull_frees()) {} + return device_base_t::onAppTerminated(); + } + private: - smart_refctd_ptr m_api; + smart_refctd_ptr m_pipeline; + + smart_refctd_ptr m_utils; + + nbl::video::StreamingTransientDataBufferMT<>* m_upStreamingBuffer; + StreamingTransientDataBufferMT<>* m_downStreamingBuffer; + smart_refctd_ptr m_deviceLocalBuffer; + + // These are Buffer Device Addresses + uint64_t m_upStreamingBufferAddress; + uint64_t m_downStreamingBufferAddress; + uint64_t m_deviceLocalBufferAddress; + + // You can ask the `nbl::core::GeneralpurposeAddressAllocator` used internally by the Streaming Buffers give out offsets aligned to a certain multiple (not only Power of Two!) 
+ uint32_t m_alignment; + + // This example really lets the advantages of a timeline semaphore shine through! + smart_refctd_ptr m_timeline; + uint64_t semaphorValue = 0; }; From 0aedfd929a505657ef761c84be15cfaf8d4ddb7b Mon Sep 17 00:00:00 2001 From: Fletterio Date: Fri, 28 Mar 2025 20:16:45 -0300 Subject: [PATCH 3/9] All tests passing, HLSL compiles fine! --- XX_Mortons/main.cpp | 235 +++++++++++++++++++++++++++++++++----------- 1 file changed, 177 insertions(+), 58 deletions(-) diff --git a/XX_Mortons/main.cpp b/XX_Mortons/main.cpp index 860b581d2..b20662904 100644 --- a/XX_Mortons/main.cpp +++ b/XX_Mortons/main.cpp @@ -10,6 +10,9 @@ #include "app_resources/common.hlsl" #include +// Right now the test only checks that HLSL compiles the file +constexpr bool TestHLSL = true; + using namespace nbl; using namespace core; using namespace system; @@ -22,6 +25,12 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication, using device_base_t = application_templates::MonoDeviceApplication; using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; + using morton_t = nbl::hlsl::morton::code; + using vector_t = nbl::hlsl::vector; + using unsigned_morton_t = nbl::hlsl::morton::code; + using unsigned_vector_t = nbl::hlsl::vector; + using bool_vector_t = nbl::hlsl::vector; + inline core::smart_refctd_ptr createShader( const char* includeMainName) { @@ -43,18 +52,173 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication, if (!asset_base_t::onAppInitialized(std::move(system))) return false; + // ----------------------------------------------- CPP TESTS ---------------------------------------------------------------------- + + // Coordinate extraction and whole vector decode tests + { + morton_t morton(vector_t(-1011, 765, 248)); + unsigned_morton_t unsignedMorton(unsigned_vector_t(154, 789, 1011)); + + assert(morton.getCoordinate(0) == -1011 && morton.getCoordinate(1) == 765 && 
morton.getCoordinate(2) == 248); + assert(unsignedMorton.getCoordinate(0) == 154u && unsignedMorton.getCoordinate(1) == 789u && unsignedMorton.getCoordinate(2) == 1011u); + + assert(static_cast(morton) == vector_t(-1011, 765, 248) && static_cast(unsignedMorton) == unsigned_vector_t(154, 789, 1011)); + } + + // *********************************************************************************************************************************** + // ************************************************* Arithmetic operator tests ******************************************************* + // *********************************************************************************************************************************** + + // ---------------------------------------------------------------------------------------------------- + // --------------------------------------- ADDITION --------------------------------------------------- + // ---------------------------------------------------------------------------------------------------- + + // ---------------------------------------- Signed ----------------------------------------------------- + + // No overflow + assert(static_cast(morton_t(vector_t(-1011, 765, 248)) + morton_t(vector_t(1000, -985, 200))) == vector_t(-11, -220, 448)); + + // Type 1 overflow: Addition of representable coordinates goes out of range + assert(static_cast(morton_t(vector_t(-900, 70, 500)) + morton_t(vector_t(-578, -50, 20))) == vector_t(570, 20, -504)); + + // Type 2 overflow: Addition of irrepresentable range gives correct result + assert(static_cast(morton_t(vector_t(54, 900, -475)) + morton_t(vector_t(46, -1437, 699))) == vector_t(100, -537, 224)); + + // ---------------------------------------- Unsigned ----------------------------------------------------- + + // No overflow + assert(static_cast(unsigned_morton_t(unsigned_vector_t(382, 910, 543)) + unsigned_morton_t(unsigned_vector_t(1563, 754, 220))) == unsigned_vector_t(1945, 1664, 763)); + + 
// Type 1 overflow: Addition of representable coordinates goes out of range + assert(static_cast(unsigned_morton_t(unsigned_vector_t(382, 910, 543)) + unsigned_morton_t(unsigned_vector_t(2000, 2000, 1000))) == unsigned_vector_t(334, 862, 519)); + + // Type 2 overflow: Addition of irrepresentable range gives correct result + assert(static_cast(unsigned_morton_t(unsigned_vector_t(382, 910, 543)) + unsigned_morton_t(unsigned_vector_t(-143, -345, -233))) == unsigned_vector_t(239, 565, 310)); + + // ---------------------------------------------------------------------------------------------------- + // -------------------------------------- SUBTRACTION ------------------------------------------------- + // ---------------------------------------------------------------------------------------------------- + + // ---------------------------------------- Signed ----------------------------------------------------- + + // No overflow + assert(static_cast(morton_t(vector_t(1000, 764, -365)) - morton_t(vector_t(834, -243, 100))) == vector_t(166, 1007, -465)); + + // Type 1 overflow: Subtraction of representable coordinates goes out of range + assert(static_cast(morton_t(vector_t(-900, 70, 500)) - morton_t(vector_t(578, -50, -20))) == vector_t(570, 120, -504)); + + // Type 2 overflow: Subtraction of irrepresentable range gives correct result + assert(static_cast(morton_t(vector_t(54, 900, -475)) - morton_t(vector_t(-46, 1437, -699))) == vector_t(100, -537, 224)); + + // ---------------------------------------- Unsigned ----------------------------------------------------- + + // No overflow + assert(static_cast(unsigned_morton_t(unsigned_vector_t(382, 910, 543)) - unsigned_morton_t(unsigned_vector_t(322, 564, 299))) == unsigned_vector_t(60, 346, 244)); + + // Type 1 overflow: Subtraction of representable coordinates goes out of range + assert(static_cast(unsigned_morton_t(unsigned_vector_t(382, 910, 543)) - unsigned_morton_t(unsigned_vector_t(2000, 2000, 1000))) == 
unsigned_vector_t(430, 958, 567)); + + // Type 2 overflow: Subtraction of irrepresentable range gives correct result + assert(static_cast(unsigned_morton_t(unsigned_vector_t(54, 900, 475)) - unsigned_morton_t(unsigned_vector_t(-865, -100, -10))) == unsigned_vector_t(919, 1000, 485)); + + + // ---------------------------------------------------------------------------------------------------- + // -------------------------------------- UNARY NEGATION ---------------------------------------------- + // ---------------------------------------------------------------------------------------------------- + + // Only makes sense for signed + assert(static_cast(- morton_t(vector_t(-1024, 543, -475))) == vector_t(-1024, -543, 475)); + + // *********************************************************************************************************************************** + // ************************************************* Comparison operator tests ******************************************************* + // *********************************************************************************************************************************** + + // ---------------------------------------------------------------------------------------------------- + // -------------------------------------- OPERATOR< --------------------------------------------------- + // ---------------------------------------------------------------------------------------------------- + + // Signed + + // Same sign, negative + assert(morton_t(vector_t(-954, -455, -333)) < morton_t(vector_t(-433, -455, -433)) == bool_vector_t(true, false, false)); + // Same sign, positive + assert(morton_t(vector_t(954, 455, 333)) < morton_t(vector_t(433, 455, 433)) == bool_vector_t(false, false, true)); + // Differing signs + assert(morton_t(vector_t(954, -32, 0)) < morton_t(vector_t(-44, 0, -1)) == bool_vector_t(false, true, false)); + + // Unsigned + assert(unsigned_morton_t(unsigned_vector_t(239, 435, 66)) < 
unsigned_morton_t(unsigned_vector_t(240, 435, 50)) == bool_vector_t(true, false, false)); + + // ---------------------------------------------------------------------------------------------------- + // -------------------------------------- OPERATOR<= -------------------------------------------------- + // ---------------------------------------------------------------------------------------------------- + + // Signed + + // Same sign, negative + assert(morton_t(vector_t(-954, -455, -333)) <= morton_t(vector_t(-433, -455, -433)) == bool_vector_t(true, true, false)); + // Same sign, positive + assert(morton_t(vector_t(954, 455, 333)) <= morton_t(vector_t(433, 455, 433)) == bool_vector_t(false, true, true)); + // Differing signs + assert(morton_t(vector_t(954, -32, 0)) <= morton_t(vector_t(-44, 0, -1)) == bool_vector_t(false, true, false)); + + // Unsigned + assert(unsigned_morton_t(unsigned_vector_t(239, 435, 66)) <= unsigned_morton_t(unsigned_vector_t(240, 435, 50)) == bool_vector_t(true, true, false)); + + // ---------------------------------------------------------------------------------------------------- + // -------------------------------------- OPERATOR> --------------------------------------------------- + // ---------------------------------------------------------------------------------------------------- + + // Signed + + // Same sign, negative + assert(morton_t(vector_t(-954, -455, -333)) > morton_t(vector_t(-433, -455, -433)) == bool_vector_t(false, false, true)); + // Same sign, positive + assert(morton_t(vector_t(954, 455, 333)) > morton_t(vector_t(433, 455, 433)) == bool_vector_t(true, false, false)); + // Differing signs + assert(morton_t(vector_t(954, -32, 0)) > morton_t(vector_t(-44, 0, -1)) == bool_vector_t(true, false, true)); + + // Unsigned + assert(unsigned_morton_t(unsigned_vector_t(239, 435, 66)) > unsigned_morton_t(unsigned_vector_t(240, 435, 50)) == bool_vector_t(false, false, true)); + + // 
---------------------------------------------------------------------------------------------------- + // -------------------------------------- OPERATOR>= -------------------------------------------------- + // ---------------------------------------------------------------------------------------------------- + + // Signed + + // Same sign, negative + assert(morton_t(vector_t(-954, -455, -333)) >= morton_t(vector_t(-433, -455, -433)) == bool_vector_t(false, true, true)); + // Same sign, positive + assert(morton_t(vector_t(954, 455, 333)) >= morton_t(vector_t(433, 455, 433)) == bool_vector_t(true, true, false)); + // Differing signs + assert(morton_t(vector_t(954, -32, 0)) >= morton_t(vector_t(-44, 0, -1)) == bool_vector_t(true, false, true)); + + // Unsigned + assert(unsigned_morton_t(unsigned_vector_t(239, 435, 66)) >= unsigned_morton_t(unsigned_vector_t(240, 435, 50)) == bool_vector_t(false, true, true)); + + + if(!TestHLSL) + return true; + + + + + + + + + + // ----------------------------------------------- HLSL COMPILATION + OPTIONAL TESTS ---------------------------------------------- auto shader = createShader("app_resources/shader.hlsl"); // Create massive upload/download buffers constexpr uint32_t DownstreamBufferSize = sizeof(unsigned_scalar_t) << 23; - constexpr uint32_t UpstreamBufferSize = sizeof(unsigned_scalar_t) << 23; - m_utils = make_smart_refctd_ptr(smart_refctd_ptr(m_device), smart_refctd_ptr(m_logger), DownstreamBufferSize, UpstreamBufferSize); + m_utils = make_smart_refctd_ptr(smart_refctd_ptr(m_device), smart_refctd_ptr(m_logger), DownstreamBufferSize); if (!m_utils) return logFail("Failed to create Utilities!"); - m_upStreamingBuffer = m_utils->getDefaultUpStreamingBuffer(); m_downStreamingBuffer = m_utils->getDefaultDownStreamingBuffer(); - m_upStreamingBufferAddress = m_upStreamingBuffer->getBuffer()->getDeviceAddress(); m_downStreamingBufferAddress = m_downStreamingBuffer->getBuffer()->getDeviceAddress(); // Create device-local buffer 
@@ -109,40 +273,9 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication, // Just need a single suballocation in this example const uint32_t AllocationCount = 1; - // It comes with a certain drawback that you need to remember to initialize your "yet unallocated" offsets to the Invalid value - // this is to allow a set of allocations to fail, and you to re-try after doing something to free up space without repacking args. - auto inputOffset = m_upStreamingBuffer->invalid_value; - // We always just wait till an allocation becomes possible (during allocation previous "latched" frees get their latch conditions polled) // Freeing of Streaming Buffer Allocations can and should be deferred until an associated polled event signals done (more on that later). std::chrono::steady_clock::time_point waitTill(std::chrono::years(45)); - // note that the API takes a time-point not a duration, because there are multiple waits and preemptions possible, so the durations wouldn't add up properly - m_upStreamingBuffer->multi_allocate(waitTill, AllocationCount, &inputOffset, &inputSize, &m_alignment); - - // Generate our data in-place on the allocated staging buffer. Packing is interleaved in this example! - { - auto* const inputPtr = reinterpret_cast(reinterpret_cast(m_upStreamingBuffer->getBufferPointer()) + inputOffset); - for (auto j = 0; j < bufferSize; j++) - { - unsigned_scalar_t x = j > 0 ? 0.f : 2.f; - unsigned_scalar_t y = 0; - - /* - unsigned_scalar_t x = 1.f; - unsigned_scalar_t y = 0.f; - */ - - inputPtr[2 * j] = x; - inputPtr[2 * j + 1] = y; - } - // Always remember to flush! 
- if (m_upStreamingBuffer->needsManualFlushOrInvalidate()) - { - const auto bound = m_upStreamingBuffer->getBuffer()->getBoundMemory(); - const ILogicalDevice::MappedMemoryRange range(bound.memory, bound.offset + inputOffset, inputSize); - m_device->flushMappedMemoryRanges(1, &range); - } - } // finally allocate our output range const uint32_t outputSize = inputSize; @@ -161,11 +294,6 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication, cmdbuf->bindComputePipeline(m_pipeline.get()); // This is the new fun part, pushing constants const PushConstantData pc = { .deviceBufferAddress = m_deviceLocalBufferAddress }; - IGPUCommandBuffer::SBufferCopy copyInfo = {}; - copyInfo.srcOffset = 0; - copyInfo.dstOffset = 0; - copyInfo.size = m_deviceLocalBuffer->getSize(); - cmdbuf->copyBuffer(m_upStreamingBuffer->getBuffer(), m_deviceLocalBuffer.get(), 1, ©Info); cmdbuf->pushConstants(m_pipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc); // Remember we do a single workgroup per 1D array in these parts cmdbuf->dispatch(1, 1, 1); @@ -184,6 +312,11 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication, barrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS; cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS(0), pipelineBarrierInfo); + + IGPUCommandBuffer::SBufferCopy copyInfo = {}; + copyInfo.srcOffset = 0; + copyInfo.dstOffset = 0; + copyInfo.size = m_deviceLocalBuffer->getSize(); cmdbuf->copyBuffer(m_deviceLocalBuffer.get(), m_downStreamingBuffer->getBuffer(), 1, ©Info); cmdbuf->end(); } @@ -215,10 +348,6 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication, // We let all latches know what semaphore and counter value has to be passed for the functors to execute const ISemaphore::SWaitInfo futureWait = { m_timeline.get(),semaphorValue }; - // As promised, we can defer an upstreaming buffer deallocation until a fence is signalled - // You can also 
attach an additional optional IReferenceCounted derived object to hold onto until deallocation. - m_upStreamingBuffer->multi_deallocate(AllocationCount, &inputOffset, &inputSize, futureWait); - // Now a new and even more advanced usage of the latched events, we make our own refcounted object with a custom destructor and latch that like we did the commandbuffer. // Instead of making our own and duplicating logic, we'll use one from IUtilities meant for down-staging memory. // Its nice because it will also remember to invalidate our memory mapping if its not coherent. @@ -249,15 +378,6 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication, // We put a function we want to execute m_downStreamingBuffer->multi_deallocate(AllocationCount, &outputOffset, &outputSize, futureWait, &latchedConsumer.get()); - // ------------------------------------------- CPP ------------------------------------------------------------------------------------------------------ - const auto masksArray = hlsl::morton::impl::decode_masks_array::Masks; - for (auto i = 0u; i < 3; i++) - { - std::cout << std::bitset<32>(masksArray[i]) << std::endl; - } - - const auto someCode = hlsl::morton::code::create(hlsl::vector(1, 1, 1, 1)); - return true; } @@ -272,7 +392,10 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication, { // Need to make sure that there are no events outstanding if we want all lambdas to eventually execute before `onAppTerminated` // (the destructors of the Command Pool Cache and Streaming buffers will still wait for all lambda events to drain) - while (m_downStreamingBuffer->cull_frees()) {} + if (TestHLSL) + { + while (m_downStreamingBuffer->cull_frees()) {} + } return device_base_t::onAppTerminated(); } @@ -281,19 +404,15 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication, smart_refctd_ptr m_utils; - nbl::video::StreamingTransientDataBufferMT<>* m_upStreamingBuffer; 
StreamingTransientDataBufferMT<>* m_downStreamingBuffer; smart_refctd_ptr m_deviceLocalBuffer; // These are Buffer Device Addresses - uint64_t m_upStreamingBufferAddress; uint64_t m_downStreamingBufferAddress; uint64_t m_deviceLocalBufferAddress; - // You can ask the `nbl::core::GeneralpurposeAddressAllocator` used internally by the Streaming Buffers give out offsets aligned to a certain multiple (not only Power of Two!) uint32_t m_alignment; - // This example really lets the advantages of a timeline semaphore shine through! smart_refctd_ptr m_timeline; uint64_t semaphorValue = 0; }; From ea42d5bf287cbff376809be65f64c71567e0134f Mon Sep 17 00:00:00 2001 From: Fletterio Date: Tue, 1 Apr 2025 15:44:55 -0300 Subject: [PATCH 4/9] Rename example --- {XX_Mortons => 12_Mortons}/CMakeLists.txt | 0 12_Mortons/app_resources/common.hlsl | 13 ++++++++++++ .../app_resources/shader.hlsl | 8 ++++--- .../config.json.template | 0 {XX_Mortons => 12_Mortons}/main.cpp | 21 ++++++++----------- {XX_Mortons => 12_Mortons}/pipeline.groovy | 0 CMakeLists.txt | 2 +- XX_Mortons/app_resources/common.hlsl | 10 --------- 8 files changed, 28 insertions(+), 26 deletions(-) rename {XX_Mortons => 12_Mortons}/CMakeLists.txt (100%) create mode 100644 12_Mortons/app_resources/common.hlsl rename {XX_Mortons => 12_Mortons}/app_resources/shader.hlsl (79%) rename {XX_Mortons => 12_Mortons}/config.json.template (100%) rename {XX_Mortons => 12_Mortons}/main.cpp (97%) rename {XX_Mortons => 12_Mortons}/pipeline.groovy (100%) delete mode 100644 XX_Mortons/app_resources/common.hlsl diff --git a/XX_Mortons/CMakeLists.txt b/12_Mortons/CMakeLists.txt similarity index 100% rename from XX_Mortons/CMakeLists.txt rename to 12_Mortons/CMakeLists.txt diff --git a/12_Mortons/app_resources/common.hlsl b/12_Mortons/app_resources/common.hlsl new file mode 100644 index 000000000..bd5184f80 --- /dev/null +++ b/12_Mortons/app_resources/common.hlsl @@ -0,0 +1,13 @@ +//#include "nbl/builtin/hlsl/morton.hlsl" +#include 
"nbl/builtin/hlsl/cpp_compat.hlsl" + +NBL_CONSTEXPR uint32_t bufferSize = 256; + +// Proper coverage would require writing tests for ALL possible sign, dimensions and width configurations +//using morton_t2 = nbl::hlsl::morton::code; // Fits in an int16_t +using vector_t2 = nbl::hlsl::vector; + +struct PushConstantData +{ + uint64_t deviceBufferAddress; +}; \ No newline at end of file diff --git a/XX_Mortons/app_resources/shader.hlsl b/12_Mortons/app_resources/shader.hlsl similarity index 79% rename from XX_Mortons/app_resources/shader.hlsl rename to 12_Mortons/app_resources/shader.hlsl index d1f7c967e..e7f570eee 100644 --- a/XX_Mortons/app_resources/shader.hlsl +++ b/12_Mortons/app_resources/shader.hlsl @@ -3,14 +3,16 @@ [[vk::push_constant]] PushConstantData pushConstants; -using namespace nbl::hlsl; - [numthreads(bufferSize, 1, 1)] void main(uint32_t3 ID : SV_DispatchThreadID) { + /* LegacyBdaAccessor accessor = LegacyBdaAccessor::create(pushConstants.deviceBufferAddress); morton::code foo = morton::code::create(vector(-32768, -1)); - accessor.set(0, foo.value); + //accessor.set(0, foo.value); + */ + uint32_t bar = _static_cast(0xCAFEDEADDEADBEEF); + accessor.set(0, bar); } \ No newline at end of file diff --git a/XX_Mortons/config.json.template b/12_Mortons/config.json.template similarity index 100% rename from XX_Mortons/config.json.template rename to 12_Mortons/config.json.template diff --git a/XX_Mortons/main.cpp b/12_Mortons/main.cpp similarity index 97% rename from XX_Mortons/main.cpp rename to 12_Mortons/main.cpp index b20662904..d1fddba7a 100644 --- a/XX_Mortons/main.cpp +++ b/12_Mortons/main.cpp @@ -25,12 +25,6 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication, using device_base_t = application_templates::MonoDeviceApplication; using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; - using morton_t = nbl::hlsl::morton::code; - using vector_t = nbl::hlsl::vector; - using 
unsigned_morton_t = nbl::hlsl::morton::code; - using unsigned_vector_t = nbl::hlsl::vector; - using bool_vector_t = nbl::hlsl::vector; - inline core::smart_refctd_ptr createShader( const char* includeMainName) { @@ -52,6 +46,8 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication, if (!asset_base_t::onAppInitialized(std::move(system))) return false; + /* + // ----------------------------------------------- CPP TESTS ---------------------------------------------------------------------- // Coordinate extraction and whole vector decode tests @@ -201,7 +197,7 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication, if(!TestHLSL) return true; - + */ @@ -213,7 +209,7 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication, auto shader = createShader("app_resources/shader.hlsl"); // Create massive upload/download buffers - constexpr uint32_t DownstreamBufferSize = sizeof(unsigned_scalar_t) << 23; + constexpr uint32_t DownstreamBufferSize = sizeof(uint32_t) << 23; m_utils = make_smart_refctd_ptr(smart_refctd_ptr(m_device), smart_refctd_ptr(m_logger), DownstreamBufferSize); if (!m_utils) @@ -230,7 +226,7 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication, deviceLocalBufferParams.queueFamilyIndexCount = 1; deviceLocalBufferParams.queueFamilyIndices = &queueFamilyIndex; - deviceLocalBufferParams.size = sizeof(unsigned_scalar_t) * bufferSize; + deviceLocalBufferParams.size = sizeof(uint32_t) * bufferSize; deviceLocalBufferParams.usage = nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_TRANSFER_SRC_BIT | nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT | nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT; m_deviceLocalBuffer = m_device->createBuffer(std::move(deviceLocalBufferParams)); @@ -268,7 +264,7 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication, IQueue* const queue = getComputeQueue(); - const uint32_t 
inputSize = sizeof(unsigned_scalar_t) * bufferSize; + const uint32_t inputSize = sizeof(uint32_t) * bufferSize; // Just need a single suballocation in this example const uint32_t AllocationCount = 1; @@ -361,8 +357,9 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication, assert(dstOffset == 0 && size == outputSize); std::cout << "Begin array GPU\n"; - unsigned_scalar_t* const data = reinterpret_cast(const_cast(bufSrc)); - std::cout << std::bitset<32>(data[0]) << "\n"; + uint32_t* const data = reinterpret_cast(const_cast(bufSrc)); + //std::cout << std::bitset<32>(data[0]) << "\n"; + std::cout << data[0] << "\n"; /* for (auto i = 0u; i < bufferSize; i++) { std::cout << std::bitset<32>(data[i]) << "\n"; diff --git a/XX_Mortons/pipeline.groovy b/12_Mortons/pipeline.groovy similarity index 100% rename from XX_Mortons/pipeline.groovy rename to 12_Mortons/pipeline.groovy diff --git a/CMakeLists.txt b/CMakeLists.txt index 7fcddfc18..5d0c148cc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -48,6 +48,7 @@ if(NBL_BUILD_EXAMPLES) add_subdirectory(10_CountingSort EXCLUDE_FROM_ALL) # showcase use of FFT for post-FX Bloom effect add_subdirectory(11_FFT EXCLUDE_FROM_ALL) + add_subdirectory(12_Mortons EXCLUDE_FROM_ALL) # Waiting for a refactor @@ -96,7 +97,6 @@ if(NBL_BUILD_EXAMPLES) add_subdirectory(68_JpegLoading EXCLUDE_FROM_ALL) add_subdirectory(70_FLIPFluids EXCLUDE_FROM_ALL) - add_subdirectory(XX_Mortons EXCLUDE_FROM_ALL) NBL_HOOK_COMMON_API("${NBL_COMMON_API_TARGETS}") endif() diff --git a/XX_Mortons/app_resources/common.hlsl b/XX_Mortons/app_resources/common.hlsl deleted file mode 100644 index 3a9fca3fa..000000000 --- a/XX_Mortons/app_resources/common.hlsl +++ /dev/null @@ -1,10 +0,0 @@ -#include "nbl/builtin/hlsl/math/morton.hlsl" - -NBL_CONSTEXPR uint32_t bufferSize = 256; -using scalar_t = int32_t; -using unsigned_scalar_t = nbl::hlsl::make_unsigned_t; - -struct PushConstantData -{ - uint64_t deviceBufferAddress; -}; \ No newline at end 
of file From 2ba08a4a39bf15b3c689666012b263794b8371f2 Mon Sep 17 00:00:00 2001 From: Fletterio Date: Tue, 1 Apr 2025 17:43:20 -0300 Subject: [PATCH 5/9] Add tests for AddCarry and SUbBorrow intrinsics --- 22_CppCompat/CIntrinsicsTester.h | 13 + 22_CppCompat/app_resources/common.hlsl | 859 +++++++++++++------------ 2 files changed, 451 insertions(+), 421 deletions(-) diff --git a/22_CppCompat/CIntrinsicsTester.h b/22_CppCompat/CIntrinsicsTester.h index 77aa2c1ca..5fe7bc08e 100644 --- a/22_CppCompat/CIntrinsicsTester.h +++ b/22_CppCompat/CIntrinsicsTester.h @@ -85,6 +85,10 @@ class CIntrinsicsTester final : public ITester testInput.smoothStepEdge0 = realDistributionNeg(mt); testInput.smoothStepEdge1 = realDistributionPos(mt); testInput.smoothStepX = realDistribution(mt); + testInput.addCarryA = std::numeric_limits::max() - uintDistribution(mt); + testInput.addCarryB = uintDistribution(mt); + testInput.subBorrowA = uintDistribution(mt); + testInput.subBorrowB = uintDistribution(mt); testInput.bitCountVec = int32_t3(intDistribution(mt), intDistribution(mt), intDistribution(mt)); testInput.clampValVec = float32_t3(realDistribution(mt), realDistribution(mt), realDistribution(mt)); @@ -119,6 +123,10 @@ class CIntrinsicsTester final : public ITester testInput.refractI = float32_t3(realDistribution(mt), realDistribution(mt), realDistribution(mt)); testInput.refractN = glm::normalize(float32_t3(realDistribution(mt), realDistribution(mt), realDistribution(mt))); testInput.refractEta = realDistribution(mt); + testInput.addCarryAVec = uint32_t3(std::numeric_limits::max() - uintDistribution(mt), std::numeric_limits::max() - uintDistribution(mt), std::numeric_limits::max() - uintDistribution(mt)); + testInput.addCarryBVec = uint32_t3(uintDistribution(mt), uintDistribution(mt), uintDistribution(mt)); + testInput.subBorrowAVec = uint32_t3(uintDistribution(mt), uintDistribution(mt), uintDistribution(mt)); + testInput.subBorrowBVec = uint32_t3(uintDistribution(mt), 
uintDistribution(mt), uintDistribution(mt)); // use std library or glm functions to determine expected test values, the output of functions from intrinsics.hlsl will be verified against these values IntrinsicsTestValues expected; @@ -188,6 +196,11 @@ class CIntrinsicsTester final : public ITester auto inverseGlm = glm::inverse(reinterpret_cast(testInput.inverse)); expected.inverse = reinterpret_cast(inverseGlm); + expected.addCarry.result = glm::uaddCarry(testInput.addCarryA, testInput.addCarryB, expected.addCarry.carry); + expected.subBorrow.result = glm::usubBorrow(testInput.subBorrowA, testInput.subBorrowB, expected.subBorrow.borrow); + expected.addCarryVec.result = glm::uaddCarry(testInput.addCarryAVec, testInput.addCarryBVec, expected.addCarryVec.carry); + expected.subBorrowVec.result = glm::usubBorrow(testInput.subBorrowAVec, testInput.subBorrowBVec, expected.subBorrowVec.borrow); + performCpuTests(testInput, expected); performGpuTests(testInput, expected); } diff --git a/22_CppCompat/app_resources/common.hlsl b/22_CppCompat/app_resources/common.hlsl index e2303a2fc..dc3ff5fcd 100644 --- a/22_CppCompat/app_resources/common.hlsl +++ b/22_CppCompat/app_resources/common.hlsl @@ -1,74 +1,74 @@ -//// Copyright (C) 2023-2024 - DevSH Graphics Programming Sp. z O.O. -//// This file is part of the "Nabla Engine". -//// For conditions of distribution and use, see copyright notice in nabla.h - -#ifndef _NBL_EXAMPLES_TESTS_22_CPP_COMPAT_COMMON_INCLUDED_ -#define _NBL_EXAMPLES_TESTS_22_CPP_COMPAT_COMMON_INCLUDED_ - -// because DXC doesn't properly support `_Static_assert` -// TODO: add a message, and move to macros.h or cpp_compat -#define STATIC_ASSERT(...) 
{ nbl::hlsl::conditional<__VA_ARGS__, int, void>::type a = 0; } - -#include - -#include -#include - -#include -#include - -#include -#include -#include -#include - -#include - -#include -#include - -#include - - -#include -#include -#include - -#include -#include - -// tgmath.hlsl and intrinsics.hlsl tests - -using namespace nbl::hlsl; -struct TgmathIntputTestValues -{ - float floor; - float isnan; - float isinf; - float powX; - float powY; - float exp; - float exp2; - float log; - float log2; - float absF; - int absI; - float sqrt; - float sin; - float cos; - float acos; - float modf; - float round; - float roundEven; - float trunc; - float ceil; - float fmaX; - float fmaY; - float fmaZ; - float ldexpArg; - int ldexpExp; - float modfStruct; - float frexpStruct; +//// Copyright (C) 2023-2024 - DevSH Graphics Programming Sp. z O.O. +//// This file is part of the "Nabla Engine". +//// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_EXAMPLES_TESTS_22_CPP_COMPAT_COMMON_INCLUDED_ +#define _NBL_EXAMPLES_TESTS_22_CPP_COMPAT_COMMON_INCLUDED_ + +// because DXC doesn't properly support `_Static_assert` +// TODO: add a message, and move to macros.h or cpp_compat +#define STATIC_ASSERT(...) 
{ nbl::hlsl::conditional<__VA_ARGS__, int, void>::type a = 0; } + +#include + +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include + +#include + + +#include +#include +#include + +#include +#include + +// tgmath.hlsl and intrinsics.hlsl tests + +using namespace nbl::hlsl; +struct TgmathIntputTestValues +{ + float floor; + float isnan; + float isinf; + float powX; + float powY; + float exp; + float exp2; + float log; + float log2; + float absF; + int absI; + float sqrt; + float sin; + float cos; + float acos; + float modf; + float round; + float roundEven; + float trunc; + float ceil; + float fmaX; + float fmaY; + float fmaZ; + float ldexpArg; + int ldexpExp; + float modfStruct; + float frexpStruct; float tan; float asin; float atan; @@ -78,38 +78,38 @@ struct TgmathIntputTestValues float asinh; float acosh; float atanh; - float atan2X; - float atan2Y; - float erf; - float erfInv; - - float32_t3 floorVec; - float32_t3 isnanVec; - float32_t3 isinfVec; - float32_t3 powXVec; - float32_t3 powYVec; - float32_t3 expVec; - float32_t3 exp2Vec; - float32_t3 logVec; - float32_t3 log2Vec; - float32_t3 absFVec; - int32_t3 absIVec; - float32_t3 sqrtVec; - float32_t3 sinVec; - float32_t3 cosVec; - float32_t3 acosVec; - float32_t3 modfVec; - float32_t3 roundVec; - float32_t3 roundEvenVec; - float32_t3 truncVec; - float32_t3 ceilVec; - float32_t3 fmaXVec; - float32_t3 fmaYVec; - float32_t3 fmaZVec; - float32_t3 ldexpArgVec; - int32_t3 ldexpExpVec; - float32_t3 modfStructVec; - float32_t3 frexpStructVec; + float atan2X; + float atan2Y; + float erf; + float erfInv; + + float32_t3 floorVec; + float32_t3 isnanVec; + float32_t3 isinfVec; + float32_t3 powXVec; + float32_t3 powYVec; + float32_t3 expVec; + float32_t3 exp2Vec; + float32_t3 logVec; + float32_t3 log2Vec; + float32_t3 absFVec; + int32_t3 absIVec; + float32_t3 sqrtVec; + float32_t3 sinVec; + float32_t3 cosVec; + float32_t3 acosVec; + float32_t3 modfVec; + 
float32_t3 roundVec; + float32_t3 roundEvenVec; + float32_t3 truncVec; + float32_t3 ceilVec; + float32_t3 fmaXVec; + float32_t3 fmaYVec; + float32_t3 fmaZVec; + float32_t3 ldexpArgVec; + int32_t3 ldexpExpVec; + float32_t3 modfStructVec; + float32_t3 frexpStructVec; float32_t3 tanVec; float32_t3 asinVec; float32_t3 atanVec; @@ -119,35 +119,35 @@ struct TgmathIntputTestValues float32_t3 asinhVec; float32_t3 acoshVec; float32_t3 atanhVec; - float32_t3 atan2XVec; - float32_t3 atan2YVec; - float32_t3 erfVec; - float32_t3 erfInvVec; -}; - -struct TgmathTestValues -{ - float floor; - int isnan; - int isinf; - float pow; - float exp; - float exp2; - float log; - float log2; - float absF; - int absI; - float sqrt; - float sin; - float cos; - float acos; - float modf; - float round; - float roundEven; - float trunc; - float ceil; - float fma; - float ldexp; + float32_t3 atan2XVec; + float32_t3 atan2YVec; + float32_t3 erfVec; + float32_t3 erfInvVec; +}; + +struct TgmathTestValues +{ + float floor; + int isnan; + int isinf; + float pow; + float exp; + float exp2; + float log; + float log2; + float absF; + int absI; + float sqrt; + float sin; + float cos; + float acos; + float modf; + float round; + float roundEven; + float trunc; + float ceil; + float fma; + float ldexp; float tan; float asin; float atan; @@ -157,40 +157,40 @@ struct TgmathTestValues float asinh; float acosh; float atanh; - float atan2; - float erf; - float erfInv; - - float32_t3 floorVec; - - // we can't fix this because using namespace nbl::hlsl would cause ambiguous math functions below - // and we can't add a nbl::hlsl alias for the builtin hLSL vector type because of https://github.com/microsoft/DirectXShaderCompiler/issues/7035 -#ifndef __HLSL_VERSION - nbl::hlsl::vector isnanVec; - nbl::hlsl::vector isinfVec; -#else - vector isnanVec; - vector isinfVec; -#endif - - float32_t3 powVec; - float32_t3 expVec; - float32_t3 exp2Vec; - float32_t3 logVec; - float32_t3 log2Vec; - float32_t3 absFVec; - int32_t3 
absIVec; - float32_t3 sqrtVec; - float32_t3 cosVec; - float32_t3 sinVec; - float32_t3 acosVec; - float32_t3 modfVec; - float32_t3 roundVec; - float32_t3 roundEvenVec; - float32_t3 truncVec; - float32_t3 ceilVec; - float32_t3 fmaVec; - float32_t3 ldexpVec; + float atan2; + float erf; + float erfInv; + + float32_t3 floorVec; + + // we can't fix this because using namespace nbl::hlsl would cause ambiguous math functions below + // and we can't add a nbl::hlsl alias for the builtin hLSL vector type because of https://github.com/microsoft/DirectXShaderCompiler/issues/7035 +#ifndef __HLSL_VERSION + nbl::hlsl::vector isnanVec; + nbl::hlsl::vector isinfVec; +#else + vector isnanVec; + vector isinfVec; +#endif + + float32_t3 powVec; + float32_t3 expVec; + float32_t3 exp2Vec; + float32_t3 logVec; + float32_t3 log2Vec; + float32_t3 absFVec; + int32_t3 absIVec; + float32_t3 sqrtVec; + float32_t3 cosVec; + float32_t3 sinVec; + float32_t3 acosVec; + float32_t3 modfVec; + float32_t3 roundVec; + float32_t3 roundEvenVec; + float32_t3 truncVec; + float32_t3 ceilVec; + float32_t3 fmaVec; + float32_t3 ldexpVec; float32_t3 tanVec; float32_t3 asinVec; float32_t3 atanVec; @@ -200,258 +200,275 @@ struct TgmathTestValues float32_t3 asinhVec; float32_t3 acoshVec; float32_t3 atanhVec; - float32_t3 atan2Vec; - float32_t3 erfVec; - float32_t3 erfInvVec; - - ModfOutput modfStruct; - ModfOutput modfStructVec; - FrexpOutput frexpStruct; - FrexpOutput frexpStructVec; - - void fillTestValues(NBL_CONST_REF_ARG(TgmathIntputTestValues) input) - { - floor = nbl::hlsl::floor(input.floor); - isnan = nbl::hlsl::isnan(input.isnan); - isinf = nbl::hlsl::isinf(input.isinf); - pow = nbl::hlsl::pow(input.powX, input.powY); - exp = nbl::hlsl::exp(input.exp); - exp2 = nbl::hlsl::exp2(input.exp2); - log = nbl::hlsl::log(input.log); - log2 = nbl::hlsl::log2(input.log2); - absF = nbl::hlsl::abs(input.absF); - absI = nbl::hlsl::abs(input.absI); - sqrt = nbl::hlsl::sqrt(input.sqrt); - sin = nbl::hlsl::sin(input.sin); 
- cos = nbl::hlsl::cos(input.cos); - tan = nbl::hlsl::tan(input.tan); - asin = nbl::hlsl::asin(input.asin); - atan = nbl::hlsl::atan(input.atan); - sinh = nbl::hlsl::sinh(input.sinh); - cosh = nbl::hlsl::cosh(input.cosh); - tanh = nbl::hlsl::tanh(input.tanh); - asinh = nbl::hlsl::asinh(input.asinh); - acosh = nbl::hlsl::acosh(input.acosh); - atanh = nbl::hlsl::atanh(input.atanh); - atan2 = nbl::hlsl::atan2(input.atan2Y, input.atan2X); - erf = nbl::hlsl::erf(input.erf); - erfInv = nbl::hlsl::erfInv(input.erfInv); - acos = nbl::hlsl::acos(input.acos); - modf = nbl::hlsl::modf(input.modf); - round = nbl::hlsl::round(input.round); - roundEven = nbl::hlsl::roundEven(input.roundEven); - trunc = nbl::hlsl::trunc(input.trunc); - ceil = nbl::hlsl::ceil(input.ceil); - fma = nbl::hlsl::fma(input.fmaX, input.fmaY, input.fmaZ); - ldexp = nbl::hlsl::ldexp(input.ldexpArg, input.ldexpExp); - - floorVec = nbl::hlsl::floor(input.floorVec); - isnanVec = nbl::hlsl::isnan(input.isnanVec); - isinfVec = nbl::hlsl::isinf(input.isinfVec); - powVec = nbl::hlsl::pow(input.powXVec, input.powYVec); - expVec = nbl::hlsl::exp(input.expVec); - exp2Vec = nbl::hlsl::exp2(input.exp2Vec); - logVec = nbl::hlsl::log(input.logVec); - log2Vec = nbl::hlsl::log2(input.log2Vec); - absFVec = nbl::hlsl::abs(input.absFVec); - absIVec = nbl::hlsl::abs(input.absIVec); - sqrtVec = nbl::hlsl::sqrt(input.sqrtVec); - sinVec = nbl::hlsl::sin(input.sinVec); - cosVec = nbl::hlsl::cos(input.cosVec); - tanVec = nbl::hlsl::tan(input.tanVec); - asinVec = nbl::hlsl::asin(input.asinVec); - atanVec = nbl::hlsl::atan(input.atanVec); - sinhVec = nbl::hlsl::sinh(input.sinhVec); - coshVec = nbl::hlsl::cosh(input.coshVec); - tanhVec = nbl::hlsl::tanh(input.tanhVec); - asinhVec = nbl::hlsl::asinh(input.asinhVec); - acoshVec = nbl::hlsl::acosh(input.acoshVec); - atanhVec = nbl::hlsl::atanh(input.atanhVec); - atan2Vec = nbl::hlsl::atan2(input.atan2YVec, input.atan2XVec); - acosVec = nbl::hlsl::acos(input.acosVec); - modfVec = 
nbl::hlsl::modf(input.modfVec); - roundVec = nbl::hlsl::round(input.roundVec); - roundEvenVec = nbl::hlsl::roundEven(input.roundEvenVec); - truncVec = nbl::hlsl::trunc(input.truncVec); - ceilVec = nbl::hlsl::ceil(input.ceilVec); - fmaVec = nbl::hlsl::fma(input.fmaXVec, input.fmaYVec, input.fmaZVec); - ldexpVec = nbl::hlsl::ldexp(input.ldexpArgVec, input.ldexpExpVec); - erfVec = nbl::hlsl::erf(input.erfVec); - erfInvVec = nbl::hlsl::erfInv(input.erfInvVec); - - modfStruct = nbl::hlsl::modfStruct(input.modfStruct); - modfStructVec = nbl::hlsl::modfStruct(input.modfStructVec); - frexpStruct = nbl::hlsl::frexpStruct(input.frexpStruct); - frexpStructVec = nbl::hlsl::frexpStruct(input.frexpStructVec); - } -}; - -struct IntrinsicsIntputTestValues -{ - int bitCount; - float32_t3 crossLhs; - float32_t3 crossRhs; - float clampVal; - float clampMin; - float clampMax; - float32_t3 length; - float32_t3 normalize; - float32_t3 dotLhs; - float32_t3 dotRhs; - float32_t3x3 determinant; - uint32_t findMSB; - uint32_t findLSB; - float32_t3x3 inverse; - float32_t3x3 transpose; - float32_t3x3 mulLhs; - float32_t3x3 mulRhs; - float minA; - float minB; - float maxA; - float maxB; - float rsqrt; - uint32_t bitReverse; - float frac; - float mixX; - float mixY; - float mixA; - float sign; - float radians; - float degrees; - float stepEdge; - float stepX; - float smoothStepEdge0; - float smoothStepEdge1; - float smoothStepX; - - int32_t3 bitCountVec; - float32_t3 clampValVec; - float32_t3 clampMinVec; - float32_t3 clampMaxVec; - uint32_t3 findMSBVec; - uint32_t3 findLSBVec; - float32_t3 minAVec; - float32_t3 minBVec; - float32_t3 maxAVec; - float32_t3 maxBVec; - float32_t3 rsqrtVec; - uint32_t3 bitReverseVec; - float32_t3 fracVec; - float32_t3 mixXVec; - float32_t3 mixYVec; - float32_t3 mixAVec; - float32_t3 signVec; - float32_t3 radiansVec; - float32_t3 degreesVec; - float32_t3 stepEdgeVec; - float32_t3 stepXVec; - float32_t3 smoothStepEdge0Vec; - float32_t3 smoothStepEdge1Vec; - float32_t3 
smoothStepXVec; - float32_t3 faceForwardN; - float32_t3 faceForwardI; - float32_t3 faceForwardNref; - float32_t3 reflectI; - float32_t3 reflectN; - float32_t3 refractI; - float32_t3 refractN; - float refractEta; -}; - -struct IntrinsicsTestValues -{ - int bitCount; - float clamp; - float length; - float dot; - float determinant; - int findMSB; - int findLSB; - float min; - float max; - float rsqrt; - float frac; - uint32_t bitReverse; - float mix; - float sign; - float radians; - float degrees; - float step; - float smoothStep; - - float32_t3 normalize; - float32_t3 cross; - int32_t3 bitCountVec; - float32_t3 clampVec; - uint32_t3 findMSBVec; - uint32_t3 findLSBVec; - float32_t3 minVec; - float32_t3 maxVec; - float32_t3 rsqrtVec; - uint32_t3 bitReverseVec; - float32_t3 fracVec; - float32_t3 mixVec; - float32_t3 signVec; - float32_t3 radiansVec; - float32_t3 degreesVec; - float32_t3 stepVec; - float32_t3 smoothStepVec; - float32_t3 faceForward; - float32_t3 reflect; - float32_t3 refract; - - float32_t3x3 mul; - float32_t3x3 transpose; - float32_t3x3 inverse; - - void fillTestValues(NBL_CONST_REF_ARG(IntrinsicsIntputTestValues) input) - { - bitCount = nbl::hlsl::bitCount(input.bitCount); - cross = nbl::hlsl::cross(input.crossLhs, input.crossRhs); - clamp = nbl::hlsl::clamp(input.clampVal, input.clampMin, input.clampMax); - length = nbl::hlsl::length(input.length); - normalize = nbl::hlsl::normalize(input.normalize); - dot = nbl::hlsl::dot(input.dotLhs, input.dotRhs); - determinant = nbl::hlsl::determinant(input.determinant); - findMSB = nbl::hlsl::findMSB(input.findMSB); - findLSB = nbl::hlsl::findLSB(input.findLSB); - inverse = nbl::hlsl::inverse(input.inverse); - transpose = nbl::hlsl::transpose(input.transpose); - mul = nbl::hlsl::mul(input.mulLhs, input.mulRhs); - // TODO: fix min and max - min = nbl::hlsl::min(input.minA, input.minB); - max = nbl::hlsl::max(input.maxA, input.maxB); - rsqrt = nbl::hlsl::rsqrt(input.rsqrt); - bitReverse = 
nbl::hlsl::bitReverse(input.bitReverse); - frac = nbl::hlsl::fract(input.frac); - mix = nbl::hlsl::mix(input.mixX, input.mixY, input.mixA); - sign = nbl::hlsl::sign(input.sign); - radians = nbl::hlsl::radians(input.radians); - degrees = nbl::hlsl::degrees(input.degrees); - step = nbl::hlsl::step(input.stepEdge, input.stepX); - smoothStep = nbl::hlsl::smoothStep(input.smoothStepEdge0, input.smoothStepEdge1, input.smoothStepX); - - bitCountVec = nbl::hlsl::bitCount(input.bitCountVec); - clampVec = nbl::hlsl::clamp(input.clampValVec, input.clampMinVec, input.clampMaxVec); - findMSBVec = nbl::hlsl::findMSB(input.findMSBVec); - findLSBVec = nbl::hlsl::findLSB(input.findLSBVec); - // TODO: fix min and max - minVec = nbl::hlsl::min(input.minAVec, input.minBVec); - maxVec = nbl::hlsl::max(input.maxAVec, input.maxBVec); - rsqrtVec = nbl::hlsl::rsqrt(input.rsqrtVec); - bitReverseVec = nbl::hlsl::bitReverse(input.bitReverseVec); - fracVec = nbl::hlsl::fract(input.fracVec); - mixVec = nbl::hlsl::mix(input.mixXVec, input.mixYVec, input.mixAVec); - - signVec = nbl::hlsl::sign(input.signVec); - radiansVec = nbl::hlsl::radians(input.radiansVec); - degreesVec = nbl::hlsl::degrees(input.degreesVec); - stepVec = nbl::hlsl::step(input.stepEdgeVec, input.stepXVec); - smoothStepVec = nbl::hlsl::smoothStep(input.smoothStepEdge0Vec, input.smoothStepEdge1Vec, input.smoothStepXVec); - faceForward = nbl::hlsl::faceForward(input.faceForwardN, input.faceForwardI, input.faceForwardNref); - reflect = nbl::hlsl::reflect(input.reflectI, input.reflectN); - refract = nbl::hlsl::refract(input.refractI, input.refractN, input.refractEta); - } -}; - -#endif + float32_t3 atan2Vec; + float32_t3 erfVec; + float32_t3 erfInvVec; + + ModfOutput modfStruct; + ModfOutput modfStructVec; + FrexpOutput frexpStruct; + FrexpOutput frexpStructVec; + + void fillTestValues(NBL_CONST_REF_ARG(TgmathIntputTestValues) input) + { + floor = nbl::hlsl::floor(input.floor); + isnan = nbl::hlsl::isnan(input.isnan); + isinf = 
nbl::hlsl::isinf(input.isinf); + pow = nbl::hlsl::pow(input.powX, input.powY); + exp = nbl::hlsl::exp(input.exp); + exp2 = nbl::hlsl::exp2(input.exp2); + log = nbl::hlsl::log(input.log); + log2 = nbl::hlsl::log2(input.log2); + absF = nbl::hlsl::abs(input.absF); + absI = nbl::hlsl::abs(input.absI); + sqrt = nbl::hlsl::sqrt(input.sqrt); + sin = nbl::hlsl::sin(input.sin); + cos = nbl::hlsl::cos(input.cos); + tan = nbl::hlsl::tan(input.tan); + asin = nbl::hlsl::asin(input.asin); + atan = nbl::hlsl::atan(input.atan); + sinh = nbl::hlsl::sinh(input.sinh); + cosh = nbl::hlsl::cosh(input.cosh); + tanh = nbl::hlsl::tanh(input.tanh); + asinh = nbl::hlsl::asinh(input.asinh); + acosh = nbl::hlsl::acosh(input.acosh); + atanh = nbl::hlsl::atanh(input.atanh); + atan2 = nbl::hlsl::atan2(input.atan2Y, input.atan2X); + erf = nbl::hlsl::erf(input.erf); + erfInv = nbl::hlsl::erfInv(input.erfInv); + acos = nbl::hlsl::acos(input.acos); + modf = nbl::hlsl::modf(input.modf); + round = nbl::hlsl::round(input.round); + roundEven = nbl::hlsl::roundEven(input.roundEven); + trunc = nbl::hlsl::trunc(input.trunc); + ceil = nbl::hlsl::ceil(input.ceil); + fma = nbl::hlsl::fma(input.fmaX, input.fmaY, input.fmaZ); + ldexp = nbl::hlsl::ldexp(input.ldexpArg, input.ldexpExp); + + floorVec = nbl::hlsl::floor(input.floorVec); + isnanVec = nbl::hlsl::isnan(input.isnanVec); + isinfVec = nbl::hlsl::isinf(input.isinfVec); + powVec = nbl::hlsl::pow(input.powXVec, input.powYVec); + expVec = nbl::hlsl::exp(input.expVec); + exp2Vec = nbl::hlsl::exp2(input.exp2Vec); + logVec = nbl::hlsl::log(input.logVec); + log2Vec = nbl::hlsl::log2(input.log2Vec); + absFVec = nbl::hlsl::abs(input.absFVec); + absIVec = nbl::hlsl::abs(input.absIVec); + sqrtVec = nbl::hlsl::sqrt(input.sqrtVec); + sinVec = nbl::hlsl::sin(input.sinVec); + cosVec = nbl::hlsl::cos(input.cosVec); + tanVec = nbl::hlsl::tan(input.tanVec); + asinVec = nbl::hlsl::asin(input.asinVec); + atanVec = nbl::hlsl::atan(input.atanVec); + sinhVec = 
nbl::hlsl::sinh(input.sinhVec); + coshVec = nbl::hlsl::cosh(input.coshVec); + tanhVec = nbl::hlsl::tanh(input.tanhVec); + asinhVec = nbl::hlsl::asinh(input.asinhVec); + acoshVec = nbl::hlsl::acosh(input.acoshVec); + atanhVec = nbl::hlsl::atanh(input.atanhVec); + atan2Vec = nbl::hlsl::atan2(input.atan2YVec, input.atan2XVec); + acosVec = nbl::hlsl::acos(input.acosVec); + modfVec = nbl::hlsl::modf(input.modfVec); + roundVec = nbl::hlsl::round(input.roundVec); + roundEvenVec = nbl::hlsl::roundEven(input.roundEvenVec); + truncVec = nbl::hlsl::trunc(input.truncVec); + ceilVec = nbl::hlsl::ceil(input.ceilVec); + fmaVec = nbl::hlsl::fma(input.fmaXVec, input.fmaYVec, input.fmaZVec); + ldexpVec = nbl::hlsl::ldexp(input.ldexpArgVec, input.ldexpExpVec); + erfVec = nbl::hlsl::erf(input.erfVec); + erfInvVec = nbl::hlsl::erfInv(input.erfInvVec); + + modfStruct = nbl::hlsl::modfStruct(input.modfStruct); + modfStructVec = nbl::hlsl::modfStruct(input.modfStructVec); + frexpStruct = nbl::hlsl::frexpStruct(input.frexpStruct); + frexpStructVec = nbl::hlsl::frexpStruct(input.frexpStructVec); + } +}; + +struct IntrinsicsIntputTestValues +{ + int bitCount; + float32_t3 crossLhs; + float32_t3 crossRhs; + float clampVal; + float clampMin; + float clampMax; + float32_t3 length; + float32_t3 normalize; + float32_t3 dotLhs; + float32_t3 dotRhs; + float32_t3x3 determinant; + uint32_t findMSB; + uint32_t findLSB; + float32_t3x3 inverse; + float32_t3x3 transpose; + float32_t3x3 mulLhs; + float32_t3x3 mulRhs; + float minA; + float minB; + float maxA; + float maxB; + float rsqrt; + uint32_t bitReverse; + float frac; + float mixX; + float mixY; + float mixA; + float sign; + float radians; + float degrees; + float stepEdge; + float stepX; + float smoothStepEdge0; + float smoothStepEdge1; + float smoothStepX; + uint32_t addCarryA; + uint32_t addCarryB; + uint32_t subBorrowA; + uint32_t subBorrowB; + + int32_t3 bitCountVec; + float32_t3 clampValVec; + float32_t3 clampMinVec; + float32_t3 clampMaxVec; + 
uint32_t3 findMSBVec; + uint32_t3 findLSBVec; + float32_t3 minAVec; + float32_t3 minBVec; + float32_t3 maxAVec; + float32_t3 maxBVec; + float32_t3 rsqrtVec; + uint32_t3 bitReverseVec; + float32_t3 fracVec; + float32_t3 mixXVec; + float32_t3 mixYVec; + float32_t3 mixAVec; + float32_t3 signVec; + float32_t3 radiansVec; + float32_t3 degreesVec; + float32_t3 stepEdgeVec; + float32_t3 stepXVec; + float32_t3 smoothStepEdge0Vec; + float32_t3 smoothStepEdge1Vec; + float32_t3 smoothStepXVec; + float32_t3 faceForwardN; + float32_t3 faceForwardI; + float32_t3 faceForwardNref; + float32_t3 reflectI; + float32_t3 reflectN; + float32_t3 refractI; + float32_t3 refractN; + float refractEta; + uint32_t3 addCarryAVec; + uint32_t3 addCarryBVec; + uint32_t3 subBorrowAVec; + uint32_t3 subBorrowBVec; +}; + +struct IntrinsicsTestValues +{ + int bitCount; + float clamp; + float length; + float dot; + float determinant; + int findMSB; + int findLSB; + float min; + float max; + float rsqrt; + float frac; + uint32_t bitReverse; + float mix; + float sign; + float radians; + float degrees; + float step; + float smoothStep; + + float32_t3 normalize; + float32_t3 cross; + int32_t3 bitCountVec; + float32_t3 clampVec; + uint32_t3 findMSBVec; + uint32_t3 findLSBVec; + float32_t3 minVec; + float32_t3 maxVec; + float32_t3 rsqrtVec; + uint32_t3 bitReverseVec; + float32_t3 fracVec; + float32_t3 mixVec; + float32_t3 signVec; + float32_t3 radiansVec; + float32_t3 degreesVec; + float32_t3 stepVec; + float32_t3 smoothStepVec; + float32_t3 faceForward; + float32_t3 reflect; + float32_t3 refract; + + float32_t3x3 mul; + float32_t3x3 transpose; + float32_t3x3 inverse; + + spirv::AddCarryOutput addCarry; + spirv::SubBorrowOutput subBorrow; + spirv::AddCarryOutput addCarryVec; + spirv::SubBorrowOutput subBorrowVec; + + void fillTestValues(NBL_CONST_REF_ARG(IntrinsicsIntputTestValues) input) + { + bitCount = nbl::hlsl::bitCount(input.bitCount); + cross = nbl::hlsl::cross(input.crossLhs, input.crossRhs); + clamp 
= nbl::hlsl::clamp(input.clampVal, input.clampMin, input.clampMax); + length = nbl::hlsl::length(input.length); + normalize = nbl::hlsl::normalize(input.normalize); + dot = nbl::hlsl::dot(input.dotLhs, input.dotRhs); + determinant = nbl::hlsl::determinant(input.determinant); + findMSB = nbl::hlsl::findMSB(input.findMSB); + findLSB = nbl::hlsl::findLSB(input.findLSB); + inverse = nbl::hlsl::inverse(input.inverse); + transpose = nbl::hlsl::transpose(input.transpose); + mul = nbl::hlsl::mul(input.mulLhs, input.mulRhs); + // TODO: fix min and max + min = nbl::hlsl::min(input.minA, input.minB); + max = nbl::hlsl::max(input.maxA, input.maxB); + rsqrt = nbl::hlsl::rsqrt(input.rsqrt); + bitReverse = nbl::hlsl::bitReverse(input.bitReverse); + frac = nbl::hlsl::fract(input.frac); + mix = nbl::hlsl::mix(input.mixX, input.mixY, input.mixA); + sign = nbl::hlsl::sign(input.sign); + radians = nbl::hlsl::radians(input.radians); + degrees = nbl::hlsl::degrees(input.degrees); + step = nbl::hlsl::step(input.stepEdge, input.stepX); + smoothStep = nbl::hlsl::smoothStep(input.smoothStepEdge0, input.smoothStepEdge1, input.smoothStepX); + + bitCountVec = nbl::hlsl::bitCount(input.bitCountVec); + clampVec = nbl::hlsl::clamp(input.clampValVec, input.clampMinVec, input.clampMaxVec); + findMSBVec = nbl::hlsl::findMSB(input.findMSBVec); + findLSBVec = nbl::hlsl::findLSB(input.findLSBVec); + // TODO: fix min and max + minVec = nbl::hlsl::min(input.minAVec, input.minBVec); + maxVec = nbl::hlsl::max(input.maxAVec, input.maxBVec); + rsqrtVec = nbl::hlsl::rsqrt(input.rsqrtVec); + bitReverseVec = nbl::hlsl::bitReverse(input.bitReverseVec); + fracVec = nbl::hlsl::fract(input.fracVec); + mixVec = nbl::hlsl::mix(input.mixXVec, input.mixYVec, input.mixAVec); + + signVec = nbl::hlsl::sign(input.signVec); + radiansVec = nbl::hlsl::radians(input.radiansVec); + degreesVec = nbl::hlsl::degrees(input.degreesVec); + stepVec = nbl::hlsl::step(input.stepEdgeVec, input.stepXVec); + smoothStepVec = 
nbl::hlsl::smoothStep(input.smoothStepEdge0Vec, input.smoothStepEdge1Vec, input.smoothStepXVec); + faceForward = nbl::hlsl::faceForward(input.faceForwardN, input.faceForwardI, input.faceForwardNref); + reflect = nbl::hlsl::reflect(input.reflectI, input.reflectN); + refract = nbl::hlsl::refract(input.refractI, input.refractN, input.refractEta); + addCarry = nbl::hlsl::addCarry(input.addCarryA, input.addCarryB); + subBorrow = nbl::hlsl::subBorrow(input.subBorrowA, input.subBorrowB); + addCarryVec = nbl::hlsl::addCarry(input.addCarryAVec, input.addCarryBVec); + subBorrowVec = nbl::hlsl::subBorrow(input.subBorrowAVec, input.subBorrowBVec); + } +}; + +#endif From f00bbf6fa914ec230df8a000deee75aee69cdce9 Mon Sep 17 00:00:00 2001 From: Fletterio Date: Mon, 7 Apr 2025 19:48:46 -0300 Subject: [PATCH 6/9] Disable intrinsic tests for uSUbBorrow for the time being, start copying 22_CppCOmpat to run tests --- 12_Mortons/Tester.h | 417 +++++++++++++++++++++++++++ 12_Mortons/app_resources/common.hlsl | 38 ++- 12_Mortons/app_resources/shader.hlsl | 18 -- 12_Mortons/main.cpp | 198 +------------ 22_CppCompat/CIntrinsicsTester.h | 22 +- 5 files changed, 474 insertions(+), 219 deletions(-) create mode 100644 12_Mortons/Tester.h delete mode 100644 12_Mortons/app_resources/shader.hlsl diff --git a/12_Mortons/Tester.h b/12_Mortons/Tester.h new file mode 100644 index 000000000..5c4773111 --- /dev/null +++ b/12_Mortons/Tester.h @@ -0,0 +1,417 @@ +#ifndef _NBL_EXAMPLES_TESTS_12_MORTONS_I_TESTER_INCLUDED_ +#define _NBL_EXAMPLES_TESTS_12_MORTONS_I_TESTER_INCLUDED_ + +#include +#include "app_resources/common.hlsl" +#include "nbl/application_templates/MonoDeviceApplication.hpp" +#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" + +using namespace nbl; + +class Tester +{ +public: + virtual ~Tester() + { + m_outputBufferAllocation.memory->unmap(); + }; + + struct PipelineSetupData + { + std::string testShaderPath; + + core::smart_refctd_ptr device; + 
core::smart_refctd_ptr api; + core::smart_refctd_ptr assetMgr; + core::smart_refctd_ptr logger; + video::IPhysicalDevice* physicalDevice; + uint32_t computeFamilyIndex; + }; + + template + void setupPipeline(const PipelineSetupData& pipleineSetupData) + { + // setting up pipeline in the constructor + m_device = core::smart_refctd_ptr(pipleineSetupData.device); + m_physicalDevice = pipleineSetupData.physicalDevice; + m_api = core::smart_refctd_ptr(pipleineSetupData.api); + m_assetMgr = core::smart_refctd_ptr(pipleineSetupData.assetMgr); + m_logger = core::smart_refctd_ptr(pipleineSetupData.logger); + m_queueFamily = pipleineSetupData.computeFamilyIndex; + m_semaphoreCounter = 0; + m_semaphore = m_device->createSemaphore(0); + m_cmdpool = m_device->createCommandPool(m_queueFamily, video::IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + if (!m_cmdpool->createCommandBuffers(video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_cmdbuf)) + logFail("Failed to create Command Buffers!\n"); + + // Load shaders, set up pipeline + core::smart_refctd_ptr shader; + { + asset::IAssetLoader::SAssetLoadParams lp = {}; + lp.logger = m_logger.get(); + lp.workingDirectory = ""; // virtual root + auto assetBundle = m_assetMgr->getAsset(pipleineSetupData.testShaderPath, lp); + const auto assets = assetBundle.getContents(); + if (assets.empty()) + { + logFail("Could not load shader!"); + assert(0); + } + + // It would be super weird if loading a shader from a file produced more than 1 asset + assert(assets.size() == 1); + core::smart_refctd_ptr source = asset::IAsset::castDown(assets[0]); + + auto* compilerSet = m_assetMgr->getCompilerSet(); + + asset::IShaderCompiler::SCompilerOptions options = {}; + options.stage = source->getStage(); + options.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion; + options.spirvOptimizer = nullptr; + options.debugInfoFlags |= asset::IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_SOURCE_BIT; + 
options.preprocessorOptions.sourceIdentifier = source->getFilepathHint(); + options.preprocessorOptions.logger = m_logger.get(); + options.preprocessorOptions.includeFinder = compilerSet->getShaderCompiler(source->getContentType())->getDefaultIncludeFinder(); + + auto spirv = compilerSet->compileToSPIRV(source.get(), options); + + video::ILogicalDevice::SShaderCreationParameters params{}; + params.cpushader = spirv.get(); + shader = m_device->createShader(params); + } + + if (!shader) + logFail("Failed to create a GPU Shader, seems the Driver doesn't like the SPIR-V we're feeding it!\n"); + + video::IGPUDescriptorSetLayout::SBinding bindings[2] = { + { + .binding = 0, + .type = asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, + .createFlags = video::IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = ShaderStage::ESS_COMPUTE, + .count = 1 + }, + { + .binding = 1, + .type = asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, + .createFlags = video::IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = ShaderStage::ESS_COMPUTE, + .count = 1 + } + }; + + core::smart_refctd_ptr dsLayout = m_device->createDescriptorSetLayout(bindings); + if (!dsLayout) + logFail("Failed to create a Descriptor Layout!\n"); + + m_pplnLayout = m_device->createPipelineLayout({}, core::smart_refctd_ptr(dsLayout)); + if (!m_pplnLayout) + logFail("Failed to create a Pipeline Layout!\n"); + + { + video::IGPUComputePipeline::SCreationParams params = {}; + params.layout = m_pplnLayout.get(); + params.shader.entryPoint = "main"; + params.shader.shader = shader.get(); + if (!m_device->createComputePipelines(nullptr, { ¶ms,1 }, &m_pipeline)) + logFail("Failed to create pipelines (compile & link shaders)!\n"); + } + + // Allocate memory of the input buffer + { + constexpr size_t BufferSize = sizeof(InputStruct); + + video::IGPUBuffer::SCreationParams params = {}; + params.size = BufferSize; + params.usage = video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT; + 
core::smart_refctd_ptr inputBuff = m_device->createBuffer(std::move(params)); + if (!inputBuff) + logFail("Failed to create a GPU Buffer of size %d!\n", params.size); + + inputBuff->setObjectDebugName("emulated_float64_t output buffer"); + + video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = inputBuff->getMemoryReqs(); + reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits(); + + m_inputBufferAllocation = m_device->allocate(reqs, inputBuff.get(), video::IDeviceMemoryAllocation::EMAF_NONE); + if (!m_inputBufferAllocation.isValid()) + logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n"); + + assert(inputBuff->getBoundMemory().memory == m_inputBufferAllocation.memory.get()); + core::smart_refctd_ptr pool = m_device->createDescriptorPoolForDSLayouts(video::IDescriptorPool::ECF_NONE, { &dsLayout.get(),1 }); + + m_ds = pool->createDescriptorSet(core::smart_refctd_ptr(dsLayout)); + { + video::IGPUDescriptorSet::SDescriptorInfo info[1]; + info[0].desc = core::smart_refctd_ptr(inputBuff); + info[0].info.buffer = { .offset = 0,.size = BufferSize }; + video::IGPUDescriptorSet::SWriteDescriptorSet writes[1] = { + {.dstSet = m_ds.get(),.binding = 0,.arrayElement = 0,.count = 1,.info = info} + }; + m_device->updateDescriptorSets(writes, {}); + } + } + + // Allocate memory of the output buffer + { + constexpr size_t BufferSize = sizeof(OutputStruct); + + video::IGPUBuffer::SCreationParams params = {}; + params.size = BufferSize; + params.usage = video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT; + core::smart_refctd_ptr outputBuff = m_device->createBuffer(std::move(params)); + if (!outputBuff) + logFail("Failed to create a GPU Buffer of size %d!\n", params.size); + + outputBuff->setObjectDebugName("emulated_float64_t output buffer"); + + video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = outputBuff->getMemoryReqs(); + reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits(); + + m_outputBufferAllocation = 
m_device->allocate(reqs, outputBuff.get(), video::IDeviceMemoryAllocation::EMAF_NONE); + if (!m_outputBufferAllocation.isValid()) + logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n"); + + assert(outputBuff->getBoundMemory().memory == m_outputBufferAllocation.memory.get()); + core::smart_refctd_ptr pool = m_device->createDescriptorPoolForDSLayouts(video::IDescriptorPool::ECF_NONE, { &dsLayout.get(),1 }); + + { + video::IGPUDescriptorSet::SDescriptorInfo info[1]; + info[0].desc = core::smart_refctd_ptr(outputBuff); + info[0].info.buffer = { .offset = 0,.size = BufferSize }; + video::IGPUDescriptorSet::SWriteDescriptorSet writes[1] = { + {.dstSet = m_ds.get(),.binding = 1,.arrayElement = 0,.count = 1,.info = info} + }; + m_device->updateDescriptorSets(writes, {}); + } + } + + if (!m_outputBufferAllocation.memory->map({ 0ull,m_outputBufferAllocation.memory->getAllocationSize() }, video::IDeviceMemoryAllocation::EMCAF_READ)) + logFail("Failed to map the Device Memory!\n"); + + // if the mapping is not coherent the range needs to be invalidated to pull in new data for the CPU's caches + const video::ILogicalDevice::MappedMemoryRange memoryRange(m_outputBufferAllocation.memory.get(), 0ull, m_outputBufferAllocation.memory->getAllocationSize()); + if (!m_outputBufferAllocation.memory->getMemoryPropertyFlags().hasFlags(video::IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + m_device->invalidateMappedMemoryRanges(1, &memoryRange); + + assert(memoryRange.valid() && memoryRange.length >= sizeof(OutputStruct)); + + m_queue = m_device->getQueue(m_queueFamily, 0); + } + + enum class TestType + { + CPU, + GPU + }; + + template + void verifyTestValue(const std::string& memberName, const T& expectedVal, const T& testVal, const TestType testType) + { + static constexpr float MaxAllowedError = 0.1f; + if (std::abs(double(expectedVal) - double(testVal)) <= MaxAllowedError) + return; + + std::stringstream ss; + switch (testType) + { + case TestType::CPU: + 
ss << "CPU TEST ERROR:\n"; + case TestType::GPU: + ss << "GPU TEST ERROR:\n"; + } + + ss << "nbl::hlsl::" << memberName << " produced incorrect output! test value: " << testVal << " expected value: " << expectedVal << '\n'; + + m_logger->log(ss.str().c_str(), system::ILogger::ELL_ERROR); + } + + template + void verifyTestVector3dValue(const std::string& memberName, const nbl::hlsl::vector& expectedVal, const nbl::hlsl::vector& testVal, const TestType testType) + { + static constexpr float MaxAllowedError = 0.1f; + if (std::abs(double(expectedVal.x) - double(testVal.x)) <= MaxAllowedError && + std::abs(double(expectedVal.y) - double(testVal.y)) <= MaxAllowedError && + std::abs(double(expectedVal.z) - double(testVal.z)) <= MaxAllowedError) + return; + + std::stringstream ss; + switch (testType) + { + case TestType::CPU: + ss << "CPU TEST ERROR:\n"; + case TestType::GPU: + ss << "GPU TEST ERROR:\n"; + } + + ss << "nbl::hlsl::" << memberName << " produced incorrect output! test value: " << + testVal.x << ' ' << testVal.y << ' ' << testVal.z << + " expected value: " << expectedVal.x << ' ' << expectedVal.y << ' ' << expectedVal.z << '\n'; + + m_logger->log(ss.str().c_str(), system::ILogger::ELL_ERROR); + } + + template + void verifyTestMatrix3x3Value(const std::string& memberName, const nbl::hlsl::matrix& expectedVal, const nbl::hlsl::matrix& testVal, const TestType testType) + { + for (int i = 0; i < 3; ++i) + { + auto expectedValRow = expectedVal[i]; + auto testValRow = testVal[i]; + verifyTestVector3dValue(memberName, expectedValRow, testValRow, testType); + } + } + + void performTests() + { + m_logger->log("intrinsics.hlsl TESTS:", system::ILogger::ELL_PERFORMANCE); + for (int i = 0; i < Iterations; ++i) + { + // Set input thest values that will be used in both CPU and GPU tests + InputTestValues testInput; + + // use std library or glm functions to determine expected test values, the output of functions from intrinsics.hlsl will be verified against these values + 
TestValues expected; + + performCpuTests(testInput, expected); + performGpuTests(testInput, expected); + } + m_logger->log("intrinsics.hlsl TESTS DONE.", system::ILogger::ELL_PERFORMANCE); + } + +protected: + uint32_t m_queueFamily; + core::smart_refctd_ptr m_device; + core::smart_refctd_ptr m_api; + video::IPhysicalDevice* m_physicalDevice; + core::smart_refctd_ptr m_assetMgr; + core::smart_refctd_ptr m_logger; + video::IDeviceMemoryAllocator::SAllocation m_inputBufferAllocation = {}; + video::IDeviceMemoryAllocator::SAllocation m_outputBufferAllocation = {}; + core::smart_refctd_ptr m_cmdbuf = nullptr; + core::smart_refctd_ptr m_cmdpool = nullptr; + core::smart_refctd_ptr m_ds = nullptr; + core::smart_refctd_ptr m_pplnLayout = nullptr; + core::smart_refctd_ptr m_pipeline; + core::smart_refctd_ptr m_semaphore; + video::IQueue* m_queue; + uint64_t m_semaphoreCounter; + + template + OutputStruct dispatch(const InputStruct& input) + { + // Update input buffer + if (!m_inputBufferAllocation.memory->map({ 0ull,m_inputBufferAllocation.memory->getAllocationSize() }, video::IDeviceMemoryAllocation::EMCAF_READ)) + logFail("Failed to map the Device Memory!\n"); + + const video::ILogicalDevice::MappedMemoryRange memoryRange(m_inputBufferAllocation.memory.get(), 0ull, m_inputBufferAllocation.memory->getAllocationSize()); + if (!m_inputBufferAllocation.memory->getMemoryPropertyFlags().hasFlags(video::IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + m_device->invalidateMappedMemoryRanges(1, &memoryRange); + + std::memcpy(static_cast(m_inputBufferAllocation.memory->getMappedPointer()), &input, sizeof(InputStruct)); + + m_inputBufferAllocation.memory->unmap(); + + // record command buffer + m_cmdbuf->reset(video::IGPUCommandBuffer::RESET_FLAGS::NONE); + m_cmdbuf->begin(video::IGPUCommandBuffer::USAGE::NONE); + m_cmdbuf->beginDebugMarker("test", core::vector4df_SIMD(0, 1, 0, 1)); + m_cmdbuf->bindComputePipeline(m_pipeline.get()); + 
m_cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE, m_pplnLayout.get(), 0, 1, &m_ds.get()); + m_cmdbuf->dispatch(1, 1, 1); + m_cmdbuf->endDebugMarker(); + m_cmdbuf->end(); + + video::IQueue::SSubmitInfo submitInfos[1] = {}; + const video::IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[] = { {.cmdbuf = m_cmdbuf.get()} }; + submitInfos[0].commandBuffers = cmdbufs; + const video::IQueue::SSubmitInfo::SSemaphoreInfo signals[] = { {.semaphore = m_semaphore.get(), .value = ++m_semaphoreCounter, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT} }; + submitInfos[0].signalSemaphores = signals; + + m_api->startCapture(); + m_queue->submit(submitInfos); + m_api->endCapture(); + + m_device->waitIdle(); + OutputStruct output; + std::memcpy(&output, static_cast(m_outputBufferAllocation.memory->getMappedPointer()), sizeof(OutputStruct)); + m_device->waitIdle(); + + return output; + } + +private: + template + inline void logFail(const char* msg, Args&&... args) + { + m_logger->log(msg, system::ILogger::ELL_ERROR, std::forward(args)...); + exit(-1); + } + + inline static constexpr int Iterations = 100u; + + void performCpuTests(const InputTestValues& commonTestInputValues, const TestValues& expectedTestValues) + { + TestValues cpuTestValues; + cpuTestValues.fillTestValues(commonTestInputValues); + verifyTestValues(expectedTestValues, cpuTestValues, ITester::TestType::CPU); + + } + + void performGpuTests(const InputTestValues& commonTestInputValues, const TestValues& expectedTestValues) + { + TestValues gpuTestValues; + gpuTestValues = dispatch(commonTestInputValues); + verifyTestValues(expectedTestValues, gpuTestValues, ITester::TestType::GPU); + } + + void verifyTestValues(const TestValues& expectedTestValues, const TestValues& testValues, ITester::TestType testType) + { + verifyTestValue("bitCount", expectedTestValues.bitCount, testValues.bitCount, testType); + verifyTestValue("clamp", expectedTestValues.clamp, testValues.clamp, testType); + 
verifyTestValue("length", expectedTestValues.length, testValues.length, testType); + verifyTestValue("dot", expectedTestValues.dot, testValues.dot, testType); + verifyTestValue("determinant", expectedTestValues.determinant, testValues.determinant, testType); + verifyTestValue("findMSB", expectedTestValues.findMSB, testValues.findMSB, testType); + verifyTestValue("findLSB", expectedTestValues.findLSB, testValues.findLSB, testType); + verifyTestValue("min", expectedTestValues.min, testValues.min, testType); + verifyTestValue("max", expectedTestValues.max, testValues.max, testType); + verifyTestValue("rsqrt", expectedTestValues.rsqrt, testValues.rsqrt, testType); + verifyTestValue("frac", expectedTestValues.frac, testValues.frac, testType); + verifyTestValue("bitReverse", expectedTestValues.bitReverse, testValues.bitReverse, testType); + verifyTestValue("mix", expectedTestValues.mix, testValues.mix, testType); + verifyTestValue("sign", expectedTestValues.sign, testValues.sign, testType); + verifyTestValue("radians", expectedTestValues.radians, testValues.radians, testType); + verifyTestValue("degrees", expectedTestValues.degrees, testValues.degrees, testType); + verifyTestValue("step", expectedTestValues.step, testValues.step, testType); + verifyTestValue("smoothStep", expectedTestValues.smoothStep, testValues.smoothStep, testType); + + verifyTestVector3dValue("normalize", expectedTestValues.normalize, testValues.normalize, testType); + verifyTestVector3dValue("cross", expectedTestValues.cross, testValues.cross, testType); + verifyTestVector3dValue("bitCountVec", expectedTestValues.bitCountVec, testValues.bitCountVec, testType); + verifyTestVector3dValue("clampVec", expectedTestValues.clampVec, testValues.clampVec, testType); + verifyTestVector3dValue("findMSBVec", expectedTestValues.findMSBVec, testValues.findMSBVec, testType); + verifyTestVector3dValue("findLSBVec", expectedTestValues.findLSBVec, testValues.findLSBVec, testType); + verifyTestVector3dValue("minVec", 
expectedTestValues.minVec, testValues.minVec, testType); + verifyTestVector3dValue("maxVec", expectedTestValues.maxVec, testValues.maxVec, testType); + verifyTestVector3dValue("rsqrtVec", expectedTestValues.rsqrtVec, testValues.rsqrtVec, testType); + verifyTestVector3dValue("bitReverseVec", expectedTestValues.bitReverseVec, testValues.bitReverseVec, testType); + verifyTestVector3dValue("fracVec", expectedTestValues.fracVec, testValues.fracVec, testType); + verifyTestVector3dValue("mixVec", expectedTestValues.mixVec, testValues.mixVec, testType); + + verifyTestVector3dValue("signVec", expectedTestValues.signVec, testValues.signVec, testType); + verifyTestVector3dValue("radiansVec", expectedTestValues.radiansVec, testValues.radiansVec, testType); + verifyTestVector3dValue("degreesVec", expectedTestValues.degreesVec, testValues.degreesVec, testType); + verifyTestVector3dValue("stepVec", expectedTestValues.stepVec, testValues.stepVec, testType); + verifyTestVector3dValue("smoothStepVec", expectedTestValues.smoothStepVec, testValues.smoothStepVec, testType); + verifyTestVector3dValue("faceForward", expectedTestValues.faceForward, testValues.faceForward, testType); + verifyTestVector3dValue("reflect", expectedTestValues.reflect, testValues.reflect, testType); + verifyTestVector3dValue("refract", expectedTestValues.refract, testValues.refract, testType); + + verifyTestMatrix3x3Value("mul", expectedTestValues.mul, testValues.mul, testType); + verifyTestMatrix3x3Value("transpose", expectedTestValues.transpose, testValues.transpose, testType); + verifyTestMatrix3x3Value("inverse", expectedTestValues.inverse, testValues.inverse, testType); + } +}; + +#endif \ No newline at end of file diff --git a/12_Mortons/app_resources/common.hlsl b/12_Mortons/app_resources/common.hlsl index bd5184f80..9632bd372 100644 --- a/12_Mortons/app_resources/common.hlsl +++ b/12_Mortons/app_resources/common.hlsl @@ -1,13 +1,33 @@ -//#include "nbl/builtin/hlsl/morton.hlsl" -#include 
"nbl/builtin/hlsl/cpp_compat.hlsl" +//// Copyright (C) 2023-2024 - DevSH Graphics Programming Sp. z O.O. +//// This file is part of the "Nabla Engine". +//// For conditions of distribution and use, see copyright notice in nabla.h -NBL_CONSTEXPR uint32_t bufferSize = 256; +#ifndef _NBL_EXAMPLES_TESTS_12_MORTON_COMMON_INCLUDED_ +#define _NBL_EXAMPLES_TESTS_12_MORTON_COMMON_INCLUDED_ -// Proper coverage would require writing tests for ALL possible sign, dimensions and width configurations -//using morton_t2 = nbl::hlsl::morton::code; // Fits in an int16_t -using vector_t2 = nbl::hlsl::vector; +// because DXC doesn't properly support `_Static_assert` +// TODO: add a message, and move to macros.h or cpp_compat +#define STATIC_ASSERT(...) { nbl::hlsl::conditional<__VA_ARGS__, int, void>::type a = 0; } -struct PushConstantData +#include + +#include + +// tgmath.hlsl and intrinsics.hlsl tests + +using namespace nbl::hlsl; +struct InputTestValues +{ + +}; + +struct TestValues { - uint64_t deviceBufferAddress; -}; \ No newline at end of file + + void fillTestValues(NBL_CONST_REF_ARG(InputTestValues) input) + { + + } +}; + +#endif diff --git a/12_Mortons/app_resources/shader.hlsl b/12_Mortons/app_resources/shader.hlsl deleted file mode 100644 index e7f570eee..000000000 --- a/12_Mortons/app_resources/shader.hlsl +++ /dev/null @@ -1,18 +0,0 @@ -#include "app_resources/common.hlsl" -#include "nbl/builtin/hlsl/bda/legacy_bda_accessor.hlsl" - -[[vk::push_constant]] PushConstantData pushConstants; - -[numthreads(bufferSize, 1, 1)] -void main(uint32_t3 ID : SV_DispatchThreadID) -{ - /* - LegacyBdaAccessor accessor = LegacyBdaAccessor::create(pushConstants.deviceBufferAddress); - - morton::code foo = morton::code::create(vector(-32768, -1)); - - //accessor.set(0, foo.value); - */ - uint32_t bar = _static_cast(0xCAFEDEADDEADBEEF); - accessor.set(0, bar); -} \ No newline at end of file diff --git a/12_Mortons/main.cpp b/12_Mortons/main.cpp index d1fddba7a..8118ec939 100644 --- 
a/12_Mortons/main.cpp +++ b/12_Mortons/main.cpp @@ -45,7 +45,17 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication, return false; if (!asset_base_t::onAppInitialized(std::move(system))) return false; - + { + using namespace nbl::hlsl; + + auto bar = morton::code::create(hlsl::vector(893728, 7843, 98032)); + auto foo = _static_cast>(bar); + std::cout << foo[0] << " " << foo[1] << " " << foo[2] << " " << std::endl; + + //auto bar = morton::code::create(hlsl::vector(893728, 7843, 98032)); + //std::cout << "High Encoded: " << std::bitset<32>(bar.value.data.x) << std::endl; + //std::cout << "Low Encoded: " << std::bitset<32>(bar.value.data.y) << std::endl; + } /* // ----------------------------------------------- CPP TESTS ---------------------------------------------------------------------- @@ -193,188 +203,8 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication, // Unsigned assert(unsigned_morton_t(unsigned_vector_t(239, 435, 66)) >= unsigned_morton_t(unsigned_vector_t(240, 435, 50)) == bool_vector_t(false, true, true)); - - if(!TestHLSL) - return true; - */ - - - - - - - // ----------------------------------------------- HLSL COMPILATION + OPTIONAL TESTS ---------------------------------------------- - auto shader = createShader("app_resources/shader.hlsl"); - - // Create massive upload/download buffers - constexpr uint32_t DownstreamBufferSize = sizeof(uint32_t) << 23; - - m_utils = make_smart_refctd_ptr(smart_refctd_ptr(m_device), smart_refctd_ptr(m_logger), DownstreamBufferSize); - if (!m_utils) - return logFail("Failed to create Utilities!"); - m_downStreamingBuffer = m_utils->getDefaultDownStreamingBuffer(); - m_downStreamingBufferAddress = m_downStreamingBuffer->getBuffer()->getDeviceAddress(); - - // Create device-local buffer - { - IGPUBuffer::SCreationParams deviceLocalBufferParams = {}; - - IQueue* const queue = getComputeQueue(); - uint32_t queueFamilyIndex = queue->getFamilyIndex(); - - 
deviceLocalBufferParams.queueFamilyIndexCount = 1; - deviceLocalBufferParams.queueFamilyIndices = &queueFamilyIndex; - deviceLocalBufferParams.size = sizeof(uint32_t) * bufferSize; - deviceLocalBufferParams.usage = nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_TRANSFER_SRC_BIT | nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT | nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT; - - m_deviceLocalBuffer = m_device->createBuffer(std::move(deviceLocalBufferParams)); - auto mreqs = m_deviceLocalBuffer->getMemoryReqs(); - mreqs.memoryTypeBits &= m_device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); - auto gpubufMem = m_device->allocate(mreqs, m_deviceLocalBuffer.get(), IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_DEVICE_ADDRESS_BIT); - - m_deviceLocalBufferAddress = m_deviceLocalBuffer.get()->getDeviceAddress(); - } - - const nbl::asset::SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,.offset = 0,.size = sizeof(PushConstantData) }; - - { - auto layout = m_device->createPipelineLayout({ &pcRange,1 }); - IGPUComputePipeline::SCreationParams params = {}; - params.layout = layout.get(); - params.shader.shader = shader.get(); - params.shader.requiredSubgroupSize = static_cast(hlsl::findMSB(m_physicalDevice->getLimits().maxSubgroupSize)); - params.shader.requireFullSubgroups = true; - if (!m_device->createComputePipelines(nullptr, { ¶ms,1 }, &m_pipeline)) - return logFail("Failed to create compute pipeline!\n"); - } - - const auto& deviceLimits = m_device->getPhysicalDevice()->getLimits(); - // The ranges of non-coherent mapped memory you flush or invalidate need to be aligned. You'll often see a value of 64 reported by devices - // which just happens to coincide with a CPU cache line size. So we ask our streaming buffers during allocation to give us properly aligned offsets. 
- // Sidenote: For SSBOs, UBOs, BufferViews, Vertex Buffer Bindings, Acceleration Structure BDAs, Shader Binding Tables, Descriptor Buffers, etc. - // there is also a requirement to bind buffers at offsets which have a certain alignment. Memory binding to Buffers and Images also has those. - // We'll align to max of coherent atom size even if the memory is coherent, - // and we also need to take into account BDA shader loads need to be aligned to the type being loaded. - m_alignment = core::max(deviceLimits.nonCoherentAtomSize, alignof(float)); - - // Semaphor used here to know the FFT is done before download - m_timeline = m_device->createSemaphore(semaphorValue); - - IQueue* const queue = getComputeQueue(); - - const uint32_t inputSize = sizeof(uint32_t) * bufferSize; - - // Just need a single suballocation in this example - const uint32_t AllocationCount = 1; - - // We always just wait till an allocation becomes possible (during allocation previous "latched" frees get their latch conditions polled) - // Freeing of Streaming Buffer Allocations can and should be deferred until an associated polled event signals done (more on that later). 
- std::chrono::steady_clock::time_point waitTill(std::chrono::years(45)); - - // finally allocate our output range - const uint32_t outputSize = inputSize; - - auto outputOffset = m_downStreamingBuffer->invalid_value; - m_downStreamingBuffer->multi_allocate(waitTill, AllocationCount, &outputOffset, &outputSize, &m_alignment); - - smart_refctd_ptr cmdbuf; - { - smart_refctd_ptr cmdpool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT); - if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdbuf)) { - return logFail("Failed to create Command Buffers!\n"); - } - cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { &cmdbuf,1 }, core::smart_refctd_ptr(m_logger)); - cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - cmdbuf->bindComputePipeline(m_pipeline.get()); - // This is the new fun part, pushing constants - const PushConstantData pc = { .deviceBufferAddress = m_deviceLocalBufferAddress }; - cmdbuf->pushConstants(m_pipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc); - // Remember we do a single workgroup per 1D array in these parts - cmdbuf->dispatch(1, 1, 1); - - // Pipeline barrier: wait for FFT shader to be done before copying to downstream buffer - IGPUCommandBuffer::SPipelineBarrierDependencyInfo pipelineBarrierInfo = {}; - - decltype(pipelineBarrierInfo)::buffer_barrier_t barrier = {}; - pipelineBarrierInfo.bufBarriers = { &barrier, 1u }; - - barrier.range.buffer = m_deviceLocalBuffer; - - barrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; - barrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::MEMORY_WRITE_BITS; - barrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT; - barrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS; - - cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS(0), pipelineBarrierInfo); - - IGPUCommandBuffer::SBufferCopy copyInfo = {}; - copyInfo.srcOffset = 
0; - copyInfo.dstOffset = 0; - copyInfo.size = m_deviceLocalBuffer->getSize(); - cmdbuf->copyBuffer(m_deviceLocalBuffer.get(), m_downStreamingBuffer->getBuffer(), 1, ©Info); - cmdbuf->end(); - } - - semaphorValue++; - { - const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufInfo = - { - .cmdbuf = cmdbuf.get() - }; - const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = - { - .semaphore = m_timeline.get(), - .value = semaphorValue, - .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT - }; - - const IQueue::SSubmitInfo submitInfo = { - .waitSemaphores = {}, - .commandBuffers = {&cmdbufInfo,1}, - .signalSemaphores = {&signalInfo,1} - }; - - m_api->startCapture(); - queue->submit({ &submitInfo,1 }); - m_api->endCapture(); - } - - // We let all latches know what semaphore and counter value has to be passed for the functors to execute - const ISemaphore::SWaitInfo futureWait = { m_timeline.get(),semaphorValue }; - - // Now a new and even more advanced usage of the latched events, we make our own refcounted object with a custom destructor and latch that like we did the commandbuffer. - // Instead of making our own and duplicating logic, we'll use one from IUtilities meant for down-staging memory. - // Its nice because it will also remember to invalidate our memory mapping if its not coherent. - auto latchedConsumer = make_smart_refctd_ptr( - IDeviceMemoryAllocation::MemoryRange(outputOffset, outputSize), - // Note the use of capture by-value [=] and not by-reference [&] because this lambda will be called asynchronously whenever the event signals - [=](const size_t dstOffset, const void* bufSrc, const size_t size)->void - { - // The unused variable is used for letting the consumer know the subsection of the output we've managed to download - // But here we're sure we can get the whole thing in one go because we allocated the whole range ourselves. 
- assert(dstOffset == 0 && size == outputSize); - - std::cout << "Begin array GPU\n"; - uint32_t* const data = reinterpret_cast(const_cast(bufSrc)); - //std::cout << std::bitset<32>(data[0]) << "\n"; - std::cout << data[0] << "\n"; - /* - for (auto i = 0u; i < bufferSize; i++) { - std::cout << std::bitset<32>(data[i]) << "\n"; - } - */ - std::cout << "\nEnd array GPU\n"; - }, - // Its also necessary to hold onto the commandbuffer, even though we take care to not reset the parent pool, because if it - // hits its destructor, our automated reference counting will drop all references to objects used in the recorded commands. - // It could also be latched in the upstreaming deallocate, because its the same fence. - std::move(cmdbuf), m_downStreamingBuffer - ); - // We put a function we want to execute - m_downStreamingBuffer->multi_deallocate(AllocationCount, &outputOffset, &outputSize, futureWait, &latchedConsumer.get()); - return true; } @@ -387,12 +217,6 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication, // Cleanup bool onAppTerminated() override { - // Need to make sure that there are no events outstanding if we want all lambdas to eventually execute before `onAppTerminated` - // (the destructors of the Command Pool Cache and Streaming buffers will still wait for all lambda events to drain) - if (TestHLSL) - { - while (m_downStreamingBuffer->cull_frees()) {} - } return device_base_t::onAppTerminated(); } diff --git a/22_CppCompat/CIntrinsicsTester.h b/22_CppCompat/CIntrinsicsTester.h index 5fe7bc08e..09219a9e7 100644 --- a/22_CppCompat/CIntrinsicsTester.h +++ b/22_CppCompat/CIntrinsicsTester.h @@ -147,6 +147,9 @@ class CIntrinsicsTester final : public ITester expected.step = glm::step(testInput.stepEdge, testInput.stepX); expected.smoothStep = glm::smoothstep(testInput.smoothStepEdge0, testInput.smoothStepEdge1, testInput.smoothStepX); + expected.addCarry.result = glm::uaddCarry(testInput.addCarryA, testInput.addCarryB, 
expected.addCarry.carry); + expected.subBorrow.result = glm::usubBorrow(testInput.subBorrowA, testInput.subBorrowB, expected.subBorrow.borrow); + expected.frac = testInput.frac - std::floor(testInput.frac); expected.bitReverse = glm::bitfieldReverse(testInput.bitReverse); @@ -189,6 +192,9 @@ class CIntrinsicsTester final : public ITester expected.reflect = glm::reflect(testInput.reflectI, testInput.reflectN); expected.refract = glm::refract(testInput.refractI, testInput.refractN, testInput.refractEta); + expected.addCarryVec.result = glm::uaddCarry(testInput.addCarryAVec, testInput.addCarryBVec, expected.addCarryVec.carry); + expected.subBorrowVec.result = glm::usubBorrow(testInput.subBorrowAVec, testInput.subBorrowBVec, expected.subBorrowVec.borrow); + auto mulGlm = nbl::hlsl::mul(testInput.mulLhs, testInput.mulRhs); expected.mul = reinterpret_cast(mulGlm); auto transposeGlm = glm::transpose(reinterpret_cast(testInput.transpose)); @@ -196,11 +202,6 @@ class CIntrinsicsTester final : public ITester auto inverseGlm = glm::inverse(reinterpret_cast(testInput.inverse)); expected.inverse = reinterpret_cast(inverseGlm); - expected.addCarry.result = glm::uaddCarry(testInput.addCarryA, testInput.addCarryB, expected.addCarry.carry); - expected.subBorrow.result = glm::usubBorrow(testInput.subBorrowA, testInput.subBorrowB, expected.subBorrow.borrow); - expected.addCarryVec.result = glm::uaddCarry(testInput.addCarryAVec, testInput.addCarryBVec, expected.addCarryVec.carry); - expected.subBorrowVec.result = glm::usubBorrow(testInput.subBorrowAVec, testInput.subBorrowBVec, expected.subBorrowVec.borrow); - performCpuTests(testInput, expected); performGpuTests(testInput, expected); } @@ -213,6 +214,7 @@ class CIntrinsicsTester final : public ITester void performCpuTests(const IntrinsicsIntputTestValues& commonTestInputValues, const IntrinsicsTestValues& expectedTestValues) { IntrinsicsTestValues cpuTestValues; + cpuTestValues.fillTestValues(commonTestInputValues); 
verifyTestValues(expectedTestValues, cpuTestValues, ITester::TestType::CPU); @@ -245,6 +247,11 @@ class CIntrinsicsTester final : public ITester verifyTestValue("degrees", expectedTestValues.degrees, testValues.degrees, testType); verifyTestValue("step", expectedTestValues.step, testValues.step, testType); verifyTestValue("smoothStep", expectedTestValues.smoothStep, testValues.smoothStep, testType); + verifyTestValue("addCarryResult", expectedTestValues.addCarry.result, testValues.addCarry.result, testType); + verifyTestValue("addCarryCarry", expectedTestValues.addCarry.carry, testValues.addCarry.carry, testType); + // Disabled: current glm implementation is wrong + //verifyTestValue("subBorrowResult", expectedTestValues.subBorrow.result, testValues.subBorrow.result, testType); + //verifyTestValue("subBorrowBorrow", expectedTestValues.subBorrow.borrow, testValues.subBorrow.borrow, testType); verifyTestVector3dValue("normalize", expectedTestValues.normalize, testValues.normalize, testType); verifyTestVector3dValue("cross", expectedTestValues.cross, testValues.cross, testType); @@ -267,6 +274,11 @@ class CIntrinsicsTester final : public ITester verifyTestVector3dValue("faceForward", expectedTestValues.faceForward, testValues.faceForward, testType); verifyTestVector3dValue("reflect", expectedTestValues.reflect, testValues.reflect, testType); verifyTestVector3dValue("refract", expectedTestValues.refract, testValues.refract, testType); + verifyTestVector3dValue("addCarryVecResult", expectedTestValues.addCarryVec.result, testValues.addCarryVec.result, testType); + verifyTestVector3dValue("addCarryVecCarry", expectedTestValues.addCarryVec.carry, testValues.addCarryVec.carry, testType); + // Disabled: current glm implementation is wrong + //verifyTestVector3dValue("subBorrowVecResult", expectedTestValues.subBorrowVec.result, testValues.subBorrowVec.result, testType); + //verifyTestVector3dValue("subBorrowVecBorrow", expectedTestValues.subBorrowVec.borrow, 
testValues.subBorrowVec.borrow, testType); verifyTestMatrix3x3Value("mul", expectedTestValues.mul, testValues.mul, testType); verifyTestMatrix3x3Value("transpose", expectedTestValues.transpose, testValues.transpose, testType); From b2d87c36ad63c27b8547ea6583aa4c1ce716690d Mon Sep 17 00:00:00 2001 From: Fletterio Date: Thu, 24 Apr 2025 16:06:16 -0300 Subject: [PATCH 7/9] Added extensive tests for Morton codes --- 12_Mortons/Tester.h | 135 +++--- 12_Mortons/app_resources/common.hlsl | 453 +++++++++++++++++- 12_Mortons/app_resources/mortonTest.comp.hlsl | 16 + 12_Mortons/main.cpp | 298 +++--------- 22_CppCompat/ITester.h | 1 + 5 files changed, 604 insertions(+), 299 deletions(-) create mode 100644 12_Mortons/app_resources/mortonTest.comp.hlsl diff --git a/12_Mortons/Tester.h b/12_Mortons/Tester.h index 5c4773111..480328d18 100644 --- a/12_Mortons/Tester.h +++ b/12_Mortons/Tester.h @@ -1,5 +1,5 @@ -#ifndef _NBL_EXAMPLES_TESTS_12_MORTONS_I_TESTER_INCLUDED_ -#define _NBL_EXAMPLES_TESTS_12_MORTONS_I_TESTER_INCLUDED_ +#ifndef _NBL_EXAMPLES_TESTS_12_MORTONS_TESTER_INCLUDED_ +#define _NBL_EXAMPLES_TESTS_12_MORTONS_TESTER_INCLUDED_ #include #include "app_resources/common.hlsl" @@ -128,7 +128,7 @@ class Tester if (!inputBuff) logFail("Failed to create a GPU Buffer of size %d!\n", params.size); - inputBuff->setObjectDebugName("emulated_float64_t output buffer"); + inputBuff->setObjectDebugName("morton input buffer"); video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = inputBuff->getMemoryReqs(); reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits(); @@ -163,7 +163,7 @@ class Tester if (!outputBuff) logFail("Failed to create a GPU Buffer of size %d!\n", params.size); - outputBuff->setObjectDebugName("emulated_float64_t output buffer"); + outputBuff->setObjectDebugName("morton output buffer"); video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = outputBuff->getMemoryReqs(); reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits(); 
@@ -208,8 +208,7 @@ class Tester template void verifyTestValue(const std::string& memberName, const T& expectedVal, const T& testVal, const TestType testType) { - static constexpr float MaxAllowedError = 0.1f; - if (std::abs(double(expectedVal) - double(testVal)) <= MaxAllowedError) + if (expectedVal == testVal) return; std::stringstream ss; @@ -221,7 +220,7 @@ class Tester ss << "GPU TEST ERROR:\n"; } - ss << "nbl::hlsl::" << memberName << " produced incorrect output! test value: " << testVal << " expected value: " << expectedVal << '\n'; + ss << "nbl::hlsl::" << memberName << " produced incorrect output!" << '\n'; //test value: " << testVal << " expected value: " << expectedVal << '\n'; m_logger->log(ss.str().c_str(), system::ILogger::ELL_ERROR); } @@ -240,6 +239,7 @@ class Tester { case TestType::CPU: ss << "CPU TEST ERROR:\n"; + break; case TestType::GPU: ss << "GPU TEST ERROR:\n"; } @@ -251,32 +251,60 @@ class Tester m_logger->log(ss.str().c_str(), system::ILogger::ELL_ERROR); } - template - void verifyTestMatrix3x3Value(const std::string& memberName, const nbl::hlsl::matrix& expectedVal, const nbl::hlsl::matrix& testVal, const TestType testType) - { - for (int i = 0; i < 3; ++i) - { - auto expectedValRow = expectedVal[i]; - auto testValRow = testVal[i]; - verifyTestVector3dValue(memberName, expectedValRow, testValRow, testType); - } - } - void performTests() { - m_logger->log("intrinsics.hlsl TESTS:", system::ILogger::ELL_PERFORMANCE); + std::random_device rd; + std::mt19937 mt(rd()); + + std::uniform_int_distribution shortDistribution(uint16_t(0), std::numeric_limits::max()); + std::uniform_int_distribution intDistribution(uint32_t(0), std::numeric_limits::max()); + std::uniform_int_distribution longDistribution(uint64_t(0), std::numeric_limits::max()); + + m_logger->log("TESTS:", system::ILogger::ELL_PERFORMANCE); for (int i = 0; i < Iterations; ++i) { // Set input thest values that will be used in both CPU and GPU tests InputTestValues testInput; - // use 
std library or glm functions to determine expected test values, the output of functions from intrinsics.hlsl will be verified against these values TestValues expected; + uint32_t generatedShift = intDistribution(mt) & uint32_t(63); + testInput.shift = generatedShift; + { + uint64_t generatedA = longDistribution(mt); + uint64_t generatedB = longDistribution(mt); + + testInput.generatedA = generatedA; + testInput.generatedB = generatedB; + + expected.emulatedAnd = _static_cast(generatedA & generatedB); + expected.emulatedOr = _static_cast(generatedA | generatedB); + expected.emulatedXor = _static_cast(generatedA ^ generatedB); + expected.emulatedNot = _static_cast(~generatedA); + expected.emulatedPlus = _static_cast(generatedA + generatedB); + expected.emulatedMinus = _static_cast(generatedA - generatedB); + expected.emulatedLess = uint32_t(generatedA < generatedB); + expected.emulatedLessEqual = uint32_t(generatedA <= generatedB); + expected.emulatedGreater = uint32_t(generatedA > generatedB); + expected.emulatedGreaterEqual = uint32_t(generatedA >= generatedB); + + expected.emulatedLeftShifted = _static_cast(generatedA << generatedShift); + expected.emulatedUnsignedRightShifted = _static_cast(generatedA >> generatedShift); + expected.emulatedSignedRightShifted = _static_cast(static_cast(generatedA) >> generatedShift); + } + { + uint64_t coordX = longDistribution(mt); + uint64_t coordY = longDistribution(mt); + uint64_t coordZ = longDistribution(mt); + uint64_t coordW = longDistribution(mt); + + + } + performCpuTests(testInput, expected); performGpuTests(testInput, expected); } - m_logger->log("intrinsics.hlsl TESTS DONE.", system::ILogger::ELL_PERFORMANCE); + m_logger->log("TESTS DONE.", system::ILogger::ELL_PERFORMANCE); } protected: @@ -354,7 +382,7 @@ class Tester { TestValues cpuTestValues; cpuTestValues.fillTestValues(commonTestInputValues); - verifyTestValues(expectedTestValues, cpuTestValues, ITester::TestType::CPU); + verifyTestValues(expectedTestValues, 
cpuTestValues, TestType::CPU); } @@ -362,55 +390,26 @@ class Tester { TestValues gpuTestValues; gpuTestValues = dispatch(commonTestInputValues); - verifyTestValues(expectedTestValues, gpuTestValues, ITester::TestType::GPU); + verifyTestValues(expectedTestValues, gpuTestValues, TestType::GPU); } - void verifyTestValues(const TestValues& expectedTestValues, const TestValues& testValues, ITester::TestType testType) + void verifyTestValues(const TestValues& expectedTestValues, const TestValues& testValues, TestType testType) { - verifyTestValue("bitCount", expectedTestValues.bitCount, testValues.bitCount, testType); - verifyTestValue("clamp", expectedTestValues.clamp, testValues.clamp, testType); - verifyTestValue("length", expectedTestValues.length, testValues.length, testType); - verifyTestValue("dot", expectedTestValues.dot, testValues.dot, testType); - verifyTestValue("determinant", expectedTestValues.determinant, testValues.determinant, testType); - verifyTestValue("findMSB", expectedTestValues.findMSB, testValues.findMSB, testType); - verifyTestValue("findLSB", expectedTestValues.findLSB, testValues.findLSB, testType); - verifyTestValue("min", expectedTestValues.min, testValues.min, testType); - verifyTestValue("max", expectedTestValues.max, testValues.max, testType); - verifyTestValue("rsqrt", expectedTestValues.rsqrt, testValues.rsqrt, testType); - verifyTestValue("frac", expectedTestValues.frac, testValues.frac, testType); - verifyTestValue("bitReverse", expectedTestValues.bitReverse, testValues.bitReverse, testType); - verifyTestValue("mix", expectedTestValues.mix, testValues.mix, testType); - verifyTestValue("sign", expectedTestValues.sign, testValues.sign, testType); - verifyTestValue("radians", expectedTestValues.radians, testValues.radians, testType); - verifyTestValue("degrees", expectedTestValues.degrees, testValues.degrees, testType); - verifyTestValue("step", expectedTestValues.step, testValues.step, testType); - verifyTestValue("smoothStep", 
expectedTestValues.smoothStep, testValues.smoothStep, testType); - - verifyTestVector3dValue("normalize", expectedTestValues.normalize, testValues.normalize, testType); - verifyTestVector3dValue("cross", expectedTestValues.cross, testValues.cross, testType); - verifyTestVector3dValue("bitCountVec", expectedTestValues.bitCountVec, testValues.bitCountVec, testType); - verifyTestVector3dValue("clampVec", expectedTestValues.clampVec, testValues.clampVec, testType); - verifyTestVector3dValue("findMSBVec", expectedTestValues.findMSBVec, testValues.findMSBVec, testType); - verifyTestVector3dValue("findLSBVec", expectedTestValues.findLSBVec, testValues.findLSBVec, testType); - verifyTestVector3dValue("minVec", expectedTestValues.minVec, testValues.minVec, testType); - verifyTestVector3dValue("maxVec", expectedTestValues.maxVec, testValues.maxVec, testType); - verifyTestVector3dValue("rsqrtVec", expectedTestValues.rsqrtVec, testValues.rsqrtVec, testType); - verifyTestVector3dValue("bitReverseVec", expectedTestValues.bitReverseVec, testValues.bitReverseVec, testType); - verifyTestVector3dValue("fracVec", expectedTestValues.fracVec, testValues.fracVec, testType); - verifyTestVector3dValue("mixVec", expectedTestValues.mixVec, testValues.mixVec, testType); - - verifyTestVector3dValue("signVec", expectedTestValues.signVec, testValues.signVec, testType); - verifyTestVector3dValue("radiansVec", expectedTestValues.radiansVec, testValues.radiansVec, testType); - verifyTestVector3dValue("degreesVec", expectedTestValues.degreesVec, testValues.degreesVec, testType); - verifyTestVector3dValue("stepVec", expectedTestValues.stepVec, testValues.stepVec, testType); - verifyTestVector3dValue("smoothStepVec", expectedTestValues.smoothStepVec, testValues.smoothStepVec, testType); - verifyTestVector3dValue("faceForward", expectedTestValues.faceForward, testValues.faceForward, testType); - verifyTestVector3dValue("reflect", expectedTestValues.reflect, testValues.reflect, testType); - 
verifyTestVector3dValue("refract", expectedTestValues.refract, testValues.refract, testType); - - verifyTestMatrix3x3Value("mul", expectedTestValues.mul, testValues.mul, testType); - verifyTestMatrix3x3Value("transpose", expectedTestValues.transpose, testValues.transpose, testType); - verifyTestMatrix3x3Value("inverse", expectedTestValues.inverse, testValues.inverse, testType); + verifyTestValue("emulatedAnd", expectedTestValues.emulatedAnd, testValues.emulatedAnd, testType); + verifyTestValue("emulatedOr", expectedTestValues.emulatedOr, testValues.emulatedOr, testType); + verifyTestValue("emulatedXor", expectedTestValues.emulatedXor, testValues.emulatedXor, testType); + verifyTestValue("emulatedNot", expectedTestValues.emulatedNot, testValues.emulatedNot, testType); + verifyTestValue("emulatedPlus", expectedTestValues.emulatedPlus, testValues.emulatedPlus, testType); + verifyTestValue("emulatedMinus", expectedTestValues.emulatedMinus, testValues.emulatedMinus, testType); + verifyTestValue("emulatedLess", expectedTestValues.emulatedLess, testValues.emulatedLess, testType); + verifyTestValue("emulatedLessEqual", expectedTestValues.emulatedLessEqual, testValues.emulatedLessEqual, testType); + verifyTestValue("emulatedGreater", expectedTestValues.emulatedGreater, testValues.emulatedGreater, testType); + verifyTestValue("emulatedGreaterEqual", expectedTestValues.emulatedGreaterEqual, testValues.emulatedGreaterEqual, testType); + verifyTestValue("emulatedLeftShifted", expectedTestValues.emulatedLeftShifted, testValues.emulatedLeftShifted, testType); + verifyTestValue("emulatedUnsignedRightShifted", expectedTestValues.emulatedUnsignedRightShifted, testValues.emulatedUnsignedRightShifted, testType); + verifyTestValue("emulatedSignedRightShifted", expectedTestValues.emulatedSignedRightShifted, testValues.emulatedSignedRightShifted, testType); + + //verifyTestVector3dValue("normalize", expectedTestValues.normalize, testValues.normalize, testType); } }; diff --git 
a/12_Mortons/app_resources/common.hlsl b/12_Mortons/app_resources/common.hlsl index 9632bd372..be6a2f4a0 100644 --- a/12_Mortons/app_resources/common.hlsl +++ b/12_Mortons/app_resources/common.hlsl @@ -13,20 +13,471 @@ #include -// tgmath.hlsl and intrinsics.hlsl tests +NBL_CONSTEXPR uint16_t smallBits_2 = 8; +NBL_CONSTEXPR uint16_t mediumBits_2 = 16; +NBL_CONSTEXPR uint16_t fullBits_2 = 32; +NBL_CONSTEXPR uint16_t smallBits_3 = 5; +NBL_CONSTEXPR uint16_t mediumBits_3 = 10; +NBL_CONSTEXPR uint16_t fullBits_3 = 21; +NBL_CONSTEXPR uint16_t smallBits_4 = 4; +NBL_CONSTEXPR uint16_t mediumBits_4 = 8; +NBL_CONSTEXPR uint16_t fullBits_4 = 16; using namespace nbl::hlsl; struct InputTestValues { + // Both tests + uint32_t shift; + + // Emulated int tests + uint64_t generatedA; + uint64_t generatedB; + // Morton tests + uint64_t coordX; + uint64_t coordY; + uint64_t coordZ; + uint64_t coordW; }; struct TestValues { + // Emulated int tests + emulated_uint64_t emulatedAnd; + emulated_uint64_t emulatedOr; + emulated_uint64_t emulatedXor; + emulated_uint64_t emulatedNot; + emulated_uint64_t emulatedPlus; + emulated_uint64_t emulatedMinus; + // These are bools but stored as uint because you can't store bools, causes a SPIR-V issue + uint32_t emulatedLess; + uint32_t emulatedLessEqual; + uint32_t emulatedGreater; + uint32_t emulatedGreaterEqual; + emulated_uint64_t emulatedLeftShifted; + emulated_uint64_t emulatedUnsignedRightShifted; + emulated_int64_t emulatedSignedRightShifted; + + // Morton tests - for each dimension let's do one small, medium and full-szied (max bits possible) test to cover representation with + // 16, 32 and 64-bit types. 
Could make it more exhaustive with macros (test all possible bitwidths) + // For emulated mortons, we store only the emulated uint64 representing it, because DXC complains about bitcasts otherwise + + // Plus + morton::code mortonPlus_small_2; + morton::code mortonPlus_medium_2; + morton::code mortonPlus_full_2; + morton::code mortonPlus_emulated_2; + + morton::code mortonPlus_small_3; + morton::code mortonPlus_medium_3; + morton::code mortonPlus_full_3; + morton::code mortonPlus_emulated_3; + + morton::code mortonPlus_small_4; + morton::code mortonPlus_medium_4; + morton::code mortonPlus_full_4; + morton::code mortonPlus_emulated_4; + + // Minus + morton::code mortonMinus_small_2; + morton::code mortonMinus_medium_2; + morton::code mortonMinus_full_2; + morton::code mortonMinus_emulated_2; + + morton::code mortonMinus_small_3; + morton::code mortonMinus_medium_3; + morton::code mortonMinus_full_3; + morton::code mortonMinus_emulated_3; + + morton::code mortonMinus_small_4; + morton::code mortonMinus_medium_4; + morton::code mortonMinus_full_4; + morton::code mortonMinus_emulated_4; + + // Coordinate-wise equality (these are bools) + uint32_t2 mortonEqual_small_2; + uint32_t2 mortonEqual_medium_2; + uint32_t2 mortonEqual_full_2; + uint32_t2 mortonEqual_emulated_2; + + uint32_t3 mortonEqual_small_3; + uint32_t3 mortonEqual_medium_3; + uint32_t3 mortonEqual_full_3; + uint32_t3 mortonEqual_emulated_3; + + uint32_t4 mortonEqual_small_4; + uint32_t4 mortonEqual_medium_4; + uint32_t4 mortonEqual_full_4; + uint32_t4 mortonEqual_emulated_4; + + // Coordinate-wise unsigned inequality (just testing with less, again these are bools) + uint32_t2 mortonUnsignedLess_small_2; + uint32_t2 mortonUnsignedLess_medium_2; + uint32_t2 mortonUnsignedLess_full_2; + uint32_t2 mortonUnsignedLess_emulated_2; + + uint32_t3 mortonUnsignedLess_small_3; + uint32_t3 mortonUnsignedLess_medium_3; + uint32_t3 mortonUnsignedLess_full_3; + uint32_t3 mortonUnsignedLess_emulated_3; + + uint32_t4 
mortonUnsignedLess_small_4; + uint32_t4 mortonUnsignedLess_medium_4; + uint32_t4 mortonUnsignedLess_full_4; + uint32_t4 mortonUnsignedLess_emulated_4; + + // Coordinate-wise signed inequality (bools) + uint32_t2 mortonSignedLess_small_2; + uint32_t2 mortonSignedLess_medium_2; + uint32_t2 mortonSignedLess_full_2; + uint32_t2 mortonSignedLess_emulated_2; + + uint32_t3 mortonSignedLess_small_3; + uint32_t3 mortonSignedLess_medium_3; + uint32_t3 mortonSignedLess_full_3; + uint32_t3 mortonSignedLess_emulated_3; + + uint32_t4 mortonSignedLess_small_4; + uint32_t4 mortonSignedLess_medium_4; + uint32_t4 mortonSignedLess_full_4; + uint32_t4 mortonSignedLess_emulated_4; + + // Left-shift + morton::code mortonLeftShift_small_2; + morton::code mortonLeftShift_medium_2; + morton::code mortonLeftShift_full_2; + morton::code mortonLeftShift_emulated_2; + + morton::code mortonLeftShift_small_3; + morton::code mortonLeftShift_medium_3; + morton::code mortonLeftShift_full_3; + morton::code mortonLeftShift_emulated_3; + + morton::code mortonLeftShift_small_4; + morton::code mortonLeftShift_medium_4; + morton::code mortonLeftShift_full_4; + morton::code mortonLeftShift_emulated_4; + + // Unsigned right-shift + morton::code mortonUnsignedRightShift_small_2; + morton::code mortonUnsignedRightShift_medium_2; + morton::code mortonUnsignedRightShift_full_2; + morton::code mortonUnsignedRightShift_emulated_2; + + morton::code mortonUnsignedRightShift_small_3; + morton::code mortonUnsignedRightShift_medium_3; + morton::code mortonUnsignedRightShift_full_3; + morton::code mortonUnsignedRightShift_emulated_3; + + morton::code mortonUnsignedRightShift_small_4; + morton::code mortonUnsignedRightShift_medium_4; + morton::code mortonUnsignedRightShift_full_4; + morton::code mortonUnsignedRightShift_emulated_4; + + // Signed right-shift + morton::code mortonSignedRightShift_small_2; + morton::code mortonSignedRightShift_medium_2; + morton::code mortonSignedRightShift_full_2; + morton::code 
mortonSignedRightShift_emulated_2; + + morton::code mortonSignedRightShift_small_3; + morton::code mortonSignedRightShift_medium_3; + morton::code mortonSignedRightShift_full_3; + morton::code mortonSignedRightShift_emulated_3; + + morton::code mortonSignedRightShift_small_4; + morton::code mortonSignedRightShift_medium_4; + morton::code mortonSignedRightShift_full_4; + morton::code mortonSignedRightShift_emulated_4; void fillTestValues(NBL_CONST_REF_ARG(InputTestValues) input) { + emulated_uint64_t emulatedA = _static_cast(input.generatedA); + emulated_uint64_t emulatedB = _static_cast(input.generatedB); + + // Emulated int tests + emulatedAnd = emulatedA & emulatedB; + emulatedOr = emulatedA | emulatedB; + emulatedXor = emulatedA ^ emulatedB; + emulatedNot = emulatedA.operator~(); + emulatedPlus = emulatedA + emulatedB; + emulatedMinus = emulatedA - emulatedB; + emulatedLess = uint32_t(emulatedA < emulatedB); + emulatedLessEqual = uint32_t(emulatedA <= emulatedB); + emulatedGreater = uint32_t(emulatedA > emulatedB); + emulatedGreaterEqual = uint32_t(emulatedA >= emulatedB); + + left_shift_operator leftShift; + emulatedLeftShifted = leftShift(emulatedA, input.shift); + + arithmetic_right_shift_operator unsignedRightShift; + emulatedUnsignedRightShifted = unsignedRightShift(emulatedA, input.shift); + + arithmetic_right_shift_operator signedRightShift; + emulatedSignedRightShifted = signedRightShift(_static_cast(emulatedA), input.shift); + + // Morton tests + uint64_t2 Vec2A = { input.coordX, input.coordY }; + uint64_t2 Vec2B = { input.coordZ, input.coordW }; + + uint64_t3 Vec3A = { input.coordX, input.coordY, input.coordZ }; + uint64_t3 Vec3B = { input.coordY, input.coordZ, input.coordW }; + + uint64_t4 Vec4A = { input.coordX, input.coordY, input.coordZ, input.coordW }; + uint64_t4 Vec4B = { input.coordY, input.coordZ, input.coordW, input.coordX }; + + int64_t2 Vec2ASigned = int64_t2(Vec2A); + int64_t2 Vec2BSigned = int64_t2(Vec2B); + + int64_t3 Vec3ASigned = 
int64_t3(Vec3A); + int64_t3 Vec3BSigned = int64_t3(Vec3B); + + int64_t4 Vec4ASigned = int64_t4(Vec4A); + int64_t4 Vec4BSigned = int64_t4(Vec4B); + + morton::code morton_small_2A = morton::code::create(Vec2A); + morton::code morton_medium_2A = morton::code::create(Vec2A); + morton::code morton_full_2A = morton::code::create(Vec2A); + morton::code morton_emulated_2A = morton::code::create(Vec2A); + morton::code morton_small_2B = morton::code::create(Vec2B); + morton::code morton_medium_2B = morton::code::create(Vec2B); + morton::code morton_full_2B = morton::code::create(Vec2B); + morton::code morton_emulated_2B = morton::code::create(Vec2B); + + morton::code morton_small_3A = morton::code::create(Vec3A); + morton::code morton_medium_3A = morton::code::create(Vec3A); + morton::code morton_full_3A = morton::code::create(Vec3A); + morton::code morton_emulated_3A = morton::code::create(Vec3A); + morton::code morton_small_3B = morton::code::create(Vec3B); + morton::code morton_medium_3B = morton::code::create(Vec3B); + morton::code morton_full_3B = morton::code::create(Vec3B); + morton::code morton_emulated_3B = morton::code::create(Vec3B); + + morton::code morton_small_4A = morton::code::create(Vec4A); + morton::code morton_medium_4A = morton::code::create(Vec4A); + morton::code morton_full_4A = morton::code::create(Vec4A); + morton::code morton_emulated_4A = morton::code::create(Vec4A); + morton::code morton_small_4B = morton::code::create(Vec4B); + morton::code morton_medium_4B = morton::code::create(Vec4B); + morton::code morton_full_4B = morton::code::create(Vec4B); + morton::code morton_emulated_4B = morton::code::create(Vec4B); + + morton::code morton_small_2ASigned = morton::code::create(Vec2ASigned); + morton::code morton_medium_2ASigned = morton::code::create(Vec2ASigned); + morton::code morton_full_2ASigned = morton::code::create(Vec2ASigned); + morton::code morton_emulated_2ASigned = morton::code::create(Vec2ASigned); + morton::code morton_small_2BSigned = 
morton::code::create(Vec2BSigned); + morton::code morton_medium_2BSigned = morton::code::create(Vec2BSigned); + morton::code morton_full_2BSigned = morton::code::create(Vec2BSigned); + morton::code morton_emulated_2BSigned = morton::code::create(Vec2BSigned); + + morton::code morton_small_3ASigned = morton::code::create(Vec3ASigned); + morton::code morton_medium_3ASigned = morton::code::create(Vec3ASigned); + morton::code morton_full_3ASigned = morton::code::create(Vec3ASigned); + morton::code morton_emulated_3ASigned = morton::code::create(Vec3ASigned); + morton::code morton_small_3BSigned = morton::code::create(Vec3BSigned); + morton::code morton_medium_3BSigned = morton::code::create(Vec3BSigned); + morton::code morton_full_3BSigned = morton::code::create(Vec3BSigned); + morton::code morton_emulated_3BSigned = morton::code::create(Vec3BSigned); + + morton::code morton_small_4ASigned = morton::code::create(Vec4ASigned); + morton::code morton_medium_4ASigned = morton::code::create(Vec4ASigned); + morton::code morton_full_4ASigned = morton::code::create(Vec4ASigned); + morton::code morton_emulated_4ASigned = morton::code::create(Vec4ASigned); + morton::code morton_small_4BSigned = morton::code::create(Vec4BSigned); + morton::code morton_medium_4BSigned = morton::code::create(Vec4BSigned); + morton::code morton_full_4BSigned = morton::code::create(Vec4BSigned); + morton::code morton_emulated_4BSigned = morton::code::create(Vec4BSigned); + + /* + left_shift_operator > leftShiftTemp; + portable_vector_t interleaved = _static_cast >(uint16_t4(Vec4B)) & morton::impl::coding_mask_v<4, fullBits_4, morton::impl::CodingStages, emulated_uint64_t>; + + #define ENCODE_LOOP_ITERATION(I) NBL_IF_CONSTEXPR(fullBits_4 > (uint16_t(1) << I))\ + {\ + interleaved = interleaved | leftShiftTemp(interleaved, (uint16_t(1) << I) * (4 - 1));\ + interleaved = interleaved & _static_cast(morton::impl::coding_mask<4, fullBits_4, I>::value);\ + } + + ENCODE_LOOP_ITERATION(4) + 
ENCODE_LOOP_ITERATION(3) + ENCODE_LOOP_ITERATION(2) + ENCODE_LOOP_ITERATION(1) + ENCODE_LOOP_ITERATION(0) + + #undef ENCODE_LOOP_ITERATION + // After interleaving, shift each coordinate left by their index + return leftShiftTemp(interleaved, truncate >(vector(0, 1, 2, 3))); + + + array_get, emulated_uint64_t> getter; + emulatedAnd = getter(interleaved, 0); + */ + + // Plus + mortonPlus_small_2 = morton_small_2A + morton_small_2B; + mortonPlus_medium_2 = morton_medium_2A + morton_medium_2B; + mortonPlus_full_2 = morton_full_2A + morton_full_2B; + mortonPlus_emulated_2 = morton_emulated_2A + morton_emulated_2B; + + mortonPlus_small_3 = morton_small_3A + morton_small_3B; + mortonPlus_medium_3 = morton_medium_3A + morton_medium_3B; + mortonPlus_full_3 = morton_full_3A + morton_full_3B; + mortonPlus_emulated_3 = morton_emulated_3A + morton_emulated_3B; + + mortonPlus_small_4 = morton_small_4A + morton_small_4B; + mortonPlus_medium_4 = morton_medium_4A + morton_medium_4B; + mortonPlus_full_4 = morton_full_4A + morton_full_4B; + mortonPlus_emulated_4 = morton_emulated_4A + morton_emulated_4B; + + // Minus + mortonMinus_small_2 = morton_small_2A - morton_small_2B; + mortonMinus_medium_2 = morton_medium_2A - morton_medium_2B; + mortonMinus_full_2 = morton_full_2A - morton_full_2B; + mortonMinus_emulated_2 = morton_emulated_2A - morton_emulated_2B; + + mortonMinus_small_3 = morton_small_3A - morton_small_3B; + mortonMinus_medium_3 = morton_medium_3A - morton_medium_3B; + mortonMinus_full_3 = morton_full_3A - morton_full_3B; + mortonMinus_emulated_3 = morton_emulated_3A - morton_emulated_3B; + + mortonMinus_small_4 = morton_small_4A - morton_small_4B; + mortonMinus_medium_4 = morton_medium_4A - morton_medium_4B; + mortonMinus_full_4 = morton_full_4A - morton_full_4B; + mortonMinus_emulated_4 = morton_emulated_4A - morton_emulated_4B; + + // Coordinate-wise equality + mortonEqual_small_2 = uint32_t2(morton_small_2A.equal(uint16_t2(Vec2B))); + mortonEqual_medium_2 = 
uint32_t2(morton_medium_2A.equal(uint16_t2(Vec2B))); + mortonEqual_full_2 = uint32_t2(morton_full_2A.equal(uint32_t2(Vec2B))); + mortonEqual_emulated_2 = uint32_t2(morton_emulated_2A.equal(uint32_t2(Vec2B))); + + mortonEqual_small_3 = uint32_t3(morton_small_3A.equal(uint16_t3(Vec3B))); + mortonEqual_medium_3 = uint32_t3(morton_medium_3A.equal(uint16_t3(Vec3B))); + mortonEqual_full_3 = uint32_t3(morton_full_3A.equal(uint32_t3(Vec3B))); + mortonEqual_emulated_3 = uint32_t3(morton_emulated_3A.equal(uint32_t3(Vec3B))); + + mortonEqual_small_4 = uint32_t4(morton_small_4A.equal(uint16_t4(Vec4B))); + mortonEqual_medium_4 = uint32_t4(morton_medium_4A.equal(uint16_t4(Vec4B))); + mortonEqual_full_4 = uint32_t4(morton_full_4A.equal(uint16_t4(Vec4B))); + mortonEqual_emulated_4 = uint32_t4(morton_emulated_4A.equal(uint16_t4(Vec4B))); + + // Coordinate-wise unsigned inequality (just testing with less) + mortonUnsignedLess_small_2 = uint32_t2(morton_small_2A.lessThan(uint16_t2(Vec2B))); + mortonUnsignedLess_medium_2 = uint32_t2(morton_medium_2A.lessThan(uint16_t2(Vec2B))); + mortonUnsignedLess_full_2 = uint32_t2(morton_full_2A.lessThan(uint32_t2(Vec2B))); + mortonUnsignedLess_emulated_2 = uint32_t2(morton_emulated_2A.lessThan(uint32_t2(Vec2B))); + + mortonUnsignedLess_small_3 = uint32_t3(morton_small_3A.lessThan(uint16_t3(Vec3B))); + mortonUnsignedLess_medium_3 = uint32_t3(morton_medium_3A.lessThan(uint16_t3(Vec3B))); + mortonUnsignedLess_full_3 = uint32_t3(morton_full_3A.lessThan(uint32_t3(Vec3B))); + mortonUnsignedLess_emulated_3 = uint32_t3(morton_emulated_3A.lessThan(uint32_t3(Vec3B))); + + mortonUnsignedLess_small_4 = uint32_t4(morton_small_4A.lessThan(uint16_t4(Vec4B))); + mortonUnsignedLess_medium_4 = uint32_t4(morton_medium_4A.lessThan(uint16_t4(Vec4B))); + mortonUnsignedLess_full_4 = uint32_t4(morton_full_4A.lessThan(uint16_t4(Vec4B))); + mortonUnsignedLess_emulated_4 = uint32_t4(morton_emulated_4A.lessThan(uint16_t4(Vec4B))); + + // Coordinate-wise signed inequality + 
mortonSignedLess_small_2 = uint32_t2(morton_small_2ASigned.lessThan(int16_t2(Vec2BSigned))); + mortonSignedLess_medium_2 = uint32_t2(morton_medium_2ASigned.lessThan(int16_t2(Vec2BSigned))); + mortonSignedLess_full_2 = uint32_t2(morton_full_2ASigned.lessThan(int32_t2(Vec2BSigned))); + //mortonSignedLess_emulated_2 = uint32_t2(morton_emulated_2ASigned.lessThan(int32_t2(Vec2BSigned))); + + mortonSignedLess_small_3 = uint32_t3(morton_small_3ASigned.lessThan(int16_t3(Vec3BSigned))); + mortonSignedLess_medium_3 = uint32_t3(morton_medium_3ASigned.lessThan(int16_t3(Vec3BSigned))); + mortonSignedLess_full_3 = uint32_t3(morton_full_3ASigned.lessThan(int32_t3(Vec3BSigned))); + //mortonSignedLess_emulated_3 = uint32_t3(morton_emulated_3ASigned.lessThan(int32_t3(Vec3BSigned))); + + mortonSignedLess_small_4 = uint32_t4(morton_small_4ASigned.lessThan(int16_t4(Vec4BSigned))); + mortonSignedLess_medium_4 = uint32_t4(morton_medium_4ASigned.lessThan(int16_t4(Vec4BSigned))); + mortonSignedLess_full_4 = uint32_t4(morton_full_4ASigned.lessThan(int16_t4(Vec4BSigned))); + //mortonSignedLess_emulated_4 = uint32_t4(morton_emulated_4ASigned.lessThan(int16_t4(Vec4BSigned))); + + // Left-shift + uint16_t castedShift = uint16_t(input.shift); + left_shift_operator > leftShiftSmall2; + mortonLeftShift_small_2 = leftShiftSmall2(morton_small_2A, castedShift); + left_shift_operator > leftShiftMedium2; + mortonLeftShift_medium_2 = leftShiftMedium2(morton_medium_2A, castedShift); + left_shift_operator > leftShiftFull2; + mortonLeftShift_full_2 = leftShiftFull2(morton_full_2A, castedShift); + left_shift_operator > leftShiftEmulated2; + mortonLeftShift_emulated_2 = leftShiftEmulated2(morton_emulated_2A, castedShift); + + left_shift_operator > leftShiftSmall3; + mortonLeftShift_small_3 = leftShiftSmall3(morton_small_3A, castedShift); + left_shift_operator > leftShiftMedium3; + mortonLeftShift_medium_3 = leftShiftMedium3(morton_medium_3A, castedShift); + left_shift_operator > leftShiftFull3; + 
mortonLeftShift_full_3 = leftShiftFull3(morton_full_3A, castedShift); + left_shift_operator > leftShiftEmulated3; + mortonLeftShift_emulated_3 = leftShiftEmulated3(morton_emulated_3A, castedShift); + + left_shift_operator > leftShiftSmall4; + mortonLeftShift_small_4 = leftShiftSmall4(morton_small_4A, castedShift); + left_shift_operator > leftShiftMedium4; + mortonLeftShift_medium_4 = leftShiftMedium4(morton_medium_4A, castedShift); + left_shift_operator > leftShiftFull4; + mortonLeftShift_full_4 = leftShiftFull4(morton_full_4A, castedShift); + left_shift_operator > leftShiftEmulated4; + mortonLeftShift_emulated_4 = leftShiftEmulated4(morton_emulated_4A, castedShift); + + // Unsigned right-shift + arithmetic_right_shift_operator > rightShiftSmall2; + mortonUnsignedRightShift_small_2 = rightShiftSmall2(morton_small_2A, castedShift); + arithmetic_right_shift_operator > rightShiftMedium2; + mortonUnsignedRightShift_medium_2 = rightShiftMedium2(morton_medium_2A, castedShift); + arithmetic_right_shift_operator > rightShiftFull2; + mortonUnsignedRightShift_full_2 = rightShiftFull2(morton_full_2A, castedShift); + arithmetic_right_shift_operator > rightShiftEmulated2; + mortonUnsignedRightShift_emulated_2 = rightShiftEmulated2(morton_emulated_2A, castedShift); + + arithmetic_right_shift_operator > rightShiftSmall3; + mortonUnsignedRightShift_small_3 = rightShiftSmall3(morton_small_3A, castedShift); + arithmetic_right_shift_operator > rightShiftMedium3; + mortonUnsignedRightShift_medium_3 = rightShiftMedium3(morton_medium_3A, castedShift); + arithmetic_right_shift_operator > rightShiftFull3; + mortonUnsignedRightShift_full_3 = rightShiftFull3(morton_full_3A, castedShift); + arithmetic_right_shift_operator > rightShiftEmulated3; + mortonUnsignedRightShift_emulated_3 = rightShiftEmulated3(morton_emulated_3A, castedShift); + + arithmetic_right_shift_operator > rightShiftSmall4; + mortonUnsignedRightShift_small_4 = rightShiftSmall4(morton_small_4A, castedShift); + 
arithmetic_right_shift_operator > rightShiftMedium4; + mortonUnsignedRightShift_medium_4 = rightShiftMedium4(morton_medium_4A, castedShift); + arithmetic_right_shift_operator > rightShiftFull4; + mortonUnsignedRightShift_full_4 = rightShiftFull4(morton_full_4A, castedShift); + arithmetic_right_shift_operator > rightShiftEmulated4; + mortonUnsignedRightShift_emulated_4 = rightShiftEmulated4(morton_emulated_4A, castedShift); + + // Signed right-shift + arithmetic_right_shift_operator > rightShiftSignedSmall2; + mortonSignedRightShift_small_2 = rightShiftSignedSmall2(morton_small_2ASigned, castedShift); + arithmetic_right_shift_operator > rightShiftSignedMedium2; + mortonSignedRightShift_medium_2 = rightShiftSignedMedium2(morton_medium_2ASigned, castedShift); + arithmetic_right_shift_operator > rightShiftSignedFull2; + mortonSignedRightShift_full_2 = rightShiftSignedFull2(morton_full_2ASigned, castedShift); + arithmetic_right_shift_operator > rightShiftSignedEmulated2; + //mortonSignedRightShift_emulated_2 = rightShiftSignedEmulated2(morton_emulated_2ASigned, castedShift); + + arithmetic_right_shift_operator > rightShiftSignedSmall3; + mortonSignedRightShift_small_3 = rightShiftSignedSmall3(morton_small_3ASigned, castedShift); + arithmetic_right_shift_operator > rightShiftSignedMedium3; + mortonSignedRightShift_medium_3 = rightShiftSignedMedium3(morton_medium_3ASigned, castedShift); + arithmetic_right_shift_operator > rightShiftSignedFull3; + mortonSignedRightShift_full_3 = rightShiftSignedFull3(morton_full_3ASigned, castedShift); + arithmetic_right_shift_operator > rightShiftSignedEmulated3; + //mortonSignedRightShift_emulated_3 = rightShiftSignedEmulated3(morton_emulated_3ASigned, castedShift); + arithmetic_right_shift_operator > rightShiftSignedSmall4; + mortonSignedRightShift_small_4 = rightShiftSignedSmall4(morton_small_4ASigned, castedShift); + arithmetic_right_shift_operator > rightShiftSignedMedium4; + mortonSignedRightShift_medium_4 = 
rightShiftSignedMedium4(morton_medium_4ASigned, castedShift); + arithmetic_right_shift_operator > rightShiftSignedFull4; + mortonSignedRightShift_full_4 = rightShiftSignedFull4(morton_full_4ASigned, castedShift); + arithmetic_right_shift_operator > rightShiftSignedEmulated4; + //mortonSignedRightShift_emulated_4 = rightShiftSignedEmulated4(morton_emulated_4ASigned, castedShift); } }; diff --git a/12_Mortons/app_resources/mortonTest.comp.hlsl b/12_Mortons/app_resources/mortonTest.comp.hlsl new file mode 100644 index 000000000..7041568b8 --- /dev/null +++ b/12_Mortons/app_resources/mortonTest.comp.hlsl @@ -0,0 +1,16 @@ +//// Copyright (C) 2023-2024 - DevSH Graphics Programming Sp. z O.O. +//// This file is part of the "Nabla Engine". +//// For conditions of distribution and use, see copyright notice in nabla.h +#pragma shader_stage(compute) + +#include "common.hlsl" + +[[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; +[[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; + +[numthreads(256, 1, 1)] +void main(uint3 invocationID : SV_DispatchThreadID) +{ + if (invocationID.x == 0) + outputTestValues[0].fillTestValues(inputTestValues[0]); +} diff --git a/12_Mortons/main.cpp b/12_Mortons/main.cpp index 8118ec939..f83c49b9e 100644 --- a/12_Mortons/main.cpp +++ b/12_Mortons/main.cpp @@ -1,242 +1,80 @@ -// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O. +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h +#include +#include +#include +#include - -// I've moved out a tiny part of this example into a shared header for reuse, please open and read it. 
#include "nbl/application_templates/MonoDeviceApplication.hpp" #include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" #include "app_resources/common.hlsl" -#include - -// Right now the test only checks that HLSL compiles the file -constexpr bool TestHLSL = true; +#include "Tester.h" -using namespace nbl; -using namespace core; -using namespace system; -using namespace asset; -using namespace video; +using namespace nbl::core; +using namespace nbl::hlsl; +using namespace nbl::system; +using namespace nbl::asset; +using namespace nbl::video; +using namespace nbl::application_templates; -// this time instead of defining our own `int main()` we derive from `nbl::system::IApplicationFramework` to play "nice" wil all platforms -class MortonTestApp final : public application_templates::MonoDeviceApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication +class MortonTest final : public MonoDeviceApplication, public MonoAssetManagerAndBuiltinResourceApplication { - using device_base_t = application_templates::MonoDeviceApplication; - using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; - - inline core::smart_refctd_ptr createShader( - const char* includeMainName) - { - std::string prelude = "#include \""; - auto CPUShader = core::make_smart_refctd_ptr((prelude + includeMainName + "\"\n").c_str(), IShader::E_SHADER_STAGE::ESS_COMPUTE, IShader::E_CONTENT_TYPE::ECT_HLSL, includeMainName); - assert(CPUShader); - return m_device->createShader(CPUShader.get()); - } - public: - MortonTestApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : - system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} - - // we stuff all our work here because its a "single shot" app - bool onAppInitialized(smart_refctd_ptr&& system) override - { - // Remember to call the base class 
initialization! - if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) - return false; - if (!asset_base_t::onAppInitialized(std::move(system))) - return false; - { - using namespace nbl::hlsl; - - auto bar = morton::code::create(hlsl::vector(893728, 7843, 98032)); - auto foo = _static_cast>(bar); - std::cout << foo[0] << " " << foo[1] << " " << foo[2] << " " << std::endl; - - //auto bar = morton::code::create(hlsl::vector(893728, 7843, 98032)); - //std::cout << "High Encoded: " << std::bitset<32>(bar.value.data.x) << std::endl; - //std::cout << "Low Encoded: " << std::bitset<32>(bar.value.data.y) << std::endl; - } - /* - - // ----------------------------------------------- CPP TESTS ---------------------------------------------------------------------- - - // Coordinate extraction and whole vector decode tests - { - morton_t morton(vector_t(-1011, 765, 248)); - unsigned_morton_t unsignedMorton(unsigned_vector_t(154, 789, 1011)); - - assert(morton.getCoordinate(0) == -1011 && morton.getCoordinate(1) == 765 && morton.getCoordinate(2) == 248); - assert(unsignedMorton.getCoordinate(0) == 154u && unsignedMorton.getCoordinate(1) == 789u && unsignedMorton.getCoordinate(2) == 1011u); - - assert(static_cast(morton) == vector_t(-1011, 765, 248) && static_cast(unsignedMorton) == unsigned_vector_t(154, 789, 1011)); - } - - // *********************************************************************************************************************************** - // ************************************************* Arithmetic operator tests ******************************************************* - // *********************************************************************************************************************************** - - // ---------------------------------------------------------------------------------------------------- - // --------------------------------------- ADDITION --------------------------------------------------- - // 
---------------------------------------------------------------------------------------------------- - - // ---------------------------------------- Signed ----------------------------------------------------- - - // No overflow - assert(static_cast(morton_t(vector_t(-1011, 765, 248)) + morton_t(vector_t(1000, -985, 200))) == vector_t(-11, -220, 448)); - - // Type 1 overflow: Addition of representable coordinates goes out of range - assert(static_cast(morton_t(vector_t(-900, 70, 500)) + morton_t(vector_t(-578, -50, 20))) == vector_t(570, 20, -504)); - - // Type 2 overflow: Addition of irrepresentable range gives correct result - assert(static_cast(morton_t(vector_t(54, 900, -475)) + morton_t(vector_t(46, -1437, 699))) == vector_t(100, -537, 224)); - - // ---------------------------------------- Unsigned ----------------------------------------------------- - - // No overflow - assert(static_cast(unsigned_morton_t(unsigned_vector_t(382, 910, 543)) + unsigned_morton_t(unsigned_vector_t(1563, 754, 220))) == unsigned_vector_t(1945, 1664, 763)); - - // Type 1 overflow: Addition of representable coordinates goes out of range - assert(static_cast(unsigned_morton_t(unsigned_vector_t(382, 910, 543)) + unsigned_morton_t(unsigned_vector_t(2000, 2000, 1000))) == unsigned_vector_t(334, 862, 519)); - - // Type 2 overflow: Addition of irrepresentable range gives correct result - assert(static_cast(unsigned_morton_t(unsigned_vector_t(382, 910, 543)) + unsigned_morton_t(unsigned_vector_t(-143, -345, -233))) == unsigned_vector_t(239, 565, 310)); - - // ---------------------------------------------------------------------------------------------------- - // -------------------------------------- SUBTRACTION ------------------------------------------------- - // ---------------------------------------------------------------------------------------------------- - - // ---------------------------------------- Signed ----------------------------------------------------- - - // No 
overflow - assert(static_cast(morton_t(vector_t(1000, 764, -365)) - morton_t(vector_t(834, -243, 100))) == vector_t(166, 1007, -465)); - - // Type 1 overflow: Subtraction of representable coordinates goes out of range - assert(static_cast(morton_t(vector_t(-900, 70, 500)) - morton_t(vector_t(578, -50, -20))) == vector_t(570, 120, -504)); - - // Type 2 overflow: Subtraction of irrepresentable range gives correct result - assert(static_cast(morton_t(vector_t(54, 900, -475)) - morton_t(vector_t(-46, 1437, -699))) == vector_t(100, -537, 224)); - - // ---------------------------------------- Unsigned ----------------------------------------------------- - - // No overflow - assert(static_cast(unsigned_morton_t(unsigned_vector_t(382, 910, 543)) - unsigned_morton_t(unsigned_vector_t(322, 564, 299))) == unsigned_vector_t(60, 346, 244)); - - // Type 1 overflow: Subtraction of representable coordinates goes out of range - assert(static_cast(unsigned_morton_t(unsigned_vector_t(382, 910, 543)) - unsigned_morton_t(unsigned_vector_t(2000, 2000, 1000))) == unsigned_vector_t(430, 958, 567)); - - // Type 2 overflow: Subtraction of irrepresentable range gives correct result - assert(static_cast(unsigned_morton_t(unsigned_vector_t(54, 900, 475)) - unsigned_morton_t(unsigned_vector_t(-865, -100, -10))) == unsigned_vector_t(919, 1000, 485)); - - - // ---------------------------------------------------------------------------------------------------- - // -------------------------------------- UNARY NEGATION ---------------------------------------------- - // ---------------------------------------------------------------------------------------------------- - - // Only makes sense for signed - assert(static_cast(- morton_t(vector_t(-1024, 543, -475))) == vector_t(-1024, -543, 475)); - - // *********************************************************************************************************************************** - // ************************************************* Comparison 
operator tests ******************************************************* - // *********************************************************************************************************************************** - - // ---------------------------------------------------------------------------------------------------- - // -------------------------------------- OPERATOR< --------------------------------------------------- - // ---------------------------------------------------------------------------------------------------- - - // Signed - - // Same sign, negative - assert(morton_t(vector_t(-954, -455, -333)) < morton_t(vector_t(-433, -455, -433)) == bool_vector_t(true, false, false)); - // Same sign, positive - assert(morton_t(vector_t(954, 455, 333)) < morton_t(vector_t(433, 455, 433)) == bool_vector_t(false, false, true)); - // Differing signs - assert(morton_t(vector_t(954, -32, 0)) < morton_t(vector_t(-44, 0, -1)) == bool_vector_t(false, true, false)); - - // Unsigned - assert(unsigned_morton_t(unsigned_vector_t(239, 435, 66)) < unsigned_morton_t(unsigned_vector_t(240, 435, 50)) == bool_vector_t(true, false, false)); - - // ---------------------------------------------------------------------------------------------------- - // -------------------------------------- OPERATOR<= -------------------------------------------------- - // ---------------------------------------------------------------------------------------------------- - - // Signed - - // Same sign, negative - assert(morton_t(vector_t(-954, -455, -333)) <= morton_t(vector_t(-433, -455, -433)) == bool_vector_t(true, true, false)); - // Same sign, positive - assert(morton_t(vector_t(954, 455, 333)) <= morton_t(vector_t(433, 455, 433)) == bool_vector_t(false, true, true)); - // Differing signs - assert(morton_t(vector_t(954, -32, 0)) <= morton_t(vector_t(-44, 0, -1)) == bool_vector_t(false, true, false)); - - // Unsigned - assert(unsigned_morton_t(unsigned_vector_t(239, 435, 66)) <= 
unsigned_morton_t(unsigned_vector_t(240, 435, 50)) == bool_vector_t(true, true, false)); - - // ---------------------------------------------------------------------------------------------------- - // -------------------------------------- OPERATOR> --------------------------------------------------- - // ---------------------------------------------------------------------------------------------------- - - // Signed - - // Same sign, negative - assert(morton_t(vector_t(-954, -455, -333)) > morton_t(vector_t(-433, -455, -433)) == bool_vector_t(false, false, true)); - // Same sign, positive - assert(morton_t(vector_t(954, 455, 333)) > morton_t(vector_t(433, 455, 433)) == bool_vector_t(true, false, false)); - // Differing signs - assert(morton_t(vector_t(954, -32, 0)) > morton_t(vector_t(-44, 0, -1)) == bool_vector_t(true, false, true)); - - // Unsigned - assert(unsigned_morton_t(unsigned_vector_t(239, 435, 66)) > unsigned_morton_t(unsigned_vector_t(240, 435, 50)) == bool_vector_t(false, false, true)); - - // ---------------------------------------------------------------------------------------------------- - // -------------------------------------- OPERATOR>= -------------------------------------------------- - // ---------------------------------------------------------------------------------------------------- - - // Signed - - // Same sign, negative - assert(morton_t(vector_t(-954, -455, -333)) >= morton_t(vector_t(-433, -455, -433)) == bool_vector_t(false, true, true)); - // Same sign, positive - assert(morton_t(vector_t(954, 455, 333)) >= morton_t(vector_t(433, 455, 433)) == bool_vector_t(true, true, false)); - // Differing signs - assert(morton_t(vector_t(954, -32, 0)) >= morton_t(vector_t(-44, 0, -1)) == bool_vector_t(true, false, true)); - - // Unsigned - assert(unsigned_morton_t(unsigned_vector_t(239, 435, 66)) >= unsigned_morton_t(unsigned_vector_t(240, 435, 50)) == bool_vector_t(false, true, true)); - - */ - - return true; - } - - // Platforms like 
WASM expect the main entry point to periodically return control, hence if you want a crossplatform app, you have to let the framework deal with your "game loop" - void workLoopBody() override {} - - // Whether to keep invoking the above. In this example because its headless GPU compute, we do all the work in the app initialization. - bool keepRunning() override {return false;} - - // Cleanup - bool onAppTerminated() override - { - return device_base_t::onAppTerminated(); - } - - private: - smart_refctd_ptr m_pipeline; - - smart_refctd_ptr m_utils; - - StreamingTransientDataBufferMT<>* m_downStreamingBuffer; - smart_refctd_ptr m_deviceLocalBuffer; - - // These are Buffer Device Addresses - uint64_t m_downStreamingBufferAddress; - uint64_t m_deviceLocalBufferAddress; - - uint32_t m_alignment; - - smart_refctd_ptr m_timeline; - uint64_t semaphorValue = 0; + using device_base_t = MonoDeviceApplication; + using asset_base_t = MonoAssetManagerAndBuiltinResourceApplication; +public: + MortonTest(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : + IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) { + } + + bool onAppInitialized(smart_refctd_ptr&& system) override + { + // Remember to call the base class initialization! 
+ if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + if (!asset_base_t::onAppInitialized(std::move(system))) + return false; + { + + } + + Tester::PipelineSetupData pplnSetupData; + pplnSetupData.device = m_device; + pplnSetupData.api = m_api; + pplnSetupData.assetMgr = m_assetMgr; + pplnSetupData.logger = m_logger; + pplnSetupData.physicalDevice = m_physicalDevice; + pplnSetupData.computeFamilyIndex = getComputeQueue()->getFamilyIndex(); + { + Tester mortonTester; + pplnSetupData.testShaderPath = "app_resources/mortonTest.comp.hlsl"; + mortonTester.setupPipeline(pplnSetupData); + mortonTester.performTests(); + } + + + return true; + } + + void onAppTerminated_impl() override + { + m_device->waitIdle(); + } + + void workLoopBody() override + { + m_keepRunning = false; + } + + bool keepRunning() override + { + return m_keepRunning; + } + + +private: + bool m_keepRunning = true; }; - -NBL_MAIN_FUNC(MortonTestApp) \ No newline at end of file +NBL_MAIN_FUNC(MortonTest) \ No newline at end of file diff --git a/22_CppCompat/ITester.h b/22_CppCompat/ITester.h index a216fbf40..207cdee51 100644 --- a/22_CppCompat/ITester.h +++ b/22_CppCompat/ITester.h @@ -217,6 +217,7 @@ class ITester { case TestType::CPU: ss << "CPU TEST ERROR:\n"; + break; case TestType::GPU: ss << "GPU TEST ERROR:\n"; } From c68c336317024ae80fb017b1cb71e6b32a152224 Mon Sep 17 00:00:00 2001 From: Fletterio Date: Mon, 28 Apr 2025 15:16:34 -0300 Subject: [PATCH 8/9] Done with tests --- 12_Mortons/CTester.h | 401 ++++++++++++++++++ 12_Mortons/{Tester.h => ITester.h} | 133 +----- 12_Mortons/app_resources/common.hlsl | 299 ++----------- .../{mortonTest.comp.hlsl => test.comp.hlsl} | 5 +- 12_Mortons/app_resources/testCommon.hlsl | 242 +++++++++++ 12_Mortons/main.cpp | 13 +- 6 files changed, 691 insertions(+), 402 deletions(-) create mode 100644 12_Mortons/CTester.h rename 12_Mortons/{Tester.h => ITester.h} (66%) rename 12_Mortons/app_resources/{mortonTest.comp.hlsl => 
test.comp.hlsl} (79%) create mode 100644 12_Mortons/app_resources/testCommon.hlsl diff --git a/12_Mortons/CTester.h b/12_Mortons/CTester.h new file mode 100644 index 000000000..5a61be501 --- /dev/null +++ b/12_Mortons/CTester.h @@ -0,0 +1,401 @@ +#ifndef _NBL_EXAMPLES_TESTS_12_MORTON_C_TESTER_INCLUDED_ +#define _NBL_EXAMPLES_TESTS_12_MORTON_C_TESTER_INCLUDED_ + +#include +#include "app_resources/testCommon.hlsl" +#include "nbl/application_templates/MonoDeviceApplication.hpp" +#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" +#include "ITester.h" + +using namespace nbl; + +class CTester final : public ITester +{ +public: + void performTests() + { + std::random_device rd; + std::mt19937 mt(rd()); + + std::uniform_int_distribution shortDistribution(uint16_t(0), std::numeric_limits::max()); + std::uniform_int_distribution intDistribution(uint32_t(0), std::numeric_limits::max()); + std::uniform_int_distribution longDistribution(uint64_t(0), std::numeric_limits::max()); + + m_logger->log("TESTS:", system::ILogger::ELL_PERFORMANCE); + for (int i = 0; i < Iterations; ++i) + { + // Set input test values that will be used in both CPU and GPU tests + InputTestValues testInput; + // use std library or glm functions to determine expected test values, the output of the tested functions will be verified against these values + TestValues expected; + + uint32_t generatedShift = intDistribution(mt) & uint32_t(63); + testInput.shift = generatedShift; + { + uint64_t generatedA = longDistribution(mt); + uint64_t generatedB = longDistribution(mt); + + testInput.generatedA = generatedA; + testInput.generatedB = generatedB; + + expected.emulatedAnd = _static_cast(generatedA & generatedB); + expected.emulatedOr = _static_cast(generatedA | generatedB); + expected.emulatedXor = _static_cast(generatedA ^ generatedB); + expected.emulatedNot = _static_cast(~generatedA); + expected.emulatedPlus = _static_cast(generatedA + generatedB); + 
expected.emulatedMinus = _static_cast(generatedA - generatedB); + expected.emulatedLess = uint32_t(generatedA < generatedB); + expected.emulatedLessEqual = uint32_t(generatedA <= generatedB); + expected.emulatedGreater = uint32_t(generatedA > generatedB); + expected.emulatedGreaterEqual = uint32_t(generatedA >= generatedB); + + expected.emulatedLeftShifted = _static_cast(generatedA << generatedShift); + expected.emulatedUnsignedRightShifted = _static_cast(generatedA >> generatedShift); + expected.emulatedSignedRightShifted = _static_cast(static_cast(generatedA) >> generatedShift); + } + { + testInput.coordX = longDistribution(mt); + testInput.coordY = longDistribution(mt); + testInput.coordZ = longDistribution(mt); + testInput.coordW = longDistribution(mt); + + uint64_t2 Vec2A = { testInput.coordX, testInput.coordY }; + uint64_t2 Vec2B = { testInput.coordZ, testInput.coordW }; + + uint16_t2 Vec2ASmall = uint16_t2(Vec2A & smallBitsMask_2 ); + uint16_t2 Vec2BSmall = uint16_t2(Vec2B & smallBitsMask_2 ); + uint16_t2 Vec2AMedium = uint16_t2(Vec2A & mediumBitsMask_2); + uint16_t2 Vec2BMedium = uint16_t2(Vec2B & mediumBitsMask_2); + uint32_t2 Vec2AFull = uint32_t2(Vec2A & fullBitsMask_2); + uint32_t2 Vec2BFull = uint32_t2(Vec2B & fullBitsMask_2); + + uint64_t3 Vec3A = { testInput.coordX, testInput.coordY, testInput.coordZ }; + uint64_t3 Vec3B = { testInput.coordY, testInput.coordZ, testInput.coordW }; + + uint16_t3 Vec3ASmall = uint16_t3(Vec3A & smallBitsMask_3); + uint16_t3 Vec3BSmall = uint16_t3(Vec3B & smallBitsMask_3); + uint16_t3 Vec3AMedium = uint16_t3(Vec3A & mediumBitsMask_3); + uint16_t3 Vec3BMedium = uint16_t3(Vec3B & mediumBitsMask_3); + uint32_t3 Vec3AFull = uint32_t3(Vec3A & fullBitsMask_3); + uint32_t3 Vec3BFull = uint32_t3(Vec3B & fullBitsMask_3); + + uint64_t4 Vec4A = { testInput.coordX, testInput.coordY, testInput.coordZ, testInput.coordW }; + uint64_t4 Vec4B = { testInput.coordY, testInput.coordZ, testInput.coordW, testInput.coordX }; + + uint16_t4 
Vec4ASmall = uint16_t4(Vec4A & smallBitsMask_4); + uint16_t4 Vec4BSmall = uint16_t4(Vec4B & smallBitsMask_4); + uint16_t4 Vec4AMedium = uint16_t4(Vec4A & mediumBitsMask_4); + uint16_t4 Vec4BMedium = uint16_t4(Vec4B & mediumBitsMask_4); + uint16_t4 Vec4AFull = uint16_t4(Vec4A & fullBitsMask_4); + uint16_t4 Vec4BFull = uint16_t4(Vec4B & fullBitsMask_4); + + // Signed vectors can't just have their highest bits masked off, for them to preserve sign we also need to left shift then right shift them + // so their highest bits are all 0s or 1s depending on the sign of the number they encode + + int16_t2 Vec2ASignedSmall = int16_t2(Vec2ASmall << uint16_t(16 - smallBits_2)) >> int16_t(16 - smallBits_2); + int16_t2 Vec2BSignedSmall = int16_t2(Vec2BSmall << uint16_t(16 - smallBits_2)) >> int16_t(16 - smallBits_2); + int16_t2 Vec2ASignedMedium = int16_t2(Vec2AMedium << uint16_t(16 - mediumBits_2)) >> int16_t(16 - mediumBits_2); + int16_t2 Vec2BSignedMedium = int16_t2(Vec2BMedium << uint16_t(16 - mediumBits_2)) >> int16_t(16 - mediumBits_2); + int32_t2 Vec2ASignedFull = int32_t2(Vec2AFull << uint32_t(32 - fullBits_2)) >> int32_t(32 - fullBits_2); + int32_t2 Vec2BSignedFull = int32_t2(Vec2BFull << uint32_t(32 - fullBits_2)) >> int32_t(32 - fullBits_2); + + int16_t3 Vec3ASignedSmall = int16_t3(Vec3ASmall << uint16_t(16 - smallBits_3)) >> int16_t(16 - smallBits_3); + int16_t3 Vec3BSignedSmall = int16_t3(Vec3BSmall << uint16_t(16 - smallBits_3)) >> int16_t(16 - smallBits_3); + int16_t3 Vec3ASignedMedium = int16_t3(Vec3AMedium << uint16_t(16 - mediumBits_3)) >> int16_t(16 - mediumBits_3); + int16_t3 Vec3BSignedMedium = int16_t3(Vec3BMedium << uint16_t(16 - mediumBits_3)) >> int16_t(16 - mediumBits_3); + int32_t3 Vec3ASignedFull = int32_t3(Vec3AFull << uint32_t(32 - fullBits_3)) >> int32_t(32 - fullBits_3); + int32_t3 Vec3BSignedFull = int32_t3(Vec3BFull << uint32_t(32 - fullBits_3)) >> int32_t(32 - fullBits_3); + + int16_t4 Vec4ASignedSmall = int16_t4(Vec4ASmall << uint16_t(16 - 
smallBits_4)) >> int16_t(16 - smallBits_4); + int16_t4 Vec4BSignedSmall = int16_t4(Vec4BSmall << uint16_t(16 - smallBits_4)) >> int16_t(16 - smallBits_4); + int16_t4 Vec4ASignedMedium = int16_t4(Vec4AMedium << uint16_t(16 - mediumBits_4)) >> int16_t(16 - mediumBits_4); + int16_t4 Vec4BSignedMedium = int16_t4(Vec4BMedium << uint16_t(16 - mediumBits_4)) >> int16_t(16 - mediumBits_4); + int16_t4 Vec4ASignedFull = int16_t4(Vec4AFull << uint16_t(16 - fullBits_4)) >> int16_t(16 - fullBits_4); + int16_t4 Vec4BSignedFull = int16_t4(Vec4BFull << uint16_t(16 - fullBits_4)) >> int16_t(16 - fullBits_4); + + // Plus + expected.mortonPlus_small_2 = morton::code::create(Vec2ASmall + Vec2BSmall); + expected.mortonPlus_medium_2 = morton::code::create(Vec2AMedium + Vec2BMedium); + expected.mortonPlus_full_2 = morton::code::create(Vec2AFull + Vec2BFull); + expected.mortonPlus_emulated_2 = morton::code::create(Vec2AFull + Vec2BFull); + + expected.mortonPlus_small_3 = morton::code::create(Vec3ASmall + Vec3BSmall); + expected.mortonPlus_medium_3 = morton::code::create(Vec3AMedium + Vec3BMedium); + expected.mortonPlus_full_3 = morton::code::create(Vec3AFull + Vec3BFull); + expected.mortonPlus_emulated_3 = morton::code::create(Vec3AFull + Vec3BFull); + + expected.mortonPlus_small_4 = morton::code::create(Vec4ASmall + Vec4BSmall); + expected.mortonPlus_medium_4 = morton::code::create(Vec4AMedium + Vec4BMedium); + expected.mortonPlus_full_4 = morton::code::create(Vec4AFull + Vec4BFull); + expected.mortonPlus_emulated_4 = morton::code::create(Vec4AFull + Vec4BFull); + + // Minus + expected.mortonMinus_small_2 = morton::code::create(Vec2ASmall - Vec2BSmall); + expected.mortonMinus_medium_2 = morton::code::create(Vec2AMedium - Vec2BMedium); + expected.mortonMinus_full_2 = morton::code::create(Vec2AFull - Vec2BFull); + expected.mortonMinus_emulated_2 = morton::code::create(Vec2AFull - Vec2BFull); + + expected.mortonMinus_small_3 = morton::code::create(Vec3ASmall - Vec3BSmall); + 
expected.mortonMinus_medium_3 = morton::code::create(Vec3AMedium - Vec3BMedium); + expected.mortonMinus_full_3 = morton::code::create(Vec3AFull - Vec3BFull); + expected.mortonMinus_emulated_3 = morton::code::create(Vec3AFull - Vec3BFull); + + expected.mortonMinus_small_4 = morton::code::create(Vec4ASmall - Vec4BSmall); + expected.mortonMinus_medium_4 = morton::code::create(Vec4AMedium - Vec4BMedium); + expected.mortonMinus_full_4 = morton::code::create(Vec4AFull - Vec4BFull); + expected.mortonMinus_emulated_4 = morton::code::create(Vec4AFull - Vec4BFull); + + // Coordinate-wise equality + expected.mortonEqual_small_2 = uint32_t2(glm::equal(Vec2ASmall, Vec2BSmall)); + expected.mortonEqual_medium_2 = uint32_t2(glm::equal(Vec2AMedium, Vec2BMedium)); + expected.mortonEqual_full_2 = uint32_t2(glm::equal(Vec2AFull, Vec2BFull)); + expected.mortonEqual_emulated_2 = uint32_t2(glm::equal(Vec2AFull, Vec2BFull)); + + expected.mortonEqual_small_3 = uint32_t3(glm::equal(Vec3ASmall, Vec3BSmall)); + expected.mortonEqual_medium_3 = uint32_t3(glm::equal(Vec3AMedium, Vec3BMedium)); + expected.mortonEqual_full_3 = uint32_t3(glm::equal(Vec3AFull, Vec3BFull)); + expected.mortonEqual_emulated_3 = uint32_t3(glm::equal(Vec3AFull, Vec3BFull)); + + expected.mortonEqual_small_4 = uint32_t4(glm::equal(Vec4ASmall, Vec4BSmall)); + expected.mortonEqual_medium_4 = uint32_t4(glm::equal(Vec4AMedium, Vec4BMedium)); + expected.mortonEqual_full_4 = uint32_t4(glm::equal(Vec4AFull, Vec4BFull)); + + // Coordinate-wise unsigned inequality (just testing with less) + expected.mortonUnsignedLess_small_2 = uint32_t2(glm::lessThan(Vec2ASmall, Vec2BSmall)); + expected.mortonUnsignedLess_medium_2 = uint32_t2(glm::lessThan(Vec2AMedium, Vec2BMedium)); + expected.mortonUnsignedLess_full_2 = uint32_t2(glm::lessThan(Vec2AFull, Vec2BFull)); + expected.mortonUnsignedLess_emulated_2 = uint32_t2(glm::lessThan(Vec2AFull, Vec2BFull)); + + expected.mortonUnsignedLess_small_3 = uint32_t3(glm::lessThan(Vec3ASmall, 
Vec3BSmall)); + expected.mortonUnsignedLess_medium_3 = uint32_t3(glm::lessThan(Vec3AMedium, Vec3BMedium)); + expected.mortonUnsignedLess_full_3 = uint32_t3(glm::lessThan(Vec3AFull, Vec3BFull)); + expected.mortonUnsignedLess_emulated_3 = uint32_t3(glm::lessThan(Vec3AFull, Vec3BFull)); + + expected.mortonUnsignedLess_small_4 = uint32_t4(glm::lessThan(Vec4ASmall, Vec4BSmall)); + expected.mortonUnsignedLess_medium_4 = uint32_t4(glm::lessThan(Vec4AMedium, Vec4BMedium)); + expected.mortonUnsignedLess_full_4 = uint32_t4(glm::lessThan(Vec4AFull, Vec4BFull)); + + // Coordinate-wise signed inequality + expected.mortonSignedLess_small_2 = uint32_t2(glm::lessThan(Vec2ASignedSmall, Vec2BSignedSmall)); + expected.mortonSignedLess_medium_2 = uint32_t2(glm::lessThan(Vec2ASignedMedium, Vec2BSignedMedium)); + expected.mortonSignedLess_full_2 = uint32_t2(glm::lessThan(Vec2ASignedFull, Vec2BSignedFull)); + + expected.mortonSignedLess_small_3 = uint32_t3(glm::lessThan(Vec3ASignedSmall, Vec3BSignedSmall)); + expected.mortonSignedLess_medium_3 = uint32_t3(glm::lessThan(Vec3ASignedMedium, Vec3BSignedMedium)); + expected.mortonSignedLess_full_3 = uint32_t3(glm::lessThan(Vec3ASignedFull, Vec3BSignedFull)); + + expected.mortonSignedLess_small_4 = uint32_t4(glm::lessThan(Vec4ASignedSmall, Vec4BSignedSmall)); + expected.mortonSignedLess_medium_4 = uint32_t4(glm::lessThan(Vec4ASignedMedium, Vec4BSignedMedium)); + expected.mortonSignedLess_full_4 = uint32_t4(glm::lessThan(Vec4ASignedFull, Vec4BSignedFull)); + + uint16_t castedShift = uint16_t(generatedShift); + // Left-shift + expected.mortonLeftShift_small_2 = morton::code::create((Vec2ASmall << uint16_t(castedShift % smallBits_2)) & uint16_t(smallBitsMask_2)); + expected.mortonLeftShift_medium_2 = morton::code::create((Vec2AMedium << uint16_t(castedShift % mediumBits_2)) & uint16_t(mediumBitsMask_2)); + expected.mortonLeftShift_full_2 = morton::code::create((Vec2AFull << uint32_t(castedShift % fullBits_2)) & uint32_t(fullBitsMask_2)); + 
expected.mortonLeftShift_emulated_2 = morton::code::create((Vec2AFull << uint32_t(castedShift % fullBits_2)) & uint32_t(fullBitsMask_2)); + + expected.mortonLeftShift_small_3 = morton::code::create((Vec3ASmall << uint16_t(castedShift % smallBits_3)) & uint16_t(smallBitsMask_3)); + expected.mortonLeftShift_medium_3 = morton::code::create((Vec3AMedium << uint16_t(castedShift % mediumBits_3)) & uint16_t(mediumBitsMask_3)); + expected.mortonLeftShift_full_3 = morton::code::create((Vec3AFull << uint32_t(castedShift % fullBits_3)) & uint32_t(fullBitsMask_3)); + expected.mortonLeftShift_emulated_3 = morton::code::create((Vec3AFull << uint32_t(castedShift % fullBits_3)) & uint32_t(fullBitsMask_3)); + + expected.mortonLeftShift_small_4 = morton::code::create((Vec4ASmall << uint16_t(castedShift % smallBits_4)) & uint16_t(smallBitsMask_4)); + expected.mortonLeftShift_medium_4 = morton::code::create((Vec4AMedium << uint16_t(castedShift % mediumBits_4)) & uint16_t(mediumBitsMask_4)); + expected.mortonLeftShift_full_4 = morton::code::create((Vec4AFull << uint16_t(castedShift % fullBits_4)) & uint16_t(fullBitsMask_4)); + expected.mortonLeftShift_emulated_4 = morton::code::create((Vec4AFull << uint16_t(castedShift % fullBits_4)) & uint16_t(fullBitsMask_4)); + + // Unsigned right-shift + expected.mortonUnsignedRightShift_small_2 = morton::code::create((Vec2ASmall >> uint16_t(castedShift % smallBits_2)) & uint16_t(smallBitsMask_2)); + expected.mortonUnsignedRightShift_medium_2 = morton::code::create((Vec2AMedium >> uint16_t(castedShift % mediumBits_2)) & uint16_t(mediumBitsMask_2)); + expected.mortonUnsignedRightShift_full_2 = morton::code::create((Vec2AFull >> uint32_t(castedShift % fullBits_2)) & uint32_t(fullBitsMask_2)); + expected.mortonUnsignedRightShift_emulated_2 = morton::code::create((Vec2AFull >> uint32_t(castedShift % fullBits_2))& uint32_t(fullBitsMask_2)); + + expected.mortonUnsignedRightShift_small_3 = morton::code::create((Vec3ASmall >> uint16_t(castedShift % 
smallBits_3)) & uint16_t(smallBitsMask_3)); + expected.mortonUnsignedRightShift_medium_3 = morton::code::create((Vec3AMedium >> uint16_t(castedShift % mediumBits_3)) & uint16_t(mediumBitsMask_3)); + expected.mortonUnsignedRightShift_full_3 = morton::code::create((Vec3AFull >> uint32_t(castedShift % fullBits_3)) & uint32_t(fullBitsMask_3)); + expected.mortonUnsignedRightShift_emulated_3 = morton::code::create((Vec3AFull >> uint32_t(castedShift % fullBits_3))& uint32_t(fullBitsMask_3)); + + expected.mortonUnsignedRightShift_small_4 = morton::code::create((Vec4ASmall >> uint16_t(castedShift % smallBits_4)) & uint16_t(smallBitsMask_4)); + expected.mortonUnsignedRightShift_medium_4 = morton::code::create((Vec4AMedium >> uint16_t(castedShift % mediumBits_4)) & uint16_t(mediumBitsMask_4)); + expected.mortonUnsignedRightShift_full_4 = morton::code::create((Vec4AFull >> uint16_t(castedShift % fullBits_4)) & uint16_t(fullBitsMask_4)); + expected.mortonUnsignedRightShift_emulated_4 = morton::code::create((Vec4AFull >> uint16_t(castedShift % fullBits_4))& uint16_t(fullBitsMask_4)); + + // Signed right-shift + expected.mortonSignedRightShift_small_2 = morton::code::create((Vec2ASignedSmall >> int16_t(castedShift % smallBits_2)) & int16_t(smallBitsMask_2)); + expected.mortonSignedRightShift_medium_2 = morton::code::create((Vec2ASignedMedium >> int16_t(castedShift % mediumBits_2)) & int16_t(mediumBitsMask_2)); + expected.mortonSignedRightShift_full_2 = morton::code::create((Vec2ASignedFull >> int32_t(castedShift % fullBits_2)) & int32_t(fullBitsMask_2)); + + expected.mortonSignedRightShift_small_3 = morton::code::create((Vec3ASignedSmall >> int16_t(castedShift % smallBits_3)) & int16_t(smallBitsMask_3)); + expected.mortonSignedRightShift_medium_3 = morton::code::create((Vec3ASignedMedium >> int16_t(castedShift % mediumBits_3)) & int16_t(mediumBitsMask_3)); + expected.mortonSignedRightShift_full_3 = morton::code::create((Vec3ASignedFull >> int32_t(castedShift % fullBits_3)) & 
int32_t(fullBitsMask_3)); + + expected.mortonSignedRightShift_small_4 = morton::code::create((Vec4ASignedSmall >> int16_t(castedShift % smallBits_4)) & int16_t(smallBitsMask_4)); + expected.mortonSignedRightShift_medium_4 = morton::code::create((Vec4ASignedMedium >> int16_t(castedShift % mediumBits_4)) & int16_t(mediumBitsMask_4)); + expected.mortonSignedRightShift_full_4 = morton::code::create((Vec4ASignedFull >> int16_t(castedShift % fullBits_4)) & int16_t(fullBitsMask_4)); + } + + performCpuTests(testInput, expected); + performGpuTests(testInput, expected); + } + m_logger->log("FIRST TESTS DONE.", system::ILogger::ELL_PERFORMANCE); + } + +private: + inline static constexpr int Iterations = 100u; + + void performCpuTests(const InputTestValues& commonTestInputValues, const TestValues& expectedTestValues) + { + TestValues cpuTestValues; + + fillTestValues(commonTestInputValues, cpuTestValues); + verifyTestValues(expectedTestValues, cpuTestValues, ITester::TestType::CPU); + + } + + void performGpuTests(const InputTestValues& commonTestInputValues, const TestValues& expectedTestValues) + { + TestValues gpuTestValues; + gpuTestValues = dispatch(commonTestInputValues); + verifyTestValues(expectedTestValues, gpuTestValues, ITester::TestType::GPU); + } + + void verifyTestValues(const TestValues& expectedTestValues, const TestValues& testValues, ITester::TestType testType) + { + verifyTestValue("emulatedAnd", expectedTestValues.emulatedAnd, testValues.emulatedAnd, testType); + verifyTestValue("emulatedOr", expectedTestValues.emulatedOr, testValues.emulatedOr, testType); + verifyTestValue("emulatedXor", expectedTestValues.emulatedXor, testValues.emulatedXor, testType); + verifyTestValue("emulatedNot", expectedTestValues.emulatedNot, testValues.emulatedNot, testType); + verifyTestValue("emulatedPlus", expectedTestValues.emulatedPlus, testValues.emulatedPlus, testType); + verifyTestValue("emulatedMinus", expectedTestValues.emulatedMinus, testValues.emulatedMinus, testType); 
+ verifyTestValue("emulatedLess", expectedTestValues.emulatedLess, testValues.emulatedLess, testType); + verifyTestValue("emulatedLessEqual", expectedTestValues.emulatedLessEqual, testValues.emulatedLessEqual, testType); + verifyTestValue("emulatedGreater", expectedTestValues.emulatedGreater, testValues.emulatedGreater, testType); + verifyTestValue("emulatedGreaterEqual", expectedTestValues.emulatedGreaterEqual, testValues.emulatedGreaterEqual, testType); + verifyTestValue("emulatedLeftShifted", expectedTestValues.emulatedLeftShifted, testValues.emulatedLeftShifted, testType); + verifyTestValue("emulatedUnsignedRightShifted", expectedTestValues.emulatedUnsignedRightShifted, testValues.emulatedUnsignedRightShifted, testType); + verifyTestValue("emulatedSignedRightShifted", expectedTestValues.emulatedSignedRightShifted, testValues.emulatedSignedRightShifted, testType); + + // Morton Plus + verifyTestValue("mortonPlus_small_2", expectedTestValues.mortonPlus_small_2, testValues.mortonPlus_small_2, testType); + verifyTestValue("mortonPlus_medium_2", expectedTestValues.mortonPlus_medium_2, testValues.mortonPlus_medium_2, testType); + verifyTestValue("mortonPlus_full_2", expectedTestValues.mortonPlus_full_2, testValues.mortonPlus_full_2, testType); + verifyTestValue("mortonPlus_emulated_2", expectedTestValues.mortonPlus_emulated_2, testValues.mortonPlus_emulated_2, testType); + + verifyTestValue("mortonPlus_small_3", expectedTestValues.mortonPlus_small_3, testValues.mortonPlus_small_3, testType); + verifyTestValue("mortonPlus_medium_3", expectedTestValues.mortonPlus_medium_3, testValues.mortonPlus_medium_3, testType); + verifyTestValue("mortonPlus_full_3", expectedTestValues.mortonPlus_full_3, testValues.mortonPlus_full_3, testType); + verifyTestValue("mortonPlus_emulated_3", expectedTestValues.mortonPlus_emulated_3, testValues.mortonPlus_emulated_3, testType); + + verifyTestValue("mortonPlus_small_4", expectedTestValues.mortonPlus_small_4, testValues.mortonPlus_small_4, 
testType); + verifyTestValue("mortonPlus_medium_4", expectedTestValues.mortonPlus_medium_4, testValues.mortonPlus_medium_4, testType); + verifyTestValue("mortonPlus_full_4", expectedTestValues.mortonPlus_full_4, testValues.mortonPlus_full_4, testType); + verifyTestValue("mortonPlus_emulated_4", expectedTestValues.mortonPlus_emulated_4, testValues.mortonPlus_emulated_4, testType); + + // Morton Minus + verifyTestValue("mortonMinus_small_2", expectedTestValues.mortonMinus_small_2, testValues.mortonMinus_small_2, testType); + verifyTestValue("mortonMinus_medium_2", expectedTestValues.mortonMinus_medium_2, testValues.mortonMinus_medium_2, testType); + verifyTestValue("mortonMinus_full_2", expectedTestValues.mortonMinus_full_2, testValues.mortonMinus_full_2, testType); + verifyTestValue("mortonMinus_emulated_2", expectedTestValues.mortonMinus_emulated_2, testValues.mortonMinus_emulated_2, testType); + + verifyTestValue("mortonMinus_small_3", expectedTestValues.mortonMinus_small_3, testValues.mortonMinus_small_3, testType); + verifyTestValue("mortonMinus_medium_3", expectedTestValues.mortonMinus_medium_3, testValues.mortonMinus_medium_3, testType); + verifyTestValue("mortonMinus_full_3", expectedTestValues.mortonMinus_full_3, testValues.mortonMinus_full_3, testType); + verifyTestValue("mortonMinus_emulated_3", expectedTestValues.mortonMinus_emulated_3, testValues.mortonMinus_emulated_3, testType); + + verifyTestValue("mortonMinus_small_4", expectedTestValues.mortonMinus_small_4, testValues.mortonMinus_small_4, testType); + verifyTestValue("mortonMinus_medium_4", expectedTestValues.mortonMinus_medium_4, testValues.mortonMinus_medium_4, testType); + verifyTestValue("mortonMinus_full_4", expectedTestValues.mortonMinus_full_4, testValues.mortonMinus_full_4, testType); + verifyTestValue("mortonMinus_emulated_4", expectedTestValues.mortonMinus_emulated_4, testValues.mortonMinus_emulated_4, testType); + + // Morton coordinate-wise equality + 
verifyTestValue("mortonEqual_small_2", expectedTestValues.mortonEqual_small_2, testValues.mortonEqual_small_2, testType); + verifyTestValue("mortonEqual_medium_2", expectedTestValues.mortonEqual_medium_2, testValues.mortonEqual_medium_2, testType); + verifyTestValue("mortonEqual_full_2", expectedTestValues.mortonEqual_full_2, testValues.mortonEqual_full_2, testType); + verifyTestValue("mortonEqual_emulated_2", expectedTestValues.mortonEqual_emulated_2, testValues.mortonEqual_emulated_2, testType); + + verifyTestValue("mortonEqual_small_3", expectedTestValues.mortonEqual_small_3, testValues.mortonEqual_small_3, testType); + verifyTestValue("mortonEqual_medium_3", expectedTestValues.mortonEqual_medium_3, testValues.mortonEqual_medium_3, testType); + verifyTestValue("mortonEqual_full_3", expectedTestValues.mortonEqual_full_3, testValues.mortonEqual_full_3, testType); + verifyTestValue("mortonEqual_emulated_3", expectedTestValues.mortonEqual_emulated_3, testValues.mortonEqual_emulated_3, testType); + + verifyTestValue("mortonEqual_small_4", expectedTestValues.mortonEqual_small_4, testValues.mortonEqual_small_4, testType); + verifyTestValue("mortonEqual_medium_4", expectedTestValues.mortonEqual_medium_4, testValues.mortonEqual_medium_4, testType); + verifyTestValue("mortonEqual_full_4", expectedTestValues.mortonEqual_full_4, testValues.mortonEqual_full_4, testType); + + // Morton coordinate-wise unsigned inequality + verifyTestValue("mortonUnsignedLess_small_2", expectedTestValues.mortonUnsignedLess_small_2, testValues.mortonUnsignedLess_small_2, testType); + verifyTestValue("mortonUnsignedLess_medium_2", expectedTestValues.mortonUnsignedLess_medium_2, testValues.mortonUnsignedLess_medium_2, testType); + verifyTestValue("mortonUnsignedLess_full_2", expectedTestValues.mortonUnsignedLess_full_2, testValues.mortonUnsignedLess_full_2, testType); + verifyTestValue("mortonUnsignedLess_emulated_2", expectedTestValues.mortonUnsignedLess_emulated_2, 
testValues.mortonUnsignedLess_emulated_2, testType); + + verifyTestValue("mortonUnsignedLess_small_3", expectedTestValues.mortonUnsignedLess_small_3, testValues.mortonUnsignedLess_small_3, testType); + verifyTestValue("mortonUnsignedLess_medium_3", expectedTestValues.mortonUnsignedLess_medium_3, testValues.mortonUnsignedLess_medium_3, testType); + verifyTestValue("mortonUnsignedLess_full_3", expectedTestValues.mortonUnsignedLess_full_3, testValues.mortonUnsignedLess_full_3, testType); + verifyTestValue("mortonUnsignedLess_emulated_3", expectedTestValues.mortonUnsignedLess_emulated_3, testValues.mortonUnsignedLess_emulated_3, testType); + + verifyTestValue("mortonUnsignedLess_small_4", expectedTestValues.mortonUnsignedLess_small_4, testValues.mortonUnsignedLess_small_4, testType); + verifyTestValue("mortonUnsignedLess_medium_4", expectedTestValues.mortonUnsignedLess_medium_4, testValues.mortonUnsignedLess_medium_4, testType); + verifyTestValue("mortonUnsignedLess_full_4", expectedTestValues.mortonUnsignedLess_full_4, testValues.mortonUnsignedLess_full_4, testType); + + // Morton coordinate-wise signed inequality + verifyTestValue("mortonSignedLess_small_2", expectedTestValues.mortonSignedLess_small_2, testValues.mortonSignedLess_small_2, testType); + verifyTestValue("mortonSignedLess_medium_2", expectedTestValues.mortonSignedLess_medium_2, testValues.mortonSignedLess_medium_2, testType); + verifyTestValue("mortonSignedLess_full_2", expectedTestValues.mortonSignedLess_full_2, testValues.mortonSignedLess_full_2, testType); + + verifyTestValue("mortonSignedLess_small_3", expectedTestValues.mortonSignedLess_small_3, testValues.mortonSignedLess_small_3, testType); + verifyTestValue("mortonSignedLess_medium_3", expectedTestValues.mortonSignedLess_medium_3, testValues.mortonSignedLess_medium_3, testType); + verifyTestValue("mortonSignedLess_full_3", expectedTestValues.mortonSignedLess_full_3, testValues.mortonSignedLess_full_3, testType); + + 
verifyTestValue("mortonSignedLess_small_4", expectedTestValues.mortonSignedLess_small_4, testValues.mortonSignedLess_small_4, testType); + verifyTestValue("mortonSignedLess_medium_4", expectedTestValues.mortonSignedLess_medium_4, testValues.mortonSignedLess_medium_4, testType); + verifyTestValue("mortonSignedLess_full_4", expectedTestValues.mortonSignedLess_full_4, testValues.mortonSignedLess_full_4, testType); + + // Morton left-shift + verifyTestValue("mortonLeftShift_small_2", expectedTestValues.mortonLeftShift_small_2, testValues.mortonLeftShift_small_2, testType); + verifyTestValue("mortonLeftShift_medium_2", expectedTestValues.mortonLeftShift_medium_2, testValues.mortonLeftShift_medium_2, testType); + verifyTestValue("mortonLeftShift_full_2", expectedTestValues.mortonLeftShift_full_2, testValues.mortonLeftShift_full_2, testType); + verifyTestValue("mortonLeftShift_emulated_2", expectedTestValues.mortonLeftShift_emulated_2, testValues.mortonLeftShift_emulated_2, testType); + + verifyTestValue("mortonLeftShift_small_3", expectedTestValues.mortonLeftShift_small_3, testValues.mortonLeftShift_small_3, testType); + verifyTestValue("mortonLeftShift_medium_3", expectedTestValues.mortonLeftShift_medium_3, testValues.mortonLeftShift_medium_3, testType); + verifyTestValue("mortonLeftShift_full_3", expectedTestValues.mortonLeftShift_full_3, testValues.mortonLeftShift_full_3, testType); + verifyTestValue("mortonLeftShift_emulated_3", expectedTestValues.mortonLeftShift_emulated_3, testValues.mortonLeftShift_emulated_3, testType); + + verifyTestValue("mortonLeftShift_small_4", expectedTestValues.mortonLeftShift_small_4, testValues.mortonLeftShift_small_4, testType); + verifyTestValue("mortonLeftShift_medium_4", expectedTestValues.mortonLeftShift_medium_4, testValues.mortonLeftShift_medium_4, testType); + verifyTestValue("mortonLeftShift_full_4", expectedTestValues.mortonLeftShift_full_4, testValues.mortonLeftShift_full_4, testType); + 
verifyTestValue("mortonLeftShift_emulated_4", expectedTestValues.mortonLeftShift_emulated_4, testValues.mortonLeftShift_emulated_4, testType); + + // Morton unsigned right-shift + verifyTestValue("mortonUnsignedRightShift_small_2", expectedTestValues.mortonUnsignedRightShift_small_2, testValues.mortonUnsignedRightShift_small_2, testType); + verifyTestValue("mortonUnsignedRightShift_medium_2", expectedTestValues.mortonUnsignedRightShift_medium_2, testValues.mortonUnsignedRightShift_medium_2, testType); + verifyTestValue("mortonUnsignedRightShift_full_2", expectedTestValues.mortonUnsignedRightShift_full_2, testValues.mortonUnsignedRightShift_full_2, testType); + verifyTestValue("mortonUnsignedRightShift_emulated_2", expectedTestValues.mortonUnsignedRightShift_emulated_2, testValues.mortonUnsignedRightShift_emulated_2, testType); + + verifyTestValue("mortonUnsignedRightShift_small_3", expectedTestValues.mortonUnsignedRightShift_small_3, testValues.mortonUnsignedRightShift_small_3, testType); + verifyTestValue("mortonUnsignedRightShift_medium_3", expectedTestValues.mortonUnsignedRightShift_medium_3, testValues.mortonUnsignedRightShift_medium_3, testType); + verifyTestValue("mortonUnsignedRightShift_full_3", expectedTestValues.mortonUnsignedRightShift_full_3, testValues.mortonUnsignedRightShift_full_3, testType); + verifyTestValue("mortonUnsignedRightShift_emulated_3", expectedTestValues.mortonUnsignedRightShift_emulated_3, testValues.mortonUnsignedRightShift_emulated_3, testType); + + verifyTestValue("mortonUnsignedRightShift_small_4", expectedTestValues.mortonUnsignedRightShift_small_4, testValues.mortonUnsignedRightShift_small_4, testType); + verifyTestValue("mortonUnsignedRightShift_medium_4", expectedTestValues.mortonUnsignedRightShift_medium_4, testValues.mortonUnsignedRightShift_medium_4, testType); + verifyTestValue("mortonUnsignedRightShift_full_4", expectedTestValues.mortonUnsignedRightShift_full_4, testValues.mortonUnsignedRightShift_full_4, testType); + 
verifyTestValue("mortonUnsignedRightShift_emulated_4", expectedTestValues.mortonUnsignedRightShift_emulated_4, testValues.mortonUnsignedRightShift_emulated_4, testType); + + // Morton signed right-shift + verifyTestValue("mortonSignedRightShift_small_2", expectedTestValues.mortonSignedRightShift_small_2, testValues.mortonSignedRightShift_small_2, testType); + verifyTestValue("mortonSignedRightShift_medium_2", expectedTestValues.mortonSignedRightShift_medium_2, testValues.mortonSignedRightShift_medium_2, testType); + verifyTestValue("mortonSignedRightShift_full_2", expectedTestValues.mortonSignedRightShift_full_2, testValues.mortonSignedRightShift_full_2, testType); + + verifyTestValue("mortonSignedRightShift_small_3", expectedTestValues.mortonSignedRightShift_small_3, testValues.mortonSignedRightShift_small_3, testType); + verifyTestValue("mortonSignedRightShift_medium_3", expectedTestValues.mortonSignedRightShift_medium_3, testValues.mortonSignedRightShift_medium_3, testType); + verifyTestValue("mortonSignedRightShift_full_3", expectedTestValues.mortonSignedRightShift_full_3, testValues.mortonSignedRightShift_full_3, testType); + + verifyTestValue("mortonSignedRightShift_small_4", expectedTestValues.mortonSignedRightShift_small_4, testValues.mortonSignedRightShift_small_4, testType); + verifyTestValue("mortonSignedRightShift_medium_4", expectedTestValues.mortonSignedRightShift_medium_4, testValues.mortonSignedRightShift_medium_4, testType); + verifyTestValue("mortonSignedRightShift_full_4", expectedTestValues.mortonSignedRightShift_full_4, testValues.mortonSignedRightShift_full_4, testType); + } +}; + +#endif \ No newline at end of file diff --git a/12_Mortons/Tester.h b/12_Mortons/ITester.h similarity index 66% rename from 12_Mortons/Tester.h rename to 12_Mortons/ITester.h index 480328d18..2510dd997 100644 --- a/12_Mortons/Tester.h +++ b/12_Mortons/ITester.h @@ -1,5 +1,5 @@ -#ifndef _NBL_EXAMPLES_TESTS_12_MORTONS_TESTER_INCLUDED_ -#define 
_NBL_EXAMPLES_TESTS_12_MORTONS_TESTER_INCLUDED_ +#ifndef _NBL_EXAMPLES_TESTS_22_CPP_COMPAT_I_TESTER_INCLUDED_ +#define _NBL_EXAMPLES_TESTS_22_CPP_COMPAT_I_TESTER_INCLUDED_ #include #include "app_resources/common.hlsl" @@ -8,10 +8,10 @@ using namespace nbl; -class Tester +class ITester { public: - virtual ~Tester() + virtual ~ITester() { m_outputBufferAllocation.memory->unmap(); }; @@ -128,7 +128,7 @@ class Tester if (!inputBuff) logFail("Failed to create a GPU Buffer of size %d!\n", params.size); - inputBuff->setObjectDebugName("morton input buffer"); + inputBuff->setObjectDebugName("emulated_float64_t output buffer"); video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = inputBuff->getMemoryReqs(); reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits(); @@ -163,7 +163,7 @@ class Tester if (!outputBuff) logFail("Failed to create a GPU Buffer of size %d!\n", params.size); - outputBuff->setObjectDebugName("morton output buffer"); + outputBuff->setObjectDebugName("emulated_float64_t output buffer"); video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = outputBuff->getMemoryReqs(); reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits(); @@ -211,29 +211,6 @@ class Tester if (expectedVal == testVal) return; - std::stringstream ss; - switch (testType) - { - case TestType::CPU: - ss << "CPU TEST ERROR:\n"; - case TestType::GPU: - ss << "GPU TEST ERROR:\n"; - } - - ss << "nbl::hlsl::" << memberName << " produced incorrect output!" 
<< '\n'; //test value: " << testVal << " expected value: " << expectedVal << '\n'; - - m_logger->log(ss.str().c_str(), system::ILogger::ELL_ERROR); - } - - template - void verifyTestVector3dValue(const std::string& memberName, const nbl::hlsl::vector& expectedVal, const nbl::hlsl::vector& testVal, const TestType testType) - { - static constexpr float MaxAllowedError = 0.1f; - if (std::abs(double(expectedVal.x) - double(testVal.x)) <= MaxAllowedError && - std::abs(double(expectedVal.y) - double(testVal.y)) <= MaxAllowedError && - std::abs(double(expectedVal.z) - double(testVal.z)) <= MaxAllowedError) - return; - std::stringstream ss; switch (testType) { @@ -244,69 +221,11 @@ class Tester ss << "GPU TEST ERROR:\n"; } - ss << "nbl::hlsl::" << memberName << " produced incorrect output! test value: " << - testVal.x << ' ' << testVal.y << ' ' << testVal.z << - " expected value: " << expectedVal.x << ' ' << expectedVal.y << ' ' << expectedVal.z << '\n'; + ss << "nbl::hlsl::" << memberName << " produced incorrect output!" 
<< '\n'; m_logger->log(ss.str().c_str(), system::ILogger::ELL_ERROR); } - void performTests() - { - std::random_device rd; - std::mt19937 mt(rd()); - - std::uniform_int_distribution shortDistribution(uint16_t(0), std::numeric_limits::max()); - std::uniform_int_distribution intDistribution(uint32_t(0), std::numeric_limits::max()); - std::uniform_int_distribution longDistribution(uint64_t(0), std::numeric_limits::max()); - - m_logger->log("TESTS:", system::ILogger::ELL_PERFORMANCE); - for (int i = 0; i < Iterations; ++i) - { - // Set input thest values that will be used in both CPU and GPU tests - InputTestValues testInput; - // use std library or glm functions to determine expected test values, the output of functions from intrinsics.hlsl will be verified against these values - TestValues expected; - - uint32_t generatedShift = intDistribution(mt) & uint32_t(63); - testInput.shift = generatedShift; - { - uint64_t generatedA = longDistribution(mt); - uint64_t generatedB = longDistribution(mt); - - testInput.generatedA = generatedA; - testInput.generatedB = generatedB; - - expected.emulatedAnd = _static_cast(generatedA & generatedB); - expected.emulatedOr = _static_cast(generatedA | generatedB); - expected.emulatedXor = _static_cast(generatedA ^ generatedB); - expected.emulatedNot = _static_cast(~generatedA); - expected.emulatedPlus = _static_cast(generatedA + generatedB); - expected.emulatedMinus = _static_cast(generatedA - generatedB); - expected.emulatedLess = uint32_t(generatedA < generatedB); - expected.emulatedLessEqual = uint32_t(generatedA <= generatedB); - expected.emulatedGreater = uint32_t(generatedA > generatedB); - expected.emulatedGreaterEqual = uint32_t(generatedA >= generatedB); - - expected.emulatedLeftShifted = _static_cast(generatedA << generatedShift); - expected.emulatedUnsignedRightShifted = _static_cast(generatedA >> generatedShift); - expected.emulatedSignedRightShifted = _static_cast(static_cast(generatedA) >> generatedShift); - } - { - 
uint64_t coordX = longDistribution(mt); - uint64_t coordY = longDistribution(mt); - uint64_t coordZ = longDistribution(mt); - uint64_t coordW = longDistribution(mt); - - - } - - performCpuTests(testInput, expected); - performGpuTests(testInput, expected); - } - m_logger->log("TESTS DONE.", system::ILogger::ELL_PERFORMANCE); - } - protected: uint32_t m_queueFamily; core::smart_refctd_ptr m_device; @@ -324,7 +243,7 @@ class Tester core::smart_refctd_ptr m_semaphore; video::IQueue* m_queue; uint64_t m_semaphoreCounter; - + template OutputStruct dispatch(const InputStruct& input) { @@ -375,42 +294,6 @@ class Tester m_logger->log(msg, system::ILogger::ELL_ERROR, std::forward(args)...); exit(-1); } - - inline static constexpr int Iterations = 100u; - - void performCpuTests(const InputTestValues& commonTestInputValues, const TestValues& expectedTestValues) - { - TestValues cpuTestValues; - cpuTestValues.fillTestValues(commonTestInputValues); - verifyTestValues(expectedTestValues, cpuTestValues, TestType::CPU); - - } - - void performGpuTests(const InputTestValues& commonTestInputValues, const TestValues& expectedTestValues) - { - TestValues gpuTestValues; - gpuTestValues = dispatch(commonTestInputValues); - verifyTestValues(expectedTestValues, gpuTestValues, TestType::GPU); - } - - void verifyTestValues(const TestValues& expectedTestValues, const TestValues& testValues, TestType testType) - { - verifyTestValue("emulatedAnd", expectedTestValues.emulatedAnd, testValues.emulatedAnd, testType); - verifyTestValue("emulatedOr", expectedTestValues.emulatedOr, testValues.emulatedOr, testType); - verifyTestValue("emulatedXor", expectedTestValues.emulatedXor, testValues.emulatedXor, testType); - verifyTestValue("emulatedNot", expectedTestValues.emulatedNot, testValues.emulatedNot, testType); - verifyTestValue("emulatedPlus", expectedTestValues.emulatedPlus, testValues.emulatedPlus, testType); - verifyTestValue("emulatedMinus", expectedTestValues.emulatedMinus, 
testValues.emulatedMinus, testType); - verifyTestValue("emulatedLess", expectedTestValues.emulatedLess, testValues.emulatedLess, testType); - verifyTestValue("emulatedLessEqual", expectedTestValues.emulatedLessEqual, testValues.emulatedLessEqual, testType); - verifyTestValue("emulatedGreater", expectedTestValues.emulatedGreater, testValues.emulatedGreater, testType); - verifyTestValue("emulatedGreaterEqual", expectedTestValues.emulatedGreaterEqual, testValues.emulatedGreaterEqual, testType); - verifyTestValue("emulatedLeftShifted", expectedTestValues.emulatedLeftShifted, testValues.emulatedLeftShifted, testType); - verifyTestValue("emulatedUnsignedRightShifted", expectedTestValues.emulatedUnsignedRightShifted, testValues.emulatedUnsignedRightShifted, testType); - verifyTestValue("emulatedSignedRightShifted", expectedTestValues.emulatedSignedRightShifted, testValues.emulatedSignedRightShifted, testType); - - //verifyTestVector3dValue("normalize", expectedTestValues.normalize, testValues.normalize, testType); - } }; #endif \ No newline at end of file diff --git a/12_Mortons/app_resources/common.hlsl b/12_Mortons/app_resources/common.hlsl index be6a2f4a0..b058ad821 100644 --- a/12_Mortons/app_resources/common.hlsl +++ b/12_Mortons/app_resources/common.hlsl @@ -5,10 +5,6 @@ #ifndef _NBL_EXAMPLES_TESTS_12_MORTON_COMMON_INCLUDED_ #define _NBL_EXAMPLES_TESTS_12_MORTON_COMMON_INCLUDED_ -// because DXC doesn't properly support `_Static_assert` -// TODO: add a message, and move to macros.h or cpp_compat -#define STATIC_ASSERT(...) 
{ nbl::hlsl::conditional<__VA_ARGS__, int, void>::type a = 0; } - #include #include @@ -23,6 +19,22 @@ NBL_CONSTEXPR uint16_t smallBits_4 = 4; NBL_CONSTEXPR uint16_t mediumBits_4 = 8; NBL_CONSTEXPR uint16_t fullBits_4 = 16; +#ifndef __HLSL_VERSION + +constexpr uint64_t smallBitsMask_2 = (uint64_t(1) << smallBits_2) - 1; +constexpr uint64_t mediumBitsMask_2 = (uint64_t(1) << mediumBits_2) - 1; +constexpr uint64_t fullBitsMask_2 = (uint64_t(1) << fullBits_2) - 1; + +constexpr uint64_t smallBitsMask_3 = (uint64_t(1) << smallBits_3) - 1; +constexpr uint64_t mediumBitsMask_3 = (uint64_t(1) << mediumBits_3) - 1; +constexpr uint64_t fullBitsMask_3 = (uint64_t(1) << fullBits_3) - 1; + +constexpr uint64_t smallBitsMask_4 = (uint64_t(1) << smallBits_4) - 1; +constexpr uint64_t mediumBitsMask_4 = (uint64_t(1) << mediumBits_4) - 1; +constexpr uint64_t fullBitsMask_4 = (uint64_t(1) << fullBits_4) - 1; + +#endif + using namespace nbl::hlsl; struct InputTestValues { @@ -190,33 +202,9 @@ struct TestValues morton::code mortonSignedRightShift_full_4; morton::code mortonSignedRightShift_emulated_4; - void fillTestValues(NBL_CONST_REF_ARG(InputTestValues) input) + /* + void fillSecondTestValues(NBL_CONST_REF_ARG(InputTestValues) input) { - emulated_uint64_t emulatedA = _static_cast(input.generatedA); - emulated_uint64_t emulatedB = _static_cast(input.generatedB); - - // Emulated int tests - emulatedAnd = emulatedA & emulatedB; - emulatedOr = emulatedA | emulatedB; - emulatedXor = emulatedA ^ emulatedB; - emulatedNot = emulatedA.operator~(); - emulatedPlus = emulatedA + emulatedB; - emulatedMinus = emulatedA - emulatedB; - emulatedLess = uint32_t(emulatedA < emulatedB); - emulatedLessEqual = uint32_t(emulatedA <= emulatedB); - emulatedGreater = uint32_t(emulatedA > emulatedB); - emulatedGreaterEqual = uint32_t(emulatedA >= emulatedB); - - left_shift_operator leftShift; - emulatedLeftShifted = leftShift(emulatedA, input.shift); - - arithmetic_right_shift_operator unsignedRightShift; - 
emulatedUnsignedRightShifted = unsignedRightShift(emulatedA, input.shift); - - arithmetic_right_shift_operator signedRightShift; - emulatedSignedRightShifted = signedRightShift(_static_cast(emulatedA), input.shift); - - // Morton tests uint64_t2 Vec2A = { input.coordX, input.coordY }; uint64_t2 Vec2B = { input.coordZ, input.coordW }; @@ -235,250 +223,29 @@ struct TestValues int64_t4 Vec4ASigned = int64_t4(Vec4A); int64_t4 Vec4BSigned = int64_t4(Vec4B); - morton::code morton_small_2A = morton::code::create(Vec2A); - morton::code morton_medium_2A = morton::code::create(Vec2A); - morton::code morton_full_2A = morton::code::create(Vec2A); - morton::code morton_emulated_2A = morton::code::create(Vec2A); - morton::code morton_small_2B = morton::code::create(Vec2B); - morton::code morton_medium_2B = morton::code::create(Vec2B); - morton::code morton_full_2B = morton::code::create(Vec2B); - morton::code morton_emulated_2B = morton::code::create(Vec2B); - - morton::code morton_small_3A = morton::code::create(Vec3A); - morton::code morton_medium_3A = morton::code::create(Vec3A); - morton::code morton_full_3A = morton::code::create(Vec3A); - morton::code morton_emulated_3A = morton::code::create(Vec3A); - morton::code morton_small_3B = morton::code::create(Vec3B); - morton::code morton_medium_3B = morton::code::create(Vec3B); - morton::code morton_full_3B = morton::code::create(Vec3B); - morton::code morton_emulated_3B = morton::code::create(Vec3B); - - morton::code morton_small_4A = morton::code::create(Vec4A); - morton::code morton_medium_4A = morton::code::create(Vec4A); - morton::code morton_full_4A = morton::code::create(Vec4A); morton::code morton_emulated_4A = morton::code::create(Vec4A); - morton::code morton_small_4B = morton::code::create(Vec4B); - morton::code morton_medium_4B = morton::code::create(Vec4B); - morton::code morton_full_4B = morton::code::create(Vec4B); - morton::code morton_emulated_4B = morton::code::create(Vec4B); - - morton::code 
morton_small_2ASigned = morton::code::create(Vec2ASigned); - morton::code morton_medium_2ASigned = morton::code::create(Vec2ASigned); - morton::code morton_full_2ASigned = morton::code::create(Vec2ASigned); - morton::code morton_emulated_2ASigned = morton::code::create(Vec2ASigned); - morton::code morton_small_2BSigned = morton::code::create(Vec2BSigned); - morton::code morton_medium_2BSigned = morton::code::create(Vec2BSigned); - morton::code morton_full_2BSigned = morton::code::create(Vec2BSigned); - morton::code morton_emulated_2BSigned = morton::code::create(Vec2BSigned); - - morton::code morton_small_3ASigned = morton::code::create(Vec3ASigned); - morton::code morton_medium_3ASigned = morton::code::create(Vec3ASigned); - morton::code morton_full_3ASigned = morton::code::create(Vec3ASigned); - morton::code morton_emulated_3ASigned = morton::code::create(Vec3ASigned); - morton::code morton_small_3BSigned = morton::code::create(Vec3BSigned); - morton::code morton_medium_3BSigned = morton::code::create(Vec3BSigned); - morton::code morton_full_3BSigned = morton::code::create(Vec3BSigned); - morton::code morton_emulated_3BSigned = morton::code::create(Vec3BSigned); - - morton::code morton_small_4ASigned = morton::code::create(Vec4ASigned); - morton::code morton_medium_4ASigned = morton::code::create(Vec4ASigned); - morton::code morton_full_4ASigned = morton::code::create(Vec4ASigned); - morton::code morton_emulated_4ASigned = morton::code::create(Vec4ASigned); - morton::code morton_small_4BSigned = morton::code::create(Vec4BSigned); - morton::code morton_medium_4BSigned = morton::code::create(Vec4BSigned); - morton::code morton_full_4BSigned = morton::code::create(Vec4BSigned); - morton::code morton_emulated_4BSigned = morton::code::create(Vec4BSigned); - - /* - left_shift_operator > leftShiftTemp; - portable_vector_t interleaved = _static_cast >(uint16_t4(Vec4B)) & morton::impl::coding_mask_v<4, fullBits_4, morton::impl::CodingStages, emulated_uint64_t>; - - 
#define ENCODE_LOOP_ITERATION(I) NBL_IF_CONSTEXPR(fullBits_4 > (uint16_t(1) << I))\ - {\ - interleaved = interleaved | leftShiftTemp(interleaved, (uint16_t(1) << I) * (4 - 1));\ - interleaved = interleaved & _static_cast(morton::impl::coding_mask<4, fullBits_4, I>::value);\ - } - - ENCODE_LOOP_ITERATION(4) - ENCODE_LOOP_ITERATION(3) - ENCODE_LOOP_ITERATION(2) - ENCODE_LOOP_ITERATION(1) - ENCODE_LOOP_ITERATION(0) - - #undef ENCODE_LOOP_ITERATION - // After interleaving, shift each coordinate left by their index - return leftShiftTemp(interleaved, truncate >(vector(0, 1, 2, 3))); - - - array_get, emulated_uint64_t> getter; - emulatedAnd = getter(interleaved, 0); - */ - - // Plus - mortonPlus_small_2 = morton_small_2A + morton_small_2B; - mortonPlus_medium_2 = morton_medium_2A + morton_medium_2B; - mortonPlus_full_2 = morton_full_2A + morton_full_2B; - mortonPlus_emulated_2 = morton_emulated_2A + morton_emulated_2B; - - mortonPlus_small_3 = morton_small_3A + morton_small_3B; - mortonPlus_medium_3 = morton_medium_3A + morton_medium_3B; - mortonPlus_full_3 = morton_full_3A + morton_full_3B; - mortonPlus_emulated_3 = morton_emulated_3A + morton_emulated_3B; - - mortonPlus_small_4 = morton_small_4A + morton_small_4B; - mortonPlus_medium_4 = morton_medium_4A + morton_medium_4B; - mortonPlus_full_4 = morton_full_4A + morton_full_4B; - mortonPlus_emulated_4 = morton_emulated_4A + morton_emulated_4B; - - // Minus - mortonMinus_small_2 = morton_small_2A - morton_small_2B; - mortonMinus_medium_2 = morton_medium_2A - morton_medium_2B; - mortonMinus_full_2 = morton_full_2A - morton_full_2B; - mortonMinus_emulated_2 = morton_emulated_2A - morton_emulated_2B; - - mortonMinus_small_3 = morton_small_3A - morton_small_3B; - mortonMinus_medium_3 = morton_medium_3A - morton_medium_3B; - mortonMinus_full_3 = morton_full_3A - morton_full_3B; - mortonMinus_emulated_3 = morton_emulated_3A - morton_emulated_3B; - - mortonMinus_small_4 = morton_small_4A - morton_small_4B; - 
mortonMinus_medium_4 = morton_medium_4A - morton_medium_4B; - mortonMinus_full_4 = morton_full_4A - morton_full_4B; - mortonMinus_emulated_4 = morton_emulated_4A - morton_emulated_4B; - - // Coordinate-wise equality - mortonEqual_small_2 = uint32_t2(morton_small_2A.equal(uint16_t2(Vec2B))); - mortonEqual_medium_2 = uint32_t2(morton_medium_2A.equal(uint16_t2(Vec2B))); - mortonEqual_full_2 = uint32_t2(morton_full_2A.equal(uint32_t2(Vec2B))); - mortonEqual_emulated_2 = uint32_t2(morton_emulated_2A.equal(uint32_t2(Vec2B))); - - mortonEqual_small_3 = uint32_t3(morton_small_3A.equal(uint16_t3(Vec3B))); - mortonEqual_medium_3 = uint32_t3(morton_medium_3A.equal(uint16_t3(Vec3B))); - mortonEqual_full_3 = uint32_t3(morton_full_3A.equal(uint32_t3(Vec3B))); - mortonEqual_emulated_3 = uint32_t3(morton_emulated_3A.equal(uint32_t3(Vec3B))); - - mortonEqual_small_4 = uint32_t4(morton_small_4A.equal(uint16_t4(Vec4B))); - mortonEqual_medium_4 = uint32_t4(morton_medium_4A.equal(uint16_t4(Vec4B))); - mortonEqual_full_4 = uint32_t4(morton_full_4A.equal(uint16_t4(Vec4B))); - mortonEqual_emulated_4 = uint32_t4(morton_emulated_4A.equal(uint16_t4(Vec4B))); - - // Coordinate-wise unsigned inequality (just testing with less) - mortonUnsignedLess_small_2 = uint32_t2(morton_small_2A.lessThan(uint16_t2(Vec2B))); - mortonUnsignedLess_medium_2 = uint32_t2(morton_medium_2A.lessThan(uint16_t2(Vec2B))); - mortonUnsignedLess_full_2 = uint32_t2(morton_full_2A.lessThan(uint32_t2(Vec2B))); - mortonUnsignedLess_emulated_2 = uint32_t2(morton_emulated_2A.lessThan(uint32_t2(Vec2B))); - - mortonUnsignedLess_small_3 = uint32_t3(morton_small_3A.lessThan(uint16_t3(Vec3B))); - mortonUnsignedLess_medium_3 = uint32_t3(morton_medium_3A.lessThan(uint16_t3(Vec3B))); - mortonUnsignedLess_full_3 = uint32_t3(morton_full_3A.lessThan(uint32_t3(Vec3B))); - mortonUnsignedLess_emulated_3 = uint32_t3(morton_emulated_3A.lessThan(uint32_t3(Vec3B))); - - mortonUnsignedLess_small_4 = 
uint32_t4(morton_small_4A.lessThan(uint16_t4(Vec4B))); - mortonUnsignedLess_medium_4 = uint32_t4(morton_medium_4A.lessThan(uint16_t4(Vec4B))); - mortonUnsignedLess_full_4 = uint32_t4(morton_full_4A.lessThan(uint16_t4(Vec4B))); - mortonUnsignedLess_emulated_4 = uint32_t4(morton_emulated_4A.lessThan(uint16_t4(Vec4B))); + morton::code morton_emulated_2_signed = morton::code::create(Vec2ASigned); + morton::code morton_emulated_3_signed = morton::code::create(Vec3ASigned); + morton::code morton_emulated_4_signed = morton::code::create(Vec4ASigned); + + output.mortonEqual_emulated_4 = uint32_t4(morton_emulated_4A.equal(uint16_t4(Vec4B))); - // Coordinate-wise signed inequality - mortonSignedLess_small_2 = uint32_t2(morton_small_2ASigned.lessThan(int16_t2(Vec2BSigned))); - mortonSignedLess_medium_2 = uint32_t2(morton_medium_2ASigned.lessThan(int16_t2(Vec2BSigned))); - mortonSignedLess_full_2 = uint32_t2(morton_full_2ASigned.lessThan(int32_t2(Vec2BSigned))); - //mortonSignedLess_emulated_2 = uint32_t2(morton_emulated_2ASigned.lessThan(int32_t2(Vec2BSigned))); - - mortonSignedLess_small_3 = uint32_t3(morton_small_3ASigned.lessThan(int16_t3(Vec3BSigned))); - mortonSignedLess_medium_3 = uint32_t3(morton_medium_3ASigned.lessThan(int16_t3(Vec3BSigned))); - mortonSignedLess_full_3 = uint32_t3(morton_full_3ASigned.lessThan(int32_t3(Vec3BSigned))); - //mortonSignedLess_emulated_3 = uint32_t3(morton_emulated_3ASigned.lessThan(int32_t3(Vec3BSigned))); - - mortonSignedLess_small_4 = uint32_t4(morton_small_4ASigned.lessThan(int16_t4(Vec4BSigned))); - mortonSignedLess_medium_4 = uint32_t4(morton_medium_4ASigned.lessThan(int16_t4(Vec4BSigned))); - mortonSignedLess_full_4 = uint32_t4(morton_full_4ASigned.lessThan(int16_t4(Vec4BSigned))); - //mortonSignedLess_emulated_4 = uint32_t4(morton_emulated_4ASigned.lessThan(int16_t4(Vec4BSigned))); + output.mortonUnsignedLess_emulated_4 = uint32_t4(morton_emulated_4A.lessThan(uint16_t4(Vec4B))); - // Left-shift + mortonSignedLess_emulated_2 = 
uint32_t2(morton_emulated_2_signed.lessThan(int32_t2(Vec2BSigned))); + mortonSignedLess_emulated_3 = uint32_t3(morton_emulated_3_signed.lessThan(int32_t3(Vec3BSigned))); + mortonSignedLess_emulated_4 = uint32_t4(morton_emulated_4_signed.lessThan(int16_t4(Vec4BSigned))); + uint16_t castedShift = uint16_t(input.shift); - left_shift_operator > leftShiftSmall2; - mortonLeftShift_small_2 = leftShiftSmall2(morton_small_2A, castedShift); - left_shift_operator > leftShiftMedium2; - mortonLeftShift_medium_2 = leftShiftMedium2(morton_medium_2A, castedShift); - left_shift_operator > leftShiftFull2; - mortonLeftShift_full_2 = leftShiftFull2(morton_full_2A, castedShift); - left_shift_operator > leftShiftEmulated2; - mortonLeftShift_emulated_2 = leftShiftEmulated2(morton_emulated_2A, castedShift); - - left_shift_operator > leftShiftSmall3; - mortonLeftShift_small_3 = leftShiftSmall3(morton_small_3A, castedShift); - left_shift_operator > leftShiftMedium3; - mortonLeftShift_medium_3 = leftShiftMedium3(morton_medium_3A, castedShift); - left_shift_operator > leftShiftFull3; - mortonLeftShift_full_3 = leftShiftFull3(morton_full_3A, castedShift); - left_shift_operator > leftShiftEmulated3; - mortonLeftShift_emulated_3 = leftShiftEmulated3(morton_emulated_3A, castedShift); - - left_shift_operator > leftShiftSmall4; - mortonLeftShift_small_4 = leftShiftSmall4(morton_small_4A, castedShift); - left_shift_operator > leftShiftMedium4; - mortonLeftShift_medium_4 = leftShiftMedium4(morton_medium_4A, castedShift); - left_shift_operator > leftShiftFull4; - mortonLeftShift_full_4 = leftShiftFull4(morton_full_4A, castedShift); - left_shift_operator > leftShiftEmulated4; - mortonLeftShift_emulated_4 = leftShiftEmulated4(morton_emulated_4A, castedShift); - - // Unsigned right-shift - arithmetic_right_shift_operator > rightShiftSmall2; - mortonUnsignedRightShift_small_2 = rightShiftSmall2(morton_small_2A, castedShift); - arithmetic_right_shift_operator > rightShiftMedium2; - 
mortonUnsignedRightShift_medium_2 = rightShiftMedium2(morton_medium_2A, castedShift); - arithmetic_right_shift_operator > rightShiftFull2; - mortonUnsignedRightShift_full_2 = rightShiftFull2(morton_full_2A, castedShift); - arithmetic_right_shift_operator > rightShiftEmulated2; - mortonUnsignedRightShift_emulated_2 = rightShiftEmulated2(morton_emulated_2A, castedShift); - - arithmetic_right_shift_operator > rightShiftSmall3; - mortonUnsignedRightShift_small_3 = rightShiftSmall3(morton_small_3A, castedShift); - arithmetic_right_shift_operator > rightShiftMedium3; - mortonUnsignedRightShift_medium_3 = rightShiftMedium3(morton_medium_3A, castedShift); - arithmetic_right_shift_operator > rightShiftFull3; - mortonUnsignedRightShift_full_3 = rightShiftFull3(morton_full_3A, castedShift); - arithmetic_right_shift_operator > rightShiftEmulated3; - mortonUnsignedRightShift_emulated_3 = rightShiftEmulated3(morton_emulated_3A, castedShift); - - arithmetic_right_shift_operator > rightShiftSmall4; - mortonUnsignedRightShift_small_4 = rightShiftSmall4(morton_small_4A, castedShift); - arithmetic_right_shift_operator > rightShiftMedium4; - mortonUnsignedRightShift_medium_4 = rightShiftMedium4(morton_medium_4A, castedShift); - arithmetic_right_shift_operator > rightShiftFull4; - mortonUnsignedRightShift_full_4 = rightShiftFull4(morton_full_4A, castedShift); - arithmetic_right_shift_operator > rightShiftEmulated4; - mortonUnsignedRightShift_emulated_4 = rightShiftEmulated4(morton_emulated_4A, castedShift); - - // Signed right-shift - arithmetic_right_shift_operator > rightShiftSignedSmall2; - mortonSignedRightShift_small_2 = rightShiftSignedSmall2(morton_small_2ASigned, castedShift); - arithmetic_right_shift_operator > rightShiftSignedMedium2; - mortonSignedRightShift_medium_2 = rightShiftSignedMedium2(morton_medium_2ASigned, castedShift); - arithmetic_right_shift_operator > rightShiftSignedFull2; - mortonSignedRightShift_full_2 = rightShiftSignedFull2(morton_full_2ASigned, 
castedShift); + arithmetic_right_shift_operator > rightShiftSignedEmulated2; - //mortonSignedRightShift_emulated_2 = rightShiftSignedEmulated2(morton_emulated_2ASigned, castedShift); - - arithmetic_right_shift_operator > rightShiftSignedSmall3; - mortonSignedRightShift_small_3 = rightShiftSignedSmall3(morton_small_3ASigned, castedShift); - arithmetic_right_shift_operator > rightShiftSignedMedium3; - mortonSignedRightShift_medium_3 = rightShiftSignedMedium3(morton_medium_3ASigned, castedShift); - arithmetic_right_shift_operator > rightShiftSignedFull3; - mortonSignedRightShift_full_3 = rightShiftSignedFull3(morton_full_3ASigned, castedShift); + mortonSignedRightShift_emulated_2 = rightShiftSignedEmulated2(morton_emulated_2_signed, castedShift); arithmetic_right_shift_operator > rightShiftSignedEmulated3; - //mortonSignedRightShift_emulated_3 = rightShiftSignedEmulated3(morton_emulated_3ASigned, castedShift); - - arithmetic_right_shift_operator > rightShiftSignedSmall4; - mortonSignedRightShift_small_4 = rightShiftSignedSmall4(morton_small_4ASigned, castedShift); - arithmetic_right_shift_operator > rightShiftSignedMedium4; - mortonSignedRightShift_medium_4 = rightShiftSignedMedium4(morton_medium_4ASigned, castedShift); - arithmetic_right_shift_operator > rightShiftSignedFull4; - mortonSignedRightShift_full_4 = rightShiftSignedFull4(morton_full_4ASigned, castedShift); + mortonSignedRightShift_emulated_3 = rightShiftSignedEmulated3(morton_emulated_3_signed, castedShift); arithmetic_right_shift_operator > rightShiftSignedEmulated4; - //mortonSignedRightShift_emulated_4 = rightShiftSignedEmulated4(morton_emulated_4ASigned, castedShift); + mortonSignedRightShift_emulated_4 = rightShiftSignedEmulated4(morton_emulated_4_signed, castedShift); } + */ }; #endif diff --git a/12_Mortons/app_resources/mortonTest.comp.hlsl b/12_Mortons/app_resources/test.comp.hlsl similarity index 79% rename from 12_Mortons/app_resources/mortonTest.comp.hlsl rename to 
12_Mortons/app_resources/test.comp.hlsl index 7041568b8..243983d5a 100644 --- a/12_Mortons/app_resources/mortonTest.comp.hlsl +++ b/12_Mortons/app_resources/test.comp.hlsl @@ -1,9 +1,8 @@ //// Copyright (C) 2023-2024 - DevSH Graphics Programming Sp. z O.O. //// This file is part of the "Nabla Engine". //// For conditions of distribution and use, see copyright notice in nabla.h -#pragma shader_stage(compute) -#include "common.hlsl" +#include "testCommon.hlsl" [[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; @@ -12,5 +11,5 @@ void main(uint3 invocationID : SV_DispatchThreadID) { if (invocationID.x == 0) - outputTestValues[0].fillTestValues(inputTestValues[0]); + fillTestValues(inputTestValues[0], outputTestValues[0]); } diff --git a/12_Mortons/app_resources/testCommon.hlsl b/12_Mortons/app_resources/testCommon.hlsl new file mode 100644 index 000000000..9ff9a4fa8 --- /dev/null +++ b/12_Mortons/app_resources/testCommon.hlsl @@ -0,0 +1,242 @@ +#include "common.hlsl" + +void fillTestValues(NBL_CONST_REF_ARG(InputTestValues) input, NBL_REF_ARG(TestValues) output) +{ + emulated_uint64_t emulatedA = _static_cast(input.generatedA); + emulated_uint64_t emulatedB = _static_cast(input.generatedB); + + // Emulated int tests + output.emulatedAnd = emulatedA & emulatedB; + output.emulatedOr = emulatedA | emulatedB; + output.emulatedXor = emulatedA ^ emulatedB; + output.emulatedNot = emulatedA.operator~(); + output.emulatedPlus = emulatedA + emulatedB; + output.emulatedMinus = emulatedA - emulatedB; + output.emulatedLess = uint32_t(emulatedA < emulatedB); + output.emulatedLessEqual = uint32_t(emulatedA <= emulatedB); + output.emulatedGreater = uint32_t(emulatedA > emulatedB); + output.emulatedGreaterEqual = uint32_t(emulatedA >= emulatedB); + + left_shift_operator leftShift; + output.emulatedLeftShifted = leftShift(emulatedA, input.shift); + + arithmetic_right_shift_operator unsignedRightShift; + 
output.emulatedUnsignedRightShifted = unsignedRightShift(emulatedA, input.shift); + + arithmetic_right_shift_operator signedRightShift; + output.emulatedSignedRightShifted = signedRightShift(_static_cast(emulatedA), input.shift); + + // Morton tests + uint64_t2 Vec2A = { input.coordX, input.coordY }; + uint64_t2 Vec2B = { input.coordZ, input.coordW }; + + uint64_t3 Vec3A = { input.coordX, input.coordY, input.coordZ }; + uint64_t3 Vec3B = { input.coordY, input.coordZ, input.coordW }; + + uint64_t4 Vec4A = { input.coordX, input.coordY, input.coordZ, input.coordW }; + uint64_t4 Vec4B = { input.coordY, input.coordZ, input.coordW, input.coordX }; + + int64_t2 Vec2ASigned = int64_t2(Vec2A); + int64_t2 Vec2BSigned = int64_t2(Vec2B); + + int64_t3 Vec3ASigned = int64_t3(Vec3A); + int64_t3 Vec3BSigned = int64_t3(Vec3B); + + int64_t4 Vec4ASigned = int64_t4(Vec4A); + int64_t4 Vec4BSigned = int64_t4(Vec4B); + + morton::code morton_small_2A = morton::code::create(Vec2A); + morton::code morton_medium_2A = morton::code::create(Vec2A); + morton::code morton_full_2A = morton::code::create(Vec2A); + morton::code morton_emulated_2A = morton::code::create(Vec2A); + morton::code morton_small_2B = morton::code::create(Vec2B); + morton::code morton_medium_2B = morton::code::create(Vec2B); + morton::code morton_full_2B = morton::code::create(Vec2B); + morton::code morton_emulated_2B = morton::code::create(Vec2B); + + morton::code morton_small_3A = morton::code::create(Vec3A); + morton::code morton_medium_3A = morton::code::create(Vec3A); + morton::code morton_full_3A = morton::code::create(Vec3A); + morton::code morton_emulated_3A = morton::code::create(Vec3A); + morton::code morton_small_3B = morton::code::create(Vec3B); + morton::code morton_medium_3B = morton::code::create(Vec3B); + morton::code morton_full_3B = morton::code::create(Vec3B); + morton::code morton_emulated_3B = morton::code::create(Vec3B); + + morton::code morton_small_4A = morton::code::create(Vec4A); + morton::code 
morton_medium_4A = morton::code::create(Vec4A); + morton::code morton_full_4A = morton::code::create(Vec4A); + morton::code morton_emulated_4A = morton::code::create(Vec4A); + morton::code morton_small_4B = morton::code::create(Vec4B); + morton::code morton_medium_4B = morton::code::create(Vec4B); + morton::code morton_full_4B = morton::code::create(Vec4B); + morton::code morton_emulated_4B = morton::code::create(Vec4B); + + morton::code morton_small_2_signed = morton::code::create(Vec2ASigned); + morton::code morton_medium_2_signed = morton::code::create(Vec2ASigned); + morton::code morton_full_2_signed = morton::code::create(Vec2ASigned); + + morton::code morton_small_3_signed = morton::code::create(Vec3ASigned); + morton::code morton_medium_3_signed = morton::code::create(Vec3ASigned); + morton::code morton_full_3_signed = morton::code::create(Vec3ASigned); + + morton::code morton_small_4_signed = morton::code::create(Vec4ASigned); + morton::code morton_medium_4_signed = morton::code::create(Vec4ASigned); + morton::code morton_full_4_signed = morton::code::create(Vec4ASigned); + + // Plus + output.mortonPlus_small_2 = morton_small_2A + morton_small_2B; + output.mortonPlus_medium_2 = morton_medium_2A + morton_medium_2B; + output.mortonPlus_full_2 = morton_full_2A + morton_full_2B; + output.mortonPlus_emulated_2 = morton_emulated_2A + morton_emulated_2B; + + output.mortonPlus_small_3 = morton_small_3A + morton_small_3B; + output.mortonPlus_medium_3 = morton_medium_3A + morton_medium_3B; + output.mortonPlus_full_3 = morton_full_3A + morton_full_3B; + output.mortonPlus_emulated_3 = morton_emulated_3A + morton_emulated_3B; + + output.mortonPlus_small_4 = morton_small_4A + morton_small_4B; + output.mortonPlus_medium_4 = morton_medium_4A + morton_medium_4B; + output.mortonPlus_full_4 = morton_full_4A + morton_full_4B; + output.mortonPlus_emulated_4 = morton_emulated_4A + morton_emulated_4B; + + // Minus + output.mortonMinus_small_2 = morton_small_2A - morton_small_2B; 
+ output.mortonMinus_medium_2 = morton_medium_2A - morton_medium_2B; + output.mortonMinus_full_2 = morton_full_2A - morton_full_2B; + output.mortonMinus_emulated_2 = morton_emulated_2A - morton_emulated_2B; + + output.mortonMinus_small_3 = morton_small_3A - morton_small_3B; + output.mortonMinus_medium_3 = morton_medium_3A - morton_medium_3B; + output.mortonMinus_full_3 = morton_full_3A - morton_full_3B; + output.mortonMinus_emulated_3 = morton_emulated_3A - morton_emulated_3B; + + output.mortonMinus_small_4 = morton_small_4A - morton_small_4B; + output.mortonMinus_medium_4 = morton_medium_4A - morton_medium_4B; + output.mortonMinus_full_4 = morton_full_4A - morton_full_4B; + output.mortonMinus_emulated_4 = morton_emulated_4A - morton_emulated_4B; + + // Coordinate-wise equality + output.mortonEqual_small_2 = uint32_t2(morton_small_2A.equal(uint16_t2(Vec2B))); + output.mortonEqual_medium_2 = uint32_t2(morton_medium_2A.equal(uint16_t2(Vec2B))); + output.mortonEqual_full_2 = uint32_t2(morton_full_2A.equal(uint32_t2(Vec2B))); + output.mortonEqual_emulated_2 = uint32_t2(morton_emulated_2A.equal(uint32_t2(Vec2B))); + + output.mortonEqual_small_3 = uint32_t3(morton_small_3A.equal(uint16_t3(Vec3B))); + output.mortonEqual_medium_3 = uint32_t3(morton_medium_3A.equal(uint16_t3(Vec3B))); + output.mortonEqual_full_3 = uint32_t3(morton_full_3A.equal(uint32_t3(Vec3B))); + output.mortonEqual_emulated_3 = uint32_t3(morton_emulated_3A.equal(uint32_t3(Vec3B))); + + output.mortonEqual_small_4 = uint32_t4(morton_small_4A.equal(uint16_t4(Vec4B))); + output.mortonEqual_medium_4 = uint32_t4(morton_medium_4A.equal(uint16_t4(Vec4B))); + output.mortonEqual_full_4 = uint32_t4(morton_full_4A.equal(uint16_t4(Vec4B))); + + // Coordinate-wise unsigned inequality (just testing with less) + output.mortonUnsignedLess_small_2 = uint32_t2(morton_small_2A.lessThan(uint16_t2(Vec2B))); + output.mortonUnsignedLess_medium_2 = uint32_t2(morton_medium_2A.lessThan(uint16_t2(Vec2B))); + 
output.mortonUnsignedLess_full_2 = uint32_t2(morton_full_2A.lessThan(uint32_t2(Vec2B))); + output.mortonUnsignedLess_emulated_2 = uint32_t2(morton_emulated_2A.lessThan(uint32_t2(Vec2B))); + + output.mortonUnsignedLess_small_3 = uint32_t3(morton_small_3A.lessThan(uint16_t3(Vec3B))); + output.mortonUnsignedLess_medium_3 = uint32_t3(morton_medium_3A.lessThan(uint16_t3(Vec3B))); + output.mortonUnsignedLess_full_3 = uint32_t3(morton_full_3A.lessThan(uint32_t3(Vec3B))); + output.mortonUnsignedLess_emulated_3 = uint32_t3(morton_emulated_3A.lessThan(uint32_t3(Vec3B))); + + output.mortonUnsignedLess_small_4 = uint32_t4(morton_small_4A.lessThan(uint16_t4(Vec4B))); + output.mortonUnsignedLess_medium_4 = uint32_t4(morton_medium_4A.lessThan(uint16_t4(Vec4B))); + output.mortonUnsignedLess_full_4 = uint32_t4(morton_full_4A.lessThan(uint16_t4(Vec4B))); + + // Coordinate-wise signed inequality + output.mortonSignedLess_small_2 = uint32_t2(morton_small_2_signed.lessThan(int16_t2(Vec2BSigned))); + output.mortonSignedLess_medium_2 = uint32_t2(morton_medium_2_signed.lessThan(int16_t2(Vec2BSigned))); + output.mortonSignedLess_full_2 = uint32_t2(morton_full_2_signed.lessThan(int32_t2(Vec2BSigned))); + + output.mortonSignedLess_small_3 = uint32_t3(morton_small_3_signed.lessThan(int16_t3(Vec3BSigned))); + output.mortonSignedLess_medium_3 = uint32_t3(morton_medium_3_signed.lessThan(int16_t3(Vec3BSigned))); + output.mortonSignedLess_full_3 = uint32_t3(morton_full_3_signed.lessThan(int32_t3(Vec3BSigned))); + + output.mortonSignedLess_small_4 = uint32_t4(morton_small_4_signed.lessThan(int16_t4(Vec4BSigned))); + output.mortonSignedLess_medium_4 = uint32_t4(morton_medium_4_signed.lessThan(int16_t4(Vec4BSigned))); + output.mortonSignedLess_full_4 = uint32_t4(morton_full_4_signed.lessThan(int16_t4(Vec4BSigned))); + + // Cast to uint16_t which is what left shift for Mortons expect + uint16_t castedShift = uint16_t(input.shift); + // Each left shift clamps to correct bits so the result kinda makes 
sense + // Left-shift + left_shift_operator > leftShiftSmall2; + output.mortonLeftShift_small_2 = leftShiftSmall2(morton_small_2A, castedShift % smallBits_2); + left_shift_operator > leftShiftMedium2; + output.mortonLeftShift_medium_2 = leftShiftMedium2(morton_medium_2A, castedShift % mediumBits_2); + left_shift_operator > leftShiftFull2; + output.mortonLeftShift_full_2 = leftShiftFull2(morton_full_2A, castedShift % fullBits_2); + left_shift_operator > leftShiftEmulated2; + output.mortonLeftShift_emulated_2 = leftShiftEmulated2(morton_emulated_2A, castedShift % fullBits_2); + + left_shift_operator > leftShiftSmall3; + output.mortonLeftShift_small_3 = leftShiftSmall3(morton_small_3A, castedShift % smallBits_3); + left_shift_operator > leftShiftMedium3; + output.mortonLeftShift_medium_3 = leftShiftMedium3(morton_medium_3A, castedShift % mediumBits_3); + left_shift_operator > leftShiftFull3; + output.mortonLeftShift_full_3 = leftShiftFull3(morton_full_3A, castedShift % fullBits_3); + left_shift_operator > leftShiftEmulated3; + output.mortonLeftShift_emulated_3 = leftShiftEmulated3(morton_emulated_3A, castedShift % fullBits_3); + + left_shift_operator > leftShiftSmall4; + output.mortonLeftShift_small_4 = leftShiftSmall4(morton_small_4A, castedShift % smallBits_4); + left_shift_operator > leftShiftMedium4; + output.mortonLeftShift_medium_4 = leftShiftMedium4(morton_medium_4A, castedShift % mediumBits_4); + left_shift_operator > leftShiftFull4; + output.mortonLeftShift_full_4 = leftShiftFull4(morton_full_4A, castedShift % fullBits_4); + left_shift_operator > leftShiftEmulated4; + output.mortonLeftShift_emulated_4 = leftShiftEmulated4(morton_emulated_4A, castedShift % fullBits_4); + + // Unsigned right-shift + arithmetic_right_shift_operator > rightShiftSmall2; + output.mortonUnsignedRightShift_small_2 = rightShiftSmall2(morton_small_2A, castedShift % smallBits_2); + arithmetic_right_shift_operator > rightShiftMedium2; + output.mortonUnsignedRightShift_medium_2 = 
rightShiftMedium2(morton_medium_2A, castedShift % mediumBits_2); + arithmetic_right_shift_operator > rightShiftFull2; + output.mortonUnsignedRightShift_full_2 = rightShiftFull2(morton_full_2A, castedShift % fullBits_2); + arithmetic_right_shift_operator > rightShiftEmulated2; + output.mortonUnsignedRightShift_emulated_2 = rightShiftEmulated2(morton_emulated_2A, castedShift % fullBits_2); + + arithmetic_right_shift_operator > rightShiftSmall3; + output.mortonUnsignedRightShift_small_3 = rightShiftSmall3(morton_small_3A, castedShift % smallBits_3); + arithmetic_right_shift_operator > rightShiftMedium3; + output.mortonUnsignedRightShift_medium_3 = rightShiftMedium3(morton_medium_3A, castedShift % mediumBits_3); + arithmetic_right_shift_operator > rightShiftFull3; + output.mortonUnsignedRightShift_full_3 = rightShiftFull3(morton_full_3A, castedShift % fullBits_3); + arithmetic_right_shift_operator > rightShiftEmulated3; + output.mortonUnsignedRightShift_emulated_3 = rightShiftEmulated3(morton_emulated_3A, castedShift % fullBits_3); + + arithmetic_right_shift_operator > rightShiftSmall4; + output.mortonUnsignedRightShift_small_4 = rightShiftSmall4(morton_small_4A, castedShift % smallBits_4); + arithmetic_right_shift_operator > rightShiftMedium4; + output.mortonUnsignedRightShift_medium_4 = rightShiftMedium4(morton_medium_4A, castedShift % mediumBits_4); + arithmetic_right_shift_operator > rightShiftFull4; + output.mortonUnsignedRightShift_full_4 = rightShiftFull4(morton_full_4A, castedShift % fullBits_4); + arithmetic_right_shift_operator > rightShiftEmulated4; + output.mortonUnsignedRightShift_emulated_4 = rightShiftEmulated4(morton_emulated_4A, castedShift % fullBits_4); + + // Signed right-shift + arithmetic_right_shift_operator > rightShiftSignedSmall2; + output.mortonSignedRightShift_small_2 = rightShiftSignedSmall2(morton_small_2_signed, castedShift % smallBits_2); + arithmetic_right_shift_operator > rightShiftSignedMedium2; + 
output.mortonSignedRightShift_medium_2 = rightShiftSignedMedium2(morton_medium_2_signed, castedShift % mediumBits_2); + arithmetic_right_shift_operator > rightShiftSignedFull2; + output.mortonSignedRightShift_full_2 = rightShiftSignedFull2(morton_full_2_signed, castedShift % fullBits_2); + + arithmetic_right_shift_operator > rightShiftSignedSmall3; + output.mortonSignedRightShift_small_3 = rightShiftSignedSmall3(morton_small_3_signed, castedShift % smallBits_3); + arithmetic_right_shift_operator > rightShiftSignedMedium3; + output.mortonSignedRightShift_medium_3 = rightShiftSignedMedium3(morton_medium_3_signed, castedShift % mediumBits_3); + arithmetic_right_shift_operator > rightShiftSignedFull3; + output.mortonSignedRightShift_full_3 = rightShiftSignedFull3(morton_full_3_signed, castedShift % fullBits_3); + + arithmetic_right_shift_operator > rightShiftSignedSmall4; + output.mortonSignedRightShift_small_4 = rightShiftSignedSmall4(morton_small_4_signed, castedShift % smallBits_4); + arithmetic_right_shift_operator > rightShiftSignedMedium4; + output.mortonSignedRightShift_medium_4 = rightShiftSignedMedium4(morton_medium_4_signed, castedShift % mediumBits_4); + arithmetic_right_shift_operator > rightShiftSignedFull4; + output.mortonSignedRightShift_full_4 = rightShiftSignedFull4(morton_full_4_signed, castedShift % fullBits_4); +} \ No newline at end of file diff --git a/12_Mortons/main.cpp b/12_Mortons/main.cpp index f83c49b9e..18fd067ec 100644 --- a/12_Mortons/main.cpp +++ b/12_Mortons/main.cpp @@ -10,7 +10,7 @@ #include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" #include "app_resources/common.hlsl" -#include "Tester.h" +#include "CTester.h" using namespace nbl::core; using namespace nbl::hlsl; @@ -35,24 +35,21 @@ class MortonTest final : public MonoDeviceApplication, public MonoAssetManagerAn return false; if (!asset_base_t::onAppInitialized(std::move(system))) return false; - { - - } - Tester::PipelineSetupData pplnSetupData; + 
CTester::PipelineSetupData pplnSetupData; pplnSetupData.device = m_device; pplnSetupData.api = m_api; pplnSetupData.assetMgr = m_assetMgr; pplnSetupData.logger = m_logger; pplnSetupData.physicalDevice = m_physicalDevice; pplnSetupData.computeFamilyIndex = getComputeQueue()->getFamilyIndex(); + // Some tests with mortons with emulated uint storage were cut off, it should be fine since each tested on their own produces correct results for each operator { - Tester mortonTester; - pplnSetupData.testShaderPath = "app_resources/mortonTest.comp.hlsl"; + CTester mortonTester; + pplnSetupData.testShaderPath = "app_resources/test.comp.hlsl"; mortonTester.setupPipeline(pplnSetupData); mortonTester.performTests(); } - return true; } From f05dec4652d1af3fa1a4664760efb1f3e934134a Mon Sep 17 00:00:00 2001 From: Fletterio Date: Mon, 28 Apr 2025 15:29:40 -0300 Subject: [PATCH 9/9] Clarifying comment for blocker issue --- 12_Mortons/main.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/12_Mortons/main.cpp b/12_Mortons/main.cpp index 18fd067ec..a05e61842 100644 --- a/12_Mortons/main.cpp +++ b/12_Mortons/main.cpp @@ -44,6 +44,7 @@ class MortonTest final : public MonoDeviceApplication, public MonoAssetManagerAn pplnSetupData.physicalDevice = m_physicalDevice; pplnSetupData.computeFamilyIndex = getComputeQueue()->getFamilyIndex(); // Some tests with mortons with emulated uint storage were cut off, it should be fine since each tested on their own produces correct results for each operator + // Blocked by https://github.com/KhronosGroup/SPIRV-Tools/issues/6104 { CTester mortonTester; pplnSetupData.testShaderPath = "app_resources/test.comp.hlsl";