From 02286fa2618f1186d602eb9fd17fdc5600f2ecdb Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Mon, 12 Feb 2024 01:19:25 +0200 Subject: [PATCH 01/14] Migrate example 14 ComputeScan from old to new api --- 14_ComputeScan/CMakeLists.txt | 6 + 14_ComputeScan/config.json.template | 28 +++ 14_ComputeScan/main.cpp | 256 ++++++++++++++++++++++++++++ 14_ComputeScan/pipeline.groovy | 50 ++++++ CMakeLists.txt | 4 +- 5 files changed, 343 insertions(+), 1 deletion(-) create mode 100644 14_ComputeScan/CMakeLists.txt create mode 100644 14_ComputeScan/config.json.template create mode 100644 14_ComputeScan/main.cpp create mode 100644 14_ComputeScan/pipeline.groovy diff --git a/14_ComputeScan/CMakeLists.txt b/14_ComputeScan/CMakeLists.txt new file mode 100644 index 000000000..2f9218f93 --- /dev/null +++ b/14_ComputeScan/CMakeLists.txt @@ -0,0 +1,6 @@ +include(common RESULT_VARIABLE RES) +if(NOT RES) + message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory") +endif() + +nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") \ No newline at end of file diff --git a/14_ComputeScan/config.json.template b/14_ComputeScan/config.json.template new file mode 100644 index 000000000..a4ee411fa --- /dev/null +++ b/14_ComputeScan/config.json.template @@ -0,0 +1,28 @@ +{ + "enableParallelBuild": true, + "threadsPerBuildProcess" : 2, + "isExecuted": false, + "scriptPath": "", + "cmake": { + "configurations": [ "Release", "Debug", "RelWithDebInfo" ], + "buildModes": [], + "requiredOptions": [ "NBL_BUILD_CEGUI" ] + }, + "profiles": [ + { + "backend": "vulkan", + "platform": "windows", + "buildModes": [], + "runConfiguration": "Release", + "gpuArchitectures": [] + } + ], + "dependencies": [], + "data": [ + { + "dependencies": [], + "command": [""], + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/14_ComputeScan/main.cpp b/14_ComputeScan/main.cpp new file mode 100644 index 000000000..5d3588d5d --- 
/dev/null +++ b/14_ComputeScan/main.cpp @@ -0,0 +1,256 @@ +#include "../common/BasicMultiQueueApplication.hpp" +#include "../common/MonoAssetManagerAndBuiltinResourceApplication.hpp" + +#include +#include + +using namespace nbl; +using namespace core; +using namespace asset; +using namespace system; +using namespace video; + +class ComputeScanApp final : public examples::BasicMultiQueueApplication, public examples::MonoAssetManagerAndBuiltinResourceApplication +{ + using device_base_t = examples::BasicMultiQueueApplication; + using asset_base_t = examples::MonoAssetManagerAndBuiltinResourceApplication; + +public: + ComputeScanApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : + system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} + + bool onAppInitialized(smart_refctd_ptr&& system) override + { + if (!device_base_t::onAppInitialized(std::move(system))) + return false; + if (!asset_base_t::onAppInitialized(std::move(system))) + return false; + + transferDownQueue = getTransferDownQueue(); + computeQueue = getComputeQueue(); + + // Create (an almost) 128MB input buffer + constexpr auto in_size = 128u << 20u; + constexpr auto in_count = in_size / sizeof(uint32_t) - 23u; + + m_logger->log("Input element count: %d", ILogger::ELL_PERFORMANCE, in_count); + + auto in = new uint32_t[in_count]; + { + std::random_device random_device; + std::mt19937 generator(random_device()); + std::uniform_int_distribution distribution(0u, ~0u); + for (auto i = 0u; i < in_count; i++) + in[i] = distribution(generator); + } + auto minSSBOAlign = m_physicalDevice->getLimits().minSSBOAlignment; + constexpr auto begin = in_count / 4 + 118; + assert(((begin * sizeof(uint32_t)) & (minSSBOAlign - 1u)) == 0u); + constexpr auto end = in_count * 3 / 4 - 78; + assert(((end * sizeof(uint32_t)) & (minSSBOAlign - 1u)) == 0u); + constexpr auto elementCount = end - begin; + + 
smart_refctd_ptr gpuinputDataBuffer; + { + IGPUBuffer::SCreationParams inputDataBufferCreationParams = {}; + inputDataBufferCreationParams.size = sizeof(uint32_t) * elementCount; // TODO Declare the element data type in the shader? + inputDataBufferCreationParams.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_SRC_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT; + gpuinputDataBuffer = m_utils->createFilledDeviceLocalBufferOnDedMem( + getTransferUpQueue(), + std::move(inputDataBufferCreationParams), + inputData + ); + } + SBufferRange in_gpu_range = { begin * sizeof(uint32_t), elementCount * sizeof(uint32_t), gpuinputDataBuffer }; + + const auto scanType = video::CScanner::EST_EXCLUSIVE; + auto scanner = m_utils->getDefaultScanner(); + auto scan_pipeline = scanner->getDefaultPipeline(scanType, CScanner::EDT_UINT, CScanner::EO_ADD); + + CScanner::DefaultPushConstants scan_push_constants; + CScanner::DispatchInfo scan_dispatch_info; + scanner->buildParameters(elementCount, scan_push_constants, scan_dispatch_info); + + IGPUBuffer::SCreationParams params = { scan_push_constants.scanParams.getScratchSize(), bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_SRC_BIT }; + SBufferRange scratch_gpu_range = {0u, params.size, m_device->createBuffer(std::move(params)) }; + { + auto memReqs = scratch_gpu_range.buffer->getMemoryReqs(); + memReqs.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits(); + auto scratchMem = m_device->allocate(memReqs, scratch_gpu_range.buffer.get()); + } + + auto dsLayout = scanner->getDefaultDescriptorSetLayout(); + auto dsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE, &dsLayout, &dsLayout + 1u); + auto ds = dsPool->createDescriptorSet(core::smart_refctd_ptr(dsLayout)); + scanner->updateDescriptorSet(m_device.get(), ds.get(), in_gpu_range, scratch_gpu_range); + + { + smart_refctd_ptr cmdpool = m_device->createCommandPool(computeQueue->getFamilyIndex(), 
IGPUCommandPool::ECF_RESET_COMMAND_BUFFER_BIT); + if (!m_device->createCommandBuffers(cmdpool.get(), IGPUCommandBuffer::EL_PRIMARY, 1u, &cmdbuf)) + { + logFail("Failed to create Command Buffers!\n"); + return false; + } + } + + cmdbuf->begin(IGPUCommandBuffer::EU_SIMULTANEOUS_USE_BIT); // (REVIEW): not sure about this + cmdbuf->fillBuffer(scratch_gpu_range.buffer.get(), 0u, sizeof(uint32_t) + scratch_gpu_range.size / 2u, 0u); + cmdbuf->bindComputePipeline(scan_pipeline); + auto pipeline_layout = scan_pipeline->getLayout(); + cmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, pipeline_layout, 0u, 1u, &ds.get()); + scanner->dispatchHelper( + cmdbuf.get(), pipeline_layout, scan_push_constants, scan_dispatch_info, + static_cast(asset::EPSF_COMPUTE_SHADER_BIT | asset::EPSF_TRANSFER_BIT), 0u, nullptr, + static_cast(asset::EPSF_COMPUTE_SHADER_BIT | asset::EPSF_TRANSFER_BIT), 0u, nullptr + ); + cmdbuf->end(); + + core::smart_refctd_ptr fence = m_device->createFence(IGPUFence::ECF_UNSIGNALED); + + IGPUQueue::SSubmitInfo submit = {}; + submit.commandBufferCount = 1u; + submit.commandBuffers = &cmdbuf.get(); + computeQueue->startCapture(); + computeQueue->submit(1, &submit, fence.get()); + computeQueue->endCapture(); + + // cpu counterpart + auto cpu_begin = in + begin; + m_logger->log("CPU scan begin", system::ILogger::ELL_PERFORMANCE); + + auto start = std::chrono::high_resolution_clock::now(); + switch (scanType) + { + case video::CScanner::EST_INCLUSIVE: + std::inclusive_scan(cpu_begin, in + end, cpu_begin); + break; + case video::CScanner::EST_EXCLUSIVE: + std::exclusive_scan(cpu_begin, in + end, cpu_begin, 0u); + break; + default: + assert(false); + exit(0xdeadbeefu); + break; + } + auto stop = std::chrono::high_resolution_clock::now(); + + m_logger->log("CPU scan end. 
Time taken: %d us", system::ILogger::ELL_PERFORMANCE, std::chrono::duration_cast(stop - start).count()); + + // wait for the gpu impl to complete + m_device->blockForFences(1u, &fence.get()); + + { + IGPUBuffer::SCreationParams params = {}; + params.size = in_gpu_range.size; + params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT; + // (REVIEW): Check if this new download_buffer is needed or if we can directly read from the gpu_input buffer + auto downloaded_buffer = m_device->createBuffer(std::move(params)); + auto memReqs = downloaded_buffer->getMemoryReqs(); + memReqs.memoryTypeBits &= m_physicalDevice->getDownStreamingMemoryTypeBits(); + auto queriesMem = m_device->allocate(memReqs, downloaded_buffer.get()); + { + // (REVIEW): Maybe we can just reset the cmdbuf we already have? + core::smart_refctd_ptr cmdbuf; + { + auto cmdPool = m_device->createCommandPool(computeQueue->getFamilyIndex(), IGPUCommandPool::ECF_NONE); + m_device->createCommandBuffers(cmdPool.get(), IGPUCommandBuffer::EL_PRIMARY, 1u, &cmdbuf); + } + cmdbuf->begin(IGPUCommandBuffer::EU_ONE_TIME_SUBMIT_BIT); // TODO: Reset Frame's CommandPool + asset::SBufferCopy region; + region.srcOffset = in_gpu_range.offset; + region.dstOffset = 0u; + region.size = in_gpu_range.size; + cmdbuf->copyBuffer(in_gpu_range.buffer.get(), downloaded_buffer.get(), 1u, ®ion); + cmdbuf->end(); + fence = m_device->createFence(IGPUFence::ECF_UNSIGNALED); + IGPUQueue::SSubmitInfo submit = {}; + submit.commandBufferCount = 1u; + submit.commandBuffers = &cmdbuf.get(); + computeQueue->submit(1u, &submit, fence.get()); + m_device->blockForFences(1u, &fence.get()); + } + + auto mem = const_cast(downloaded_buffer->getBoundMemory()); + { + video::IDeviceMemoryAllocation::MappedMemoryRange range; + { + range.memory = mem; + range.offset = 0u; + range.length = in_gpu_range.size; + } + m_device->mapMemory(range, video::IDeviceMemoryAllocation::EMCAF_READ); + } + auto gpu_begin = reinterpret_cast(mem->getMappedPointer()); + for (auto i = 0u; 
i < elementCount; i++) + { + if (gpu_begin[i] != cpu_begin[i]) + _NBL_DEBUG_BREAK_IF(true); + } + m_logger->log("Result Comparison Test Passed", system::ILogger::ELL_PERFORMANCE); + } + + delete[] in; + + return true; + } + + virtual video::SPhysicalDeviceFeatures getRequiredDeviceFeatures() const override + { + video::SPhysicalDeviceFeatures retval = {}; + + retval.bufferDeviceAddress = true; + retval.subgroupBroadcastDynamicId = true; + retval.shaderSubgroupExtendedTypes = true; + // TODO: actually need to implement this and set it on the pipelines + retval.computeFullSubgroups = true; + retval.subgroupSizeControl = true; + + return retval; + } + + virtual bool onAppTerminated() override + { + m_logger->log("==========Result==========", ILogger::ELL_INFO); + m_logger->log("Scan Success: %b", ILogger::ELL_INFO, scanSuccess); + delete[] inputData; + return true; + } + + // the unit test is carried out on init + void workLoopBody() override {} + + bool keepRunning() override { return false; } + +private: + void logTestOutcome(bool passed, uint32_t workgroupSize) + { + if (passed) + m_logger->log("Passed test #%u", ILogger::ELL_INFO, workgroupSize); + else + { + m_logger->log("Failed test #%u", ILogger::ELL_ERROR, workgroupSize); + } + } + + smart_refctd_ptr createPipeline(smart_refctd_ptr&& overridenUnspecialized) + { + auto shader = m_device->createShader(std::move(overridenUnspecialized)); + auto specialized = m_device->createSpecializedShader(shader.get(), ISpecializedShader::SInfo(nullptr, nullptr, "main")); + return m_device->createComputePipeline(nullptr, smart_refctd_ptr(pipelineLayout), std::move(specialized)); + } + + IGPUQueue* transferDownQueue; + IGPUQueue* computeQueue; + + uint32_t* inputData = nullptr; + smart_refctd_ptr descriptorSet; + smart_refctd_ptr pipelineLayout; + + smart_refctd_ptr fence; + smart_refctd_ptr cmdbuf; + smart_refctd_ptr resultsBuffer; + + bool scanSuccess = false; +}; + +NBL_MAIN_FUNC(ComputeScanApp) \ No newline at end of file 
diff --git a/14_ComputeScan/pipeline.groovy b/14_ComputeScan/pipeline.groovy new file mode 100644 index 000000000..4eaaafe23 --- /dev/null +++ b/14_ComputeScan/pipeline.groovy @@ -0,0 +1,50 @@ +import org.DevshGraphicsProgramming.Agent +import org.DevshGraphicsProgramming.BuilderInfo +import org.DevshGraphicsProgramming.IBuilder + +class CComputeScanBuilder extends IBuilder +{ + public CComputeScanBuilder(Agent _agent, _info) + { + super(_agent, _info) + } + + @Override + public boolean prepare(Map axisMapping) + { + return true + } + + @Override + public boolean build(Map axisMapping) + { + IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION") + IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE") + + def nameOfBuildDirectory = getNameOfBuildDirectory(buildType) + def nameOfConfig = getNameOfConfig(config) + + agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v") + + return true + } + + @Override + public boolean test(Map axisMapping) + { + return true + } + + @Override + public boolean install(Map axisMapping) + { + return true + } +} + +def create(Agent _agent, _info) +{ + return new CComputeScanBuilder(_agent, _info) +} + +return this \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 6a20a33a9..57d7c0141 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -26,7 +26,9 @@ if(NBL_BUILD_EXAMPLES) add_subdirectory(07_StagingAndMultipleQueues EXCLUDE_FROM_ALL) # showcase the set-up of a swapchain and picking of a matching device add_subdirectory(08_HelloSwapchain EXCLUDE_FROM_ALL) - + + # global scan + add_subdirectory(14_ComputeScan EXCLUDE_FROM_ALL) # Waiting for a refactor #add_subdirectory(27_PLYSTLDemo EXCLUDE_FROM_ALL) #add_subdirectory(29_SpecializationConstants EXCLUDE_FROM_ALL) From 8fe0d41d4d3ac0b9b10bb5040a8ba0f40ea17dc4 Mon Sep 17 00:00:00 2001 From: PentaKon Date: Mon, 12 Feb 
2024 13:15:03 +0200 Subject: [PATCH 02/14] Migrate example 14_ComputeScan to new APIs --- 14_ComputeScan/CMakeLists.txt | 6 + 14_ComputeScan/config.json.template | 28 +++ 14_ComputeScan/main.cpp | 254 ++++++++++++++++++++++++++++ 14_ComputeScan/pipeline.groovy | 50 ++++++ CMakeLists.txt | 4 +- 5 files changed, 341 insertions(+), 1 deletion(-) create mode 100644 14_ComputeScan/CMakeLists.txt create mode 100644 14_ComputeScan/config.json.template create mode 100644 14_ComputeScan/main.cpp create mode 100644 14_ComputeScan/pipeline.groovy diff --git a/14_ComputeScan/CMakeLists.txt b/14_ComputeScan/CMakeLists.txt new file mode 100644 index 000000000..2f9218f93 --- /dev/null +++ b/14_ComputeScan/CMakeLists.txt @@ -0,0 +1,6 @@ +include(common RESULT_VARIABLE RES) +if(NOT RES) + message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory") +endif() + +nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") \ No newline at end of file diff --git a/14_ComputeScan/config.json.template b/14_ComputeScan/config.json.template new file mode 100644 index 000000000..a4ee411fa --- /dev/null +++ b/14_ComputeScan/config.json.template @@ -0,0 +1,28 @@ +{ + "enableParallelBuild": true, + "threadsPerBuildProcess" : 2, + "isExecuted": false, + "scriptPath": "", + "cmake": { + "configurations": [ "Release", "Debug", "RelWithDebInfo" ], + "buildModes": [], + "requiredOptions": [ "NBL_BUILD_CEGUI" ] + }, + "profiles": [ + { + "backend": "vulkan", + "platform": "windows", + "buildModes": [], + "runConfiguration": "Release", + "gpuArchitectures": [] + } + ], + "dependencies": [], + "data": [ + { + "dependencies": [], + "command": [""], + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/14_ComputeScan/main.cpp b/14_ComputeScan/main.cpp new file mode 100644 index 000000000..59ff420d9 --- /dev/null +++ b/14_ComputeScan/main.cpp @@ -0,0 +1,254 @@ +#include "../common/BasicMultiQueueApplication.hpp" +#include 
"../common/MonoAssetManagerAndBuiltinResourceApplication.hpp" + +#include +#include + +using namespace nbl; +using namespace core; +using namespace asset; +using namespace system; +using namespace video; + +class ComputeScanApp final : public examples::BasicMultiQueueApplication, public examples::MonoAssetManagerAndBuiltinResourceApplication +{ + using device_base_t = examples::BasicMultiQueueApplication; + using asset_base_t = examples::MonoAssetManagerAndBuiltinResourceApplication; + +public: + ComputeScanApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : + system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} + + bool onAppInitialized(smart_refctd_ptr&& system) override + { + if (!device_base_t::onAppInitialized(std::move(system))) + return false; + if (!asset_base_t::onAppInitialized(std::move(system))) + return false; + + transferDownQueue = getTransferDownQueue(); + computeQueue = getComputeQueue(); + + // Create (an almost) 128MB input buffer + constexpr auto in_size = 128u << 20u; + constexpr auto in_count = in_size / sizeof(uint32_t) - 23u; + + m_logger->log("Input element count: %d", ILogger::ELL_PERFORMANCE, in_count); + + auto in = new uint32_t[in_count]; + { + std::random_device random_device; + std::mt19937 generator(random_device()); + std::uniform_int_distribution distribution(0u, ~0u); + for (auto i = 0u; i < in_count; i++) + in[i] = distribution(generator); + } + auto minSSBOAlign = m_physicalDevice->getLimits().minSSBOAlignment; + constexpr auto begin = in_count / 4 + 118; + assert(((begin * sizeof(uint32_t)) & (minSSBOAlign - 1u)) == 0u); + constexpr auto end = in_count * 3 / 4 - 78; + assert(((end * sizeof(uint32_t)) & (minSSBOAlign - 1u)) == 0u); + constexpr auto elementCount = end - begin; + + smart_refctd_ptr gpuinputDataBuffer; + { + IGPUBuffer::SCreationParams inputDataBufferCreationParams = {}; + 
inputDataBufferCreationParams.size = sizeof(uint32_t) * elementCount; // TODO Declare the element data type in the shader? + inputDataBufferCreationParams.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_SRC_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT; + gpuinputDataBuffer = m_utils->createFilledDeviceLocalBufferOnDedMem( + getTransferUpQueue(), + std::move(inputDataBufferCreationParams), + inputData + ); + } + SBufferRange in_gpu_range = { begin * sizeof(uint32_t), elementCount * sizeof(uint32_t), gpuinputDataBuffer }; + + const auto scanType = video::CScanner::EST_EXCLUSIVE; + auto scanner = m_utils->getDefaultScanner(); + auto scan_pipeline = scanner->getDefaultPipeline(scanType, CScanner::EDT_UINT, CScanner::EO_ADD); + + CScanner::DefaultPushConstants scan_push_constants; + CScanner::DispatchInfo scan_dispatch_info; + scanner->buildParameters(elementCount, scan_push_constants, scan_dispatch_info); + + IGPUBuffer::SCreationParams params = { scan_push_constants.scanParams.getScratchSize(), bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_SRC_BIT }; + SBufferRange scratch_gpu_range = {0u, params.size, m_device->createBuffer(std::move(params)) }; + { + auto memReqs = scratch_gpu_range.buffer->getMemoryReqs(); + memReqs.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits(); + auto scratchMem = m_device->allocate(memReqs, scratch_gpu_range.buffer.get()); + } + + auto dsLayout = scanner->getDefaultDescriptorSetLayout(); + auto dsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE, &dsLayout, &dsLayout + 1u); + auto ds = dsPool->createDescriptorSet(core::smart_refctd_ptr(dsLayout)); + scanner->updateDescriptorSet(m_device.get(), ds.get(), in_gpu_range, scratch_gpu_range); + + { + smart_refctd_ptr cmdpool = m_device->createCommandPool(computeQueue->getFamilyIndex(), IGPUCommandPool::ECF_RESET_COMMAND_BUFFER_BIT); + if (!m_device->createCommandBuffers(cmdpool.get(), IGPUCommandBuffer::EL_PRIMARY, 
1u, &cmdbuf)) + { + logFail("Failed to create Command Buffers!\n"); + return false; + } + } + + cmdbuf->begin(IGPUCommandBuffer::EU_SIMULTANEOUS_USE_BIT); // (REVIEW): not sure about this + cmdbuf->fillBuffer(scratch_gpu_range.buffer.get(), 0u, sizeof(uint32_t) + scratch_gpu_range.size / 2u, 0u); + cmdbuf->bindComputePipeline(scan_pipeline); + auto pipeline_layout = scan_pipeline->getLayout(); + cmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, pipeline_layout, 0u, 1u, &ds.get()); + scanner->dispatchHelper( + cmdbuf.get(), pipeline_layout, scan_push_constants, scan_dispatch_info, + static_cast(asset::EPSF_COMPUTE_SHADER_BIT | asset::EPSF_TRANSFER_BIT), 0u, nullptr, + static_cast(asset::EPSF_COMPUTE_SHADER_BIT | asset::EPSF_TRANSFER_BIT), 0u, nullptr + ); + cmdbuf->end(); + + core::smart_refctd_ptr fence = m_device->createFence(IGPUFence::ECF_UNSIGNALED); + + IGPUQueue::SSubmitInfo submit = {}; + submit.commandBufferCount = 1u; + submit.commandBuffers = &cmdbuf.get(); + computeQueue->startCapture(); + computeQueue->submit(1, &submit, fence.get()); + computeQueue->endCapture(); + + // cpu counterpart + auto cpu_begin = in + begin; + m_logger->log("CPU scan begin", system::ILogger::ELL_PERFORMANCE); + + auto start = std::chrono::high_resolution_clock::now(); + switch (scanType) + { + case video::CScanner::EST_INCLUSIVE: + std::inclusive_scan(cpu_begin, in + end, cpu_begin); + break; + case video::CScanner::EST_EXCLUSIVE: + std::exclusive_scan(cpu_begin, in + end, cpu_begin, 0u); + break; + default: + assert(false); + exit(0xdeadbeefu); + break; + } + auto stop = std::chrono::high_resolution_clock::now(); + + m_logger->log("CPU scan end. 
Time taken: %d us", system::ILogger::ELL_PERFORMANCE, std::chrono::duration_cast(stop - start).count()); + + // wait for the gpu impl to complete + m_device->blockForFences(1u, &fence.get()); + + { + IGPUBuffer::SCreationParams params = {}; + params.size = in_gpu_range.size; + params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT; + // (REVIEW): Check if this new download_buffer is needed or if we can directly read from the gpu_input buffer + auto downloaded_buffer = m_device->createBuffer(std::move(params)); + auto memReqs = downloaded_buffer->getMemoryReqs(); + memReqs.memoryTypeBits &= m_physicalDevice->getDownStreamingMemoryTypeBits(); + auto queriesMem = m_device->allocate(memReqs, downloaded_buffer.get()); + { + // (REVIEW): Maybe we can just reset the cmdbuf we already have? + core::smart_refctd_ptr cmdbuf; + { + auto cmdPool = m_device->createCommandPool(computeQueue->getFamilyIndex(), IGPUCommandPool::ECF_NONE); + m_device->createCommandBuffers(cmdPool.get(), IGPUCommandBuffer::EL_PRIMARY, 1u, &cmdbuf); + } + cmdbuf->begin(IGPUCommandBuffer::EU_ONE_TIME_SUBMIT_BIT); // TODO: Reset Frame's CommandPool + asset::SBufferCopy region; + region.srcOffset = in_gpu_range.offset; + region.dstOffset = 0u; + region.size = in_gpu_range.size; + cmdbuf->copyBuffer(in_gpu_range.buffer.get(), downloaded_buffer.get(), 1u, ®ion); + cmdbuf->end(); + fence = m_device->createFence(IGPUFence::ECF_UNSIGNALED); + IGPUQueue::SSubmitInfo submit = {}; + submit.commandBufferCount = 1u; + submit.commandBuffers = &cmdbuf.get(); + computeQueue->submit(1u, &submit, fence.get()); + m_device->blockForFences(1u, &fence.get()); + } + + auto mem = const_cast(downloaded_buffer->getBoundMemory()); + { + video::IDeviceMemoryAllocation::MappedMemoryRange range; + { + range.memory = mem; + range.offset = 0u; + range.length = in_gpu_range.size; + } + m_device->mapMemory(range, video::IDeviceMemoryAllocation::EMCAF_READ); + } + auto gpu_begin = reinterpret_cast(mem->getMappedPointer()); + for (auto i = 0u; 
i < elementCount; i++) + { + if (gpu_begin[i] != cpu_begin[i]) + _NBL_DEBUG_BREAK_IF(true); + } + m_logger->log("Result Comparison Test Passed", system::ILogger::ELL_PERFORMANCE); + } + + delete[] in; + + return true; + } + + virtual video::SPhysicalDeviceFeatures getRequiredDeviceFeatures() const override + { + video::SPhysicalDeviceFeatures retval = {}; + + retval.bufferDeviceAddress = true; + retval.subgroupBroadcastDynamicId = true; + retval.shaderSubgroupExtendedTypes = true; + // TODO: actually need to implement this and set it on the pipelines + retval.computeFullSubgroups = true; + retval.subgroupSizeControl = true; + + return retval; + } + + virtual bool onAppTerminated() override + { + m_logger->log("==========Result==========", ILogger::ELL_INFO); + m_logger->log("Scan Success: %b", ILogger::ELL_INFO, scanSuccess); + delete[] inputData; + return true; + } + + // the unit test is carried out on init + void workLoopBody() override {} + + bool keepRunning() override { return false; } + +private: + void logTestOutcome(bool passed, uint32_t workgroupSize) + { + if (passed) + m_logger->log("Passed test #%u", ILogger::ELL_INFO, workgroupSize); + else + { + m_logger->log("Failed test #%u", ILogger::ELL_ERROR, workgroupSize); + } + } + + smart_refctd_ptr createPipeline(smart_refctd_ptr&& overridenUnspecialized) + { + auto shader = m_device->createShader(std::move(overridenUnspecialized)); + auto specialized = m_device->createSpecializedShader(shader.get(), ISpecializedShader::SInfo(nullptr, nullptr, "main")); + return m_device->createComputePipeline(nullptr, smart_refctd_ptr(pipelineLayout), std::move(specialized)); + } + + IGPUQueue* transferDownQueue; + IGPUQueue* computeQueue; + + uint32_t* inputData = nullptr; + smart_refctd_ptr descriptorSet; + smart_refctd_ptr pipelineLayout; + + smart_refctd_ptr fence; + smart_refctd_ptr cmdbuf; + smart_refctd_ptr resultsBuffer; + + bool scanSuccess = false; +}; \ No newline at end of file diff --git 
a/14_ComputeScan/pipeline.groovy b/14_ComputeScan/pipeline.groovy new file mode 100644 index 000000000..4eaaafe23 --- /dev/null +++ b/14_ComputeScan/pipeline.groovy @@ -0,0 +1,50 @@ +import org.DevshGraphicsProgramming.Agent +import org.DevshGraphicsProgramming.BuilderInfo +import org.DevshGraphicsProgramming.IBuilder + +class CComputeScanBuilder extends IBuilder +{ + public CComputeScanBuilder(Agent _agent, _info) + { + super(_agent, _info) + } + + @Override + public boolean prepare(Map axisMapping) + { + return true + } + + @Override + public boolean build(Map axisMapping) + { + IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION") + IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE") + + def nameOfBuildDirectory = getNameOfBuildDirectory(buildType) + def nameOfConfig = getNameOfConfig(config) + + agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v") + + return true + } + + @Override + public boolean test(Map axisMapping) + { + return true + } + + @Override + public boolean install(Map axisMapping) + { + return true + } +} + +def create(Agent _agent, _info) +{ + return new CComputeScanBuilder(_agent, _info) +} + +return this \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 6a20a33a9..57d7c0141 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -26,7 +26,9 @@ if(NBL_BUILD_EXAMPLES) add_subdirectory(07_StagingAndMultipleQueues EXCLUDE_FROM_ALL) # showcase the set-up of a swapchain and picking of a matching device add_subdirectory(08_HelloSwapchain EXCLUDE_FROM_ALL) - + + # global scan + add_subdirectory(14_ComputeScan EXCLUDE_FROM_ALL) # Waiting for a refactor #add_subdirectory(27_PLYSTLDemo EXCLUDE_FROM_ALL) #add_subdirectory(29_SpecializationConstants EXCLUDE_FROM_ALL) From 56074055a015fb9c50b6b474d849db5295fe7752 Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Mon, 22 Apr 
2024 00:15:25 +0300 Subject: [PATCH 03/14] Fix compute scan example code to compile and work --- 14_ComputeScan/main.cpp | 175 +++++++++++++++++++++++----------------- 1 file changed, 101 insertions(+), 74 deletions(-) diff --git a/14_ComputeScan/main.cpp b/14_ComputeScan/main.cpp index 5d3588d5d..a6a6c8929 100644 --- a/14_ComputeScan/main.cpp +++ b/14_ComputeScan/main.cpp @@ -1,5 +1,5 @@ -#include "../common/BasicMultiQueueApplication.hpp" -#include "../common/MonoAssetManagerAndBuiltinResourceApplication.hpp" +#include "nbl/application_templates/BasicMultiQueueApplication.hpp" +#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" #include #include @@ -10,10 +10,10 @@ using namespace asset; using namespace system; using namespace video; -class ComputeScanApp final : public examples::BasicMultiQueueApplication, public examples::MonoAssetManagerAndBuiltinResourceApplication +class ComputeScanApp final : public application_templates::BasicMultiQueueApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication { - using device_base_t = examples::BasicMultiQueueApplication; - using asset_base_t = examples::MonoAssetManagerAndBuiltinResourceApplication; + using device_base_t = application_templates::BasicMultiQueueApplication; + using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; public: ComputeScanApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : @@ -26,22 +26,21 @@ class ComputeScanApp final : public examples::BasicMultiQueueApplication, public if (!asset_base_t::onAppInitialized(std::move(system))) return false; - transferDownQueue = getTransferDownQueue(); computeQueue = getComputeQueue(); // Create (an almost) 128MB input buffer - constexpr auto in_size = 128u << 20u; - constexpr auto in_count = in_size / sizeof(uint32_t) - 23u; + constexpr auto in_size = 128u << 5u; + constexpr auto in_count = 
1418;//in_size / sizeof(uint32_t) - 23u; m_logger->log("Input element count: %d", ILogger::ELL_PERFORMANCE, in_count); - auto in = new uint32_t[in_count]; + inputData = new uint32_t[in_count]; { std::random_device random_device; std::mt19937 generator(random_device()); std::uniform_int_distribution distribution(0u, ~0u); for (auto i = 0u; i < in_count; i++) - in[i] = distribution(generator); + inputData[i] = distribution(generator) % 100000; } auto minSSBOAlign = m_physicalDevice->getLimits().minSSBOAlignment; constexpr auto begin = in_count / 4 + 118; @@ -53,10 +52,10 @@ class ComputeScanApp final : public examples::BasicMultiQueueApplication, public smart_refctd_ptr gpuinputDataBuffer; { IGPUBuffer::SCreationParams inputDataBufferCreationParams = {}; - inputDataBufferCreationParams.size = sizeof(uint32_t) * elementCount; // TODO Declare the element data type in the shader? + inputDataBufferCreationParams.size = sizeof(uint32_t) * in_count; // TODO Declare the element data type in the shader? 
inputDataBufferCreationParams.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_SRC_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT; gpuinputDataBuffer = m_utils->createFilledDeviceLocalBufferOnDedMem( - getTransferUpQueue(), + { .queue = getTransferUpQueue() }, std::move(inputDataBufferCreationParams), inputData ); @@ -65,13 +64,12 @@ class ComputeScanApp final : public examples::BasicMultiQueueApplication, public const auto scanType = video::CScanner::EST_EXCLUSIVE; auto scanner = m_utils->getDefaultScanner(); - auto scan_pipeline = scanner->getDefaultPipeline(scanType, CScanner::EDT_UINT, CScanner::EO_ADD); CScanner::DefaultPushConstants scan_push_constants; CScanner::DispatchInfo scan_dispatch_info; scanner->buildParameters(elementCount, scan_push_constants, scan_dispatch_info); - IGPUBuffer::SCreationParams params = { scan_push_constants.scanParams.getScratchSize(), bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_SRC_BIT }; + IGPUBuffer::SCreationParams params = { scan_push_constants.scanParams.getScratchSize(), bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT }; SBufferRange scratch_gpu_range = {0u, params.size, m_device->createBuffer(std::move(params)) }; { auto memReqs = scratch_gpu_range.buffer->getMemoryReqs(); @@ -79,53 +77,66 @@ class ComputeScanApp final : public examples::BasicMultiQueueApplication, public auto scratchMem = m_device->allocate(memReqs, scratch_gpu_range.buffer.get()); } + auto scan_pipeline = scanner->getDefaultPipeline(scanType, CScanner::EDT_UINT, CScanner::EO_ADD, scan_push_constants.scanParams.getScratchSize()); auto dsLayout = scanner->getDefaultDescriptorSetLayout(); - auto dsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE, &dsLayout, &dsLayout + 1u); + auto dsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE, { &dsLayout, 1 }); auto ds = dsPool->createDescriptorSet(core::smart_refctd_ptr(dsLayout)); 
scanner->updateDescriptorSet(m_device.get(), ds.get(), in_gpu_range, scratch_gpu_range); { - smart_refctd_ptr cmdpool = m_device->createCommandPool(computeQueue->getFamilyIndex(), IGPUCommandPool::ECF_RESET_COMMAND_BUFFER_BIT); - if (!m_device->createCommandBuffers(cmdpool.get(), IGPUCommandBuffer::EL_PRIMARY, 1u, &cmdbuf)) + smart_refctd_ptr cmdpool = m_device->createCommandPool(computeQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { &cmdbuf,1 })) { logFail("Failed to create Command Buffers!\n"); return false; } } - cmdbuf->begin(IGPUCommandBuffer::EU_SIMULTANEOUS_USE_BIT); // (REVIEW): not sure about this - cmdbuf->fillBuffer(scratch_gpu_range.buffer.get(), 0u, sizeof(uint32_t) + scratch_gpu_range.size / 2u, 0u); + cmdbuf->begin(IGPUCommandBuffer::USAGE::SIMULTANEOUS_USE_BIT); // (REVIEW): not sure about this + cmdbuf->fillBuffer(scratch_gpu_range, 0u); cmdbuf->bindComputePipeline(scan_pipeline); auto pipeline_layout = scan_pipeline->getLayout(); cmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, pipeline_layout, 0u, 1u, &ds.get()); - scanner->dispatchHelper( - cmdbuf.get(), pipeline_layout, scan_push_constants, scan_dispatch_info, - static_cast(asset::EPSF_COMPUTE_SHADER_BIT | asset::EPSF_TRANSFER_BIT), 0u, nullptr, - static_cast(asset::EPSF_COMPUTE_SHADER_BIT | asset::EPSF_TRANSFER_BIT), 0u, nullptr - ); + scanner->dispatchHelper(cmdbuf.get(), pipeline_layout, scan_push_constants, scan_dispatch_info, 0u, nullptr, 0u, nullptr); cmdbuf->end(); - core::smart_refctd_ptr fence = m_device->createFence(IGPUFence::ECF_UNSIGNALED); - - IGPUQueue::SSubmitInfo submit = {}; - submit.commandBufferCount = 1u; - submit.commandBuffers = &cmdbuf.get(); - computeQueue->startCapture(); - computeQueue->submit(1, &submit, fence.get()); - computeQueue->endCapture(); + core::smart_refctd_ptr semaphore = m_device->createSemaphore(0); + // submit + 
IQueue::SSubmitInfo::SSemaphoreInfo semInfo[1] = { { + .semaphore = semaphore.get(), + .value = 1, + // just as we've outputted all pixels, signal + .stageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT + } }; + { + const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[1] = { { + .cmdbuf = cmdbuf.get() + } }; + + const IQueue::SSubmitInfo infos[1] = { { + .commandBuffers = commandBuffers, + .signalSemaphores = semInfo + } }; + + computeQueue->startCapture(); + if (computeQueue->submit(infos) != IQueue::RESULT::SUCCESS) { + m_logger->log("Submission failure", system::ILogger::ELL_ERROR); + } + computeQueue->endCapture(); + } // cpu counterpart - auto cpu_begin = in + begin; + auto cpu_begin = inputData + begin; m_logger->log("CPU scan begin", system::ILogger::ELL_PERFORMANCE); auto start = std::chrono::high_resolution_clock::now(); switch (scanType) { case video::CScanner::EST_INCLUSIVE: - std::inclusive_scan(cpu_begin, in + end, cpu_begin); + std::inclusive_scan(cpu_begin, inputData + end, cpu_begin); break; case video::CScanner::EST_EXCLUSIVE: - std::exclusive_scan(cpu_begin, in + end, cpu_begin, 0u); + std::exclusive_scan(cpu_begin, inputData + end, cpu_begin, 0u); break; default: assert(false); @@ -137,7 +148,14 @@ class ComputeScanApp final : public examples::BasicMultiQueueApplication, public m_logger->log("CPU scan end. 
Time taken: %d us", system::ILogger::ELL_PERFORMANCE, std::chrono::duration_cast(stop - start).count()); // wait for the gpu impl to complete - m_device->blockForFences(1u, &fence.get()); + const ISemaphore::SWaitInfo cmdbufDonePending[] = {{ + .semaphore = semaphore.get(), + .value = 1 + }}; + if (m_device->blockForSemaphores(cmdbufDonePending) != ISemaphore::WAIT_RESULT::SUCCESS) { + m_logger->log("Blocking for operation semaphore failed", ILogger::ELL_ERROR); + return false; + } { IGPUBuffer::SCreationParams params = {}; @@ -152,33 +170,52 @@ class ComputeScanApp final : public examples::BasicMultiQueueApplication, public // (REVIEW): Maybe we can just reset the cmdbuf we already have? core::smart_refctd_ptr cmdbuf; { - auto cmdPool = m_device->createCommandPool(computeQueue->getFamilyIndex(), IGPUCommandPool::ECF_NONE); - m_device->createCommandBuffers(cmdPool.get(), IGPUCommandBuffer::EL_PRIMARY, 1u, &cmdbuf); + auto cmdPool = m_device->createCommandPool(computeQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::NONE); + cmdPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { &cmdbuf , 1}, core::smart_refctd_ptr(m_logger)); } - cmdbuf->begin(IGPUCommandBuffer::EU_ONE_TIME_SUBMIT_BIT); // TODO: Reset Frame's CommandPool - asset::SBufferCopy region; + cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); // TODO: Reset Frame's CommandPool + IGPUCommandBuffer::SBufferCopy region; region.srcOffset = in_gpu_range.offset; region.dstOffset = 0u; region.size = in_gpu_range.size; cmdbuf->copyBuffer(in_gpu_range.buffer.get(), downloaded_buffer.get(), 1u, &region); cmdbuf->end(); - fence = m_device->createFence(IGPUFence::ECF_UNSIGNALED); - IGPUQueue::SSubmitInfo submit = {}; - submit.commandBufferCount = 1u; - submit.commandBuffers = &cmdbuf.get(); - computeQueue->submit(1u, &submit, fence.get()); - m_device->blockForFences(1u, &fence.get()); + + { + const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[1] = { { + .cmdbuf = cmdbuf.get() + 
} }; + + semInfo[0].value = 2; + const IQueue::SSubmitInfo infos[1] = { { + .commandBuffers = commandBuffers, + .signalSemaphores = semInfo + } }; + + if (computeQueue->submit(infos) != IQueue::RESULT::SUCCESS) { + m_logger->log("Download submission failure", system::ILogger::ELL_ERROR); + } + + const ISemaphore::SWaitInfo cmdbufDonePending[] = { { + .semaphore = semaphore.get(), + .value = 2 + } }; + if (m_device->blockForSemaphores(cmdbufDonePending) != ISemaphore::WAIT_RESULT::SUCCESS) { + m_logger->log("Blocking for download semaphore failed", ILogger::ELL_ERROR); + return false; + } + } } - auto mem = const_cast(downloaded_buffer->getBoundMemory()); + auto mem = const_cast(downloaded_buffer->getBoundMemory().memory); { - video::IDeviceMemoryAllocation::MappedMemoryRange range; + ILogicalDevice::MappedMemoryRange range; { range.memory = mem; range.offset = 0u; range.length = in_gpu_range.size; } - m_device->mapMemory(range, video::IDeviceMemoryAllocation::EMCAF_READ); + mem->map({ .offset = range.offset, .length = range.length }, video::IDeviceMemoryAllocation::EMCAF_READ); } auto gpu_begin = reinterpret_cast(mem->getMappedPointer()); for (auto i = 0u; i < elementCount; i++) @@ -187,31 +224,32 @@ class ComputeScanApp final : public examples::BasicMultiQueueApplication, public _NBL_DEBUG_BREAK_IF(true); } m_logger->log("Result Comparison Test Passed", system::ILogger::ELL_PERFORMANCE); + scanSuccess = true; } - delete[] in; + delete[] inputData; return true; } - virtual video::SPhysicalDeviceFeatures getRequiredDeviceFeatures() const override - { - video::SPhysicalDeviceFeatures retval = {}; + //virtual video::SPhysicalDeviceFeatures getRequiredDeviceFeatures() const override + //{ + // video::SPhysicalDeviceFeatures retval = {}; - retval.bufferDeviceAddress = true; - retval.subgroupBroadcastDynamicId = true; - retval.shaderSubgroupExtendedTypes = true; - // TODO: actually need to implement this and set it on the pipelines - retval.computeFullSubgroups = true; - 
retval.subgroupSizeControl = true; + // retval.bufferDeviceAddress = true; + // retval.subgroupBroadcastDynamicId = true; + // retval.shaderSubgroupExtendedTypes = true; + // // TODO: actually need to implement this and set it on the pipelines + // retval.computeFullSubgroups = true; + // retval.subgroupSizeControl = true; - return retval; - } + // return retval; + //} virtual bool onAppTerminated() override { m_logger->log("==========Result==========", ILogger::ELL_INFO); - m_logger->log("Scan Success: %b", ILogger::ELL_INFO, scanSuccess); + m_logger->log("Scan Success: %s", ILogger::ELL_INFO, scanSuccess?"true":"false"); delete[] inputData; return true; } @@ -232,21 +270,10 @@ class ComputeScanApp final : public examples::BasicMultiQueueApplication, public } } - smart_refctd_ptr createPipeline(smart_refctd_ptr&& overridenUnspecialized) - { - auto shader = m_device->createShader(std::move(overridenUnspecialized)); - auto specialized = m_device->createSpecializedShader(shader.get(), ISpecializedShader::SInfo(nullptr, nullptr, "main")); - return m_device->createComputePipeline(nullptr, smart_refctd_ptr(pipelineLayout), std::move(specialized)); - } - - IGPUQueue* transferDownQueue; - IGPUQueue* computeQueue; - + IQueue* computeQueue; uint32_t* inputData = nullptr; smart_refctd_ptr descriptorSet; smart_refctd_ptr pipelineLayout; - - smart_refctd_ptr fence; smart_refctd_ptr cmdbuf; smart_refctd_ptr resultsBuffer; From 40ce35e6c303a4c349c6a85381ac36feba34e15a Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Mon, 22 Apr 2024 00:17:41 +0300 Subject: [PATCH 04/14] Revert "Fix compute scan example code to compile and work" This reverts commit 56074055a015fb9c50b6b474d849db5295fe7752. 
--- 14_ComputeScan/main.cpp | 175 +++++++++++++++++----------------------- 1 file changed, 74 insertions(+), 101 deletions(-) diff --git a/14_ComputeScan/main.cpp b/14_ComputeScan/main.cpp index a6a6c8929..5d3588d5d 100644 --- a/14_ComputeScan/main.cpp +++ b/14_ComputeScan/main.cpp @@ -1,5 +1,5 @@ -#include "nbl/application_templates/BasicMultiQueueApplication.hpp" -#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" +#include "../common/BasicMultiQueueApplication.hpp" +#include "../common/MonoAssetManagerAndBuiltinResourceApplication.hpp" #include #include @@ -10,10 +10,10 @@ using namespace asset; using namespace system; using namespace video; -class ComputeScanApp final : public application_templates::BasicMultiQueueApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication +class ComputeScanApp final : public examples::BasicMultiQueueApplication, public examples::MonoAssetManagerAndBuiltinResourceApplication { - using device_base_t = application_templates::BasicMultiQueueApplication; - using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; + using device_base_t = examples::BasicMultiQueueApplication; + using asset_base_t = examples::MonoAssetManagerAndBuiltinResourceApplication; public: ComputeScanApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : @@ -26,21 +26,22 @@ class ComputeScanApp final : public application_templates::BasicMultiQueueApplic if (!asset_base_t::onAppInitialized(std::move(system))) return false; + transferDownQueue = getTransferDownQueue(); computeQueue = getComputeQueue(); // Create (an almost) 128MB input buffer - constexpr auto in_size = 128u << 5u; - constexpr auto in_count = 1418;//in_size / sizeof(uint32_t) - 23u; + constexpr auto in_size = 128u << 20u; + constexpr auto in_count = in_size / sizeof(uint32_t) - 23u; m_logger->log("Input element count: %d", 
ILogger::ELL_PERFORMANCE, in_count); - inputData = new uint32_t[in_count]; + auto in = new uint32_t[in_count]; { std::random_device random_device; std::mt19937 generator(random_device()); std::uniform_int_distribution distribution(0u, ~0u); for (auto i = 0u; i < in_count; i++) - inputData[i] = distribution(generator) % 100000; + in[i] = distribution(generator); } auto minSSBOAlign = m_physicalDevice->getLimits().minSSBOAlignment; constexpr auto begin = in_count / 4 + 118; @@ -52,10 +53,10 @@ class ComputeScanApp final : public application_templates::BasicMultiQueueApplic smart_refctd_ptr gpuinputDataBuffer; { IGPUBuffer::SCreationParams inputDataBufferCreationParams = {}; - inputDataBufferCreationParams.size = sizeof(uint32_t) * in_count; // TODO Declare the element data type in the shader? + inputDataBufferCreationParams.size = sizeof(uint32_t) * elementCount; // TODO Declare the element data type in the shader? inputDataBufferCreationParams.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_SRC_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT; gpuinputDataBuffer = m_utils->createFilledDeviceLocalBufferOnDedMem( - { .queue = getTransferUpQueue() }, + getTransferUpQueue(), std::move(inputDataBufferCreationParams), inputData ); @@ -64,12 +65,13 @@ class ComputeScanApp final : public application_templates::BasicMultiQueueApplic const auto scanType = video::CScanner::EST_EXCLUSIVE; auto scanner = m_utils->getDefaultScanner(); + auto scan_pipeline = scanner->getDefaultPipeline(scanType, CScanner::EDT_UINT, CScanner::EO_ADD); CScanner::DefaultPushConstants scan_push_constants; CScanner::DispatchInfo scan_dispatch_info; scanner->buildParameters(elementCount, scan_push_constants, scan_dispatch_info); - IGPUBuffer::SCreationParams params = { scan_push_constants.scanParams.getScratchSize(), bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT }; + IGPUBuffer::SCreationParams params = { scan_push_constants.scanParams.getScratchSize(), 
bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_SRC_BIT }; SBufferRange scratch_gpu_range = {0u, params.size, m_device->createBuffer(std::move(params)) }; { auto memReqs = scratch_gpu_range.buffer->getMemoryReqs(); @@ -77,66 +79,53 @@ class ComputeScanApp final : public application_templates::BasicMultiQueueApplic auto scratchMem = m_device->allocate(memReqs, scratch_gpu_range.buffer.get()); } - auto scan_pipeline = scanner->getDefaultPipeline(scanType, CScanner::EDT_UINT, CScanner::EO_ADD, scan_push_constants.scanParams.getScratchSize()); auto dsLayout = scanner->getDefaultDescriptorSetLayout(); - auto dsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE, { &dsLayout, 1 }); + auto dsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE, &dsLayout, &dsLayout + 1u); auto ds = dsPool->createDescriptorSet(core::smart_refctd_ptr(dsLayout)); scanner->updateDescriptorSet(m_device.get(), ds.get(), in_gpu_range, scratch_gpu_range); { - smart_refctd_ptr cmdpool = m_device->createCommandPool(computeQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { &cmdbuf,1 })) + smart_refctd_ptr cmdpool = m_device->createCommandPool(computeQueue->getFamilyIndex(), IGPUCommandPool::ECF_RESET_COMMAND_BUFFER_BIT); + if (!m_device->createCommandBuffers(cmdpool.get(), IGPUCommandBuffer::EL_PRIMARY, 1u, &cmdbuf)) { logFail("Failed to create Command Buffers!\n"); return false; } } - cmdbuf->begin(IGPUCommandBuffer::USAGE::SIMULTANEOUS_USE_BIT); // (REVIEW): not sure about this - cmdbuf->fillBuffer(scratch_gpu_range, 0u); + cmdbuf->begin(IGPUCommandBuffer::EU_SIMULTANEOUS_USE_BIT); // (REVIEW): not sure about this + cmdbuf->fillBuffer(scratch_gpu_range.buffer.get(), 0u, sizeof(uint32_t) + scratch_gpu_range.size / 2u, 0u); cmdbuf->bindComputePipeline(scan_pipeline); auto pipeline_layout = scan_pipeline->getLayout(); 
cmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, pipeline_layout, 0u, 1u, &ds.get()); - scanner->dispatchHelper(cmdbuf.get(), pipeline_layout, scan_push_constants, scan_dispatch_info, 0u, nullptr, 0u, nullptr); + scanner->dispatchHelper( + cmdbuf.get(), pipeline_layout, scan_push_constants, scan_dispatch_info, + static_cast(asset::EPSF_COMPUTE_SHADER_BIT | asset::EPSF_TRANSFER_BIT), 0u, nullptr, + static_cast(asset::EPSF_COMPUTE_SHADER_BIT | asset::EPSF_TRANSFER_BIT), 0u, nullptr + ); cmdbuf->end(); - core::smart_refctd_ptr semaphore = m_device->createSemaphore(0); - // submit - IQueue::SSubmitInfo::SSemaphoreInfo semInfo[1] = { { - .semaphore = semaphore.get(), - .value = 1, - // just as we've outputted all pixels, signal - .stageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT - } }; - { - const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[1] = { { - .cmdbuf = cmdbuf.get() - } }; - - const IQueue::SSubmitInfo infos[1] = { { - .commandBuffers = commandBuffers, - .signalSemaphores = semInfo - } }; - - computeQueue->startCapture(); - if (computeQueue->submit(infos) != IQueue::RESULT::SUCCESS) { - m_logger->log("Submission failure", system::ILogger::ELL_ERROR); - } - computeQueue->endCapture(); - } + core::smart_refctd_ptr fence = m_device->createFence(IGPUFence::ECF_UNSIGNALED); + + IGPUQueue::SSubmitInfo submit = {}; + submit.commandBufferCount = 1u; + submit.commandBuffers = &cmdbuf.get(); + computeQueue->startCapture(); + computeQueue->submit(1, &submit, fence.get()); + computeQueue->endCapture(); // cpu counterpart - auto cpu_begin = inputData + begin; + auto cpu_begin = in + begin; m_logger->log("CPU scan begin", system::ILogger::ELL_PERFORMANCE); auto start = std::chrono::high_resolution_clock::now(); switch (scanType) { case video::CScanner::EST_INCLUSIVE: - std::inclusive_scan(cpu_begin, inputData + end, cpu_begin); + std::inclusive_scan(cpu_begin, in + end, cpu_begin); break; case video::CScanner::EST_EXCLUSIVE: - std::exclusive_scan(cpu_begin, 
inputData + end, cpu_begin, 0u); + std::exclusive_scan(cpu_begin, in + end, cpu_begin, 0u); break; default: assert(false); @@ -148,14 +137,7 @@ class ComputeScanApp final : public application_templates::BasicMultiQueueApplic m_logger->log("CPU scan end. Time taken: %d us", system::ILogger::ELL_PERFORMANCE, std::chrono::duration_cast(stop - start).count()); // wait for the gpu impl to complete - const ISemaphore::SWaitInfo cmdbufDonePending[] = {{ - .semaphore = semaphore.get(), - .value = 1 - }}; - if (m_device->blockForSemaphores(cmdbufDonePending) != ISemaphore::WAIT_RESULT::SUCCESS) { - m_logger->log("Blocking for operation semaphore failed", ILogger::ELL_ERROR); - return false; - } + m_device->blockForFences(1u, &fence.get()); { IGPUBuffer::SCreationParams params = {}; @@ -170,52 +152,33 @@ class ComputeScanApp final : public application_templates::BasicMultiQueueApplic // (REVIEW): Maybe we can just reset the cmdbuf we already have? core::smart_refctd_ptr cmdbuf; { - auto cmdPool = m_device->createCommandPool(computeQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::NONE); - cmdPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { &cmdbuf , 1}, core::smart_refctd_ptr(m_logger)); + auto cmdPool = m_device->createCommandPool(computeQueue->getFamilyIndex(), IGPUCommandPool::ECF_NONE); + m_device->createCommandBuffers(cmdPool.get(), IGPUCommandBuffer::EL_PRIMARY, 1u, &cmdbuf); } - cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); // TODO: Reset Frame's CommandPool - IGPUCommandBuffer::SBufferCopy region; + cmdbuf->begin(IGPUCommandBuffer::EU_ONE_TIME_SUBMIT_BIT); // TODO: Reset Frame's CommandPool + asset::SBufferCopy region; region.srcOffset = in_gpu_range.offset; region.dstOffset = 0u; region.size = in_gpu_range.size; cmdbuf->copyBuffer(in_gpu_range.buffer.get(), downloaded_buffer.get(), 1u, &region); cmdbuf->end(); - - { - const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[1] = { { - .cmdbuf = cmdbuf.get() - } }; - - 
semInfo[0].value = 2; - const IQueue::SSubmitInfo infos[1] = { { - .commandBuffers = commandBuffers, - .signalSemaphores = semInfo - } }; - - if (computeQueue->submit(infos) != IQueue::RESULT::SUCCESS) { - m_logger->log("Download submission failure", system::ILogger::ELL_ERROR); - } - - const ISemaphore::SWaitInfo cmdbufDonePending[] = { { - .semaphore = semaphore.get(), - .value = 2 - } }; - if (m_device->blockForSemaphores(cmdbufDonePending) != ISemaphore::WAIT_RESULT::SUCCESS) { - m_logger->log("Blocking for download semaphore failed", ILogger::ELL_ERROR); - return false; - } - } + fence = m_device->createFence(IGPUFence::ECF_UNSIGNALED); + IGPUQueue::SSubmitInfo submit = {}; + submit.commandBufferCount = 1u; + submit.commandBuffers = &cmdbuf.get(); + computeQueue->submit(1u, &submit, fence.get()); + m_device->blockForFences(1u, &fence.get()); } - auto mem = const_cast(downloaded_buffer->getBoundMemory().memory); + auto mem = const_cast(downloaded_buffer->getBoundMemory()); { - ILogicalDevice::MappedMemoryRange range; + video::IDeviceMemoryAllocation::MappedMemoryRange range; { range.memory = mem; range.offset = 0u; range.length = in_gpu_range.size; } - mem->map({ .offset = range.offset, .length = range.length }, video::IDeviceMemoryAllocation::EMCAF_READ); + m_device->mapMemory(range, video::IDeviceMemoryAllocation::EMCAF_READ); } auto gpu_begin = reinterpret_cast(mem->getMappedPointer()); for (auto i = 0u; i < elementCount; i++) @@ -224,32 +187,31 @@ class ComputeScanApp final : public application_templates::BasicMultiQueueApplic _NBL_DEBUG_BREAK_IF(true); } m_logger->log("Result Comparison Test Passed", system::ILogger::ELL_PERFORMANCE); - scanSuccess = true; } - delete[] inputData; + delete[] in; return true; } - //virtual video::SPhysicalDeviceFeatures getRequiredDeviceFeatures() const override - //{ - // video::SPhysicalDeviceFeatures retval = {}; + virtual video::SPhysicalDeviceFeatures getRequiredDeviceFeatures() const override + { + 
video::SPhysicalDeviceFeatures retval = {}; - // retval.bufferDeviceAddress = true; - // retval.subgroupBroadcastDynamicId = true; - // retval.shaderSubgroupExtendedTypes = true; - // // TODO: actually need to implement this and set it on the pipelines - // retval.computeFullSubgroups = true; - // retval.subgroupSizeControl = true; + retval.bufferDeviceAddress = true; + retval.subgroupBroadcastDynamicId = true; + retval.shaderSubgroupExtendedTypes = true; + // TODO: actually need to implement this and set it on the pipelines + retval.computeFullSubgroups = true; + retval.subgroupSizeControl = true; - // return retval; - //} + return retval; + } virtual bool onAppTerminated() override { m_logger->log("==========Result==========", ILogger::ELL_INFO); - m_logger->log("Scan Success: %s", ILogger::ELL_INFO, scanSuccess?"true":"false"); + m_logger->log("Scan Success: %b", ILogger::ELL_INFO, scanSuccess); delete[] inputData; return true; } @@ -270,10 +232,21 @@ class ComputeScanApp final : public application_templates::BasicMultiQueueApplic } } - IQueue* computeQueue; + smart_refctd_ptr createPipeline(smart_refctd_ptr&& overridenUnspecialized) + { + auto shader = m_device->createShader(std::move(overridenUnspecialized)); + auto specialized = m_device->createSpecializedShader(shader.get(), ISpecializedShader::SInfo(nullptr, nullptr, "main")); + return m_device->createComputePipeline(nullptr, smart_refctd_ptr(pipelineLayout), std::move(specialized)); + } + + IGPUQueue* transferDownQueue; + IGPUQueue* computeQueue; + uint32_t* inputData = nullptr; smart_refctd_ptr descriptorSet; smart_refctd_ptr pipelineLayout; + + smart_refctd_ptr fence; smart_refctd_ptr cmdbuf; smart_refctd_ptr resultsBuffer; From 46d2e3f765aaf0c61ae98131ad3b101f5ed969c7 Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Mon, 22 Apr 2024 00:20:10 +0300 Subject: [PATCH 05/14] Fix compute scan example code to compile and work --- 14_ComputeScan/main.cpp | 179 +++++++++++++++++++++++----------------- 
1 file changed, 104 insertions(+), 75 deletions(-) diff --git a/14_ComputeScan/main.cpp b/14_ComputeScan/main.cpp index 59ff420d9..a6a6c8929 100644 --- a/14_ComputeScan/main.cpp +++ b/14_ComputeScan/main.cpp @@ -1,5 +1,5 @@ -#include "../common/BasicMultiQueueApplication.hpp" -#include "../common/MonoAssetManagerAndBuiltinResourceApplication.hpp" +#include "nbl/application_templates/BasicMultiQueueApplication.hpp" +#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" #include #include @@ -10,10 +10,10 @@ using namespace asset; using namespace system; using namespace video; -class ComputeScanApp final : public examples::BasicMultiQueueApplication, public examples::MonoAssetManagerAndBuiltinResourceApplication +class ComputeScanApp final : public application_templates::BasicMultiQueueApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication { - using device_base_t = examples::BasicMultiQueueApplication; - using asset_base_t = examples::MonoAssetManagerAndBuiltinResourceApplication; + using device_base_t = application_templates::BasicMultiQueueApplication; + using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; public: ComputeScanApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : @@ -26,22 +26,21 @@ class ComputeScanApp final : public examples::BasicMultiQueueApplication, public if (!asset_base_t::onAppInitialized(std::move(system))) return false; - transferDownQueue = getTransferDownQueue(); computeQueue = getComputeQueue(); // Create (an almost) 128MB input buffer - constexpr auto in_size = 128u << 20u; - constexpr auto in_count = in_size / sizeof(uint32_t) - 23u; + constexpr auto in_size = 128u << 5u; + constexpr auto in_count = 1418;//in_size / sizeof(uint32_t) - 23u; m_logger->log("Input element count: %d", ILogger::ELL_PERFORMANCE, in_count); - auto in = new uint32_t[in_count]; + inputData = 
new uint32_t[in_count]; { std::random_device random_device; std::mt19937 generator(random_device()); std::uniform_int_distribution distribution(0u, ~0u); for (auto i = 0u; i < in_count; i++) - in[i] = distribution(generator); + inputData[i] = distribution(generator) % 100000; } auto minSSBOAlign = m_physicalDevice->getLimits().minSSBOAlignment; constexpr auto begin = in_count / 4 + 118; @@ -53,10 +52,10 @@ class ComputeScanApp final : public examples::BasicMultiQueueApplication, public smart_refctd_ptr gpuinputDataBuffer; { IGPUBuffer::SCreationParams inputDataBufferCreationParams = {}; - inputDataBufferCreationParams.size = sizeof(uint32_t) * elementCount; // TODO Declare the element data type in the shader? + inputDataBufferCreationParams.size = sizeof(uint32_t) * in_count; // TODO Declare the element data type in the shader? inputDataBufferCreationParams.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_SRC_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT; gpuinputDataBuffer = m_utils->createFilledDeviceLocalBufferOnDedMem( - getTransferUpQueue(), + { .queue = getTransferUpQueue() }, std::move(inputDataBufferCreationParams), inputData ); @@ -65,13 +64,12 @@ class ComputeScanApp final : public examples::BasicMultiQueueApplication, public const auto scanType = video::CScanner::EST_EXCLUSIVE; auto scanner = m_utils->getDefaultScanner(); - auto scan_pipeline = scanner->getDefaultPipeline(scanType, CScanner::EDT_UINT, CScanner::EO_ADD); CScanner::DefaultPushConstants scan_push_constants; CScanner::DispatchInfo scan_dispatch_info; scanner->buildParameters(elementCount, scan_push_constants, scan_dispatch_info); - IGPUBuffer::SCreationParams params = { scan_push_constants.scanParams.getScratchSize(), bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_SRC_BIT }; + IGPUBuffer::SCreationParams params = { scan_push_constants.scanParams.getScratchSize(), bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT }; 
SBufferRange scratch_gpu_range = {0u, params.size, m_device->createBuffer(std::move(params)) }; { auto memReqs = scratch_gpu_range.buffer->getMemoryReqs(); @@ -79,53 +77,66 @@ class ComputeScanApp final : public examples::BasicMultiQueueApplication, public auto scratchMem = m_device->allocate(memReqs, scratch_gpu_range.buffer.get()); } + auto scan_pipeline = scanner->getDefaultPipeline(scanType, CScanner::EDT_UINT, CScanner::EO_ADD, scan_push_constants.scanParams.getScratchSize()); auto dsLayout = scanner->getDefaultDescriptorSetLayout(); - auto dsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE, &dsLayout, &dsLayout + 1u); + auto dsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE, { &dsLayout, 1 }); auto ds = dsPool->createDescriptorSet(core::smart_refctd_ptr(dsLayout)); scanner->updateDescriptorSet(m_device.get(), ds.get(), in_gpu_range, scratch_gpu_range); { - smart_refctd_ptr cmdpool = m_device->createCommandPool(computeQueue->getFamilyIndex(), IGPUCommandPool::ECF_RESET_COMMAND_BUFFER_BIT); - if (!m_device->createCommandBuffers(cmdpool.get(), IGPUCommandBuffer::EL_PRIMARY, 1u, &cmdbuf)) + smart_refctd_ptr cmdpool = m_device->createCommandPool(computeQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { &cmdbuf,1 })) { logFail("Failed to create Command Buffers!\n"); return false; } } - cmdbuf->begin(IGPUCommandBuffer::EU_SIMULTANEOUS_USE_BIT); // (REVIEW): not sure about this - cmdbuf->fillBuffer(scratch_gpu_range.buffer.get(), 0u, sizeof(uint32_t) + scratch_gpu_range.size / 2u, 0u); + cmdbuf->begin(IGPUCommandBuffer::USAGE::SIMULTANEOUS_USE_BIT); // (REVIEW): not sure about this + cmdbuf->fillBuffer(scratch_gpu_range, 0u); cmdbuf->bindComputePipeline(scan_pipeline); auto pipeline_layout = scan_pipeline->getLayout(); cmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, pipeline_layout, 0u, 1u, &ds.get()); - 
scanner->dispatchHelper( - cmdbuf.get(), pipeline_layout, scan_push_constants, scan_dispatch_info, - static_cast(asset::EPSF_COMPUTE_SHADER_BIT | asset::EPSF_TRANSFER_BIT), 0u, nullptr, - static_cast(asset::EPSF_COMPUTE_SHADER_BIT | asset::EPSF_TRANSFER_BIT), 0u, nullptr - ); + scanner->dispatchHelper(cmdbuf.get(), pipeline_layout, scan_push_constants, scan_dispatch_info, 0u, nullptr, 0u, nullptr); cmdbuf->end(); - core::smart_refctd_ptr fence = m_device->createFence(IGPUFence::ECF_UNSIGNALED); - - IGPUQueue::SSubmitInfo submit = {}; - submit.commandBufferCount = 1u; - submit.commandBuffers = &cmdbuf.get(); - computeQueue->startCapture(); - computeQueue->submit(1, &submit, fence.get()); - computeQueue->endCapture(); + core::smart_refctd_ptr semaphore = m_device->createSemaphore(0); + // submit + IQueue::SSubmitInfo::SSemaphoreInfo semInfo[1] = { { + .semaphore = semaphore.get(), + .value = 1, + // just as we've outputted all pixels, signal + .stageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT + } }; + { + const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[1] = { { + .cmdbuf = cmdbuf.get() + } }; + + const IQueue::SSubmitInfo infos[1] = { { + .commandBuffers = commandBuffers, + .signalSemaphores = semInfo + } }; + + computeQueue->startCapture(); + if (computeQueue->submit(infos) != IQueue::RESULT::SUCCESS) { + m_logger->log("Submission failure", system::ILogger::ELL_ERROR); + } + computeQueue->endCapture(); + } // cpu counterpart - auto cpu_begin = in + begin; + auto cpu_begin = inputData + begin; m_logger->log("CPU scan begin", system::ILogger::ELL_PERFORMANCE); auto start = std::chrono::high_resolution_clock::now(); switch (scanType) { case video::CScanner::EST_INCLUSIVE: - std::inclusive_scan(cpu_begin, in + end, cpu_begin); + std::inclusive_scan(cpu_begin, inputData + end, cpu_begin); break; case video::CScanner::EST_EXCLUSIVE: - std::exclusive_scan(cpu_begin, in + end, cpu_begin, 0u); + std::exclusive_scan(cpu_begin, inputData + end, cpu_begin, 0u); 
break; default: assert(false); @@ -137,7 +148,14 @@ class ComputeScanApp final : public examples::BasicMultiQueueApplication, public m_logger->log("CPU scan end. Time taken: %d us", system::ILogger::ELL_PERFORMANCE, std::chrono::duration_cast(stop - start).count()); // wait for the gpu impl to complete - m_device->blockForFences(1u, &fence.get()); + const ISemaphore::SWaitInfo cmdbufDonePending[] = {{ + .semaphore = semaphore.get(), + .value = 1 + }}; + if (m_device->blockForSemaphores(cmdbufDonePending) != ISemaphore::WAIT_RESULT::SUCCESS) { + m_logger->log("Blocking for operation semaphore failed", ILogger::ELL_ERROR); + return false; + } { IGPUBuffer::SCreationParams params = {}; @@ -152,33 +170,52 @@ class ComputeScanApp final : public examples::BasicMultiQueueApplication, public // (REVIEW): Maybe we can just reset the cmdbuf we already have? core::smart_refctd_ptr cmdbuf; { - auto cmdPool = m_device->createCommandPool(computeQueue->getFamilyIndex(), IGPUCommandPool::ECF_NONE); - m_device->createCommandBuffers(cmdPool.get(), IGPUCommandBuffer::EL_PRIMARY, 1u, &cmdbuf); + auto cmdPool = m_device->createCommandPool(computeQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::NONE); + cmdPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { &cmdbuf , 1}, core::smart_refctd_ptr(m_logger)); } - cmdbuf->begin(IGPUCommandBuffer::EU_ONE_TIME_SUBMIT_BIT); // TODO: Reset Frame's CommandPool - asset::SBufferCopy region; + cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); // TODO: Reset Frame's CommandPool + IGPUCommandBuffer::SBufferCopy region; region.srcOffset = in_gpu_range.offset; region.dstOffset = 0u; region.size = in_gpu_range.size; cmdbuf->copyBuffer(in_gpu_range.buffer.get(), downloaded_buffer.get(), 1u, &region); cmdbuf->end(); - fence = m_device->createFence(IGPUFence::ECF_UNSIGNALED); - IGPUQueue::SSubmitInfo submit = {}; - submit.commandBufferCount = 1u; - submit.commandBuffers = &cmdbuf.get(); - computeQueue->submit(1u, &submit, 
fence.get()); - m_device->blockForFences(1u, &fence.get()); + + { + const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[1] = { { + .cmdbuf = cmdbuf.get() + } }; + + semInfo[0].value = 2; + const IQueue::SSubmitInfo infos[1] = { { + .commandBuffers = commandBuffers, + .signalSemaphores = semInfo + } }; + + if (computeQueue->submit(infos) != IQueue::RESULT::SUCCESS) { + m_logger->log("Download submission failure", system::ILogger::ELL_ERROR); + } + + const ISemaphore::SWaitInfo cmdbufDonePending[] = { { + .semaphore = semaphore.get(), + .value = 2 + } }; + if (m_device->blockForSemaphores(cmdbufDonePending) != ISemaphore::WAIT_RESULT::SUCCESS) { + m_logger->log("Blocking for download semaphore failed", ILogger::ELL_ERROR); + return false; + } + } } - auto mem = const_cast(downloaded_buffer->getBoundMemory()); + auto mem = const_cast(downloaded_buffer->getBoundMemory().memory); { - video::IDeviceMemoryAllocation::MappedMemoryRange range; + ILogicalDevice::MappedMemoryRange range; { range.memory = mem; range.offset = 0u; range.length = in_gpu_range.size; } - m_device->mapMemory(range, video::IDeviceMemoryAllocation::EMCAF_READ); + mem->map({ .offset = range.offset, .length = range.length }, video::IDeviceMemoryAllocation::EMCAF_READ); } auto gpu_begin = reinterpret_cast(mem->getMappedPointer()); for (auto i = 0u; i < elementCount; i++) @@ -187,31 +224,32 @@ class ComputeScanApp final : public examples::BasicMultiQueueApplication, public _NBL_DEBUG_BREAK_IF(true); } m_logger->log("Result Comparison Test Passed", system::ILogger::ELL_PERFORMANCE); + scanSuccess = true; } - delete[] in; + delete[] inputData; return true; } - virtual video::SPhysicalDeviceFeatures getRequiredDeviceFeatures() const override - { - video::SPhysicalDeviceFeatures retval = {}; + //virtual video::SPhysicalDeviceFeatures getRequiredDeviceFeatures() const override + //{ + // video::SPhysicalDeviceFeatures retval = {}; - retval.bufferDeviceAddress = true; - 
retval.subgroupBroadcastDynamicId = true; - retval.shaderSubgroupExtendedTypes = true; - // TODO: actually need to implement this and set it on the pipelines - retval.computeFullSubgroups = true; - retval.subgroupSizeControl = true; + // retval.bufferDeviceAddress = true; + // retval.subgroupBroadcastDynamicId = true; + // retval.shaderSubgroupExtendedTypes = true; + // // TODO: actually need to implement this and set it on the pipelines + // retval.computeFullSubgroups = true; + // retval.subgroupSizeControl = true; - return retval; - } + // return retval; + //} virtual bool onAppTerminated() override { m_logger->log("==========Result==========", ILogger::ELL_INFO); - m_logger->log("Scan Success: %b", ILogger::ELL_INFO, scanSuccess); + m_logger->log("Scan Success: %s", ILogger::ELL_INFO, scanSuccess?"true":"false"); delete[] inputData; return true; } @@ -232,23 +270,14 @@ class ComputeScanApp final : public examples::BasicMultiQueueApplication, public } } - smart_refctd_ptr createPipeline(smart_refctd_ptr&& overridenUnspecialized) - { - auto shader = m_device->createShader(std::move(overridenUnspecialized)); - auto specialized = m_device->createSpecializedShader(shader.get(), ISpecializedShader::SInfo(nullptr, nullptr, "main")); - return m_device->createComputePipeline(nullptr, smart_refctd_ptr(pipelineLayout), std::move(specialized)); - } - - IGPUQueue* transferDownQueue; - IGPUQueue* computeQueue; - + IQueue* computeQueue; uint32_t* inputData = nullptr; smart_refctd_ptr descriptorSet; smart_refctd_ptr pipelineLayout; - - smart_refctd_ptr fence; smart_refctd_ptr cmdbuf; smart_refctd_ptr resultsBuffer; bool scanSuccess = false; -}; \ No newline at end of file +}; + +NBL_MAIN_FUNC(ComputeScanApp) \ No newline at end of file From 1b5c1f077d8b357333a7c82e00aad00ee362abe8 Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Sat, 4 May 2024 17:50:50 +0300 Subject: [PATCH 06/14] Change example to wait for filled buffer creation using semaphore --- 
14_ComputeScan/main.cpp | 47 ++++++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/14_ComputeScan/main.cpp b/14_ComputeScan/main.cpp index a6a6c8929..d7d11798f 100644 --- a/14_ComputeScan/main.cpp +++ b/14_ComputeScan/main.cpp @@ -30,7 +30,7 @@ class ComputeScanApp final : public application_templates::BasicMultiQueueApplic // Create (an almost) 128MB input buffer constexpr auto in_size = 128u << 5u; - constexpr auto in_count = 1418;//in_size / sizeof(uint32_t) - 23u; + constexpr auto in_count = in_size / sizeof(uint32_t) - 23u; m_logger->log("Input element count: %d", ILogger::ELL_PERFORMANCE, in_count); @@ -49,16 +49,35 @@ class ComputeScanApp final : public application_templates::BasicMultiQueueApplic assert(((end * sizeof(uint32_t)) & (minSSBOAlign - 1u)) == 0u); constexpr auto elementCount = end - begin; + // Set Semaphores to control GPU synchronization + core::smart_refctd_ptr semaphore = m_device->createSemaphore(0); + IQueue::SSubmitInfo::SSemaphoreInfo semInfo[1] = { { + .semaphore = semaphore.get(), + .value = 1, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS + } }; + smart_refctd_ptr gpuinputDataBuffer; { IGPUBuffer::SCreationParams inputDataBufferCreationParams = {}; inputDataBufferCreationParams.size = sizeof(uint32_t) * in_count; // TODO Declare the element data type in the shader? 
inputDataBufferCreationParams.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_SRC_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT; - gpuinputDataBuffer = m_utils->createFilledDeviceLocalBufferOnDedMem( - { .queue = getTransferUpQueue() }, + auto temp = m_utils->createFilledDeviceLocalBufferOnDedMem( + SIntendedSubmitInfo{ .queue = getTransferUpQueue() }, std::move(inputDataBufferCreationParams), - inputData + inputData, + { semInfo, 1 } ); + + const ISemaphore::SWaitInfo semWaitInfo[] = { { + .semaphore = semaphore.get(), + .value = 1 + } }; + if (m_device->blockForSemaphores(semWaitInfo) != ISemaphore::WAIT_RESULT::SUCCESS) { + m_logger->log("Blocking for operation semaphore failed during input data buffer creation", ILogger::ELL_ERROR); + return false; + } + gpuinputDataBuffer = *temp.get(); } SBufferRange in_gpu_range = { begin * sizeof(uint32_t), elementCount * sizeof(uint32_t), gpuinputDataBuffer }; @@ -77,7 +96,7 @@ class ComputeScanApp final : public application_templates::BasicMultiQueueApplic auto scratchMem = m_device->allocate(memReqs, scratch_gpu_range.buffer.get()); } - auto scan_pipeline = scanner->getDefaultPipeline(scanType, CScanner::EDT_UINT, CScanner::EO_ADD, scan_push_constants.scanParams.getScratchSize()); + auto scan_pipeline = scanner->getDefaultPipeline(scanType, CScanner::EDT_UINT, CScanner::EO_ADD, params.size); auto dsLayout = scanner->getDefaultDescriptorSetLayout(); auto dsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE, { &dsLayout, 1 }); auto ds = dsPool->createDescriptorSet(core::smart_refctd_ptr(dsLayout)); @@ -100,15 +119,9 @@ class ComputeScanApp final : public application_templates::BasicMultiQueueApplic scanner->dispatchHelper(cmdbuf.get(), pipeline_layout, scan_push_constants, scan_dispatch_info, 0u, nullptr, 0u, nullptr); cmdbuf->end(); - core::smart_refctd_ptr semaphore = m_device->createSemaphore(0); - // submit - IQueue::SSubmitInfo::SSemaphoreInfo semInfo[1] = { { - .semaphore = 
semaphore.get(), - .value = 1, - // just as we've outputted all pixels, signal - .stageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT - } }; { + semInfo[0].value = 2; + semInfo[0].stageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[1] = { { .cmdbuf = cmdbuf.get() } }; @@ -150,7 +163,7 @@ class ComputeScanApp final : public application_templates::BasicMultiQueueApplic // wait for the gpu impl to complete const ISemaphore::SWaitInfo cmdbufDonePending[] = {{ .semaphore = semaphore.get(), - .value = 1 + .value = 2 }}; if (m_device->blockForSemaphores(cmdbufDonePending) != ISemaphore::WAIT_RESULT::SUCCESS) { m_logger->log("Blocking for operation semaphore failed", ILogger::ELL_ERROR); @@ -186,7 +199,7 @@ class ComputeScanApp final : public application_templates::BasicMultiQueueApplic .cmdbuf = cmdbuf.get() } }; - semInfo[0].value = 2; + semInfo[0].value = 3; const IQueue::SSubmitInfo infos[1] = { { .commandBuffers = commandBuffers, .signalSemaphores = semInfo @@ -198,7 +211,7 @@ class ComputeScanApp final : public application_templates::BasicMultiQueueApplic const ISemaphore::SWaitInfo cmdbufDonePending[] = { { .semaphore = semaphore.get(), - .value = 2 + .value = 3 } }; if (m_device->blockForSemaphores(cmdbufDonePending) != ISemaphore::WAIT_RESULT::SUCCESS) { m_logger->log("Blocking for download semaphore failed", ILogger::ELL_ERROR); @@ -280,4 +293,4 @@ class ComputeScanApp final : public application_templates::BasicMultiQueueApplic bool scanSuccess = false; }; -NBL_MAIN_FUNC(ComputeScanApp) \ No newline at end of file +NBL_MAIN_FUNC(ComputeScanApp) From dbb7e7332641d7d0474092f4a6116a389911e7c1 Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Mon, 10 Jun 2024 02:00:55 +0300 Subject: [PATCH 07/14] Rename ComputeScan to ComputeReduce --- .../CMakeLists.txt | 0 .../config.json.template | 0 {14_ComputeScan => 14_ComputeReduce}/main.cpp | 57 +++++++------------ .../pipeline.groovy | 6 +- 4 files 
changed, 24 insertions(+), 39 deletions(-) rename {14_ComputeScan => 14_ComputeReduce}/CMakeLists.txt (100%) rename {14_ComputeScan => 14_ComputeReduce}/config.json.template (100%) rename {14_ComputeScan => 14_ComputeReduce}/main.cpp (84%) rename {14_ComputeScan => 14_ComputeReduce}/pipeline.groovy (86%) diff --git a/14_ComputeScan/CMakeLists.txt b/14_ComputeReduce/CMakeLists.txt similarity index 100% rename from 14_ComputeScan/CMakeLists.txt rename to 14_ComputeReduce/CMakeLists.txt diff --git a/14_ComputeScan/config.json.template b/14_ComputeReduce/config.json.template similarity index 100% rename from 14_ComputeScan/config.json.template rename to 14_ComputeReduce/config.json.template diff --git a/14_ComputeScan/main.cpp b/14_ComputeReduce/main.cpp similarity index 84% rename from 14_ComputeScan/main.cpp rename to 14_ComputeReduce/main.cpp index d7d11798f..62c652c81 100644 --- a/14_ComputeScan/main.cpp +++ b/14_ComputeReduce/main.cpp @@ -29,7 +29,7 @@ class ComputeScanApp final : public application_templates::BasicMultiQueueApplic computeQueue = getComputeQueue(); // Create (an almost) 128MB input buffer - constexpr auto in_size = 128u << 5u; + constexpr auto in_size = 128u << 10u; constexpr auto in_count = in_size / sizeof(uint32_t) - 23u; m_logger->log("Input element count: %d", ILogger::ELL_PERFORMANCE, in_count); @@ -81,14 +81,13 @@ class ComputeScanApp final : public application_templates::BasicMultiQueueApplic } SBufferRange in_gpu_range = { begin * sizeof(uint32_t), elementCount * sizeof(uint32_t), gpuinputDataBuffer }; - const auto scanType = video::CScanner::EST_EXCLUSIVE; - auto scanner = m_utils->getDefaultScanner(); + auto reducer = m_utils->getDefaultReducer(); - CScanner::DefaultPushConstants scan_push_constants; - CScanner::DispatchInfo scan_dispatch_info; - scanner->buildParameters(elementCount, scan_push_constants, scan_dispatch_info); + CArithmeticOps::DefaultPushConstants reduce_push_constants; + CArithmeticOps::DispatchInfo 
reduce_dispatch_info; + reducer->buildParameters(elementCount, reduce_push_constants, reduce_dispatch_info); - IGPUBuffer::SCreationParams params = { scan_push_constants.scanParams.getScratchSize(), bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT }; + IGPUBuffer::SCreationParams params = { reduce_push_constants.scanParams.getScratchSize(), bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT }; SBufferRange scratch_gpu_range = {0u, params.size, m_device->createBuffer(std::move(params)) }; { auto memReqs = scratch_gpu_range.buffer->getMemoryReqs(); @@ -96,11 +95,11 @@ class ComputeScanApp final : public application_templates::BasicMultiQueueApplic auto scratchMem = m_device->allocate(memReqs, scratch_gpu_range.buffer.get()); } - auto scan_pipeline = scanner->getDefaultPipeline(scanType, CScanner::EDT_UINT, CScanner::EO_ADD, params.size); - auto dsLayout = scanner->getDefaultDescriptorSetLayout(); + auto reduce_pipeline = reducer->getDefaultPipeline(CArithmeticOps::EDT_UINT, CArithmeticOps::EO_ADD, params.size); // TODO: Update to test all operations + auto dsLayout = reducer->getDefaultDescriptorSetLayout(); auto dsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE, { &dsLayout, 1 }); auto ds = dsPool->createDescriptorSet(core::smart_refctd_ptr(dsLayout)); - scanner->updateDescriptorSet(m_device.get(), ds.get(), in_gpu_range, scratch_gpu_range); + reducer->updateDescriptorSet(m_device.get(), ds.get(), in_gpu_range, scratch_gpu_range); { smart_refctd_ptr cmdpool = m_device->createCommandPool(computeQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); @@ -113,10 +112,10 @@ class ComputeScanApp final : public application_templates::BasicMultiQueueApplic cmdbuf->begin(IGPUCommandBuffer::USAGE::SIMULTANEOUS_USE_BIT); // (REVIEW): not sure about this cmdbuf->fillBuffer(scratch_gpu_range, 0u); - cmdbuf->bindComputePipeline(scan_pipeline); - auto pipeline_layout = 
scan_pipeline->getLayout(); + cmdbuf->bindComputePipeline(reduce_pipeline); + auto pipeline_layout = reduce_pipeline->getLayout(); cmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, pipeline_layout, 0u, 1u, &ds.get()); - scanner->dispatchHelper(cmdbuf.get(), pipeline_layout, scan_push_constants, scan_dispatch_info, 0u, nullptr, 0u, nullptr); + reducer->dispatchHelper(cmdbuf.get(), pipeline_layout, reduce_push_constants, reduce_dispatch_info, 0u, nullptr, 0u, nullptr); cmdbuf->end(); { @@ -138,27 +137,16 @@ class ComputeScanApp final : public application_templates::BasicMultiQueueApplic computeQueue->endCapture(); } + // TODO: Update to support all operations // cpu counterpart auto cpu_begin = inputData + begin; - m_logger->log("CPU scan begin", system::ILogger::ELL_PERFORMANCE); + m_logger->log("CPU reduce begin", system::ILogger::ELL_PERFORMANCE); auto start = std::chrono::high_resolution_clock::now(); - switch (scanType) - { - case video::CScanner::EST_INCLUSIVE: - std::inclusive_scan(cpu_begin, inputData + end, cpu_begin); - break; - case video::CScanner::EST_EXCLUSIVE: - std::exclusive_scan(cpu_begin, inputData + end, cpu_begin, 0u); - break; - default: - assert(false); - exit(0xdeadbeefu); - break; - } + auto result = std::reduce(cpu_begin, inputData + end, 0u); auto stop = std::chrono::high_resolution_clock::now(); - m_logger->log("CPU scan end. Time taken: %d us", system::ILogger::ELL_PERFORMANCE, std::chrono::duration_cast(stop - start).count()); + m_logger->log("CPU reduce end. 
Time taken: %d us", system::ILogger::ELL_PERFORMANCE, std::chrono::duration_cast(stop - start).count()); // wait for the gpu impl to complete const ISemaphore::SWaitInfo cmdbufDonePending[] = {{ @@ -231,13 +219,10 @@ class ComputeScanApp final : public application_templates::BasicMultiQueueApplic mem->map({ .offset = range.offset, .length = range.length }, video::IDeviceMemoryAllocation::EMCAF_READ); } auto gpu_begin = reinterpret_cast(mem->getMappedPointer()); - for (auto i = 0u; i < elementCount; i++) - { - if (gpu_begin[i] != cpu_begin[i]) - _NBL_DEBUG_BREAK_IF(true); - } + if (gpu_begin[0] != result) + _NBL_DEBUG_BREAK_IF(true); m_logger->log("Result Comparison Test Passed", system::ILogger::ELL_PERFORMANCE); - scanSuccess = true; + operationSuccess = true; } delete[] inputData; @@ -262,7 +247,7 @@ class ComputeScanApp final : public application_templates::BasicMultiQueueApplic virtual bool onAppTerminated() override { m_logger->log("==========Result==========", ILogger::ELL_INFO); - m_logger->log("Scan Success: %s", ILogger::ELL_INFO, scanSuccess?"true":"false"); + m_logger->log("Operation Success: %s", ILogger::ELL_INFO, operationSuccess ?"true":"false"); delete[] inputData; return true; } @@ -290,7 +275,7 @@ class ComputeScanApp final : public application_templates::BasicMultiQueueApplic smart_refctd_ptr cmdbuf; smart_refctd_ptr resultsBuffer; - bool scanSuccess = false; + bool operationSuccess = false; }; NBL_MAIN_FUNC(ComputeScanApp) diff --git a/14_ComputeScan/pipeline.groovy b/14_ComputeReduce/pipeline.groovy similarity index 86% rename from 14_ComputeScan/pipeline.groovy rename to 14_ComputeReduce/pipeline.groovy index 4eaaafe23..ffcf2f199 100644 --- a/14_ComputeScan/pipeline.groovy +++ b/14_ComputeReduce/pipeline.groovy @@ -2,9 +2,9 @@ import org.DevshGraphicsProgramming.Agent import org.DevshGraphicsProgramming.BuilderInfo import org.DevshGraphicsProgramming.IBuilder -class CComputeScanBuilder extends IBuilder +class CComputeReduceBuilder extends 
IBuilder { - public CComputeScanBuilder(Agent _agent, _info) + public CComputeReduceBuilder(Agent _agent, _info) { super(_agent, _info) } @@ -44,7 +44,7 @@ class CComputeScanBuilder extends IBuilder def create(Agent _agent, _info) { - return new CComputeScanBuilder(_agent, _info) + return new CComputeReduceBuilder(_agent, _info) } return this \ No newline at end of file From 7d93bd1437607dbd86a1a43d1952b02507a723d1 Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Mon, 17 Jun 2024 11:13:07 +0300 Subject: [PATCH 08/14] Change example 14 to Global Reduce instead of Global Scan --- 14_ComputeReduce/main.cpp | 15 ++++++++------- CMakeLists.txt | 2 +- Readme.md | 2 +- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/14_ComputeReduce/main.cpp b/14_ComputeReduce/main.cpp index 62c652c81..9bc2de751 100644 --- a/14_ComputeReduce/main.cpp +++ b/14_ComputeReduce/main.cpp @@ -10,13 +10,13 @@ using namespace asset; using namespace system; using namespace video; -class ComputeScanApp final : public application_templates::BasicMultiQueueApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication +class ComputeReduceApp final : public application_templates::BasicMultiQueueApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication { using device_base_t = application_templates::BasicMultiQueueApplication; using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; public: - ComputeScanApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : + ComputeReduceApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} bool onAppInitialized(smart_refctd_ptr&& system) override @@ -29,8 +29,8 @@ class ComputeScanApp final : public 
application_templates::BasicMultiQueueApplic computeQueue = getComputeQueue(); // Create (an almost) 128MB input buffer - constexpr auto in_size = 128u << 10u; - constexpr auto in_count = in_size / sizeof(uint32_t) - 23u; + constexpr auto in_size = 128u << 20u; + constexpr auto in_count = in_size / sizeof(uint32_t) - 24u; m_logger->log("Input element count: %d", ILogger::ELL_PERFORMANCE, in_count); @@ -40,7 +40,7 @@ class ComputeScanApp final : public application_templates::BasicMultiQueueApplic std::mt19937 generator(random_device()); std::uniform_int_distribution distribution(0u, ~0u); for (auto i = 0u; i < in_count; i++) - inputData[i] = distribution(generator) % 100000; + inputData[i] = 1u;//distribution(generator) % 128; } auto minSSBOAlign = m_physicalDevice->getLimits().minSSBOAlignment; constexpr auto begin = in_count / 4 + 118; @@ -147,7 +147,7 @@ class ComputeScanApp final : public application_templates::BasicMultiQueueApplic auto stop = std::chrono::high_resolution_clock::now(); m_logger->log("CPU reduce end. 
Time taken: %d us", system::ILogger::ELL_PERFORMANCE, std::chrono::duration_cast(stop - start).count()); - + m_logger->log("Host result %d", system::ILogger::ELL_INFO, result); // wait for the gpu impl to complete const ISemaphore::SWaitInfo cmdbufDonePending[] = {{ .semaphore = semaphore.get(), @@ -219,6 +219,7 @@ class ComputeScanApp final : public application_templates::BasicMultiQueueApplic mem->map({ .offset = range.offset, .length = range.length }, video::IDeviceMemoryAllocation::EMCAF_READ); } auto gpu_begin = reinterpret_cast(mem->getMappedPointer()); + m_logger->log("Device result %d", system::ILogger::ELL_INFO, gpu_begin[0]); if (gpu_begin[0] != result) _NBL_DEBUG_BREAK_IF(true); m_logger->log("Result Comparison Test Passed", system::ILogger::ELL_PERFORMANCE); @@ -278,4 +279,4 @@ class ComputeScanApp final : public application_templates::BasicMultiQueueApplic bool operationSuccess = false; }; -NBL_MAIN_FUNC(ComputeScanApp) +NBL_MAIN_FUNC(ComputeReduceApp) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9cd82cdc0..ab063bd20 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -27,7 +27,7 @@ if(NBL_BUILD_EXAMPLES) # showcase the set-up of a swapchain and picking of a matching device add_subdirectory(08_HelloSwapchain EXCLUDE_FROM_ALL) # global scan - add_subdirectory(14_ComputeScan EXCLUDE_FROM_ALL) + add_subdirectory(14_ComputeReduce EXCLUDE_FROM_ALL) # showcase the use of a depth buffer and rudimentary camera add_subdirectory(09_DepthBufferAndCamera EXCLUDE_FROM_ALL) # demonstrate the counting sort utility diff --git a/Readme.md b/Readme.md index 8d124b33b..95e358433 100644 --- a/Readme.md +++ b/Readme.md @@ -31,7 +31,7 @@ Whenever CMake generates separate makefiles/solutions/projects, they will be gen | 11_LoDSystem | ![][11_MSVC_Release] | ![][11_MSVC_RWDI] | ![][11_MSVC_Debug] | ![][11_Android_Release] | ![][11_Android_RWDI] | ![][11_Android_Debug] | ![][B] | ![][S] | ![][S] | | | 12_glTF | ![][12_MSVC_Release] | ![][12_MSVC_RWDI] | 
![][12_MSVC_Debug] | ![][12_Android_Release] | ![][12_Android_RWDI] | ![][12_Android_Debug] | ![][W] | ![][W] | ![][W] | COMPILE_WITH_GLTF_LOADER | | 13. | ![][13_MSVC_Release] | ![][13_MSVC_RWDI] | ![][13_MSVC_Debug] | ![][13_Android_Release] | ![][13_Android_RWDI] | ![][13_Android_Debug] | ![][NA] | ![][NA] | ![][NA] | | -| 14_ComputeScan | ![][14_MSVC_Release] | ![][14_MSVC_RWDI] | ![][14_MSVC_Debug] | ![][14_Android_Release] | ![][14_Android_RWDI] | ![][14_Android_Debug] | ![][B] | ![][S] | ![][S] | | +| 14_ComputeReduce | ![][14_MSVC_Release] | ![][14_MSVC_RWDI] | ![][14_MSVC_Debug] | ![][14_Android_Release] | ![][14_Android_RWDI] | ![][14_Android_Debug] | ![][B] | ![][S] | ![][S] | | | 15. | ![][15_MSVC_Release] | ![][15_MSVC_RWDI] | ![][15_MSVC_Debug] | ![][15_Android_Release] | ![][15_Android_RWDI] | ![][15_Android_Debug] | ![][NA] | ![][NA] | ![][NA] | | | 16_OrderIndependentTransparency | ![][16_MSVC_Release] | ![][16_MSVC_RWDI] | ![][16_MSVC_Debug] | ![][16_Android_Release] | ![][16_Android_RWDI] | ![][16_Android_Debug] | ![][B] | ![][S] | ![][S] | | | 17_SimpleBulletIntegration | ![][17_MSVC_Release] | ![][17_MSVC_RWDI] | ![][17_MSVC_Debug] | ![][17_Android_Release] | ![][17_Android_RWDI] | ![][17_Android_Debug] | ![][B] | ![][S] | ![][N] | BUILD_BULLET | From 26e5133666eb23a2da3be97a907b8892f7323eb8 Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Sun, 7 Jul 2024 17:21:00 +0300 Subject: [PATCH 09/14] Fix results fetching for test success assertion --- 14_ComputeReduce/main.cpp | 130 ++++++++++++++++++-------------------- 1 file changed, 62 insertions(+), 68 deletions(-) diff --git a/14_ComputeReduce/main.cpp b/14_ComputeReduce/main.cpp index 9bc2de751..ecd0eeca0 100644 --- a/14_ComputeReduce/main.cpp +++ b/14_ComputeReduce/main.cpp @@ -88,17 +88,18 @@ class ComputeReduceApp final : public application_templates::BasicMultiQueueAppl reducer->buildParameters(elementCount, reduce_push_constants, reduce_dispatch_info); IGPUBuffer::SCreationParams 
params = { reduce_push_constants.scanParams.getScratchSize(), bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT }; + + auto reduce_pipeline = reducer->getDefaultPipeline(CArithmeticOps::EDT_UINT, CArithmeticOps::EO_ADD, params.size / sizeof(uint32_t)); // TODO: Update to test all operations + auto dsLayout = reducer->getDefaultDescriptorSetLayout(); + auto dsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE, { &dsLayout, 1 }); + auto ds = dsPool->createDescriptorSet(core::smart_refctd_ptr(dsLayout)); + SBufferRange scratch_gpu_range = {0u, params.size, m_device->createBuffer(std::move(params)) }; { auto memReqs = scratch_gpu_range.buffer->getMemoryReqs(); - memReqs.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits(); + memReqs.memoryTypeBits &= m_physicalDevice->getDirectVRAMAccessMemoryTypeBits(); auto scratchMem = m_device->allocate(memReqs, scratch_gpu_range.buffer.get()); } - - auto reduce_pipeline = reducer->getDefaultPipeline(CArithmeticOps::EDT_UINT, CArithmeticOps::EO_ADD, params.size); // TODO: Update to test all operations - auto dsLayout = reducer->getDefaultDescriptorSetLayout(); - auto dsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE, { &dsLayout, 1 }); - auto ds = dsPool->createDescriptorSet(core::smart_refctd_ptr(dsLayout)); reducer->updateDescriptorSet(m_device.get(), ds.get(), in_gpu_range, scratch_gpu_range); { @@ -115,7 +116,7 @@ class ComputeReduceApp final : public application_templates::BasicMultiQueueAppl cmdbuf->bindComputePipeline(reduce_pipeline); auto pipeline_layout = reduce_pipeline->getLayout(); cmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, pipeline_layout, 0u, 1u, &ds.get()); - reducer->dispatchHelper(cmdbuf.get(), pipeline_layout, reduce_push_constants, reduce_dispatch_info, 0u, nullptr, 0u, nullptr); + reducer->dispatchHelper(cmdbuf.get(), pipeline_layout, reduce_push_constants, reduce_dispatch_info, {}); cmdbuf->end(); { @@ -159,68 
+160,61 @@ class ComputeReduceApp final : public application_templates::BasicMultiQueueAppl } { - IGPUBuffer::SCreationParams params = {}; - params.size = in_gpu_range.size; - params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT; - // (REVIEW): Check if this new download_buffer is needed or if we can directly read from the gpu_input buffer - auto downloaded_buffer = m_device->createBuffer(std::move(params)); - auto memReqs = downloaded_buffer->getMemoryReqs(); - memReqs.memoryTypeBits &= m_physicalDevice->getDownStreamingMemoryTypeBits(); - auto queriesMem = m_device->allocate(memReqs, downloaded_buffer.get()); - { - // (REVIEW): Maybe we can just reset the cmdbuf we already have? - core::smart_refctd_ptr cmdbuf; - { - auto cmdPool = m_device->createCommandPool(computeQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::NONE); - cmdPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { &cmdbuf , 1}, core::smart_refctd_ptr(m_logger)); - } - cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); // TODO: Reset Frame's CommandPool - IGPUCommandBuffer::SBufferCopy region; - region.srcOffset = in_gpu_range.offset; - region.dstOffset = 0u; - region.size = in_gpu_range.size; - cmdbuf->copyBuffer(in_gpu_range.buffer.get(), downloaded_buffer.get(), 1u, ®ion); - cmdbuf->end(); - - { - const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[1] = { { - .cmdbuf = cmdbuf.get() - } }; - - semInfo[0].value = 3; - const IQueue::SSubmitInfo infos[1] = { { - .commandBuffers = commandBuffers, - .signalSemaphores = semInfo - } }; - - if (computeQueue->submit(infos) != IQueue::RESULT::SUCCESS) { - m_logger->log("Download submission failure", system::ILogger::ELL_ERROR); - } - - const ISemaphore::SWaitInfo cmdbufDonePending[] = { { - .semaphore = semaphore.get(), - .value = 3 - } }; - if (m_device->blockForSemaphores(cmdbufDonePending) != ISemaphore::WAIT_RESULT::SUCCESS) { - m_logger->log("Blocking for download semaphore failed", ILogger::ELL_ERROR); - return 
false; - } - } - } - - auto mem = const_cast(downloaded_buffer->getBoundMemory().memory); - { - ILogicalDevice::MappedMemoryRange range; - { - range.memory = mem; - range.offset = 0u; - range.length = in_gpu_range.size; - } - mem->map({ .offset = range.offset, .length = range.length }, video::IDeviceMemoryAllocation::EMCAF_READ); - } + //IGPUBuffer::SCreationParams params = {}; + //params.size = 1u; + //params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT; + //// (REVIEW): Check if this new download_buffer is needed or if we can directly read from the gpu_input buffer + //auto downloaded_buffer = m_device->createBuffer(std::move(params)); + //auto memReqs = downloaded_buffer->getMemoryReqs(); + //memReqs.memoryTypeBits &= m_physicalDevice->getDownStreamingMemoryTypeBits(); + //auto queriesMem = m_device->allocate(memReqs, downloaded_buffer.get()); + //{ + // // (REVIEW): Maybe we can just reset the cmdbuf we already have? + // core::smart_refctd_ptr cmdbuf; + // { + // auto cmdPool = m_device->createCommandPool(computeQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::NONE); + // cmdPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { &cmdbuf , 1}, core::smart_refctd_ptr(m_logger)); + // } + // cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); // TODO: Reset Frame's CommandPool + // IGPUCommandBuffer::SBufferCopy region; + // region.srcOffset = in_gpu_range.offset; + // region.dstOffset = 0u; + // region.size = in_gpu_range.size; + // cmdbuf->copyBuffer(in_gpu_range.buffer.get(), downloaded_buffer.get(), 1u, ®ion); + // cmdbuf->end(); + + // { + // const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[1] = { { + // .cmdbuf = cmdbuf.get() + // } }; + + // semInfo[0].value = 3; + // const IQueue::SSubmitInfo infos[1] = { { + // .commandBuffers = commandBuffers, + // .signalSemaphores = semInfo + // } }; + + // if (computeQueue->submit(infos) != IQueue::RESULT::SUCCESS) { + // m_logger->log("Download submission failure", 
system::ILogger::ELL_ERROR); + // } + + // const ISemaphore::SWaitInfo cmdbufDonePending[] = { { + // .semaphore = semaphore.get(), + // .value = 3 + // } }; + // if (m_device->blockForSemaphores(cmdbufDonePending) != ISemaphore::WAIT_RESULT::SUCCESS) { + // m_logger->log("Blocking for download semaphore failed", ILogger::ELL_ERROR); + // return false; + // } + // } + //} + + auto mem = const_cast(scratch_gpu_range.buffer->getBoundMemory().memory); + mem->map({ .offset = 0u, .length = scratch_gpu_range.size }, video::IDeviceMemoryAllocation::EMCAF_READ); auto gpu_begin = reinterpret_cast(mem->getMappedPointer()); - m_logger->log("Device result %d", system::ILogger::ELL_INFO, gpu_begin[0]); - if (gpu_begin[0] != result) + auto gpu_result = gpu_begin[0u]; + m_logger->log("Device result %d", system::ILogger::ELL_INFO, gpu_result); + if (gpu_result != result) _NBL_DEBUG_BREAK_IF(true); m_logger->log("Result Comparison Test Passed", system::ILogger::ELL_PERFORMANCE); operationSuccess = true; From ac76f37cd3880130d30cacde957c926e3f5f69fb Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Sun, 7 Jul 2024 18:02:20 +0300 Subject: [PATCH 10/14] Add example 15 for global compute scan --- 15_ComputeScan/CMakeLists.txt | 6 + 15_ComputeScan/config.json.template | 28 +++ 15_ComputeScan/main.cpp | 340 ++++++++++++++++++++++++++++ 15_ComputeScan/pipeline.groovy | 50 ++++ CMakeLists.txt | 7 +- Readme.md | 2 +- 6 files changed, 429 insertions(+), 4 deletions(-) create mode 100644 15_ComputeScan/CMakeLists.txt create mode 100644 15_ComputeScan/config.json.template create mode 100644 15_ComputeScan/main.cpp create mode 100644 15_ComputeScan/pipeline.groovy diff --git a/15_ComputeScan/CMakeLists.txt b/15_ComputeScan/CMakeLists.txt new file mode 100644 index 000000000..2f9218f93 --- /dev/null +++ b/15_ComputeScan/CMakeLists.txt @@ -0,0 +1,6 @@ +include(common RESULT_VARIABLE RES) +if(NOT RES) + message(FATAL_ERROR "common.cmake not found. 
Should be in {repo_root}/cmake directory") +endif() + +nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") \ No newline at end of file diff --git a/15_ComputeScan/config.json.template b/15_ComputeScan/config.json.template new file mode 100644 index 000000000..a4ee411fa --- /dev/null +++ b/15_ComputeScan/config.json.template @@ -0,0 +1,28 @@ +{ + "enableParallelBuild": true, + "threadsPerBuildProcess" : 2, + "isExecuted": false, + "scriptPath": "", + "cmake": { + "configurations": [ "Release", "Debug", "RelWithDebInfo" ], + "buildModes": [], + "requiredOptions": [ "NBL_BUILD_CEGUI" ] + }, + "profiles": [ + { + "backend": "vulkan", + "platform": "windows", + "buildModes": [], + "runConfiguration": "Release", + "gpuArchitectures": [] + } + ], + "dependencies": [], + "data": [ + { + "dependencies": [], + "command": [""], + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/15_ComputeScan/main.cpp b/15_ComputeScan/main.cpp new file mode 100644 index 000000000..690d963b2 --- /dev/null +++ b/15_ComputeScan/main.cpp @@ -0,0 +1,340 @@ +#include "nbl/application_templates/BasicMultiQueueApplication.hpp" +#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" + +#include +#include + +using namespace nbl; +using namespace core; +using namespace asset; +using namespace system; +using namespace video; + +class ComputeScanApp final : public application_templates::BasicMultiQueueApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication +{ + using device_base_t = application_templates::BasicMultiQueueApplication; + using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; + +public: + ComputeScanApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : + system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} + + bool 
onAppInitialized(smart_refctd_ptr&& system) override + { + if (!device_base_t::onAppInitialized(std::move(system))) + return false; + if (!asset_base_t::onAppInitialized(std::move(system))) + return false; + + computeQueue = getComputeQueue(); + + // Create (an almost) 128MB input buffer + constexpr auto in_size = 128u << 20u; + constexpr auto in_count = in_size / sizeof(uint32_t) - 24u; + + m_logger->log("Input element count: %d", ILogger::ELL_PERFORMANCE, in_count); + + inputData = new uint32_t[in_count]; + { + std::random_device random_device; + std::mt19937 generator(random_device()); + std::uniform_int_distribution distribution(0u, ~0u); + for (auto i = 0u; i < in_count; i++) + inputData[i] = 1u;//distribution(generator) % 128; + } + auto minSSBOAlign = m_physicalDevice->getLimits().minSSBOAlignment; + constexpr auto begin = in_count / 4 + 118; + assert(((begin * sizeof(uint32_t)) & (minSSBOAlign - 1u)) == 0u); + constexpr auto end = in_count * 3 / 4 - 78; + assert(((end * sizeof(uint32_t)) & (minSSBOAlign - 1u)) == 0u); + constexpr auto elementCount = end - begin; + + // Set Semaphores to control GPU synchronization + core::smart_refctd_ptr semaphore = m_device->createSemaphore(0); + IQueue::SSubmitInfo::SSemaphoreInfo semInfo[1] = { { + .semaphore = semaphore.get(), + .value = 1, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS + } }; + + smart_refctd_ptr gpuinputDataBuffer; + { + IGPUBuffer::SCreationParams inputDataBufferCreationParams = {}; + inputDataBufferCreationParams.size = sizeof(uint32_t) * in_count; // TODO Declare the element data type in the shader? 
+ inputDataBufferCreationParams.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_SRC_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT; + auto temp = m_utils->createFilledDeviceLocalBufferOnDedMem( + SIntendedSubmitInfo{ .queue = getTransferUpQueue() }, + std::move(inputDataBufferCreationParams), + inputData, + { semInfo, 1 } + ); + + const ISemaphore::SWaitInfo semWaitInfo[] = { { + .semaphore = semaphore.get(), + .value = 1 + } }; + if (m_device->blockForSemaphores(semWaitInfo) != ISemaphore::WAIT_RESULT::SUCCESS) { + m_logger->log("Blocking for operation semaphore failed during input data buffer creation", ILogger::ELL_ERROR); + return false; + } + gpuinputDataBuffer = *temp.get(); + } + SBufferRange in_gpu_range = { begin * sizeof(uint32_t), elementCount * sizeof(uint32_t), gpuinputDataBuffer }; + + const auto scanType = video::CScanner::EST_EXCLUSIVE; + video::CReduce* reducer = m_utils->getDefaultReducer(); + video::CScanner* scanner = m_utils->getDefaultScanner(); + + CArithmeticOps::DefaultPushConstants push_constants; + CArithmeticOps::DispatchInfo dispatch_info; + scanner->buildParameters(elementCount, push_constants, dispatch_info); // common for reducer and scanner + + IGPUBuffer::SCreationParams params = { push_constants.scanParams.getScratchSize(), bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT }; + + auto reduce_pipeline = reducer->getDefaultPipeline(CArithmeticOps::EDT_UINT, CArithmeticOps::EO_ADD, params.size); // TODO: Update to test all operations + auto scan_pipeline = scanner->getDefaultPipeline(scanType, CArithmeticOps::EDT_UINT, CArithmeticOps::EO_ADD, params.size); // TODO: Update to test all operations + + auto reduceDSLayout = reducer->getDefaultDescriptorSetLayout(); + auto scanDSLayout = scanner->getDefaultDescriptorSetLayout(); + IGPUDescriptorSetLayout const* dsLayouts[2] = { reduceDSLayout, scanDSLayout }; + auto dsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE, 
dsLayouts); + auto reduceDS = dsPool->createDescriptorSet(core::smart_refctd_ptr(reduceDSLayout)); + auto scanDS = dsPool->createDescriptorSet(core::smart_refctd_ptr(scanDSLayout)); + + SBufferRange scratch_gpu_range = {0u, params.size, m_device->createBuffer(std::move(params)) }; + { + auto memReqs = scratch_gpu_range.buffer->getMemoryReqs(); + memReqs.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits(); + auto scratchMem = m_device->allocate(memReqs, scratch_gpu_range.buffer.get()); + } + reducer->updateDescriptorSet(m_device.get(), reduceDS.get(), in_gpu_range, scratch_gpu_range); + scanner->updateDescriptorSet(m_device.get(), scanDS.get(), in_gpu_range, scratch_gpu_range); + + // Prepare Buffer Barriers + IGPUCommandBuffer::SPipelineBarrierDependencyInfo::buffer_barrier_t reduceBarrier = { + .barrier = { + .dep = { + .srcStageMask = PIPELINE_STAGE_FLAGS::HOST_BIT, + .srcAccessMask = ACCESS_FLAGS::HOST_WRITE_BIT, + .dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, + .dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS + } + }, + .range = in_gpu_range + }; + const IGPUCommandBuffer::SPipelineBarrierDependencyInfo reduceInfo[1] = { {.bufBarriers = {&reduceBarrier, 1u}} }; + + IGPUCommandBuffer::SPipelineBarrierDependencyInfo::buffer_barrier_t scanBarrier = { + .barrier = { + .dep = { + .srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, + .srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS, + .dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, + .dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS + } + }, + .range = scratch_gpu_range // the scratch is the one that contains the intermediary Reduce values that we want for the scan + }; + const IGPUCommandBuffer::SPipelineBarrierDependencyInfo scanInfo[1] = { {.bufBarriers = {&scanBarrier, 1u}} }; + + { + smart_refctd_ptr cmdpool = m_device->createCommandPool(computeQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + if 
(!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { &cmdbuf,1 })) + { + logFail("Failed to create Command Buffers!\n"); + return false; + } + } + + video::IGPUPipelineLayout const* pipeline_layouts[2] = { reduce_pipeline->getLayout(), scan_pipeline->getLayout() }; + + cmdbuf->begin(IGPUCommandBuffer::USAGE::SIMULTANEOUS_USE_BIT); // (REVIEW): not sure about this + cmdbuf->fillBuffer(scratch_gpu_range, 0u); // Host side only? + + cmdbuf->bindComputePipeline(reduce_pipeline); + cmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, reduce_pipeline->getLayout(), 0u, 1u, &reduceDS.get()); + reducer->dispatchHelper(cmdbuf.get(), reduce_pipeline->getLayout(), push_constants, dispatch_info, reduceInfo); + + // Reset the workgroup enumerator buffer + SBufferRange scratch_workgroupenum_range = scratch_gpu_range; + scratch_workgroupenum_range.offset = sizeof(uint32_t); + scratch_workgroupenum_range.size = push_constants.scanParams.getWorkgroupEnumeratorSize(); + cmdbuf->fillBuffer(scratch_workgroupenum_range, 0u); + + cmdbuf->bindComputePipeline(scan_pipeline); + cmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, scan_pipeline->getLayout(), 0u, 1u, &scanDS.get()); + scanner->dispatchHelper(cmdbuf.get(), scan_pipeline->getLayout(), push_constants, dispatch_info, scanInfo); + + // REVIEW: Maybe collapse descriptor sets since they're the same? 
But this way we are prepared for potential future pipeline discrepancies between Reduce and Scan ops + + cmdbuf->end(); + + { + semInfo[0].value = 2; + semInfo[0].stageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[1] = { { + .cmdbuf = cmdbuf.get() + } }; + + const IQueue::SSubmitInfo infos[1] = { { + .commandBuffers = commandBuffers, + .signalSemaphores = semInfo + } }; + + computeQueue->startCapture(); + if (computeQueue->submit(infos) != IQueue::RESULT::SUCCESS) { + m_logger->log("Submission failure", system::ILogger::ELL_ERROR); + } + computeQueue->endCapture(); + } + + // TODO: Update to support all operations + // cpu counterpart + auto cpu_begin = inputData + begin; + m_logger->log("CPU scan begin", system::ILogger::ELL_PERFORMANCE); + + auto start = std::chrono::high_resolution_clock::now(); + switch (scanType) + { + case video::CScanner::EST_INCLUSIVE: + std::inclusive_scan(cpu_begin, inputData + end, cpu_begin); + break; + case video::CScanner::EST_EXCLUSIVE: + std::exclusive_scan(cpu_begin, inputData + end, cpu_begin, 0u); + break; + default: + assert(false); + exit(0xdeadbeefu); + break; + } + auto stop = std::chrono::high_resolution_clock::now(); + + m_logger->log("CPU scan end. 
Time taken: %d us", system::ILogger::ELL_PERFORMANCE, std::chrono::duration_cast(stop - start).count()); + // wait for the gpu impl to complete + const ISemaphore::SWaitInfo cmdbufDonePending[] = {{ + .semaphore = semaphore.get(), + .value = 2 + }}; + if (m_device->blockForSemaphores(cmdbufDonePending) != ISemaphore::WAIT_RESULT::SUCCESS) { + m_logger->log("Blocking for operation semaphore failed", ILogger::ELL_ERROR); + return false; + } + + { + IGPUBuffer::SCreationParams params = {}; + params.size = in_gpu_range.size; + params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT; + // (REVIEW): Check if this new download_buffer is needed or if we can directly read from the gpu_input buffer + auto downloaded_buffer = m_device->createBuffer(std::move(params)); + auto memReqs = downloaded_buffer->getMemoryReqs(); + memReqs.memoryTypeBits &= m_physicalDevice->getDownStreamingMemoryTypeBits(); + auto queriesMem = m_device->allocate(memReqs, downloaded_buffer.get()); + { + // (REVIEW): Maybe we can just reset the cmdbuf we already have? 
+ core::smart_refctd_ptr cmdbuf; + { + auto cmdPool = m_device->createCommandPool(computeQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::NONE); + cmdPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { &cmdbuf, 1}, core::smart_refctd_ptr(m_logger)); + } + cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); // TODO: Reset Frame's CommandPool + IGPUCommandBuffer::SBufferCopy region; + region.srcOffset = in_gpu_range.offset; + region.dstOffset = 0u; + region.size = in_gpu_range.size; + cmdbuf->copyBuffer(in_gpu_range.buffer.get(), downloaded_buffer.get(), 1u, &region); + cmdbuf->end(); + + { + const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[1] = { { + .cmdbuf = cmdbuf.get() + } }; + + semInfo[0].value = 3; + const IQueue::SSubmitInfo infos[1] = { { + .commandBuffers = commandBuffers, + .signalSemaphores = semInfo + } }; + + if (computeQueue->submit(infos) != IQueue::RESULT::SUCCESS) { + m_logger->log("Download submission failure", system::ILogger::ELL_ERROR); + } + + const ISemaphore::SWaitInfo cmdbufDonePending[] = { { + .semaphore = semaphore.get(), + .value = 3 + } }; + if (m_device->blockForSemaphores(cmdbufDonePending) != ISemaphore::WAIT_RESULT::SUCCESS) { + m_logger->log("Blocking for download semaphore failed", ILogger::ELL_ERROR); + return false; + } + } + } + + auto mem = const_cast(downloaded_buffer->getBoundMemory().memory); + { + mem->map({ .offset = 0u, .length = params.size }, video::IDeviceMemoryAllocation::EMCAF_READ); + } + auto gpu_begin = reinterpret_cast(mem->getMappedPointer()); + for (auto i = 0u; i < elementCount; i++) + { + if (gpu_begin[i] != cpu_begin[i]) + _NBL_DEBUG_BREAK_IF(true); + } + m_logger->log("Result Comparison Test Passed", system::ILogger::ELL_PERFORMANCE); + operationSuccess = true; + } + + delete[] inputData; + + return true; + } + + //virtual video::SPhysicalDeviceFeatures getRequiredDeviceFeatures() const override + //{ + // video::SPhysicalDeviceFeatures retval = {}; + + //
retval.bufferDeviceAddress = true; + // retval.subgroupBroadcastDynamicId = true; + // retval.shaderSubgroupExtendedTypes = true; + // // TODO: actually need to implement this and set it on the pipelines + // retval.computeFullSubgroups = true; + // retval.subgroupSizeControl = true; + + // return retval; + //} + + virtual bool onAppTerminated() override + { + m_logger->log("==========Result==========", ILogger::ELL_INFO); + m_logger->log("Operation Success: %s", ILogger::ELL_INFO, operationSuccess ?"true":"false"); + delete[] inputData; + return true; + } + + // the unit test is carried out on init + void workLoopBody() override {} + + bool keepRunning() override { return false; } + +private: + void logTestOutcome(bool passed, uint32_t workgroupSize) + { + if (passed) + m_logger->log("Passed test #%u", ILogger::ELL_INFO, workgroupSize); + else + { + m_logger->log("Failed test #%u", ILogger::ELL_ERROR, workgroupSize); + } + } + + IQueue* computeQueue; + uint32_t* inputData = nullptr; + smart_refctd_ptr descriptorSet; + smart_refctd_ptr pipelineLayout; + smart_refctd_ptr cmdbuf; + smart_refctd_ptr resultsBuffer; + + bool operationSuccess = false; +}; + +NBL_MAIN_FUNC(ComputeScanApp) diff --git a/15_ComputeScan/pipeline.groovy b/15_ComputeScan/pipeline.groovy new file mode 100644 index 000000000..4eaaafe23 --- /dev/null +++ b/15_ComputeScan/pipeline.groovy @@ -0,0 +1,50 @@ +import org.DevshGraphicsProgramming.Agent +import org.DevshGraphicsProgramming.BuilderInfo +import org.DevshGraphicsProgramming.IBuilder + +class CComputeScanBuilder extends IBuilder +{ + public CComputeScanBuilder(Agent _agent, _info) + { + super(_agent, _info) + } + + @Override + public boolean prepare(Map axisMapping) + { + return true + } + + @Override + public boolean build(Map axisMapping) + { + IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION") + IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE") + + def nameOfBuildDirectory = getNameOfBuildDirectory(buildType) + 
def nameOfConfig = getNameOfConfig(config) + + agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v") + + return true + } + + @Override + public boolean test(Map axisMapping) + { + return true + } + + @Override + public boolean install(Map axisMapping) + { + return true + } +} + +def create(Agent _agent, _info) +{ + return new CComputeScanBuilder(_agent, _info) +} + +return this \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 7a21befb6..17d502aa6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -26,13 +26,14 @@ if(NBL_BUILD_EXAMPLES) add_subdirectory(07_StagingAndMultipleQueues EXCLUDE_FROM_ALL) # showcase the set-up of a swapchain and picking of a matching device add_subdirectory(08_HelloSwapchain EXCLUDE_FROM_ALL) - # global scan - add_subdirectory(14_ComputeReduce EXCLUDE_FROM_ALL) # showcase the use of a depth buffer and rudimentary camera add_subdirectory(09_DepthBufferAndCamera EXCLUDE_FROM_ALL) # demonstrate the counting sort utility add_subdirectory(10_CountingSort EXCLUDE_FROM_ALL) - + # global reduce + add_subdirectory(14_ComputeReduce EXCLUDE_FROM_ALL) + # global scan + add_subdirectory(15_ComputeScan EXCLUDE_FROM_ALL) # Waiting for a refactor #add_subdirectory(27_PLYSTLDemo EXCLUDE_FROM_ALL) #add_subdirectory(29_SpecializationConstants EXCLUDE_FROM_ALL) diff --git a/Readme.md b/Readme.md index 95e358433..1b1dd696d 100644 --- a/Readme.md +++ b/Readme.md @@ -32,7 +32,7 @@ Whenever CMake generates separate makefiles/solutions/projects, they will be gen | 12_glTF | ![][12_MSVC_Release] | ![][12_MSVC_RWDI] | ![][12_MSVC_Debug] | ![][12_Android_Release] | ![][12_Android_RWDI] | ![][12_Android_Debug] | ![][W] | ![][W] | ![][W] | COMPILE_WITH_GLTF_LOADER | | 13. 
| ![][13_MSVC_Release] | ![][13_MSVC_RWDI] | ![][13_MSVC_Debug] | ![][13_Android_Release] | ![][13_Android_RWDI] | ![][13_Android_Debug] | ![][NA] | ![][NA] | ![][NA] | | | 14_ComputeReduce | ![][14_MSVC_Release] | ![][14_MSVC_RWDI] | ![][14_MSVC_Debug] | ![][14_Android_Release] | ![][14_Android_RWDI] | ![][14_Android_Debug] | ![][B] | ![][S] | ![][S] | | -| 15. | ![][15_MSVC_Release] | ![][15_MSVC_RWDI] | ![][15_MSVC_Debug] | ![][15_Android_Release] | ![][15_Android_RWDI] | ![][15_Android_Debug] | ![][NA] | ![][NA] | ![][NA] | | +| 15_ComputeScan | ![][15_MSVC_Release] | ![][15_MSVC_RWDI] | ![][15_MSVC_Debug] | ![][15_Android_Release] | ![][15_Android_RWDI] | ![][15_Android_Debug] | ![][NA] | ![][NA] | ![][NA] | | | 16_OrderIndependentTransparency | ![][16_MSVC_Release] | ![][16_MSVC_RWDI] | ![][16_MSVC_Debug] | ![][16_Android_Release] | ![][16_Android_RWDI] | ![][16_Android_Debug] | ![][B] | ![][S] | ![][S] | | | 17_SimpleBulletIntegration | ![][17_MSVC_Release] | ![][17_MSVC_RWDI] | ![][17_MSVC_Debug] | ![][17_Android_Release] | ![][17_Android_RWDI] | ![][17_Android_Debug] | ![][B] | ![][S] | ![][N] | BUILD_BULLET | | 18_MitsubaLoader | ![][18_MSVC_Release] | ![][18_MSVC_RWDI] | ![][18_MSVC_Debug] | ![][18_Android_Release] | ![][18_Android_RWDI] | ![][18_Android_Debug] | ![][S] | ![][S] | ![][N] | BUILD_MITSUBA_LOADER | From 73b4a8ead98daed4a731cb3ae79ec84396d0e53c Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Sun, 16 Mar 2025 11:33:10 +0200 Subject: [PATCH 11/14] Update cmakelists for examples to match upstream --- CMakeLists.txt | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3c2a96378..97ab4cd2f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,23 +43,18 @@ if(NBL_BUILD_EXAMPLES) add_subdirectory(07_StagingAndMultipleQueues EXCLUDE_FROM_ALL) # showcase the set-up of a swapchain and picking of a matching device add_subdirectory(08_HelloSwapchain EXCLUDE_FROM_ALL) - # 
showcase the use of a depth buffer and rudimentary camera - add_subdirectory(09_DepthBufferAndCamera EXCLUDE_FROM_ALL) + add_subdirectory(09_GeometryCreator EXCLUDE_FROM_ALL) # demonstrate the counting sort utility add_subdirectory(10_CountingSort EXCLUDE_FROM_ALL) # showcase use of FFT for post-FX Bloom effect add_subdirectory(11_FFT EXCLUDE_FROM_ALL) + add_subdirectory(14_ComputeReduce EXCLUDE_FROM_ALL) + add_subdirectory(15_ComputeScan EXCLUDE_FROM_ALL) - - # global reduce - add_subdirectory(14_ComputeReduce EXCLUDE_FROM_ALL) - # global scan - add_subdirectory(15_ComputeScan EXCLUDE_FROM_ALL) # Waiting for a refactor #add_subdirectory(27_PLYSTLDemo EXCLUDE_FROM_ALL) #add_subdirectory(29_SpecializationConstants EXCLUDE_FROM_ALL) #add_subdirectory(33_Draw3DLine EXCLUDE_FROM_ALL) - add_subdirectory(35_GeometryCreator EXCLUDE_FROM_ALL) # Unit Test Examples add_subdirectory(20_AllocatorTest EXCLUDE_FROM_ALL) @@ -69,7 +64,7 @@ if(NBL_BUILD_EXAMPLES) add_subdirectory(24_ColorSpaceTest EXCLUDE_FROM_ALL) add_subdirectory(25_FilterTest EXCLUDE_FROM_ALL) add_subdirectory(26_Blur EXCLUDE_FROM_ALL) - add_subdirectory(27_MPMCScheduler EXCLUDE_FROM_ALL) + add_subdirectory(27_MPMCScheduler EXCLUDE_FROM_ALL) add_subdirectory(28_FFTBloom EXCLUDE_FROM_ALL) # add_subdirectory(36_CUDAInterop EXCLUDE_FROM_ALL) From e362e24b2e2f21fddad5924f0319700667c7efdc Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Mon, 21 Apr 2025 13:59:16 +0300 Subject: [PATCH 12/14] Realign some examples with master --- 07_StagingAndMultipleQueues/main.cpp | 7 ++++--- .../app_resources/common.glsl | 16 +++++++++------- 30_ComputeShaderPathTracer/main.cpp | 5 +++-- 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/07_StagingAndMultipleQueues/main.cpp b/07_StagingAndMultipleQueues/main.cpp index 658a28a35..875053d60 100644 --- a/07_StagingAndMultipleQueues/main.cpp +++ b/07_StagingAndMultipleQueues/main.cpp @@ -432,15 +432,16 @@ class StagingAndMultipleQueuesApp final : public 
application_templates::BasicMul submitInfo[0].waitSemaphores = waitSemaphoreSubmitInfo; // there's no save to wait on, or need to prevent signal-after-submit because Renderdoc freezes because it // starts capturing immediately upon a submit and can't defer a capture till semaphores signal. - if (imageToProcessIdisRunningInRenderdoc()) + const bool isRunningInRenderdoc = m_api->runningInGraphicsDebugger()==IAPIConnection::EDebuggerType::Renderdoc; + if (imageToProcessIdisRunningInRenderdoc() && imageToProcessId>=SUBMITS_IN_FLIGHT) + if (isRunningInRenderdoc && imageToProcessId>=SUBMITS_IN_FLIGHT) for (auto old = histogramsSaved.load(); old < histogramSaveWaitSemaphoreValue; old = histogramsSaved.load()) histogramsSaved.wait(old); // Some Devices like all of the Intel GPUs do not have enough queues for us to allocate different queues to compute and transfers, // so our `BasicMultiQueueApplication` will "alias" a single queue to both usages. Normally you don't need to care, but here we're // attempting to do "out-of-order" "submit-before-signal" so we need to "hold back" submissions if the queues are aliased! 
- if (getTransferUpQueue()==computeQueue || m_api->isRunningInRenderdoc()) + if (getTransferUpQueue()==computeQueue || isRunningInRenderdoc) for (auto old = transfersSubmitted.load(); old <= imageToProcessId; old = transfersSubmitted.load()) transfersSubmitted.wait(old); computeQueue->submit(submitInfo); diff --git a/30_ComputeShaderPathTracer/app_resources/common.glsl b/30_ComputeShaderPathTracer/app_resources/common.glsl index 2463f82cf..aaadae4a8 100644 --- a/30_ComputeShaderPathTracer/app_resources/common.glsl +++ b/30_ComputeShaderPathTracer/app_resources/common.glsl @@ -352,9 +352,9 @@ struct Payload_t vec3 accumulation; float otherTechniqueHeuristic; vec3 throughput; - #ifdef KILL_DIFFUSE_SPECULAR_PATHS +#ifdef KILL_DIFFUSE_SPECULAR_PATHS bool hasDiffuse; - #endif +#endif }; struct Ray_t @@ -491,6 +491,7 @@ layout (constant_id = 1) const int MAX_SAMPLES_LOG2 = 10; #include +// TODO: use PCG hash + XOROSHIRO and don't read any textures mat2x3 rand3d(in uint protoDimension, in uint _sample, inout nbl_glsl_xoroshiro64star_state_t scramble_state) { mat2x3 retval; @@ -552,6 +553,7 @@ nbl_glsl_LightSample nbl_glsl_light_generate_and_remainder_and_pdf(out vec3 rema } uint getBSDFLightIDAndDetermineNormal(out vec3 normal, in uint objectID, in vec3 intersection); +// returns whether to stop tracing bool closestHitProgram(in uint depth, in uint _sample, inout Ray_t ray, inout nbl_glsl_xoroshiro64star_state_t scramble_state) { const MutableRay_t _mutable = ray._mutable; @@ -602,7 +604,7 @@ bool closestHitProgram(in uint depth, in uint _sample, inout Ray_t ray, inout nb const bool isBSDF = BSDFNode_isBSDF(bsdf); //rand - mat2x3 epsilon = rand3d(depth,_sample,scramble_state); + mat2x3 epsilon = rand3d(depth*2,_sample,scramble_state); // thresholds const float bsdfPdfThreshold = 0.0001; @@ -613,7 +615,7 @@ bool closestHitProgram(in uint depth, in uint _sample, inout Ray_t ray, inout nb // do NEE const float neeProbability = 1.0;// BSDFNode_getNEEProb(bsdf); float 
rcpChoiceProb; - if (!nbl_glsl_partitionRandVariable(neeProbability,epsilon[0].z,rcpChoiceProb) && depth<2u) + if (!nbl_glsl_partitionRandVariable(neeProbability,epsilon[0].z,rcpChoiceProb)) { vec3 neeContrib; float lightPdf, t; nbl_glsl_LightSample nee_sample = nbl_glsl_light_generate_and_remainder_and_pdf( @@ -748,15 +750,15 @@ void main() ray._payload.accumulation = vec3(0.0); ray._payload.otherTechniqueHeuristic = 0.0; // needed for direct eye-light paths ray._payload.throughput = vec3(1.0); - #ifdef KILL_DIFFUSE_SPECULAR_PATHS +#ifdef KILL_DIFFUSE_SPECULAR_PATHS ray._payload.hasDiffuse = false; - #endif +#endif } // bounces { bool hit = true; bool rayAlive = true; - for (int d=1; d<=PTPushConstant.depth && hit && rayAlive; d+=2) + for (int d=1; d<=PTPushConstant.depth && hit && rayAlive; d++) { ray._mutable.intersectionT = nbl_glsl_FLT_MAX; ray._mutable.objectID = traceRay(ray._mutable.intersectionT,ray._immutable.origin,ray._immutable.direction); diff --git a/30_ComputeShaderPathTracer/main.cpp b/30_ComputeShaderPathTracer/main.cpp index 26d673002..ed93cf81f 100644 --- a/30_ComputeShaderPathTracer/main.cpp +++ b/30_ComputeShaderPathTracer/main.cpp @@ -15,13 +15,14 @@ using namespace asset; using namespace ui; using namespace video; +// TODO: share push constants struct PTPushConstant { matrix4SIMD invMVP; int sampleCount; int depth; }; -// TODO: Add a QueryPool for timestamping once its ready +// TODO: Add a QueryPool for timestamping once its ready (actually add IMGUI mspf plotter) // TODO: Do buffer creation using assConv class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication { @@ -859,7 +860,7 @@ class ComputeShaderPathtracer final : public examples::SimpleWindowedApplication ImGui::SliderFloat("zFar", &zFar, 110.f, 10000.f); ImGui::ListBox("Shader", &PTPipline, shaderNames, E_LIGHT_GEOMETRY::ELG_COUNT); ImGui::SliderInt("SPP", &spp, 1, MaxBufferSamples); 
- ImGui::SliderInt("Depth", &depth, 1, MaxBufferDimensions / 3); + ImGui::SliderInt("Depth", &depth, 1, MaxBufferDimensions / 6); ImGui::Text("X: %f Y: %f", io.MousePos.x, io.MousePos.y); From f9fc7bf2acd6a275f3da40ee879d8c07647b958a Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Mon, 21 Apr 2025 14:01:21 +0300 Subject: [PATCH 13/14] Realign with master --- CMakeLists.txt | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 38f23707a..33f9463a6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -48,12 +48,12 @@ if(NBL_BUILD_EXAMPLES) add_subdirectory(10_CountingSort EXCLUDE_FROM_ALL) # showcase use of FFT for post-FX Bloom effect add_subdirectory(11_FFT EXCLUDE_FROM_ALL) + add_subdirectory(14_ComputeReduce EXCLUDE_FROM_ALL) add_subdirectory(15_ComputeScan EXCLUDE_FROM_ALL) # Waiting for a refactor #add_subdirectory(27_PLYSTLDemo EXCLUDE_FROM_ALL) - #add_subdirectory(29_SpecializationConstants EXCLUDE_FROM_ALL) #add_subdirectory(33_Draw3DLine EXCLUDE_FROM_ALL) # Unit Test Examples @@ -76,16 +76,11 @@ if(NBL_BUILD_EXAMPLES) # add_subdirectory(39_DenoiserTonemapper EXCLUDE_FROM_ALL) # endif() - add_subdirectory(42_FragmentShaderPathTracer EXCLUDE_FROM_ALL) #add_subdirectory(43_SumAndCDFFilters EXCLUDE_FROM_ALL) - #add_subdirectory(45_BRDFEvalTest EXCLUDE_FROM_ALL) - #add_subdirectory(46_SamplingValidation EXCLUDE_FROM_ALL) add_subdirectory(47_DerivMapTest EXCLUDE_FROM_ALL) add_subdirectory(53_ComputeShaders EXCLUDE_FROM_ALL) add_subdirectory(54_Transformations EXCLUDE_FROM_ALL) add_subdirectory(55_RGB18E7S3 EXCLUDE_FROM_ALL) - add_subdirectory(56_RayQuery EXCLUDE_FROM_ALL) - add_subdirectory(60_ClusteredRendering EXCLUDE_FROM_ALL) add_subdirectory(61_UI EXCLUDE_FROM_ALL) add_subdirectory(62_CAD EXCLUDE_FROM_ALL) add_subdirectory(62_SchusslerTest EXCLUDE_FROM_ALL) @@ -96,7 +91,7 @@ if(NBL_BUILD_EXAMPLES) add_subdirectory(67_RayQueryGeometry EXCLUDE_FROM_ALL) add_subdirectory(68_JpegLoading 
EXCLUDE_FROM_ALL) - add_subdirectory(70_FLIPFluids EXCLUDE_FROM_ALL) + add_subdirectory(70_FLIPFluids EXCLUDE_FROM_ALL) add_subdirectory(71_RayTracingPipeline EXCLUDE_FROM_ALL) NBL_HOOK_COMMON_API("${NBL_COMMON_API_TARGETS}") From 9fb1ce9a24fa7e91fcbcea6294c81f45e5a0de0c Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Tue, 22 Apr 2025 13:51:25 +0300 Subject: [PATCH 14/14] Change renderdoc capture to be accessed via CVulkanConnection instead of IQueue --- 14_ComputeReduce/CMakeLists.txt | 21 ++++++++++++++++++++- 14_ComputeReduce/main.cpp | 4 ++-- 15_ComputeScan/CMakeLists.txt | 21 ++++++++++++++++++++- 15_ComputeScan/main.cpp | 4 ++-- 4 files changed, 44 insertions(+), 6 deletions(-) diff --git a/14_ComputeReduce/CMakeLists.txt b/14_ComputeReduce/CMakeLists.txt index 2f9218f93..0724366c9 100644 --- a/14_ComputeReduce/CMakeLists.txt +++ b/14_ComputeReduce/CMakeLists.txt @@ -1,6 +1,25 @@ + include(common RESULT_VARIABLE RES) if(NOT RES) message(FATAL_ERROR "common.cmake not found. 
Should be in {repo_root}/cmake directory") endif() -nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") \ No newline at end of file +nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") + +if(NBL_EMBED_BUILTIN_RESOURCES) + set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData) + set(RESOURCE_DIR "app_resources") + + get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE) + + file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*") + foreach(RES_FILE ${BUILTIN_RESOURCE_FILES}) + LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}") + endforeach() + + ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}") + + LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_}) +endif() \ No newline at end of file diff --git a/14_ComputeReduce/main.cpp b/14_ComputeReduce/main.cpp index ecd0eeca0..8cfe65be7 100644 --- a/14_ComputeReduce/main.cpp +++ b/14_ComputeReduce/main.cpp @@ -131,11 +131,11 @@ class ComputeReduceApp final : public application_templates::BasicMultiQueueAppl .signalSemaphores = semInfo } }; - computeQueue->startCapture(); + m_api->startCapture(); if (computeQueue->submit(infos) != IQueue::RESULT::SUCCESS) { m_logger->log("Submission failure", system::ILogger::ELL_ERROR); } - computeQueue->endCapture(); + m_api->endCapture(); } // TODO: Update to support all operations diff --git a/15_ComputeScan/CMakeLists.txt b/15_ComputeScan/CMakeLists.txt index 2f9218f93..0724366c9 100644 --- 
a/15_ComputeScan/CMakeLists.txt +++ b/15_ComputeScan/CMakeLists.txt @@ -1,6 +1,25 @@ + include(common RESULT_VARIABLE RES) if(NOT RES) message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory") endif() -nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") \ No newline at end of file +nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") + +if(NBL_EMBED_BUILTIN_RESOURCES) + set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData) + set(RESOURCE_DIR "app_resources") + + get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE) + + file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*") + foreach(RES_FILE ${BUILTIN_RESOURCE_FILES}) + LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}") + endforeach() + + ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}") + + LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_}) +endif() \ No newline at end of file diff --git a/15_ComputeScan/main.cpp b/15_ComputeScan/main.cpp index 690d963b2..f339c06ac 100644 --- a/15_ComputeScan/main.cpp +++ b/15_ComputeScan/main.cpp @@ -181,11 +181,11 @@ class ComputeScanApp final : public application_templates::BasicMultiQueueApplic .signalSemaphores = semInfo } }; - computeQueue->startCapture(); + m_api->startCapture(); if (computeQueue->submit(infos) != IQueue::RESULT::SUCCESS) { m_logger->log("Submission failure", system::ILogger::ELL_ERROR); } - computeQueue->endCapture(); + m_api->endCapture(); } // TODO: 
Update to support all operations