diff --git a/14_ComputeReduce/CMakeLists.txt b/14_ComputeReduce/CMakeLists.txt new file mode 100644 index 000000000..0724366c9 --- /dev/null +++ b/14_ComputeReduce/CMakeLists.txt @@ -0,0 +1,25 @@ + +include(common RESULT_VARIABLE RES) +if(NOT RES) + message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory") +endif() + +nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") + +if(NBL_EMBED_BUILTIN_RESOURCES) + set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData) + set(RESOURCE_DIR "app_resources") + + get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE) + + file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*") + foreach(RES_FILE ${BUILTIN_RESOURCE_FILES}) + LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}") + endforeach() + + ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}") + + LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_}) +endif() \ No newline at end of file diff --git a/14_ComputeReduce/config.json.template b/14_ComputeReduce/config.json.template new file mode 100644 index 000000000..a4ee411fa --- /dev/null +++ b/14_ComputeReduce/config.json.template @@ -0,0 +1,28 @@ +{ + "enableParallelBuild": true, + "threadsPerBuildProcess" : 2, + "isExecuted": false, + "scriptPath": "", + "cmake": { + "configurations": [ "Release", "Debug", "RelWithDebInfo" ], + "buildModes": [], + "requiredOptions": [ "NBL_BUILD_CEGUI" ] + }, + "profiles": [ + { + "backend": "vulkan", + "platform": "windows", + "buildModes": [], + "runConfiguration": "Release", + "gpuArchitectures": [] + } + ], + "dependencies": [], + "data": [ + { + "dependencies": [], + "command": [""], + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/14_ComputeReduce/main.cpp b/14_ComputeReduce/main.cpp new file mode 100644 index 000000000..8cfe65be7 --- /dev/null +++ b/14_ComputeReduce/main.cpp @@ -0,0 +1,276 @@ +#include "nbl/application_templates/BasicMultiQueueApplication.hpp" +#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" + +#include +#include + +using namespace nbl; +using namespace core; +using namespace asset; +using namespace system; +using namespace video; + +class ComputeReduceApp final : public application_templates::BasicMultiQueueApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication +{ + using device_base_t = application_templates::BasicMultiQueueApplication; + using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; + +public: + ComputeReduceApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : + system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} + + bool onAppInitialized(smart_refctd_ptr&& system) override + { + if (!device_base_t::onAppInitialized(std::move(system))) + return false; + if (!asset_base_t::onAppInitialized(std::move(system))) + return false; + + computeQueue = getComputeQueue(); + + // Create (an almost) 128MB input buffer + constexpr auto in_size = 128u << 20u; + constexpr auto in_count = in_size / sizeof(uint32_t) - 24u; + + m_logger->log("Input element count: %d", ILogger::ELL_PERFORMANCE, in_count); + + inputData = new uint32_t[in_count]; + { + std::random_device random_device; + std::mt19937 generator(random_device()); + std::uniform_int_distribution distribution(0u, ~0u); + for (auto i = 0u; i < in_count; i++) + inputData[i] = 1u;//distribution(generator) % 128; + } + auto minSSBOAlign = m_physicalDevice->getLimits().minSSBOAlignment; + constexpr auto begin = in_count / 4 + 118; + assert(((begin * sizeof(uint32_t)) & (minSSBOAlign - 1u)) == 0u); + constexpr auto end = in_count * 3 / 4 - 78; + assert(((end * sizeof(uint32_t)) & (minSSBOAlign - 1u)) == 0u); + constexpr auto elementCount = end - begin; + + // Set Semaphores to control GPU synchronization + core::smart_refctd_ptr semaphore = m_device->createSemaphore(0); + IQueue::SSubmitInfo::SSemaphoreInfo semInfo[1] = { { + .semaphore = semaphore.get(), + .value = 1, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS + } }; + + smart_refctd_ptr gpuinputDataBuffer; + { + IGPUBuffer::SCreationParams inputDataBufferCreationParams = {}; + inputDataBufferCreationParams.size = sizeof(uint32_t) * in_count; // TODO Declare the element data type in the shader? + inputDataBufferCreationParams.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_SRC_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT; + auto temp = m_utils->createFilledDeviceLocalBufferOnDedMem( + SIntendedSubmitInfo{ .queue = getTransferUpQueue() }, + std::move(inputDataBufferCreationParams), + inputData, + { semInfo, 1 } + ); + + const ISemaphore::SWaitInfo semWaitInfo[] = { { + .semaphore = semaphore.get(), + .value = 1 + } }; + if (m_device->blockForSemaphores(semWaitInfo) != ISemaphore::WAIT_RESULT::SUCCESS) { + m_logger->log("Blocking for operation semaphore failed during input data buffer creation", ILogger::ELL_ERROR); + return false; + } + gpuinputDataBuffer = *temp.get(); + } + SBufferRange in_gpu_range = { begin * sizeof(uint32_t), elementCount * sizeof(uint32_t), gpuinputDataBuffer }; + + auto reducer = m_utils->getDefaultReducer(); + + CArithmeticOps::DefaultPushConstants reduce_push_constants; + CArithmeticOps::DispatchInfo reduce_dispatch_info; + reducer->buildParameters(elementCount, reduce_push_constants, reduce_dispatch_info); + + IGPUBuffer::SCreationParams params = { reduce_push_constants.scanParams.getScratchSize(), bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT }; + + auto reduce_pipeline = reducer->getDefaultPipeline(CArithmeticOps::EDT_UINT, CArithmeticOps::EO_ADD, params.size / sizeof(uint32_t)); // TODO: Update to test all operations + auto dsLayout = reducer->getDefaultDescriptorSetLayout(); + auto dsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE, { &dsLayout, 1 }); + auto ds = dsPool->createDescriptorSet(core::smart_refctd_ptr(dsLayout)); + + SBufferRange scratch_gpu_range = {0u, params.size, m_device->createBuffer(std::move(params)) }; + { + auto memReqs = scratch_gpu_range.buffer->getMemoryReqs(); + memReqs.memoryTypeBits &= m_physicalDevice->getDirectVRAMAccessMemoryTypeBits(); + auto scratchMem = m_device->allocate(memReqs, scratch_gpu_range.buffer.get()); + } + reducer->updateDescriptorSet(m_device.get(), ds.get(), in_gpu_range, scratch_gpu_range); + + { + smart_refctd_ptr cmdpool = m_device->createCommandPool(computeQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { &cmdbuf,1 })) + { + logFail("Failed to create Command Buffers!\n"); + return false; + } + } + + cmdbuf->begin(IGPUCommandBuffer::USAGE::SIMULTANEOUS_USE_BIT); // (REVIEW): not sure about this + cmdbuf->fillBuffer(scratch_gpu_range, 0u); + cmdbuf->bindComputePipeline(reduce_pipeline); + auto pipeline_layout = reduce_pipeline->getLayout(); + cmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, pipeline_layout, 0u, 1u, &ds.get()); + reducer->dispatchHelper(cmdbuf.get(), pipeline_layout, reduce_push_constants, reduce_dispatch_info, {}); + cmdbuf->end(); + + { + semInfo[0].value = 2; + semInfo[0].stageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[1] = { { + .cmdbuf = cmdbuf.get() + } }; + + const IQueue::SSubmitInfo infos[1] = { { + .commandBuffers = commandBuffers, + .signalSemaphores = semInfo + } }; + + m_api->startCapture(); + if (computeQueue->submit(infos) != IQueue::RESULT::SUCCESS) { + m_logger->log("Submission failure", system::ILogger::ELL_ERROR); + } + m_api->endCapture(); + } + + // TODO: Update to support all operations + // cpu counterpart + auto cpu_begin = inputData + begin; + m_logger->log("CPU reduce begin", system::ILogger::ELL_PERFORMANCE); + + auto start = std::chrono::high_resolution_clock::now(); + auto result = std::reduce(cpu_begin, inputData + end, 0u); + auto stop = std::chrono::high_resolution_clock::now(); + + m_logger->log("CPU reduce end. Time taken: %d us", system::ILogger::ELL_PERFORMANCE, std::chrono::duration_cast(stop - start).count()); + m_logger->log("Host result %d", system::ILogger::ELL_INFO, result); + // wait for the gpu impl to complete + const ISemaphore::SWaitInfo cmdbufDonePending[] = {{ + .semaphore = semaphore.get(), + .value = 2 + }}; + if (m_device->blockForSemaphores(cmdbufDonePending) != ISemaphore::WAIT_RESULT::SUCCESS) { + m_logger->log("Blocking for operation semaphore failed", ILogger::ELL_ERROR); + return false; + } + + { + //IGPUBuffer::SCreationParams params = {}; + //params.size = 1u; + //params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT; + //// (REVIEW): Check if this new download_buffer is needed or if we can directly read from the gpu_input buffer + //auto downloaded_buffer = m_device->createBuffer(std::move(params)); + //auto memReqs = downloaded_buffer->getMemoryReqs(); + //memReqs.memoryTypeBits &= m_physicalDevice->getDownStreamingMemoryTypeBits(); + //auto queriesMem = m_device->allocate(memReqs, downloaded_buffer.get()); + //{ + // // (REVIEW): Maybe we can just reset the cmdbuf we already have? + // core::smart_refctd_ptr cmdbuf; + // { + // auto cmdPool = m_device->createCommandPool(computeQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::NONE); + // cmdPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { &cmdbuf , 1}, core::smart_refctd_ptr(m_logger)); + // } + // cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); // TODO: Reset Frame's CommandPool + // IGPUCommandBuffer::SBufferCopy region; + // region.srcOffset = in_gpu_range.offset; + // region.dstOffset = 0u; + // region.size = in_gpu_range.size; + // cmdbuf->copyBuffer(in_gpu_range.buffer.get(), downloaded_buffer.get(), 1u, ®ion); + // cmdbuf->end(); + + // { + // const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[1] = { { + // .cmdbuf = cmdbuf.get() + // } }; + + // semInfo[0].value = 3; + // const IQueue::SSubmitInfo infos[1] = { { + // .commandBuffers = commandBuffers, + // .signalSemaphores = semInfo + // } }; + + // if (computeQueue->submit(infos) != IQueue::RESULT::SUCCESS) { + // m_logger->log("Download submission failure", system::ILogger::ELL_ERROR); + // } + + // const ISemaphore::SWaitInfo cmdbufDonePending[] = { { + // .semaphore = semaphore.get(), + // .value = 3 + // } }; + // if (m_device->blockForSemaphores(cmdbufDonePending) != ISemaphore::WAIT_RESULT::SUCCESS) { + // m_logger->log("Blocking for download semaphore failed", ILogger::ELL_ERROR); + // return false; + // } + // } + //} + + auto mem = const_cast(scratch_gpu_range.buffer->getBoundMemory().memory); + mem->map({ .offset = 0u, .length = scratch_gpu_range.size }, video::IDeviceMemoryAllocation::EMCAF_READ); + auto gpu_begin = reinterpret_cast(mem->getMappedPointer()); + auto gpu_result = gpu_begin[0u]; + m_logger->log("Device result %d", system::ILogger::ELL_INFO, gpu_result); + if (gpu_result != result) + _NBL_DEBUG_BREAK_IF(true); + m_logger->log("Result Comparison Test Passed", system::ILogger::ELL_PERFORMANCE); + operationSuccess = true; + } + + delete[] inputData; + + return true; + } + + //virtual video::SPhysicalDeviceFeatures getRequiredDeviceFeatures() const override + //{ + // video::SPhysicalDeviceFeatures retval = {}; + + // retval.bufferDeviceAddress = true; + // retval.subgroupBroadcastDynamicId = true; + // retval.shaderSubgroupExtendedTypes = true; + // // TODO: actually need to implement this and set it on the pipelines + // retval.computeFullSubgroups = true; + // retval.subgroupSizeControl = true; + + // return retval; + //} + + virtual bool onAppTerminated() override + { + m_logger->log("==========Result==========", ILogger::ELL_INFO); + m_logger->log("Operation Success: %s", ILogger::ELL_INFO, operationSuccess ?"true":"false"); + delete[] inputData; + return true; + } + + // the unit test is carried out on init + void workLoopBody() override {} + + bool keepRunning() override { return false; } + +private: + void logTestOutcome(bool passed, uint32_t workgroupSize) + { + if (passed) + m_logger->log("Passed test #%u", ILogger::ELL_INFO, workgroupSize); + else + { + m_logger->log("Failed test #%u", ILogger::ELL_ERROR, workgroupSize); + } + } + + IQueue* computeQueue; + uint32_t* inputData = nullptr; + smart_refctd_ptr descriptorSet; + smart_refctd_ptr pipelineLayout; + smart_refctd_ptr cmdbuf; + smart_refctd_ptr resultsBuffer; + + bool operationSuccess = false; +}; + +NBL_MAIN_FUNC(ComputeReduceApp) diff --git a/14_ComputeReduce/pipeline.groovy b/14_ComputeReduce/pipeline.groovy new file mode 100644 index 000000000..ffcf2f199 --- /dev/null +++ b/14_ComputeReduce/pipeline.groovy @@ -0,0 +1,50 @@ +import org.DevshGraphicsProgramming.Agent +import org.DevshGraphicsProgramming.BuilderInfo +import org.DevshGraphicsProgramming.IBuilder + +class CComputeReduceBuilder extends IBuilder +{ + public CComputeReduceBuilder(Agent _agent, _info) + { + super(_agent, _info) + } + + @Override + public boolean prepare(Map axisMapping) + { + return true + } + + @Override + public boolean build(Map axisMapping) + { + IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION") + IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE") + + def nameOfBuildDirectory = getNameOfBuildDirectory(buildType) + def nameOfConfig = getNameOfConfig(config) + + agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v") + + return true + } + + @Override + public boolean test(Map axisMapping) + { + return true + } + + @Override + public boolean install(Map axisMapping) + { + return true + } +} + +def create(Agent _agent, _info) +{ + return new CComputeReduceBuilder(_agent, _info) +} + +return this \ No newline at end of file diff --git a/15_ComputeScan/CMakeLists.txt b/15_ComputeScan/CMakeLists.txt new file mode 100644 index 000000000..0724366c9 --- /dev/null +++ b/15_ComputeScan/CMakeLists.txt @@ -0,0 +1,25 @@ + +include(common RESULT_VARIABLE RES) +if(NOT RES) + message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory") +endif() + +nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") + +if(NBL_EMBED_BUILTIN_RESOURCES) + set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData) + set(RESOURCE_DIR "app_resources") + + get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE) + + file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*") + foreach(RES_FILE ${BUILTIN_RESOURCE_FILES}) + LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}") + endforeach() + + ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}") + + LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_}) +endif() \ No newline at end of file diff --git a/15_ComputeScan/config.json.template b/15_ComputeScan/config.json.template new file mode 100644 index 000000000..a4ee411fa --- /dev/null +++ b/15_ComputeScan/config.json.template @@ -0,0 +1,28 @@ +{ + "enableParallelBuild": true, + "threadsPerBuildProcess" : 2, + "isExecuted": false, + "scriptPath": "", + "cmake": { + "configurations": [ "Release", "Debug", "RelWithDebInfo" ], + "buildModes": [], + "requiredOptions": [ "NBL_BUILD_CEGUI" ] + }, + "profiles": [ + { + "backend": "vulkan", + "platform": "windows", + "buildModes": [], + "runConfiguration": "Release", + "gpuArchitectures": [] + } + ], + "dependencies": [], + "data": [ + { + "dependencies": [], + "command": [""], + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/15_ComputeScan/main.cpp b/15_ComputeScan/main.cpp new file mode 100644 index 000000000..f339c06ac --- /dev/null +++ b/15_ComputeScan/main.cpp @@ -0,0 +1,340 @@ +#include "nbl/application_templates/BasicMultiQueueApplication.hpp" +#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" + +#include +#include + +using namespace nbl; +using namespace core; +using namespace asset; +using namespace system; +using namespace video; + +class ComputeScanApp final : public application_templates::BasicMultiQueueApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication +{ + using device_base_t = application_templates::BasicMultiQueueApplication; + using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; + +public: + ComputeScanApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : + system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} + + bool onAppInitialized(smart_refctd_ptr&& system) override + { + if (!device_base_t::onAppInitialized(std::move(system))) + return false; + if (!asset_base_t::onAppInitialized(std::move(system))) + return false; + + computeQueue = getComputeQueue(); + + // Create (an almost) 128MB input buffer + constexpr auto in_size = 128u << 20u; + constexpr auto in_count = in_size / sizeof(uint32_t) - 24u; + + m_logger->log("Input element count: %d", ILogger::ELL_PERFORMANCE, in_count); + + inputData = new uint32_t[in_count]; + { + std::random_device random_device; + std::mt19937 generator(random_device()); + std::uniform_int_distribution distribution(0u, ~0u); + for (auto i = 0u; i < in_count; i++) + inputData[i] = 1u;//distribution(generator) % 128; + } + auto minSSBOAlign = m_physicalDevice->getLimits().minSSBOAlignment; + constexpr auto begin = in_count / 4 + 118; + assert(((begin * sizeof(uint32_t)) & (minSSBOAlign - 1u)) == 0u); + constexpr auto end = in_count * 3 / 4 - 78; + assert(((end * sizeof(uint32_t)) & (minSSBOAlign - 1u)) == 0u); + constexpr auto elementCount = end - begin; + + // Set Semaphores to control GPU synchronization + core::smart_refctd_ptr semaphore = m_device->createSemaphore(0); + IQueue::SSubmitInfo::SSemaphoreInfo semInfo[1] = { { + .semaphore = semaphore.get(), + .value = 1, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS + } }; + + smart_refctd_ptr gpuinputDataBuffer; + { + IGPUBuffer::SCreationParams inputDataBufferCreationParams = {}; + inputDataBufferCreationParams.size = sizeof(uint32_t) * in_count; // TODO Declare the element data type in the shader? + inputDataBufferCreationParams.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_SRC_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT; + auto temp = m_utils->createFilledDeviceLocalBufferOnDedMem( + SIntendedSubmitInfo{ .queue = getTransferUpQueue() }, + std::move(inputDataBufferCreationParams), + inputData, + { semInfo, 1 } + ); + + const ISemaphore::SWaitInfo semWaitInfo[] = { { + .semaphore = semaphore.get(), + .value = 1 + } }; + if (m_device->blockForSemaphores(semWaitInfo) != ISemaphore::WAIT_RESULT::SUCCESS) { + m_logger->log("Blocking for operation semaphore failed during input data buffer creation", ILogger::ELL_ERROR); + return false; + } + gpuinputDataBuffer = *temp.get(); + } + SBufferRange in_gpu_range = { begin * sizeof(uint32_t), elementCount * sizeof(uint32_t), gpuinputDataBuffer }; + + const auto scanType = video::CScanner::EST_EXCLUSIVE; + video::CReduce* reducer = m_utils->getDefaultReducer(); + video::CScanner* scanner = m_utils->getDefaultScanner(); + + CArithmeticOps::DefaultPushConstants push_constants; + CArithmeticOps::DispatchInfo dispatch_info; + scanner->buildParameters(elementCount, push_constants, dispatch_info); // common for reducer and scanner + + IGPUBuffer::SCreationParams params = { push_constants.scanParams.getScratchSize(), bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT }; + + auto reduce_pipeline = reducer->getDefaultPipeline(CArithmeticOps::EDT_UINT, CArithmeticOps::EO_ADD, params.size); // TODO: Update to test all operations + auto scan_pipeline = scanner->getDefaultPipeline(scanType, CArithmeticOps::EDT_UINT, CArithmeticOps::EO_ADD, params.size); // TODO: Update to test all operations + + auto reduceDSLayout = reducer->getDefaultDescriptorSetLayout(); + auto scanDSLayout = scanner->getDefaultDescriptorSetLayout(); + IGPUDescriptorSetLayout const* dsLayouts[2] = { reduceDSLayout, scanDSLayout }; + auto dsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE, dsLayouts); + auto reduceDS = dsPool->createDescriptorSet(core::smart_refctd_ptr(reduceDSLayout)); + auto scanDS = dsPool->createDescriptorSet(core::smart_refctd_ptr(scanDSLayout)); + + SBufferRange scratch_gpu_range = {0u, params.size, m_device->createBuffer(std::move(params)) }; + { + auto memReqs = scratch_gpu_range.buffer->getMemoryReqs(); + memReqs.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits(); + auto scratchMem = m_device->allocate(memReqs, scratch_gpu_range.buffer.get()); + } + reducer->updateDescriptorSet(m_device.get(), reduceDS.get(), in_gpu_range, scratch_gpu_range); + scanner->updateDescriptorSet(m_device.get(), scanDS.get(), in_gpu_range, scratch_gpu_range); + + // Prepare Buffer Barriers + IGPUCommandBuffer::SPipelineBarrierDependencyInfo::buffer_barrier_t reduceBarrier = { + .barrier = { + .dep = { + .srcStageMask = PIPELINE_STAGE_FLAGS::HOST_BIT, + .srcAccessMask = ACCESS_FLAGS::HOST_WRITE_BIT, + .dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, + .dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS + } + }, + .range = in_gpu_range + }; + const IGPUCommandBuffer::SPipelineBarrierDependencyInfo reduceInfo[1] = { {.bufBarriers = {&reduceBarrier, 1u}} }; + + IGPUCommandBuffer::SPipelineBarrierDependencyInfo::buffer_barrier_t scanBarrier = { + .barrier = { + .dep = { + .srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, + .srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS, + .dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, + .dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS + } + }, + .range = scratch_gpu_range // the scratch is the one that contains the intermediary Reduce values that we want for the scan + }; + const IGPUCommandBuffer::SPipelineBarrierDependencyInfo scanInfo[1] = { {.bufBarriers = {&scanBarrier, 1u}} }; + + { + smart_refctd_ptr cmdpool = m_device->createCommandPool(computeQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { &cmdbuf,1 })) + { + logFail("Failed to create Command Buffers!\n"); + return false; + } + } + + video::IGPUPipelineLayout const* pipeline_layouts[2] = { reduce_pipeline->getLayout(), scan_pipeline->getLayout() }; + + cmdbuf->begin(IGPUCommandBuffer::USAGE::SIMULTANEOUS_USE_BIT); // (REVIEW): not sure about this + cmdbuf->fillBuffer(scratch_gpu_range, 0u); // Host side only? + + cmdbuf->bindComputePipeline(reduce_pipeline); + cmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, reduce_pipeline->getLayout(), 0u, 1u, &reduceDS.get()); + reducer->dispatchHelper(cmdbuf.get(), reduce_pipeline->getLayout(), push_constants, dispatch_info, reduceInfo); + + // Reset the workgroup enumerator buffer + SBufferRange scratch_workgroupenum_range = scratch_gpu_range; + scratch_workgroupenum_range.offset = sizeof(uint32_t); + scratch_workgroupenum_range.size = push_constants.scanParams.getWorkgroupEnumeratorSize(); + cmdbuf->fillBuffer(scratch_workgroupenum_range, 0u); + + cmdbuf->bindComputePipeline(scan_pipeline); + cmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, scan_pipeline->getLayout(), 0u, 1u, &scanDS.get()); + scanner->dispatchHelper(cmdbuf.get(), scan_pipeline->getLayout(), push_constants, dispatch_info, scanInfo); + + // REVIEW: Maybe collapse descriptor sets since they're the same? But this way we are prepared for potential future pipeline discrepancies between Reduce and Scan ops + + cmdbuf->end(); + + { + semInfo[0].value = 2; + semInfo[0].stageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[1] = { { + .cmdbuf = cmdbuf.get() + } }; + + const IQueue::SSubmitInfo infos[1] = { { + .commandBuffers = commandBuffers, + .signalSemaphores = semInfo + } }; + + m_api->startCapture(); + if (computeQueue->submit(infos) != IQueue::RESULT::SUCCESS) { + m_logger->log("Submission failure", system::ILogger::ELL_ERROR); + } + m_api->endCapture(); + } + + // TODO: Update to support all operations + // cpu counterpart + auto cpu_begin = inputData + begin; + m_logger->log("CPU scan begin", system::ILogger::ELL_PERFORMANCE); + + auto start = std::chrono::high_resolution_clock::now(); + switch (scanType) + { + case video::CScanner::EST_INCLUSIVE: + std::inclusive_scan(cpu_begin, inputData + end, cpu_begin); + break; + case video::CScanner::EST_EXCLUSIVE: + std::exclusive_scan(cpu_begin, inputData + end, cpu_begin, 0u); + break; + default: + assert(false); + exit(0xdeadbeefu); + break; + } + auto stop = std::chrono::high_resolution_clock::now(); + + m_logger->log("CPU scan end. Time taken: %d us", system::ILogger::ELL_PERFORMANCE, std::chrono::duration_cast(stop - start).count()); + // wait for the gpu impl to complete + const ISemaphore::SWaitInfo cmdbufDonePending[] = {{ + .semaphore = semaphore.get(), + .value = 2 + }}; + if (m_device->blockForSemaphores(cmdbufDonePending) != ISemaphore::WAIT_RESULT::SUCCESS) { + m_logger->log("Blocking for operation semaphore failed", ILogger::ELL_ERROR); + return false; + } + + { + IGPUBuffer::SCreationParams params = {}; + params.size = in_gpu_range.size; + params.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT; + // (REVIEW): Check if this new download_buffer is needed or if we can directly read from the gpu_input buffer + auto downloaded_buffer = m_device->createBuffer(std::move(params)); + auto memReqs = downloaded_buffer->getMemoryReqs(); + memReqs.memoryTypeBits &= m_physicalDevice->getDownStreamingMemoryTypeBits(); + auto queriesMem = m_device->allocate(memReqs, downloaded_buffer.get()); + { + // (REVIEW): Maybe we can just reset the cmdbuf we already have? + core::smart_refctd_ptr cmdbuf; + { + auto cmdPool = m_device->createCommandPool(computeQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::NONE); + cmdPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { &cmdbuf, 1}, core::smart_refctd_ptr(m_logger)); + } + cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); // TODO: Reset Frame's CommandPool + IGPUCommandBuffer::SBufferCopy region; + region.srcOffset = in_gpu_range.offset; + region.dstOffset = 0u; + region.size = in_gpu_range.size; + cmdbuf->copyBuffer(in_gpu_range.buffer.get(), downloaded_buffer.get(), 1u, ®ion); + cmdbuf->end(); + + { + const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[1] = { { + .cmdbuf = cmdbuf.get() + } }; + + semInfo[0].value = 3; + const IQueue::SSubmitInfo infos[1] = { { + .commandBuffers = commandBuffers, + .signalSemaphores = semInfo + } }; + + if (computeQueue->submit(infos) != IQueue::RESULT::SUCCESS) { + m_logger->log("Download submission failure", system::ILogger::ELL_ERROR); + } + + const ISemaphore::SWaitInfo cmdbufDonePending[] = { { + .semaphore = semaphore.get(), + .value = 3 + } }; + if (m_device->blockForSemaphores(cmdbufDonePending) != ISemaphore::WAIT_RESULT::SUCCESS) { + m_logger->log("Blocking for download semaphore failed", ILogger::ELL_ERROR); + return false; + } + } + } + + auto mem = const_cast(downloaded_buffer->getBoundMemory().memory); + { + mem->map({ .offset = 0u, .length = params.size }, video::IDeviceMemoryAllocation::EMCAF_READ); + } + auto gpu_begin = reinterpret_cast(mem->getMappedPointer()); + for (auto i = 0u; i < elementCount; i++) + { + if (gpu_begin[i] != cpu_begin[i]) + _NBL_DEBUG_BREAK_IF(true); + } + m_logger->log("Result Comparison Test Passed", system::ILogger::ELL_PERFORMANCE); + operationSuccess = true; + } + + delete[] inputData; + + return true; + } + + //virtual video::SPhysicalDeviceFeatures getRequiredDeviceFeatures() const override + //{ + // video::SPhysicalDeviceFeatures retval = {}; + + // retval.bufferDeviceAddress = true; + // retval.subgroupBroadcastDynamicId = true; + // retval.shaderSubgroupExtendedTypes = true; + // // TODO: actually need to implement this and set it on the pipelines + // retval.computeFullSubgroups = true; + // retval.subgroupSizeControl = true; + + // return retval; + //} + + virtual bool onAppTerminated() override + { + m_logger->log("==========Result==========", ILogger::ELL_INFO); + m_logger->log("Operation Success: %s", ILogger::ELL_INFO, operationSuccess ?"true":"false"); + delete[] inputData; + return true; + } + + // the unit test is carried out on init + void workLoopBody() override {} + + bool keepRunning() override { return false; } + +private: + void logTestOutcome(bool passed, uint32_t workgroupSize) + { + if (passed) + m_logger->log("Passed test #%u", ILogger::ELL_INFO, workgroupSize); + else + { + m_logger->log("Failed test #%u", ILogger::ELL_ERROR, workgroupSize); + } + } + + IQueue* computeQueue; + uint32_t* inputData = nullptr; + smart_refctd_ptr descriptorSet; + smart_refctd_ptr pipelineLayout; + smart_refctd_ptr cmdbuf; + smart_refctd_ptr resultsBuffer; + + bool operationSuccess = false; +}; + +NBL_MAIN_FUNC(ComputeScanApp) diff --git a/15_ComputeScan/pipeline.groovy b/15_ComputeScan/pipeline.groovy new file mode 100644 index 000000000..4eaaafe23 --- /dev/null +++ b/15_ComputeScan/pipeline.groovy @@ -0,0 +1,50 @@ +import org.DevshGraphicsProgramming.Agent +import org.DevshGraphicsProgramming.BuilderInfo +import org.DevshGraphicsProgramming.IBuilder + +class CComputeScanBuilder extends IBuilder +{ + public CComputeScanBuilder(Agent _agent, _info) + { + super(_agent, _info) + } + + @Override + public boolean prepare(Map axisMapping) + { + return true + } + + @Override + public boolean build(Map axisMapping) + { + IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION") + IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE") + + def nameOfBuildDirectory = getNameOfBuildDirectory(buildType) + def nameOfConfig = getNameOfConfig(config) + + agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v") + + return true + } + + @Override + public boolean test(Map axisMapping) + { + return true + } + + @Override + public boolean install(Map axisMapping) + { + return true + } +} + +def create(Agent _agent, _info) +{ + return new CComputeScanBuilder(_agent, _info) +} + +return this \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 0b3279a48..9f8b4d54e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -48,7 +48,9 @@ if(NBL_BUILD_EXAMPLES) add_subdirectory(10_CountingSort EXCLUDE_FROM_ALL) # showcase use of FFT for post-FX Bloom effect add_subdirectory(11_FFT EXCLUDE_FROM_ALL) - + + add_subdirectory(14_ComputeReduce EXCLUDE_FROM_ALL) + add_subdirectory(15_ComputeScan EXCLUDE_FROM_ALL) # Waiting for a refactor #add_subdirectory(27_PLYSTLDemo EXCLUDE_FROM_ALL) diff --git a/Readme.md b/Readme.md index 8d124b33b..1b1dd696d 100644 --- a/Readme.md +++ b/Readme.md @@ -31,8 +31,8 @@ Whenever CMake generates separate makefiles/solutions/projects, they will be gen | 11_LoDSystem | ![][11_MSVC_Release] | ![][11_MSVC_RWDI] | ![][11_MSVC_Debug] | ![][11_Android_Release] | ![][11_Android_RWDI] | ![][11_Android_Debug] | ![][B] | ![][S] | ![][S] | | | 12_glTF | ![][12_MSVC_Release] | ![][12_MSVC_RWDI] | ![][12_MSVC_Debug] | ![][12_Android_Release] | ![][12_Android_RWDI] | ![][12_Android_Debug] | ![][W] | ![][W] | ![][W] | COMPILE_WITH_GLTF_LOADER | | 13. | ![][13_MSVC_Release] | ![][13_MSVC_RWDI] | ![][13_MSVC_Debug] | ![][13_Android_Release] | ![][13_Android_RWDI] | ![][13_Android_Debug] | ![][NA] | ![][NA] | ![][NA] | | -| 14_ComputeScan | ![][14_MSVC_Release] | ![][14_MSVC_RWDI] | ![][14_MSVC_Debug] | ![][14_Android_Release] | ![][14_Android_RWDI] | ![][14_Android_Debug] | ![][B] | ![][S] | ![][S] | | -| 15. | ![][15_MSVC_Release] | ![][15_MSVC_RWDI] | ![][15_MSVC_Debug] | ![][15_Android_Release] | ![][15_Android_RWDI] | ![][15_Android_Debug] | ![][NA] | ![][NA] | ![][NA] | | +| 14_ComputeReduce | ![][14_MSVC_Release] | ![][14_MSVC_RWDI] | ![][14_MSVC_Debug] | ![][14_Android_Release] | ![][14_Android_RWDI] | ![][14_Android_Debug] | ![][B] | ![][S] | ![][S] | | +| 15_ComputeScan | ![][15_MSVC_Release] | ![][15_MSVC_RWDI] | ![][15_MSVC_Debug] | ![][15_Android_Release] | ![][15_Android_RWDI] | ![][15_Android_Debug] | ![][NA] | ![][NA] | ![][NA] | | | 16_OrderIndependentTransparency | ![][16_MSVC_Release] | ![][16_MSVC_RWDI] | ![][16_MSVC_Debug] | ![][16_Android_Release] | ![][16_Android_RWDI] | ![][16_Android_Debug] | ![][B] | ![][S] | ![][S] | | | 17_SimpleBulletIntegration | ![][17_MSVC_Release] | ![][17_MSVC_RWDI] | ![][17_MSVC_Debug] | ![][17_Android_Release] | ![][17_Android_RWDI] | ![][17_Android_Debug] | ![][B] | ![][S] | ![][N] | BUILD_BULLET | | 18_MitsubaLoader | ![][18_MSVC_Release] | ![][18_MSVC_RWDI] | ![][18_MSVC_Debug] | ![][18_Android_Release] | ![][18_Android_RWDI] | ![][18_Android_Debug] | ![][S] | ![][S] | ![][N] | BUILD_MITSUBA_LOADER |