Devsh-Graphics-Programming · devshgraphicsprogramming · Jul 9, 2023 · Jul 9, 2023 · Jul 9, 2023 · Jul 9, 2023
diff --git a/63.CUDAInterop/CMakeLists.txt → 63_CUDAInterop/CMakeLists.txt b/63.CUDAInterop/CMakeLists.txt → 63_CUDAInterop/CMakeLists.txt
diff --git a/63.CUDAInterop/main.cpp → 63_CUDAInterop/main.cpp b/63.CUDAInterop/main.cpp → 63_CUDAInterop/main.cpp
@@ -9,7 +9,7 @@
 #include "nbl/video/CCUDASharedMemory.h"
 #include "nbl/video/CCUDASharedSemaphore.h"
 
-#include "../common./MonoSystemMonoLoggerApplication.hpp"
+#include "../common/MonoDeviceApplication.hpp"
 
 using namespace nbl;
 using namespace core;
@@ -47,17 +47,16 @@ size_t size = sizeof(float) * numElements;
 static_assert(false);
 #endif
 
-class CUDA2VKApp : public examples::MonoSystemMonoLoggerApplication
+class CUDA2VKApp : public examples::MonoDeviceApplication
 {
-	using base_t = examples::MonoSystemMonoLoggerApplication;
+	using base_t = examples::MonoDeviceApplication;
 public:
 	// Generally speaking because certain platforms delay initialization from main object construction you should just forward and not do anything in the ctor
 	using base_t::base_t;
 
 	smart_refctd_ptr<CCUDAHandler> cudaHandler;
 	smart_refctd_ptr<CCUDADevice> cudaDevice;
-	// IUtilities* util;
-	smart_refctd_ptr<ILogicalDevice> logicalDevice;
+
 	IQueue* queue;
 
 	std::array<smart_refctd_ptr<ICPUBuffer>, 2> cpubuffers;
@@ -76,63 +75,19 @@ class CUDA2VKApp : public examples::MonoSystemMonoLoggerApplication
 		// Remember to call the base class initialization!
 		if (!base_t::onAppInitialized(std::move(system)))
 			return false;
-		// `system` could have been null (see the comments in `MonoSystemMonoLoggerApplication::onAppInitialized` as for why)
-		// use `MonoSystemMonoLoggerApplication::m_system` throughout the example instead!
-
-		// You should already know Vulkan and come here to save on the boilerplate, if you don't know what instances and instance extensions are, then find out.
-		smart_refctd_ptr<CVulkanConnection> api;
-		{
-			// You generally want to default initialize any parameter structs
-			IAPIConnection::SFeatures apiFeaturesToEnable = {};
-			// generally you want to make your life easier during development
-			apiFeaturesToEnable.validations = true;
-			apiFeaturesToEnable.synchronizationValidation = true;
-			// want to make sure we have this so we can name resources for vieweing in RenderDoc captures
-			apiFeaturesToEnable.debugUtils = true;
-			// create our Vulkan instance
-			if (!(api = CVulkanConnection::create(smart_refctd_ptr(m_system), 0, _NBL_APP_NAME_, smart_refctd_ptr(base_t::m_logger), apiFeaturesToEnable)))
-				return logFail("Failed to crate an IAPIConnection!");
-		}
-
-		// We won't go deep into performing physical device selection in this example, we'll take any device with a compute queue.
-		// Nabla has its own set of required baseline Vulkan features anyway, it won't report any device that doesn't meet them.
-		IPhysicalDevice* physDev = nullptr;
-		ILogicalDevice::SCreationParams params = {};
-		// we will only deal with a single queue in this example
-		params.queueParamsCount = 1;
-		params.queueParams[0].count = 1;
-		params.featuresToEnable;
-		for (auto physDevIt = api->getPhysicalDevices().begin(); physDevIt != api->getPhysicalDevices().end(); physDevIt++)
-		{
-			const auto familyProps = (*physDevIt)->getQueueFamilyProperties();
-			// this is the only "complicated" part, we want to create a queue that supports compute pipelines
-			for (auto i = 0; i < familyProps.size(); i++)
-				if (familyProps[i].queueFlags.hasFlags(IQueue::FAMILY_FLAGS::COMPUTE_BIT))
-				{
-					physDev = *physDevIt;
-					params.queueParams[0].familyIndex = i;
-					break;
-				}
-		}
-		if (!physDev)
-			return logFail("Failed to find any Physical Devices with Compute capable Queue Families!");
 
 		{
-			auto& limits = physDev->getLimits();
+			auto& limits = m_physicalDevice->getLimits();
 			if (!limits.externalMemoryWin32 || !limits.externalFenceWin32 || !limits.externalSemaphoreWin32)
 				return logFail("Physical device does not support the required extensions");
 
 			cudaHandler = CCUDAHandler::create(system.get(), smart_refctd_ptr<ILogger>(m_logger));
 			assert(cudaHandler);
-			cudaDevice = cudaHandler->createDevice(smart_refctd_ptr_dynamic_cast<CVulkanConnection>(api), physDev);
+			cudaDevice = cudaHandler->createDevice(smart_refctd_ptr_dynamic_cast<CVulkanConnection>(m_api), m_physicalDevice);
 		}
 
-		// logical devices need to be created form physical devices which will actually let us create vulkan objects and use the physical device
-		logicalDevice = physDev->createLogicalDevice(std::move(params));
-		if (!logicalDevice)
-			return logFail("Failed to create a Logical Device!");
 
-		queue = logicalDevice->getQueue(params.queueParams[0].familyIndex, 0);
+		queue = base_t::getComputeQueue();
 
 		createResources();
 
@@ -160,6 +115,7 @@ class CUDA2VKApp : public examples::MonoSystemMonoLoggerApplication
 		ASSERT_SUCCESS(cu.pcuModuleUnload(module));
 		ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream));
 
+		m_device->waitIdle();
 		return true;
 	}
 
@@ -180,19 +136,19 @@ class CUDA2VKApp : public examples::MonoSystemMonoLoggerApplication
 		ASSERT_SUCCESS(cudaDevice->createSharedMemory(&mem[1], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
 		ASSERT_SUCCESS(cudaDevice->createSharedMemory(&mem[2], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
 
-		sema = logicalDevice->createSemaphore({ .externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32 });
+		sema = m_device->createSemaphore({ .externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32 });
 		ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cusema, sema.get()));
 		{
-			auto devmemory = mem[2]->exportAsMemory(logicalDevice.get());
+			auto devmemory = mem[2]->exportAsMemory(m_device.get());
 			assert(devmemory);
 			IGPUBuffer::SCreationParams params = {};
 			params.size = devmemory->getAllocationSize();
 			params.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT;
 			params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE;
-			importedbuf = logicalDevice->createBuffer(std::move(params));
+			importedbuf = m_device->createBuffer(std::move(params));
 			assert(importedbuf);
 			ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = importedbuf.get(), .binding = {.memory = devmemory.get() } };
-			bool re = logicalDevice->bindBufferMemory(1, &bindInfo);
+			bool re = m_device->bindBufferMemory(1, &bindInfo);
 			assert(re);
 		}
 
@@ -205,18 +161,18 @@ class CUDA2VKApp : public examples::MonoSystemMonoLoggerApplication
 			params.extent = { gridDim[0], blockDim[0], 1 };
 			params.mipLevels = 1;
 			params.arrayLayers = 1;
-			params.usage = IGPUImage::EUF_STORAGE_BIT | IGPUImage::EUF_TRANSFER_SRC_BIT;
+			params.usage = IGPUImage::EUF_TRANSFER_SRC_BIT;
 			params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE;
 			params.tiling = IGPUImage::TILING::LINEAR;
-			importedimg = mem[2]->exportAsImage(logicalDevice.get(), std::move(params));
+			importedimg = mem[2]->createAndBindImage(m_device.get(), std::move(params));
 			assert(importedimg);
 		}
 
-		commandPool = logicalDevice->createCommandPool(queue->getFamilyIndex(), {});
+		commandPool = m_device->createCommandPool(queue->getFamilyIndex(), {});
 		bool re = commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmd, smart_refctd_ptr(m_logger));
 		assert(re);
 
-		auto createStaging = [logicalDevice=logicalDevice]()
+		auto createStaging = [logicalDevice= m_device]()
 		{
 			auto buf = logicalDevice->createBuffer({ {.size = size, .usage = asset::IBuffer::EUF_TRANSFER_DST_BIT} });
 			auto req = buf->getMemoryReqs();
@@ -258,8 +214,6 @@ class CUDA2VKApp : public examples::MonoSystemMonoLoggerApplication
 			IGPUCommandBuffer::SBufferMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> bufBarrier = {
 				.barrier = {
 					.dep = {
-						// .srcStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS,
-						// .srcAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS | ACCESS_FLAGS::MEMORY_WRITE_BITS,
 						.dstStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS,
 						.dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS | ACCESS_FLAGS::MEMORY_WRITE_BITS,
 					},
@@ -271,16 +225,14 @@ class CUDA2VKApp : public examples::MonoSystemMonoLoggerApplication
 
 			bool re = true;
 			re &= cmd->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-			re &= cmd->pipelineBarrier(EDF_NONE, { .bufBarrierCount = 1, .bufBarriers = &bufBarrier}); 
+			re &= cmd->pipelineBarrier(EDF_NONE, { .bufBarriers = std::span{&bufBarrier,&bufBarrier + 1} });
 
 			IGPUCommandBuffer::SBufferCopy region = { .size = size };
 			re &= cmd->copyBuffer(importedbuf.get(), stagingbuf.get(), 1, &region);
 
 			IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> imgBarrier = {
 				.barrier = { 
 					.dep = { 
-						// .srcStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS,
-						// .srcAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS | ACCESS_FLAGS::MEMORY_WRITE_BITS,
 						.dstStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, 
 						.dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS | ACCESS_FLAGS::MEMORY_WRITE_BITS,
 					},
@@ -293,11 +245,11 @@ class CUDA2VKApp : public examples::MonoSystemMonoLoggerApplication
 					.levelCount = 1u,
 					.layerCount = 1u,
 				},
-				.oldLayout = IImage::LAYOUT::PREINITIALIZED,
+				.oldLayout = IImage::LAYOUT::UNDEFINED,
 				.newLayout = IImage::LAYOUT::TRANSFER_SRC_OPTIMAL,
 			};
 
-			re &= cmd->pipelineBarrier(EDF_NONE, {.imgBarrierCount = 1, .imgBarriers = &imgBarrier });
+			re &= cmd->pipelineBarrier(EDF_NONE, { .imgBarriers = {&imgBarrier,&imgBarrier + 1} });
 
 			IImage::SBufferCopy imgRegion = {
 				.imageSubresource = {
@@ -329,15 +281,15 @@ class CUDA2VKApp : public examples::MonoSystemMonoLoggerApplication
 	void kernelCallback()
 	{
 		// Make sure we are also done with the readback
-		auto wait = std::array{ILogicalDevice::SSemaphoreWaitInfo{.semaphore = sema.get(), .value = 2}};
-		logicalDevice->waitForSemaphores(wait, true, -1);
+		auto wait = std::array{ISemaphore::SWaitInfo{.semaphore = sema.get(), .value = 2}};
+		m_device->waitForSemaphores(wait, true, -1);
 
 		float* A = reinterpret_cast<float*>(cpubuffers[0]->getPointer());
 		float* B = reinterpret_cast<float*>(cpubuffers[1]->getPointer());
 		float* CBuf = reinterpret_cast<float*>(stagingbuf->getBoundMemory().memory->getMappedPointer());
 		float* CImg = reinterpret_cast<float*>(stagingbuf2->getBoundMemory().memory->getMappedPointer());
 
-		 assert(!memcmp(CBuf, CImg, size));
+		assert(!memcmp(CBuf, CImg, size));
 
 		for (auto i = 0; i < numElements; i++)
 		{
@@ -354,56 +306,5 @@ class CUDA2VKApp : public examples::MonoSystemMonoLoggerApplication
 	// Platforms like WASM expect the main entry point to periodically return control, hence if you want a crossplatform app, you have to let the framework deal with your "game loop"
 	void workLoopBody() override {}
 };
-//
-//int main(int argc, char** argv)
-//{
-//	auto initOutput = CommonAPI::InitWithDefaultExt(CommonAPI::InitParams{
-//		.appName = { "63.CUDAInterop" },
-//		.apiType = EAT_VULKAN, 
-//		.swapchainImageUsage = IImage::EUF_NONE,
-//	});
-//
-//	auto& system = initOutput.system;
-//	auto& apiConnection = initOutput.apiConnection;
-//	auto& physicalDevice = initOutput.physicalDevice;
-//	auto& logicalDevice = initOutput.logicalDevice;
-//	auto& utilities = initOutput.utilities;
-//	auto& queues = initOutput.queues;
-//	auto& logger = initOutput.logger;
-//
-//	assert(physicalDevice->getLimits().externalMemory);
-//	auto cudaHandler = CCUDAHandler::create(system.get(), smart_refctd_ptr<ILogger>(logger));
-//	assert(cudaHandler);
-//	auto cudaDevice = cudaHandler->createDevice(smart_refctd_ptr_dynamic_cast<CVulkanConnection>(apiConnection), physicalDevice);
-//	auto& cu = cudaHandler->getCUDAFunctionTable();	
-//
-//	smart_refctd_ptr<ICPUBuffer> ptx;
-//	CUmodule   module;
-//	CUfunction kernel;
-//	CUstream   stream;
-//
-//	{
-//		ISystem::future_t<smart_refctd_ptr<IFile>> fut;
-//		system->createFile(fut, "../vectorAdd_kernel.cu", IFileBase::ECF_READ);
-//	/*	auto [ptx_, res] = cudaHandler->compileDirectlyToPTX(fut.copy().get(), cudaDevice->geDefaultCompileOptions());
-//		ASSERT_SUCCESS_NV(res);
-//		ptx = std::move(ptx_);*/
-//	}
-//
-//	//ASSERT_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr));
-//	//ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd"));
-//	//ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));
-//
-//	//{
-//	//	auto cuda2vk = CUDA2VK(cudaHandler, cudaDevice, utilities.get(), logicalDevice.get(), queues.data());
-//	//	cuda2vk.launchKernel(kernel, stream);
-//	//	ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream));
-//	//}
-//
-//	//ASSERT_SUCCESS(cu.pcuModuleUnload(module));
-//	//ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream));
-//
-//	return 0;
-//}
 
 NBL_MAIN_FUNC(CUDA2VKApp)
diff --git a/63.CUDAInterop/pipeline.groovy → 63_CUDAInterop/pipeline.groovy b/63.CUDAInterop/pipeline.groovy → 63_CUDAInterop/pipeline.groovy
diff --git a/63.CUDAInterop/vectorAdd_kernel.cu → 63_CUDAInterop/vectorAdd_kernel.cu b/63.CUDAInterop/vectorAdd_kernel.cu → 63_CUDAInterop/vectorAdd_kernel.cu
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -65,6 +65,6 @@ if(NBL_BUILD_EXAMPLES)
 	#add_subdirectory(61_UI EXCLUDE_FROM_ALL)
 	add_subdirectory(62_CAD EXCLUDE_FROM_ALL)
 	add_subdirectory(62_SchusslerTest EXCLUDE_FROM_ALL)
-    add_subdirectory(63.CUDAInterop EXCLUDE_FROM_ALL)
+    add_subdirectory(63_CUDAInterop EXCLUDE_FROM_ALL)
 	add_subdirectory(0_ImportanceSamplingEnvMaps EXCLUDE_FROM_ALL) #TODO: integrate back into 42
 endif()
diff --git a/common/MonoDeviceApplication.hpp b/common/MonoDeviceApplication.hpp
@@ -245,15 +245,14 @@ class MonoDeviceApplication : public virtual MonoSystemMonoLoggerApplication
 			return retval;
 		}
 
-		// virtual to allow aliasing and total flexibility
-		virtual video::IQueue* getComputeQueue() const
+		// virtual so derived apps can alias queues; yields queue 0 of the first family whose queueFlags satisfy hasFlags(flags), nullptr if none does
+		virtual video::IQueue* getQueue(video::IQueue::FAMILY_FLAGS flags) const
 		{
 			// In the default implementation of everything I asked only for one queue from first compute family
 			const auto familyProperties = m_device->getPhysicalDevice()->getQueueFamilyProperties();
-			for (auto i=0u; i<familyProperties.size(); i++)
-			if (familyProperties[i].queueFlags.hasFlags(video::IQueue::FAMILY_FLAGS::COMPUTE_BIT))
-				return m_device->getQueue(i,0);
-
+			for (auto i = 0u; i < familyProperties.size(); i++)
+				if (familyProperties[i].queueFlags.hasFlags(flags)) // honour the caller's request instead of hard-coding COMPUTE_BIT
+					return m_device->getQueue(i, 0);
 			return nullptr;
 		}