Skip to content

Cuda interop vk13 #90

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 21 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
File renamed without changes.
143 changes: 22 additions & 121 deletions 63.CUDAInterop/main.cpp → 63_CUDAInterop/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#include "nbl/video/CCUDASharedMemory.h"
#include "nbl/video/CCUDASharedSemaphore.h"

#include "../common./MonoSystemMonoLoggerApplication.hpp"
#include "../common/MonoDeviceApplication.hpp"

using namespace nbl;
using namespace core;
Expand Down Expand Up @@ -47,17 +47,16 @@ size_t size = sizeof(float) * numElements;
static_assert(false);
#endif

class CUDA2VKApp : public examples::MonoSystemMonoLoggerApplication
class CUDA2VKApp : public examples::MonoDeviceApplication
{
using base_t = examples::MonoSystemMonoLoggerApplication;
using base_t = examples::MonoDeviceApplication;
public:
// Generally speaking because certain platforms delay initialization from main object construction you should just forward and not do anything in the ctor
using base_t::base_t;

smart_refctd_ptr<CCUDAHandler> cudaHandler;
smart_refctd_ptr<CCUDADevice> cudaDevice;
// IUtilities* util;
smart_refctd_ptr<ILogicalDevice> logicalDevice;

IQueue* queue;

std::array<smart_refctd_ptr<ICPUBuffer>, 2> cpubuffers;
Expand All @@ -76,63 +75,19 @@ class CUDA2VKApp : public examples::MonoSystemMonoLoggerApplication
// Remember to call the base class initialization!
if (!base_t::onAppInitialized(std::move(system)))
return false;
// `system` could have been null (see the comments in `MonoSystemMonoLoggerApplication::onAppInitialized` as for why)
// use `MonoSystemMonoLoggerApplication::m_system` throughout the example instead!

// You should already know Vulkan and come here to save on the boilerplate, if you don't know what instances and instance extensions are, then find out.
smart_refctd_ptr<CVulkanConnection> api;
{
// You generally want to default initialize any parameter structs
IAPIConnection::SFeatures apiFeaturesToEnable = {};
// generally you want to make your life easier during development
apiFeaturesToEnable.validations = true;
apiFeaturesToEnable.synchronizationValidation = true;
// want to make sure we have this so we can name resources for viewing in RenderDoc captures
apiFeaturesToEnable.debugUtils = true;
// create our Vulkan instance
if (!(api = CVulkanConnection::create(smart_refctd_ptr(m_system), 0, _NBL_APP_NAME_, smart_refctd_ptr(base_t::m_logger), apiFeaturesToEnable)))
return logFail("Failed to crate an IAPIConnection!");
}

// We won't go deep into performing physical device selection in this example, we'll take any device with a compute queue.
// Nabla has its own set of required baseline Vulkan features anyway, it won't report any device that doesn't meet them.
IPhysicalDevice* physDev = nullptr;
ILogicalDevice::SCreationParams params = {};
// we will only deal with a single queue in this example
params.queueParamsCount = 1;
params.queueParams[0].count = 1;
params.featuresToEnable;
for (auto physDevIt = api->getPhysicalDevices().begin(); physDevIt != api->getPhysicalDevices().end(); physDevIt++)
{
const auto familyProps = (*physDevIt)->getQueueFamilyProperties();
// this is the only "complicated" part, we want to create a queue that supports compute pipelines
for (auto i = 0; i < familyProps.size(); i++)
if (familyProps[i].queueFlags.hasFlags(IQueue::FAMILY_FLAGS::COMPUTE_BIT))
{
physDev = *physDevIt;
params.queueParams[0].familyIndex = i;
break;
}
}
if (!physDev)
return logFail("Failed to find any Physical Devices with Compute capable Queue Families!");

{
auto& limits = physDev->getLimits();
auto& limits = m_physicalDevice->getLimits();
if (!limits.externalMemoryWin32 || !limits.externalFenceWin32 || !limits.externalSemaphoreWin32)
return logFail("Physical device does not support the required extensions");

cudaHandler = CCUDAHandler::create(system.get(), smart_refctd_ptr<ILogger>(m_logger));
assert(cudaHandler);
cudaDevice = cudaHandler->createDevice(smart_refctd_ptr_dynamic_cast<CVulkanConnection>(api), physDev);
cudaDevice = cudaHandler->createDevice(smart_refctd_ptr_dynamic_cast<CVulkanConnection>(m_api), m_physicalDevice);
}

// logical devices need to be created from physical devices which will actually let us create vulkan objects and use the physical device
logicalDevice = physDev->createLogicalDevice(std::move(params));
if (!logicalDevice)
return logFail("Failed to create a Logical Device!");

queue = logicalDevice->getQueue(params.queueParams[0].familyIndex, 0);
queue = base_t::getComputeQueue();

createResources();

Expand Down Expand Up @@ -160,6 +115,7 @@ class CUDA2VKApp : public examples::MonoSystemMonoLoggerApplication
ASSERT_SUCCESS(cu.pcuModuleUnload(module));
ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream));

m_device->waitIdle();
return true;
}

Expand All @@ -180,19 +136,19 @@ class CUDA2VKApp : public examples::MonoSystemMonoLoggerApplication
ASSERT_SUCCESS(cudaDevice->createSharedMemory(&mem[1], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
ASSERT_SUCCESS(cudaDevice->createSharedMemory(&mem[2], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));

sema = logicalDevice->createSemaphore({ .externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32 });
sema = m_device->createSemaphore({ .externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32 });
ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cusema, sema.get()));
{
auto devmemory = mem[2]->exportAsMemory(logicalDevice.get());
auto devmemory = mem[2]->exportAsMemory(m_device.get());
assert(devmemory);
IGPUBuffer::SCreationParams params = {};
params.size = devmemory->getAllocationSize();
params.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT;
params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE;
importedbuf = logicalDevice->createBuffer(std::move(params));
importedbuf = m_device->createBuffer(std::move(params));
assert(importedbuf);
ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = importedbuf.get(), .binding = {.memory = devmemory.get() } };
bool re = logicalDevice->bindBufferMemory(1, &bindInfo);
bool re = m_device->bindBufferMemory(1, &bindInfo);
assert(re);
}

Expand All @@ -205,18 +161,18 @@ class CUDA2VKApp : public examples::MonoSystemMonoLoggerApplication
params.extent = { gridDim[0], blockDim[0], 1 };
params.mipLevels = 1;
params.arrayLayers = 1;
params.usage = IGPUImage::EUF_STORAGE_BIT | IGPUImage::EUF_TRANSFER_SRC_BIT;
params.usage = IGPUImage::EUF_TRANSFER_SRC_BIT;
params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE;
params.tiling = IGPUImage::TILING::LINEAR;
importedimg = mem[2]->exportAsImage(logicalDevice.get(), std::move(params));
importedimg = mem[2]->createAndBindImage(m_device.get(), std::move(params));
assert(importedimg);
}

commandPool = logicalDevice->createCommandPool(queue->getFamilyIndex(), {});
commandPool = m_device->createCommandPool(queue->getFamilyIndex(), {});
bool re = commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmd, smart_refctd_ptr(m_logger));
assert(re);

auto createStaging = [logicalDevice=logicalDevice]()
auto createStaging = [logicalDevice= m_device]()
{
auto buf = logicalDevice->createBuffer({ {.size = size, .usage = asset::IBuffer::EUF_TRANSFER_DST_BIT} });
auto req = buf->getMemoryReqs();
Expand Down Expand Up @@ -258,8 +214,6 @@ class CUDA2VKApp : public examples::MonoSystemMonoLoggerApplication
IGPUCommandBuffer::SBufferMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> bufBarrier = {
.barrier = {
.dep = {
// .srcStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS,
// .srcAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS | ACCESS_FLAGS::MEMORY_WRITE_BITS,
.dstStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS,
.dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS | ACCESS_FLAGS::MEMORY_WRITE_BITS,
},
Expand All @@ -271,16 +225,14 @@ class CUDA2VKApp : public examples::MonoSystemMonoLoggerApplication

bool re = true;
re &= cmd->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
re &= cmd->pipelineBarrier(EDF_NONE, { .bufBarrierCount = 1, .bufBarriers = &bufBarrier});
re &= cmd->pipelineBarrier(EDF_NONE, { .bufBarriers = std::span{&bufBarrier,&bufBarrier + 1} });

IGPUCommandBuffer::SBufferCopy region = { .size = size };
re &= cmd->copyBuffer(importedbuf.get(), stagingbuf.get(), 1, &region);

IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> imgBarrier = {
.barrier = {
.dep = {
// .srcStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS,
// .srcAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS | ACCESS_FLAGS::MEMORY_WRITE_BITS,
.dstStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS,
.dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS | ACCESS_FLAGS::MEMORY_WRITE_BITS,
},
Expand All @@ -293,11 +245,11 @@ class CUDA2VKApp : public examples::MonoSystemMonoLoggerApplication
.levelCount = 1u,
.layerCount = 1u,
},
.oldLayout = IImage::LAYOUT::PREINITIALIZED,
.oldLayout = IImage::LAYOUT::UNDEFINED,
.newLayout = IImage::LAYOUT::TRANSFER_SRC_OPTIMAL,
};

re &= cmd->pipelineBarrier(EDF_NONE, {.imgBarrierCount = 1, .imgBarriers = &imgBarrier });
re &= cmd->pipelineBarrier(EDF_NONE, { .imgBarriers = {&imgBarrier,&imgBarrier + 1} });

IImage::SBufferCopy imgRegion = {
.imageSubresource = {
Expand Down Expand Up @@ -329,15 +281,15 @@ class CUDA2VKApp : public examples::MonoSystemMonoLoggerApplication
void kernelCallback()
{
// Make sure we are also done with the readback
auto wait = std::array{ILogicalDevice::SSemaphoreWaitInfo{.semaphore = sema.get(), .value = 2}};
logicalDevice->waitForSemaphores(wait, true, -1);
auto wait = std::array{ISemaphore::SWaitInfo{.semaphore = sema.get(), .value = 2}};
m_device->waitForSemaphores(wait, true, -1);

float* A = reinterpret_cast<float*>(cpubuffers[0]->getPointer());
float* B = reinterpret_cast<float*>(cpubuffers[1]->getPointer());
float* CBuf = reinterpret_cast<float*>(stagingbuf->getBoundMemory().memory->getMappedPointer());
float* CImg = reinterpret_cast<float*>(stagingbuf2->getBoundMemory().memory->getMappedPointer());

assert(!memcmp(CBuf, CImg, size));
assert(!memcmp(CBuf, CImg, size));

for (auto i = 0; i < numElements; i++)
{
Expand All @@ -354,56 +306,5 @@ class CUDA2VKApp : public examples::MonoSystemMonoLoggerApplication
// Platforms like WASM expect the main entry point to periodically return control, hence if you want a cross-platform app, you have to let the framework deal with your "game loop".
// The body is intentionally empty: this override performs no work when it is called.
void workLoopBody() override {}
};
//
//int main(int argc, char** argv)
//{
// auto initOutput = CommonAPI::InitWithDefaultExt(CommonAPI::InitParams{
// .appName = { "63.CUDAInterop" },
// .apiType = EAT_VULKAN,
// .swapchainImageUsage = IImage::EUF_NONE,
// });
//
// auto& system = initOutput.system;
// auto& apiConnection = initOutput.apiConnection;
// auto& physicalDevice = initOutput.physicalDevice;
// auto& logicalDevice = initOutput.logicalDevice;
// auto& utilities = initOutput.utilities;
// auto& queues = initOutput.queues;
// auto& logger = initOutput.logger;
//
// assert(physicalDevice->getLimits().externalMemory);
// auto cudaHandler = CCUDAHandler::create(system.get(), smart_refctd_ptr<ILogger>(logger));
// assert(cudaHandler);
// auto cudaDevice = cudaHandler->createDevice(smart_refctd_ptr_dynamic_cast<CVulkanConnection>(apiConnection), physicalDevice);
// auto& cu = cudaHandler->getCUDAFunctionTable();
//
// smart_refctd_ptr<ICPUBuffer> ptx;
// CUmodule module;
// CUfunction kernel;
// CUstream stream;
//
// {
// ISystem::future_t<smart_refctd_ptr<IFile>> fut;
// system->createFile(fut, "../vectorAdd_kernel.cu", IFileBase::ECF_READ);
// /* auto [ptx_, res] = cudaHandler->compileDirectlyToPTX(fut.copy().get(), cudaDevice->geDefaultCompileOptions());
// ASSERT_SUCCESS_NV(res);
// ptx = std::move(ptx_);*/
// }
//
// //ASSERT_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr));
// //ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd"));
// //ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));
//
// //{
// // auto cuda2vk = CUDA2VK(cudaHandler, cudaDevice, utilities.get(), logicalDevice.get(), queues.data());
// // cuda2vk.launchKernel(kernel, stream);
// // ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream));
// //}
//
// //ASSERT_SUCCESS(cu.pcuModuleUnload(module));
// //ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream));
//
// return 0;
//}

// Application entry point for CUDA2VKApp.
// NOTE(review): NBL_MAIN_FUNC is defined outside this file; presumably it emits the platform `main` that constructs and runs the given application class — confirm against its definition.
NBL_MAIN_FUNC(CUDA2VKApp)
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,6 @@ if(NBL_BUILD_EXAMPLES)
#add_subdirectory(61_UI EXCLUDE_FROM_ALL)
add_subdirectory(62_CAD EXCLUDE_FROM_ALL)
add_subdirectory(62_SchusslerTest EXCLUDE_FROM_ALL)
add_subdirectory(63.CUDAInterop EXCLUDE_FROM_ALL)
add_subdirectory(63_CUDAInterop EXCLUDE_FROM_ALL)
add_subdirectory(0_ImportanceSamplingEnvMaps EXCLUDE_FROM_ALL) #TODO: integrate back into 42
endif()
11 changes: 5 additions & 6 deletions common/MonoDeviceApplication.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -245,15 +245,14 @@ class MonoDeviceApplication : public virtual MonoSystemMonoLoggerApplication
return retval;
}

// Looks up a queue by capability.
// Walks the queue families reported by the physical device behind `m_device`, in index order, and returns
// queue 0 of the first family for which `queueFlags.hasFlags(flags)` holds; returns nullptr when no family matches.
// virtual to allow aliasing and total flexibility
virtual video::IQueue* getQueue(video::IQueue::FAMILY_FLAGS flags) const
{
	// In the default implementation of everything I asked only for one queue from first compute family
	// NOTE(review): because of the above, requesting `flags` that resolve to a family other than the first
	// compute one may name a queue that was never requested at device creation — confirm before relying on it.
	const auto familyProperties = m_device->getPhysicalDevice()->getQueueFamilyProperties();
	for (auto i = 0u; i < familyProperties.size(); i++)
	{
		// honour the caller's request instead of always matching COMPUTE_BIT
		if (familyProperties[i].queueFlags.hasFlags(flags))
			return m_device->getQueue(i, 0);
	}
	return nullptr;
}

// Compute-only convenience wrapper, kept so that existing callers of `getComputeQueue()` keep compiling
// and keep receiving queue 0 of the first compute-capable family (or nullptr when there is none).
// virtual to allow aliasing and total flexibility
virtual video::IQueue* getComputeQueue() const
{
	return getQueue(video::IQueue::FAMILY_FLAGS::COMPUTE_BIT);
}

Expand Down