Skip to content

Commit d15bb7a

Browse files
GPU Upload Heap & UMA. (#1421)
* Initial work for GPU upload heap & UMA.
* Finish D3D12 Support.
* Rework the logic for the GPU Upload Heap fallback.
* Only enable UMA on Vulkan on integrated GPUs.
* Fix D3D12 fallback condition.

---------

Co-authored-by: Dario <dariosamo@gmail.com>
1 parent 1c1dc09 commit d15bb7a

File tree

6 files changed

+113
-36
lines changed

6 files changed

+113
-36
lines changed

UnleashedRecomp/CMakeLists.txt

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -310,7 +310,11 @@ endif()
310310
if (UNLEASHED_RECOMP_D3D12)
311311
find_package(directx-headers CONFIG REQUIRED)
312312
find_package(directx12-agility CONFIG REQUIRED)
313-
target_compile_definitions(UnleashedRecomp PRIVATE UNLEASHED_RECOMP_D3D12)
313+
target_compile_definitions(UnleashedRecomp PRIVATE
314+
UNLEASHED_RECOMP_D3D12
315+
D3D12MA_USING_DIRECTX_HEADERS
316+
D3D12MA_OPTIONS16_SUPPORTED
317+
)
314318
endif()
315319

316320
if (CMAKE_SYSTEM_NAME MATCHES "Linux")

UnleashedRecomp/gpu/rhi/plume_d3d12.cpp

Lines changed: 36 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -442,6 +442,8 @@ namespace plume {
442442
return D3D12_HEAP_TYPE_UPLOAD;
443443
case RenderHeapType::READBACK:
444444
return D3D12_HEAP_TYPE_READBACK;
445+
case RenderHeapType::GPU_UPLOAD:
446+
return D3D12_HEAP_TYPE_GPU_UPLOAD;
445447
default:
446448
assert(false && "Unknown heap type.");
447449
return D3D12_HEAP_TYPE_DEFAULT;
@@ -2385,7 +2387,7 @@ namespace plume {
23852387
range.End = readRange->end;
23862388
}
23872389

2388-
void *outputData;
2390+
void *outputData = nullptr;
23892391
d3d->Map(subresource, (readRange != nullptr) ? &range : nullptr, &outputData);
23902392
return outputData;
23912393
}
@@ -2629,14 +2631,22 @@ namespace plume {
26292631

26302632
// D3D12Pool
26312633

2632-
D3D12Pool::D3D12Pool(D3D12Device *device, const RenderPoolDesc &desc) {
2634+
D3D12Pool::D3D12Pool(D3D12Device *device, const RenderPoolDesc &desc, bool gpuUploadHeapFallback) {
26332635
assert(device != nullptr);
26342636

26352637
this->device = device;
26362638
this->desc = desc;
26372639

26382640
D3D12MA::POOL_DESC poolDesc = {};
2639-
poolDesc.HeapProperties.Type = toD3D12(desc.heapType);
2641+
2642+
// When using an UMA architecture without explicit support for GPU Upload heaps, we instead just make a custom heap with the same properties as Upload heaps.
2643+
if ((desc.heapType == RenderHeapType::GPU_UPLOAD) && gpuUploadHeapFallback) {
2644+
poolDesc.HeapProperties = device->d3d->GetCustomHeapProperties(0, D3D12_HEAP_TYPE_UPLOAD);
2645+
}
2646+
else {
2647+
poolDesc.HeapProperties.Type = toD3D12(desc.heapType);
2648+
}
2649+
26402650
poolDesc.MinBlockCount = desc.minBlockCount;
26412651
poolDesc.MaxBlockCount = desc.maxBlockCount;
26422652
poolDesc.Flags |= desc.useLinearAlgorithm ? D3D12MA::POOL_FLAG_ALGORITHM_LINEAR : D3D12MA::POOL_FLAG_NONE;
@@ -3390,13 +3400,15 @@ namespace plume {
33903400
if (SUCCEEDED(res)) {
33913401
triangleFanSupportOption = d3d12Options15.TriangleFanSupported;
33923402
}
3393-
3394-
// Check if dynamic depth bias is supported.
3403+
3404+
// Check if dynamic depth bias and GPU upload heap are supported.
33953405
bool dynamicDepthBiasOption = false;
3406+
bool gpuUploadHeapOption = false;
33963407
D3D12_FEATURE_DATA_D3D12_OPTIONS16 d3d12Options16 = {};
33973408
res = deviceOption->CheckFeatureSupport(D3D12_FEATURE_D3D12_OPTIONS16, &d3d12Options16, sizeof(d3d12Options16));
33983409
if (SUCCEEDED(res)) {
33993410
dynamicDepthBiasOption = d3d12Options16.DynamicDepthBiasSupported;
3411+
gpuUploadHeapOption = d3d12Options16.GPUUploadHeapSupported;
34003412
}
34013413

34023414
// Check if the architecture has UMA.
@@ -3431,6 +3443,11 @@ namespace plume {
34313443
capabilities.triangleFan = triangleFanSupportOption;
34323444
capabilities.dynamicDepthBias = dynamicDepthBiasOption;
34333445
capabilities.uma = uma;
3446+
3447+
// Pretend GPU Upload heaps are supported if UMA is supported, as the backend has a workaround using a custom pool for it.
3448+
capabilities.gpuUploadHeap = uma || gpuUploadHeapOption;
3449+
gpuUploadHeapFallback = uma && !gpuUploadHeapOption;
3450+
34343451
description.name = deviceName;
34353452
description.dedicatedVideoMemory = adapterDesc.DedicatedVideoMemory;
34363453
description.vendor = RenderDeviceVendor(adapterDesc.VendorId);
@@ -3528,6 +3545,13 @@ namespace plume {
35283545
colorTargetHeapAllocator = std::make_unique<D3D12DescriptorHeapAllocator>(this, TargetDescriptorHeapSize, D3D12_DESCRIPTOR_HEAP_TYPE_RTV);
35293546
depthTargetHeapAllocator = std::make_unique<D3D12DescriptorHeapAllocator>(this, TargetDescriptorHeapSize, D3D12_DESCRIPTOR_HEAP_TYPE_DSV);
35303547

3548+
// Create the custom upload pool that will be used as the fallback when using an UMA architecture without explicit support for GPU Upload heaps.
3549+
if (gpuUploadHeapFallback) {
3550+
RenderPoolDesc poolDesc;
3551+
poolDesc.heapType = RenderHeapType::GPU_UPLOAD;
3552+
customUploadPool = std::make_unique<D3D12Pool>(this, poolDesc, true);
3553+
}
3554+
35313555
// Create a command queue only for retrieving the timestamp frequency. Delete it immediately afterwards.
35323556
std::unique_ptr<D3D12CommandQueue> timestampCommandQueue = std::make_unique<D3D12CommandQueue>(this, RenderCommandListType::DIRECT);
35333557
res = timestampCommandQueue->d3d->GetTimestampFrequency(&timestampFrequency);
@@ -3577,7 +3601,12 @@ namespace plume {
35773601
}
35783602

35793603
std::unique_ptr<RenderBuffer> D3D12Device::createBuffer(const RenderBufferDesc &desc) {
3580-
return std::make_unique<D3D12Buffer>(this, nullptr, desc);
3604+
if ((desc.heapType == RenderHeapType::GPU_UPLOAD) && gpuUploadHeapFallback) {
3605+
return std::make_unique<D3D12Buffer>(this, customUploadPool.get(), desc);
3606+
}
3607+
else {
3608+
return std::make_unique<D3D12Buffer>(this, nullptr, desc);
3609+
}
35813610
}
35823611

35833612
std::unique_ptr<RenderTexture> D3D12Device::createTexture(const RenderTextureDesc &desc) {
@@ -3589,7 +3618,7 @@ namespace plume {
35893618
}
35903619

35913620
std::unique_ptr<RenderPool> D3D12Device::createPool(const RenderPoolDesc &desc) {
3592-
return std::make_unique<D3D12Pool>(this, desc);
3621+
return std::make_unique<D3D12Pool>(this, desc, gpuUploadHeapFallback);
35933622
}
35943623

35953624
std::unique_ptr<RenderPipelineLayout> D3D12Device::createPipelineLayout(const RenderPipelineLayoutDesc &desc) {

UnleashedRecomp/gpu/rhi/plume_d3d12.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -329,7 +329,7 @@ namespace plume {
329329
D3D12Device *device = nullptr;
330330
RenderPoolDesc desc;
331331

332-
D3D12Pool(D3D12Device *device, const RenderPoolDesc &desc);
332+
D3D12Pool(D3D12Device *device, const RenderPoolDesc &desc, bool gpuUploadHeapFallback);
333333
~D3D12Pool() override;
334334
std::unique_ptr<RenderBuffer> createBuffer(const RenderBufferDesc &desc) override;
335335
std::unique_ptr<RenderTexture> createTexture(const RenderTextureDesc &desc) override;
@@ -430,9 +430,11 @@ namespace plume {
430430
std::unique_ptr<D3D12DescriptorHeapAllocator> samplerHeapAllocator;
431431
std::unique_ptr<D3D12DescriptorHeapAllocator> colorTargetHeapAllocator;
432432
std::unique_ptr<D3D12DescriptorHeapAllocator> depthTargetHeapAllocator;
433+
std::unique_ptr<D3D12Pool> customUploadPool;
433434
RenderDeviceCapabilities capabilities;
434435
RenderDeviceDescription description;
435436
uint64_t timestampFrequency = 1;
437+
bool gpuUploadHeapFallback = false;
436438

437439
D3D12Device(D3D12Interface *renderInterface, const std::string &preferredDeviceName);
438440
~D3D12Device() override;

UnleashedRecomp/gpu/rhi/plume_render_interface_types.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -351,7 +351,8 @@ namespace plume {
351351
UNKNOWN,
352352
DEFAULT,
353353
UPLOAD,
354-
READBACK
354+
READBACK,
355+
GPU_UPLOAD
355356
};
356357

357358
enum class RenderTextureArrangement {
@@ -1807,6 +1808,9 @@ namespace plume {
18071808

18081809
// UMA.
18091810
bool uma = false;
1811+
1812+
// GPU Upload heap.
1813+
bool gpuUploadHeap = false;
18101814
};
18111815

18121816
struct RenderInterfaceCapabilities {

UnleashedRecomp/gpu/rhi/plume_vulkan.cpp

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -808,6 +808,12 @@ namespace plume {
808808
bufferInfo.usage |= VK_BUFFER_USAGE_TRANSFER_DST_BIT;
809809
createInfo.flags |= VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT;
810810
break;
811+
case RenderHeapType::GPU_UPLOAD:
812+
bufferInfo.usage |= VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
813+
bufferInfo.usage |= VK_BUFFER_USAGE_TRANSFER_DST_BIT;
814+
createInfo.flags |= VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT;
815+
createInfo.requiredFlags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
816+
break;
811817
default:
812818
assert(false && "Unknown heap type.");
813819
break;
@@ -833,7 +839,7 @@ namespace plume {
833839
}
834840

835841
if (res != VK_SUCCESS) {
836-
fprintf(stderr, "vkCreateBuffer failed with error code 0x%X.\n", res);
842+
fprintf(stderr, "vmaCreateBuffer failed with error code 0x%X.\n", res);
837843
return;
838844
}
839845
}
@@ -3887,6 +3893,15 @@ namespace plume {
38873893
VkDeviceSize memoryHeapSize = 0;
38883894
const VkPhysicalDeviceMemoryProperties *memoryProps = nullptr;
38893895
vmaGetMemoryProperties(allocator, &memoryProps);
3896+
3897+
constexpr VkMemoryPropertyFlags uploadHeapPropertyFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
3898+
bool hasHostVisibleDeviceLocalMemory = false;
3899+
for (uint32_t i = 0; i < memoryProps->memoryTypeCount; i++) {
3900+
if ((memoryProps->memoryTypes[i].propertyFlags & uploadHeapPropertyFlags) == uploadHeapPropertyFlags) {
3901+
hasHostVisibleDeviceLocalMemory = true;
3902+
}
3903+
}
3904+
38903905
for (uint32_t i = 0; i < memoryProps->memoryHeapCount; i++) {
38913906
if (memoryProps->memoryHeaps[i].flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT) {
38923907
memoryHeapSize = std::max(memoryProps->memoryHeaps[i].size, memoryHeapSize);
@@ -3907,6 +3922,8 @@ namespace plume {
39073922
capabilities.preferHDR = memoryHeapSize > (512 * 1024 * 1024);
39083923
capabilities.triangleFan = true;
39093924
capabilities.dynamicDepthBias = true;
3925+
capabilities.uma = (description.type == RenderDeviceType::INTEGRATED) && hasHostVisibleDeviceLocalMemory;
3926+
capabilities.gpuUploadHeap = capabilities.uma;
39103927

39113928
// Fill Vulkan-only capabilities.
39123929
loadStoreOpNoneSupported = supportedOptionalExtensions.find(VK_EXT_LOAD_STORE_OP_NONE_EXTENSION_NAME) != supportedOptionalExtensions.end();

UnleashedRecomp/gpu/video.cpp

Lines changed: 46 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -2114,40 +2114,54 @@ static void* LockVertexBuffer(GuestBuffer* buffer, uint32_t, uint32_t, uint32_t
21142114
return LockBuffer(buffer, flags);
21152115
}
21162116

2117+
static std::atomic<uint32_t> g_bufferUploadCount = 0;
2118+
21172119
template<typename T>
21182120
static void UnlockBuffer(GuestBuffer* buffer, bool useCopyQueue)
21192121
{
2120-
auto uploadBuffer = g_device->createBuffer(RenderBufferDesc::UploadBuffer(buffer->dataSize));
2121-
2122-
auto dest = reinterpret_cast<T*>(uploadBuffer->map());
2123-
auto src = reinterpret_cast<const T*>(buffer->mappedMemory);
2124-
2125-
for (size_t i = 0; i < buffer->dataSize; i += sizeof(T))
2126-
{
2127-
*dest = ByteSwap(*src);
2128-
++dest;
2129-
++src;
2130-
}
2122+
auto copyBuffer = [&](T* dest)
2123+
{
2124+
auto src = reinterpret_cast<const T*>(buffer->mappedMemory);
21312125

2132-
uploadBuffer->unmap();
2126+
for (size_t i = 0; i < buffer->dataSize; i += sizeof(T))
2127+
{
2128+
*dest = ByteSwap(*src);
2129+
++dest;
2130+
++src;
2131+
}
2132+
};
21332133

2134-
if (useCopyQueue)
2134+
if (useCopyQueue && g_capabilities.gpuUploadHeap)
21352135
{
2136-
ExecuteCopyCommandList([&]
2137-
{
2138-
g_copyCommandList->copyBufferRegion(buffer->buffer->at(0), uploadBuffer->at(0), buffer->dataSize);
2139-
});
2136+
copyBuffer(reinterpret_cast<T*>(buffer->buffer->map()));
2137+
buffer->buffer->unmap();
21402138
}
21412139
else
21422140
{
2143-
auto& commandList = g_commandLists[g_frame];
2141+
auto uploadBuffer = g_device->createBuffer(RenderBufferDesc::UploadBuffer(buffer->dataSize));
2142+
copyBuffer(reinterpret_cast<T*>(uploadBuffer->map()));
2143+
uploadBuffer->unmap();
21442144

2145-
commandList->barriers(RenderBarrierStage::COPY, RenderBufferBarrier(buffer->buffer.get(), RenderBufferAccess::WRITE));
2146-
commandList->copyBufferRegion(buffer->buffer->at(0), uploadBuffer->at(0), buffer->dataSize);
2147-
commandList->barriers(RenderBarrierStage::GRAPHICS, RenderBufferBarrier(buffer->buffer.get(), RenderBufferAccess::READ));
2145+
if (useCopyQueue)
2146+
{
2147+
ExecuteCopyCommandList([&]
2148+
{
2149+
g_copyCommandList->copyBufferRegion(buffer->buffer->at(0), uploadBuffer->at(0), buffer->dataSize);
2150+
});
2151+
}
2152+
else
2153+
{
2154+
auto& commandList = g_commandLists[g_frame];
2155+
2156+
commandList->barriers(RenderBarrierStage::COPY, RenderBufferBarrier(buffer->buffer.get(), RenderBufferAccess::WRITE));
2157+
commandList->copyBufferRegion(buffer->buffer->at(0), uploadBuffer->at(0), buffer->dataSize);
2158+
commandList->barriers(RenderBarrierStage::GRAPHICS, RenderBufferBarrier(buffer->buffer.get(), RenderBufferAccess::READ));
21482159

2149-
g_tempBuffers[g_frame].emplace_back(std::move(uploadBuffer));
2160+
g_tempBuffers[g_frame].emplace_back(std::move(uploadBuffer));
2161+
}
21502162
}
2163+
2164+
g_bufferUploadCount++;
21512165
}
21522166

21532167
template<typename T>
@@ -2325,10 +2339,11 @@ static void DrawProfiler()
23252339
std::lock_guard lock(g_userHeap.physicalMutex);
23262340
physicalDiagnostics = o1heapGetDiagnostics(g_userHeap.physicalHeap);
23272341
}
2328-
2342+
23292343
ImGui::Text("Heap Allocated: %d MB", int32_t(diagnostics.allocated / (1024 * 1024)));
23302344
ImGui::Text("Physical Heap Allocated: %d MB", int32_t(physicalDiagnostics.allocated / (1024 * 1024)));
23312345
ImGui::Text("GPU Waits: %d", int32_t(g_waitForGPUCount));
2346+
ImGui::Text("Buffer Uploads: %d", int32_t(g_bufferUploadCount));
23322347
ImGui::NewLine();
23332348

23342349
ImGui::Text("Present Wait: %s", g_capabilities.presentWait ? "Supported" : "Unsupported");
@@ -2344,6 +2359,7 @@ static void DrawProfiler()
23442359
ImGui::Text("Device Type: %s", DeviceTypeName(g_device->getDescription().type));
23452360
ImGui::Text("VRAM: %.2f MiB", (double)(g_device->getDescription().dedicatedVideoMemory) / (1024.0 * 1024.0));
23462361
ImGui::Text("UMA: %s", g_capabilities.uma ? "Supported" : "Unsupported");
2362+
ImGui::Text("GPU Upload Heap: %s", g_capabilities.gpuUploadHeap ? "Supported" : "Unsupported");
23472363

23482364
const char* sdlVideoDriver = SDL_GetCurrentVideoDriver();
23492365
if (sdlVideoDriver != nullptr)
@@ -3024,10 +3040,15 @@ static GuestTexture* CreateTexture(uint32_t width, uint32_t height, uint32_t dep
30243040
return texture;
30253041
}
30263042

3043+
static RenderHeapType GetBufferHeapType()
3044+
{
3045+
return g_capabilities.gpuUploadHeap ? RenderHeapType::GPU_UPLOAD : RenderHeapType::DEFAULT;
3046+
}
3047+
30273048
static GuestBuffer* CreateVertexBuffer(uint32_t length)
30283049
{
30293050
auto buffer = g_userHeap.AllocPhysical<GuestBuffer>(ResourceType::VertexBuffer);
3030-
buffer->buffer = g_device->createBuffer(RenderBufferDesc::VertexBuffer(length, RenderHeapType::DEFAULT, RenderBufferFlag::INDEX));
3051+
buffer->buffer = g_device->createBuffer(RenderBufferDesc::VertexBuffer(length, GetBufferHeapType(), RenderBufferFlag::INDEX));
30313052
buffer->dataSize = length;
30323053
#ifdef _DEBUG
30333054
buffer->buffer->setName(fmt::format("Vertex Buffer {:X}", g_memory.MapVirtual(buffer)));
@@ -3038,7 +3059,7 @@ static GuestBuffer* CreateVertexBuffer(uint32_t length)
30383059
static GuestBuffer* CreateIndexBuffer(uint32_t length, uint32_t, uint32_t format)
30393060
{
30403061
auto buffer = g_userHeap.AllocPhysical<GuestBuffer>(ResourceType::IndexBuffer);
3041-
buffer->buffer = g_device->createBuffer(RenderBufferDesc::IndexBuffer(length, RenderHeapType::DEFAULT));
3062+
buffer->buffer = g_device->createBuffer(RenderBufferDesc::IndexBuffer(length, GetBufferHeapType()));
30423063
buffer->dataSize = length;
30433064
buffer->format = ConvertFormat(format);
30443065
buffer->guestFormat = format;

0 commit comments

Comments
 (0)