-
Notifications
You must be signed in to change notification settings - Fork 65
Work on property pool HLSL impl #649
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 20 commits
a1747c6
adc4d57
d9ddf41
1707158
279c220
4be1a3c
3570c03
c44bb49
9460e24
88d1d00
52d6972
706000d
b625153
ef4b779
b8db8c9
1a0c998
99d80a7
61604ee
7ac728b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
// Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O. | ||
// This file is part of the "Nabla Engine". | ||
// For conditions of distribution and use, see copyright notice in nabla.h | ||
|
||
#ifndef _NBL_BUILTIN_HLSL_CONCEPTS_INCLUDED_ | ||
#define _NBL_BUILTIN_HLSL_CONCEPTS_INCLUDED_ | ||
|
||
#include <nbl/builtin/hlsl/cpp_compat/vector.hlsl> | ||
#include <nbl/builtin/hlsl/cpp_compat/matrix.hlsl> | ||
#include <nbl/builtin/hlsl/type_traits.hlsl> | ||
|
||
|
||
#if (__cplusplus >= 202002L && __cpp_concepts) | ||
|
||
// Concept-declaration macros for the C++20 path: each one expands to the native | ||
// `template` / `concept` / `requires` syntax. The #else branch of this header gives | ||
// the same macro names empty expansions for compilers without concepts. | ||
// | ||
// NBL_CONCEPT_TYPE_PARAMS(...) : emits the `template <...>` header of a concept. | ||
#define NBL_CONCEPT_TYPE_PARAMS(...) template <__VA_ARGS__> | ||
// NBL_CONCEPT_SIGNATURE(NAME, ...) : opens `concept NAME = requires(params)`; the | ||
// requirement list is supplied by a following NBL_CONCEPT_BODY. | ||
#define NBL_CONCEPT_SIGNATURE(NAME, ...) concept NAME = requires(__VA_ARGS__) | ||
// NBL_CONCEPT_BODY(...) : wraps the requirements in braces and terminates the declaration. | ||
#define NBL_CONCEPT_BODY(...) { __VA_ARGS__ }; | ||
// NBL_CONCEPT_ASSIGN(NAME, ...) : declares a concept as a plain constraint expression. | ||
#define NBL_CONCEPT_ASSIGN(NAME, ...) concept NAME = __VA_ARGS__; | ||
// NBL_REQUIRES(...) : emits a `requires` clause for constraining a template. | ||
#define NBL_REQUIRES(...) requires __VA_ARGS__ | ||
|
||
#include <concepts> | ||
|
||
// Concepts usable from Nabla HLSL/C++ shared headers. Everything in this namespace is | ||
// only declared when the enclosing C++20 `#if` is taken. | ||
namespace nbl | ||
{ | ||
namespace hlsl | ||
{ | ||
namespace concepts | ||
{ | ||
|
||
// Alias some of the std concepts in nbl. As this is C++20 only, we don't need to use | ||
// the macros here. | ||
// Satisfied when T and U are the same type. | ||
template <typename T, typename U> | ||
concept same_as = std::same_as<T, U>; | ||
|
||
// Forwards to std::derived_from with D as the derived type and B as the base type. | ||
template <typename D, typename B> | ||
concept derived_from = std::derived_from<D, B>; | ||
|
||
// Forwards to std::convertible_to with F as the source type and T as the target type. | ||
template <typename F, typename T> | ||
concept convertible_to = std::convertible_to<F, T>; | ||
|
||
// Forwards to std::assignable_from: T is the assigned-to type, F the assigned-from type. | ||
template <typename T, typename F> | ||
concept assignable_from = std::assignable_from<T, F>; | ||
|
||
// Forwards to std::common_with. | ||
template <typename T, typename U> | ||
concept common_with = std::common_with<T, U>; | ||
|
||
// Forwards to std::integral. | ||
template <typename T> | ||
concept integral = std::integral<T>; | ||
|
||
// Forwards to std::signed_integral. | ||
template <typename T> | ||
concept signed_integral = std::signed_integral<T>; | ||
|
||
// Forwards to std::unsigned_integral. | ||
template <typename T> | ||
concept unsigned_integral = std::unsigned_integral<T>; | ||
|
||
// Forwards to std::floating_point. | ||
template <typename T> | ||
concept floating_point = std::floating_point<T>; | ||
|
||
|
||
// Some other useful concepts. | ||
|
||
// Satisfied when T is the same type as at least one member of Ts (a fold over `||`, | ||
// so an empty Ts pack never satisfies it). | ||
template<typename T, typename... Ts> | ||
concept any_of = (same_as<T, Ts> || ...); | ||
|
||
// Satisfied by any type that models floating_point or integral as aliased above. | ||
template <typename T> | ||
concept scalar = floating_point<T> || integral<T>; | ||
|
||
// Satisfied when the is_vector trait reports true for T. | ||
// NOTE(review): is_vector is not declared in this file; presumably it comes from the | ||
// type_traits / cpp_compat headers included above - confirm. | ||
template <typename T> | ||
concept vectorial = is_vector<T>::value; | ||
|
||
// Satisfied when the is_matrix trait reports true for T (same provenance caveat as is_vector). | ||
template <typename T> | ||
concept matricial = is_matrix<T>::value; | ||
|
||
} | ||
} | ||
} | ||
|
||
#else | ||
|
||
// No C++20 concepts support: every concept macro expands to nothing, so declarations
// written with them degrade to plain unconstrained code instead of failing to compile.
// The set of names must mirror the C++20 branch above one-for-one.
#define NBL_CONCEPT_TYPE_PARAMS(...)
#define NBL_CONCEPT_SIGNATURE(NAME, ...)
#define NBL_CONCEPT_BODY(...)
// The C++20 branch defines NBL_CONCEPT_ASSIGN; without an empty counterpart here any
// use of it is an undeclared identifier when concepts are unavailable.
#define NBL_CONCEPT_ASSIGN(NAME, ...)
#define NBL_REQUIRES(...)
|
||
#endif | ||
|
||
#endif |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,163 @@ | ||
#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl" | ||
#include "nbl/builtin/hlsl/glsl_compat/core.hlsl" | ||
#include "nbl/builtin/hlsl/property_pool/transfer.hlsl" | ||
|
||
namespace nbl | ||
{ | ||
namespace hlsl | ||
{ | ||
namespace property_pools | ||
{ | ||
|
||
// Push-constant block for this shader; it supplies the transfer-command buffer address | ||
// and the [beginOffset, endOffset) range read by the copy loop and by main below. | ||
[[vk::push_constant]] GlobalPushContants globals; | ||
|
||
template<bool Fill, bool SrcIndexIota, bool DstIndexIota, uint64_t SrcIndexSizeLog2, uint64_t DstIndexSizeLog2> | ||
struct TransferLoop | ||
{ | ||
void iteration(uint propertyId, TransferRequest transferRequest, uint64_t invocationIndex) | ||
{ | ||
const uint64_t srcIndexSize = uint64_t(1) << SrcIndexSizeLog2; | ||
const uint64_t dstIndexSize = uint64_t(1) << DstIndexSizeLog2; | ||
|
||
// Fill: Always use offset 0 on src | ||
const uint64_t srcOffset = Fill ? 0 : invocationIndex * transferRequest.propertySize; | ||
const uint64_t dstOffset = invocationIndex * transferRequest.propertySize; | ||
|
||
// IOTA: Use the index as the fetching offset | ||
// Non IOTA: Read the address buffer ("index buffer") to select fetching offset | ||
const uint64_t srcAddressBufferOffset = SrcIndexIota ? srcOffset : vk::RawBufferLoad<uint32_t>(transferRequest.srcIndexAddr + srcOffset * sizeof(uint32_t)); | ||
const uint64_t dstAddressBufferOffset = DstIndexIota ? dstOffset : vk::RawBufferLoad<uint32_t>(transferRequest.dstIndexAddr + dstOffset * sizeof(uint32_t)); | ||
deprilula28 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
const uint64_t srcAddressMapped = transferRequest.srcAddr + srcAddressBufferOffset * srcIndexSize; | ||
const uint64_t dstAddressMapped = transferRequest.dstAddr + dstAddressBufferOffset * dstIndexSize; | ||
|
||
//vk::RawBufferStore<uint64_t>(transferRequest.dstAddr + invocationIndex * sizeof(uint64_t) * 2, srcAddressMapped,8); | ||
//vk::RawBufferStore<uint64_t>(transferRequest.dstAddr + invocationIndex * sizeof(uint64_t) * 2 + sizeof(uint64_t), dstAddressMapped,8); | ||
if (SrcIndexSizeLog2 == 0) {} // we can't write individual bytes | ||
else if (SrcIndexSizeLog2 == 1) vk::RawBufferStore<uint16_t>(dstAddressMapped, vk::RawBufferLoad<uint16_t>(srcAddressMapped)); | ||
deprilula28 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
else if (SrcIndexSizeLog2 == 2) vk::RawBufferStore<uint32_t>(dstAddressMapped, vk::RawBufferLoad<uint32_t>(srcAddressMapped)); | ||
else if (SrcIndexSizeLog2 == 3) vk::RawBufferStore<uint64_t>(dstAddressMapped, vk::RawBufferLoad<uint64_t>(srcAddressMapped)); | ||
} | ||
|
||
void copyLoop(uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize) | ||
{ | ||
uint64_t elementCount = uint64_t(transferRequest.elementCount32) | ||
| uint64_t(transferRequest.elementCountExtra) << 32; | ||
uint64_t lastInvocation = min(elementCount, globals.endOffset); | ||
for (uint64_t invocationIndex = globals.beginOffset + baseInvocationIndex; invocationIndex < lastInvocation; invocationIndex += dispatchSize) | ||
{ | ||
iteration(propertyId, transferRequest, invocationIndex); | ||
} | ||
} | ||
}; | ||
|
||
// For creating permutations of the functions based on parameters that are constant over the transfer request | ||
// These branches should all be scalar, and because of how templates are compiled statically, the loops shouldn't have any | ||
// branching within them | ||
// | ||
// Permutations: | ||
// 2 (fill or not) * 2 (src index iota or not) * 2 (dst index iota or not) * 4 (src index size) * 4 (dst index size) | ||
// Total amount of permutations: 128 | ||
|
||
template<bool Fill, bool SrcIndexIota, bool DstIndexIota, uint64_t SrcIndexSizeLog2> | ||
struct TransferLoopPermutationSrcIndexSizeLog | ||
{ | ||
void copyLoop(uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize) | ||
{ | ||
if (transferRequest.dstIndexSizeLog2 == 0) { TransferLoop<Fill, SrcIndexIota, DstIndexIota, SrcIndexSizeLog2, 0> loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } | ||
else if (transferRequest.dstIndexSizeLog2 == 1) { TransferLoop<Fill, SrcIndexIota, DstIndexIota, SrcIndexSizeLog2, 1> loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } | ||
else if (transferRequest.dstIndexSizeLog2 == 2) { TransferLoop<Fill, SrcIndexIota, DstIndexIota, SrcIndexSizeLog2, 2> loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } | ||
else /*if (transferRequest.dstIndexSizeLog2 == 3)*/ { TransferLoop<Fill, SrcIndexIota, DstIndexIota, SrcIndexSizeLog2, 3> loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } | ||
} | ||
}; | ||
|
||
template<bool Fill, bool SrcIndexIota, bool DstIndexIota> | ||
struct TransferLoopPermutationDstIota | ||
{ | ||
void copyLoop(uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize) | ||
{ | ||
if (transferRequest.srcIndexSizeLog2 == 0) { TransferLoopPermutationSrcIndexSizeLog<Fill, SrcIndexIota, DstIndexIota, 0> loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } | ||
else if (transferRequest.srcIndexSizeLog2 == 1) { TransferLoopPermutationSrcIndexSizeLog<Fill, SrcIndexIota, DstIndexIota, 1> loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } | ||
else if (transferRequest.srcIndexSizeLog2 == 2) { TransferLoopPermutationSrcIndexSizeLog<Fill, SrcIndexIota, DstIndexIota, 2> loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } | ||
else /*if (transferRequest.srcIndexSizeLog2 == 3)*/ { TransferLoopPermutationSrcIndexSizeLog<Fill, SrcIndexIota, DstIndexIota, 3> loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } | ||
} | ||
}; | ||
|
||
template<bool Fill, bool SrcIndexIota> | ||
struct TransferLoopPermutationSrcIota | ||
{ | ||
void copyLoop(uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize) | ||
{ | ||
bool dstIota = transferRequest.dstIndexAddr == 0; | ||
if (dstIota) { TransferLoopPermutationDstIota<Fill, SrcIndexIota, true> loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } | ||
else { TransferLoopPermutationDstIota<Fill, SrcIndexIota, false> loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } | ||
} | ||
}; | ||
|
||
// First dispatch stage after the fill flag has been fixed as the `Fill` template argument. | ||
// It decides the compile-time `SrcIndexIota` flag and forwards to the next stage. | ||
template<bool Fill> | ||
struct TransferLoopPermutationFill | ||
{ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. only use structs instead of templated functions when you need partial specialization There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure what you mean There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The struct functor only makes sense if:
template<typename Accessor, typename Compare>
uint32_t find_first(inout Accessor accessor, const Compare comparator); if neither of the above applies, just use a templated function There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ah ok, I think your original comment was the wrong way around There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
"only use structs instead of templated functions when you need partial specialization"
|
||
// All four arguments are forwarded unchanged to the selected permutation. | ||
void copyLoop(uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize) | ||
{ | ||
// A zero source index address means there is no source index buffer (IOTA path). | ||
bool srcIota = transferRequest.srcIndexAddr == 0; | ||
if (srcIota) { TransferLoopPermutationSrcIota<Fill, true> loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } | ||
else { TransferLoopPermutationSrcIota<Fill, false> loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } | ||
} | ||
}; | ||
|
||
// Body of the transfer shader. dispatchId.y selects which TransferRequest to process and | ||
// dispatchId.x is this invocation's starting element index within that request. | ||
// The request is read field by field from the command buffer addressed by the push | ||
// constants, then handed to the permutation chain starting at TransferLoopPermutationFill. | ||
// `device_capabilities` only provides the loop stride (see dispatchSize below). | ||
template<typename device_capabilities> | ||
void main(uint32_t3 dispatchId) | ||
{ | ||
const uint propertyId = dispatchId.y; | ||
const uint invocationIndex = dispatchId.x; | ||
|
||
// Loading transfer request from the pointer (can't use struct | ||
// with BDA on HLSL SPIRV) | ||
// Requests are laid out back to back, sizeof(TransferRequest) bytes apart. | ||
uint64_t transferCmdAddr = globals.transferCommandsAddress + sizeof(TransferRequest) * propertyId; | ||
TransferRequest transferRequest; | ||
// The four 64-bit addresses occupy the first 32 bytes; each load states 8-byte alignment. | ||
transferRequest.srcAddr = vk::RawBufferLoad<uint64_t>(transferCmdAddr,8); | ||
transferRequest.dstAddr = vk::RawBufferLoad<uint64_t>(transferCmdAddr + sizeof(uint64_t),8); | ||
transferRequest.srcIndexAddr = vk::RawBufferLoad<uint64_t>(transferCmdAddr + sizeof(uint64_t) * 2,8); | ||
transferRequest.dstIndexAddr = vk::RawBufferLoad<uint64_t>(transferCmdAddr + sizeof(uint64_t) * 3,8); | ||
Comment on lines
+126
to
+129
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. make a wrapper for |
||
// Remaining elements are part of the same bitfield | ||
// TODO: Do this only using raw buffer load? | ||
// Layout of the fifth 64-bit word, lowest bit first: | ||
//   bits  0..31 elementCount32, 32..34 elementCountExtra, 35..58 propertySize, | ||
//   bit      59 fill, 60..61 srcIndexSizeLog2, 62..63 dstIndexSizeLog2 | ||
// No masks are applied below: each shifted value is narrowed only by being assigned | ||
// to the corresponding bitfield member of TransferRequest. | ||
uint64_t bitfieldType = vk::RawBufferLoad<uint64_t>(transferCmdAddr + sizeof(uint64_t) * 4,8); | ||
transferRequest.elementCount32 = uint32_t(bitfieldType); | ||
transferRequest.elementCountExtra = uint32_t(bitfieldType >> 32); | ||
transferRequest.propertySize = uint32_t(bitfieldType >> (32 + 3)); | ||
transferRequest.fill = uint32_t(bitfieldType >> (32 + 3 + 24)); | ||
transferRequest.srcIndexSizeLog2 = uint32_t(bitfieldType >> (32 + 3 + 24 + 1)); | ||
transferRequest.dstIndexSizeLog2 = uint32_t(bitfieldType >> (32 + 3 + 24 + 1 + 2)); | ||
|
||
// Passed to copyLoop as the step between successive element indices handled by one invocation. | ||
// NOTE(review): presumably expected to equal the number of invocations launched along X - confirm. | ||
const uint dispatchSize = nbl::hlsl::device_capabilities_traits<device_capabilities>::maxOptimallyResidentWorkgroupInvocations; | ||
deprilula28 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
const bool fill = transferRequest.fill == 1; | ||
|
||
// Disabled debugging aid: dumps the decoded request into the destination buffer. | ||
//uint64_t debugWriteAddr = transferRequest.dstAddr + sizeof(uint64_t) * 9 * propertyId; | ||
//vk::RawBufferStore<uint64_t>(debugWriteAddr + sizeof(uint64_t) * 0, transferRequest.srcAddr,8); | ||
//vk::RawBufferStore<uint64_t>(debugWriteAddr + sizeof(uint64_t) * 1, transferRequest.dstAddr,8); | ||
//vk::RawBufferStore<uint64_t>(debugWriteAddr + sizeof(uint64_t) * 2, transferRequest.srcIndexAddr,8); | ||
//vk::RawBufferStore<uint64_t>(debugWriteAddr + sizeof(uint64_t) * 3, transferRequest.dstIndexAddr,8); | ||
//uint64_t elementCount = uint64_t(transferRequest.elementCount32) | ||
// | uint64_t(transferRequest.elementCountExtra) << 32; | ||
//vk::RawBufferStore<uint64_t>(debugWriteAddr + sizeof(uint64_t) * 4, elementCount,8); | ||
//vk::RawBufferStore<uint32_t>(debugWriteAddr + sizeof(uint64_t) * 5, transferRequest.propertySize,4); | ||
//vk::RawBufferStore<uint32_t>(debugWriteAddr + sizeof(uint64_t) * 6, transferRequest.fill,4); | ||
//vk::RawBufferStore<uint32_t>(debugWriteAddr + sizeof(uint64_t) * 7, transferRequest.srcIndexSizeLog2,4); | ||
//vk::RawBufferStore<uint32_t>(debugWriteAddr + sizeof(uint64_t) * 8, transferRequest.dstIndexSizeLog2,4); | ||
//vk::RawBufferStore<uint64_t>(transferRequest.dstAddr + sizeof(uint64_t) * invocationIndex, invocationIndex,8); | ||
|
||
// Turn the runtime fill flag into a template argument; the remaining request | ||
// parameters are resolved the same way by the nested permutation structs. | ||
if (fill) { TransferLoopPermutationFill<true> loop; loop.copyLoop(invocationIndex, propertyId, transferRequest, dispatchSize); } | ||
else { TransferLoopPermutationFill<false> loop; loop.copyLoop(invocationIndex, propertyId, transferRequest, dispatchSize); } | ||
} | ||
|
||
} | ||
} | ||
} | ||
|
||
// Compute-shader entry point: forwards the dispatch thread ID to the templated | ||
// nbl::hlsl::property_pools::main, instantiated with the JIT device capabilities. | ||
// NOTE(review): the workgroup size is hard-coded to 512 here, while the loop stride | ||
// inside property_pools::main is taken from device_capabilities_traits - confirm the two agree. | ||
// TODO: instead use some sort of replace function for getting optimal size? | ||
[numthreads(512,1,1)] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I already wrote on discord to codegen a 5 line compute shader with |
||
void main(uint32_t3 dispatchId : SV_DispatchThreadID) | ||
{ | ||
nbl::hlsl::property_pools::main<nbl::hlsl::jit::device_capabilities>(dispatchId); | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
#ifndef _NBL_BUILTIN_HLSL_GLSL_PROPERTY_POOLS_TRANSFER_ | ||
#define _NBL_BUILTIN_HLSL_GLSL_PROPERTY_POOLS_TRANSFER_ | ||
|
||
#include "nbl/builtin/hlsl/cpp_compat.hlsl" | ||
|
||
namespace nbl | ||
{ | ||
namespace hlsl | ||
{ | ||
namespace property_pools | ||
{ | ||
|
||
// One transfer command: four 64-bit addresses followed by 64 bits of packed parameters | ||
// (a 32-bit element count plus 32 bits of bitfields). | ||
struct TransferRequest | ||
{ | ||
// This represents a transfer command/request | ||
// Base addresses of the source and destination data. | ||
uint64_t srcAddr; | ||
uint64_t dstAddr; | ||
// Addresses of the optional index buffers; 0 selects the IOTA (no index buffer) path. | ||
uint64_t srcIndexAddr; // IOTA default | ||
uint64_t dstIndexAddr; // IOTA default | ||
// TODO: go back to this ideal layout when things work | ||
// (Getting a fatal error from DXC when using 64-bit bitfields:) | ||
devshgraphicsprogramming marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// fatal error: generated SPIR-V is invalid: [VUID-StandaloneSpirv-Base-04781] Expected 32-bit int type for Base operand: BitFieldInsert | ||
// %58 = OpBitFieldInsert %ulong %42 %57 %uint_0 %uint_35 | ||
// | ||
//uint64_t elementCount : 35; // allow up to 64GB IGPUBuffers | ||
//uint64_t propertySize : 24; // all the leftover bits (just use bytes now) | ||
//uint64_t fill : 1; | ||
//// 0=uint8, 1=uint16, 2=uint32, 3=uint64 | ||
//uint64_t srcIndexSizeLog2 : 2; | ||
//uint64_t dstIndexSizeLog2 : 2; | ||
// Workaround layout: the 35-bit element count is split into a full 32-bit word plus | ||
// a 3-bit field holding bits 32..34; the remaining fields keep their intended widths. | ||
uint32_t elementCount32; // 32 first bits | ||
uint32_t elementCountExtra : 3; // 3 last bits | ||
uint32_t propertySize : 24; | ||
uint32_t fill: 1; | ||
uint32_t srcIndexSizeLog2 : 2; | ||
uint32_t dstIndexSizeLog2 : 2; | ||
}; | ||
|
||
// Per-dispatch parameters shared by all invocations; three 64-bit values. | ||
struct GlobalPushContants | ||
deprilula28 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
{ | ||
// BDA address (GPU pointer) into the transfer commands buffer | ||
uint64_t transferCommandsAddress; | ||
// Defines the range of invocations (X axis) that will be transferred over in this dispatch | ||
// May be sectioned off in the case of overflow or any other situation that doesn't allow | ||
// for a full transfer | ||
// NOTE(review): the unit of these offsets is not stated here - confirm with the | ||
// shader that consumes them. | ||
uint64_t beginOffset; | ||
uint64_t endOffset; | ||
Comment on lines
+46
to
+50
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. would be useful to make it clear we're counting in DWORDs or shorts (if you want to do 16bit transfer atoms instead) |
||
}; | ||
|
||
// Compile-time constant of 128. | ||
// NOTE(review): presumably the maximum number of properties handled by one dispatch, | ||
// judging by the name - no use of it is visible here; confirm it is still needed. | ||
NBL_CONSTEXPR uint32_t MaxPropertiesPerDispatch = 128; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is there any reason to keep this around anymore? |
||
|
||
} | ||
} | ||
} | ||
|
||
#endif | ||
|
Uh oh!
There was an error while loading. Please reload this page.