Addressed PR review comments

Fletterio · Fletterio · commit 6401e53a0f98 · 2025-01-10T17:08:51.000-03:00
diff --git a/examples_tests b/examples_tests
@@ -1 +1 @@
-Subproject commit 8aa8653df99c925ec9ed03cac78e4f1a68e74253
+Subproject commit b4d88fac1f74cfaa0541e50dc20c86221f78039d
diff --git a/include/nbl/builtin/hlsl/fft/common.hlsl b/include/nbl/builtin/hlsl/fft/common.hlsl
@@ -14,34 +14,63 @@ namespace hlsl
 namespace fft
 {
 
-// template parameter N controls the number of dimensions of the input
-// template parameter M controls the number of dimensions to pad up to PoT
-// "axes" indicates which dimensions to pad up to PoT
-template <uint16_t N, uint16_t M NBL_FUNC_REQUIRES(M <= N)
-inline vector<uint64_t, 3> padDimensions(NBL_CONST_REF_ARG(vector<uint32_t, N>) dimensions, NBL_CONST_REF_ARG(vector<uint16_t, M>) axes, bool realFFT = false)
+
+template <uint16_t N NBL_FUNC_REQUIRES(N > 0 && N <= 4)
+/**
+* @brief Returns the per-axis size of the FFT computed, in complex elements: every dimension is rounded up to a power of two and, for real signals, the `firstAxis` dimension is then halved.
+*
+* @tparam N Number of dimensions of the signal to perform FFT on. Constrained to the range [1, 4].
+*
+* @param [in] dimensions Size of the signal along each of its N axes.
+* @param [in] realFFT Indicates whether the signal is real. False by default.
+* @param [in] firstAxis Indicates which axis the FFT is performed on first. Only read when `realFFT` is true. Must be less than N (not checked here). 0 by default.
+*/
+inline vector<uint64_t, N> padDimensions(NBL_CONST_REF_ARG(vector<uint32_t, N>) dimensions, bool realFFT = false, uint16_t firstAxis = 0u)
 {
     vector<uint32_t, N> newDimensions = dimensions;
-    uint16_t axisCount = 0;
-    for (uint16_t i = 0u; i < M; i++)
+    for (uint16_t i = 0u; i < N; i++)
     {
         newDimensions[i] = hlsl::roundUpToPoT(newDimensions[i]);
-        if (realFFT && !axisCount++)
-            newDimensions[i] /= 2;
     }
+    if (realFFT)
+        newDimensions[firstAxis] /= 2; // real signal: halve the first-transformed axis, after padding
     return newDimensions;
 }
 
-// template parameter N controls the number of dimensions of the input
-// template parameter M controls the number of dimensions we run an FFT along AND store the result
-// "axes" indicates which dimensions we run an FFT along AND store the result
-template <uint16_t N, uint16_t M NBL_FUNC_REQUIRES(M <= N)
-inline uint64_t getOutputBufferSize(NBL_CONST_REF_ARG(vector<uint32_t, N>) inputDimensions, uint32_t numChannels, NBL_CONST_REF_ARG(vector<uint16_t, M>) axes, bool realFFT = false, bool halfFloats = false)
+template <uint16_t N NBL_FUNC_REQUIRES(N > 0 && N <= 4)
+/**
+* @brief Returns the size, in bytes, required by a buffer to hold the result of the FFT of a signal after a certain pass.
+*
+* @tparam N Number of dimensions of the signal to perform FFT on. Constrained to the range [1, 4].
+*
+* @param [in] numChannels Number of channels of the signal.
+* @param [in] inputDimensions Size of the signal along each of its N axes, before any padding.
+* @param [in] passIx Index (starting at 0) of the pass the size is being computed for. Axes `axisPassOrder[0]` through `axisPassOrder[passIx]` are marked as done, so it must be less than N.
+* @param [in] axisPassOrder Order in which the axes are transformed. Its first entry is the axis halved for real signals. Default is xyzw.
+* @param [in] realFFT True if the signal is real. False by default.
+* @param [in] halfFloats True if using half-precision floats (elements sized as `complex_t<float16_t>` instead of `complex_t<float32_t>`). False by default.
+*/
+inline uint64_t getOutputBufferSize(
+    uint32_t numChannels,
+    NBL_CONST_REF_ARG(vector<uint32_t, N>) inputDimensions,
+    uint16_t passIx,
+    NBL_CONST_REF_ARG(vector<uint16_t, N>) axisPassOrder = _static_cast<vector<uint16_t, N> >(uint16_t4(0, 1, 2, 3)),
+    bool realFFT = false,
+    bool halfFloats = false
+)
 {
-    const vector<uint64_t, 3> paddedDims = padDimensions<N, M>(inputDimensions, axes);
-    const uint64_t numberOfComplexElements = paddedDims[0] * paddedDims[1] * paddedDims[2] * uint64_t(numChannels);
+    const vector<uint32_t, N> paddedDimensions = padDimensions<N>(inputDimensions, realFFT, axisPassOrder[0]); // NOTE(review): padDimensions is declared to return vector<uint64_t, N>; stored as uint32_t here - confirm intended
+    vector<bool, N> axesDone = promote<vector<bool, N>, bool>(false);
+    for (uint16_t i = 0; i <= passIx; i++)
+        axesDone[axisPassOrder[i]] = true;
+    const vector<uint32_t, N> passOutputDimension = lerp(inputDimensions, paddedDimensions, axesDone); // NOTE(review): presumably the padded size on axes marked done and the input size elsewhere - confirm lerp's semantics for a bool vector
+    uint64_t numberOfComplexElements = uint64_t(numChannels);
+    for (uint16_t i = 0; i < N; i++)
+        numberOfComplexElements *= uint64_t(passOutputDimension[i]); // widen each factor so the product is accumulated in 64 bits
     return numberOfComplexElements * (halfFloats ? sizeof(complex_t<float16_t>) : sizeof(complex_t<float32_t>));
 }
 
+
 // Computes the kth element in the group of N roots of unity
 // Notice 0 <= k < N/2, rotating counterclockwise in the forward (DIF) transform and clockwise in the inverse (DIT)
 template<bool inverse, typename Scalar>
@@ -95,11 +124,33 @@ void unpack(NBL_REF_ARG(complex_t<Scalar>) lo, NBL_REF_ARG(complex_t<Scalar>) hi
     lo = x;
 }
 
-// Bit-reverses T as a binary string of length given by Bits
-template<typename T, uint16_t Bits NBL_FUNC_REQUIRES(is_integral_v<T> && Bits <= sizeof(T) * 8)
+template<typename T, uint16_t Bits NBL_FUNC_REQUIRES(is_unsigned_v<T>&& Bits <= sizeof(T) * 8)
+/**
+* @brief Treats `value` as a string of `Bits` bits and returns the value of the same type obtained by reversing that string.
+*
+* @tparam T Type of the value to operate on. Must satisfy `is_unsigned_v`.
+* @tparam Bits The length of the string of bits used to represent `value`. Must not exceed `8 * sizeof(T)`.
+*
+* @param [in] value The value to bitreverse.
+*/
 T bitReverseAs(T value)
 {
-    return hlsl::bitReverse<uint32_t>(value) >> (sizeof(T) * 8 - Bits);
+    return bitReverse<T>(value) >> promote<T, scalar_type_t<T> >(scalar_type_t <T>(sizeof(T) * 8 - Bits)); // reverse via bitReverse<T>, then shift right by the (sizeof(T) * 8 - Bits) positions outside the string
+}
+
+template<typename T NBL_FUNC_REQUIRES(is_unsigned_v<T>)
+/**
+* @brief Treats `value` as a string of `bits` bits and returns the value of the same type obtained by reversing that string. The length is given as a function argument.
+* Keep in mind `bits` cannot exceed `8 * sizeof(T)`; this is not checked by the function.
+*
+* @tparam T Type of the value to operate on. Must satisfy `is_unsigned_v`.
+*
+* @param [in] value The value to bitreverse.
+* @param [in] bits The length of the string of bits used to represent `value`.
+*/
+T bitReverseAs(T value, uint16_t bits)
+{
+    return bitReverse<T>(value) >> promote<T, scalar_type_t<T> >(scalar_type_t <T>(sizeof(T) * 8 - bits)); // reverse via bitReverse<T>, then shift right by the (sizeof(T) * 8 - bits) positions outside the string
 }
 
 }
diff --git a/include/nbl/builtin/hlsl/glsl_compat/core.hlsl b/include/nbl/builtin/hlsl/glsl_compat/core.hlsl
@@ -7,11 +7,7 @@
 #include "nbl/builtin/hlsl/cpp_compat/basic.h"
 #include "nbl/builtin/hlsl/spirv_intrinsics/core.hlsl"
 #include "nbl/builtin/hlsl/type_traits.hlsl"
-<<<<<<< HEAD
-#include "nbl/builtin/hlsl/bit.hlsl"
-=======
 #include "nbl/builtin/hlsl/spirv_intrinsics/glsl.std.450.hlsl"
->>>>>>> master
 
 namespace nbl 
 {
diff --git a/include/nbl/builtin/hlsl/math/intutil.hlsl b/include/nbl/builtin/hlsl/math/intutil.hlsl
@@ -58,10 +58,9 @@ NBL_CONSTEXPR_FORCED_INLINE_FUNC Integer align(Integer alignment, Integer size,
     return address = nextAlignedAddr;
 }
 
+// ------------------------------------- CPP ONLY ----------------------------------------------------------
 #ifndef __HLSL_VERSION
 
-// Have to wait for the HLSL patch for `is_enum`. Would also have to figure out how to do it without initializer lists for HLSL use. 
-
 //! Get bitmask from variadic arguments passed. 
 /*
     For example if you were to create bitmask for vertex attributes
diff --git a/include/nbl/builtin/hlsl/workgroup/fft.hlsl b/include/nbl/builtin/hlsl/workgroup/fft.hlsl
@@ -38,22 +38,43 @@ struct OptimalFFTParameters
 {
     uint16_t elementsPerInvocationLog2 : 8;
     uint16_t workgroupSizeLog2 : 8;
+
+    // Returns true when both log2 fields are non-zero. `optimalFFTParameters` returns { 0, 0 } to flag parameters that are not valid, which this check rejects
+    bool areValid()
+    {
+        return elementsPerInvocationLog2 > 0 && workgroupSizeLog2 > 0;
+    }
 };
 
-inline OptimalFFTParameters optimalFFTParameters(const uint32_t maxWorkgroupSize, uint32_t inputArrayLength)
+/**
+* @brief Returns the best parameters (according to our metric) to run an FFT, or `{ 0, 0 }` (for which `areValid()` returns false) when the validity check at the end of the function fails
+*
+* @param [in] maxWorkgroupSize The max number of threads that can be launched in a single workgroup. Rounded down to a power of two before use.
+* @param [in] inputArrayLength The length of the array to run an FFT on. Rounded up to a power of two before use.
+* @param [in] minSubgroupSize The smallest possible number of threads that can run in a single subgroup. The chosen workgroup size must be at least this large. 32 by default.
+*/
+inline OptimalFFTParameters optimalFFTParameters(uint32_t maxWorkgroupSize, uint32_t inputArrayLength, uint32_t minSubgroupSize = 32u)
 {
+    NBL_CONSTEXPR_STATIC OptimalFFTParameters invalidParameters = { 0 , 0 }; // sentinel returned when the validity check below fails
+
     // Round inputArrayLength to PoT
-    uint32_t FFTLength = 1u << (1u + findMSB(_static_cast<uint32_t>(inputArrayLength - 1u)));
+    const uint32_t FFTLength = 1u << (1u + findMSB(_static_cast<uint32_t>(inputArrayLength - 1u))); // NOTE(review): inputArrayLength == 0 wraps in the subtraction - confirm callers never pass it
     // Round maxWorkgroupSize down to PoT
-    uint32_t actualMaxWorkgroupSize = 1u << (findMSB(maxWorkgroupSize));
+    const uint32_t actualMaxWorkgroupSize = 1u << (findMSB(maxWorkgroupSize));
     // This is the logic found in core::roundUpToPoT to get the log2
     const uint16_t workgroupSizeLog2 = _static_cast<uint16_t>(1u + findMSB(_static_cast<uint32_t>(min(FFTLength / 2, actualMaxWorkgroupSize) - 1u)));
-    #ifndef __HLSL_VERSION
-    assert((FFTLength >> workgroupSizeLog2) > 1);
-    #endif
     const uint16_t elementsPerInvocationLog2 = _static_cast<uint16_t>(findMSB(FFTLength >> workgroupSizeLog2));
     const OptimalFFTParameters retVal = { elementsPerInvocationLog2, workgroupSizeLog2 };
-    return retVal;
+    
+    // Parameters are valid if the workgroup size is at most half of the FFT length and at least as big as the smallest subgroup that can be launched
+    if ((FFTLength >> workgroupSizeLog2) > 1 && minSubgroupSize <= (1u << workgroupSizeLog2))
+    {
+        return retVal;
+    }
+    else
+    {
+        return invalidParameters;
+    }
 }
 
 }
diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt
@@ -242,7 +242,6 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/glsl.std.450
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/cpp_compat.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/cpp_compat/basic.h")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/cpp_compat/intrinsics.hlsl")
-LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/cpp_compat/impl/intrinsics_impl.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/cpp_compat/matrix.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/cpp_compat/promote.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/cpp_compat/vector.hlsl")