Address all PR comments

Fletterio · Fletterio · commit 5518e01cfda1 · 2025-01-19T21:08:56.000-03:00
diff --git a/examples_tests b/examples_tests
@@ -1 +1 @@
-Subproject commit accbc2064c8502ea6fafb816e399358e234c55f9
+Subproject commit b8ce64b753d98e1fefdb7ad0be28d883c11f7d6b
diff --git a/include/nbl/builtin/hlsl/fft/common.hlsl b/include/nbl/builtin/hlsl/fft/common.hlsl
@@ -17,22 +17,21 @@ namespace fft
 
 template <uint16_t N NBL_FUNC_REQUIRES(N > 0 && N <= 4)
 /**
-* @brief Returns the size of the full FFT computed, in terms of number of complex elements.
+* @brief Returns the size of the full FFT computed, in terms of number of complex elements. If the signal is real, you MUST provide a valid value for `firstAxis`.
 *
 * @tparam N Number of dimensions of the signal to perform FFT on.
 *
 * @param [in] dimensions Size of the signal.
-* @param [in] realFFT Indicates whether the signal is real. False by default.
-* @param [in] firstAxis Indicates which axis the FFT is performed on first. Only relevant for real-valued signals. Must be less than N. 0 by default.
+* @param [in] firstAxis Indicates which axis the FFT is performed on first. Only relevant for real-valued signals, in which case it must be less than N. Defaults to N, in which case no axis is halved (complex-valued signal).
 */
-inline vector<uint64_t, N> padDimensions(NBL_CONST_REF_ARG(vector<uint32_t, N>) dimensions, bool realFFT = false, uint16_t firstAxis = 0u)
+inline vector<uint64_t, N> padDimensions(vector<uint32_t, N> dimensions, uint16_t firstAxis = N)
 {
     vector<uint32_t, N> newDimensions = dimensions;
     for (uint16_t i = 0u; i < N; i++)
     {
         newDimensions[i] = hlsl::roundUpToPoT(newDimensions[i]);
     }
-    if (realFFT)
+    if (firstAxis < N)
         newDimensions[firstAxis] /= 2;
     return newDimensions;
 }
@@ -52,15 +51,14 @@ template <uint16_t N NBL_FUNC_REQUIRES(N > 0 && N <= 4)
 */
 inline uint64_t getOutputBufferSize(
     uint32_t numChannels,
-    NBL_CONST_REF_ARG(vector<uint32_t, N>) inputDimensions,
+    vector<uint32_t, N> inputDimensions,
     uint16_t passIx,
-    NBL_CONST_REF_ARG(vector<uint16_t, N>) axisPassOrder = _static_cast<vector<uint16_t, N> >(uint16_t4(0, 1, 2, 3)),
+    vector<uint16_t, N> axisPassOrder = _static_cast<vector<uint16_t, N> >(uint16_t4(0, 1, 2, 3)),
     bool realFFT = false,
-
     bool halfFloats = false
 )
 {
-    const vector<uint32_t, N> paddedDimensions = padDimensions<N>(inputDimensions, realFFT, axisPassOrder[0]);
+    const vector<uint32_t, N> paddedDimensions = padDimensions<N>(inputDimensions, realFFT ? axisPassOrder[0] : N);
     vector<bool, N> axesDone = promote<vector<bool, N>, bool>(false);
     for (uint16_t i = 0; i <= passIx; i++)
         axesDone[axisPassOrder[i]] = true;
@@ -87,16 +85,16 @@ template <uint16_t N NBL_FUNC_REQUIRES(N > 0 && N <= 4)
 */
 inline uint64_t getOutputBufferSizeConvolution(
     uint32_t numChannels,
-    NBL_CONST_REF_ARG(vector<uint32_t, N>) inputDimensions,
-    NBL_CONST_REF_ARG(vector<uint32_t, N>) kernelDimensions,
+    vector<uint32_t, N> inputDimensions,
+    vector<uint32_t, N> kernelDimensions,
     uint16_t passIx,
-    NBL_CONST_REF_ARG(vector<uint16_t, N>) axisPassOrder = _static_cast<vector<uint16_t, N> >(uint16_t4(0, 1, 2, 3)),
+    vector<uint16_t, N> axisPassOrder = _static_cast<vector<uint16_t, N> >(uint16_t4(0, 1, 2, 3)),
     bool realFFT = false,
 
     bool halfFloats = false
 )
 {
-    const vector<uint32_t, N> paddedDimensions = padDimensions<N>(inputDimensions + kernelDimensions, realFFT, axisPassOrder[0]);
+    const vector<uint32_t, N> paddedDimensions = padDimensions<N>(inputDimensions + kernelDimensions, realFFT ? axisPassOrder[0] : N);
     vector<bool, N> axesDone = promote<vector<bool, N>, bool>(false);
     for (uint16_t i = 0; i <= passIx; i++)
         axesDone[axisPassOrder[i]] = true;
@@ -126,7 +124,7 @@ complex_t<Scalar> twiddle(uint32_t k, uint32_t halfN)
 template<bool inverse, typename Scalar>
 struct DIX
 {
-    static void radix2(NBL_CONST_REF_ARG(complex_t<Scalar>) twiddle, NBL_REF_ARG(complex_t<Scalar>) lo, NBL_REF_ARG(complex_t<Scalar>) hi)
+    static void radix2(complex_t<Scalar> twiddle, NBL_REF_ARG(complex_t<Scalar>) lo, NBL_REF_ARG(complex_t<Scalar>) hi)
     {
         plus_assign< complex_t<Scalar> > plusAss;
         //Decimation in time - inverse           
diff --git a/include/nbl/builtin/hlsl/workgroup/fft.hlsl b/include/nbl/builtin/hlsl/workgroup/fft.hlsl
@@ -54,28 +54,30 @@ struct OptimalFFTParameters
 * @param [in] inputArrayLength The length of the array to run an FFT on
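-* @param [in] minSubgroupSize The smallest possible number of threads that can run in a single subgroup. 32 by default.
+* @param [in] minSubgroupSize The smallest possible number of threads that can run in a single subgroup. Must be at least 4, otherwise invalid parameters are returned.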
 */
-inline OptimalFFTParameters optimalFFTParameters(uint32_t maxWorkgroupSize, uint32_t inputArrayLength, uint32_t minSubgroupSize = 32u)
+inline OptimalFFTParameters optimalFFTParameters(uint32_t maxWorkgroupSize, uint32_t inputArrayLength, uint32_t minSubgroupSize)
 {
     NBL_CONSTEXPR_STATIC OptimalFFTParameters invalidParameters = { 0 , 0 };
 
+    if (minSubgroupSize < 4 || maxWorkgroupSize < minSubgroupSize || inputArrayLength <= minSubgroupSize)
+        return invalidParameters;
+
     // Round inputArrayLength to PoT
-    const uint32_t FFTLength = 1u << (1u + findMSB(_static_cast<uint32_t>(inputArrayLength - 1u)));
+    const uint32_t FFTLength = hlsl::roundUpToPoT(inputArrayLength);
     // Round maxWorkgroupSize down to PoT
-    const uint32_t actualMaxWorkgroupSize = 1u << (findMSB(maxWorkgroupSize));
-    // This is the logic found in core::roundUpToPoT to get the log2
+    const uint32_t actualMaxWorkgroupSize = hlsl::roundDownToPoT(maxWorkgroupSize);
+    // This is the logic found in hlsl::roundUpToPoT to get the log2
     const uint16_t workgroupSizeLog2 = _static_cast<uint16_t>(1u + findMSB(_static_cast<uint32_t>(min(FFTLength / 2, actualMaxWorkgroupSize) - 1u)));
-    const uint16_t elementsPerInvocationLog2 = _static_cast<uint16_t>(findMSB(FFTLength >> workgroupSizeLog2));
-    const OptimalFFTParameters retVal = { elementsPerInvocationLog2, workgroupSizeLog2 };
     
     // Parameters are valid if the workgroup size is at most half of the FFT Length and at least as big as the smallest subgroup that can be launched
-    if ((FFTLength >> workgroupSizeLog2) > 1 && minSubgroupSize <= (1u << workgroupSizeLog2))
-    {
-        return retVal;
-    }
-    else
+    if ((FFTLength >> workgroupSizeLog2) <= 1 || minSubgroupSize > (1u << workgroupSizeLog2))
     {
         return invalidParameters;
     }
+    
+    const uint16_t elementsPerInvocationLog2 = _static_cast<uint16_t>(findMSB(FFTLength >> workgroupSizeLog2));
+    const OptimalFFTParameters retVal = { elementsPerInvocationLog2, workgroupSizeLog2 };
+    
+    return retVal;
 }
 
 namespace impl