
Commit 0f8bcac

Merge pull request #818 from Devsh-Graphics-Programming/more_fft_utils
FFT Fixes
2 parents: 6fa23b1 + 5518e01

4 files changed (+69 / -26 lines)


include/nbl/builtin/hlsl/concepts/accessors/fft.hlsl

Lines changed: 2 additions & 2 deletions
```diff
@@ -53,8 +53,8 @@ NBL_CONCEPT_BEGIN(3)
 #define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1
 #define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2
 NBL_CONCEPT_END(
-	((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.set(index, val)), is_same_v, void))
-	((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.get(index, val)), is_same_v, void))
+	((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template set<complex_t<Scalar> >(index, val)), is_same_v, void))
+	((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template get<complex_t<Scalar> >(index, val)), is_same_v, void))
 );
 #undef val
 #undef index
```
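For orientation, a minimal sketch of an accessor that would satisfy the tightened concept follows. It is not code from this commit: the `data` binding is hypothetical and the barrier body is elided.

```hlsl
// Illustrative accessor matching the updated concept: templated set/get returning void,
// specializable with AccessType = complex_t<Scalar>. Not part of this commit.
#include "nbl/builtin/hlsl/workgroup/fft.hlsl"

using namespace nbl::hlsl;

// Hypothetical user-provided binding holding the FFT data in global memory
RWStructuredBuffer<complex_t<float32_t> > data;

struct GlobalBufferAccessor
{
	template<typename AccessType>
	void set(uint32_t idx, AccessType value)
	{
		data[idx] = value;
	}

	template<typename AccessType>
	void get(uint32_t idx, NBL_REF_ARG(AccessType) value)
	{
		value = data[idx];
	}

	// Only needed when ElementsPerInvocationLog2 > 1; a real global-memory accessor must
	// issue an AcquireRelease barrier with semantics matching the memory it touches (elided here)
	void memoryBarrier() {}
};
```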

include/nbl/builtin/hlsl/fft/README.md

Lines changed: 10 additions & 4 deletions
```diff
@@ -12,9 +12,10 @@ To run an FFT, you need to call the FFT struct's static `__call` method. You do
 IMPORTANT: You MUST launch kernel with a workgroup size of `ConstevalParameters::WorkgroupSize`
 
 * `Accessor` is an accessor to the array. It MUST provide the methods
-`void get(uint32_t index, inout complex_t<Scalar> value)`,
-`void set(uint32_t index, in complex_t<Scalar> value)`,
-which are hopefully self-explanatory. Furthermore, if doing an FFT with `ElementsPerInvocationLog2 > 1`, it MUST also provide a `void memoryBarrier()` method. If not accessing any type of memory during the FFT, it can be a method that does nothing. Otherwise, it must do a barrier with `AcquireRelease` semantics, with proper semantics for the type of memory it accesses. This example uses an Accessor going straight to global memory, so it requires a memory barrier. For an example of an accessor that doesn't, see the `28_FFTBloom` example, where we use preloaded accessors.
+`template <typename AccessType> void set(uint32_t idx, AccessType value)` and
+`template <typename AccessType> void get(uint32_t idx, NBL_REF_ARG(AccessType) value)`
+which are hopefully self-explanatory. These methods need to be able to be specialized with `AccessType` being `complex_t<Scalar>` for the FFT to work properly.
+Furthermore, if doing an FFT with `ElementsPerInvocationLog2 > 1`, it MUST also provide a `void memoryBarrier()` method. If not accessing any type of memory during the FFT, it can be a method that does nothing. Otherwise, it must do a barrier with `AcquireRelease` semantics, with proper semantics for the type of memory it accesses. This example uses an Accessor going straight to global memory, so it requires a memory barrier. For an example of an accessor that doesn't, see the `28_FFTBloom` example, where we use preloaded accessors.
 
 * `SharedMemoryAccessor` is an accessor to a shared memory array of `uint32_t` that MUST be able to fit `WorkgroupSize` many complex elements (one per thread). When instantiating a `workgroup::fft::ConstevalParameters` struct, you can access its static member field `SharedMemoryDWORDs` that yields the amount of `uint32_t`s the shared memory array must be able to hold. It MUST provide the methods
 `template <typename IndexType, typename AccessType> void set(IndexType idx, AccessType value)`,
@@ -27,6 +28,8 @@ Furthermore, you must define the method `uint32_t3 nbl::hlsl::glsl::gl_WorkGroup
 
 ## Utils
 
+### Figuring out the storage required for an FFT
+
 ### Figuring out compile-time parameters
 We provide a
 `workgroup::fft::optimalFFTParameters(uint32_t maxWorkgroupSize, uint32_t inputArrayLength)`
@@ -39,7 +42,9 @@ By default, we prefer to use only 2 elements per invocation when possible, and o
 ### Indexing
 We made some decisions in the design of the FFT algorithm pertaining to load/store order. In particular we wanted to keep stores linear to minimize cache misses when writing the output of an FFT. As such, the output of the FFT is not in its normal order, nor in bitreversed order (which is the standard for Cooley-Tukey implementations). Instead, it's in what we will refer to Nabla order going forward. The Nabla order allows for coalesced writes of the output.
 
-The result of an FFT (either forward or inverse, assuming the input is in its natural order) will be referred to as an $\text{NFFT}$ (N for Nabla). This $\text{NFFT}$ contains the same elements as the $\text{DFT}$ (which is the properly-ordered result of an FFT) of the same signal, just in Nabla order. We provide a struct
+This whole discussion applies to our implementation of the forward FFT only. We have not yet implemented the same functions for the inverse FFT since we didn't have a need for it.
+
+The result of a forward FFT will be referred to as an $\text{NFFT}$ (N for Nabla). This $\text{NFFT}$ contains the same elements as the $\text{DFT}$ (which is the properly-ordered result of an FFT) of the same signal, just in Nabla order. We provide a struct
 `FFTIndexingUtils<uint16_t ElementsPerInvocationLog2, uint16_t WorkgroupSizeLog2>`
 that automatically handles the math for you in case you want to go from one order to the other. It provides the following methods:
 
@@ -168,6 +173,7 @@ $\text{bitreverse} \circ e^{-1} = g^{-1} \circ \text{bitreverse}$
 
 $F$ is called `FFTIndexingUtils::getDFTIndex` and detailed in the users section above.
 
+Please note that this whole discussion and the function $F$ we worked out are only valid in the forward NFFT case. This is because we used a DIF diagram to work out the expression. An expression for the output order of the inverse NFFT should be easy to work out in the same way considering a DIT diagram. However, I did not have a use for it so I didn't bother.
 
 
 ## Unpacking Rule for packed real FFTs
```
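To make the ordering concrete, here is a purely illustrative loop that copies an NFFT result into natural DFT order. It assumes `getDFTIndex(outputIdx)` returns the DFT index corresponding to the NFFT element at `outputIdx` (the role played by $F$ above); the template arguments, buffer names, and unqualified use of `FFTIndexingUtils` are made up, so check the header for the real declarations.

```hlsl
// Illustrative only: unscramble an NFFT (Nabla order) result into natural DFT order.
// The getDFTIndex signature and the location of FFTIndexingUtils are assumptions.
#include "nbl/builtin/hlsl/workgroup/fft.hlsl"

using namespace nbl::hlsl;

RWStructuredBuffer<complex_t<float32_t> > nfft;       // hypothetical: forward FFT output, Nabla order
RWStructuredBuffer<complex_t<float32_t> > naturalDFT; // hypothetical: destination, DFT order

void unscramble()
{
	// ElementsPerInvocationLog2 = 1, WorkgroupSizeLog2 = 8 -> FFTLength = 2^(1+8) = 512
	typedef FFTIndexingUtils<1, 8> indexing_t;
	const uint32_t fftLength = 512u;
	for (uint32_t outputIdx = 0u; outputIdx < fftLength; outputIdx++)
		naturalDFT[indexing_t::getDFTIndex(outputIdx)] = nfft[outputIdx];
}
```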

include/nbl/builtin/hlsl/fft/common.hlsl

Lines changed: 44 additions & 9 deletions
```diff
@@ -17,22 +17,21 @@ namespace fft
 
 template <uint16_t N NBL_FUNC_REQUIRES(N > 0 && N <= 4)
 /**
- * @brief Returns the size of the full FFT computed, in terms of number of complex elements.
+ * @brief Returns the size of the full FFT computed, in terms of number of complex elements. If the signal is real, you MUST provide a valid value for `firstAxis`
  *
  * @tparam N Number of dimensions of the signal to perform FFT on.
  *
  * @param [in] dimensions Size of the signal.
- * @param [in] realFFT Indicates whether the signal is real. False by default.
- * @param [in] firstAxis Indicates which axis the FFT is performed on first. Only relevant for real-valued signals. Must be less than N. 0 by default.
+ * @param [in] firstAxis Indicates which axis the FFT is performed on first. Only relevant for real-valued signals, in which case it must be less than N. N by default.
  */
-inline vector<uint64_t, N> padDimensions(NBL_CONST_REF_ARG(vector<uint32_t, N>) dimensions, bool realFFT = false, uint16_t firstAxis = 0u)
+inline vector<uint64_t, N> padDimensions(vector<uint32_t, N> dimensions, uint16_t firstAxis = N)
 {
 	vector<uint32_t, N> newDimensions = dimensions;
 	for (uint16_t i = 0u; i < N; i++)
 	{
 		newDimensions[i] = hlsl::roundUpToPoT(newDimensions[i]);
 	}
-	if (realFFT)
+	if (firstAxis < N)
 		newDimensions[firstAxis] /= 2;
 	return newDimensions;
 }
@@ -52,14 +51,50 @@ template <uint16_t N NBL_FUNC_REQUIRES(N > 0 && N <= 4)
  */
 inline uint64_t getOutputBufferSize(
 	uint32_t numChannels,
-	NBL_CONST_REF_ARG(vector<uint32_t, N>) inputDimensions,
+	vector<uint32_t, N> inputDimensions,
 	uint16_t passIx,
-	NBL_CONST_REF_ARG(vector<uint16_t, N>) axisPassOrder = _static_cast<vector<uint16_t, N> >(uint16_t4(0, 1, 2, 3)),
+	vector<uint16_t, N> axisPassOrder = _static_cast<vector<uint16_t, N> >(uint16_t4(0, 1, 2, 3)),
 	bool realFFT = false,
 	bool halfFloats = false
 )
 {
-	const vector<uint32_t, N> paddedDimensions = padDimensions<N>(inputDimensions, realFFT, axisPassOrder[0]);
+	const vector<uint32_t, N> paddedDimensions = padDimensions<N>(inputDimensions, realFFT ? axisPassOrder[0] : N);
+	vector<bool, N> axesDone = promote<vector<bool, N>, bool>(false);
+	for (uint16_t i = 0; i <= passIx; i++)
+		axesDone[axisPassOrder[i]] = true;
+	const vector<uint32_t, N> passOutputDimension = lerp(inputDimensions, paddedDimensions, axesDone);
+	uint64_t numberOfComplexElements = uint64_t(numChannels);
+	for (uint16_t i = 0; i < N; i++)
+		numberOfComplexElements *= uint64_t(passOutputDimension[i]);
+	return numberOfComplexElements * (halfFloats ? sizeof(complex_t<float16_t>) : sizeof(complex_t<float32_t>));
+}
+
+template <uint16_t N NBL_FUNC_REQUIRES(N > 0 && N <= 4)
+/**
+ * @brief Returns the size required by a buffer to hold the result of the FFT of a signal after a certain pass, when using the FFT to convolve it against a kernel.
+ *
+ * @tparam N Number of dimensions of the signal to perform FFT on.
+ *
+ * @param [in] numChannels Number of channels of the signal.
+ * @param [in] inputDimensions Size of the signal.
+ * @param [in] kernelDimensions Size of the kernel.
+ * @param [in] passIx Which pass the size is being computed for.
+ * @param [in] axisPassOrder Order of the axis in which the FFT is computed in. Default is xyzw.
+ * @param [in] realFFT True if the signal is real. False by default.
+ * @param [in] halfFloats True if using half-precision floats. False by default.
+ */
+inline uint64_t getOutputBufferSizeConvolution(
+	uint32_t numChannels,
+	vector<uint32_t, N> inputDimensions,
+	vector<uint32_t, N> kernelDimensions,
+	uint16_t passIx,
+	vector<uint16_t, N> axisPassOrder = _static_cast<vector<uint16_t, N> >(uint16_t4(0, 1, 2, 3)),
+	bool realFFT = false,
+
+	bool halfFloats = false
+)
+{
+	const vector<uint32_t, N> paddedDimensions = padDimensions<N>(inputDimensions + kernelDimensions, realFFT ? axisPassOrder[0] : N);
 	vector<bool, N> axesDone = promote<vector<bool, N>, bool>(false);
 	for (uint16_t i = 0; i <= passIx; i++)
 		axesDone[axisPassOrder[i]] = true;
@@ -89,7 +124,7 @@ complex_t<Scalar> twiddle(uint32_t k, uint32_t halfN)
 template<bool inverse, typename Scalar>
 struct DIX
 {
-	static void radix2(NBL_CONST_REF_ARG(complex_t<Scalar>) twiddle, NBL_REF_ARG(complex_t<Scalar>) lo, NBL_REF_ARG(complex_t<Scalar>) hi)
+	static void radix2(complex_t<Scalar> twiddle, NBL_REF_ARG(complex_t<Scalar>) lo, NBL_REF_ARG(complex_t<Scalar>) hi)
 	{
 		plus_assign< complex_t<Scalar> > plusAss;
 		//Decimation in time - inverse
```
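As a quick orientation for the new conventions (the real-FFT flag of `padDimensions` replaced by `firstAxis`, with `firstAxis = N` meaning a complex FFT), here is an illustrative sizing calculation; the image dimensions and channel count are made up.

```hlsl
// Illustrative sizing for a 3-channel, real-valued 1280x720 signal, FFT'd along x then y.
#include "nbl/builtin/hlsl/fft/common.hlsl"

using namespace nbl::hlsl;

void exampleSizing()
{
	const vector<uint32_t, 2> imageDims = vector<uint32_t, 2>(1280, 720);
	const vector<uint16_t, 2> passOrder = vector<uint16_t, 2>(0, 1); // x first, then y

	// Real FFT with the first pass along axis 0: dimensions round up to PoT (2048, 1024),
	// then axis 0 is halved because the real FFT along it is stored packed -> (1024, 1024).
	// Passing firstAxis = 2 (== N) instead would keep the full complex-FFT padding.
	const vector<uint64_t, 2> padded = fft::padDimensions<2>(imageDims, 0);

	// Bytes needed for the output after the first (passIx = 0) and second (passIx = 1) passes,
	// with realFFT = true and full-precision (float32_t) complex values.
	const uint64_t afterFirstPass = fft::getOutputBufferSize<2>(3, imageDims, 0, passOrder, true, false);
	const uint64_t afterSecondPass = fft::getOutputBufferSize<2>(3, imageDims, 1, passOrder, true, false);
}
```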

include/nbl/builtin/hlsl/workgroup/fft.hlsl

Lines changed: 13 additions & 11 deletions
```diff
@@ -54,28 +54,30 @@ struct OptimalFFTParameters
  * @param [in] inputArrayLength The length of the array to run an FFT on
  * @param [in] minSubgroupSize The smallest possible number of threads that can run in a single subgroup. 32 by default.
  */
-inline OptimalFFTParameters optimalFFTParameters(uint32_t maxWorkgroupSize, uint32_t inputArrayLength, uint32_t minSubgroupSize = 32u)
+inline OptimalFFTParameters optimalFFTParameters(uint32_t maxWorkgroupSize, uint32_t inputArrayLength, uint32_t minSubgroupSize)
 {
 	NBL_CONSTEXPR_STATIC OptimalFFTParameters invalidParameters = { 0 , 0 };
 
+	if (minSubgroupSize < 4 || maxWorkgroupSize < minSubgroupSize || inputArrayLength <= minSubgroupSize)
+		return invalidParameters;
+
 	// Round inputArrayLength to PoT
-	const uint32_t FFTLength = 1u << (1u + findMSB(_static_cast<uint32_t>(inputArrayLength - 1u)));
+	const uint32_t FFTLength = hlsl::roundUpToPoT(inputArrayLength);
 	// Round maxWorkgroupSize down to PoT
-	const uint32_t actualMaxWorkgroupSize = 1u << (findMSB(maxWorkgroupSize));
-	// This is the logic found in core::roundUpToPoT to get the log2
+	const uint32_t actualMaxWorkgroupSize = hlsl::roundDownToPoT(maxWorkgroupSize);
+	// This is the logic found in hlsl::roundUpToPoT to get the log2
 	const uint16_t workgroupSizeLog2 = _static_cast<uint16_t>(1u + findMSB(_static_cast<uint32_t>(min(FFTLength / 2, actualMaxWorkgroupSize) - 1u)));
-	const uint16_t elementsPerInvocationLog2 = _static_cast<uint16_t>(findMSB(FFTLength >> workgroupSizeLog2));
-	const OptimalFFTParameters retVal = { elementsPerInvocationLog2, workgroupSizeLog2 };
 
 	// Parameters are valid if the workgroup size is at most half of the FFT Length and at least as big as the smallest subgroup that can be launched
-	if ((FFTLength >> workgroupSizeLog2) > 1 && minSubgroupSize <= (1u << workgroupSizeLog2))
-	{
-		return retVal;
-	}
-	else
+	if ((FFTLength >> workgroupSizeLog2) <= 1 || minSubgroupSize > (1u << workgroupSizeLog2))
 	{
 		return invalidParameters;
 	}
+
+	const uint16_t elementsPerInvocationLog2 = _static_cast<uint16_t>(findMSB(FFTLength >> workgroupSizeLog2));
+	const OptimalFFTParameters retVal = { elementsPerInvocationLog2, workgroupSizeLog2 };
+
+	return retVal;
 }
 
 namespace impl
```
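And a hedged usage sketch of the reworked `optimalFFTParameters`, now that `minSubgroupSize` has no default and degenerate inputs return the invalid (all-zero) parameters; the device limits below are made up and the field names are inferred from the aggregate initializer in the diff.

```hlsl
// Illustrative: pick parameters for a 1920-element FFT on a device with max workgroup size 512
// and a minimum subgroup size of 32. Field names are assumed from { elementsPerInvocationLog2, workgroupSizeLog2 }.
#include "nbl/builtin/hlsl/workgroup/fft.hlsl"

using namespace nbl::hlsl;

bool pickParameters()
{
	// The subgroup size argument must now be passed explicitly
	const workgroup::fft::OptimalFFTParameters params = workgroup::fft::optimalFFTParameters(512u, 1920u, 32u);

	// Degenerate inputs (e.g. inputArrayLength <= minSubgroupSize) return the all-zero struct
	if (params.workgroupSizeLog2 == 0)
		return false;

	// 1920 rounds up to FFTLength = 2048: expect workgroupSizeLog2 = 9 (512 threads)
	// and elementsPerInvocationLog2 = 2 (4 elements per thread), since 512 * 4 = 2048.
	return true;
}
```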
