Move bitReverseAs and FFT indexing utilities to C++-shared HLSL code

Fletterio · Fletterio · commit fdb79040d4a3 · 2025-01-13T16:34:00.000-03:00
diff --git a/examples_tests b/examples_tests
@@ -1 +1 @@
-Subproject commit b4d88fac1f74cfaa0541e50dc20c86221f78039d
+Subproject commit accbc2064c8502ea6fafb816e399358e234c55f9
diff --git a/include/nbl/builtin/hlsl/bitreverse.hlsl b/include/nbl/builtin/hlsl/bitreverse.hlsl
@@ -0,0 +1,47 @@
+#ifndef _NBL_BUILTIN_HLSL_BITREVERSE_INCLUDED_
+#define _NBL_BUILTIN_HLSL_BITREVERSE_INCLUDED_
+
+
+#include <nbl/builtin/hlsl/cpp_compat.hlsl>
+
+namespace nbl
+{
+namespace hlsl
+{
+
+template<typename T, uint16_t Bits NBL_FUNC_REQUIRES(is_unsigned_v<T>&& Bits <= sizeof(T) * 8)
+/**
+* @brief Takes the binary representation of `value` as a string of `Bits` bits and returns a value of the same type resulting from reversing that string.
+* Computed as `bitReverse<T>(value)` shifted right by `sizeof(T) * 8 - Bits`.
+* @tparam T Type of the value to operate on, constrained through `NBL_FUNC_REQUIRES` to satisfy `is_unsigned_v<T>`.
+* @tparam Bits The length of the string of bits used to represent `value`, constrained to `Bits <= sizeof(T) * 8`.
+* @note Review: `Bits == 0` satisfies the constraint but makes the shift amount equal to the full bit width of `T`; confirm no caller instantiates it.
+* @param [in] value The value to bitreverse.
+*/
+T bitReverseAs(T value)
+{
+	return bitReverse<T>(value) >> promote<T, scalar_type_t<T> >(scalar_type_t <T>(sizeof(T) * 8 - Bits));
+}
+
+template<typename T NBL_FUNC_REQUIRES(is_unsigned_v<T>)
+/**
+* @brief Takes the binary representation of `value` and returns a value of the same type resulting from reversing the string of bits as if it was `bits` long.
+* Keep in mind `bits` cannot exceed `8 * sizeof(T)`: this overload receives `bits` at runtime and does not check it.
+* Computed as `bitReverse<T>(value)` shifted right by `sizeof(T) * 8 - bits`.
+* @tparam T Type of the value to operate on, constrained through `NBL_FUNC_REQUIRES` to satisfy `is_unsigned_v<T>`.
+* @note Review: `bits == 0` makes the shift amount equal to the full bit width of `T`; confirm no caller passes it.
+* @param [in] value The value to bitreverse.
+* @param [in] bits The length of the string of bits used to represent `value`.
+*/
+T bitReverseAs(T value, uint16_t bits)
+{
+	return bitReverse<T>(value) >> promote<T, scalar_type_t<T> >(scalar_type_t <T>(sizeof(T) * 8 - bits));
+}
+
+
+}
+}
+
+
+
+#endif
diff --git a/include/nbl/builtin/hlsl/fft/common.hlsl b/include/nbl/builtin/hlsl/fft/common.hlsl
@@ -124,35 +124,6 @@ void unpack(NBL_REF_ARG(complex_t<Scalar>) lo, NBL_REF_ARG(complex_t<Scalar>) hi
     lo = x;
 }
 
-template<typename T, uint16_t Bits NBL_FUNC_REQUIRES(is_unsigned_v<T>&& Bits <= sizeof(T) * 8)
-/**
-* @brief Takes the binary representation of `value` as a string of `Bits` bits and returns a value of the same type resulting from reversing the string
-*
-* @tparam T Type of the value to operate on.
-* @tparam Bits The length of the string of bits used to represent `value`.
-*
-* @param [in] value The value to bitreverse.
-*/
-T bitReverseAs(T value)
-{
-    return bitReverse<T>(value) >> promote<T, scalar_type_t<T> >(scalar_type_t <T>(sizeof(T) * 8 - Bits));
-}
-
-template<typename T NBL_FUNC_REQUIRES(is_unsigned_v<T>)
-/**
-* @brief Takes the binary representation of `value` and returns a value of the same type resulting from reversing the string of bits as if it was `bits` long.
-* Keep in mind `bits` cannot exceed `8 * sizeof(T)`.
-*
-* @tparam T type of the value to operate on.
-*
-* @param [in] value The value to bitreverse.
-* @param [in] bits The length of the string of bits used to represent `value`.
-*/
-T bitReverseAs(T value, uint16_t bits)
-{
-    return bitReverse<T>(value) >> promote<T, scalar_type_t<T> >(scalar_type_t <T>(sizeof(T) * 8 - bits));
-}
-
 }
 }
 }
diff --git a/include/nbl/builtin/hlsl/workgroup/fft.hlsl b/include/nbl/builtin/hlsl/workgroup/fft.hlsl
@@ -1,6 +1,7 @@
 #include <nbl/builtin/hlsl/cpp_compat.hlsl>
 #include <nbl/builtin/hlsl/concepts.hlsl>
 #include <nbl/builtin/hlsl/fft/common.hlsl>
+#include <nbl/builtin/hlsl/bitreverse.hlsl>
 
 #ifndef _NBL_BUILTIN_HLSL_WORKGROUP_FFT_INCLUDED_
 #define _NBL_BUILTIN_HLSL_WORKGROUP_FFT_INCLUDED_
@@ -77,6 +78,83 @@ inline OptimalFFTParameters optimalFFTParameters(uint32_t maxWorkgroupSize, uint
     }
 }
 
+namespace impl
+{
+template<uint16_t N, uint16_t H>
+enable_if_t<(0 < H) && (H <= N) && (N < 32), uint32_t> circularBitShiftRightHigher(uint32_t i)
+{
+    // Rotates the highest H bits of the N-bit index `i` (bits N-1 through N-H) right by one position and returns the result; the lowest N-H bits pass through unchanged
+    // Bit N-H (the "middle" bit) wraps around to bit N-1, while bits N-1 through N-H+1 each move one position down
+    // H == 0 is rejected by the enable_if above because `H - 1` would not be a valid shift amount
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t lowMask = (uint32_t(1) << (N - H)) - 1;
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t midMask = uint32_t(1) << (N - H);
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t highMask = ~(lowMask | midMask); // also covers bits N and above, so `i` is expected to be below 2^N
+
+    uint32_t low = i & lowMask;
+    uint32_t mid = i & midMask;
+    uint32_t high = i & highMask;
+
+    high >>= 1;
+    mid <<= H - 1;
+
+    return mid | high | low;
+}
+
+template<uint16_t N, uint16_t H>
+enable_if_t<(0 < H) && (H <= N) && (N < 32), uint32_t> circularBitShiftLeftHigher(uint32_t i)
+{
+    // Rotates the highest H bits of the N-bit index `i` (bits N-1 through N-H) left by one position and returns the result; the lowest N-H bits pass through unchanged
+    // Bit N-1 (the highest bit) wraps around to bit N-H, while the middle bits N-2 through N-H each move one position up
+    // H == 0 is rejected by the enable_if above because `H - 1` would not be a valid shift amount
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t lowMask = (uint32_t(1) << (N - H)) - 1;
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t highMask = uint32_t(1) << (N - 1);
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t midMask = ~(lowMask | highMask); // also covers bits N and above, so `i` is expected to be below 2^N
+
+    uint32_t low = i & lowMask;
+    uint32_t mid = i & midMask;
+    uint32_t high = i & highMask;
+
+    mid <<= 1;
+    high >>= H - 1;
+
+    return mid | high | low;
+}
+} //namespace impl
+
+template<uint16_t ElementsPerInvocationLog2, uint16_t WorkgroupSizeLog2>
+struct FFTIndexingUtils // Stateless index mappings between the output order of a Nabla FFT and DFT (frequency) order
+{
+    // This function maps the index `outputIdx` in the output array of a Nabla FFT to the index `freqIdx` in the DFT such that `DFT[freqIdx] = NablaFFT[outputIdx]`
+    // Needed because Cooley-Tukey + subgroup operations emit the outputs out of order: bit-reverse `outputIdx` over `FFTSizeLog2` bits, then rotate the top `WorkgroupSizeLog2 + 1` bits right by one
+    static uint32_t getDFTIndex(uint32_t outputIdx)
+    {
+        return impl::circularBitShiftRightHigher<FFTSizeLog2, FFTSizeLog2 - ElementsPerInvocationLog2 + 1>(hlsl::bitReverseAs<uint32_t, FFTSizeLog2>(outputIdx));
+    }
+
+    // This function maps the index `freqIdx` in the DFT to the index `idx` in the output array of a Nabla FFT such that `DFT[freqIdx] = NablaFFT[idx]`
+    // It is the inverse of `getDFTIndex`: rotate the top `WorkgroupSizeLog2 + 1` bits left by one first, then bit-reverse over `FFTSizeLog2` bits
+    static uint32_t getNablaIndex(uint32_t freqIdx)
+    {
+        return hlsl::bitReverseAs<uint32_t, FFTSizeLog2>(impl::circularBitShiftLeftHigher<FFTSizeLog2, FFTSizeLog2 - ElementsPerInvocationLog2 + 1>(freqIdx));
+    }
+
+    // Mirrors an index about the Nyquist frequency in the DFT order: returns `FFTSize - freqIdx` wrapped into `[0, FFTSize)`, so index 0 maps to itself
+    static uint32_t getDFTMirrorIndex(uint32_t freqIdx)
+    {
+        return (FFTSize - freqIdx) & (FFTSize - 1);
+    }
+
+    // Given an index `outputIdx` of an element into the Nabla FFT, get the index into the Nabla FFT of the element corresponding to its negative frequency (Nabla order -> DFT order -> mirror -> Nabla order)
+    static uint32_t getNablaMirrorIndex(uint32_t outputIdx)
+    {
+        return getNablaIndex(getDFTMirrorIndex(getDFTIndex(outputIdx)));
+    }
+
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t FFTSizeLog2 = ElementsPerInvocationLog2 + WorkgroupSizeLog2; // log2 of the total FFT size
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t FFTSize = uint32_t(1) << FFTSizeLog2; // = 2^FFTSizeLog2
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkgroupSize = uint32_t(1) << WorkgroupSizeLog2; // = 2^WorkgroupSizeLog2
+};
+
 }
 }
 }
@@ -135,76 +213,12 @@ namespace impl
             }
         }
     };
-
-    template<uint16_t N, uint16_t H>
-    enable_if_t<(H <= N) && (N < 32), uint32_t> circularBitShiftRightHigher(uint32_t i)
-    {
-        // Highest H bits are numbered N-1 through N - H
-        // N - H is then the middle bit
-        // Lowest bits numbered from 0 through N - H - 1
-        NBL_CONSTEXPR_STATIC_INLINE uint32_t lowMask = (1 << (N - H)) - 1;
-        NBL_CONSTEXPR_STATIC_INLINE uint32_t midMask = 1 << (N - H);
-        NBL_CONSTEXPR_STATIC_INLINE uint32_t highMask = ~(lowMask | midMask);
-
-        uint32_t low = i & lowMask;
-        uint32_t mid = i & midMask;
-        uint32_t high = i & highMask;
-
-        high >>= 1;
-        mid <<= H - 1;
-
-        return mid | high | low;
-    }
-
-    template<uint16_t N, uint16_t H>
-    enable_if_t<(H <= N) && (N < 32), uint32_t> circularBitShiftLeftHigher(uint32_t i)
-    {
-        // Highest H bits are numbered N-1 through N - H
-        // N - 1 is then the highest bit, and N - 2 through N - H are the middle bits
-        // Lowest bits numbered from 0 through N - H - 1
-        NBL_CONSTEXPR_STATIC_INLINE uint32_t lowMask = (1 << (N - H)) - 1;
-        NBL_CONSTEXPR_STATIC_INLINE uint32_t highMask = 1 << (N - 1);
-        NBL_CONSTEXPR_STATIC_INLINE uint32_t midMask = ~(lowMask | highMask);
-
-        uint32_t low = i & lowMask;
-        uint32_t mid = i & midMask;
-        uint32_t high = i & highMask;
-
-        mid <<= 1;
-        high >>= H - 1;
-
-        return mid | high | low;
-    }
 } //namespace impl
 
 template<uint16_t ElementsPerInvocationLog2, uint16_t WorkgroupSizeLog2>
-struct FFTIndexingUtils
+struct FFTMirrorTradeUtils
 {
-    // This function maps the index `outputIdx` in the output array of a Nabla FFT to the index `freqIdx` in the DFT such that `DFT[freqIdx] = NablaFFT[outputIdx]`
-    // This is because Cooley-Tukey + subgroup operations end up spewing out the outputs in a weird order
-    static uint32_t getDFTIndex(uint32_t outputIdx)
-    {
-        return impl::circularBitShiftRightHigher<FFTSizeLog2, FFTSizeLog2 - ElementsPerInvocationLog2 + 1>(hlsl::fft::bitReverseAs<uint32_t, FFTSizeLog2>(outputIdx));
-    }
-
-    // This function maps the index `freqIdx` in the DFT to the index `idx` in the output array of a Nabla FFT such that `DFT[freqIdx] = NablaFFT[idx]`
-    // It is essentially the inverse of `getDFTIndex`
-    static uint32_t getNablaIndex(uint32_t freqIdx)
-    {
-        return hlsl::fft::bitReverseAs<uint32_t, FFTSizeLog2>(impl::circularBitShiftLeftHigher<FFTSizeLog2, FFTSizeLog2 - ElementsPerInvocationLog2 + 1>(freqIdx));
-    }
-
-    // Mirrors an index about the Nyquist frequency in the DFT order
-    static uint32_t getDFTMirrorIndex(uint32_t freqIdx)
-    {
-        return (FFTSize - freqIdx) & (FFTSize - 1);
-    }
-
-    // Given an index `outputIdx` of an element into the Nabla FFT, get the index into the Nabla FFT of the element corresponding to its negative frequency
-    static uint32_t getNablaMirrorIndex(uint32_t outputIdx)
-    {
-        return getNablaIndex(getDFTMirrorIndex(getDFTIndex(outputIdx)));
-    }
+    using indexing_utils_t = FFTIndexingUtils<ElementsPerInvocationLog2, WorkgroupSizeLog2>;
 
     // When unpacking an FFT of two packed signals, given a `globalElementIndex` you need its "mirror index" to unpack the value at NablaFFT[globalElementIndex].
     // The function above has you covered in that sense, but what also happens is that not only does the thread holding `NablaFFT[globalElementIndex]` need its mirror value
@@ -216,10 +230,10 @@ struct FFTIndexingUtils
         uint32_t otherThreadID;
         uint32_t mirrorLocalIndex;
     };
-    
+
     static NablaMirrorLocalInfo getNablaMirrorLocalInfo(uint32_t globalElementIndex)
     {
-        const uint32_t otherElementIndex = FFTIndexingUtils::getNablaMirrorIndex(globalElementIndex);
+        const uint32_t otherElementIndex = indexing_utils_t::getNablaMirrorIndex(globalElementIndex);
         const uint32_t mirrorLocalIndex = otherElementIndex / WorkgroupSize;
         const uint32_t otherThreadID = otherElementIndex & (WorkgroupSize - 1);
         const NablaMirrorLocalInfo info = { otherThreadID, mirrorLocalIndex };
@@ -235,23 +249,13 @@ struct FFTIndexingUtils
 
     static NablaMirrorGlobalInfo getNablaMirrorGlobalInfo(uint32_t globalElementIndex)
     {
-        const uint32_t otherElementIndex = FFTIndexingUtils::getNablaMirrorIndex(globalElementIndex);
+        const uint32_t otherElementIndex = indexing_utils_t::getNablaMirrorIndex(globalElementIndex);
         const uint32_t mirrorGlobalIndex = glsl::bitfieldInsert<uint32_t>(otherElementIndex, workgroup::SubgroupContiguousIndex(), 0, uint32_t(WorkgroupSizeLog2));
         const uint32_t otherThreadID = otherElementIndex & (WorkgroupSize - 1);
         const NablaMirrorGlobalInfo info = { otherThreadID, mirrorGlobalIndex };
         return info;
     }
 
-    NBL_CONSTEXPR_STATIC_INLINE uint16_t FFTSizeLog2 = ElementsPerInvocationLog2 + WorkgroupSizeLog2;
-    NBL_CONSTEXPR_STATIC_INLINE uint32_t FFTSize = uint32_t(1) << FFTSizeLog2;
-    NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkgroupSize = uint32_t(1) << WorkgroupSizeLog2;
-};
-
-template<uint16_t ElementsPerInvocationLog2, uint16_t WorkgroupSizeLog2>
-struct FFTMirrorTradeUtils
-{
-    using indexing_utils_t = FFTIndexingUtils<ElementsPerInvocationLog2, WorkgroupSizeLog2>;
-    using mirror_info_t = typename indexing_utils_t::NablaMirrorGlobalInfo;
     // If trading elements when, for example, unpacking real FFTs, you might do so from within your accessor or from outside. 
     // If doing so from within your accessor, particularly if using a preloaded accessor, you might want to do this yourself by
     // using FFTIndexingUtils::getNablaMirrorTradeInfo and trading the elements yourself (an example of how to set this up is given in
@@ -261,7 +265,7 @@ struct FFTMirrorTradeUtils
     template<typename scalar_t, typename fft_array_accessor_t, typename shared_memory_adaptor_t>
     static complex_t<scalar_t> getNablaMirror(uint32_t globalElementIndex, fft_array_accessor_t arrayAccessor, shared_memory_adaptor_t sharedmemAdaptor)
     {
-        const mirror_info_t mirrorInfo = indexing_utils_t::getNablaMirrorGlobalInfo(globalElementIndex);
+        const NablaMirrorGlobalInfo mirrorInfo = getNablaMirrorGlobalInfo(globalElementIndex);
         complex_t<scalar_t> toTrade = arrayAccessor.get(mirrorInfo.mirrorGlobalIndex);
         vector<scalar_t, 2> toTradeVector = { toTrade.real(), toTrade.imag() };
         workgroup::Shuffle<shared_memory_adaptor_t, vector<scalar_t, 2> >::__call(toTradeVector, mirrorInfo.otherThreadID, sharedmemAdaptor);
@@ -271,6 +275,7 @@ struct FFTMirrorTradeUtils
     }
 
     NBL_CONSTEXPR_STATIC_INLINE indexing_utils_t IndexingUtils;
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkgroupSize = indexing_utils_t::WorkgroupSize;
 };
 
 
diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt
@@ -353,5 +353,7 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/mip_mapped
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/storable_image.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/fft.hlsl")
 
+# temporary: delete this entry once hlsl/bitreverse.hlsl has been replaced
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bitreverse.hlsl")
 
 ADD_CUSTOM_BUILTIN_RESOURCES(nblBuiltinResourceData NBL_RESOURCES_TO_EMBED "${NBL_ROOT_PATH}/include" "nbl/builtin" "nbl::builtin" "${NBL_ROOT_PATH_BINARY}/include" "${NBL_ROOT_PATH_BINARY}/src" "STATIC" "INTERNAL")