Allow user to create FFT objects using pre-allocated data (#8)

jatinchowdhury18 · web-flow · commit 667b38ef7319 · 2025-09-05T01:49:58.000-07:00
* Small fixes

* Working pre-allocated FFT

* Update C test

* Fixes for x64

* DRY refactor
diff --git a/.gitignore b/.gitignore
@@ -4,3 +4,4 @@ build*/
 *.raddbg_project
 .vscode/
 .idea/
+.zed/
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -57,7 +57,7 @@ else()
     else()
         set(DEBUG_OPT_FLAGS -O2)
     endif()
-    message(STATUS "Setting debug optimization flags: ${DEBUG_OPT_FLAGS}")
+    message(STATUS "chowdsp_fft -- Setting debug optimization flags: ${DEBUG_OPT_FLAGS}")
 
     target_compile_options(chowdsp_fft PRIVATE $<$<CONFIG:Debug>:${DEBUG_OPT_FLAGS}>)
     if(TARGET chowdsp_fft_avx)
diff --git a/chowdsp_fft.cpp b/chowdsp_fft.cpp
@@ -86,7 +86,8 @@ void aligned_free (void* p)
 namespace chowdsp::fft::avx
 {
 struct FFT_Setup;
-FFT_Setup* fft_new_setup (int N, fft_transform_t transform);
+size_t fft_bytes_required (int N, fft_transform_t transform);
+FFT_Setup* fft_new_setup (int N, fft_transform_t transform, void* data);
 void fft_destroy_setup (FFT_Setup* s);
 void pffft_transform_internal (FFT_Setup* setup, const float* finput, float* foutput, void* scratch, fft_direction_t direction, int ordered);
 void pffft_convolve_internal (FFT_Setup* setup, const float* a, const float* b, float* ab, float scaling);
@@ -228,27 +229,53 @@ bool check_is_pointer_sse_setup (void* ptr)
 }
 #endif
 
-void* fft_new_setup (int N, fft_transform_t transform, [[maybe_unused]] bool use_avx_if_available)
+size_t fft_bytes_required (int N, fft_transform_t transform, bool use_avx_if_available)
 {
 #if defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64)
 #if CHOWDSP_FFT_COMPILER_SUPPORTS_AVX
     if (use_avx_if_available)
     {
         if (cpu_supports_avx())
         {
-            auto* setup_ptr = avx::fft_new_setup (N, transform);
+            return avx::fft_bytes_required (N, transform);
+        }
+    }
+    return sse::fft_bytes_required (N, transform);
+#else
+    return sse::fft_bytes_required (N, transform);
+#endif
+#elif defined(__ARM_NEON__) || defined(_M_ARM64)
+    return neon::fft_bytes_required (N, transform);
+#endif
+}
+
+void* fft_new_setup (int N, fft_transform_t transform, bool use_avx_if_available)
+{
+    const auto fft_bytes = fft_bytes_required (N, transform, use_avx_if_available);
+    return fft_new_setup_preallocated (N, transform, aligned_malloc (fft_bytes), use_avx_if_available);
+}
+
+void* fft_new_setup_preallocated (int N, fft_transform_t transform, void* data, [[maybe_unused]] bool use_avx_if_available)
+{
+#if defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64)
+#if CHOWDSP_FFT_COMPILER_SUPPORTS_AVX
+    if (use_avx_if_available)
+    {
+        if (cpu_supports_avx())
+        {
+            auto* setup_ptr = avx::fft_new_setup (N, transform, data);
             if (setup_ptr != nullptr)
                 return setup_ptr;
         }
     }
-    void* ptr = sse::fft_new_setup (N, transform);
+    void* ptr = sse::fft_new_setup (N, transform, data);
     set_pointer_is_sse_setup (ptr);
     return ptr;
 #else
-    return sse::fft_new_setup (N, transform);
+    return sse::fft_new_setup (N, transform, data);
 #endif
 #elif defined(__ARM_NEON__) || defined(_M_ARM64)
-    return neon::fft_new_setup (N, transform);
+    return neon::fft_new_setup (N, transform, data);
 #endif
 }
 
diff --git a/chowdsp_fft.h b/chowdsp_fft.h
@@ -74,6 +74,16 @@ typedef enum
     FFT_COMPLEX
 } fft_transform_t;
 
+/*
+  Returns the number of bytes needed for a "pre-allocated" FFT.
+  See `fft_new_setup_preallocated` for more details.
+*/
+size_t fft_bytes_required (int N, fft_transform_t transform, bool use_avx_if_available
+#ifdef __cplusplus
+                                                             = true
+#endif
+);
+
 /*
   prepare for performing transforms of size N -- the returned
   FFT_Setup structure is read-only so it can safely be shared by
@@ -84,6 +94,28 @@ void* fft_new_setup (int N, fft_transform_t transform, bool use_avx_if_available
                                                        = true
 #endif
 );
+
+/*
+  Same as fft_new_setup, except that all the memory for the FFT
+  is pre-allocated, and is provided by the caller via the "data"
+  pointer.
+
+  pffft's `aligned_malloc` aligns all allocations to 64 bytes.
+  Depending on your specific case, you may be able to get away
+  with a lower alignment requirement, but make sure to test! It
+  is expected that at least 16-byte alignment will be required
+  for FFTs using NEON or SSE, and at least 32-byte alignment will
+  be required for FFTs using AVX.
+
+  If you create your FFT with this method, you don't need to call
+  `fft_destroy_setup`, but you are responsible for freeing the
+  pre-allocated memory.
+*/
+void* fft_new_setup_preallocated (int N, fft_transform_t transform, void* data, bool use_avx_if_available
+#ifdef __cplusplus
+                                                                                = true
+#endif
+);
 void fft_destroy_setup (void*);
 
 /** Returns the width (in bytes) of the SIMD registers used by the FFT setup. */
diff --git a/simd/chowdsp_fft_impl_avx.cpp b/simd/chowdsp_fft_impl_avx.cpp
@@ -70,72 +70,21 @@ struct FFT_Setup
     float* twiddle; // points into 'data', N/4 elements
 };
 
-FFT_Setup* fft_new_setup (int N, fft_transform_t transform)
+size_t fft_bytes_required (int N, fft_transform_t transform)
 {
-    /* unfortunately, the fft size must be a multiple of 16 for complex FFTs
-       and 32 for real FFTs -- a lot of stuff would need to be rewritten to
-       handle other cases (or maybe just switch to a scalar fft, I don't know..) */
-    if (transform == FFT_REAL)
-    {
-        if (! ((N % (2 * SIMD_SZ * SIMD_SZ)) == 0 && N > 0))
-            return nullptr;
-    }
-    if (transform == FFT_COMPLEX)
-    {
-        if (! ((N % (SIMD_SZ * SIMD_SZ)) == 0 && N > 0))
-            return nullptr;
-    }
-
-    auto* s = (FFT_Setup*) malloc (sizeof (FFT_Setup));
-    //assert((N % 32) == 0);
-    s->N = N;
-    s->transform = transform;
-    /* nb of complex simd vectors */
-    s->Ncvec = (transform == FFT_REAL ? N / 2 : N) / SIMD_SZ;
-    s->data = (__m256*) aligned_malloc (2 * s->Ncvec * sizeof (float) * SIMD_SZ);
-    s->e = (float*) s->data;
-    s->twiddle = (float*) (s->data + (2 * s->Ncvec * (SIMD_SZ - 1)) / SIMD_SZ);
-
-    int k, m;
-    for (k = 0; k < s->Ncvec; ++k)
-    {
-        int i = k / (int) SIMD_SZ;
-        int j = k % (int) SIMD_SZ;
-        for (m = 0; m < SIMD_SZ - 1; ++m)
-        {
-            const auto A = static_cast<float> (-2 * M_PI * (m + 1) * k / N);
-            s->e[(2 * (i * 7 + m) + 0) * SIMD_SZ + j] = std::cos (A);
-            s->e[(2 * (i * 7 + m) + 1) * SIMD_SZ + j] = std::sin (A);
-        }
-    }
-
-    if (transform == FFT_REAL)
-    {
-        common::rffti1_ps (N / (int) SIMD_SZ, s->twiddle, s->ifac);
-    }
-    else
-    {
-        common::cffti1_ps (N / (int) SIMD_SZ, s->twiddle, s->ifac);
-    }
-
-    /* check that N is decomposable with allowed prime factors */
-    for (k = 0, m = 1; k < s->ifac[1]; ++k)
-    {
-        m *= s->ifac[2 + k];
-    }
-    if (m != N / SIMD_SZ)
-    {
-        fft_destroy_setup (s);
-        s = nullptr;
-    }
+    const auto Ncvec = (transform == FFT_REAL ? N / 2 : N) / SIMD_SZ;
+    const auto data_bytes = 2 * Ncvec * sizeof (float) * SIMD_SZ;
+    return data_bytes + sizeof (FFT_Setup);
+}
 
-    return s;
+FFT_Setup* fft_new_setup (int N, fft_transform_t transform, void* data)
+{
+    return common::fft_new_setup<FFT_Setup, __m256> (N, transform, SIMD_SZ, data);
 }
 
 void fft_destroy_setup (FFT_Setup* s)
 {
     aligned_free (s->data);
-    free (s);
 }
 
 //====================================================================
diff --git a/simd/chowdsp_fft_impl_common.hpp b/simd/chowdsp_fft_impl_common.hpp
@@ -158,4 +158,72 @@ void inline cffti1_ps (int n, float* wa, int* ifac)
         l1 = l2;
     }
 }
+
+template <typename Setup_Type, typename SIMD_Type>
+static inline Setup_Type* fft_new_setup (int N, fft_transform_t transform, int SIMD_SZ, void* data)
+{
+    /* unfortunately, the fft size must be a multiple of 16 for complex FFTs
+       and 32 for real FFTs -- a lot of stuff would need to be rewritten to
+       handle other cases (or maybe just switch to a scalar fft, I don't know..) */
+    if (transform == FFT_REAL)
+    {
+        if (! ((N % (2 * SIMD_SZ * SIMD_SZ)) == 0 && N > 0))
+            return nullptr;
+    }
+    if (transform == FFT_COMPLEX)
+    {
+        if (! ((N % (SIMD_SZ * SIMD_SZ)) == 0 && N > 0))
+            return nullptr;
+    }
+
+    const auto Ncvec = (transform == FFT_REAL ? N / 2 : N) / SIMD_SZ;
+    const auto data_bytes = 2 * Ncvec * sizeof (float) * SIMD_SZ;
+    auto* s_data = (std::byte*) data;
+
+    auto* s = (Setup_Type*) (s_data + data_bytes);
+    s->N = N;
+    s->transform = transform;
+    /* nb of complex simd vectors */
+    s->Ncvec = Ncvec;
+    s->data = (SIMD_Type*) s_data;
+    s->e = (float*) s->data;
+    s->twiddle = (float*) (s->data + (2 * s->Ncvec * (SIMD_SZ - 1)) / SIMD_SZ);
+
+    int k, m;
+    const auto M = SIMD_SZ - 1;
+
+    for (k = 0; k < s->Ncvec; ++k)
+    {
+        int i = k / (int) SIMD_SZ;
+        int j = k % (int) SIMD_SZ;
+        for (m = 0; m < M; ++m)
+        {
+            const auto A = static_cast<float> (-2 * M_PI * (m + 1) * k / N);
+            s->e[(2 * (i * M + m) + 0) * SIMD_SZ + j] = std::cos (A);
+            s->e[(2 * (i * M + m) + 1) * SIMD_SZ + j] = std::sin (A);
+        }
+    }
+
+    if (transform == FFT_REAL)
+    {
+        rffti1_ps (N / (int) SIMD_SZ, s->twiddle, s->ifac);
+    }
+    else
+    {
+        cffti1_ps (N / (int) SIMD_SZ, s->twiddle, s->ifac);
+    }
+
+    /* check that N is decomposable with allowed prime factors */
+    for (k = 0, m = 1; k < s->ifac[1]; ++k)
+    {
+        m *= s->ifac[2 + k];
+    }
+    if (m != N / SIMD_SZ)
+    {
+        fft_destroy_setup (s);
+        s = nullptr;
+    }
+
+    return s;
+}
 }
diff --git a/simd/chowdsp_fft_impl_neon.cpp b/simd/chowdsp_fft_impl_neon.cpp
@@ -68,69 +68,21 @@ struct FFT_Setup
     float* twiddle; // points into 'data', N/4 elements
 };
 
-static FFT_Setup* fft_new_setup (int N, fft_transform_t transform)
+static size_t fft_bytes_required (int N, fft_transform_t transform)
 {
-    auto* s = (FFT_Setup*) malloc (sizeof (FFT_Setup));
-    /* unfortunately, the fft size must be a multiple of 16 for complex FFTs
-       and 32 for real FFTs -- a lot of stuff would need to be rewritten to
-       handle other cases (or maybe just switch to a scalar fft, I don't know..) */
-    if (transform == FFT_REAL)
-    {
-        assert ((N % (2 * SIMD_SZ * SIMD_SZ)) == 0 && N > 0);
-    }
-    if (transform == FFT_COMPLEX)
-    {
-        assert ((N % (SIMD_SZ * SIMD_SZ)) == 0 && N > 0);
-    }
-    //assert((N % 32) == 0);
-    s->N = N;
-    s->transform = transform;
-    /* nb of complex simd vectors */
-    s->Ncvec = (transform == FFT_REAL ? N / 2 : N) / SIMD_SZ;
-    s->data = (float32x4_t*) aligned_malloc (2 * s->Ncvec * sizeof (float) * SIMD_SZ);
-    s->e = (float*) s->data;
-    s->twiddle = (float*) (s->data + (2 * s->Ncvec * (SIMD_SZ - 1)) / SIMD_SZ);
-
-    int k, m;
-    for (k = 0; k < s->Ncvec; ++k)
-    {
-        int i = k / (int) SIMD_SZ;
-        int j = k % (int) SIMD_SZ;
-        for (m = 0; m < SIMD_SZ - 1; ++m)
-        {
-            const auto A = static_cast<float> (-2 * M_PI * (m + 1) * k / N);
-            s->e[(2 * (i * 3 + m) + 0) * SIMD_SZ + j] = std::cos (A);
-            s->e[(2 * (i * 3 + m) + 1) * SIMD_SZ + j] = std::sin (A);
-        }
-    }
-
-    if (transform == FFT_REAL)
-    {
-        common::rffti1_ps (N / (int) SIMD_SZ, s->twiddle, s->ifac);
-    }
-    else
-    {
-        common::cffti1_ps (N / (int) SIMD_SZ, s->twiddle, s->ifac);
-    }
-
-    /* check that N is decomposable with allowed prime factors */
-    for (k = 0, m = 1; k < s->ifac[1]; ++k)
-    {
-        m *= s->ifac[2 + k];
-    }
-    if (m != N / SIMD_SZ)
-    {
-        fft_destroy_setup (s);
-        s = nullptr;
-    }
+    const auto Ncvec = (transform == FFT_REAL ? N / 2 : N) / SIMD_SZ;
+    const auto data_bytes = 2 * Ncvec * sizeof (float) * SIMD_SZ;
+    return data_bytes + sizeof (FFT_Setup);
+}
 
-    return s;
+static FFT_Setup* fft_new_setup (int N, fft_transform_t transform, void* data)
+{
+    return common::fft_new_setup<FFT_Setup, float32x4_t> (N, transform, SIMD_SZ, data);
 }
 
 static void fft_destroy_setup (FFT_Setup* s)
 {
     aligned_free (s->data);
-    free (s);
 }
 
 //====================================================================
diff --git a/simd/chowdsp_fft_impl_sse.cpp b/simd/chowdsp_fft_impl_sse.cpp
diff --git a/test/test.c b/test/test.c
diff --git a/test/test.cpp b/test/test.cpp