Chowdhury-DSP · jatinchowdhury18 · Apr 6, 2025 · Apr 6, 2025 · Apr 6, 2025 · Apr 6, 2025
diff --git a/README.md b/README.md
@@ -7,7 +7,9 @@
 `chowdsp_fft` is a fork of the [`pffft`](https://bitbucket.org/jpommier/pffft/src/master/)
 library for computing Fast Fourier Transforms. This fork adds a couple
 of optimizations, most importantly the ability to use AVX intrinsics
-for single-precision FFTs on compatible platforms.
+for single-precision FFTs on compatible platforms. The library also
+contains some methods which may be useful for computing convolutions
+using frequency-domain multiplication.
 
 ## Disclaimer
 

diff --git a/chowdsp_fft.cpp b/chowdsp_fft.cpp
@@ -267,6 +267,26 @@ void fft_destroy_setup (void* ptr)
 #endif
 }
 
+int fft_simd_width_bytes (void* setup)
+{
+#if defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64)
+#if CHOWDSP_FFT_COMPILER_SUPPORTS_AVX
+    if (check_is_pointer_sse_setup (setup))
+    {
+        return 16;
+    }
+    else
+    {
+        return 32;
+    }
+#else
+    return 16;
+#endif
+#elif defined(__ARM_NEON__) || defined(_M_ARM64)
+    return 16;
+#endif
+}
+
 void fft_transform (void* setup, const float* input, float* output, float* work, fft_direction_t direction)
 {
 #if defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64)
@@ -382,4 +402,40 @@ void fft_convolve_unordered (void* setup, const float* a, const float* b, float*
                                    scaling);
 #endif
 }
+
+void fft_accumulate (void* setup, const float* a, const float* b, float* ab, int N)
+{
+#if defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64)
+#if CHOWDSP_FFT_COMPILER_SUPPORTS_AVX
+    if (check_is_pointer_sse_setup (setup))
+    {
+        sse::fft_accumulate_internal (reinterpret_cast<sse::FFT_Setup*> (setup),
+                                      a,
+                                      b,
+                                      ab,
+                                      N);
+    }
+    else
+    {
+        avx::fft_accumulate_internal (reinterpret_cast<avx::FFT_Setup*> (setup),
+                                      a,
+                                      b,
+                                      ab,
+                                      N);
+    }
+#else
+    sse::fft_accumulate_internal (reinterpret_cast<sse::FFT_Setup*> (setup),
+                                  a,
+                                  b,
+                                  ab,
+                                  N);
+#endif
+#elif defined(__ARM_NEON__) || defined(_M_ARM64)
+    neon::fft_accumulate_internal (reinterpret_cast<neon::FFT_Setup*> (setup),
+                                   a,
+                                   b,
+                                   ab,
+                                   N);
+#endif
+}
 } // namespace chowdsp::fft
diff --git a/chowdsp_fft.h b/chowdsp_fft.h
@@ -86,6 +86,9 @@ void* fft_new_setup (int N, fft_transform_t transform, bool use_avx_if_available
 );
 void fft_destroy_setup (void*);
 
+/** Returns the width (in bytes) of the SIMD registers used by the FFT setup. */
+int fft_simd_width_bytes (void* setup);
+
 /*
    Perform a Fourier transform , The z-domain data is stored as
    interleaved complex numbers.
@@ -111,9 +114,19 @@ void fft_transform_unordered (void* setup, const float* input, float* output, fl
 
 /**
  * Convolve two (unordered) frequency-domain signals, with some scale factor.
+ *
+ * The operation performed is: dft_ab += (dft_a * fdt_b)*scaling
+ *
+ * The dft_a, dft_b and dft_ab pointers may alias.
  */
 void fft_convolve_unordered (void* setup, const float* a, const float* b, float* ab, float scaling);
 
+/**
+ * Computes the sum of two signals of length N.
+ * N must be a multiple of the FFT setup's SIMD width.
+ */
+void fft_accumulate (void* setup, const float* a, const float* b, float* ab, int N);
+
 void* aligned_malloc (size_t nb_bytes);
 void aligned_free (void*);
 

diff --git a/simd/chowdsp_fft_impl_avx.cpp b/simd/chowdsp_fft_impl_avx.cpp
@@ -2028,5 +2028,20 @@ void pffft_convolve_internal (FFT_Setup* setup, const float* a, const float* b,
         reinterpret_cast<float*> (&vab[1])[0] = abi0 + ai0 * bi0 * scaling;
     }
 }
+
+void fft_accumulate_internal (FFT_Setup* setup, const float* a, const float* b, float* ab, int N)
+{
+    assert (N % SIMD_SZ == 0);
+    const auto Ncvec = N / (int) SIMD_SZ;
+    auto* va = (const __m256*) a;
+    auto* vb = (const __m256*) b;
+    auto* vab = (__m256*) ab;
+
+    for (int i = 0; i < Ncvec; i += 2)
+    {
+        vab[i] = _mm256_add_ps (va[i], vb[i]);
+        vab[i + 1] = _mm256_add_ps (va[i + 1], vb[i + 1]);
-    for (int i = 0; i < Ncvec; i += 2)
-    {
-        vab[i] = _mm256_add_ps (va[i], vb[i]);
-        vab[i + 1] = _mm256_add_ps (va[i + 1], vb[i + 1]);
+    for (int i = 0; i < Ncvec; ++i)
+    {
+        vab[i] = _mm256_add_ps (va[i], vb[i]);
-    for (int i = 0; i < Ncvec; i += 2)
-    {
-        vab[i] = _mm256_add_ps (va[i], vb[i]);
-        vab[i + 1] = _mm256_add_ps (va[i + 1], vb[i + 1]);
+    for (int i = 0; i < Ncvec; ++i)
+    {
+        vab[i] = _mm256_add_ps (va[i], vb[i]);
+    }
+}
 } // namespace chowdsp::fft::avx
 #endif
diff --git a/simd/chowdsp_fft_impl_neon.cpp b/simd/chowdsp_fft_impl_neon.cpp
@@ -1674,4 +1674,19 @@ void pffft_convolve_internal (FFT_Setup* setup, const float* a, const float* b,
         reinterpret_cast<float*> (&vab[1])[0] = abi0 + ai0 * bi0 * scaling;
     }
 }
+
+void fft_accumulate_internal (FFT_Setup* setup, const float* a, const float* b, float* ab, int N)
+{
+    assert (N % SIMD_SZ == 0);
+    const auto Ncvec = N / (int) SIMD_SZ;
+    auto* va = (const float32x4_t*) a;
+    auto* vb = (const float32x4_t*) b;
+    auto* vab = (float32x4_t*) ab;
+
+    for (int i = 0; i < Ncvec; i += 2)
+    {
+        vab[i] = vaddq_f32 (va[i], vb[i]);
+        vab[i + 1] = vaddq_f32 (va[i + 1], vb[i + 1]);
+    }
+}
 } // namespace chowdsp::fft::neon
diff --git a/simd/chowdsp_fft_impl_sse.cpp b/simd/chowdsp_fft_impl_sse.cpp
@@ -1689,4 +1689,19 @@ void pffft_convolve_internal (FFT_Setup* setup, const float* a, const float* b,
         reinterpret_cast<float*> (&vab[1])[0] = abi0 + ai0 * bi0 * scaling;
     }
 }
+
+void fft_accumulate_internal (FFT_Setup* setup, const float* a, const float* b, float* ab, int N)
+{
+    assert (N % SIMD_SZ == 0);
+    const auto Ncvec = N / (int) SIMD_SZ;
+    auto* va = (const __m128*) a;
+    auto* vb = (const __m128*) b;
+    auto* vab = (__m128*) ab;
+
+    for (int i = 0; i < Ncvec; i += 2)
+    {
+        vab[i] = _mm_add_ps (va[i], vb[i]);
+        vab[i + 1] = _mm_add_ps (va[i + 1], vb[i + 1]);
-    for (int i = 0; i < Ncvec; i += 2)
-    {
-        vab[i] = _mm_add_ps (va[i], vb[i]);
-        vab[i + 1] = _mm_add_ps (va[i + 1], vb[i + 1]);
+    for (int i = 0; i < Ncvec; ++i)
+    {
+        vab[i] = _mm_add_ps (va[i], vb[i]);
-    for (int i = 0; i < Ncvec; i += 2)
-    {
-        vab[i] = _mm_add_ps (va[i], vb[i]);
-        vab[i + 1] = _mm_add_ps (va[i + 1], vb[i + 1]);
+    for (int i = 0; i < Ncvec; ++i)
+    {
+        vab[i] = _mm_add_ps (va[i], vb[i]);
+    }
+}
 } // namespace chowdsp::fft::sse
diff --git a/test/test.cpp b/test/test.cpp
@@ -31,6 +31,11 @@ void test_fft_complex (int N, bool use_avx = false)
     REQUIRE (fft_setup != nullptr);
     auto* pffft_setup = pffft_new_setup (N, PFFFT_COMPLEX);
 
+    if (use_avx)
+        REQUIRE (chowdsp::fft::fft_simd_width_bytes (fft_setup) == 32);
+    else
+        REQUIRE (chowdsp::fft::fft_simd_width_bytes (fft_setup) == 16);
+
     chowdsp::fft::fft_transform (fft_setup, data, data, work_data, chowdsp::fft::FFT_FORWARD);
     pffft_transform_ordered (pffft_setup, data_ref, data_ref, work_data_ref, PFFFT_FORWARD);
 
@@ -176,13 +181,16 @@ void test_convolution_real (int N, bool use_avx = false)
     pffft_transform (pffft_setup, sine2_ref, sine2_ref, work_data_ref, PFFFT_FORWARD);
     pffft_zconvolve_accumulate (pffft_setup, sine1_ref, sine2_ref, out_ref, norm_gain);
     pffft_transform (pffft_setup, out_ref, out_ref, work_data_ref, PFFFT_BACKWARD);
+    for (int i = 0; i < N; ++i)
+        out_ref[i] += sine1_ref[i];
 
     auto* fft_setup = chowdsp::fft::fft_new_setup (N, chowdsp::fft::FFT_REAL, use_avx);
     REQUIRE (fft_setup != nullptr);
     chowdsp::fft::fft_transform_unordered (fft_setup, sine1, sine1, work_data, chowdsp::fft::FFT_FORWARD);
     chowdsp::fft::fft_transform_unordered (fft_setup, sine2, sine2, work_data, chowdsp::fft::FFT_FORWARD);
     chowdsp::fft::fft_convolve_unordered (fft_setup, sine1, sine2, out, norm_gain);
     chowdsp::fft::fft_transform_unordered (fft_setup, out, out, work_data, chowdsp::fft::FFT_BACKWARD);
+    chowdsp::fft::fft_accumulate (fft_setup, out, sine1, out, N);
 
     compare (out_ref, out, N);