diff --git a/.gitignore b/.gitignore index 4ba8b72..a328b03 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,8 @@ build*/ +.focus-config +.raddbg_project .vscode/ .idea/ + + diff --git a/README.md b/README.md index fa1ec46..b99ddc8 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,9 @@ `chowdsp_fft` is a fork of the [`pffft`](https://bitbucket.org/jpommier/pffft/src/master/) library for computing Fast Fourier Transforms. This fork adds a couple of optimizations, most importantly the ability to use AVX intrinsics -for single-precision FFTs on compatible platforms. +for single-precision FFTs on compatible platforms. The library also +contains some methods which may be useful for computing convolutions +using frequency-domain multiplication. ## Disclaimer diff --git a/chowdsp_fft.cpp b/chowdsp_fft.cpp index b026faa..8aafb28 100644 --- a/chowdsp_fft.cpp +++ b/chowdsp_fft.cpp @@ -90,6 +90,7 @@ FFT_Setup* fft_new_setup (int N, fft_transform_t transform); void fft_destroy_setup (FFT_Setup* s); void pffft_transform_internal (FFT_Setup* setup, const float* finput, float* foutput, void* scratch, fft_direction_t direction, int ordered); void pffft_convolve_internal (FFT_Setup* setup, const float* a, const float* b, float* ab, float scaling); +void fft_accumulate_internal (const float* a, const float* b, float* ab, int N); } // namespace chowdsp::fft::avx static constexpr uintptr_t address_mask = ~static_cast (3); static constexpr uintptr_t typeid_mask = static_cast (3); @@ -267,6 +268,26 @@ void fft_destroy_setup (void* ptr) #endif } +int fft_simd_width_bytes (void* setup) +{ +#if defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) +#if CHOWDSP_FFT_COMPILER_SUPPORTS_AVX + if (check_is_pointer_sse_setup (setup)) + { + return 16; + } + else + { + return 32; + } +#else + return 16; +#endif +#elif defined(__ARM_NEON__) || defined(_M_ARM64) + return 16; +#endif +} + void fft_transform (void* setup, const float* input, float* output, float* work, fft_direction_t direction) { #if defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) @@ -382,4 +403,24 @@ void fft_convolve_unordered (void* setup, const float* a, const float* b, float* scaling); #endif } + +void fft_accumulate (void* setup, const float* a, const float* b, float* ab, int N) +{ +#if defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) +#if CHOWDSP_FFT_COMPILER_SUPPORTS_AVX + if (check_is_pointer_sse_setup (setup)) + { + sse::fft_accumulate_internal (a, b, ab, N); + } + else + { + avx::fft_accumulate_internal (a, b, ab, N); + } +#else + sse::fft_accumulate_internal (a, b, ab, N); +#endif +#elif defined(__ARM_NEON__) || defined(_M_ARM64) + neon::fft_accumulate_internal (a, b, ab, N); +#endif +} } // namespace chowdsp::fft diff --git a/chowdsp_fft.h b/chowdsp_fft.h index f9ce206..ce610c0 100644 --- a/chowdsp_fft.h +++ b/chowdsp_fft.h @@ -86,6 +86,9 @@ void* fft_new_setup (int N, fft_transform_t transform, bool use_avx_if_available ); void fft_destroy_setup (void*); +/** Returns the width (in bytes) of the SIMD registers used by the FFT setup. */ +int fft_simd_width_bytes (void* setup); + /* Perform a Fourier transform , The z-domain data is stored as interleaved complex numbers. @@ -111,9 +114,19 @@ void fft_transform_unordered (void* setup, const float* input, float* output, fl /** * Convolve two (unordered) frequency-domain signals, with some scale factor. + * + * The operation performed is: dft_ab += (dft_a * fdt_b)*scaling + * + * The dft_a, dft_b and dft_ab pointers may alias. */ void fft_convolve_unordered (void* setup, const float* a, const float* b, float* ab, float scaling); +/** + * Computes the sum of two signals of length N. + * N must be a multiple of the FFT setup's SIMD width. + */ +void fft_accumulate (void* setup, const float* a, const float* b, float* ab, int N); + void* aligned_malloc (size_t nb_bytes); void aligned_free (void*); diff --git a/simd/chowdsp_fft_impl_avx.cpp b/simd/chowdsp_fft_impl_avx.cpp index 4cd2d6b..c503de2 100644 --- a/simd/chowdsp_fft_impl_avx.cpp +++ b/simd/chowdsp_fft_impl_avx.cpp @@ -2028,5 +2028,20 @@ void pffft_convolve_internal (FFT_Setup* setup, const float* a, const float* b, reinterpret_cast (&vab[1])[0] = abi0 + ai0 * bi0 * scaling; } } + +void fft_accumulate_internal (const float* a, const float* b, float* ab, int N) +{ + assert (N % (SIMD_SZ * 2) == 0); + const auto Ncvec = N / (int) SIMD_SZ; + auto* va = (const __m256*) a; + auto* vb = (const __m256*) b; + auto* vab = (__m256*) ab; + + for (int i = 0; i < Ncvec; i += 2) + { + vab[i] = _mm256_add_ps (va[i], vb[i]); + vab[i + 1] = _mm256_add_ps (va[i + 1], vb[i + 1]); + } +} } // namespace chowdsp::fft::avx #endif diff --git a/simd/chowdsp_fft_impl_neon.cpp b/simd/chowdsp_fft_impl_neon.cpp index e7eade9..8ad4b6f 100644 --- a/simd/chowdsp_fft_impl_neon.cpp +++ b/simd/chowdsp_fft_impl_neon.cpp @@ -1674,4 +1674,19 @@ void pffft_convolve_internal (FFT_Setup* setup, const float* a, const float* b, reinterpret_cast (&vab[1])[0] = abi0 + ai0 * bi0 * scaling; } } + +void fft_accumulate_internal (const float* a, const float* b, float* ab, int N) +{ + assert (N % (SIMD_SZ * 2) == 0); + const auto Ncvec = N / (int) SIMD_SZ; + auto* va = (const float32x4_t*) a; + auto* vb = (const float32x4_t*) b; + auto* vab = (float32x4_t*) ab; + + for (int i = 0; i < Ncvec; i += 2) + { + vab[i] = vaddq_f32 (va[i], vb[i]); + vab[i + 1] = vaddq_f32 (va[i + 1], vb[i + 1]); + } +} } // namespace chowdsp::fft::neon diff --git a/simd/chowdsp_fft_impl_sse.cpp b/simd/chowdsp_fft_impl_sse.cpp index 4691594..88da119 100644 --- a/simd/chowdsp_fft_impl_sse.cpp +++ b/simd/chowdsp_fft_impl_sse.cpp @@ -1689,4 +1689,19 @@ void pffft_convolve_internal (FFT_Setup* setup, const float* a, const float* b, reinterpret_cast (&vab[1])[0] = abi0 + ai0 * bi0 * scaling; } } + +void fft_accumulate_internal (const float* a, const float* b, float* ab, int N) +{ + assert (N % (SIMD_SZ * 2) == 0); + const auto Ncvec = N / (int) SIMD_SZ; + auto* va = (const __m128*) a; + auto* vb = (const __m128*) b; + auto* vab = (__m128*) ab; + + for (int i = 0; i < Ncvec; i += 2) + { + vab[i] = _mm_add_ps (va[i], vb[i]); + vab[i + 1] = _mm_add_ps (va[i + 1], vb[i + 1]); + } +} } // namespace chowdsp::fft::sse diff --git a/test/test.cpp b/test/test.cpp index 8d171c1..5d6c963 100644 --- a/test/test.cpp +++ b/test/test.cpp @@ -30,6 +30,8 @@ void test_fft_complex (int N, bool use_avx = false) auto* fft_setup = chowdsp::fft::fft_new_setup (N, chowdsp::fft::FFT_COMPLEX, use_avx); REQUIRE (fft_setup != nullptr); auto* pffft_setup = pffft_new_setup (N, PFFFT_COMPLEX); + if (! use_avx) + REQUIRE (chowdsp::fft::fft_simd_width_bytes (fft_setup) == 16); chowdsp::fft::fft_transform (fft_setup, data, data, work_data, chowdsp::fft::FFT_FORWARD); pffft_transform_ordered (pffft_setup, data_ref, data_ref, work_data_ref, PFFFT_FORWARD); @@ -75,7 +77,7 @@ void test_fft_real (int N, bool use_avx = false) chowdsp::fft::fft_transform (fft_setup, data, data, work_data, chowdsp::fft::FFT_FORWARD); pffft_transform_ordered (pffft_setup, data_ref, data_ref, work_data_ref, PFFFT_FORWARD); - + compare (data_ref, data, N); chowdsp::fft::fft_transform (fft_setup, data, data, work_data, chowdsp::fft::FFT_BACKWARD); @@ -176,6 +178,8 @@ void test_convolution_real (int N, bool use_avx = false) pffft_transform (pffft_setup, sine2_ref, sine2_ref, work_data_ref, PFFFT_FORWARD); pffft_zconvolve_accumulate (pffft_setup, sine1_ref, sine2_ref, out_ref, norm_gain); pffft_transform (pffft_setup, out_ref, out_ref, work_data_ref, PFFFT_BACKWARD); + for (int i = 0; i < N; ++i) + out_ref[i] += sine1_ref[i]; auto* fft_setup = chowdsp::fft::fft_new_setup (N, chowdsp::fft::FFT_REAL, use_avx); REQUIRE (fft_setup != nullptr); @@ -183,6 +187,7 @@ void test_convolution_real (int N, bool use_avx = false) chowdsp::fft::fft_transform_unordered (fft_setup, sine2, sine2, work_data, chowdsp::fft::FFT_FORWARD); chowdsp::fft::fft_convolve_unordered (fft_setup, sine1, sine2, out, norm_gain); chowdsp::fft::fft_transform_unordered (fft_setup, out, out, work_data, chowdsp::fft::FFT_BACKWARD); + chowdsp::fft::fft_accumulate (fft_setup, out, sine1_ref, out, N); compare (out_ref, out, N);