Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
`chowdsp_fft` is a fork of the [`pffft`](https://bitbucket.org/jpommier/pffft/src/master/)
library for computing Fast Fourier Transforms. This fork adds a couple
of optimizations, most importantly the ability to use AVX intrinsics
for single-precision FFTs on compatible platforms.
for single-precision FFTs on compatible platforms. The library also
contains some methods which may be useful for computing convolutions
using frequency-domain multiplication.

## Disclaimer

Expand Down
56 changes: 56 additions & 0 deletions chowdsp_fft.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,26 @@ void fft_destroy_setup (void* ptr)
#endif
}

int fft_simd_width_bytes (void* setup)
{
#if defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64)
#if CHOWDSP_FFT_COMPILER_SUPPORTS_AVX
if (check_is_pointer_sse_setup (setup))
{
return 16;
}
else
{
return 32;
}
#else
return 16;
#endif
#elif defined(__ARM_NEON__) || defined(_M_ARM64)
return 16;
#endif
}

void fft_transform (void* setup, const float* input, float* output, float* work, fft_direction_t direction)
{
#if defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64)
Expand Down Expand Up @@ -382,4 +402,40 @@ void fft_convolve_unordered (void* setup, const float* a, const float* b, float*
scaling);
#endif
}

void fft_accumulate (void* setup, const float* a, const float* b, float* ab, int N)
{
#if defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64)
#if CHOWDSP_FFT_COMPILER_SUPPORTS_AVX
if (check_is_pointer_sse_setup (setup))
{
sse::fft_accumulate_internal (reinterpret_cast<sse::FFT_Setup*> (setup),
a,
b,
ab,
N);
}
else
{
avx::fft_accumulate_internal (reinterpret_cast<avx::FFT_Setup*> (setup),
a,
b,
ab,
N);
}
#else
sse::fft_accumulate_internal (reinterpret_cast<sse::FFT_Setup*> (setup),
a,
b,
ab,
N);
#endif
#elif defined(__ARM_NEON__) || defined(_M_ARM64)
neon::fft_accumulate_internal (reinterpret_cast<neon::FFT_Setup*> (setup),
a,
b,
ab,
N);
#endif
}
} // namespace chowdsp::fft
13 changes: 13 additions & 0 deletions chowdsp_fft.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,9 @@ void* fft_new_setup (int N, fft_transform_t transform, bool use_avx_if_available
);
void fft_destroy_setup (void*);

/** Returns the width (in bytes) of the SIMD registers used by the FFT setup. */
int fft_simd_width_bytes (void* setup);

/*
Perform a Fourier transform , The z-domain data is stored as
interleaved complex numbers.
Expand All @@ -111,9 +114,19 @@ void fft_transform_unordered (void* setup, const float* input, float* output, fl

/**
* Convolve two (unordered) frequency-domain signals, with some scale factor.
*
* The operation performed is: dft_ab += (dft_a * fdt_b)*scaling
*
* The dft_a, dft_b and dft_ab pointers may alias.
*/
void fft_convolve_unordered (void* setup, const float* a, const float* b, float* ab, float scaling);

/**
* Computes the sum of two signals of length N.
* N must be a multiple of the FFT setup's SIMD width.
*/
void fft_accumulate (void* setup, const float* a, const float* b, float* ab, int N);

void* aligned_malloc (size_t nb_bytes);
void aligned_free (void*);

Expand Down
15 changes: 15 additions & 0 deletions simd/chowdsp_fft_impl_avx.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2028,5 +2028,20 @@ void pffft_convolve_internal (FFT_Setup* setup, const float* a, const float* b,
reinterpret_cast<float*> (&vab[1])[0] = abi0 + ai0 * bi0 * scaling;
}
}

void fft_accumulate_internal (FFT_Setup* setup, const float* a, const float* b, float* ab, int N)
{
assert (N % SIMD_SZ == 0);
const auto Ncvec = N / (int) SIMD_SZ;
auto* va = (const __m256*) a;
auto* vb = (const __m256*) b;
auto* vab = (__m256*) ab;

for (int i = 0; i < Ncvec; i += 2)
{
vab[i] = _mm256_add_ps (va[i], vb[i]);
vab[i + 1] = _mm256_add_ps (va[i + 1], vb[i + 1]);
Comment on lines +2040 to +2043
Copy link

Copilot AI Apr 6, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Processing two SIMD vectors per iteration assumes that Ncvec (N/SIMD_SZ) is even, which is not enforced by the current assertion. This could lead to out-of-bound access when Ncvec is odd; consider iterating over each vector or assert an even count.

Suggested change
for (int i = 0; i < Ncvec; i += 2)
{
vab[i] = _mm256_add_ps (va[i], vb[i]);
vab[i + 1] = _mm256_add_ps (va[i + 1], vb[i + 1]);
for (int i = 0; i < Ncvec; ++i)
{
vab[i] = _mm256_add_ps (va[i], vb[i]);

Copilot uses AI. Check for mistakes.
}
}
} // namespace chowdsp::fft::avx
#endif
15 changes: 15 additions & 0 deletions simd/chowdsp_fft_impl_neon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1674,4 +1674,19 @@ void pffft_convolve_internal (FFT_Setup* setup, const float* a, const float* b,
reinterpret_cast<float*> (&vab[1])[0] = abi0 + ai0 * bi0 * scaling;
}
}

void fft_accumulate_internal (FFT_Setup* setup, const float* a, const float* b, float* ab, int N)
{
assert (N % SIMD_SZ == 0);
const auto Ncvec = N / (int) SIMD_SZ;
auto* va = (const float32x4_t*) a;
auto* vb = (const float32x4_t*) b;
auto* vab = (float32x4_t*) ab;

for (int i = 0; i < Ncvec; i += 2)
Copy link

Copilot AI Apr 6, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The loop processes two SIMD vectors per iteration without ensuring that Ncvec is even. If Ncvec is odd, the access to vab[i+1] could result in out-of-bound memory access. Consider revising the loop to handle an odd number of elements or adding a corresponding assertion.

Copilot uses AI. Check for mistakes.
{
vab[i] = vaddq_f32 (va[i], vb[i]);
vab[i + 1] = vaddq_f32 (va[i + 1], vb[i + 1]);
}
}
} // namespace chowdsp::fft::neon
15 changes: 15 additions & 0 deletions simd/chowdsp_fft_impl_sse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1689,4 +1689,19 @@ void pffft_convolve_internal (FFT_Setup* setup, const float* a, const float* b,
reinterpret_cast<float*> (&vab[1])[0] = abi0 + ai0 * bi0 * scaling;
}
}

void fft_accumulate_internal (FFT_Setup* setup, const float* a, const float* b, float* ab, int N)
{
assert (N % SIMD_SZ == 0);
const auto Ncvec = N / (int) SIMD_SZ;
auto* va = (const __m128*) a;
auto* vb = (const __m128*) b;
auto* vab = (__m128*) ab;

for (int i = 0; i < Ncvec; i += 2)
{
vab[i] = _mm_add_ps (va[i], vb[i]);
vab[i + 1] = _mm_add_ps (va[i + 1], vb[i + 1]);
Comment on lines +1701 to +1704
Copy link

Copilot AI Apr 6, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The loop processes two SIMD vectors per iteration but only asserts N is a multiple of SIMD_SZ. If Ncvec (i.e. N/SIMD_SZ) is odd, the access to vab[i+1] may be out-of-bounds. Consider iterating over all vectors individually or asserting that Ncvec is even.

Suggested change
for (int i = 0; i < Ncvec; i += 2)
{
vab[i] = _mm_add_ps (va[i], vb[i]);
vab[i + 1] = _mm_add_ps (va[i + 1], vb[i + 1]);
for (int i = 0; i < Ncvec; ++i)
{
vab[i] = _mm_add_ps (va[i], vb[i]);

Copilot uses AI. Check for mistakes.
}
}
} // namespace chowdsp::fft::sse
8 changes: 8 additions & 0 deletions test/test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,11 @@ void test_fft_complex (int N, bool use_avx = false)
REQUIRE (fft_setup != nullptr);
auto* pffft_setup = pffft_new_setup (N, PFFFT_COMPLEX);

if (use_avx)
REQUIRE (chowdsp::fft::fft_simd_width_bytes (fft_setup) == 32);
else
REQUIRE (chowdsp::fft::fft_simd_width_bytes (fft_setup) == 16);

chowdsp::fft::fft_transform (fft_setup, data, data, work_data, chowdsp::fft::FFT_FORWARD);
pffft_transform_ordered (pffft_setup, data_ref, data_ref, work_data_ref, PFFFT_FORWARD);

Expand Down Expand Up @@ -176,13 +181,16 @@ void test_convolution_real (int N, bool use_avx = false)
pffft_transform (pffft_setup, sine2_ref, sine2_ref, work_data_ref, PFFFT_FORWARD);
pffft_zconvolve_accumulate (pffft_setup, sine1_ref, sine2_ref, out_ref, norm_gain);
pffft_transform (pffft_setup, out_ref, out_ref, work_data_ref, PFFFT_BACKWARD);
for (int i = 0; i < N; ++i)
out_ref[i] += sine1_ref[i];

auto* fft_setup = chowdsp::fft::fft_new_setup (N, chowdsp::fft::FFT_REAL, use_avx);
REQUIRE (fft_setup != nullptr);
chowdsp::fft::fft_transform_unordered (fft_setup, sine1, sine1, work_data, chowdsp::fft::FFT_FORWARD);
chowdsp::fft::fft_transform_unordered (fft_setup, sine2, sine2, work_data, chowdsp::fft::FFT_FORWARD);
chowdsp::fft::fft_convolve_unordered (fft_setup, sine1, sine2, out, norm_gain);
chowdsp::fft::fft_transform_unordered (fft_setup, out, out, work_data, chowdsp::fft::FFT_BACKWARD);
chowdsp::fft::fft_accumulate (fft_setup, out, sine1, out, N);

compare (out_ref, out, N);

Expand Down
Loading