Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ build*/
*.raddbg_project
.vscode/
.idea/
.zed/
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ else()
else()
set(DEBUG_OPT_FLAGS -O2)
endif()
message(STATUS "Setting debug optimization flags: ${DEBUG_OPT_FLAGS}")
message(STATUS "chowdsp_fft -- Setting debug optimization flags: ${DEBUG_OPT_FLAGS}")

target_compile_options(chowdsp_fft PRIVATE $<$<CONFIG:Debug>:${DEBUG_OPT_FLAGS}>)
if(TARGET chowdsp_fft_avx)
Expand Down
39 changes: 33 additions & 6 deletions chowdsp_fft.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,8 @@ void aligned_free (void* p)
namespace chowdsp::fft::avx
{
struct FFT_Setup;
FFT_Setup* fft_new_setup (int N, fft_transform_t transform);
size_t fft_bytes_required (int N, fft_transform_t transform);
FFT_Setup* fft_new_setup (int N, fft_transform_t transform, void* data);
void fft_destroy_setup (FFT_Setup* s);
void pffft_transform_internal (FFT_Setup* setup, const float* finput, float* foutput, void* scratch, fft_direction_t direction, int ordered);
void pffft_convolve_internal (FFT_Setup* setup, const float* a, const float* b, float* ab, float scaling);
Expand Down Expand Up @@ -228,27 +229,53 @@ bool check_is_pointer_sse_setup (void* ptr)
}
#endif

void* fft_new_setup (int N, fft_transform_t transform, [[maybe_unused]] bool use_avx_if_available)
size_t fft_bytes_required (int N, fft_transform_t transform, bool use_avx_if_available)
{
#if defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64)
#if CHOWDSP_FFT_COMPILER_SUPPORTS_AVX
if (use_avx_if_available)
{
if (cpu_supports_avx())
{
auto* setup_ptr = avx::fft_new_setup (N, transform);
return avx::fft_bytes_required (N, transform);
}
}
return sse::fft_bytes_required (N, transform);
#else
return sse::fft_bytes_required (N, transform);
#endif
#elif defined(__ARM_NEON__) || defined(_M_ARM64)
return neon::fft_bytes_required (N, transform);
#endif
}

void* fft_new_setup (int N, fft_transform_t transform, bool use_avx_if_available)
{
const auto fft_bytes = fft_bytes_required (N, transform, use_avx_if_available);
return fft_new_setup_preallocated (N, transform, aligned_malloc (fft_bytes), use_avx_if_available);
}

void* fft_new_setup_preallocated (int N, fft_transform_t transform, void* data, [[maybe_unused]] bool use_avx_if_available)
{
#if defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64)
#if CHOWDSP_FFT_COMPILER_SUPPORTS_AVX
if (use_avx_if_available)
{
if (cpu_supports_avx())
{
auto* setup_ptr = avx::fft_new_setup (N, transform, data);
if (setup_ptr != nullptr)
return setup_ptr;
}
}
void* ptr = sse::fft_new_setup (N, transform);
void* ptr = sse::fft_new_setup (N, transform, data);
set_pointer_is_sse_setup (ptr);
return ptr;
#else
return sse::fft_new_setup (N, transform);
return sse::fft_new_setup (N, transform, data);
#endif
#elif defined(__ARM_NEON__) || defined(_M_ARM64)
return neon::fft_new_setup (N, transform);
return neon::fft_new_setup (N, transform, data);
#endif
}

Expand Down
32 changes: 32 additions & 0 deletions chowdsp_fft.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,16 @@ typedef enum
FFT_COMPLEX
} fft_transform_t;

/*
Returns the number of bytes needed for a "pre-allocated" FFT.
See `fft_new_setup_preallocated` for more details.
*/
size_t fft_bytes_required (int N, fft_transform_t transform, bool use_avx_if_available
#ifdef __cplusplus
= true
#endif
);

/*
prepare for performing transforms of size N -- the returned
FFT_Setup structure is read-only so it can safely be shared by
Expand All @@ -84,6 +94,28 @@ void* fft_new_setup (int N, fft_transform_t transform, bool use_avx_if_available
= true
#endif
);

/*
Same as fft_new_setup, except that all the memory for the FFT
is pre-allocated, and is provided by the caller via the "data"
pointer.

pffft's `aligned_malloc` aligns all allocations to 64 bytes.
Depending on your specific case, you may be able to get away
with a lower alignment requirement, but make sure to test! It
is expected that at least 16-byte alignment will be required
for FFTs using NEON or SSE, and at least 32-byte alignment will
be required for FFTs using AVX.

If you create your FFT with this method, you don't need to call
`fft_destroy_setup`, but you are responsible for freeing the
pre-allocated memory.
*/
void* fft_new_setup_preallocated (int N, fft_transform_t transform, void* data, bool use_avx_if_available
#ifdef __cplusplus
= true
#endif
);
void fft_destroy_setup (void*);

/** Returns the width (in bytes) of the SIMD registers used by the FFT setup. */
Expand Down
67 changes: 8 additions & 59 deletions simd/chowdsp_fft_impl_avx.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,72 +70,21 @@ struct FFT_Setup
float* twiddle; // points into 'data', N/4 elements
};

FFT_Setup* fft_new_setup (int N, fft_transform_t transform)
size_t fft_bytes_required (int N, fft_transform_t transform)
{
/* unfortunately, the fft size must be a multiple of 16 for complex FFTs
and 32 for real FFTs -- a lot of stuff would need to be rewritten to
handle other cases (or maybe just switch to a scalar fft, I don't know..) */
if (transform == FFT_REAL)
{
if (! ((N % (2 * SIMD_SZ * SIMD_SZ)) == 0 && N > 0))
return nullptr;
}
if (transform == FFT_COMPLEX)
{
if (! ((N % (SIMD_SZ * SIMD_SZ)) == 0 && N > 0))
return nullptr;
}

auto* s = (FFT_Setup*) malloc (sizeof (FFT_Setup));
//assert((N % 32) == 0);
s->N = N;
s->transform = transform;
/* nb of complex simd vectors */
s->Ncvec = (transform == FFT_REAL ? N / 2 : N) / SIMD_SZ;
s->data = (__m256*) aligned_malloc (2 * s->Ncvec * sizeof (float) * SIMD_SZ);
s->e = (float*) s->data;
s->twiddle = (float*) (s->data + (2 * s->Ncvec * (SIMD_SZ - 1)) / SIMD_SZ);

int k, m;
for (k = 0; k < s->Ncvec; ++k)
{
int i = k / (int) SIMD_SZ;
int j = k % (int) SIMD_SZ;
for (m = 0; m < SIMD_SZ - 1; ++m)
{
const auto A = static_cast<float> (-2 * M_PI * (m + 1) * k / N);
s->e[(2 * (i * 7 + m) + 0) * SIMD_SZ + j] = std::cos (A);
s->e[(2 * (i * 7 + m) + 1) * SIMD_SZ + j] = std::sin (A);
}
}

if (transform == FFT_REAL)
{
common::rffti1_ps (N / (int) SIMD_SZ, s->twiddle, s->ifac);
}
else
{
common::cffti1_ps (N / (int) SIMD_SZ, s->twiddle, s->ifac);
}

/* check that N is decomposable with allowed prime factors */
for (k = 0, m = 1; k < s->ifac[1]; ++k)
{
m *= s->ifac[2 + k];
}
if (m != N / SIMD_SZ)
{
fft_destroy_setup (s);
s = nullptr;
}
const auto Ncvec = (transform == FFT_REAL ? N / 2 : N) / SIMD_SZ;
const auto data_bytes = 2 * Ncvec * sizeof (float) * SIMD_SZ;
return data_bytes + sizeof (FFT_Setup);
}

return s;
FFT_Setup* fft_new_setup (int N, fft_transform_t transform, void* data)
{
return common::fft_new_setup<FFT_Setup, __m256> (N, transform, SIMD_SZ, data);
}

void fft_destroy_setup (FFT_Setup* s)
{
aligned_free (s->data);
free (s);
}

//====================================================================
Expand Down
68 changes: 68 additions & 0 deletions simd/chowdsp_fft_impl_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -158,4 +158,72 @@ void inline cffti1_ps (int n, float* wa, int* ifac)
l1 = l2;
}
}

template <typename Setup_Type, typename SIMD_Type>
static inline Setup_Type* fft_new_setup (int N, fft_transform_t transform, int SIMD_SZ, void* data)
{
/* unfortunately, the fft size must be a multiple of 16 for complex FFTs
and 32 for real FFTs -- a lot of stuff would need to be rewritten to
handle other cases (or maybe just switch to a scalar fft, I don't know..) */
if (transform == FFT_REAL)
{
if (! ((N % (2 * SIMD_SZ * SIMD_SZ)) == 0 && N > 0))
return nullptr;
}
if (transform == FFT_COMPLEX)
{
if (! ((N % (SIMD_SZ * SIMD_SZ)) == 0 && N > 0))
return nullptr;
}

const auto Ncvec = (transform == FFT_REAL ? N / 2 : N) / SIMD_SZ;
const auto data_bytes = 2 * Ncvec * sizeof (float) * SIMD_SZ;
auto* s_data = (std::byte*) data;

auto* s = (Setup_Type*) (s_data + data_bytes);
s->N = N;
s->transform = transform;
/* nb of complex simd vectors */
s->Ncvec = Ncvec;
s->data = (SIMD_Type*) s_data;
s->e = (float*) s->data;
s->twiddle = (float*) (s->data + (2 * s->Ncvec * (SIMD_SZ - 1)) / SIMD_SZ);

int k, m;
const auto M = SIMD_SZ - 1;

for (k = 0; k < s->Ncvec; ++k)
{
int i = k / (int) SIMD_SZ;
int j = k % (int) SIMD_SZ;
for (m = 0; m < M; ++m)
{
const auto A = static_cast<float> (-2 * M_PI * (m + 1) * k / N);
s->e[(2 * (i * M + m) + 0) * SIMD_SZ + j] = std::cos (A);
s->e[(2 * (i * M + m) + 1) * SIMD_SZ + j] = std::sin (A);
}
}

if (transform == FFT_REAL)
{
rffti1_ps (N / (int) SIMD_SZ, s->twiddle, s->ifac);
}
else
{
cffti1_ps (N / (int) SIMD_SZ, s->twiddle, s->ifac);
}

/* check that N is decomposable with allowed prime factors */
for (k = 0, m = 1; k < s->ifac[1]; ++k)
{
m *= s->ifac[2 + k];
}
if (m != N / SIMD_SZ)
{
fft_destroy_setup (s);
s = nullptr;
}

return s;
}
}
64 changes: 8 additions & 56 deletions simd/chowdsp_fft_impl_neon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,69 +68,21 @@ struct FFT_Setup
float* twiddle; // points into 'data', N/4 elements
};

static FFT_Setup* fft_new_setup (int N, fft_transform_t transform)
static size_t fft_bytes_required (int N, fft_transform_t transform)
{
auto* s = (FFT_Setup*) malloc (sizeof (FFT_Setup));
/* unfortunately, the fft size must be a multiple of 16 for complex FFTs
and 32 for real FFTs -- a lot of stuff would need to be rewritten to
handle other cases (or maybe just switch to a scalar fft, I don't know..) */
if (transform == FFT_REAL)
{
assert ((N % (2 * SIMD_SZ * SIMD_SZ)) == 0 && N > 0);
}
if (transform == FFT_COMPLEX)
{
assert ((N % (SIMD_SZ * SIMD_SZ)) == 0 && N > 0);
}
//assert((N % 32) == 0);
s->N = N;
s->transform = transform;
/* nb of complex simd vectors */
s->Ncvec = (transform == FFT_REAL ? N / 2 : N) / SIMD_SZ;
s->data = (float32x4_t*) aligned_malloc (2 * s->Ncvec * sizeof (float) * SIMD_SZ);
s->e = (float*) s->data;
s->twiddle = (float*) (s->data + (2 * s->Ncvec * (SIMD_SZ - 1)) / SIMD_SZ);

int k, m;
for (k = 0; k < s->Ncvec; ++k)
{
int i = k / (int) SIMD_SZ;
int j = k % (int) SIMD_SZ;
for (m = 0; m < SIMD_SZ - 1; ++m)
{
const auto A = static_cast<float> (-2 * M_PI * (m + 1) * k / N);
s->e[(2 * (i * 3 + m) + 0) * SIMD_SZ + j] = std::cos (A);
s->e[(2 * (i * 3 + m) + 1) * SIMD_SZ + j] = std::sin (A);
}
}

if (transform == FFT_REAL)
{
common::rffti1_ps (N / (int) SIMD_SZ, s->twiddle, s->ifac);
}
else
{
common::cffti1_ps (N / (int) SIMD_SZ, s->twiddle, s->ifac);
}

/* check that N is decomposable with allowed prime factors */
for (k = 0, m = 1; k < s->ifac[1]; ++k)
{
m *= s->ifac[2 + k];
}
if (m != N / SIMD_SZ)
{
fft_destroy_setup (s);
s = nullptr;
}
const auto Ncvec = (transform == FFT_REAL ? N / 2 : N) / SIMD_SZ;
const auto data_bytes = 2 * Ncvec * sizeof (float) * SIMD_SZ;
return data_bytes + sizeof (FFT_Setup);
}

return s;
static FFT_Setup* fft_new_setup (int N, fft_transform_t transform, void* data)
{
return common::fft_new_setup<FFT_Setup, float32x4_t> (N, transform, SIMD_SZ, data);
}

static void fft_destroy_setup (FFT_Setup* s)
{
aligned_free (s->data);
free (s);
}

//====================================================================
Expand Down
Loading