Skip to content

Commit 667b38e

Browse files
Allow user to create FFT objects using pre-allocated data (#8)
* Small fixes * Working pre-allocated FFT * Update C test * Fixes for x64 * DRY refactor
1 parent 4154f22 commit 667b38e

File tree

10 files changed

+271
-198
lines changed

10 files changed

+271
-198
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,4 @@ build*/
44
*.raddbg_project
55
.vscode/
66
.idea/
7+
.zed/

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ else()
5757
else()
5858
set(DEBUG_OPT_FLAGS -O2)
5959
endif()
60-
message(STATUS "Setting debug optimization flags: ${DEBUG_OPT_FLAGS}")
60+
message(STATUS "chowdsp_fft -- Setting debug optimization flags: ${DEBUG_OPT_FLAGS}")
6161

6262
target_compile_options(chowdsp_fft PRIVATE $<$<CONFIG:Debug>:${DEBUG_OPT_FLAGS}>)
6363
if(TARGET chowdsp_fft_avx)

chowdsp_fft.cpp

Lines changed: 33 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,8 @@ void aligned_free (void* p)
8686
namespace chowdsp::fft::avx
8787
{
8888
struct FFT_Setup;
89-
FFT_Setup* fft_new_setup (int N, fft_transform_t transform);
89+
size_t fft_bytes_required (int N, fft_transform_t transform);
90+
FFT_Setup* fft_new_setup (int N, fft_transform_t transform, void* data);
9091
void fft_destroy_setup (FFT_Setup* s);
9192
void pffft_transform_internal (FFT_Setup* setup, const float* finput, float* foutput, void* scratch, fft_direction_t direction, int ordered);
9293
void pffft_convolve_internal (FFT_Setup* setup, const float* a, const float* b, float* ab, float scaling);
@@ -228,27 +229,53 @@ bool check_is_pointer_sse_setup (void* ptr)
228229
}
229230
#endif
230231

231-
void* fft_new_setup (int N, fft_transform_t transform, [[maybe_unused]] bool use_avx_if_available)
232+
size_t fft_bytes_required (int N, fft_transform_t transform, bool use_avx_if_available)
232233
{
233234
#if defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64)
234235
#if CHOWDSP_FFT_COMPILER_SUPPORTS_AVX
235236
if (use_avx_if_available)
236237
{
237238
if (cpu_supports_avx())
238239
{
239-
auto* setup_ptr = avx::fft_new_setup (N, transform);
240+
return avx::fft_bytes_required (N, transform);
241+
}
242+
}
243+
return sse::fft_bytes_required (N, transform);
244+
#else
245+
return sse::fft_bytes_required (N, transform);
246+
#endif
247+
#elif defined(__ARM_NEON__) || defined(_M_ARM64)
248+
return neon::fft_bytes_required (N, transform);
249+
#endif
250+
}
251+
252+
void* fft_new_setup (int N, fft_transform_t transform, bool use_avx_if_available)
253+
{
254+
const auto fft_bytes = fft_bytes_required (N, transform, use_avx_if_available);
255+
return fft_new_setup_preallocated (N, transform, aligned_malloc (fft_bytes), use_avx_if_available);
256+
}
257+
258+
void* fft_new_setup_preallocated (int N, fft_transform_t transform, void* data, [[maybe_unused]] bool use_avx_if_available)
259+
{
260+
#if defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64)
261+
#if CHOWDSP_FFT_COMPILER_SUPPORTS_AVX
262+
if (use_avx_if_available)
263+
{
264+
if (cpu_supports_avx())
265+
{
266+
auto* setup_ptr = avx::fft_new_setup (N, transform, data);
240267
if (setup_ptr != nullptr)
241268
return setup_ptr;
242269
}
243270
}
244-
void* ptr = sse::fft_new_setup (N, transform);
271+
void* ptr = sse::fft_new_setup (N, transform, data);
245272
set_pointer_is_sse_setup (ptr);
246273
return ptr;
247274
#else
248-
return sse::fft_new_setup (N, transform);
275+
return sse::fft_new_setup (N, transform, data);
249276
#endif
250277
#elif defined(__ARM_NEON__) || defined(_M_ARM64)
251-
return neon::fft_new_setup (N, transform);
278+
return neon::fft_new_setup (N, transform, data);
252279
#endif
253280
}
254281

chowdsp_fft.h

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,16 @@ typedef enum
7474
FFT_COMPLEX
7575
} fft_transform_t;
7676

77+
/*
78+
Returns the number of bytes needed for a "pre-allocated" FFT.
79+
See `fft_new_setup_preallocated` for more details.
80+
*/
81+
size_t fft_bytes_required (int N, fft_transform_t transform, bool use_avx_if_available
82+
#ifdef __cplusplus
83+
= true
84+
#endif
85+
);
86+
7787
/*
7888
prepare for performing transforms of size N -- the returned
7989
FFT_Setup structure is read-only so it can safely be shared by
@@ -84,6 +94,28 @@ void* fft_new_setup (int N, fft_transform_t transform, bool use_avx_if_available
8494
= true
8595
#endif
8696
);
97+
98+
/*
99+
Same as fft_new_setup, except that all the memory for the FFT
100+
is pre-allocated, and is provided by the caller via the "data"
101+
pointer.
102+
103+
pffft's `aligned_malloc` aligns all allocations to 64 bytes.
104+
Depending on your specific case, you may be able to get away
105+
with a lower alignment requirement, but make sure to test! It
106+
is expected that at least 16-byte alignment will be required
107+
for FFTs using NEON or SSE, and at least 32-byte alignment will
108+
be required for FFTs using AVX.
109+
110+
If you create your FFT with this method, you don't need to call
111+
`fft_destroy_setup`, but you are responsible for freeing the
112+
pre-allocated memory.
113+
*/
114+
void* fft_new_setup_preallocated (int N, fft_transform_t transform, void* data, bool use_avx_if_available
115+
#ifdef __cplusplus
116+
= true
117+
#endif
118+
);
87119
void fft_destroy_setup (void*);
88120

89121
/** Returns the width (in bytes) of the SIMD registers used by the FFT setup. */

simd/chowdsp_fft_impl_avx.cpp

Lines changed: 8 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -70,72 +70,21 @@ struct FFT_Setup
7070
float* twiddle; // points into 'data', N/4 elements
7171
};
7272

73-
FFT_Setup* fft_new_setup (int N, fft_transform_t transform)
73+
size_t fft_bytes_required (int N, fft_transform_t transform)
7474
{
75-
/* unfortunately, the fft size must be a multiple of 16 for complex FFTs
76-
and 32 for real FFTs -- a lot of stuff would need to be rewritten to
77-
handle other cases (or maybe just switch to a scalar fft, I don't know..) */
78-
if (transform == FFT_REAL)
79-
{
80-
if (! ((N % (2 * SIMD_SZ * SIMD_SZ)) == 0 && N > 0))
81-
return nullptr;
82-
}
83-
if (transform == FFT_COMPLEX)
84-
{
85-
if (! ((N % (SIMD_SZ * SIMD_SZ)) == 0 && N > 0))
86-
return nullptr;
87-
}
88-
89-
auto* s = (FFT_Setup*) malloc (sizeof (FFT_Setup));
90-
//assert((N % 32) == 0);
91-
s->N = N;
92-
s->transform = transform;
93-
/* nb of complex simd vectors */
94-
s->Ncvec = (transform == FFT_REAL ? N / 2 : N) / SIMD_SZ;
95-
s->data = (__m256*) aligned_malloc (2 * s->Ncvec * sizeof (float) * SIMD_SZ);
96-
s->e = (float*) s->data;
97-
s->twiddle = (float*) (s->data + (2 * s->Ncvec * (SIMD_SZ - 1)) / SIMD_SZ);
98-
99-
int k, m;
100-
for (k = 0; k < s->Ncvec; ++k)
101-
{
102-
int i = k / (int) SIMD_SZ;
103-
int j = k % (int) SIMD_SZ;
104-
for (m = 0; m < SIMD_SZ - 1; ++m)
105-
{
106-
const auto A = static_cast<float> (-2 * M_PI * (m + 1) * k / N);
107-
s->e[(2 * (i * 7 + m) + 0) * SIMD_SZ + j] = std::cos (A);
108-
s->e[(2 * (i * 7 + m) + 1) * SIMD_SZ + j] = std::sin (A);
109-
}
110-
}
111-
112-
if (transform == FFT_REAL)
113-
{
114-
common::rffti1_ps (N / (int) SIMD_SZ, s->twiddle, s->ifac);
115-
}
116-
else
117-
{
118-
common::cffti1_ps (N / (int) SIMD_SZ, s->twiddle, s->ifac);
119-
}
120-
121-
/* check that N is decomposable with allowed prime factors */
122-
for (k = 0, m = 1; k < s->ifac[1]; ++k)
123-
{
124-
m *= s->ifac[2 + k];
125-
}
126-
if (m != N / SIMD_SZ)
127-
{
128-
fft_destroy_setup (s);
129-
s = nullptr;
130-
}
75+
const auto Ncvec = (transform == FFT_REAL ? N / 2 : N) / SIMD_SZ;
76+
const auto data_bytes = 2 * Ncvec * sizeof (float) * SIMD_SZ;
77+
return data_bytes + sizeof (FFT_Setup);
78+
}
13179

132-
return s;
80+
FFT_Setup* fft_new_setup (int N, fft_transform_t transform, void* data)
81+
{
82+
return common::fft_new_setup<FFT_Setup, __m256> (N, transform, SIMD_SZ, data);
13383
}
13484

13585
void fft_destroy_setup (FFT_Setup* s)
13686
{
13787
aligned_free (s->data);
138-
free (s);
13988
}
14089

14190
//====================================================================

simd/chowdsp_fft_impl_common.hpp

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,4 +158,72 @@ void inline cffti1_ps (int n, float* wa, int* ifac)
158158
l1 = l2;
159159
}
160160
}
161+
162+
template <typename Setup_Type, typename SIMD_Type>
163+
static inline Setup_Type* fft_new_setup (int N, fft_transform_t transform, int SIMD_SZ, void* data)
164+
{
165+
/* unfortunately, the fft size must be a multiple of 16 for complex FFTs
166+
and 32 for real FFTs -- a lot of stuff would need to be rewritten to
167+
handle other cases (or maybe just switch to a scalar fft, I don't know..) */
168+
if (transform == FFT_REAL)
169+
{
170+
if (! ((N % (2 * SIMD_SZ * SIMD_SZ)) == 0 && N > 0))
171+
return nullptr;
172+
}
173+
if (transform == FFT_COMPLEX)
174+
{
175+
if (! ((N % (SIMD_SZ * SIMD_SZ)) == 0 && N > 0))
176+
return nullptr;
177+
}
178+
179+
const auto Ncvec = (transform == FFT_REAL ? N / 2 : N) / SIMD_SZ;
180+
const auto data_bytes = 2 * Ncvec * sizeof (float) * SIMD_SZ;
181+
auto* s_data = (std::byte*) data;
182+
183+
auto* s = (Setup_Type*) (s_data + data_bytes);
184+
s->N = N;
185+
s->transform = transform;
186+
/* nb of complex simd vectors */
187+
s->Ncvec = Ncvec;
188+
s->data = (SIMD_Type*) s_data;
189+
s->e = (float*) s->data;
190+
s->twiddle = (float*) (s->data + (2 * s->Ncvec * (SIMD_SZ - 1)) / SIMD_SZ);
191+
192+
int k, m;
193+
const auto M = SIMD_SZ - 1;
194+
195+
for (k = 0; k < s->Ncvec; ++k)
196+
{
197+
int i = k / (int) SIMD_SZ;
198+
int j = k % (int) SIMD_SZ;
199+
for (m = 0; m < M; ++m)
200+
{
201+
const auto A = static_cast<float> (-2 * M_PI * (m + 1) * k / N);
202+
s->e[(2 * (i * M + m) + 0) * SIMD_SZ + j] = std::cos (A);
203+
s->e[(2 * (i * M + m) + 1) * SIMD_SZ + j] = std::sin (A);
204+
}
205+
}
206+
207+
if (transform == FFT_REAL)
208+
{
209+
rffti1_ps (N / (int) SIMD_SZ, s->twiddle, s->ifac);
210+
}
211+
else
212+
{
213+
cffti1_ps (N / (int) SIMD_SZ, s->twiddle, s->ifac);
214+
}
215+
216+
/* check that N is decomposable with allowed prime factors */
217+
for (k = 0, m = 1; k < s->ifac[1]; ++k)
218+
{
219+
m *= s->ifac[2 + k];
220+
}
221+
if (m != N / SIMD_SZ)
222+
{
223+
fft_destroy_setup (s);
224+
s = nullptr;
225+
}
226+
227+
return s;
228+
}
161229
}

simd/chowdsp_fft_impl_neon.cpp

Lines changed: 8 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -68,69 +68,21 @@ struct FFT_Setup
6868
float* twiddle; // points into 'data', N/4 elements
6969
};
7070

71-
static FFT_Setup* fft_new_setup (int N, fft_transform_t transform)
71+
static size_t fft_bytes_required (int N, fft_transform_t transform)
7272
{
73-
auto* s = (FFT_Setup*) malloc (sizeof (FFT_Setup));
74-
/* unfortunately, the fft size must be a multiple of 16 for complex FFTs
75-
and 32 for real FFTs -- a lot of stuff would need to be rewritten to
76-
handle other cases (or maybe just switch to a scalar fft, I don't know..) */
77-
if (transform == FFT_REAL)
78-
{
79-
assert ((N % (2 * SIMD_SZ * SIMD_SZ)) == 0 && N > 0);
80-
}
81-
if (transform == FFT_COMPLEX)
82-
{
83-
assert ((N % (SIMD_SZ * SIMD_SZ)) == 0 && N > 0);
84-
}
85-
//assert((N % 32) == 0);
86-
s->N = N;
87-
s->transform = transform;
88-
/* nb of complex simd vectors */
89-
s->Ncvec = (transform == FFT_REAL ? N / 2 : N) / SIMD_SZ;
90-
s->data = (float32x4_t*) aligned_malloc (2 * s->Ncvec * sizeof (float) * SIMD_SZ);
91-
s->e = (float*) s->data;
92-
s->twiddle = (float*) (s->data + (2 * s->Ncvec * (SIMD_SZ - 1)) / SIMD_SZ);
93-
94-
int k, m;
95-
for (k = 0; k < s->Ncvec; ++k)
96-
{
97-
int i = k / (int) SIMD_SZ;
98-
int j = k % (int) SIMD_SZ;
99-
for (m = 0; m < SIMD_SZ - 1; ++m)
100-
{
101-
const auto A = static_cast<float> (-2 * M_PI * (m + 1) * k / N);
102-
s->e[(2 * (i * 3 + m) + 0) * SIMD_SZ + j] = std::cos (A);
103-
s->e[(2 * (i * 3 + m) + 1) * SIMD_SZ + j] = std::sin (A);
104-
}
105-
}
106-
107-
if (transform == FFT_REAL)
108-
{
109-
common::rffti1_ps (N / (int) SIMD_SZ, s->twiddle, s->ifac);
110-
}
111-
else
112-
{
113-
common::cffti1_ps (N / (int) SIMD_SZ, s->twiddle, s->ifac);
114-
}
115-
116-
/* check that N is decomposable with allowed prime factors */
117-
for (k = 0, m = 1; k < s->ifac[1]; ++k)
118-
{
119-
m *= s->ifac[2 + k];
120-
}
121-
if (m != N / SIMD_SZ)
122-
{
123-
fft_destroy_setup (s);
124-
s = nullptr;
125-
}
73+
const auto Ncvec = (transform == FFT_REAL ? N / 2 : N) / SIMD_SZ;
74+
const auto data_bytes = 2 * Ncvec * sizeof (float) * SIMD_SZ;
75+
return data_bytes + sizeof (FFT_Setup);
76+
}
12677

127-
return s;
78+
static FFT_Setup* fft_new_setup (int N, fft_transform_t transform, void* data)
79+
{
80+
return common::fft_new_setup<FFT_Setup, float32x4_t> (N, transform, SIMD_SZ, data);
12881
}
12982

13083
static void fft_destroy_setup (FFT_Setup* s)
13184
{
13285
aligned_free (s->data);
133-
free (s);
13486
}
13587

13688
//====================================================================

0 commit comments

Comments
 (0)