Skip to content

Commit 676c042

Browse files
Working pre-allocated FFT
1 parent 6ee7474 commit 676c042

File tree

6 files changed

+150
-22
lines changed

6 files changed

+150
-22
lines changed

chowdsp_fft.cpp

Lines changed: 31 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -228,27 +228,53 @@ bool check_is_pointer_sse_setup (void* ptr)
228228
}
229229
#endif
230230

231-
void* fft_new_setup (int N, fft_transform_t transform, [[maybe_unused]] bool use_avx_if_available)
231+
size_t fft_bytes_required (int N, fft_transform_t transform, bool use_avx_if_available)
232232
{
233233
#if defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64)
234234
#if CHOWDSP_FFT_COMPILER_SUPPORTS_AVX
235235
if (use_avx_if_available)
236236
{
237237
if (cpu_supports_avx())
238238
{
239-
auto* setup_ptr = avx::fft_new_setup (N, transform);
239+
return avx::fft_bytes_required (N, transform);
240+
}
241+
}
242+
return sse::fft_bytes_required (N, transform);
243+
#else
244+
return sse::fft_bytes_required (N, transform);
245+
#endif
246+
#elif defined(__ARM_NEON__) || defined(_M_ARM64)
247+
return neon::fft_bytes_required (N, transform);
248+
#endif
249+
}
250+
251+
void* fft_new_setup (int N, fft_transform_t transform, bool use_avx_if_available)
252+
{
253+
const auto fft_bytes = fft_bytes_required (N, transform, use_avx_if_available);
254+
return fft_new_setup_preallocated (N, transform, aligned_malloc (fft_bytes), use_avx_if_available);
255+
}
256+
257+
void* fft_new_setup_preallocated (int N, fft_transform_t transform, void* data, [[maybe_unused]] bool use_avx_if_available)
258+
{
259+
#if defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64)
260+
#if CHOWDSP_FFT_COMPILER_SUPPORTS_AVX
261+
if (use_avx_if_available)
262+
{
263+
if (cpu_supports_avx())
264+
{
265+
auto* setup_ptr = avx::fft_new_setup (N, transform, data);
240266
if (setup_ptr != nullptr)
241267
return setup_ptr;
242268
}
243269
}
244-
void* ptr = sse::fft_new_setup (N, transform);
270+
void* ptr = sse::fft_new_setup (N, transform, data);
245271
set_pointer_is_sse_setup (ptr);
246272
return ptr;
247273
#else
248-
return sse::fft_new_setup (N, transform);
274+
return sse::fft_new_setup (N, transform, data);
249275
#endif
250276
#elif defined(__ARM_NEON__) || defined(_M_ARM64)
251-
return neon::fft_new_setup (N, transform);
277+
return neon::fft_new_setup (N, transform, data);
252278
#endif
253279
}
254280

chowdsp_fft.h

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,16 @@ typedef enum
7474
FFT_COMPLEX
7575
} fft_transform_t;
7676

77+
/*
78+
Returns the number of bytes the caller must allocate for a "pre-allocated" FFT of size N.
79+
See `fft_new_setup_preallocated` for more details.
80+
*/
81+
size_t fft_bytes_required (int N, fft_transform_t transform, bool use_avx_if_available
82+
#ifdef __cplusplus
83+
= true
84+
#endif
85+
);
86+
7787
/*
7888
prepare for performing transforms of size N -- the returned
7989
FFT_Setup structure is read-only so it can safely be shared by
@@ -84,6 +94,28 @@ void* fft_new_setup (int N, fft_transform_t transform, bool use_avx_if_available
8494
= true
8595
#endif
8696
);
97+
98+
/*
99+
Same as fft_new_setup, except that all the memory for the FFT
100+
is pre-allocated, and is provided by the caller via the "data"
101+
pointer.
102+
103+
pffft's `aligned_malloc` aligns all allocations to 64 bytes.
104+
Depending on your specific case, you may be able to get away
105+
with a lower alignment requirement, but make sure to test! It
106+
is expected that at least 16-byte alignment will be required
107+
for FFTs using NEON or SSE, and at least 32-byte alignment will
108+
be required for FFTs using AVX.
109+
110+
If you create your FFT with this method, do not call
111+
`fft_destroy_setup` on it: the setup lives inside "data", and you
112+
are responsible for freeing the pre-allocated memory yourself.
113+
*/
114+
void* fft_new_setup_preallocated (int N, fft_transform_t transform, void* data, bool use_avx_if_available
115+
#ifdef __cplusplus
116+
= true
117+
#endif
118+
);
87119
void fft_destroy_setup (void*);
88120

89121
/** Returns the width (in bytes) of the SIMD registers used by the FFT setup. */

simd/chowdsp_fft_impl_avx.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,13 @@ struct FFT_Setup
7070
float* twiddle; // points into 'data', N/4 elements
7171
};
7272

73+
static size_t fft_bytes_required (int N, fft_transform_t transform)
74+
{
75+
const auto Ncvec = (transform == FFT_REAL ? N / 2 : N) / SIMD_SZ;
76+
const auto data_bytes = 2 * Ncvec * sizeof (float) * SIMD_SZ;
77+
return data_bytes + sizeof (FFT_Setup);
78+
}
79+
7380
FFT_Setup* fft_new_setup (int N, fft_transform_t transform)
7481
{
7582
/* unfortunately, the fft size must be a multiple of 16 for complex FFTs
@@ -135,7 +142,6 @@ FFT_Setup* fft_new_setup (int N, fft_transform_t transform)
135142
void fft_destroy_setup (FFT_Setup* s)
136143
{
137144
aligned_free (s->data);
138-
free (s);
139145
}
140146

141147
//====================================================================

simd/chowdsp_fft_impl_neon.cpp

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -68,9 +68,20 @@ struct FFT_Setup
6868
float* twiddle; // points into 'data', N/4 elements
6969
};
7070

71-
static FFT_Setup* fft_new_setup (int N, fft_transform_t transform)
71+
static size_t fft_bytes_required (int N, fft_transform_t transform)
7272
{
73-
auto* s = (FFT_Setup*) malloc (sizeof (FFT_Setup));
73+
const auto Ncvec = (transform == FFT_REAL ? N / 2 : N) / SIMD_SZ;
74+
const auto data_bytes = 2 * Ncvec * sizeof (float) * SIMD_SZ;
75+
return data_bytes + sizeof (FFT_Setup);
76+
}
77+
78+
static FFT_Setup* fft_new_setup (int N, fft_transform_t transform, void* data)
79+
{
80+
const auto Ncvec = (transform == FFT_REAL ? N / 2 : N) / SIMD_SZ;
81+
const auto data_bytes = 2 * Ncvec * sizeof (float) * SIMD_SZ;
82+
auto* s_data = (std::byte*) data;
83+
84+
auto* s = (FFT_Setup*) (s_data + data_bytes);
7485
/* unfortunately, the fft size must be a multiple of 16 for complex FFTs
7586
and 32 for real FFTs -- a lot of stuff would need to be rewritten to
7687
handle other cases (or maybe just switch to a scalar fft, I don't know..) */
@@ -86,8 +97,8 @@ static FFT_Setup* fft_new_setup (int N, fft_transform_t transform)
8697
s->N = N;
8798
s->transform = transform;
8899
/* nb of complex simd vectors */
89-
s->Ncvec = (transform == FFT_REAL ? N / 2 : N) / SIMD_SZ;
90-
s->data = (float32x4_t*) aligned_malloc (2 * s->Ncvec * sizeof (float) * SIMD_SZ);
100+
s->Ncvec = Ncvec;
101+
s->data = (float32x4_t*) s_data;
91102
s->e = (float*) s->data;
92103
s->twiddle = (float*) (s->data + (2 * s->Ncvec * (SIMD_SZ - 1)) / SIMD_SZ);
93104

@@ -130,7 +141,6 @@ static FFT_Setup* fft_new_setup (int N, fft_transform_t transform)
130141
static void fft_destroy_setup (FFT_Setup* s)
131142
{
132143
aligned_free (s->data);
133-
free (s);
134144
}
135145

136146
//====================================================================

simd/chowdsp_fft_impl_sse.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,13 @@ struct FFT_Setup
6464
float* twiddle; // points into 'data', N/4 elements
6565
};
6666

67+
static size_t fft_bytes_required (int N, fft_transform_t transform)
68+
{
69+
const auto Ncvec = (transform == FFT_REAL ? N / 2 : N) / SIMD_SZ;
70+
const auto data_bytes = 2 * Ncvec * sizeof (float) * SIMD_SZ;
71+
return data_bytes + sizeof (FFT_Setup);
72+
}
73+
6774
static FFT_Setup* fft_new_setup (int N, fft_transform_t transform)
6875
{
6976
auto* s = (FFT_Setup*) malloc (sizeof (FFT_Setup));
@@ -126,7 +133,6 @@ static FFT_Setup* fft_new_setup (int N, fft_transform_t transform)
126133
static void fft_destroy_setup (FFT_Setup* s)
127134
{
128135
aligned_free (s->data);
129-
free (s);
130136
}
131137

132138
//====================================================================

test/test.cpp

Lines changed: 58 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,17 @@
33
#include <chowdsp_fft.h>
44
#include <pffft.h>
55

6-
#include <catch2/catch_test_macros.hpp>
76
#include <catch2/catch_approx.hpp>
7+
#include <catch2/catch_test_macros.hpp>
88

99
void compare (const float* ref, const float* test, int N)
1010
{
1111
const auto tol = 2.0e-7f * (float) N;
1212
for (int n = 0; n < N; ++n)
13-
REQUIRE (test[n] == Catch::Approx { ref[n] }.margin(tol));
13+
REQUIRE (test[n] == Catch::Approx { ref[n] }.margin (tol));
1414
}
1515

16-
void test_fft_complex (int N, bool use_avx = false)
16+
void test_fft_complex (int N, bool use_avx = false, bool preallocate = false)
1717
{
1818
auto* data = (float*) chowdsp::fft::aligned_malloc (sizeof (float) * N * 2);
1919
auto* data_ref = (float*) pffft_aligned_malloc (sizeof (float) * N * 2);
@@ -27,7 +27,18 @@ void test_fft_complex (int N, bool use_avx = false)
2727
}
2828
std::copy (data, data + N * 2, data_ref);
2929

30-
auto* fft_setup = chowdsp::fft::fft_new_setup (N, chowdsp::fft::FFT_COMPLEX, use_avx);
30+
void* fft_setup;
31+
void* prealloc;
32+
if (preallocate)
33+
{
34+
const auto bytes_required = chowdsp::fft::fft_bytes_required (N, chowdsp::fft::FFT_COMPLEX, use_avx);
35+
prealloc = chowdsp::fft::aligned_malloc (bytes_required);
36+
fft_setup = chowdsp::fft::fft_new_setup_preallocated (N, chowdsp::fft::FFT_COMPLEX, prealloc, use_avx);
37+
}
38+
else
39+
{
40+
fft_setup = chowdsp::fft::fft_new_setup (N, chowdsp::fft::FFT_COMPLEX, use_avx);
41+
}
3142
REQUIRE (fft_setup != nullptr);
3243
auto* pffft_setup = pffft_new_setup (N, PFFFT_COMPLEX);
3344
if (! use_avx)
@@ -50,15 +61,18 @@ void test_fft_complex (int N, bool use_avx = false)
5061

5162
compare (data_ref, data, N * 2);
5263

53-
chowdsp::fft::fft_destroy_setup (fft_setup);
64+
if (preallocate)
65+
chowdsp::fft::aligned_free (prealloc);
66+
else
67+
chowdsp::fft::fft_destroy_setup (fft_setup);
5468
pffft_destroy_setup (pffft_setup);
5569
chowdsp::fft::aligned_free (data);
5670
pffft_aligned_free (data_ref);
5771
chowdsp::fft::aligned_free (work_data);
5872
pffft_aligned_free (work_data_ref);
5973
}
6074

61-
void test_fft_real (int N, bool use_avx = false)
75+
void test_fft_real (int N, bool use_avx = false, bool preallocate = false)
6276
{
6377
auto* data = (float*) chowdsp::fft::aligned_malloc (sizeof (float) * N);
6478
auto* data_ref = (float*) pffft_aligned_malloc (sizeof (float) * N);
@@ -71,7 +85,18 @@ void test_fft_real (int N, bool use_avx = false)
7185
}
7286
std::copy (data, data + N, data_ref);
7387

74-
auto* fft_setup = chowdsp::fft::fft_new_setup (N, chowdsp::fft::FFT_REAL, use_avx);
88+
void* fft_setup;
89+
void* prealloc;
90+
if (preallocate)
91+
{
92+
const auto bytes_required = chowdsp::fft::fft_bytes_required (N, chowdsp::fft::FFT_REAL, use_avx);
93+
prealloc = chowdsp::fft::aligned_malloc (bytes_required);
94+
fft_setup = chowdsp::fft::fft_new_setup_preallocated (N, chowdsp::fft::FFT_REAL, prealloc, use_avx);
95+
}
96+
else
97+
{
98+
fft_setup = chowdsp::fft::fft_new_setup (N, chowdsp::fft::FFT_REAL, use_avx);
99+
}
75100
REQUIRE (fft_setup != nullptr);
76101
auto* pffft_setup = pffft_new_setup (N, PFFFT_REAL);
77102

@@ -92,7 +117,10 @@ void test_fft_real (int N, bool use_avx = false)
92117

93118
compare (data_ref, data, N);
94119

95-
chowdsp::fft::fft_destroy_setup (fft_setup);
120+
if (preallocate)
121+
chowdsp::fft::aligned_free (prealloc);
122+
else
123+
chowdsp::fft::fft_destroy_setup (fft_setup);
96124
pffft_destroy_setup (pffft_setup);
97125
chowdsp::fft::aligned_free (data);
98126
pffft_aligned_free (data_ref);
@@ -203,7 +231,7 @@ void test_convolution_real (int N, bool use_avx = false)
203231
pffft_aligned_free (work_data_ref);
204232
}
205233

206-
TEST_CASE("FFT SSE/NEON")
234+
TEST_CASE ("FFT SSE/NEON")
207235
{
208236
for (int i = 5; i < 20; ++i)
209237
{
@@ -218,6 +246,16 @@ TEST_CASE("FFT SSE/NEON")
218246
test_fft_real (fft_size);
219247
}
220248

249+
SECTION ("Testing pre-allocated Complex FFT with size: " + std::to_string (fft_size))
250+
{
251+
test_fft_complex (fft_size, false, true);
252+
}
253+
254+
SECTION ("Testing pre-allocated Real FFT with size: " + std::to_string (fft_size))
255+
{
256+
test_fft_real (fft_size, false, true);
257+
}
258+
221259
SECTION ("Testing Complex Convolution with size: " + std::to_string (fft_size))
222260
{
223261
test_convolution_complex (fft_size);
@@ -231,7 +269,7 @@ TEST_CASE("FFT SSE/NEON")
231269
}
232270

233271
#if defined(__SSE2__)
234-
TEST_CASE("FFT AVX")
272+
TEST_CASE ("FFT AVX")
235273
{
236274
for (int i = 5; i < 20; ++i)
237275
{
@@ -246,6 +284,16 @@ TEST_CASE("FFT AVX")
246284
test_fft_real (fft_size, true);
247285
}
248286

287+
SECTION ("Testing pre-allocated Complex FFT with size: " + std::to_string (fft_size))
288+
{
289+
test_fft_complex (fft_size, true, true);
290+
}
291+
292+
SECTION ("Testing pre-allocated Real FFT with size: " + std::to_string (fft_size))
293+
{
294+
test_fft_real (fft_size, true, true);
295+
}
296+
249297
SECTION ("Testing Complex Convolution with size: " + std::to_string (fft_size))
250298
{
251299
test_convolution_complex (fft_size, true);

0 commit comments

Comments
 (0)