Skip to content

Commit 76aca18

Browse files
Enable some optimization flags in debug builds (#6)
* Enable some optimization flags in debug builds
* C++20 and remove debug override flag
* Attempting to fix MSVC
* Fixing SSE crashes with Clang-Cl
1 parent a5072de commit 76aca18

File tree

4 files changed

+74
-36
lines changed

4 files changed

+74
-36
lines changed

.github/workflows/bench.yml

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,11 @@ jobs:
2424
os: ubuntu-22.04
2525
cmake_args: "-DCMAKE_LINKER_TYPE=MOLD -DCMAKE_C_COMPILER=clang-15 -DCMAKE_CXX_COMPILER=clang++-15"
2626
nparallel: 4
27-
- name: Windows
27+
- name: Windows (MSVC)
28+
os: windows-2022
29+
cmake_args: ""
30+
nparallel: 4
31+
- name: Windows (Clang)
2832
os: windows-2022
2933
cmake_args: -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl
3034
nparallel: 4
@@ -69,8 +73,14 @@ jobs:
6973
- name: Cmake Configure
7074
run: cmake -Bbuild -G"Ninja Multi-Config" -DCHOWDSP_FFT_TESTING=ON ${{ matrix.cmake_args }}
7175

72-
- name: Build Benchmarks
76+
- name: Build Benchmarks (Debug)
77+
run: cmake --build build --config Debug --parallel ${{ matrix.nparallel }} --target bench_chowdsp_fft
78+
79+
- name: Run Benchmarks (Debug)
80+
run: ./build/bench/Debug/bench_chowdsp_fft
81+
82+
- name: Build Benchmarks (Release)
7383
run: cmake --build build --config Release --parallel ${{ matrix.nparallel }} --target bench_chowdsp_fft
7484

75-
- name: Run Benchmarks
85+
- name: Run Benchmarks (Release)
7686
run: ./build/bench/Release/bench_chowdsp_fft

.gitignore

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
build*/
22

33
.focus-config
4-
.raddbg_project
4+
*.raddbg_project
55
.vscode/
66
.idea/
7-
8-

CMakeLists.txt

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ target_sources(chowdsp_fft
99
)
1010
target_include_directories(chowdsp_fft PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
1111
target_compile_definitions(chowdsp_fft PRIVATE _USE_MATH_DEFINES=1)
12-
target_compile_features(chowdsp_fft PRIVATE cxx_std_17)
12+
target_compile_features(chowdsp_fft PRIVATE cxx_std_20)
1313

1414
include(CheckCXXCompilerFlag)
1515
CHECK_CXX_COMPILER_FLAG("/arch:AVX2" COMPILER_OPT_ARCH_AVX_MSVC_SUPPORTED)
@@ -19,15 +19,15 @@ if(COMPILER_OPT_ARCH_AVX_MSVC_SUPPORTED)
1919
add_library(chowdsp_fft_avx STATIC simd/chowdsp_fft_impl_avx.cpp)
2020
target_compile_options(chowdsp_fft_avx PRIVATE /arch:AVX2)
2121
target_compile_definitions(chowdsp_fft_avx PRIVATE _USE_MATH_DEFINES=1)
22-
target_compile_features(chowdsp_fft_avx PRIVATE cxx_std_17)
22+
target_compile_features(chowdsp_fft_avx PRIVATE cxx_std_20)
2323
target_link_libraries(chowdsp_fft PRIVATE chowdsp_fft_avx)
2424
target_compile_definitions(chowdsp_fft PRIVATE CHOWDSP_FFT_COMPILER_SUPPORTS_AVX=1)
2525
else()
2626
if(COMPILER_OPT_ARCH_AVX_GCC_CLANG_SUPPORTED)
2727
message(STATUS "chowdsp_fft -- Compiler supports flags: -mavx2 -mfma")
2828
add_library(chowdsp_fft_avx STATIC simd/chowdsp_fft_impl_avx.cpp)
2929
target_compile_options(chowdsp_fft_avx PRIVATE -mavx2 -mfma -Wno-unused-command-line-argument)
30-
target_compile_features(chowdsp_fft_avx PRIVATE cxx_std_17)
30+
target_compile_features(chowdsp_fft_avx PRIVATE cxx_std_20)
3131
target_compile_definitions(chowdsp_fft_avx PRIVATE _USE_MATH_DEFINES=1)
3232
target_link_libraries(chowdsp_fft PRIVATE chowdsp_fft_avx)
3333
target_compile_definitions(chowdsp_fft PRIVATE CHOWDSP_FFT_COMPILER_SUPPORTS_AVX=1)
@@ -37,6 +37,34 @@ else()
3737
endif()
3838
endif()
3939

40+
if(CHOWDSP_FFT_TOTAL_DEBUG)
41+
message(AUTHOR_WARNING "chowdsp_fft -- Skipping debug optimization flags!")
42+
else()
43+
if(MSVC)
44+
# message(STATUS "CMAKE_C_FLAGS: ${CMAKE_C_FLAGS}")
45+
# message(STATUS "CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
46+
# message(STATUS "CMAKE_C_FLAGS_DEBUG: ${CMAKE_C_FLAGS_DEBUG}")
47+
# message(STATUS "CMAKE_CXX_FLAGS_DEBUG: ${CMAKE_CXX_FLAGS_DEBUG}")
48+
49+
# The "RTC" flags are incompatible with any optimization flags
50+
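# We strip the RTC flags from the debug flag variables here. The edit is made to this directory's copy of the variables, so it should not affect any parent ("higher" level) CMake project -- TODO confirm.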
51+
STRING (REGEX REPLACE "/RTC(su|[1su])" "" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}")
52+
STRING (REGEX REPLACE "/RTC(su|[1su])" "" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
53+
# message(STATUS "CMAKE_C_FLAGS_DEBUG: ${CMAKE_C_FLAGS_DEBUG}")
54+
# message(STATUS "CMAKE_CXX_FLAGS_DEBUG: ${CMAKE_CXX_FLAGS_DEBUG}")
55+
56+
set(DEBUG_OPT_FLAGS /O2 /Ob2)
57+
else()
58+
set(DEBUG_OPT_FLAGS -O2)
59+
endif()
60+
message(STATUS "Setting debug optimization flags: ${DEBUG_OPT_FLAGS}")
61+
62+
target_compile_options(chowdsp_fft PRIVATE $<$<CONFIG:Debug>:${DEBUG_OPT_FLAGS}>)
63+
if(TARGET chowdsp_fft_avx)
64+
target_compile_options(chowdsp_fft_avx PRIVATE $<$<CONFIG:Debug>:${DEBUG_OPT_FLAGS}>)
65+
endif()
66+
endif()
67+
4068
## JUCE compatibility
4169
if(EXISTS "${JUCE_MODULES_DIR}")
4270
message(STATUS "chowdsp_fft -- Configuring chowdsp_fft_juce")

simd/chowdsp_fft_impl_sse.cpp

Lines changed: 29 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -130,18 +130,16 @@ static void fft_destroy_setup (FFT_Setup* s)
130130
}
131131

132132
//====================================================================
133-
static inline auto interleave2 (__m128 in1, __m128 in2)
133+
static inline void interleave2 (__m128 in1, __m128 in2, __m128& out1, __m128& out2)
134134
{
135-
auto out1 = _mm_unpacklo_ps (in1, in2);
136-
auto out2 = _mm_unpackhi_ps (in1, in2);
137-
return std::make_tuple (out1, out2);
135+
out1 = _mm_unpacklo_ps (in1, in2);
136+
out2 = _mm_unpackhi_ps (in1, in2);
138137
}
139138

140-
static inline auto uninterleave2 (__m128 in1, __m128 in2)
139+
static inline void uninterleave2 (__m128 in1, __m128 in2, __m128& out1, __m128& out2)
141140
{
142-
auto out1 = _mm_shuffle_ps (in1, in2, _MM_SHUFFLE (2, 0, 2, 0));
143-
auto out2 = _mm_shuffle_ps (in1, in2, _MM_SHUFFLE (3, 1, 3, 1));
144-
return std::make_tuple (out1, out2);
141+
out1 = _mm_shuffle_ps (in1, in2, _MM_SHUFFLE (2, 0, 2, 0));
142+
out2 = _mm_shuffle_ps (in1, in2, _MM_SHUFFLE (3, 1, 3, 1));
145143
}
146144

147145
static inline auto mul_scalar (__m128 a, float b)
@@ -630,7 +628,7 @@ static void radf2_ps (int ido, int l1, const __m128* __restrict cc, __m128* __re
630628
}
631629
for (k = 0; k < l1ido; k += ido)
632630
{
633-
ch[2 * k + ido] = _mm_xor_ps(cc[ido - 1 + k + l1ido], _mm_set1_ps(-0.f)); // negate
631+
ch[2 * k + ido] = _mm_xor_ps (cc[ido - 1 + k + l1ido], _mm_set1_ps (-0.f)); // negate
634632
ch[2 * k + ido - 1] = cc[k + ido - 1];
635633
}
636634
}
@@ -961,8 +959,9 @@ static void pffft_real_finalize (int Ncvec, const __m128* in, __m128* out, const
961959
int k, dk = Ncvec / (int) SIMD_SZ; // number of 4x4 matrix blocks
962960
/* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
963961

964-
union v4sf_union {
965-
__m128 v;
962+
union v4sf_union
963+
{
964+
__m128 v;
966965
float f[SIMD_SZ];
967966
};
968967

@@ -1073,8 +1072,9 @@ static void pffft_real_preprocess (int Ncvec, const __m128* in, __m128* out, con
10731072
int k, dk = Ncvec / (int) SIMD_SZ; // number of 4x4 matrix blocks
10741073
/* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
10751074

1076-
union v4sf_union {
1077-
__m128 v;
1075+
union v4sf_union
1076+
{
1077+
__m128 v;
10781078
float f[SIMD_SZ];
10791079
};
10801080

@@ -1470,14 +1470,16 @@ static void reversed_copy (int N, const __m128* in, int in_stride, __m128* out)
14701470
auto* in_start = in;
14711471
auto* out_start = out;
14721472

1473-
auto [g0, g1] = interleave2 (in[0], in[1]);
1473+
__m128 g0, g1;
1474+
interleave2 (in[0], in[1], g0, g1);
14741475
in += in_stride;
14751476

14761477
*--out = _mm_shuffle_ps (g1, g0, _MM_SHUFFLE (3, 2, 1, 0)); // [g0l, g0h], [g1l g1h] -> [g1l, g0h]
14771478
int k;
14781479
for (k = 1; k < N; ++k)
14791480
{
1480-
auto [h0, h1] = interleave2 (in[0], in[1]);
1481+
__m128 h0, h1;
1482+
interleave2 (in[0], in[1], h0, h1);
14811483
in += in_stride;
14821484
*--out = _mm_shuffle_ps (h0, g1, _MM_SHUFFLE (3, 2, 1, 0));
14831485
*--out = _mm_shuffle_ps (h1, h0, _MM_SHUFFLE (3, 2, 1, 0));
@@ -1498,15 +1500,15 @@ static void unreversed_copy (int N, const __m128* in, __m128* out, int out_strid
14981500
h1 = *in++;
14991501
g1 = _mm_shuffle_ps (h0, g1, _MM_SHUFFLE (3, 2, 1, 0));
15001502
h0 = _mm_shuffle_ps (h1, h0, _MM_SHUFFLE (3, 2, 1, 0));
1501-
std::tie (out[0], out[1]) = uninterleave2 (h0, g1);
1503+
uninterleave2 (h0, g1, out[0], out[1]);
15021504
out += out_stride;
15031505
g1 = h1;
15041506
}
15051507
h0 = *in++;
15061508
h1 = g0;
15071509
g1 = _mm_shuffle_ps (h0, g1, _MM_SHUFFLE (3, 2, 1, 0));
15081510
h0 = _mm_shuffle_ps (h1, h0, _MM_SHUFFLE (3, 2, 1, 0));
1509-
std::tie (out[0], out[1]) = uninterleave2 (h0, g1);
1511+
uninterleave2 (h0, g1, out[0], out[1]);
15101512
}
15111513

15121514
static void pffft_zreorder (FFT_Setup* setup, const float* in, float* out, fft_direction_t direction)
@@ -1522,8 +1524,8 @@ static void pffft_zreorder (FFT_Setup* setup, const float* in, float* out, fft_d
15221524
{
15231525
for (k = 0; k < dk; ++k)
15241526
{
1525-
std::tie (vout[2 * (0 * dk + k) + 0], vout[2 * (0 * dk + k) + 1]) = interleave2 (vin[k * 8 + 0], vin[k * 8 + 1]);
1526-
std::tie (vout[2 * (2 * dk + k) + 0], vout[2 * (2 * dk + k) + 1]) = interleave2 (vin[k * 8 + 4], vin[k * 8 + 5]);
1527+
interleave2 (vin[k * 8 + 0], vin[k * 8 + 1], vout[2 * (0 * dk + k) + 0], vout[2 * (0 * dk + k) + 1]);
1528+
interleave2 (vin[k * 8 + 4], vin[k * 8 + 5], vout[2 * (2 * dk + k) + 0], vout[2 * (2 * dk + k) + 1]);
15271529
}
15281530
reversed_copy (dk, vin + 2, 8, (__m128*) (out + N / 2));
15291531
reversed_copy (dk, vin + 6, 8, (__m128*) (out + N));
@@ -1532,8 +1534,8 @@ static void pffft_zreorder (FFT_Setup* setup, const float* in, float* out, fft_d
15321534
{
15331535
for (k = 0; k < dk; ++k)
15341536
{
1535-
std::tie (vout[k * 8 + 0], vout[k * 8 + 1]) = uninterleave2 (vin[2 * (0 * dk + k) + 0], vin[2 * (0 * dk + k) + 1]);
1536-
std::tie (vout[k * 8 + 4], vout[k * 8 + 5]) = uninterleave2 (vin[2 * (2 * dk + k) + 0], vin[2 * (2 * dk + k) + 1]);
1537+
uninterleave2 (vin[2 * (0 * dk + k) + 0], vin[2 * (0 * dk + k) + 1], vout[k * 8 + 0], vout[k * 8 + 1]);
1538+
uninterleave2 (vin[2 * (2 * dk + k) + 0], vin[2 * (2 * dk + k) + 1], vout[k * 8 + 4], vout[k * 8 + 5]);
15371539
}
15381540
unreversed_copy (dk, (__m128*) (in + N / 4), (__m128*) (out + N - 6 * SIMD_SZ), -8);
15391541
unreversed_copy (dk, (__m128*) (in + 3 * N / 4), (__m128*) (out + N - 2 * SIMD_SZ), -8);
@@ -1546,15 +1548,15 @@ static void pffft_zreorder (FFT_Setup* setup, const float* in, float* out, fft_d
15461548
for (k = 0; k < Ncvec; ++k)
15471549
{
15481550
int kk = (k / 4) + (k % 4) * (Ncvec / 4);
1549-
std::tie (vout[kk * 2], vout[kk * 2 + 1]) = interleave2 (vin[k * 2], vin[k * 2 + 1]);
1551+
interleave2 (vin[k * 2], vin[k * 2 + 1], vout[kk * 2], vout[kk * 2 + 1]);
15501552
}
15511553
}
15521554
else
15531555
{
15541556
for (k = 0; k < Ncvec; ++k)
15551557
{
15561558
int kk = (k / 4) + (k % 4) * (Ncvec / 4);
1557-
std::tie (vout[k * 2], vout[k * 2 + 1]) = uninterleave2 (vin[kk * 2], vin[kk * 2 + 1]);
1559+
uninterleave2 (vin[kk * 2], vin[kk * 2 + 1], vout[k * 2], vout[k * 2 + 1]);
15581560
}
15591561
}
15601562
}
@@ -1591,7 +1593,7 @@ void pffft_transform_internal (FFT_Setup* setup, const float* finput, float* fou
15911593
__m128* tmp = buff[ib];
15921594
for (k = 0; k < Ncvec; ++k)
15931595
{
1594-
std::tie (tmp[k * 2], tmp[k * 2 + 1]) = uninterleave2 (vinput[k * 2], vinput[k * 2 + 1]);
1596+
uninterleave2 (vinput[k * 2], vinput[k * 2 + 1], tmp[k * 2], tmp[k * 2 + 1]);
15951597
}
15961598
ib = (cfftf1_ps (Ncvec, buff[ib], buff[! ib], buff[ib], setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
15971599
pffft_cplx_finalize (Ncvec, buff[ib], buff[! ib], (__m128*) setup->e);
@@ -1626,7 +1628,7 @@ void pffft_transform_internal (FFT_Setup* setup, const float* finput, float* fou
16261628
ib = (cfftf1_ps (Ncvec, buff[ib], buff[0], buff[1], setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
16271629
for (k = 0; k < Ncvec; ++k)
16281630
{
1629-
std::tie (buff[ib][k * 2], buff[ib][k * 2 + 1]) = interleave2 (buff[ib][k * 2], buff[ib][k * 2 + 1]);
1631+
interleave2 (buff[ib][k * 2], buff[ib][k * 2 + 1], buff[ib][k * 2], buff[ib][k * 2 + 1]);
16301632
}
16311633
}
16321634
}
@@ -1672,8 +1674,8 @@ void pffft_convolve_internal (FFT_Setup* setup, const float* a, const float* b,
16721674
br = vb[2 * i + 0];
16731675
bi = vb[2 * i + 1];
16741676
std::tie (ar, ai) = cplx_mul_v (ar, ai, br, bi);
1675-
vab[2 * i + 0] = _mm_add_ps (vab[2 * i + 0], _mm_mul_ps (ar, vscal));
1676-
vab[2 * i + 1] = _mm_add_ps (vab[2 * i + 1], _mm_mul_ps (ai, vscal));
1677+
vab[2 * i + 0] = _mm_add_ps (vab[2 * i + 0], _mm_mul_ps (ar, vscal));
1678+
vab[2 * i + 1] = _mm_add_ps (vab[2 * i + 1], _mm_mul_ps (ai, vscal));
16771679
ar = va[2 * i + 2];
16781680
ai = va[2 * i + 3];
16791681
br = vb[2 * i + 2];

0 commit comments

Comments
 (0)