Enable some optimization flags in debug builds (#6)

jatinchowdhury18 · web-flow · commit 76aca18a31f0 · 2025-05-23T15:20:59.000-07:00
* Enable some optimization flags in debug builds

* C++20 and remove debug override flag

* Attempting to fix MSVC

* Fixing SSE crashes with Clang-Cl
diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
@@ -24,7 +24,11 @@ jobs:
             os: ubuntu-22.04
             cmake_args: "-DCMAKE_LINKER_TYPE=MOLD -DCMAKE_C_COMPILER=clang-15 -DCMAKE_CXX_COMPILER=clang++-15"
             nparallel: 4
-          - name: Windows
+          - name: Windows (MSVC)
+            os: windows-2022
+            cmake_args: ""
+            nparallel: 4
+          - name: Windows (Clang)
             os: windows-2022
             cmake_args: -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl
             nparallel: 4
@@ -69,8 +73,14 @@ jobs:
       - name: Cmake Configure
         run: cmake -Bbuild -G"Ninja Multi-Config" -DCHOWDSP_FFT_TESTING=ON ${{ matrix.cmake_args }}
 
-      - name: Build Benchmarks
+      - name: Build Benchmarks (Debug)
+        run: cmake --build build --config Debug --parallel ${{ matrix.nparallel }} --target bench_chowdsp_fft
+
+      - name: Run Benchmarks (Debug)
+        run: ./build/bench/Debug/bench_chowdsp_fft
+
+      - name: Build Benchmarks (Release)
         run: cmake --build build --config Release --parallel ${{ matrix.nparallel }} --target bench_chowdsp_fft
 
-      - name: Run Benchmarks
+      - name: Run Benchmarks (Release)
         run: ./build/bench/Release/bench_chowdsp_fft
diff --git a/.gitignore b/.gitignore
@@ -1,8 +1,6 @@
 build*/
 
 .focus-config
-.raddbg_project
+*.raddbg_project
 .vscode/
 .idea/
-
-
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -9,7 +9,7 @@ target_sources(chowdsp_fft
 )
 target_include_directories(chowdsp_fft PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 target_compile_definitions(chowdsp_fft PRIVATE _USE_MATH_DEFINES=1)
-target_compile_features(chowdsp_fft PRIVATE cxx_std_17)
+target_compile_features(chowdsp_fft PRIVATE cxx_std_20)
 
 include(CheckCXXCompilerFlag)
 CHECK_CXX_COMPILER_FLAG("/arch:AVX2" COMPILER_OPT_ARCH_AVX_MSVC_SUPPORTED)
@@ -19,15 +19,15 @@ if(COMPILER_OPT_ARCH_AVX_MSVC_SUPPORTED)
     add_library(chowdsp_fft_avx STATIC simd/chowdsp_fft_impl_avx.cpp)
     target_compile_options(chowdsp_fft_avx PRIVATE /arch:AVX2)
     target_compile_definitions(chowdsp_fft_avx PRIVATE _USE_MATH_DEFINES=1)
-    target_compile_features(chowdsp_fft_avx PRIVATE cxx_std_17)
+    target_compile_features(chowdsp_fft_avx PRIVATE cxx_std_20)
     target_link_libraries(chowdsp_fft PRIVATE chowdsp_fft_avx)
     target_compile_definitions(chowdsp_fft PRIVATE CHOWDSP_FFT_COMPILER_SUPPORTS_AVX=1)
 else()
     if(COMPILER_OPT_ARCH_AVX_GCC_CLANG_SUPPORTED)
         message(STATUS "chowdsp_fft -- Compiler supports flags: -mavx2 -mfma")
         add_library(chowdsp_fft_avx STATIC simd/chowdsp_fft_impl_avx.cpp)
         target_compile_options(chowdsp_fft_avx PRIVATE -mavx2 -mfma -Wno-unused-command-line-argument)
-        target_compile_features(chowdsp_fft_avx PRIVATE cxx_std_17)
+        target_compile_features(chowdsp_fft_avx PRIVATE cxx_std_20)
         target_compile_definitions(chowdsp_fft_avx PRIVATE _USE_MATH_DEFINES=1)
         target_link_libraries(chowdsp_fft PRIVATE chowdsp_fft_avx)
         target_compile_definitions(chowdsp_fft PRIVATE CHOWDSP_FFT_COMPILER_SUPPORTS_AVX=1)
@@ -37,6 +37,34 @@ else()
     endif()
 endif()
 
+# Debug builds of the FFT kernels are compiled with optimizations enabled
+# unless CHOWDSP_FFT_TOTAL_DEBUG is set, in which case the toolchain's
+# default Debug flags are left untouched.
+if(CHOWDSP_FFT_TOTAL_DEBUG)
+    message(AUTHOR_WARNING "chowdsp_fft -- Skipping debug optimization flags!")
+else()
+    if(MSVC)
+        # The "RTC" runtime-check flags are incompatible with any optimization
+        # flags, so strip them from the default Debug flags before adding /O2.
+        # NOTE(review): these are directory-scoped variables, so the change
+        # should also reach targets defined later in this directory tree --
+        # confirm that is acceptable for the tests and benchmarks.
+        string(REGEX REPLACE "/RTC(su|[1su])" "" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}")
+        string(REGEX REPLACE "/RTC(su|[1su])" "" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
+
+        set(DEBUG_OPT_FLAGS /O2 /Ob2)
+    else()
+        set(DEBUG_OPT_FLAGS -O2)
+    endif()
+    message(STATUS "Setting debug optimization flags: ${DEBUG_OPT_FLAGS}")
+
+    # Quoted so the ';' in the flag list cannot split the generator expression.
+    target_compile_options(chowdsp_fft PRIVATE "$<$<CONFIG:Debug>:${DEBUG_OPT_FLAGS}>")
+    if(TARGET chowdsp_fft_avx)
+        target_compile_options(chowdsp_fft_avx PRIVATE "$<$<CONFIG:Debug>:${DEBUG_OPT_FLAGS}>")
+    endif()
+endif()
+
 ## JUCE compatibility
 if(EXISTS "${JUCE_MODULES_DIR}")
     message(STATUS "chowdsp_fft -- Configuring chowdsp_fft_juce")
diff --git a/simd/chowdsp_fft_impl_sse.cpp b/simd/chowdsp_fft_impl_sse.cpp
@@ -130,18 +130,16 @@ static void fft_destroy_setup (FFT_Setup* s)
 }
 
 //====================================================================
-static inline auto interleave2 (__m128 in1, __m128 in2)
+static inline void interleave2 (__m128 in1, __m128 in2, __m128& out1, __m128& out2)
 {
-    auto out1 = _mm_unpacklo_ps (in1, in2);
-    auto out2 = _mm_unpackhi_ps (in1, in2);
-    return std::make_tuple (out1, out2);
+    out1 = _mm_unpacklo_ps (in1, in2);
+    out2 = _mm_unpackhi_ps (in1, in2);
 }
 
-static inline auto uninterleave2 (__m128 in1, __m128 in2)
+static inline void uninterleave2 (__m128 in1, __m128 in2, __m128& out1, __m128& out2)
 {
-    auto out1 = _mm_shuffle_ps (in1, in2, _MM_SHUFFLE (2, 0, 2, 0));
-    auto out2 = _mm_shuffle_ps (in1, in2, _MM_SHUFFLE (3, 1, 3, 1));
-    return std::make_tuple (out1, out2);
+    out1 = _mm_shuffle_ps (in1, in2, _MM_SHUFFLE (2, 0, 2, 0));
+    out2 = _mm_shuffle_ps (in1, in2, _MM_SHUFFLE (3, 1, 3, 1));
 }
 
 static inline auto mul_scalar (__m128 a, float b)
@@ -630,7 +628,7 @@ static void radf2_ps (int ido, int l1, const __m128* __restrict cc, __m128* __re
     }
     for (k = 0; k < l1ido; k += ido)
     {
-        ch[2 * k + ido] = _mm_xor_ps(cc[ido - 1 + k + l1ido], _mm_set1_ps(-0.f)); // negate
+        ch[2 * k + ido] = _mm_xor_ps (cc[ido - 1 + k + l1ido], _mm_set1_ps (-0.f)); // negate
         ch[2 * k + ido - 1] = cc[k + ido - 1];
     }
 }
@@ -961,8 +959,9 @@ static void pffft_real_finalize (int Ncvec, const __m128* in, __m128* out, const
     int k, dk = Ncvec / (int) SIMD_SZ; // number of 4x4 matrix blocks
     /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
 
-    union v4sf_union {
-        __m128  v;
+    union v4sf_union
+    {
+        __m128 v;
         float f[SIMD_SZ];
     };
 
@@ -1073,8 +1072,9 @@ static void pffft_real_preprocess (int Ncvec, const __m128* in, __m128* out, con
     int k, dk = Ncvec / (int) SIMD_SZ; // number of 4x4 matrix blocks
     /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
 
-    union v4sf_union {
-        __m128  v;
+    union v4sf_union
+    {
+        __m128 v;
         float f[SIMD_SZ];
     };
 
@@ -1470,14 +1470,16 @@ static void reversed_copy (int N, const __m128* in, int in_stride, __m128* out)
     auto* in_start = in;
     auto* out_start = out;
 
-    auto [g0, g1] = interleave2 (in[0], in[1]);
+    __m128 g0, g1;
+    interleave2 (in[0], in[1], g0, g1);
     in += in_stride;
 
     *--out = _mm_shuffle_ps (g1, g0, _MM_SHUFFLE (3, 2, 1, 0)); // [g0l, g0h], [g1l g1h] -> [g1l, g0h]
     int k;
     for (k = 1; k < N; ++k)
     {
-        auto [h0, h1] = interleave2 (in[0], in[1]);
+        __m128 h0, h1;
+        interleave2 (in[0], in[1], h0, h1);
         in += in_stride;
         *--out = _mm_shuffle_ps (h0, g1, _MM_SHUFFLE (3, 2, 1, 0));
         *--out = _mm_shuffle_ps (h1, h0, _MM_SHUFFLE (3, 2, 1, 0));
@@ -1498,15 +1500,15 @@ static void unreversed_copy (int N, const __m128* in, __m128* out, int out_strid
         h1 = *in++;
         g1 = _mm_shuffle_ps (h0, g1, _MM_SHUFFLE (3, 2, 1, 0));
         h0 = _mm_shuffle_ps (h1, h0, _MM_SHUFFLE (3, 2, 1, 0));
-        std::tie (out[0], out[1]) = uninterleave2 (h0, g1);
+        uninterleave2 (h0, g1, out[0], out[1]);
         out += out_stride;
         g1 = h1;
     }
     h0 = *in++;
     h1 = g0;
     g1 = _mm_shuffle_ps (h0, g1, _MM_SHUFFLE (3, 2, 1, 0));
     h0 = _mm_shuffle_ps (h1, h0, _MM_SHUFFLE (3, 2, 1, 0));
-    std::tie (out[0], out[1]) = uninterleave2 (h0, g1);
+    uninterleave2 (h0, g1, out[0], out[1]);
 }
 
 static void pffft_zreorder (FFT_Setup* setup, const float* in, float* out, fft_direction_t direction)
@@ -1522,8 +1524,8 @@ static void pffft_zreorder (FFT_Setup* setup, const float* in, float* out, fft_d
         {
             for (k = 0; k < dk; ++k)
             {
-                std::tie (vout[2 * (0 * dk + k) + 0], vout[2 * (0 * dk + k) + 1]) = interleave2 (vin[k * 8 + 0], vin[k * 8 + 1]);
-                std::tie (vout[2 * (2 * dk + k) + 0], vout[2 * (2 * dk + k) + 1]) = interleave2 (vin[k * 8 + 4], vin[k * 8 + 5]);
+                interleave2 (vin[k * 8 + 0], vin[k * 8 + 1], vout[2 * (0 * dk + k) + 0], vout[2 * (0 * dk + k) + 1]);
+                interleave2 (vin[k * 8 + 4], vin[k * 8 + 5], vout[2 * (2 * dk + k) + 0], vout[2 * (2 * dk + k) + 1]);
             }
             reversed_copy (dk, vin + 2, 8, (__m128*) (out + N / 2));
             reversed_copy (dk, vin + 6, 8, (__m128*) (out + N));
@@ -1532,8 +1534,8 @@ static void pffft_zreorder (FFT_Setup* setup, const float* in, float* out, fft_d
         {
             for (k = 0; k < dk; ++k)
             {
-                std::tie (vout[k * 8 + 0], vout[k * 8 + 1]) = uninterleave2 (vin[2 * (0 * dk + k) + 0], vin[2 * (0 * dk + k) + 1]);
-                std::tie (vout[k * 8 + 4], vout[k * 8 + 5]) = uninterleave2 (vin[2 * (2 * dk + k) + 0], vin[2 * (2 * dk + k) + 1]);
+                uninterleave2 (vin[2 * (0 * dk + k) + 0], vin[2 * (0 * dk + k) + 1], vout[k * 8 + 0], vout[k * 8 + 1]);
+                uninterleave2 (vin[2 * (2 * dk + k) + 0], vin[2 * (2 * dk + k) + 1], vout[k * 8 + 4], vout[k * 8 + 5]);
             }
             unreversed_copy (dk, (__m128*) (in + N / 4), (__m128*) (out + N - 6 * SIMD_SZ), -8);
             unreversed_copy (dk, (__m128*) (in + 3 * N / 4), (__m128*) (out + N - 2 * SIMD_SZ), -8);
@@ -1546,15 +1548,15 @@ static void pffft_zreorder (FFT_Setup* setup, const float* in, float* out, fft_d
             for (k = 0; k < Ncvec; ++k)
             {
                 int kk = (k / 4) + (k % 4) * (Ncvec / 4);
-                std::tie (vout[kk * 2], vout[kk * 2 + 1]) = interleave2 (vin[k * 2], vin[k * 2 + 1]);
+                interleave2 (vin[k * 2], vin[k * 2 + 1], vout[kk * 2], vout[kk * 2 + 1]);
             }
         }
         else
         {
             for (k = 0; k < Ncvec; ++k)
             {
                 int kk = (k / 4) + (k % 4) * (Ncvec / 4);
-                std::tie (vout[k * 2], vout[k * 2 + 1]) = uninterleave2 (vin[kk * 2], vin[kk * 2 + 1]);
+                uninterleave2 (vin[kk * 2], vin[kk * 2 + 1], vout[k * 2], vout[k * 2 + 1]);
             }
         }
     }
@@ -1591,7 +1593,7 @@ void pffft_transform_internal (FFT_Setup* setup, const float* finput, float* fou
             __m128* tmp = buff[ib];
             for (k = 0; k < Ncvec; ++k)
             {
-                std::tie (tmp[k * 2], tmp[k * 2 + 1]) = uninterleave2 (vinput[k * 2], vinput[k * 2 + 1]);
+                uninterleave2 (vinput[k * 2], vinput[k * 2 + 1], tmp[k * 2], tmp[k * 2 + 1]);
             }
             ib = (cfftf1_ps (Ncvec, buff[ib], buff[! ib], buff[ib], setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
             pffft_cplx_finalize (Ncvec, buff[ib], buff[! ib], (__m128*) setup->e);
@@ -1626,7 +1628,7 @@ void pffft_transform_internal (FFT_Setup* setup, const float* finput, float* fou
             ib = (cfftf1_ps (Ncvec, buff[ib], buff[0], buff[1], setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
             for (k = 0; k < Ncvec; ++k)
             {
-                std::tie (buff[ib][k * 2], buff[ib][k * 2 + 1]) = interleave2 (buff[ib][k * 2], buff[ib][k * 2 + 1]);
+                interleave2 (buff[ib][k * 2], buff[ib][k * 2 + 1], buff[ib][k * 2], buff[ib][k * 2 + 1]);
             }
         }
     }
@@ -1672,8 +1674,8 @@ void pffft_convolve_internal (FFT_Setup* setup, const float* a, const float* b,
         br = vb[2 * i + 0];
         bi = vb[2 * i + 1];
         std::tie (ar, ai) = cplx_mul_v (ar, ai, br, bi);
-        vab[2 * i + 0] = _mm_add_ps  (vab[2 * i + 0], _mm_mul_ps (ar, vscal));
-        vab[2 * i + 1] = _mm_add_ps  (vab[2 * i + 1], _mm_mul_ps (ai, vscal));
+        vab[2 * i + 0] = _mm_add_ps (vab[2 * i + 0], _mm_mul_ps (ar, vscal));
+        vab[2 * i + 1] = _mm_add_ps (vab[2 * i + 1], _mm_mul_ps (ai, vscal));
         ar = va[2 * i + 2];
         ai = va[2 * i + 3];
         br = vb[2 * i + 2];

Original file line number	Diff line number	Diff line change
`@@ -130,18 +130,16 @@ static void fft_destroy_setup (FFT_Setup* s)`
`130`	`130`	`}`
`131`	`131`
`132`	`132`	`//====================================================================`
`133`		`-static inline auto interleave2 (__m128 in1, __m128 in2)`
	`133`	`+static inline void interleave2 (__m128 in1, __m128 in2, __m128& out1, __m128& out2)`
`134`	`134`	`{`
`135`		`- auto out1 = _mm_unpacklo_ps (in1, in2);`
`136`		`- auto out2 = _mm_unpackhi_ps (in1, in2);`
`137`		`- return std::make_tuple (out1, out2);`
	`135`	`+ out1 = _mm_unpacklo_ps (in1, in2);`
	`136`	`+ out2 = _mm_unpackhi_ps (in1, in2);`
`138`	`137`	`}`
`139`	`138`
`140`		`-static inline auto uninterleave2 (__m128 in1, __m128 in2)`
	`139`	`+static inline void uninterleave2 (__m128 in1, __m128 in2, __m128& out1, __m128& out2)`
`141`	`140`	`{`
`142`		`- auto out1 = _mm_shuffle_ps (in1, in2, _MM_SHUFFLE (2, 0, 2, 0));`
`143`		`- auto out2 = _mm_shuffle_ps (in1, in2, _MM_SHUFFLE (3, 1, 3, 1));`
`144`		`- return std::make_tuple (out1, out2);`
	`141`	`+ out1 = _mm_shuffle_ps (in1, in2, _MM_SHUFFLE (2, 0, 2, 0));`
	`142`	`+ out2 = _mm_shuffle_ps (in1, in2, _MM_SHUFFLE (3, 1, 3, 1));`
`145`	`143`	`}`
`146`	`144`
`147`	`145`	`static inline auto mul_scalar (__m128 a, float b)`
`@@ -630,7 +628,7 @@ static void radf2_ps (int ido, int l1, const __m128* __restrict cc, __m128* __re`
`630`	`628`	`}`
`631`	`629`	`for (k = 0; k < l1ido; k += ido)`
`632`	`630`	`{`
`633`		`- ch[2 * k + ido] = _mm_xor_ps(cc[ido - 1 + k + l1ido], _mm_set1_ps(-0.f)); // negate`
	`631`	`+ ch[2 * k + ido] = _mm_xor_ps (cc[ido - 1 + k + l1ido], _mm_set1_ps (-0.f)); // negate`
`634`	`632`	`ch[2 * k + ido - 1] = cc[k + ido - 1];`
`635`	`633`	`}`
`636`	`634`	`}`
`@@ -961,8 +959,9 @@ static void pffft_real_finalize (int Ncvec, const __m128* in, __m128* out, const`
`961`	`959`	`int k, dk = Ncvec / (int) SIMD_SZ; // number of 4x4 matrix blocks`
`962`	`960`	`/* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */`
`963`	`961`
`964`		`- union v4sf_union {`
`965`		`- __m128  v;`
	`962`	`+ union v4sf_union`
	`963`	`+ {`
	`964`	`+ __m128 v;`
`966`	`965`	`float f[SIMD_SZ];`
`967`	`966`	`};`
`968`	`967`
`@@ -1073,8 +1072,9 @@ static void pffft_real_preprocess (int Ncvec, const __m128* in, __m128* out, con`
`1073`	`1072`	`int k, dk = Ncvec / (int) SIMD_SZ; // number of 4x4 matrix blocks`
`1074`	`1073`	`/* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */`
`1075`	`1074`
`1076`		`- union v4sf_union {`
`1077`		`- __m128  v;`
	`1075`	`+ union v4sf_union`
	`1076`	`+ {`
	`1077`	`+ __m128 v;`
`1078`	`1078`	`float f[SIMD_SZ];`
`1079`	`1079`	`};`
`1080`	`1080`
`@@ -1470,14 +1470,16 @@ static void reversed_copy (int N, const __m128* in, int in_stride, __m128* out)`
`1470`	`1470`	`auto* in_start = in;`
`1471`	`1471`	`auto* out_start = out;`
`1472`	`1472`
`1473`		`- auto [g0, g1] = interleave2 (in[0], in[1]);`
	`1473`	`+ __m128 g0, g1;`
	`1474`	`+ interleave2 (in[0], in[1], g0, g1);`
`1474`	`1475`	`in += in_stride;`
`1475`	`1476`
`1476`	`1477`	`*--out = _mm_shuffle_ps (g1, g0, _MM_SHUFFLE (3, 2, 1, 0)); // [g0l, g0h], [g1l g1h] -> [g1l, g0h]`
`1477`	`1478`	`int k;`
`1478`	`1479`	`for (k = 1; k < N; ++k)`
`1479`	`1480`	`{`
`1480`		`- auto [h0, h1] = interleave2 (in[0], in[1]);`
	`1481`	`+ __m128 h0, h1;`
	`1482`	`+ interleave2 (in[0], in[1], h0, h1);`
`1481`	`1483`	`in += in_stride;`
`1482`	`1484`	`*--out = _mm_shuffle_ps (h0, g1, _MM_SHUFFLE (3, 2, 1, 0));`
`1483`	`1485`	`*--out = _mm_shuffle_ps (h1, h0, _MM_SHUFFLE (3, 2, 1, 0));`
`@@ -1498,15 +1500,15 @@ static void unreversed_copy (int N, const __m128* in, __m128* out, int out_strid`
`1498`	`1500`	`h1 = *in++;`
`1499`	`1501`	`g1 = _mm_shuffle_ps (h0, g1, _MM_SHUFFLE (3, 2, 1, 0));`
`1500`	`1502`	`h0 = _mm_shuffle_ps (h1, h0, _MM_SHUFFLE (3, 2, 1, 0));`
`1501`		`- std::tie (out[0], out[1]) = uninterleave2 (h0, g1);`
	`1503`	`+ uninterleave2 (h0, g1, out[0], out[1]);`
`1502`	`1504`	`out += out_stride;`
`1503`	`1505`	`g1 = h1;`
`1504`	`1506`	`}`
`1505`	`1507`	`h0 = *in++;`
`1506`	`1508`	`h1 = g0;`
`1507`	`1509`	`g1 = _mm_shuffle_ps (h0, g1, _MM_SHUFFLE (3, 2, 1, 0));`
`1508`	`1510`	`h0 = _mm_shuffle_ps (h1, h0, _MM_SHUFFLE (3, 2, 1, 0));`
`1509`		`- std::tie (out[0], out[1]) = uninterleave2 (h0, g1);`
	`1511`	`+ uninterleave2 (h0, g1, out[0], out[1]);`
`1510`	`1512`	`}`
`1511`	`1513`
`1512`	`1514`	`static void pffft_zreorder (FFT_Setup* setup, const float* in, float* out, fft_direction_t direction)`
`@@ -1522,8 +1524,8 @@ static void pffft_zreorder (FFT_Setup* setup, const float* in, float* out, fft_d`
`1522`	`1524`	`{`
`1523`	`1525`	`for (k = 0; k < dk; ++k)`
`1524`	`1526`	`{`
`1525`		`- std::tie (vout[2 * (0 * dk + k) + 0], vout[2 * (0 * dk + k) + 1]) = interleave2 (vin[k * 8 + 0], vin[k * 8 + 1]);`
`1526`		`- std::tie (vout[2 * (2 * dk + k) + 0], vout[2 * (2 * dk + k) + 1]) = interleave2 (vin[k * 8 + 4], vin[k * 8 + 5]);`
	`1527`	`+ interleave2 (vin[k * 8 + 0], vin[k * 8 + 1], vout[2 * (0 * dk + k) + 0], vout[2 * (0 * dk + k) + 1]);`
	`1528`	`+ interleave2 (vin[k * 8 + 4], vin[k * 8 + 5], vout[2 * (2 * dk + k) + 0], vout[2 * (2 * dk + k) + 1]);`
`1527`	`1529`	`}`
`1528`	`1530`	`reversed_copy (dk, vin + 2, 8, (__m128*) (out + N / 2));`
`1529`	`1531`	`reversed_copy (dk, vin + 6, 8, (__m128*) (out + N));`
`@@ -1532,8 +1534,8 @@ static void pffft_zreorder (FFT_Setup* setup, const float* in, float* out, fft_d`
`1532`	`1534`	`{`
`1533`	`1535`	`for (k = 0; k < dk; ++k)`
`1534`	`1536`	`{`
`1535`		`- std::tie (vout[k * 8 + 0], vout[k * 8 + 1]) = uninterleave2 (vin[2 * (0 * dk + k) + 0], vin[2 * (0 * dk + k) + 1]);`
`1536`		`- std::tie (vout[k * 8 + 4], vout[k * 8 + 5]) = uninterleave2 (vin[2 * (2 * dk + k) + 0], vin[2 * (2 * dk + k) + 1]);`
	`1537`	`+ uninterleave2 (vin[2 * (0 * dk + k) + 0], vin[2 * (0 * dk + k) + 1], vout[k * 8 + 0], vout[k * 8 + 1]);`
	`1538`	`+ uninterleave2 (vin[2 * (2 * dk + k) + 0], vin[2 * (2 * dk + k) + 1], vout[k * 8 + 4], vout[k * 8 + 5]);`
`1537`	`1539`	`}`
`1538`	`1540`	`unreversed_copy (dk, (__m128*) (in + N / 4), (__m128*) (out + N - 6 * SIMD_SZ), -8);`
`1539`	`1541`	`unreversed_copy (dk, (__m128*) (in + 3 * N / 4), (__m128*) (out + N - 2 * SIMD_SZ), -8);`
`@@ -1546,15 +1548,15 @@ static void pffft_zreorder (FFT_Setup* setup, const float* in, float* out, fft_d`
`1546`	`1548`	`for (k = 0; k < Ncvec; ++k)`
`1547`	`1549`	`{`
`1548`	`1550`	`int kk = (k / 4) + (k % 4) * (Ncvec / 4);`
`1549`		`- std::tie (vout[kk * 2], vout[kk * 2 + 1]) = interleave2 (vin[k * 2], vin[k * 2 + 1]);`
	`1551`	`+ interleave2 (vin[k * 2], vin[k * 2 + 1], vout[kk * 2], vout[kk * 2 + 1]);`
`1550`	`1552`	`}`
`1551`	`1553`	`}`
`1552`	`1554`	`else`
`1553`	`1555`	`{`
`1554`	`1556`	`for (k = 0; k < Ncvec; ++k)`
`1555`	`1557`	`{`
`1556`	`1558`	`int kk = (k / 4) + (k % 4) * (Ncvec / 4);`
`1557`		`- std::tie (vout[k * 2], vout[k * 2 + 1]) = uninterleave2 (vin[kk * 2], vin[kk * 2 + 1]);`
	`1559`	`+ uninterleave2 (vin[kk * 2], vin[kk * 2 + 1], vout[k * 2], vout[k * 2 + 1]);`
`1558`	`1560`	`}`
`1559`	`1561`	`}`
`1560`	`1562`	`}`
`@@ -1591,7 +1593,7 @@ void pffft_transform_internal (FFT_Setup* setup, const float* finput, float* fou`
`1591`	`1593`	`__m128* tmp = buff[ib];`
`1592`	`1594`	`for (k = 0; k < Ncvec; ++k)`
`1593`	`1595`	`{`
`1594`		`- std::tie (tmp[k * 2], tmp[k * 2 + 1]) = uninterleave2 (vinput[k * 2], vinput[k * 2 + 1]);`
	`1596`	`+ uninterleave2 (vinput[k * 2], vinput[k * 2 + 1], tmp[k * 2], tmp[k * 2 + 1]);`
`1595`	`1597`	`}`
`1596`	`1598`	`ib = (cfftf1_ps (Ncvec, buff[ib], buff[! ib], buff[ib], setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);`
`1597`	`1599`	`pffft_cplx_finalize (Ncvec, buff[ib], buff[! ib], (__m128*) setup->e);`
`@@ -1626,7 +1628,7 @@ void pffft_transform_internal (FFT_Setup* setup, const float* finput, float* fou`
`1626`	`1628`	`ib = (cfftf1_ps (Ncvec, buff[ib], buff[0], buff[1], setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);`
`1627`	`1629`	`for (k = 0; k < Ncvec; ++k)`
`1628`	`1630`	`{`
`1629`		`- std::tie (buff[ib][k * 2], buff[ib][k * 2 + 1]) = interleave2 (buff[ib][k * 2], buff[ib][k * 2 + 1]);`
	`1631`	`+ interleave2 (buff[ib][k * 2], buff[ib][k * 2 + 1], buff[ib][k * 2], buff[ib][k * 2 + 1]);`
`1630`	`1632`	`}`
`1631`	`1633`	`}`
`1632`	`1634`	`}`
`@@ -1672,8 +1674,8 @@ void pffft_convolve_internal (FFT_Setup* setup, const float* a, const float* b,`
`1672`	`1674`	`br = vb[2 * i + 0];`
`1673`	`1675`	`bi = vb[2 * i + 1];`
`1674`	`1676`	`std::tie (ar, ai) = cplx_mul_v (ar, ai, br, bi);`
`1675`		`- vab[2 * i + 0] = _mm_add_ps  (vab[2 * i + 0], _mm_mul_ps (ar, vscal));`
`1676`		`- vab[2 * i + 1] = _mm_add_ps  (vab[2 * i + 1], _mm_mul_ps (ai, vscal));`
	`1677`	`+ vab[2 * i + 0] = _mm_add_ps (vab[2 * i + 0], _mm_mul_ps (ar, vscal));`
	`1678`	`+ vab[2 * i + 1] = _mm_add_ps (vab[2 * i + 1], _mm_mul_ps (ai, vscal));`
`1677`	`1679`	`ar = va[2 * i + 2];`
`1678`	`1680`	`ai = va[2 * i + 3];`
`1679`	`1681`	`br = vb[2 * i + 2];`