Skip to content

Commit ab9e1e8

Browse files
Fixing SSE crashes with Clang-Cl
1 parent 0b849ac commit ab9e1e8

File tree

2 files changed

+30
-30
lines changed

2 files changed

+30
-30
lines changed

.gitignore

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,6 @@
11
build*/
22

33
.focus-config
4-
.raddbg_project
4+
*.raddbg_project
55
.vscode/
66
.idea/
7-
8-

simd/chowdsp_fft_impl_sse.cpp

Lines changed: 29 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -130,18 +130,16 @@ static void fft_destroy_setup (FFT_Setup* s)
130130
}
131131

132132
//====================================================================
133-
static inline auto interleave2 (__m128 in1, __m128 in2)
133+
static inline void interleave2 (__m128 in1, __m128 in2, __m128& out1, __m128& out2)
134134
{
135-
auto out1 = _mm_unpacklo_ps (in1, in2);
136-
auto out2 = _mm_unpackhi_ps (in1, in2);
137-
return std::make_tuple (out1, out2);
135+
out1 = _mm_unpacklo_ps (in1, in2);
136+
out2 = _mm_unpackhi_ps (in1, in2);
138137
}
139138

140-
static inline auto uninterleave2 (__m128 in1, __m128 in2)
139+
static inline void uninterleave2 (__m128 in1, __m128 in2, __m128& out1, __m128& out2)
141140
{
142-
auto out1 = _mm_shuffle_ps (in1, in2, _MM_SHUFFLE (2, 0, 2, 0));
143-
auto out2 = _mm_shuffle_ps (in1, in2, _MM_SHUFFLE (3, 1, 3, 1));
144-
return std::make_tuple (out1, out2);
141+
out1 = _mm_shuffle_ps (in1, in2, _MM_SHUFFLE (2, 0, 2, 0));
142+
out2 = _mm_shuffle_ps (in1, in2, _MM_SHUFFLE (3, 1, 3, 1));
145143
}
146144

147145
static inline auto mul_scalar (__m128 a, float b)
@@ -630,7 +628,7 @@ static void radf2_ps (int ido, int l1, const __m128* __restrict cc, __m128* __re
630628
}
631629
for (k = 0; k < l1ido; k += ido)
632630
{
633-
ch[2 * k + ido] = _mm_xor_ps(cc[ido - 1 + k + l1ido], _mm_set1_ps(-0.f)); // negate
631+
ch[2 * k + ido] = _mm_xor_ps (cc[ido - 1 + k + l1ido], _mm_set1_ps (-0.f)); // negate
634632
ch[2 * k + ido - 1] = cc[k + ido - 1];
635633
}
636634
}
@@ -961,8 +959,9 @@ static void pffft_real_finalize (int Ncvec, const __m128* in, __m128* out, const
961959
int k, dk = Ncvec / (int) SIMD_SZ; // number of 4x4 matrix blocks
962960
/* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
963961

964-
union v4sf_union {
965-
__m128 v;
962+
union v4sf_union
963+
{
964+
__m128 v;
966965
float f[SIMD_SZ];
967966
};
968967

@@ -1073,8 +1072,9 @@ static void pffft_real_preprocess (int Ncvec, const __m128* in, __m128* out, con
10731072
int k, dk = Ncvec / (int) SIMD_SZ; // number of 4x4 matrix blocks
10741073
/* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
10751074

1076-
union v4sf_union {
1077-
__m128 v;
1075+
union v4sf_union
1076+
{
1077+
__m128 v;
10781078
float f[SIMD_SZ];
10791079
};
10801080

@@ -1470,14 +1470,16 @@ static void reversed_copy (int N, const __m128* in, int in_stride, __m128* out)
14701470
auto* in_start = in;
14711471
auto* out_start = out;
14721472

1473-
auto [g0, g1] = interleave2 (in[0], in[1]);
1473+
__m128 g0, g1;
1474+
interleave2 (in[0], in[1], g0, g1);
14741475
in += in_stride;
14751476

14761477
*--out = _mm_shuffle_ps (g1, g0, _MM_SHUFFLE (3, 2, 1, 0)); // [g0l, g0h], [g1l g1h] -> [g1l, g0h]
14771478
int k;
14781479
for (k = 1; k < N; ++k)
14791480
{
1480-
auto [h0, h1] = interleave2 (in[0], in[1]);
1481+
__m128 h0, h1;
1482+
interleave2 (in[0], in[1], h0, h1);
14811483
in += in_stride;
14821484
*--out = _mm_shuffle_ps (h0, g1, _MM_SHUFFLE (3, 2, 1, 0));
14831485
*--out = _mm_shuffle_ps (h1, h0, _MM_SHUFFLE (3, 2, 1, 0));
@@ -1498,15 +1500,15 @@ static void unreversed_copy (int N, const __m128* in, __m128* out, int out_strid
14981500
h1 = *in++;
14991501
g1 = _mm_shuffle_ps (h0, g1, _MM_SHUFFLE (3, 2, 1, 0));
15001502
h0 = _mm_shuffle_ps (h1, h0, _MM_SHUFFLE (3, 2, 1, 0));
1501-
std::tie (out[0], out[1]) = uninterleave2 (h0, g1);
1503+
uninterleave2 (h0, g1, out[0], out[1]);
15021504
out += out_stride;
15031505
g1 = h1;
15041506
}
15051507
h0 = *in++;
15061508
h1 = g0;
15071509
g1 = _mm_shuffle_ps (h0, g1, _MM_SHUFFLE (3, 2, 1, 0));
15081510
h0 = _mm_shuffle_ps (h1, h0, _MM_SHUFFLE (3, 2, 1, 0));
1509-
std::tie (out[0], out[1]) = uninterleave2 (h0, g1);
1511+
uninterleave2 (h0, g1, out[0], out[1]);
15101512
}
15111513

15121514
static void pffft_zreorder (FFT_Setup* setup, const float* in, float* out, fft_direction_t direction)
@@ -1522,8 +1524,8 @@ static void pffft_zreorder (FFT_Setup* setup, const float* in, float* out, fft_d
15221524
{
15231525
for (k = 0; k < dk; ++k)
15241526
{
1525-
std::tie (vout[2 * (0 * dk + k) + 0], vout[2 * (0 * dk + k) + 1]) = interleave2 (vin[k * 8 + 0], vin[k * 8 + 1]);
1526-
std::tie (vout[2 * (2 * dk + k) + 0], vout[2 * (2 * dk + k) + 1]) = interleave2 (vin[k * 8 + 4], vin[k * 8 + 5]);
1527+
interleave2 (vin[k * 8 + 0], vin[k * 8 + 1], vout[2 * (0 * dk + k) + 0], vout[2 * (0 * dk + k) + 1]);
1528+
interleave2 (vin[k * 8 + 4], vin[k * 8 + 5], vout[2 * (2 * dk + k) + 0], vout[2 * (2 * dk + k) + 1]);
15271529
}
15281530
reversed_copy (dk, vin + 2, 8, (__m128*) (out + N / 2));
15291531
reversed_copy (dk, vin + 6, 8, (__m128*) (out + N));
@@ -1532,8 +1534,8 @@ static void pffft_zreorder (FFT_Setup* setup, const float* in, float* out, fft_d
15321534
{
15331535
for (k = 0; k < dk; ++k)
15341536
{
1535-
std::tie (vout[k * 8 + 0], vout[k * 8 + 1]) = uninterleave2 (vin[2 * (0 * dk + k) + 0], vin[2 * (0 * dk + k) + 1]);
1536-
std::tie (vout[k * 8 + 4], vout[k * 8 + 5]) = uninterleave2 (vin[2 * (2 * dk + k) + 0], vin[2 * (2 * dk + k) + 1]);
1537+
uninterleave2 (vin[2 * (0 * dk + k) + 0], vin[2 * (0 * dk + k) + 1], vout[k * 8 + 0], vout[k * 8 + 1]);
1538+
uninterleave2 (vin[2 * (2 * dk + k) + 0], vin[2 * (2 * dk + k) + 1], vout[k * 8 + 4], vout[k * 8 + 5]);
15371539
}
15381540
unreversed_copy (dk, (__m128*) (in + N / 4), (__m128*) (out + N - 6 * SIMD_SZ), -8);
15391541
unreversed_copy (dk, (__m128*) (in + 3 * N / 4), (__m128*) (out + N - 2 * SIMD_SZ), -8);
@@ -1546,15 +1548,15 @@ static void pffft_zreorder (FFT_Setup* setup, const float* in, float* out, fft_d
15461548
for (k = 0; k < Ncvec; ++k)
15471549
{
15481550
int kk = (k / 4) + (k % 4) * (Ncvec / 4);
1549-
std::tie (vout[kk * 2], vout[kk * 2 + 1]) = interleave2 (vin[k * 2], vin[k * 2 + 1]);
1551+
interleave2 (vin[k * 2], vin[k * 2 + 1], vout[kk * 2], vout[kk * 2 + 1]);
15501552
}
15511553
}
15521554
else
15531555
{
15541556
for (k = 0; k < Ncvec; ++k)
15551557
{
15561558
int kk = (k / 4) + (k % 4) * (Ncvec / 4);
1557-
std::tie (vout[k * 2], vout[k * 2 + 1]) = uninterleave2 (vin[kk * 2], vin[kk * 2 + 1]);
1559+
uninterleave2 (vin[kk * 2], vin[kk * 2 + 1], vout[k * 2], vout[k * 2 + 1]);
15581560
}
15591561
}
15601562
}
@@ -1591,7 +1593,7 @@ void pffft_transform_internal (FFT_Setup* setup, const float* finput, float* fou
15911593
__m128* tmp = buff[ib];
15921594
for (k = 0; k < Ncvec; ++k)
15931595
{
1594-
std::tie (tmp[k * 2], tmp[k * 2 + 1]) = uninterleave2 (vinput[k * 2], vinput[k * 2 + 1]);
1596+
uninterleave2 (vinput[k * 2], vinput[k * 2 + 1], tmp[k * 2], tmp[k * 2 + 1]);
15951597
}
15961598
ib = (cfftf1_ps (Ncvec, buff[ib], buff[! ib], buff[ib], setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
15971599
pffft_cplx_finalize (Ncvec, buff[ib], buff[! ib], (__m128*) setup->e);
@@ -1626,7 +1628,7 @@ void pffft_transform_internal (FFT_Setup* setup, const float* finput, float* fou
16261628
ib = (cfftf1_ps (Ncvec, buff[ib], buff[0], buff[1], setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
16271629
for (k = 0; k < Ncvec; ++k)
16281630
{
1629-
std::tie (buff[ib][k * 2], buff[ib][k * 2 + 1]) = interleave2 (buff[ib][k * 2], buff[ib][k * 2 + 1]);
1631+
interleave2 (buff[ib][k * 2], buff[ib][k * 2 + 1], buff[ib][k * 2], buff[ib][k * 2 + 1]);
16301632
}
16311633
}
16321634
}
@@ -1672,8 +1674,8 @@ void pffft_convolve_internal (FFT_Setup* setup, const float* a, const float* b,
16721674
br = vb[2 * i + 0];
16731675
bi = vb[2 * i + 1];
16741676
std::tie (ar, ai) = cplx_mul_v (ar, ai, br, bi);
1675-
vab[2 * i + 0] = _mm_add_ps (vab[2 * i + 0], _mm_mul_ps (ar, vscal));
1676-
vab[2 * i + 1] = _mm_add_ps (vab[2 * i + 1], _mm_mul_ps (ai, vscal));
1677+
vab[2 * i + 0] = _mm_add_ps (vab[2 * i + 0], _mm_mul_ps (ar, vscal));
1678+
vab[2 * i + 1] = _mm_add_ps (vab[2 * i + 1], _mm_mul_ps (ai, vscal));
16771679
ar = va[2 * i + 2];
16781680
ai = va[2 * i + 3];
16791681
br = vb[2 * i + 2];

0 commit comments

Comments
 (0)