[PATCH 0/5] MR10423: dsound: Speed up resampling, part 5
From: Anton Baskanov <baskanov@gmail.com> --- dlls/dsound/fir.h | 4 ++++ dlls/dsound/mixer.c | 8 ++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/dlls/dsound/fir.h b/dlls/dsound/fir.h index c03d0b9f237..45ad65d7398 100644 --- a/dlls/dsound/fir.h +++ b/dlls/dsound/fir.h @@ -86,7 +86,9 @@ int main() fprintf(stderr, "q %f\n", (double)output.q); fprintf(stderr, "status %s\n", get_pm_status_str(output.status)); + printf("static const int fir_width_shift = %d;\n", fir_width_shift); printf("static const int fir_width = %d;\n", fir_width); + printf("static const int fir_step_shift = %d;\n", fir_step_shift); printf("static const int fir_step = %d;\n", fir_step); printf("static const float fir[] = {"); // Print the FIR array with an additional row at the end. This simplifies @@ -112,7 +114,9 @@ int main() printf("};\n"); } */ +static const int fir_width_shift = 6; static const int fir_width = 64; +static const int fir_step_shift = 7; static const int fir_step = 128; static const float fir[] = { 0.0000000000e+00, -2.4830013102e-06, 1.9318705150e-06, 2.6614854151e-06, diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c index 43a9a8c4f91..3003715b413 100644 --- a/dlls/dsound/mixer.c +++ b/dlls/dsound/mixer.c @@ -322,8 +322,8 @@ static void downsample(LONG64 freq_adjust_num, LONG64 freq_adjust_den, LONG64 fr /* opos is in the range [-(fir_width - 1), count) */ int opos = opos_num / freq_adjust_num - fir_width; - UINT idx_num = (freq_adjust_num - 1 - opos_num % freq_adjust_num) * fir_step; - UINT idx = idx_num / freq_adjust_num * fir_width; + UINT idx_num = (freq_adjust_num - 1 - opos_num % freq_adjust_num) << fir_step_shift; + UINT idx = (idx_num / freq_adjust_num) << fir_width_shift; float rem = idx_num % freq_adjust_num / (float)freq_adjust_num; float input_value = input[j] * firgain; @@ -345,8 +345,8 @@ static void upsample(LONG64 freq_adjust_num, LONG64 freq_adjust_den, LONG64 freq LONG64 ipos_num = freq_acc_start + i * freq_adjust_num; UINT ipos = 
ipos_num / freq_adjust_den; - UINT idx_num = ipos_num % freq_adjust_den * fir_step; - UINT idx = (fir_step - 1 - idx_num / freq_adjust_den) * fir_width; + UINT idx_num = (ipos_num % freq_adjust_den) << fir_step_shift; + UINT idx = (fir_step - 1 - idx_num / freq_adjust_den) << fir_width_shift; float rem_inv = idx_num % freq_adjust_den / (float)freq_adjust_den; float rem = 1.0f - rem_inv; -- GitLab https://gitlab.winehq.org/wine/wine/-/merge_requests/10423
From: Anton Baskanov <baskanov@gmail.com> --- dlls/dsound/mixer.c | 44 +++++++++++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c index 3003715b413..530b8e23d69 100644 --- a/dlls/dsound/mixer.c +++ b/dlls/dsound/mixer.c @@ -312,19 +312,19 @@ static UINT cp_fields_noresample(IDirectSoundBufferImpl *dsb, UINT count) * Note that this function will overwrite up to fir_width - 1 frames before and * after output[]. */ -static void downsample(LONG64 freq_adjust_num, LONG64 freq_adjust_den, LONG64 freq_acc_start, - float firgain, UINT required_input, float *input, float *output) +static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgain, + UINT required_input, float *input, float *output) { int j; for (j = 0; j < required_input; ++j) { - LONG64 opos_num = freq_adjust_den - freq_acc_start + j * freq_adjust_den + freq_adjust_num - 1; + LONG64 opos_num = freq_adjust_den - freq_acc_start + j * (LONG64)freq_adjust_den + ~0u; /* opos is in the range [-(fir_width - 1), count) */ - int opos = opos_num / freq_adjust_num - fir_width; + int opos = (int)(opos_num >> 32) - fir_width; - UINT idx_num = (freq_adjust_num - 1 - opos_num % freq_adjust_num) << fir_step_shift; - UINT idx = (idx_num / freq_adjust_num) << fir_width_shift; - float rem = idx_num % freq_adjust_num / (float)freq_adjust_num; + UINT idx = ~(DWORD)opos_num >> (32 - fir_step_shift) << fir_width_shift; + UINT rem_num = ~(DWORD)opos_num << fir_step_shift; + float rem = rem_num * (1.0f / (1ll << 32)); float input_value = input[j] * firgain; float input_value0 = (1.0f - rem) * input_value; @@ -336,18 +336,18 @@ static void downsample(LONG64 freq_adjust_num, LONG64 freq_adjust_den, LONG64 fr } } -static void upsample(LONG64 freq_adjust_num, LONG64 freq_adjust_den, LONG64 freq_acc_start, - UINT count, float *input, float *output) +static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, float 
*input, + float *output) { UINT i; for(i = 0; i < count; ++i) { - LONG64 ipos_num = freq_acc_start + i * freq_adjust_num; - UINT ipos = ipos_num / freq_adjust_den; + LONG64 ipos_num = freq_acc_start + i * (LONG64)freq_adjust_num; + UINT ipos = ipos_num >> 32; - UINT idx_num = (ipos_num % freq_adjust_den) << fir_step_shift; - UINT idx = (fir_step - 1 - idx_num / freq_adjust_den) << fir_width_shift; - float rem_inv = idx_num % freq_adjust_den / (float)freq_adjust_den; + UINT idx = ~(DWORD)ipos_num >> (32 - fir_step_shift) << fir_width_shift; + UINT rem_num = (DWORD)ipos_num << fir_step_shift; + float rem_inv = rem_num * (1.0f / (1ll << 32)); float rem = 1.0f - rem_inv; int j; @@ -368,11 +368,21 @@ static void resample(LONG64 freq_adjust_num, LONG64 freq_adjust_den, LONG64 freq float firgain, UINT required_input, UINT count, float *input, float *output) { if (freq_adjust_num > freq_adjust_den) { + /* Convert the values to 0.32 fixed point. Round down to prevent output + * buffer overflow. */ + DWORD freq_adjust_fixed_den = (freq_adjust_den << 32) / freq_adjust_num; + DWORD freq_acc_fixed_start = freq_acc_start * freq_adjust_fixed_den / freq_adjust_den; + memset(output, 0, count * sizeof(float)); - downsample(freq_adjust_num, freq_adjust_den, freq_acc_start, firgain, required_input, - input, output); + downsample(freq_adjust_fixed_den, freq_acc_fixed_start, firgain, required_input, input, + output); } else { - upsample(freq_adjust_num, freq_adjust_den, freq_acc_start, count, input, output); + /* Convert the values to 0.32 fixed point. Round down to prevent input + * buffer overflow. */ + DWORD freq_adjust_fixed_num = (freq_adjust_num << 32) / freq_adjust_den; + DWORD freq_acc_fixed_start = (freq_acc_start << 32) / freq_adjust_den; + + upsample(freq_adjust_fixed_num, freq_acc_fixed_start, count, input, output); } } -- GitLab https://gitlab.winehq.org/wine/wine/-/merge_requests/10423
From: Anton Baskanov <baskanov@gmail.com> Both x87 and SSE don't accept unsigned integers directly, requiring extra conversion steps. --- dlls/dsound/mixer.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c index 530b8e23d69..fefc0d2fe05 100644 --- a/dlls/dsound/mixer.c +++ b/dlls/dsound/mixer.c @@ -323,8 +323,8 @@ static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgai int opos = (int)(opos_num >> 32) - fir_width; UINT idx = ~(DWORD)opos_num >> (32 - fir_step_shift) << fir_width_shift; - UINT rem_num = ~(DWORD)opos_num << fir_step_shift; - float rem = rem_num * (1.0f / (1ll << 32)); + int rem_num = ~(DWORD)opos_num << fir_step_shift >> 1; + float rem = rem_num * (1.0f / (1ll << 31)); float input_value = input[j] * firgain; float input_value0 = (1.0f - rem) * input_value; @@ -346,8 +346,8 @@ static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, fl UINT ipos = ipos_num >> 32; UINT idx = ~(DWORD)ipos_num >> (32 - fir_step_shift) << fir_width_shift; - UINT rem_num = (DWORD)ipos_num << fir_step_shift; - float rem_inv = rem_num * (1.0f / (1ll << 32)); + int rem_num = (DWORD)ipos_num << fir_step_shift >> 1; + float rem_inv = rem_num * (1.0f / (1ll << 31)); float rem = 1.0f - rem_inv; int j; -- GitLab https://gitlab.winehq.org/wine/wine/-/merge_requests/10423
From: Anton Baskanov <baskanov@gmail.com> --- dlls/dsound/mixer.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c index fefc0d2fe05..e3190535f19 100644 --- a/dlls/dsound/mixer.c +++ b/dlls/dsound/mixer.c @@ -315,10 +315,11 @@ static UINT cp_fields_noresample(IDirectSoundBufferImpl *dsb, UINT count) static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgain, UINT required_input, float *input, float *output) { + LONG64 opos_num = freq_adjust_den - freq_acc_start + (LONG64)~0u; + DWORD opos_num_step = freq_adjust_den; int j; for (j = 0; j < required_input; ++j) { - LONG64 opos_num = freq_adjust_den - freq_acc_start + j * (LONG64)freq_adjust_den + ~0u; /* opos is in the range [-(fir_width - 1), count) */ int opos = (int)(opos_num >> 32) - fir_width; @@ -333,16 +334,19 @@ static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgai UINT i; for (i = 0; i < fir_width; ++i) output[opos + i] += fir[idx + i] * input_value0 + fir[idx + fir_width + i] * input_value1; + + opos_num += opos_num_step; } } static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, float *input, float *output) { + LONG64 ipos_num = freq_acc_start; + DWORD ipos_num_step = freq_adjust_num; UINT i; for(i = 0; i < count; ++i) { - LONG64 ipos_num = freq_acc_start + i * (LONG64)freq_adjust_num; UINT ipos = ipos_num >> 32; UINT idx = ~(DWORD)ipos_num >> (32 - fir_step_shift) << fir_width_shift; @@ -357,6 +361,8 @@ static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, fl for (j = 0; j < fir_width; j++) sum += (fir[idx + j] * rem_inv + fir[idx + j + fir_width] * rem) * cache[j]; output[i] = sum; + + ipos_num += ipos_num_step; } } -- GitLab https://gitlab.winehq.org/wine/wine/-/merge_requests/10423
From: Anton Baskanov <baskanov@gmail.com> --- dlls/dsound/mixer.c | 38 ++++++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c index e3190535f19..03e3042f349 100644 --- a/dlls/dsound/mixer.c +++ b/dlls/dsound/mixer.c @@ -315,17 +315,23 @@ static UINT cp_fields_noresample(IDirectSoundBufferImpl *dsb, UINT count) static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgain, UINT required_input, float *input, float *output) { - LONG64 opos_num = freq_adjust_den - freq_acc_start + (LONG64)~0u; - DWORD opos_num_step = freq_adjust_den; + /* Clear the lower bits of opos_num and opos_num_step to keep rem in perfect + * sync with the lower part of opos_num. As rem is always less than 2, its + * fractional part has 23 bits in the worst case. */ + LONG64 opos_num_mask = ~0ull << (32 - 23 - fir_step_shift); + LONG64 opos_num = (freq_adjust_den - freq_acc_start + (LONG64)~0u) & opos_num_mask; + DWORD opos_num_step = freq_adjust_den & (DWORD)opos_num_mask; + + DWORD rem_num = ((DWORD)opos_num ^ (DWORD)opos_num_mask) << fir_step_shift >> 1; + DWORD rem_step_num = -opos_num_step << fir_step_shift >> 1; + float rem = rem_num * (1.0f / (1ll << 31)); + float rem_step = rem_step_num * (1.0f / (1ll << 31)); int j; for (j = 0; j < required_input; ++j) { /* opos is in the range [-(fir_width - 1), count) */ int opos = (int)(opos_num >> 32) - fir_width; - UINT idx = ~(DWORD)opos_num >> (32 - fir_step_shift) << fir_width_shift; - int rem_num = ~(DWORD)opos_num << fir_step_shift >> 1; - float rem = rem_num * (1.0f / (1ll << 31)); float input_value = input[j] * firgain; float input_value0 = (1.0f - rem) * input_value; @@ -335,6 +341,9 @@ static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgai for (i = 0; i < fir_width; ++i) output[opos + i] += fir[idx + i] * input_value0 + fir[idx + fir_width + i] * input_value1; + rem += rem_step; + rem -= rem >= 1.0f ? 
1.0f : 0.0f; + opos_num += opos_num_step; } } @@ -342,16 +351,22 @@ static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgai static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, float *input, float *output) { - LONG64 ipos_num = freq_acc_start; - DWORD ipos_num_step = freq_adjust_num; + /* Clear the lower bits of ipos_num and ipos_num_step to keep rem_inv in + * perfect sync with the lower part of ipos_num. As rem is always less than + * 2, its fractional part has 23 bits in the worst case. */ + DWORD ipos_num_mask = ~0u << (32 - 23 - fir_step_shift); + LONG64 ipos_num = freq_acc_start & ipos_num_mask; + DWORD ipos_num_step = freq_adjust_num & ipos_num_mask; + + DWORD rem_num = (DWORD)ipos_num << fir_step_shift >> 1; + DWORD rem_step_num = ipos_num_step << fir_step_shift >> 1; + float rem_inv = rem_num * (1.0f / (1ll << 31)); + float rem_inv_step = rem_step_num * (1.0f / (1ll << 31)); UINT i; for(i = 0; i < count; ++i) { UINT ipos = ipos_num >> 32; - UINT idx = ~(DWORD)ipos_num >> (32 - fir_step_shift) << fir_width_shift; - int rem_num = (DWORD)ipos_num << fir_step_shift >> 1; - float rem_inv = rem_num * (1.0f / (1ll << 31)); float rem = 1.0f - rem_inv; int j; @@ -362,6 +377,9 @@ static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, fl sum += (fir[idx + j] * rem_inv + fir[idx + j + fir_width] * rem) * cache[j]; output[i] = sum; + rem_inv += rem_inv_step; + rem_inv -= rem_inv > 1.0f ? 1.0f : 0.0f; + ipos_num += ipos_num_step; } } -- GitLab https://gitlab.winehq.org/wine/wine/-/merge_requests/10423
Just thinking out loud / asking: how does this method compare to using a polyphase filter bank (PFB) approach for rational resampling? Specifically, decomposing into a PFB and then using SIMD-accelerated dot products with the filter bank taps. That’s the approach used in several DSP libraries and tends to map quite well to modern CPU architectures. For reference: - SciPy's [`scipy.signal.resample_poly`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.resample_p...) - GNU Radio's [Rational Resampler](https://wiki.gnuradio.org/index.php?title=Rational_Resampler) - FutureSDR's [PolyphaseResamplingFirKernel](https://docs.rs/futuredsp/0.0.6/futuredsp/fir/struct.PolyphaseResamplingFirK...) - [SampleRateConverter](https://github.com/zephray/SampleRateConverter) - liquid-dsp's [rresamp](https://liquidsdr.org/api/rresamp_crcf/) Curious if this was considered and how it compares in terms of performance/complexity. -- https://gitlab.winehq.org/wine/wine/-/merge_requests/10423#note_133256
On Tue Mar 24 05:35:49 2026 +0000, Sergi Granell Escalfet wrote:
> Just thinking out loud / asking: how does this method compare to using a polyphase filter bank (PFB) approach for rational resampling? Specifically, decomposing into a PFB and then using SIMD-accelerated dot products with the filter bank taps. That’s the approach used in several DSP libraries and tends to map quite well to modern CPU architectures.
>
> For reference:
> - SciPy's [`scipy.signal.resample_poly`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.resample_p...)
> - GNU Radio's [Rational Resampler](https://wiki.gnuradio.org/index.php?title=Rational_Resampler)
> - FutureSDR's [PolyphaseResamplingFirKernel](https://docs.rs/futuredsp/0.0.6/futuredsp/fir/struct.PolyphaseResamplingFirK...)
> - [SampleRateConverter](https://github.com/zephray/SampleRateConverter)
> - liquid-dsp's [rresamp](https://liquidsdr.org/api/rresamp_crcf/)
>
> Curious if this was considered and how it compares in terms of performance/complexity.

I didn't compare the performance, but I expect PFB to be a bit faster. The downside is that it requires precomputing and storing a coefficient table, and these can be quite large, especially for irreducible resampling ratios (e.g. 47999 -> 48000 would require a 48000 * 64 * 4 = 12288000 ≈ 11.7 MiB table). And the table would have to be recomputed on every frequency change.
Our current approach involves storing a fixed bank of 128 filters and calculating the coefficients on the fly by linearly interpolating between them. This allows us to use a static 32.2 KiB table at a small additional cost of the linear interpolation. Our algorithm is also SIMD-friendly, and I'm going to add an SSE version in the next MR. -- https://gitlab.winehq.org/wine/wine/-/merge_requests/10423#note_133500
> And the table would have to be recomputed on every frequency change.

For the record, that alone would make it unsuitable for dsound. Applications can and do change buffer frequency very often, and additionally 3d sound buffers depend on changing playback frequency every frame for the Doppler effect. -- https://gitlab.winehq.org/wine/wine/-/merge_requests/10423#note_133632
participants (3)
- Anton Baskanov
- Anton Baskanov (@baskanov)
- Matteo Bruni (@Mystral)
- Sergi Granell Escalfet (@xerpi)