[PATCH v3 0/5] MR10423: dsound: Speed up resampling, part 5
-- v3: dsound: Calculate rem and rem_inv incrementally. dsound: Calculate opos_num and ipos_num incrementally. dsound: Make rem_num signed. dsound: Use a 0.32 fixed point to represent the resampling ratio. https://gitlab.winehq.org/wine/wine/-/merge_requests/10423
From: Anton Baskanov <baskanov@gmail.com> --- dlls/dsound/fir.h | 4 ++++ dlls/dsound/mixer.c | 8 ++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/dlls/dsound/fir.h b/dlls/dsound/fir.h index c03d0b9f237..45ad65d7398 100644 --- a/dlls/dsound/fir.h +++ b/dlls/dsound/fir.h @@ -86,7 +86,9 @@ int main() fprintf(stderr, "q %f\n", (double)output.q); fprintf(stderr, "status %s\n", get_pm_status_str(output.status)); + printf("static const int fir_width_shift = %d;\n", fir_width_shift); printf("static const int fir_width = %d;\n", fir_width); + printf("static const int fir_step_shift = %d;\n", fir_step_shift); printf("static const int fir_step = %d;\n", fir_step); printf("static const float fir[] = {"); // Print the FIR array with an additional row at the end. This simplifies @@ -112,7 +114,9 @@ int main() printf("};\n"); } */ +static const int fir_width_shift = 6; static const int fir_width = 64; +static const int fir_step_shift = 7; static const int fir_step = 128; static const float fir[] = { 0.0000000000e+00, -2.4830013102e-06, 1.9318705150e-06, 2.6614854151e-06, diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c index 43a9a8c4f91..3003715b413 100644 --- a/dlls/dsound/mixer.c +++ b/dlls/dsound/mixer.c @@ -322,8 +322,8 @@ static void downsample(LONG64 freq_adjust_num, LONG64 freq_adjust_den, LONG64 fr /* opos is in the range [-(fir_width - 1), count) */ int opos = opos_num / freq_adjust_num - fir_width; - UINT idx_num = (freq_adjust_num - 1 - opos_num % freq_adjust_num) * fir_step; - UINT idx = idx_num / freq_adjust_num * fir_width; + UINT idx_num = (freq_adjust_num - 1 - opos_num % freq_adjust_num) << fir_step_shift; + UINT idx = (idx_num / freq_adjust_num) << fir_width_shift; float rem = idx_num % freq_adjust_num / (float)freq_adjust_num; float input_value = input[j] * firgain; @@ -345,8 +345,8 @@ static void upsample(LONG64 freq_adjust_num, LONG64 freq_adjust_den, LONG64 freq LONG64 ipos_num = freq_acc_start + i * freq_adjust_num; UINT ipos = 
ipos_num / freq_adjust_den; - UINT idx_num = ipos_num % freq_adjust_den * fir_step; - UINT idx = (fir_step - 1 - idx_num / freq_adjust_den) * fir_width; + UINT idx_num = (ipos_num % freq_adjust_den) << fir_step_shift; + UINT idx = (fir_step - 1 - idx_num / freq_adjust_den) << fir_width_shift; float rem_inv = idx_num % freq_adjust_den / (float)freq_adjust_den; float rem = 1.0f - rem_inv; -- GitLab https://gitlab.winehq.org/wine/wine/-/merge_requests/10423
From: Anton Baskanov <baskanov@gmail.com> --- dlls/dsound/mixer.c | 46 ++++++++++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c index 3003715b413..97b2be738b8 100644 --- a/dlls/dsound/mixer.c +++ b/dlls/dsound/mixer.c @@ -42,6 +42,9 @@ WINE_DEFAULT_DEBUG_CHANNEL(dsound); +#define FREQ_ADJUST_SHIFT 32 +#define FIXED_0_32_TO_FLOAT(x) ((x) * (1.0f / (1ll << 32))) + void DSOUND_RecalcVolPan(PDSVOLUMEPAN volpan) { double temp; @@ -312,19 +315,19 @@ static UINT cp_fields_noresample(IDirectSoundBufferImpl *dsb, UINT count) * Note that this function will overwrite up to fir_width - 1 frames before and * after output[]. */ -static void downsample(LONG64 freq_adjust_num, LONG64 freq_adjust_den, LONG64 freq_acc_start, - float firgain, UINT required_input, float *input, float *output) +static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgain, + UINT required_input, float *input, float *output) { int j; for (j = 0; j < required_input; ++j) { - LONG64 opos_num = freq_adjust_den - freq_acc_start + j * freq_adjust_den + freq_adjust_num - 1; + LONG64 opos_num = freq_adjust_den - freq_acc_start + j * (LONG64)freq_adjust_den + + (1ll << FREQ_ADJUST_SHIFT) - 1; /* opos is in the range [-(fir_width - 1), count) */ - int opos = opos_num / freq_adjust_num - fir_width; + int opos = (int)(opos_num >> FREQ_ADJUST_SHIFT) - fir_width; - UINT idx_num = (freq_adjust_num - 1 - opos_num % freq_adjust_num) << fir_step_shift; - UINT idx = (idx_num / freq_adjust_num) << fir_width_shift; - float rem = idx_num % freq_adjust_num / (float)freq_adjust_num; + UINT idx = ~(DWORD)opos_num >> (FREQ_ADJUST_SHIFT - fir_step_shift) << fir_width_shift; + float rem = FIXED_0_32_TO_FLOAT(~(DWORD)opos_num << fir_step_shift); float input_value = input[j] * firgain; float input_value0 = (1.0f - rem) * input_value; @@ -336,18 +339,17 @@ static void downsample(LONG64 freq_adjust_num, LONG64 
freq_adjust_den, LONG64 fr } } -static void upsample(LONG64 freq_adjust_num, LONG64 freq_adjust_den, LONG64 freq_acc_start, - UINT count, float *input, float *output) +static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, float *input, + float *output) { UINT i; for(i = 0; i < count; ++i) { - LONG64 ipos_num = freq_acc_start + i * freq_adjust_num; - UINT ipos = ipos_num / freq_adjust_den; + LONG64 ipos_num = freq_acc_start + i * (LONG64)freq_adjust_num; + UINT ipos = ipos_num >> FREQ_ADJUST_SHIFT; - UINT idx_num = (ipos_num % freq_adjust_den) << fir_step_shift; - UINT idx = (fir_step - 1 - idx_num / freq_adjust_den) << fir_width_shift; - float rem_inv = idx_num % freq_adjust_den / (float)freq_adjust_den; + UINT idx = ~(DWORD)ipos_num >> (FREQ_ADJUST_SHIFT - fir_step_shift) << fir_width_shift; + float rem_inv = FIXED_0_32_TO_FLOAT((DWORD)ipos_num << fir_step_shift); float rem = 1.0f - rem_inv; int j; @@ -368,11 +370,21 @@ static void resample(LONG64 freq_adjust_num, LONG64 freq_adjust_den, LONG64 freq float firgain, UINT required_input, UINT count, float *input, float *output) { if (freq_adjust_num > freq_adjust_den) { + /* Convert the values to 0.32 fixed point. Round down to prevent output + * buffer overflow. */ + DWORD freq_adjust_fixed_den = (freq_adjust_den << FREQ_ADJUST_SHIFT) / freq_adjust_num; + DWORD freq_acc_fixed_start = freq_acc_start * freq_adjust_fixed_den / freq_adjust_den; + memset(output, 0, count * sizeof(float)); - downsample(freq_adjust_num, freq_adjust_den, freq_acc_start, firgain, required_input, - input, output); + downsample(freq_adjust_fixed_den, freq_acc_fixed_start, firgain, required_input, input, + output); } else { - upsample(freq_adjust_num, freq_adjust_den, freq_acc_start, count, input, output); + /* Convert the values to 0.32 fixed point. Round down to prevent input + * buffer overflow. 
*/ + DWORD freq_adjust_fixed_num = (freq_adjust_num << FREQ_ADJUST_SHIFT) / freq_adjust_den; + DWORD freq_acc_fixed_start = (freq_acc_start << FREQ_ADJUST_SHIFT) / freq_adjust_den; + + upsample(freq_adjust_fixed_num, freq_acc_fixed_start, count, input, output); } } -- GitLab https://gitlab.winehq.org/wine/wine/-/merge_requests/10423
From: Anton Baskanov <baskanov@gmail.com> Neither x87 nor SSE accepts unsigned integers directly, so extra conversion steps are required. --- dlls/dsound/mixer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c index 97b2be738b8..61e447aa283 100644 --- a/dlls/dsound/mixer.c +++ b/dlls/dsound/mixer.c @@ -43,7 +43,7 @@ WINE_DEFAULT_DEBUG_CHANNEL(dsound); #define FREQ_ADJUST_SHIFT 32 -#define FIXED_0_32_TO_FLOAT(x) ((x) * (1.0f / (1ll << 32))) +#define FIXED_0_32_TO_FLOAT(x) ((int)((x) >> 1) * (1.0f / (1ll << 31))) void DSOUND_RecalcVolPan(PDSVOLUMEPAN volpan) { -- GitLab https://gitlab.winehq.org/wine/wine/-/merge_requests/10423
From: Anton Baskanov <baskanov@gmail.com> --- dlls/dsound/mixer.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c index 61e447aa283..d6ac7c61246 100644 --- a/dlls/dsound/mixer.c +++ b/dlls/dsound/mixer.c @@ -318,11 +318,11 @@ static UINT cp_fields_noresample(IDirectSoundBufferImpl *dsb, UINT count) static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgain, UINT required_input, float *input, float *output) { + LONG64 opos_num = freq_adjust_den - freq_acc_start + (1ll << FREQ_ADJUST_SHIFT) - 1; + DWORD opos_num_step = freq_adjust_den; int j; for (j = 0; j < required_input; ++j) { - LONG64 opos_num = freq_adjust_den - freq_acc_start + j * (LONG64)freq_adjust_den + - (1ll << FREQ_ADJUST_SHIFT) - 1; /* opos is in the range [-(fir_width - 1), count) */ int opos = (int)(opos_num >> FREQ_ADJUST_SHIFT) - fir_width; @@ -336,16 +336,19 @@ static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgai UINT i; for (i = 0; i < fir_width; ++i) output[opos + i] += fir[idx + i] * input_value0 + fir[idx + fir_width + i] * input_value1; + + opos_num += opos_num_step; } } static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, float *input, float *output) { + LONG64 ipos_num = freq_acc_start; + DWORD ipos_num_step = freq_adjust_num; UINT i; for(i = 0; i < count; ++i) { - LONG64 ipos_num = freq_acc_start + i * (LONG64)freq_adjust_num; UINT ipos = ipos_num >> FREQ_ADJUST_SHIFT; UINT idx = ~(DWORD)ipos_num >> (FREQ_ADJUST_SHIFT - fir_step_shift) << fir_width_shift; @@ -359,6 +362,8 @@ static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, fl for (j = 0; j < fir_width; j++) sum += (fir[idx + j] * rem_inv + fir[idx + j + fir_width] * rem) * cache[j]; output[i] = sum; + + ipos_num += ipos_num_step; } } -- GitLab https://gitlab.winehq.org/wine/wine/-/merge_requests/10423
From: Anton Baskanov <baskanov@gmail.com> --- dlls/dsound/mixer.c | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c index d6ac7c61246..2eaf5f697b8 100644 --- a/dlls/dsound/mixer.c +++ b/dlls/dsound/mixer.c @@ -318,16 +318,21 @@ static UINT cp_fields_noresample(IDirectSoundBufferImpl *dsb, UINT count) static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgain, UINT required_input, float *input, float *output) { - LONG64 opos_num = freq_adjust_den - freq_acc_start + (1ll << FREQ_ADJUST_SHIFT) - 1; - DWORD opos_num_step = freq_adjust_den; + /* Clear the lower bits of opos_num and opos_num_step to keep rem in perfect + * sync with the lower part of opos_num. As rem is always less than 2, its + * fractional part has 23 bits in the worst case. */ + LONG64 opos_num_mask = ~0ull << (FREQ_ADJUST_SHIFT - 23 - fir_step_shift); + LONG64 opos_num = (freq_adjust_den - freq_acc_start + (1ll << FREQ_ADJUST_SHIFT) - 1) & opos_num_mask; + DWORD opos_num_step = freq_adjust_den & (DWORD)opos_num_mask; + + float rem = FIXED_0_32_TO_FLOAT(((DWORD)opos_num ^ (DWORD)opos_num_mask) << fir_step_shift); + float rem_step = FIXED_0_32_TO_FLOAT(-opos_num_step << fir_step_shift); int j; for (j = 0; j < required_input; ++j) { /* opos is in the range [-(fir_width - 1), count) */ int opos = (int)(opos_num >> FREQ_ADJUST_SHIFT) - fir_width; - UINT idx = ~(DWORD)opos_num >> (FREQ_ADJUST_SHIFT - fir_step_shift) << fir_width_shift; - float rem = FIXED_0_32_TO_FLOAT(~(DWORD)opos_num << fir_step_shift); float input_value = input[j] * firgain; float input_value0 = (1.0f - rem) * input_value; @@ -337,6 +342,9 @@ static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgai for (i = 0; i < fir_width; ++i) output[opos + i] += fir[idx + i] * input_value0 + fir[idx + fir_width + i] * input_value1; + rem += rem_step; + rem -= rem >= 1.0f ? 
1.0f : 0.0f; + opos_num += opos_num_step; } } @@ -344,15 +352,20 @@ static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgai static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, float *input, float *output) { - LONG64 ipos_num = freq_acc_start; - DWORD ipos_num_step = freq_adjust_num; + /* Clear the lower bits of ipos_num and ipos_num_step to keep rem_inv in + * perfect sync with the lower part of ipos_num. As rem_inv is always less + * than 2, its fractional part has 23 bits in the worst case. */ + DWORD ipos_num_mask = ~0u << (FREQ_ADJUST_SHIFT - 23 - fir_step_shift); + LONG64 ipos_num = freq_acc_start & ipos_num_mask; + DWORD ipos_num_step = freq_adjust_num & ipos_num_mask; + + float rem_inv = FIXED_0_32_TO_FLOAT((DWORD)ipos_num << fir_step_shift); + float rem_inv_step = FIXED_0_32_TO_FLOAT(ipos_num_step << fir_step_shift); UINT i; for(i = 0; i < count; ++i) { UINT ipos = ipos_num >> FREQ_ADJUST_SHIFT; - UINT idx = ~(DWORD)ipos_num >> (FREQ_ADJUST_SHIFT - fir_step_shift) << fir_width_shift; - float rem_inv = FIXED_0_32_TO_FLOAT((DWORD)ipos_num << fir_step_shift); float rem = 1.0f - rem_inv; int j; @@ -363,6 +376,9 @@ static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, fl sum += (fir[idx + j] * rem_inv + fir[idx + j + fir_width] * rem) * cache[j]; output[i] = sum; + rem_inv += rem_inv_step; + rem_inv -= rem_inv >= 1.0f ? 1.0f : 0.0f; + ipos_num += ipos_num_step; } } -- GitLab https://gitlab.winehq.org/wine/wine/-/merge_requests/10423
On Thu Apr 2 04:48:39 2026 +0000, Anton Baskanov wrote:
> changed this line in [version 3 of the diff](/wine/wine/-/merge_requests/10423/diffs?diff_id=257011&start_sha=8c07af7c4f32157993f5c7bf24f762a6893646ff#3611483beebc594b1206bbcc979e9584d27bf7ce_328_328)

Surprisingly, yes. The SSE version with the incremental factor calculation is faster than AVX+FMA3 without it (at least on Zen+).
-- https://gitlab.winehq.org/wine/wine/-/merge_requests/10423#note_134751
On Tue Mar 31 12:16:08 2026 +0000, Matteo Bruni wrote:
> Smart! I'd have `#define`s for these conversion numbers as well, maybe something like:
> ```
> #define POS_FRAC_TO_SIGNED_SHIFT 1
> #define POS_FRAC_SIGNED_MAX (1ll << 31)
> ```
> I expect the resulting code to be more verbose but also quite a bit more self-explanatory.

Added a macro for fixed point to float conversion.
-- https://gitlab.winehq.org/wine/wine/-/merge_requests/10423#note_134752
On Thu Apr 2 04:48:39 2026 +0000, Anton Baskanov wrote:
> changed this line in [version 3 of the diff](/wine/wine/-/merge_requests/10423/diffs?diff_id=257011&start_sha=8c07af7c4f32157993f5c7bf24f762a6893646ff#3611483beebc594b1206bbcc979e9584d27bf7ce_333_334)

There are two issues with storing the position as a fixed point:
- `downsample` takes a reciprocal of the resampling ratio, so we'd have to convert anyway.
- There are apps that expect the resampling ratio to be exact (see #35971 and 6d009b988b22212fabcc34711c40ec776fecaec5).

Anyway, added a `#define` for `32`.
-- https://gitlab.winehq.org/wine/wine/-/merge_requests/10423#note_134753
v3: - Added a `FREQ_ADJUST_SHIFT` define for `32`. - Added a `FIXED_0_32_TO_FLOAT` function-like macro. -- https://gitlab.winehq.org/wine/wine/-/merge_requests/10423#note_134755
On Fri Apr 3 22:13:10 2026 +0000, Anton Baskanov wrote:
> There are two issues with storing the position as a fixed point:
> - `downsample` takes a reciprocal of the resampling ratio, so we'd have to convert anyway.
> - There are apps that expect the resampling ratio to be exact (see #35971 and 6d009b988b22212fabcc34711c40ec776fecaec5).
>
> Anyway, added a `#define` for `32`.

Ah, right, `downsample()` now wants the reciprocal. Unfortunate, but reasonable.
-- https://gitlab.winehq.org/wine/wine/-/merge_requests/10423#note_135002
On Fri Apr 3 22:13:11 2026 +0000, Anton Baskanov wrote:
> Added a macro for fixed point to float conversion.

I like it!
-- https://gitlab.winehq.org/wine/wine/-/merge_requests/10423#note_135003
On Thu Apr 2 04:54:32 2026 +0000, Anton Baskanov wrote:
> Surprisingly, yes. The SSE version with the incremental factor calculation is faster than AVX+FMA3 without it (at least on Zen+).

Very interesting. Can you please add that to the patch as a small comment?
It might be nice to mention what the objective of this "forced sync" is, why we can do it (i.e. the mask bits are chosen so that the cumulative error to `opos_num` doesn't affect the `opos` and `idx` values computed in the loop), and where that "23" comes from. Maybe it should be obvious, but I'd err on the side of overexplaining rather than the opposite, at least in this case. Also, I'd favor a note on the `^ (DWORD)opos_num_mask` part; to me that's not immediately trivial (especially as it will appear after this patch goes in, i.e. without the previous form of the `rem` expression available to compare).
-- https://gitlab.winehq.org/wine/wine/-/merge_requests/10423#note_135004
participants (3)
-
Anton Baskanov -
Anton Baskanov (@baskanov) -
Matteo Bruni (@Mystral)