[PATCH v4 0/5] MR10423: dsound: Speed up resampling, part 5
-- v4: dsound: Calculate rem and rem_inv incrementally. dsound: Calculate opos_num and ipos_num incrementally. dsound: Make rem_num signed. dsound: Use a 0.32 fixed point to represent the resampling ratio. https://gitlab.winehq.org/wine/wine/-/merge_requests/10423
From: Anton Baskanov <baskanov@gmail.com> --- dlls/dsound/fir.h | 4 ++++ dlls/dsound/mixer.c | 8 ++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/dlls/dsound/fir.h b/dlls/dsound/fir.h index c03d0b9f237..45ad65d7398 100644 --- a/dlls/dsound/fir.h +++ b/dlls/dsound/fir.h @@ -86,7 +86,9 @@ int main() fprintf(stderr, "q %f\n", (double)output.q); fprintf(stderr, "status %s\n", get_pm_status_str(output.status)); + printf("static const int fir_width_shift = %d;\n", fir_width_shift); printf("static const int fir_width = %d;\n", fir_width); + printf("static const int fir_step_shift = %d;\n", fir_step_shift); printf("static const int fir_step = %d;\n", fir_step); printf("static const float fir[] = {"); // Print the FIR array with an additional row at the end. This simplifies @@ -112,7 +114,9 @@ int main() printf("};\n"); } */ +static const int fir_width_shift = 6; static const int fir_width = 64; +static const int fir_step_shift = 7; static const int fir_step = 128; static const float fir[] = { 0.0000000000e+00, -2.4830013102e-06, 1.9318705150e-06, 2.6614854151e-06, diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c index 43a9a8c4f91..3003715b413 100644 --- a/dlls/dsound/mixer.c +++ b/dlls/dsound/mixer.c @@ -322,8 +322,8 @@ static void downsample(LONG64 freq_adjust_num, LONG64 freq_adjust_den, LONG64 fr /* opos is in the range [-(fir_width - 1), count) */ int opos = opos_num / freq_adjust_num - fir_width; - UINT idx_num = (freq_adjust_num - 1 - opos_num % freq_adjust_num) * fir_step; - UINT idx = idx_num / freq_adjust_num * fir_width; + UINT idx_num = (freq_adjust_num - 1 - opos_num % freq_adjust_num) << fir_step_shift; + UINT idx = (idx_num / freq_adjust_num) << fir_width_shift; float rem = idx_num % freq_adjust_num / (float)freq_adjust_num; float input_value = input[j] * firgain; @@ -345,8 +345,8 @@ static void upsample(LONG64 freq_adjust_num, LONG64 freq_adjust_den, LONG64 freq LONG64 ipos_num = freq_acc_start + i * freq_adjust_num; UINT ipos = ipos_num / freq_adjust_den; - UINT idx_num = ipos_num % freq_adjust_den * fir_step; - UINT idx = (fir_step - 1 - idx_num / freq_adjust_den) * fir_width; + UINT idx_num = (ipos_num % freq_adjust_den) << fir_step_shift; + UINT idx = (fir_step - 1 - idx_num / freq_adjust_den) << fir_width_shift; float rem_inv = idx_num % freq_adjust_den / (float)freq_adjust_den; float rem = 1.0f - rem_inv; -- GitLab https://gitlab.winehq.org/wine/wine/-/merge_requests/10423
From: Anton Baskanov <baskanov@gmail.com> --- dlls/dsound/mixer.c | 51 ++++++++++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 17 deletions(-) diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c index 3003715b413..20921312356 100644 --- a/dlls/dsound/mixer.c +++ b/dlls/dsound/mixer.c @@ -42,6 +42,9 @@ WINE_DEFAULT_DEBUG_CHANNEL(dsound); +#define FREQ_ADJUST_SHIFT 32 +#define FIXED_0_32_TO_FLOAT(x) ((x) * (1.0f / (1ll << 32))) + void DSOUND_RecalcVolPan(PDSVOLUMEPAN volpan) { double temp; @@ -312,19 +315,19 @@ static UINT cp_fields_noresample(IDirectSoundBufferImpl *dsb, UINT count) * Note that this function will overwrite up to fir_width - 1 frames before and * after output[]. */ -static void downsample(LONG64 freq_adjust_num, LONG64 freq_adjust_den, LONG64 freq_acc_start, - float firgain, UINT required_input, float *input, float *output) +static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgain, + UINT required_input, float *input, float *output) { int j; for (j = 0; j < required_input; ++j) { - LONG64 opos_num = freq_adjust_den - freq_acc_start + j * freq_adjust_den + freq_adjust_num - 1; + LONG64 opos_num = freq_adjust_den - freq_acc_start + j * (LONG64)freq_adjust_den + + (1ll << FREQ_ADJUST_SHIFT) - 1; /* opos is in the range [-(fir_width - 1), count) */ - int opos = opos_num / freq_adjust_num - fir_width; + int opos = (int)(opos_num >> FREQ_ADJUST_SHIFT) - fir_width; - UINT idx_num = (freq_adjust_num - 1 - opos_num % freq_adjust_num) << fir_step_shift; - UINT idx = (idx_num / freq_adjust_num) << fir_width_shift; - float rem = idx_num % freq_adjust_num / (float)freq_adjust_num; + UINT idx = ~(DWORD)opos_num >> (FREQ_ADJUST_SHIFT - fir_step_shift) << fir_width_shift; + float rem = FIXED_0_32_TO_FLOAT(~(DWORD)opos_num << fir_step_shift); float input_value = input[j] * firgain; float input_value0 = (1.0f - rem) * input_value; @@ -336,18 +339,17 @@ static void downsample(LONG64 freq_adjust_num, LONG64 freq_adjust_den, LONG64 fr } } -static void upsample(LONG64 freq_adjust_num, LONG64 freq_adjust_den, LONG64 freq_acc_start, - UINT count, float *input, float *output) +static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, float *input, + float *output) { UINT i; for(i = 0; i < count; ++i) { - LONG64 ipos_num = freq_acc_start + i * freq_adjust_num; - UINT ipos = ipos_num / freq_adjust_den; + LONG64 ipos_num = freq_acc_start + i * (LONG64)freq_adjust_num; + UINT ipos = ipos_num >> FREQ_ADJUST_SHIFT; - UINT idx_num = (ipos_num % freq_adjust_den) << fir_step_shift; - UINT idx = (fir_step - 1 - idx_num / freq_adjust_den) << fir_width_shift; - float rem_inv = idx_num % freq_adjust_den / (float)freq_adjust_den; + UINT idx = ~(DWORD)ipos_num >> (FREQ_ADJUST_SHIFT - fir_step_shift) << fir_width_shift; + float rem_inv = FIXED_0_32_TO_FLOAT((DWORD)ipos_num << fir_step_shift); float rem = 1.0f - rem_inv; int j; @@ -368,11 +370,26 @@ static void resample(LONG64 freq_adjust_num, LONG64 freq_adjust_den, LONG64 freq float firgain, UINT required_input, UINT count, float *input, float *output) { if (freq_adjust_num > freq_adjust_den) { + /* Take a reciprocal of the resampling ratio and convert it to a 0.32 + * fixed point. Round down to prevent output buffer overflow. */ + DWORD freq_adjust_fixed_den = (freq_adjust_den << FREQ_ADJUST_SHIFT) / freq_adjust_num; + /* Convert the subsample position to a 0.32 fixed point. Round up to + * prevent output buffer overflow. */ + DWORD freq_acc_fixed_start = (freq_acc_start * freq_adjust_fixed_den + freq_adjust_den - 1) + / freq_adjust_den; + memset(output, 0, count * sizeof(float)); - downsample(freq_adjust_num, freq_adjust_den, freq_acc_start, firgain, required_input, - input, output); + downsample(freq_adjust_fixed_den, freq_acc_fixed_start, firgain, required_input, input, + output); } else { - upsample(freq_adjust_num, freq_adjust_den, freq_acc_start, count, input, output); + /* Convert the resampling ratio to a 0.32 fixed point. Round down to + * prevent input buffer overflow. */ + DWORD freq_adjust_fixed_num = (freq_adjust_num << FREQ_ADJUST_SHIFT) / freq_adjust_den; + /* Convert the subsample position to a 0.32 fixed point. Round down to + * prevent input buffer overflow. */ + DWORD freq_acc_fixed_start = (freq_acc_start << FREQ_ADJUST_SHIFT) / freq_adjust_den; + + upsample(freq_adjust_fixed_num, freq_acc_fixed_start, count, input, output); } } -- GitLab https://gitlab.winehq.org/wine/wine/-/merge_requests/10423
From: Anton Baskanov <baskanov@gmail.com> Both x87 and SSE don't accept unsigned integers directly, requiring extra conversion steps. --- dlls/dsound/mixer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c index 20921312356..da8395c9207 100644 --- a/dlls/dsound/mixer.c +++ b/dlls/dsound/mixer.c @@ -43,7 +43,7 @@ WINE_DEFAULT_DEBUG_CHANNEL(dsound); #define FREQ_ADJUST_SHIFT 32 -#define FIXED_0_32_TO_FLOAT(x) ((x) * (1.0f / (1ll << 32))) +#define FIXED_0_32_TO_FLOAT(x) ((int)((x) >> 1) * (1.0f / (1ll << 31))) void DSOUND_RecalcVolPan(PDSVOLUMEPAN volpan) { -- GitLab https://gitlab.winehq.org/wine/wine/-/merge_requests/10423
From: Anton Baskanov <baskanov@gmail.com> --- dlls/dsound/mixer.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c index da8395c9207..0d979f69898 100644 --- a/dlls/dsound/mixer.c +++ b/dlls/dsound/mixer.c @@ -318,11 +318,11 @@ static UINT cp_fields_noresample(IDirectSoundBufferImpl *dsb, UINT count) static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgain, UINT required_input, float *input, float *output) { + LONG64 opos_num = freq_adjust_den - freq_acc_start + (1ll << FREQ_ADJUST_SHIFT) - 1; + DWORD opos_num_step = freq_adjust_den; int j; for (j = 0; j < required_input; ++j) { - LONG64 opos_num = freq_adjust_den - freq_acc_start + j * (LONG64)freq_adjust_den + - (1ll << FREQ_ADJUST_SHIFT) - 1; /* opos is in the range [-(fir_width - 1), count) */ int opos = (int)(opos_num >> FREQ_ADJUST_SHIFT) - fir_width; @@ -336,16 +336,19 @@ static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgai UINT i; for (i = 0; i < fir_width; ++i) output[opos + i] += fir[idx + i] * input_value0 + fir[idx + fir_width + i] * input_value1; + + opos_num += opos_num_step; } } static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, float *input, float *output) { + LONG64 ipos_num = freq_acc_start; + DWORD ipos_num_step = freq_adjust_num; UINT i; for(i = 0; i < count; ++i) { - LONG64 ipos_num = freq_acc_start + i * (LONG64)freq_adjust_num; UINT ipos = ipos_num >> FREQ_ADJUST_SHIFT; UINT idx = ~(DWORD)ipos_num >> (FREQ_ADJUST_SHIFT - fir_step_shift) << fir_width_shift; @@ -359,6 +362,8 @@ static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, fl for (j = 0; j < fir_width; j++) sum += (fir[idx + j] * rem_inv + fir[idx + j + fir_width] * rem) * cache[j]; output[i] = sum; + + ipos_num += ipos_num_step; } } -- GitLab https://gitlab.winehq.org/wine/wine/-/merge_requests/10423
From: Anton Baskanov <baskanov@gmail.com> --- dlls/dsound/mixer.c | 64 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 56 insertions(+), 8 deletions(-) diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c index 0d979f69898..bd878604436 100644 --- a/dlls/dsound/mixer.c +++ b/dlls/dsound/mixer.c @@ -318,16 +318,38 @@ static UINT cp_fields_noresample(IDirectSoundBufferImpl *dsb, UINT count) static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgain, UINT required_input, float *input, float *output) { - LONG64 opos_num = freq_adjust_den - freq_acc_start + (1ll << FREQ_ADJUST_SHIFT) - 1; - DWORD opos_num_step = freq_adjust_den; + /* Both opos_num and rem are calculated in an incremental fashion, + * independently of each other. This improves performance a bit, presumably + * because it allows the CPU to do the calculation in parallel. + * + * However, the value of rem must still be kept in perfect sync with the + * lower part of opos_num. Otherwise, even a small divergence can cause them + * to wrap around on different iterations of the outer loop, which will + * produce artifacts. + * + * To prevent this, clear the lower bits of opos_num and opos_num_step so + * that rem can always represent the calculated value exactly. As rem is + * always less than 2, its exponent is less than or equal to zero. This + * means that in the worst case, rem has the same number of fractional bits + * as the significand, which is 23 for a single-precision floating point. + * + * Clearing the bits is safe as it has the same effect as rounding up the + * resampling ratio and the subsample position and doesn't affect the + * initial opos value. */ + LONG64 opos_num_mask = ~0ull << (FREQ_ADJUST_SHIFT - 23 - fir_step_shift); + LONG64 opos_num = (freq_adjust_den - freq_acc_start + (1ll << FREQ_ADJUST_SHIFT) - 1) & opos_num_mask; + DWORD opos_num_step = freq_adjust_den & (DWORD)opos_num_mask; + + /* Use XOR to invert the lower part of opos_num so that the lower bits + * remain cleared. */ + float rem = FIXED_0_32_TO_FLOAT(((DWORD)opos_num ^ (DWORD)opos_num_mask) << fir_step_shift); + float rem_step = FIXED_0_32_TO_FLOAT(-opos_num_step << fir_step_shift); int j; for (j = 0; j < required_input; ++j) { /* opos is in the range [-(fir_width - 1), count) */ int opos = (int)(opos_num >> FREQ_ADJUST_SHIFT) - fir_width; - UINT idx = ~(DWORD)opos_num >> (FREQ_ADJUST_SHIFT - fir_step_shift) << fir_width_shift; - float rem = FIXED_0_32_TO_FLOAT(~(DWORD)opos_num << fir_step_shift); float input_value = input[j] * firgain; float input_value0 = (1.0f - rem) * input_value; @@ -337,6 +359,9 @@ static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgai for (i = 0; i < fir_width; ++i) output[opos + i] += fir[idx + i] * input_value0 + fir[idx + fir_width + i] * input_value1; + rem += rem_step; + rem -= rem >= 1.0f ? 1.0f : 0.0f; + opos_num += opos_num_step; } } @@ -344,15 +369,35 @@ static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgai static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, float *input, float *output) { - LONG64 ipos_num = freq_acc_start; - DWORD ipos_num_step = freq_adjust_num; + /* Both ipos_num and rem_inv are calculated in an incremental fashion, + * independently of each other. This improves performance a bit, presumably + * because it allows the CPU to do the calculation in parallel. + * + * However, the value of rem_inv must still be kept in perfect sync with the + * lower part of ipos_num. Otherwise, even a small divergence can cause them + * to wrap around on different iterations of the outer loop, which will + * produce artifacts. + * + * To prevent this, clear the lower bits of ipos_num and ipos_num_step so + * that rem_inv can always represent the calculated value exactly. As + * rem_inv is always less than 2, its exponent is less than or equal to + * zero. This means that in the worst case, rem_inv has the same number of + * fractional bits as the significand, which is 23 for a single-precision + * floating point. + * + * Clearing the bits is safe as it has the same effect as rounding down the + * resampling ratio and the subsample position. */ + DWORD ipos_num_mask = ~0u << (FREQ_ADJUST_SHIFT - 23 - fir_step_shift); + LONG64 ipos_num = freq_acc_start & ipos_num_mask; + DWORD ipos_num_step = freq_adjust_num & ipos_num_mask; + + float rem_inv = FIXED_0_32_TO_FLOAT((DWORD)ipos_num << fir_step_shift); + float rem_inv_step = FIXED_0_32_TO_FLOAT(ipos_num_step << fir_step_shift); UINT i; for(i = 0; i < count; ++i) { UINT ipos = ipos_num >> FREQ_ADJUST_SHIFT; - UINT idx = ~(DWORD)ipos_num >> (FREQ_ADJUST_SHIFT - fir_step_shift) << fir_width_shift; - float rem_inv = FIXED_0_32_TO_FLOAT((DWORD)ipos_num << fir_step_shift); float rem = 1.0f - rem_inv; int j; @@ -363,6 +408,9 @@ static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, fl sum += (fir[idx + j] * rem_inv + fir[idx + j + fir_width] * rem) * cache[j]; output[i] = sum; + rem_inv += rem_inv_step; + rem_inv -= rem_inv >= 1.0f ? 1.0f : 0.0f; + ipos_num += ipos_num_step; } } -- GitLab https://gitlab.winehq.org/wine/wine/-/merge_requests/10423
v4: - Round `freq_acc_fixed_start` up when downsampling to prevent a potential output buffer overflow. - Add more comments about the incremental calculation. -- https://gitlab.winehq.org/wine/wine/-/merge_requests/10423#note_135426
On Fri Apr 3 22:13:12 2026 +0000, Matteo Bruni wrote:
Very interesting. Can you please add that to the patch as a small comment? It might be nice to mention what's the objective of this "forced sync", why we can do it (i.e. the mask bits are chosen so that the cumulative error to `opos_num` doesn't affect the `opos` and `idx` values computed in the loop) and where does that "23" come from. Maybe it should be obvious, but I'd err on the side of overexplaining rather than the opposite, at least in this case. Also I'd favor a note on the `^ (DWORD)opos_num_mask` part, to me that's not immediately trivial (especially as it will appear after this patch goes in i.e. without the previous form of the `rem` expression available to compare). Done.
-- https://gitlab.winehq.org/wine/wine/-/merge_requests/10423#note_135427
On Wed Apr 8 10:58:44 2026 +0000, Anton Baskanov wrote:
Done. Excellent comment. Thanks!
-- https://gitlab.winehq.org/wine/wine/-/merge_requests/10423#note_135476
This merge request was approved by Matteo Bruni. -- https://gitlab.winehq.org/wine/wine/-/merge_requests/10423
On Wed Apr 8 10:58:44 2026 +0000, Anton Baskanov wrote:
v4: - Round `freq_acc_fixed_start` up when downsampling to prevent a potential output buffer overflow. - Add more comments about the incremental calculation. WRT the rounding fix, for my own curiosity, did you experience issues in testing or was it just by eyeballing it?
-- https://gitlab.winehq.org/wine/wine/-/merge_requests/10423#note_135477
On Wed Apr 8 10:58:44 2026 +0000, Matteo Bruni wrote:
WRT the rounding fix, for my own curiosity, did you experience issues in testing or was it just by eyeballing it? No, I'm not even sure it can happen with real world resampling ratios, but it's better to be on the safe side.
-- https://gitlab.winehq.org/wine/wine/-/merge_requests/10423#note_135495
This merge request was approved by Huw Davies. -- https://gitlab.winehq.org/wine/wine/-/merge_requests/10423
participants (4)
-
Anton Baskanov -
Anton Baskanov (@baskanov) -
Huw Davies (@huw) -
Matteo Bruni (@Mystral)