[PATCH v2 0/5] MR10423: dsound: Speed up resampling, part 5
-- v2: dsound: Calculate rem and rem_inv incrementally. https://gitlab.winehq.org/wine/wine/-/merge_requests/10423
From: Anton Baskanov <baskanov@gmail.com> --- dlls/dsound/fir.h | 4 ++++ dlls/dsound/mixer.c | 8 ++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/dlls/dsound/fir.h b/dlls/dsound/fir.h index c03d0b9f237..45ad65d7398 100644 --- a/dlls/dsound/fir.h +++ b/dlls/dsound/fir.h @@ -86,7 +86,9 @@ int main() fprintf(stderr, "q %f\n", (double)output.q); fprintf(stderr, "status %s\n", get_pm_status_str(output.status)); + printf("static const int fir_width_shift = %d;\n", fir_width_shift); printf("static const int fir_width = %d;\n", fir_width); + printf("static const int fir_step_shift = %d;\n", fir_step_shift); printf("static const int fir_step = %d;\n", fir_step); printf("static const float fir[] = {"); // Print the FIR array with an additional row at the end. This simplifies @@ -112,7 +114,9 @@ int main() printf("};\n"); } */ +static const int fir_width_shift = 6; static const int fir_width = 64; +static const int fir_step_shift = 7; static const int fir_step = 128; static const float fir[] = { 0.0000000000e+00, -2.4830013102e-06, 1.9318705150e-06, 2.6614854151e-06, diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c index 43a9a8c4f91..3003715b413 100644 --- a/dlls/dsound/mixer.c +++ b/dlls/dsound/mixer.c @@ -322,8 +322,8 @@ static void downsample(LONG64 freq_adjust_num, LONG64 freq_adjust_den, LONG64 fr /* opos is in the range [-(fir_width - 1), count) */ int opos = opos_num / freq_adjust_num - fir_width; - UINT idx_num = (freq_adjust_num - 1 - opos_num % freq_adjust_num) * fir_step; - UINT idx = idx_num / freq_adjust_num * fir_width; + UINT idx_num = (freq_adjust_num - 1 - opos_num % freq_adjust_num) << fir_step_shift; + UINT idx = (idx_num / freq_adjust_num) << fir_width_shift; float rem = idx_num % freq_adjust_num / (float)freq_adjust_num; float input_value = input[j] * firgain; @@ -345,8 +345,8 @@ static void upsample(LONG64 freq_adjust_num, LONG64 freq_adjust_den, LONG64 freq LONG64 ipos_num = freq_acc_start + i * freq_adjust_num; UINT ipos = ipos_num / freq_adjust_den; - UINT idx_num = ipos_num % freq_adjust_den * fir_step; - UINT idx = (fir_step - 1 - idx_num / freq_adjust_den) * fir_width; + UINT idx_num = (ipos_num % freq_adjust_den) << fir_step_shift; + UINT idx = (fir_step - 1 - idx_num / freq_adjust_den) << fir_width_shift; float rem_inv = idx_num % freq_adjust_den / (float)freq_adjust_den; float rem = 1.0f - rem_inv; -- GitLab https://gitlab.winehq.org/wine/wine/-/merge_requests/10423
From: Anton Baskanov <baskanov@gmail.com> --- dlls/dsound/mixer.c | 44 +++++++++++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c index 3003715b413..530b8e23d69 100644 --- a/dlls/dsound/mixer.c +++ b/dlls/dsound/mixer.c @@ -312,19 +312,19 @@ static UINT cp_fields_noresample(IDirectSoundBufferImpl *dsb, UINT count) * Note that this function will overwrite up to fir_width - 1 frames before and * after output[]. */ -static void downsample(LONG64 freq_adjust_num, LONG64 freq_adjust_den, LONG64 freq_acc_start, - float firgain, UINT required_input, float *input, float *output) +static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgain, + UINT required_input, float *input, float *output) { int j; for (j = 0; j < required_input; ++j) { - LONG64 opos_num = freq_adjust_den - freq_acc_start + j * freq_adjust_den + freq_adjust_num - 1; + LONG64 opos_num = freq_adjust_den - freq_acc_start + j * (LONG64)freq_adjust_den + ~0u; /* opos is in the range [-(fir_width - 1), count) */ - int opos = opos_num / freq_adjust_num - fir_width; + int opos = (int)(opos_num >> 32) - fir_width; - UINT idx_num = (freq_adjust_num - 1 - opos_num % freq_adjust_num) << fir_step_shift; - UINT idx = (idx_num / freq_adjust_num) << fir_width_shift; - float rem = idx_num % freq_adjust_num / (float)freq_adjust_num; + UINT idx = ~(DWORD)opos_num >> (32 - fir_step_shift) << fir_width_shift; + UINT rem_num = ~(DWORD)opos_num << fir_step_shift; + float rem = rem_num * (1.0f / (1ll << 32)); float input_value = input[j] * firgain; float input_value0 = (1.0f - rem) * input_value; @@ -336,18 +336,18 @@ static void downsample(LONG64 freq_adjust_num, LONG64 freq_adjust_den, LONG64 fr } } -static void upsample(LONG64 freq_adjust_num, LONG64 freq_adjust_den, LONG64 freq_acc_start, - UINT count, float *input, float *output) +static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, float *input, + float *output) { UINT i; for(i = 0; i < count; ++i) { - LONG64 ipos_num = freq_acc_start + i * freq_adjust_num; - UINT ipos = ipos_num / freq_adjust_den; + LONG64 ipos_num = freq_acc_start + i * (LONG64)freq_adjust_num; + UINT ipos = ipos_num >> 32; - UINT idx_num = (ipos_num % freq_adjust_den) << fir_step_shift; - UINT idx = (fir_step - 1 - idx_num / freq_adjust_den) << fir_width_shift; - float rem_inv = idx_num % freq_adjust_den / (float)freq_adjust_den; + UINT idx = ~(DWORD)ipos_num >> (32 - fir_step_shift) << fir_width_shift; + UINT rem_num = (DWORD)ipos_num << fir_step_shift; + float rem_inv = rem_num * (1.0f / (1ll << 32)); float rem = 1.0f - rem_inv; int j; @@ -368,11 +368,21 @@ static void resample(LONG64 freq_adjust_num, LONG64 freq_adjust_den, LONG64 freq float firgain, UINT required_input, UINT count, float *input, float *output) { if (freq_adjust_num > freq_adjust_den) { + /* Convert the values to 0.32 fixed point. Round down to prevent output + * buffer overflow. */ + DWORD freq_adjust_fixed_den = (freq_adjust_den << 32) / freq_adjust_num; + DWORD freq_acc_fixed_start = freq_acc_start * freq_adjust_fixed_den / freq_adjust_den; + memset(output, 0, count * sizeof(float)); - downsample(freq_adjust_num, freq_adjust_den, freq_acc_start, firgain, required_input, - input, output); + downsample(freq_adjust_fixed_den, freq_acc_fixed_start, firgain, required_input, input, + output); } else { - upsample(freq_adjust_num, freq_adjust_den, freq_acc_start, count, input, output); + /* Convert the values to 0.32 fixed point. Round down to prevent input + * buffer overflow. */ + DWORD freq_adjust_fixed_num = (freq_adjust_num << 32) / freq_adjust_den; + DWORD freq_acc_fixed_start = (freq_acc_start << 32) / freq_adjust_den; + + upsample(freq_adjust_fixed_num, freq_acc_fixed_start, count, input, output); } } -- GitLab https://gitlab.winehq.org/wine/wine/-/merge_requests/10423
From: Anton Baskanov <baskanov@gmail.com> Both x87 and SSE don't accept unsigned integers directly, requiring extra conversion steps. --- dlls/dsound/mixer.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c index 530b8e23d69..fefc0d2fe05 100644 --- a/dlls/dsound/mixer.c +++ b/dlls/dsound/mixer.c @@ -323,8 +323,8 @@ static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgai int opos = (int)(opos_num >> 32) - fir_width; UINT idx = ~(DWORD)opos_num >> (32 - fir_step_shift) << fir_width_shift; - UINT rem_num = ~(DWORD)opos_num << fir_step_shift; - float rem = rem_num * (1.0f / (1ll << 32)); + int rem_num = ~(DWORD)opos_num << fir_step_shift >> 1; + float rem = rem_num * (1.0f / (1ll << 31)); float input_value = input[j] * firgain; float input_value0 = (1.0f - rem) * input_value; @@ -346,8 +346,8 @@ static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, fl UINT ipos = ipos_num >> 32; UINT idx = ~(DWORD)ipos_num >> (32 - fir_step_shift) << fir_width_shift; - UINT rem_num = (DWORD)ipos_num << fir_step_shift; - float rem_inv = rem_num * (1.0f / (1ll << 32)); + int rem_num = (DWORD)ipos_num << fir_step_shift >> 1; + float rem_inv = rem_num * (1.0f / (1ll << 31)); float rem = 1.0f - rem_inv; int j; -- GitLab https://gitlab.winehq.org/wine/wine/-/merge_requests/10423
From: Anton Baskanov <baskanov@gmail.com> --- dlls/dsound/mixer.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c index fefc0d2fe05..e3190535f19 100644 --- a/dlls/dsound/mixer.c +++ b/dlls/dsound/mixer.c @@ -315,10 +315,11 @@ static UINT cp_fields_noresample(IDirectSoundBufferImpl *dsb, UINT count) static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgain, UINT required_input, float *input, float *output) { + LONG64 opos_num = freq_adjust_den - freq_acc_start + (LONG64)~0u; + DWORD opos_num_step = freq_adjust_den; int j; for (j = 0; j < required_input; ++j) { - LONG64 opos_num = freq_adjust_den - freq_acc_start + j * (LONG64)freq_adjust_den + ~0u; /* opos is in the range [-(fir_width - 1), count) */ int opos = (int)(opos_num >> 32) - fir_width; @@ -333,16 +334,19 @@ static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgai UINT i; for (i = 0; i < fir_width; ++i) output[opos + i] += fir[idx + i] * input_value0 + fir[idx + fir_width + i] * input_value1; + + opos_num += opos_num_step; } } static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, float *input, float *output) { + LONG64 ipos_num = freq_acc_start; + DWORD ipos_num_step = freq_adjust_num; UINT i; for(i = 0; i < count; ++i) { - LONG64 ipos_num = freq_acc_start + i * (LONG64)freq_adjust_num; UINT ipos = ipos_num >> 32; UINT idx = ~(DWORD)ipos_num >> (32 - fir_step_shift) << fir_width_shift; @@ -357,6 +361,8 @@ static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, fl for (j = 0; j < fir_width; j++) sum += (fir[idx + j] * rem_inv + fir[idx + j + fir_width] * rem) * cache[j]; output[i] = sum; + + ipos_num += ipos_num_step; } } -- GitLab https://gitlab.winehq.org/wine/wine/-/merge_requests/10423
From: Anton Baskanov <baskanov@gmail.com> --- dlls/dsound/mixer.c | 38 ++++++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c index e3190535f19..c7565c03fa6 100644 --- a/dlls/dsound/mixer.c +++ b/dlls/dsound/mixer.c @@ -315,17 +315,23 @@ static UINT cp_fields_noresample(IDirectSoundBufferImpl *dsb, UINT count) static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgain, UINT required_input, float *input, float *output) { - LONG64 opos_num = freq_adjust_den - freq_acc_start + (LONG64)~0u; - DWORD opos_num_step = freq_adjust_den; + /* Clear the lower bits of opos_num and opos_num_step to keep rem in perfect + * sync with the lower part of opos_num. As rem is always less than 2, its + * fractional part has 23 bits in the worst case. */ + LONG64 opos_num_mask = ~0ull << (32 - 23 - fir_step_shift); + LONG64 opos_num = (freq_adjust_den - freq_acc_start + (LONG64)~0u) & opos_num_mask; + DWORD opos_num_step = freq_adjust_den & (DWORD)opos_num_mask; + + DWORD rem_num = ((DWORD)opos_num ^ (DWORD)opos_num_mask) << fir_step_shift >> 1; + DWORD rem_step_num = -opos_num_step << fir_step_shift >> 1; + float rem = rem_num * (1.0f / (1ll << 31)); + float rem_step = rem_step_num * (1.0f / (1ll << 31)); int j; for (j = 0; j < required_input; ++j) { /* opos is in the range [-(fir_width - 1), count) */ int opos = (int)(opos_num >> 32) - fir_width; - UINT idx = ~(DWORD)opos_num >> (32 - fir_step_shift) << fir_width_shift; - int rem_num = ~(DWORD)opos_num << fir_step_shift >> 1; - float rem = rem_num * (1.0f / (1ll << 31)); float input_value = input[j] * firgain; float input_value0 = (1.0f - rem) * input_value; @@ -335,6 +341,9 @@ static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgai for (i = 0; i < fir_width; ++i) output[opos + i] += fir[idx + i] * input_value0 + fir[idx + fir_width + i] * input_value1; + rem += rem_step; + rem -= rem >= 1.0f ? 1.0f : 0.0f; + opos_num += opos_num_step; } } @@ -342,16 +351,22 @@ static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgai static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, float *input, float *output) { - LONG64 ipos_num = freq_acc_start; - DWORD ipos_num_step = freq_adjust_num; + /* Clear the lower bits of ipos_num and ipos_num_step to keep rem_inv in + * perfect sync with the lower part of ipos_num. As rem is always less than + * 2, its fractional part has 23 bits in the worst case. */ + DWORD ipos_num_mask = ~0u << (32 - 23 - fir_step_shift); + LONG64 ipos_num = freq_acc_start & ipos_num_mask; + DWORD ipos_num_step = freq_adjust_num & ipos_num_mask; + + DWORD rem_num = (DWORD)ipos_num << fir_step_shift >> 1; + DWORD rem_step_num = ipos_num_step << fir_step_shift >> 1; + float rem_inv = rem_num * (1.0f / (1ll << 31)); + float rem_inv_step = rem_step_num * (1.0f / (1ll << 31)); UINT i; for(i = 0; i < count; ++i) { UINT ipos = ipos_num >> 32; - UINT idx = ~(DWORD)ipos_num >> (32 - fir_step_shift) << fir_width_shift; - int rem_num = (DWORD)ipos_num << fir_step_shift >> 1; - float rem_inv = rem_num * (1.0f / (1ll << 31)); float rem = 1.0f - rem_inv; int j; @@ -362,6 +377,9 @@ static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, fl sum += (fir[idx + j] * rem_inv + fir[idx + j + fir_width] * rem) * cache[j]; output[i] = sum; + rem_inv += rem_inv_step; + rem_inv -= rem_inv >= 1.0f ? 1.0f : 0.0f; + ipos_num += ipos_num_step; } } -- GitLab https://gitlab.winehq.org/wine/wine/-/merge_requests/10423
v2: - Use `>=` when comparing `rem_inv` to `1.0f`. -- https://gitlab.winehq.org/wine/wine/-/merge_requests/10423#note_133820
On Thu Mar 26 01:52:58 2026 +0000, Anton Baskanov wrote:
changed this line in [version 2 of the diff](/wine/wine/-/merge_requests/10423/diffs?diff_id=254387&start_sha=4a16374e84ef5e45cff2ac0b8973a4522afe959b#3611483beebc594b1206bbcc979e9584d27bf7ce_381_381) Done.
-- https://gitlab.winehq.org/wine/wine/-/merge_requests/10423#note_133821
Matteo Bruni (@Mystral) commented about dlls/dsound/mixer.c:
* Note that this function will overwrite up to fir_width - 1 frames before and * after output[]. */ -static void downsample(LONG64 freq_adjust_num, LONG64 freq_adjust_den, LONG64 freq_acc_start, - float firgain, UINT required_input, float *input, float *output) +static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgain, + UINT required_input, float *input, float *output) { int j;
for (j = 0; j < required_input; ++j) { - LONG64 opos_num = freq_adjust_den - freq_acc_start + j * freq_adjust_den + freq_adjust_num - 1; + LONG64 opos_num = freq_adjust_den - freq_acc_start + j * (LONG64)freq_adjust_den + ~0u; /* opos is in the range [-(fir_width - 1), count) */ - int opos = opos_num / freq_adjust_num - fir_width; + int opos = (int)(opos_num >> 32) - fir_width; It might be nice to introduce a couple of `#define`s, like:
#define POS_FRAC_SHIFT 32
#define POS_FRAC_MASK ((1ull << POS_FRAC_SHIFT) - 1)
I'm not attached to the specific naming. The choice here comes from the idea of eventually ending up with something like https://gitlab.winehq.org/Mystral/wine/-/commit/aca8b39927dd75268cb18fe19307..., which still seems like a good improvement to have :slight_smile: -- https://gitlab.winehq.org/wine/wine/-/merge_requests/10423#note_134503
Matteo Bruni (@Mystral) commented about dlls/dsound/mixer.c:
+static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgain, + UINT required_input, float *input, float *output) { int j;
for (j = 0; j < required_input; ++j) { - LONG64 opos_num = freq_adjust_den - freq_acc_start + j * freq_adjust_den + freq_adjust_num - 1; + LONG64 opos_num = freq_adjust_den - freq_acc_start + j * (LONG64)freq_adjust_den + ~0u; /* opos is in the range [-(fir_width - 1), count) */ - int opos = opos_num / freq_adjust_num - fir_width; + int opos = (int)(opos_num >> 32) - fir_width;
- UINT idx_num = (freq_adjust_num - 1 - opos_num % freq_adjust_num) << fir_step_shift; - UINT idx = (idx_num / freq_adjust_num) << fir_width_shift; - float rem = idx_num % freq_adjust_num / (float)freq_adjust_num; + UINT idx = ~(DWORD)opos_num >> (32 - fir_step_shift) << fir_width_shift; Note to the interested reader: one's complement here replaces the whole `- 1 - opos_num` part i.e. it seems correct.
-- https://gitlab.winehq.org/wine/wine/-/merge_requests/10423#note_134505
Matteo Bruni (@Mystral) commented about dlls/dsound/mixer.c:
int opos = (int)(opos_num >> 32) - fir_width;
UINT idx = ~(DWORD)opos_num >> (32 - fir_step_shift) << fir_width_shift; - UINT rem_num = ~(DWORD)opos_num << fir_step_shift; - float rem = rem_num * (1.0f / (1ll << 32)); + int rem_num = ~(DWORD)opos_num << fir_step_shift >> 1; + float rem = rem_num * (1.0f / (1ll << 31));
Smart! I'd have `#define`s for these conversion numbers as well, maybe something like: ``` #define POS_FRAC_TO_SIGNED_SHIFT 1 #define POS_FRAC_SIGNED_MAX (1ll << 31) ``` I expect the resulting code to be more verbose but also quite a bit more self-explanatory. -- https://gitlab.winehq.org/wine/wine/-/merge_requests/10423#note_134506
Matteo Bruni (@Mystral) commented about dlls/dsound/mixer.c:
static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgain, UINT required_input, float *input, float *output) { - LONG64 opos_num = freq_adjust_den - freq_acc_start + (LONG64)~0u; - DWORD opos_num_step = freq_adjust_den; + /* Clear the lower bits of opos_num and opos_num_step to keep rem in perfect + * sync with the lower part of opos_num. As rem is always less than 2, its + * fractional part has 23 bits in the worst case. */ + LONG64 opos_num_mask = ~0ull << (32 - 23 - fir_step_shift); + LONG64 opos_num = (freq_adjust_den - freq_acc_start + (LONG64)~0u) & opos_num_mask; + DWORD opos_num_step = freq_adjust_den & (DWORD)opos_num_mask; + + DWORD rem_num = ((DWORD)opos_num ^ (DWORD)opos_num_mask) << fir_step_shift >> 1; + DWORD rem_step_num = -opos_num_step << fir_step_shift >> 1; + float rem = rem_num * (1.0f / (1ll << 31)); + float rem_step = rem_step_num * (1.0f / (1ll << 31)); Does this make a large difference in the end? I.e. how bad is it if we store and update the linear interpolation factor as fixed point and convert it to float in SIMD fashion every time? IIRC "sane" int->float conversion (e.g. not unsigned, as taken care of by your earlier patch) is fast in SSE+.
-- https://gitlab.winehq.org/wine/wine/-/merge_requests/10423#note_134507
participants (3)
-
Anton Baskanov -
Anton Baskanov (@baskanov) -
Matteo Bruni (@Mystral)