[PATCH v4 5/5] dsound: Calculate rem and rem_inv incrementally.

April 8, 2026

From: Anton Baskanov <baskanov@gmail.com>

---
 dlls/dsound/mixer.c | 64 +++++++++++++++++++++++++++++++++++++++------
 1 file changed, 56 insertions(+), 8 deletions(-)

diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c
index 0d979f69898..bd878604436 100644
--- a/dlls/dsound/mixer.c
+++ b/dlls/dsound/mixer.c
@@ -318,16 +318,38 @@ static UINT cp_fields_noresample(IDirectSoundBufferImpl *dsb, UINT count)
 static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgain,
         UINT required_input, float *input, float *output)
 {
-    LONG64 opos_num = freq_adjust_den - freq_acc_start + (1ll << FREQ_ADJUST_SHIFT) - 1;
-    DWORD opos_num_step = freq_adjust_den;
+    /* Both opos_num and rem are calculated in an incremental fashion,
+     * independently of each other. This improves performance a bit, presumably
+     * because it allows the CPU to do the calculation in parallel.
+     *
+     * However, the value of rem must still be kept in perfect sync with the
+     * lower part of opos_num. Otherwise, even a small divergence can cause them
+     * to wrap around on different iterations of the outer loop, which will
+     * produce artifacts.
+     *
+     * To prevent this, clear the lower bits of opos_num and opos_num_step so
+     * that rem can always represent the calculated value exactly. As rem is
+     * always less than 2, its exponent is less than or equal to zero. This
+     * means that in the worst case, rem has the same number of fractional bits
+     * as the significand, which is 23 for a single-precision floating point.
+     *
+     * Clearing the bits is safe as it has the same effect as rounding up the
+     * resampling ratio and the subsample position and doesn't affect the
+     * initial opos value. */
+    LONG64 opos_num_mask = ~0ull << (FREQ_ADJUST_SHIFT - 23 - fir_step_shift);
+    LONG64 opos_num = (freq_adjust_den - freq_acc_start + (1ll << FREQ_ADJUST_SHIFT) - 1) & opos_num_mask;
+    DWORD opos_num_step = freq_adjust_den & (DWORD)opos_num_mask;
+
+    /* Use XOR to invert the lower part of opos_num so that the lower bits
+     * remain cleared. */
+    float rem = FIXED_0_32_TO_FLOAT(((DWORD)opos_num ^ (DWORD)opos_num_mask) << fir_step_shift);
+    float rem_step = FIXED_0_32_TO_FLOAT(-opos_num_step << fir_step_shift);
     int j;
 
     for (j = 0; j < required_input; ++j) {
         /* opos is in the range [-(fir_width - 1), count) */
         int opos = (int)(opos_num >> FREQ_ADJUST_SHIFT) - fir_width;
-
         UINT idx = ~(DWORD)opos_num >> (FREQ_ADJUST_SHIFT - fir_step_shift) << fir_width_shift;
-        float rem = FIXED_0_32_TO_FLOAT(~(DWORD)opos_num << fir_step_shift);
 
         float input_value = input[j] * firgain;
         float input_value0 = (1.0f - rem) * input_value;
@@ -337,6 +359,9 @@ static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgai
         for (i = 0; i < fir_width; ++i)
             output[opos + i] += fir[idx + i] * input_value0 + fir[idx + fir_width + i] * input_value1;
 
+        rem += rem_step;
+        rem -= rem >= 1.0f ? 1.0f : 0.0f;
+
         opos_num += opos_num_step;
     }
 }
@@ -344,15 +369,35 @@ static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgai
 static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, float *input,
         float *output)
 {
-    LONG64 ipos_num = freq_acc_start;
-    DWORD ipos_num_step = freq_adjust_num;
+    /* Both ipos_num and rem_inv are calculated in an incremental fashion,
+     * independently of each other. This improves performance a bit, presumably
+     * because it allows the CPU to do the calculation in parallel.
+     *
+     * However, the value of rem_inv must still be kept in perfect sync with the
+     * lower part of ipos_num. Otherwise, even a small divergence can cause them
+     * to wrap around on different iterations of the outer loop, which will
+     * produce artifacts.
+     *
+     * To prevent this, clear the lower bits of ipos_num and ipos_num_step so
+     * that rem_inv can always represent the calculated value exactly. As
+     * rem_inv is always less than 2, its exponent is less than or equal to
+     * zero. This means that in the worst case, rem_inv has the same number of
+     * fractional bits as the significand, which is 23 for a single-precision
+     * floating point.
+     *
+     * Clearing the bits is safe as it has the same effect as rounding down the
+     * resampling ratio and the subsample position. */
+    DWORD ipos_num_mask = ~0u << (FREQ_ADJUST_SHIFT - 23 - fir_step_shift);
+    LONG64 ipos_num = freq_acc_start & ipos_num_mask;
+    DWORD ipos_num_step = freq_adjust_num & ipos_num_mask;
+
+    float rem_inv = FIXED_0_32_TO_FLOAT((DWORD)ipos_num << fir_step_shift);
+    float rem_inv_step = FIXED_0_32_TO_FLOAT(ipos_num_step << fir_step_shift);
     UINT i;
 
     for(i = 0; i < count; ++i) {
         UINT ipos = ipos_num >> FREQ_ADJUST_SHIFT;
-
         UINT idx = ~(DWORD)ipos_num >> (FREQ_ADJUST_SHIFT - fir_step_shift) << fir_width_shift;
-        float rem_inv = FIXED_0_32_TO_FLOAT((DWORD)ipos_num << fir_step_shift);
         float rem = 1.0f - rem_inv;
 
         int j;
@@ -363,6 +408,9 @@ static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, fl
             sum += (fir[idx + j] * rem_inv + fir[idx + j + fir_width] * rem) * cache[j];
         output[i] = sum;
 
+        rem_inv += rem_inv_step;
+        rem_inv -= rem_inv >= 1.0f ? 1.0f : 0.0f;
+
         ipos_num += ipos_num_step;
     }
 }
-- 
GitLab

https://gitlab.winehq.org/wine/wine/-/merge_requests/10423