[PATCH 5/5] dsound: Add an SSE version of downsample.

April 22, 2026

From: Anton Baskanov <baskanov@gmail.com>

---
 dlls/dsound/mixer.c | 196 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 196 insertions(+)

diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c
index 78a1ef2bb33..702f84fc08f 100644
--- a/dlls/dsound/mixer.c
+++ b/dlls/dsound/mixer.c
@@ -369,6 +369,188 @@ static inline float get_current_sample(const IDirectSoundBufferImpl *dsb,
 
 #endif
 
+#if defined(__i386__) || (defined(__x86_64__) && !defined(__arm64ec__))
+
+#ifdef __i386__
+
+#define OPOS_NUM_ARG "0x14(%esp)"
+#define OPOS_NUM_STEP_ARG "0x18(%esp)"
+#define REM_ARG "0x1c(%esp)"
+#define REM_STEP_ARG "0x20(%esp)"
+#define FIRGAIN_ARG "0x24(%esp)"
+#define REQUIRED_INPUT_ARG "0x28(%esp)"
+#define INPUT_ARG "0x2c(%esp)"
+#define OUTPUT_ARG "0x30(%esp)"
+
+#define OPOS_NUM_REG "%ecx"
+#define OPOS_NUM_STEP_REG "%edx"
+#define INPUT_REG "%esi"
+#define OUTPUT_REG "%edi"
+#define INPUT_END_REG "%ebp"
+#define FIR_REG "%ebx"
+#define TMP_L_REG "%eax"
+#define TMP_REG "%eax"
+
+#else
+
+#define OPOS_NUM_ARG "%ecx"
+#define OPOS_NUM_STEP_ARG "%edx"
+#define REM_ARG "%xmm2"
+#define REM_STEP_ARG "%xmm3"
+#define FIRGAIN_ARG "0x70(%rsp)"
+#define REQUIRED_INPUT_ARG "0x78(%rsp)"
+#define INPUT_ARG "0x80(%rsp)"
+#define OUTPUT_ARG "0x88(%rsp)"
+
+#define OPOS_NUM_REG "%ecx"
+#define OPOS_NUM_STEP_REG "%edx"
+#define INPUT_REG "%rsi"
+#define OUTPUT_REG "%rdi"
+#define INPUT_END_REG "%rbp"
+#define FIR_REG "%rbx"
+#define TMP_L_REG "%eax"
+#define TMP_REG "%rax"
+
+#endif
+
+#define REM_REG "%xmm2"
+#define REM_STEP_REG "%xmm3"
+#define FIRGAIN_REG "%xmm0"
+#define ONE_REG "%xmm1"
+#define INPUT_VALUE0_REG "%xmm4"
+#define INPUT_VALUE1_REG "%xmm5"
+#define FTMP0_REG "%xmm6"
+#define FTMP1_REG "%xmm7"
+
+/**
+ * Note that this function will overwrite up to FIR_WIDTH - 1 frames before and
+ * after output[].
+ */
+void downsample_sse(DWORD opos_num, DWORD opos_num_step, float rem, float rem_step, float firgain,
+        UINT required_input, float *input, float *output);
+__ASM_GLOBAL_FUNC( downsample_sse,
+        INIT
+
+        "mov " OPOS_NUM_ARG ", " OPOS_NUM_REG "\n\t"
+        /* Store the lower half of opos_num inverted so that we don't have to
+         * invert it on every iteration of the outer loop. */
+        "not " OPOS_NUM_REG "\n\t"
+        "mov " OPOS_NUM_STEP_ARG ", " OPOS_NUM_STEP_REG "\n\t"
+
+        "movss " REM_ARG ", " REM_REG "\n\t"
+        "shufps $0, " REM_REG ", " REM_REG "\n\t"
+        "movss " REM_STEP_ARG ", " REM_STEP_REG "\n\t"
+        "shufps $0, " REM_STEP_REG ", " REM_STEP_REG "\n\t"
+
+        "movss " FIRGAIN_ARG ", " FIRGAIN_REG "\n\t"
+
+        "mov " INPUT_ARG ", " INPUT_REG "\n\t"
+
+        /* Combine the upper half of opos_num and the output pointer into a
+         * single value. */
+        "mov " OUTPUT_ARG ", " OUTPUT_REG "\n\t"
+        /* Divide the output pointer by 4 to match the scale. We can do this
+         * because the pointer is at least 4-byte aligned. It will be scaled
+         * back during the access in the inner loop. */
+        "shr $2, " OUTPUT_REG "\n\t"
+        /* Subtract FIR_WIDTH so that we don't have to do this on every
+         * iteration of the outer loop. */
+        "sub $" EXPAND_STR(FIR_WIDTH) ", " OUTPUT_REG "\n\t"
+
+        "mov " REQUIRED_INPUT_ARG ", " TMP_L_REG "\n\t"
+        "lea (" INPUT_REG "," TMP_REG ",4), " INPUT_END_REG "\n\t"
+
+        "movaps " __ASM_NAME("one") ADDR_SUFFIX ", " ONE_REG "\n\t"
+
+        ".p2align 4,,10\n\t"
+        ".p2align 3\n\t"
+        "1:\n\t"
+        /* Calculate idx. */
+        "mov " OPOS_NUM_REG ", " TMP_L_REG "\n\t"
+        "shr $(32 - " EXPAND_STR(FIR_STEP_SHIFT) "), " TMP_REG "\n\t"
+        "shl $(" EXPAND_STR(FIR_WIDTH_SHIFT) " + 2), " TMP_REG "\n\t"
+        /* Calculate the FIR address base. */
+        "lea " __ASM_NAME("fir") ADDR_SUFFIX ", " FIR_REG "\n\t"
+        "add " TMP_REG ", " FIR_REG "\n\t"
+
+        /* Calculate input_value. */
+        "movss (" INPUT_REG "), " INPUT_VALUE1_REG "\n\t"
+        "mulss " FIRGAIN_REG ", " INPUT_VALUE1_REG "\n\t"
+        "shufps $0, " INPUT_VALUE1_REG ", " INPUT_VALUE1_REG "\n\t"
+        "movups " INPUT_VALUE1_REG ", " INPUT_VALUE0_REG "\n\t"
+        /* Calculate input_value1. */
+        "mulps " REM_REG ", " INPUT_VALUE1_REG "\n\t"
+        /* Calculate input_value0. */
+        "subps " INPUT_VALUE1_REG ", " INPUT_VALUE0_REG "\n\t"
+
+        /* Initialize i. */
+        "xor " TMP_REG ", " TMP_REG "\n\t"
+
+        ".p2align 4,,10\n\t"
+        ".p2align 3\n\t"
+        "2:\n\t"
+        /* Load the FIR coefficients. */
+        "movaps (" FIR_REG "," TMP_REG "), " FTMP0_REG "\n\t"
+        "movaps " EXPAND_STR(FIR_WIDTH) " * 4(" FIR_REG "," TMP_REG "), " FTMP1_REG "\n\t"
+        /* Calculate the weighted sums. */
+        "mulps " INPUT_VALUE0_REG ", " FTMP0_REG "\n\t"
+        "mulps " INPUT_VALUE1_REG ", " FTMP1_REG "\n\t"
+        "addps " FTMP0_REG ", " FTMP1_REG "\n\t"
+        /* Add the sums to the output. */
+        "movups (" TMP_REG "," OUTPUT_REG ",4), " FTMP0_REG "\n\t"
+        "addps " FTMP1_REG ", " FTMP0_REG "\n\t"
+        "movups " FTMP0_REG ", (" TMP_REG "," OUTPUT_REG ",4)\n\t"
+        "add $16, " TMP_REG "\n\t"
+        "cmp $(" EXPAND_STR(FIR_WIDTH) " * 4), " TMP_REG "\n\t"
+        "jl 2b\n\t"
+
+        /* Update rem. */
+        "addps " REM_STEP_REG ", " REM_REG "\n\t"
+        "movups " ONE_REG ", " FTMP0_REG "\n\t"
+        "cmpleps " REM_REG ", " FTMP0_REG "\n\t"
+        "andps " ONE_REG ", " FTMP0_REG "\n\t"
+        "subps " FTMP0_REG ", " REM_REG "\n\t"
+
+        /* Update opos_num. Use subtraction for the lower half as it is stored
+         * inverted. */
+        "sub " OPOS_NUM_STEP_REG ", " OPOS_NUM_REG "\n\t"
+        "adc $0, " OUTPUT_REG "\n\t"
+
+        /* Advance the input pointer. */
+        "add $4, " INPUT_REG "\n\t"
+        "cmp " INPUT_END_REG ", " INPUT_REG "\n\t"
+        "jl 1b\n\t"
+
+        CLEANUP
+        "ret" )
+
+#undef OPOS_NUM_ARG
+#undef OPOS_NUM_STEP_ARG
+#undef REM_ARG
+#undef REM_STEP_ARG
+#undef FIRGAIN_ARG
+#undef REQUIRED_INPUT_ARG
+#undef INPUT_ARG
+#undef OUTPUT_ARG
+#undef OPOS_NUM_REG
+#undef OPOS_NUM_STEP_REG
+#undef INPUT_REG
+#undef OUTPUT_REG
+#undef INPUT_END_REG
+#undef FIR_REG
+#undef TMP_L_REG
+#undef TMP_REG
+#undef REM_REG
+#undef REM_STEP_REG
+#undef FIRGAIN_REG
+#undef ONE_REG
+#undef INPUT_VALUE0_REG
+#undef INPUT_VALUE1_REG
+#undef FTMP0_REG
+#undef FTMP1_REG
+
+#endif
+
 /**
  * Note that this function will overwrite up to FIR_WIDTH - 1 frames before and
  * after output[].
@@ -402,8 +584,21 @@ static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgai
      * remain cleared. */
     float rem = FIXED_0_32_TO_FLOAT(((DWORD)opos_num ^ (DWORD)opos_num_mask) << FIR_STEP_SHIFT);
     float rem_step = FIXED_0_32_TO_FLOAT(-opos_num_step << FIR_STEP_SHIFT);
+
+#if defined(__x86_64__) && !defined(__arm64ec__)
+    downsample_sse((DWORD)opos_num, opos_num_step, rem, rem_step, firgain, required_input, input,
+            output + (opos_num >> FREQ_ADJUST_SHIFT));
+#else
     int j;
 
+#ifdef __i386__
+    if (sse_supported) {
+        downsample_sse((DWORD)opos_num, opos_num_step, rem, rem_step, firgain, required_input,
+                input, output + (opos_num >> FREQ_ADJUST_SHIFT));
+        return;
+    }
+#endif
+
     for (j = 0; j < required_input; ++j) {
         /* opos is in the range [-(fir_width - 1), count) */
         int opos = (int)(opos_num >> FREQ_ADJUST_SHIFT) - FIR_WIDTH;
@@ -422,6 +617,7 @@ static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgai
 
         opos_num += opos_num_step;
     }
+#endif
 }
 
 #if defined(__i386__) || (defined(__x86_64__) && !defined(__arm64ec__))
-- 
GitLab

https://gitlab.winehq.org/wine/wine/-/merge_requests/10716