[PATCH v2 5/5] dsound: Add an SSE version of downsample.

May 3, 2026

From: Anton Baskanov <baskanov@gmail.com>

---
 dlls/dsound/mixer.c | 230 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 230 insertions(+)

diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c
index de51060db6f..c3ee4aaeef0 100644
--- a/dlls/dsound/mixer.c
+++ b/dlls/dsound/mixer.c
@@ -294,6 +294,227 @@ static inline float get_current_sample(const IDirectSoundBufferImpl *dsb,
     return dsb->get(dsb, buffer + (mixpos % buflen), channel);
 }
 
+#if defined(__i386__) || (defined(__x86_64__) && !defined(__arm64ec__))
+
+/* On x86_64 rem and rem_step are passed in registers so just leave them
+ * there. */
+#define REM_REG "%xmm2" /* rem, broadcast to all four lanes before the loop */
+#define REM_STEP_REG "%xmm3" /* rem_step, broadcast to all four lanes before the loop */
+#define FIRGAIN_REG "%xmm0" /* firgain, loaded from the stack by INIT */
+#define ONE_REG "%xmm1" /* loaded from the symbol one; used to wrap rem once it reaches it */
+#define INPUT_VALUE0_REG "%xmm4" /* input * firgain * (1 - rem) in every lane */
+#define INPUT_VALUE1_REG "%xmm5" /* input * firgain * rem in every lane */
+#define FTMP0_REG "%xmm6" /* scratch */
+#define FTMP1_REG "%xmm7" /* scratch */
+
+#ifdef __i386__
+
+#define OPOS_NUM_INV_REG "%ecx" /* inverted lower part of opos_num */
+#define OPOS_NUM_STEP_REG "%edx" /* per-input-sample increment of opos_num */
+#define INPUT_REG "%esi" /* current input sample */
+#define INPUT_END_REG "%ebp" /* the outer loop stops once INPUT_REG reaches this */
+#define OUTPUT_REG "%edi" /* output address divided by sizeof(float) */
+#define FIR_REG "%ebx" /* base of the FIR coefficients for the current sample */
+#define TMP_L_REG "%eax" /* 32-bit view of TMP_REG; the same register on i386 */
+#define TMP_REG "%eax" /* scratch: FIR offset, then inner loop byte counter */
+
+#define INIT \
+        "push %ebx\n\t" /* save the four registers that are overwritten below */ \
+        __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \
+        __ASM_CFI(".cfi_offset %ebx,-8\n\t") \
+        "push %ebp\n\t" \
+        __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \
+        __ASM_CFI(".cfi_offset %ebp,-12\n\t") \
+        "push %esi\n\t" \
+        __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \
+        __ASM_CFI(".cfi_offset %esi,-16\n\t") \
+        "push %edi\n\t" \
+        __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \
+        __ASM_CFI(".cfi_offset %edi,-20\n\t") \
+        "mov 0x14(%esp), " OPOS_NUM_INV_REG "\n\t" /* opos_num_inv: 0x14 = 4 saved registers + return address */ \
+        "mov 0x18(%esp), " OPOS_NUM_STEP_REG "\n\t" /* opos_num_step */ \
+        "movss 0x1c(%esp), " REM_REG "\n\t" /* rem */ \
+        "movss 0x20(%esp), " REM_STEP_REG "\n\t" /* rem_step */ \
+        "movss 0x24(%esp), " FIRGAIN_REG "\n\t" /* firgain */ \
+        "mov 0x28(%esp), " INPUT_REG "\n\t" /* input */ \
+        "mov 0x2c(%esp), " INPUT_END_REG "\n\t" /* input_end */ \
+        "mov 0x30(%esp), " OUTPUT_REG "\n\t" /* output */
+
+#define CLEANUP \
+        "pop %edi\n\t" /* restore in the reverse order of the pushes in INIT */ \
+        __ASM_CFI(".cfi_adjust_cfa_offset -4\n\t") \
+        "pop %esi\n\t" \
+        __ASM_CFI(".cfi_adjust_cfa_offset -4\n\t") \
+        "pop %ebp\n\t" \
+        __ASM_CFI(".cfi_adjust_cfa_offset -4\n\t") \
+        "pop %ebx\n\t" \
+        __ASM_CFI(".cfi_adjust_cfa_offset -4\n\t")
+
+#define ADDR_SUFFIX /* empty: fir and one are referenced by plain symbol name on i386 */
+
+#else
+
+/* On x86_64 opos_num_inv and opos_num_step are passed in registers so just
+ * leave them there. */
+#define OPOS_NUM_INV_REG "%ecx" /* inverted lower part of opos_num */
+#define OPOS_NUM_STEP_REG "%edx" /* per-input-sample increment of opos_num */
+#define INPUT_REG "%rsi" /* current input sample */
+#define INPUT_END_REG "%rbp" /* the outer loop stops once INPUT_REG reaches this */
+#define OUTPUT_REG "%rdi" /* output address divided by sizeof(float) */
+#define FIR_REG "%rbx" /* base of the FIR coefficients for the current sample */
+#define TMP_L_REG "%eax" /* 32-bit view of TMP_REG */
+#define TMP_REG "%rax" /* scratch: FIR offset, then inner loop byte counter */
+
+#define INIT \
+        "push %rbx\n\t" /* save the four general-purpose registers that are overwritten below */ \
+        __ASM_SEH(".seh_pushreg %rbx\n\t") \
+        __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \
+        __ASM_CFI(".cfi_offset %rbx,-16\n\t") \
+        "push %rbp\n\t" \
+        __ASM_SEH(".seh_pushreg %rbp\n\t") \
+        __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \
+        __ASM_CFI(".cfi_offset %rbp,-24\n\t") \
+        "push %rsi\n\t" \
+        __ASM_SEH(".seh_pushreg %rsi\n\t") \
+        __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \
+        __ASM_CFI(".cfi_offset %rsi,-32\n\t") \
+        "push %rdi\n\t" \
+        __ASM_SEH(".seh_pushreg %rdi\n\t") \
+        __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \
+        __ASM_CFI(".cfi_offset %rdi,-40\n\t") \
+        "sub $40, %rsp\n\t" /* 32 bytes for xmm6/xmm7 plus 8 spare; presumably keeps the movaps slots 16-byte aligned */ \
+        __ASM_SEH(".seh_stackalloc 40\n\t") \
+        __ASM_CFI(".cfi_adjust_cfa_offset 40\n\t") \
+        "movaps %xmm6, 16(%rsp)\n\t" /* xmm6 and xmm7 (FTMP0_REG, FTMP1_REG) are restored by CLEANUP */ \
+        __ASM_SEH(".seh_savexmm %xmm6, 16\n\t") \
+        __ASM_CFI(".cfi_offset %xmm6, -64\n\t") \
+        "movaps %xmm7, (%rsp)\n\t" \
+        __ASM_SEH(".seh_savexmm %xmm7, 0\n\t") \
+        __ASM_SEH(".seh_endprologue\n\t") \
+        __ASM_CFI(".cfi_offset %xmm7, -80\n\t") \
+        "movss 0x70(%rsp), " FIRGAIN_REG "\n\t" /* firgain; NOTE(review): 0x70 = 40 + 32 + 8 (return address) + 0x20, presumably the home area of the register arguments - confirm */ \
+        "mov 0x78(%rsp), " INPUT_REG "\n\t" /* input */ \
+        "mov 0x80(%rsp), " INPUT_END_REG "\n\t" /* input_end */ \
+        "mov 0x88(%rsp), " OUTPUT_REG "\n\t" /* output */
+
+#define CLEANUP \
+        "movaps (%rsp), %xmm7\n\t" /* undo INIT in reverse order */ \
+        "movaps 16(%rsp), %xmm6\n\t" \
+        "add $40, %rsp\n\t" \
+        __ASM_CFI(".cfi_adjust_cfa_offset -40\n\t") \
+        "pop %rdi\n\t" \
+        __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") \
+        "pop %rsi\n\t" \
+        __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") \
+        "pop %rbp\n\t" \
+        __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") \
+        "pop %rbx\n\t" \
+        __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t")
+
+#define ADDR_SUFFIX "(%rip)" /* fir and one are addressed relative to the instruction pointer */
+
+#endif
+
+/* opos_num_inv is the inverted lower part of opos_num. We store it inverted so
+ * that we don't have to invert it on every iteration of the outer loop.
+ *
+ * output is the output pointer divided by sizeof(float) to match the scale of
+ * opos_num. We combine the upper part of opos_num and the output pointer into a
+ * single value to save a register. */
+void downsample_sse(DWORD opos_num_inv, DWORD opos_num_step, float rem, float rem_step,
+        float firgain, float *input, float *input_end, DWORD_PTR output);
+__ASM_GLOBAL_FUNC( downsample_sse,
+        INIT
+
+        "shufps $0, " REM_REG ", " REM_REG "\n\t"
+        "shufps $0, " REM_STEP_REG ", " REM_STEP_REG "\n\t"
+
+        "movaps " __ASM_NAME("one") ADDR_SUFFIX ", " ONE_REG "\n\t"
+
+        ".p2align 4,,10\n\t"
+        ".p2align 3\n\t"
+        "1:\n\t"
+        /* Calculate idx. */
+        "mov " OPOS_NUM_INV_REG ", " TMP_L_REG "\n\t"
+        "shr $(" EXPAND_STR(FREQ_ADJUST_SHIFT) " - " EXPAND_STR(FIR_STEP_SHIFT) "), " TMP_REG "\n\t"
+        "shl $(" EXPAND_STR(FIR_WIDTH_SHIFT) " + " EXPAND_STR(SIZEOF_FLOAT_SHIFT) "), " TMP_REG "\n\t"
+        /* Calculate the FIR address base. */
+        "lea " __ASM_NAME("fir") ADDR_SUFFIX ", " FIR_REG "\n\t"
+        "add " TMP_REG ", " FIR_REG "\n\t"
+
+        /* Calculate input_value. */
+        "movss (" INPUT_REG "), " INPUT_VALUE1_REG "\n\t"
+        "mulss " FIRGAIN_REG ", " INPUT_VALUE1_REG "\n\t"
+        "shufps $0, " INPUT_VALUE1_REG ", " INPUT_VALUE1_REG "\n\t"
+        "movups " INPUT_VALUE1_REG ", " INPUT_VALUE0_REG "\n\t"
+        /* Calculate input_value1. */
+        "mulps " REM_REG ", " INPUT_VALUE1_REG "\n\t"
+        /* Calculate input_value0. */
+        "subps " INPUT_VALUE1_REG ", " INPUT_VALUE0_REG "\n\t"
+
+        /* Initialize i. */
+        "xor " TMP_REG ", " TMP_REG "\n\t"
+
+        ".p2align 4,,10\n\t"
+        ".p2align 3\n\t"
+        "2:\n\t"
+        /* Load the FIR coefficients. */
+        "movaps (" FIR_REG "," TMP_REG "), " FTMP0_REG "\n\t"
+        "movaps " EXPAND_STR(FIR_WIDTH) " * 4(" FIR_REG "," TMP_REG "), " FTMP1_REG "\n\t"
+        /* Calculate the weighted sums. */
+        "mulps " INPUT_VALUE0_REG ", " FTMP0_REG "\n\t"
+        "mulps " INPUT_VALUE1_REG ", " FTMP1_REG "\n\t"
+        "addps " FTMP0_REG ", " FTMP1_REG "\n\t"
+        /* Add the sums to the output. */
+        "movups (" TMP_REG "," OUTPUT_REG ",4), " FTMP0_REG "\n\t"
+        "addps " FTMP1_REG ", " FTMP0_REG "\n\t"
+        "movups " FTMP0_REG ", (" TMP_REG "," OUTPUT_REG ",4)\n\t"
+        "add $16, " TMP_REG "\n\t"
+        "cmp $(" EXPAND_STR(FIR_WIDTH) " * 4), " TMP_REG "\n\t"
+        "jl 2b\n\t"
+
+        /* Update rem. */
+        "addps " REM_STEP_REG ", " REM_REG "\n\t"
+        "movups " ONE_REG ", " FTMP0_REG "\n\t"
+        "cmpleps " REM_REG ", " FTMP0_REG "\n\t"
+        "andps " ONE_REG ", " FTMP0_REG "\n\t"
+        "subps " FTMP0_REG ", " REM_REG "\n\t"
+
+        /* Update opos_num. Use subtraction for the lower half as it is stored
+         * inverted. */
+        "sub " OPOS_NUM_STEP_REG ", " OPOS_NUM_INV_REG "\n\t"
+        "adc $0, " OUTPUT_REG "\n\t"
+
+        /* Advance the input pointer. Addresses are unsigned, so loop with jb. */
+        "add $4, " INPUT_REG "\n\t"
+        "cmp " INPUT_END_REG ", " INPUT_REG "\n\t"
+        "jb 1b\n\t"
+
+        CLEANUP
+        "ret" )
+
+#undef REM_REG
+#undef REM_STEP_REG
+#undef FIRGAIN_REG
+#undef ONE_REG
+#undef INPUT_VALUE0_REG
+#undef INPUT_VALUE1_REG
+#undef FTMP0_REG
+#undef FTMP1_REG
+#undef OPOS_NUM_INV_REG
+#undef OPOS_NUM_STEP_REG
+#undef INPUT_REG
+#undef INPUT_END_REG
+#undef OUTPUT_REG
+#undef FIR_REG
+#undef TMP_L_REG
+#undef TMP_REG
+#undef INIT
+#undef CLEANUP
+#undef ADDR_SUFFIX
+
+#endif
+
 /**
  * Note that this function will overwrite up to FIR_WIDTH - 1 frames before and
  * after output[].
@@ -329,6 +550,15 @@ static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgai
     float rem_step = FIXED_0_32_TO_FLOAT(-opos_num_step << FIR_STEP_SHIFT);
     int j;
 
+#if defined(__i386__) || (defined(__x86_64__) && !defined(__arm64ec__))
+    if (sse_supported) {
+        downsample_sse(~(DWORD)opos_num, opos_num_step, rem, rem_step, firgain, input,
+                input + required_input,
+                (DWORD_PTR)output / sizeof(float) + (opos_num >> FREQ_ADJUST_SHIFT) - FIR_WIDTH);
+        return;
+    }
+#endif
+
     for (j = 0; j < required_input; ++j) {
         /* opos is in the range [-(fir_width - 1), count) */
         int opos = (int)(opos_num >> FREQ_ADJUST_SHIFT) - FIR_WIDTH;
-- 
GitLab

https://gitlab.winehq.org/wine/wine/-/merge_requests/10716