[PATCH 4/5] dsound: Add an SSE version of upsample.

April 22, 2026

From: Anton Baskanov <baskanov@gmail.com>

---
 dlls/dsound/dsound_main.c    |  12 ++
 dlls/dsound/dsound_private.h |   4 +
 dlls/dsound/fir.h            |   4 +-
 dlls/dsound/mixer.c          | 278 +++++++++++++++++++++++++++++++++++
 4 files changed, 296 insertions(+), 2 deletions(-)

diff --git a/dlls/dsound/dsound_main.c b/dlls/dsound/dsound_main.c
index 8936b437ba2..dbcf5a79899 100644
--- a/dlls/dsound/dsound_main.c
+++ b/dlls/dsound/dsound_main.c
@@ -63,6 +63,10 @@
 
 WINE_DEFAULT_DEBUG_CHANNEL(dsound);
 
+#ifdef __i386__
+BOOL sse_supported; /* result of the SSE feature query made in init_cpu_features() */
+#endif
+
 struct list DSOUND_renderers = LIST_INIT(DSOUND_renderers);
 CRITICAL_SECTION DSOUND_renderers_lock;
 static CRITICAL_SECTION_DEBUG DSOUND_renderers_lock_debug =
@@ -82,6 +86,13 @@ GUID *DSOUND_capture_guids;
 /* All default settings, you most likely don't want to touch these, see wiki on UsefulRegistryKeys */
 int ds_hel_buflen = 32768 * 2;
 
+static void init_cpu_features(void) /* caches CPU feature flags; called from DllMain() */
+{
+#ifdef __i386__
+    sse_supported = IsProcessorFeaturePresent(PF_XMMI_INSTRUCTIONS_AVAILABLE); /* SSE? */
+#endif
+}
+
 /*
  * Get a config key from either the app-specific or the default config
  */
@@ -787,6 +798,7 @@ BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpvReserved)
         DisableThreadLibraryCalls(hInstDLL);
         /* Increase refcount on dsound by 1 */
         GetModuleHandleExW(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS, (LPCWSTR)hInstDLL, &hInstDLL);
+        init_cpu_features();
         break;
     case DLL_PROCESS_DETACH:
         if (lpvReserved) break;
diff --git a/dlls/dsound/dsound_private.h b/dlls/dsound/dsound_private.h
index 0e695698046..b13c3858e44 100644
--- a/dlls/dsound/dsound_private.h
+++ b/dlls/dsound/dsound_private.h
@@ -251,6 +251,10 @@ HRESULT IDirectSoundCaptureImpl_Create(IUnknown *outer_unk, REFIID riid, void **
 #define STATE_CAPTURING 2
 #define STATE_STOPPING  3
 
+#ifdef __i386__
+extern BOOL sse_supported; /* defined in dsound_main.c */
+#endif
+
 extern CRITICAL_SECTION DSOUND_renderers_lock;
 extern struct list DSOUND_renderers;
 
diff --git a/dlls/dsound/fir.h b/dlls/dsound/fir.h
index 76ac521e0f3..3b9bccbdb83 100644
--- a/dlls/dsound/fir.h
+++ b/dlls/dsound/fir.h
@@ -90,7 +90,7 @@ int main()
     printf("#define FIR_WIDTH %d\n", fir_width);
     printf("#define FIR_STEP_SHIFT %d\n", fir_step_shift);
     printf("#define FIR_STEP %d\n", fir_step);
-    printf("static const float fir[] = {");
+    printf("static const float __attribute__((used, aligned(16))) fir[] = {");
     // Print the FIR array with an additional row at the end. This simplifies
     // calculation of the interpolated value by allowing the index to overflow
     // into the extra row. It just repeats the first row, starting from its
@@ -118,7 +118,7 @@ int main()
 #define FIR_WIDTH 64
 #define FIR_STEP_SHIFT 7
 #define FIR_STEP 128
-static const float fir[] = {
+static const float __attribute__((used, aligned(16))) fir[] = {
      0.0000000000e+00, -2.4830013102e-06,  1.9318705150e-06,  2.6614854151e-06,
     -1.5313785194e-05,  4.2076214553e-05, -9.1417167945e-05,  1.7455895136e-04,
     -3.0567859821e-04,  5.0191365396e-04, -7.8311909082e-04,  1.1713337628e-03,
diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c
index 1b4b1c7bd7a..78a1ef2bb33 100644
--- a/dlls/dsound/mixer.c
+++ b/dlls/dsound/mixer.c
@@ -34,6 +34,7 @@
 #include "wingdi.h"
 #include "mmreg.h"
 #include "wine/debug.h"
+#include "wine/asm.h"
 #include "dsound.h"
 #include "ks.h"
 #include "ksmedia.h"
@@ -45,6 +46,14 @@ WINE_DEFAULT_DEBUG_CHANNEL(dsound);
 #define FREQ_ADJUST_SHIFT 32
 #define FIXED_0_32_TO_FLOAT(x) ((int)((x) >> 1) * (1.0f / (1ll << 31)))
 
+#define STR(a) #a /* stringify the argument exactly as written */
+#define EXPAND_STR(a) STR(a) /* macro-expand the argument first, then stringify it */
+
+static const float __attribute__((used, aligned(16))) one[] = /* referenced from the upsample_sse asm */
+{
+    1.0f, 1.0f, 1.0f, 1.0f, /* one per SSE lane; loaded with movaps, hence aligned(16) */
+};
+
 void DSOUND_RecalcVolPan(PDSVOLUMEPAN volpan)
 {
 	double temp;
@@ -284,6 +293,82 @@ static inline float get_current_sample(const IDirectSoundBufferImpl *dsb,
     return dsb->get(dsb, buffer + (mixpos % buflen), channel);
 }
 
+#ifdef __i386__
+
+#define INIT /* prologue: save ebx/ebp/esi/edi, which upsample_sse overwrites */ \
+        "push %ebx\n\t" \
+        __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \
+        __ASM_CFI(".cfi_offset %ebx,-8\n\t") \
+        "push %ebp\n\t" \
+        __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \
+        __ASM_CFI(".cfi_offset %ebp,-12\n\t") \
+        "push %esi\n\t" \
+        __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \
+        __ASM_CFI(".cfi_offset %esi,-16\n\t") \
+        "push %edi\n\t" \
+        __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \
+        __ASM_CFI(".cfi_offset %edi,-20\n\t")
+
+#define CLEANUP /* epilogue: pop what INIT pushed, in reverse order */ \
+        "pop %edi\n\t" \
+        __ASM_CFI(".cfi_adjust_cfa_offset -4\n\t") \
+        "pop %esi\n\t" \
+        __ASM_CFI(".cfi_adjust_cfa_offset -4\n\t") \
+        "pop %ebp\n\t" \
+        __ASM_CFI(".cfi_adjust_cfa_offset -4\n\t") \
+        "pop %ebx\n\t" \
+        __ASM_CFI(".cfi_adjust_cfa_offset -4\n\t")
+
+#define ADDR_SUFFIX /* empty: symbols are referenced by absolute address */
+
+#elif defined(__x86_64__) && !defined(__arm64ec__)
+
+#define INIT /* prologue: save rbx/rbp/rsi/rdi and xmm6/xmm7, which upsample_sse overwrites */ \
+        "push %rbx\n\t" \
+        __ASM_SEH(".seh_pushreg %rbx\n\t") \
+        __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \
+        __ASM_CFI(".cfi_offset %rbx,-16\n\t") \
+        "push %rbp\n\t" \
+        __ASM_SEH(".seh_pushreg %rbp\n\t") \
+        __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \
+        __ASM_CFI(".cfi_offset %rbp,-24\n\t") \
+        "push %rsi\n\t" \
+        __ASM_SEH(".seh_pushreg %rsi\n\t") \
+        __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \
+        __ASM_CFI(".cfi_offset %rsi,-32\n\t") \
+        "push %rdi\n\t" \
+        __ASM_SEH(".seh_pushreg %rdi\n\t") \
+        __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \
+        __ASM_CFI(".cfi_offset %rdi,-40\n\t") \
+        "sub $40, %rsp\n\t" /* 2 * 16 bytes for xmm6/xmm7 plus 8 spare bytes */ \
+        __ASM_SEH(".seh_stackalloc 40\n\t") \
+        __ASM_CFI(".cfi_adjust_cfa_offset 40\n\t") \
+        "movaps %xmm6, 16(%rsp)\n\t" /* NOTE(review): movaps needs %rsp 16-byte aligned here */ \
+        __ASM_SEH(".seh_savexmm %xmm6, 16\n\t") \
+        __ASM_CFI(".cfi_offset %xmm6, -64\n\t") \
+        "movaps %xmm7, (%rsp)\n\t" \
+        __ASM_SEH(".seh_savexmm %xmm7, 0\n\t") \
+        __ASM_SEH(".seh_endprologue\n\t") \
+        __ASM_CFI(".cfi_offset %xmm7, -80\n\t")
+
+#define CLEANUP /* epilogue: restore everything INIT saved, in reverse order */ \
+        "movaps (%rsp), %xmm7\n\t" \
+        "movaps 16(%rsp), %xmm6\n\t" \
+        "add $40, %rsp\n\t" \
+        __ASM_CFI(".cfi_adjust_cfa_offset -40\n\t") \
+        "pop %rdi\n\t" \
+        __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") \
+        "pop %rsi\n\t" \
+        __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") \
+        "pop %rbp\n\t" \
+        __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") \
+        "pop %rbx\n\t" \
+        __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t")
+
+#define ADDR_SUFFIX "(%rip)" /* symbols are referenced RIP-relative */
+
+#endif
+
 /**
  * Note that this function will overwrite up to FIR_WIDTH - 1 frames before and
  * after output[].
@@ -339,6 +424,183 @@ static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgai
     }
 }
 
+#if defined(__i386__) || (defined(__x86_64__) && !defined(__arm64ec__))
+
+#ifdef __i386__
+/* Argument locations after INIT: 0x10 bytes of pushed registers + the return address. */
+#define IPOS_NUM_ARG "0x14(%esp)"
+#define IPOS_NUM_STEP_ARG "0x18(%esp)"
+#define REM_INV_ARG "0x1c(%esp)"
+#define REM_INV_STEP_ARG "0x20(%esp)"
+#define COUNT_ARG "0x24(%esp)"
+#define INPUT_ARG "0x28(%esp)"
+#define OUTPUT_ARG "0x2c(%esp)"
+/* Working registers used by the upsample_sse body. */
+#define IPOS_NUM_REG "%ecx"
+#define IPOS_NUM_STEP_REG "%edx"
+#define INPUT_REG "%esi"
+#define OUTPUT_REG "%edi"
+#define OUTPUT_END_REG "%ebp"
+#define FIR_REG "%ebx"
+#define TMP_L_REG "%eax"
+#define TMP_REG "%eax" /* same register as TMP_L_REG on i386 */
+
+#else
+/* First four arguments arrive in registers (presumably the Microsoft x64 convention). */
+#define IPOS_NUM_ARG "%ecx"
+#define IPOS_NUM_STEP_ARG "%edx"
+#define REM_INV_ARG "%xmm2"
+#define REM_INV_STEP_ARG "%xmm3"
+#define COUNT_ARG "0x70(%rsp)" /* 0x70 = 0x50 set up by INIT + return address + 0x20 */
+#define INPUT_ARG "0x78(%rsp)"
+#define OUTPUT_ARG "0x80(%rsp)"
+/* Working registers used by the upsample_sse body. */
+#define IPOS_NUM_REG "%ecx"
+#define IPOS_NUM_STEP_REG "%edx"
+#define INPUT_REG "%rsi"
+#define OUTPUT_REG "%rdi"
+#define OUTPUT_END_REG "%rbp"
+#define FIR_REG "%rbx"
+#define TMP_L_REG "%eax" /* low 32 bits of TMP_REG */
+#define TMP_REG "%rax"
+
+#endif
+/* SSE registers shared by both architectures. */
+#define REM_INV_REG "%xmm2"
+#define REM_INV_STEP_REG "%xmm3"
+#define ONE_REG "%xmm1"
+#define REM_REG "%xmm0"
+#define SUMS_REG "%xmm4"
+#define FTMP0_REG "%xmm5"
+#define FTMP1_REG "%xmm6"
+#define FTMP2_REG "%xmm7"
+
+void upsample_sse(DWORD ipos_num, DWORD ipos_num_step, float rem_inv, float rem_inv_step,
+        UINT count, float *input, float *output);
+__ASM_GLOBAL_FUNC(upsample_sse,
+        INIT
+        /* SSE version of the C loop in upsample(): one output sample per pass of loop 1. */
+        "mov " IPOS_NUM_ARG ", " IPOS_NUM_REG "\n\t"
+        /* ipos_num holds the lower 32 bits of the input position. Keep it inverted
+         * so that we don't have to invert it on every iteration of the outer loop. */
+        "not " IPOS_NUM_REG "\n\t"
+        "mov " IPOS_NUM_STEP_ARG ", " IPOS_NUM_STEP_REG "\n\t"
+        /* Broadcast rem_inv and rem_inv_step to all four lanes. */
+        "movss " REM_INV_ARG ", " REM_INV_REG "\n\t"
+        "shufps $0, " REM_INV_REG ", " REM_INV_REG "\n\t"
+        "movss " REM_INV_STEP_ARG ", " REM_INV_STEP_REG "\n\t"
+        "shufps $0, " REM_INV_STEP_REG ", " REM_INV_STEP_REG "\n\t"
+
+        /* INPUT_REG doubles as the upper half of ipos_num: carries out of the
+         * lower half are added directly to the (scaled) input pointer. */
+        "mov " INPUT_ARG ", " INPUT_REG "\n\t"
+        /* Divide the input pointer by 4 to match the scale. We can do this
+         * because the pointer is at least 4-byte aligned. It will be scaled
+         * back during the access in the inner loop. */
+        "shr $2, " INPUT_REG "\n\t"
+
+        "mov " OUTPUT_ARG ", " OUTPUT_REG "\n\t"
+        /* OUTPUT_END_REG = output + count floats; the outer loop stops there. */
+        "mov " COUNT_ARG ", " TMP_L_REG "\n\t"
+        "lea (" OUTPUT_REG "," TMP_REG ",4), " OUTPUT_END_REG "\n\t"
+
+        "movaps " __ASM_NAME("one") ADDR_SUFFIX ", " ONE_REG "\n\t"
+        /* Outer loop: one output sample per iteration. */
+        ".p2align 4,,10\n\t"
+        ".p2align 3\n\t"
+        "1:\n\t"
+        /* Calculate idx, scaled to a byte offset into fir[]. */
+        "mov " IPOS_NUM_REG ", " TMP_L_REG "\n\t"
+        "shr $(32 - " EXPAND_STR(FIR_STEP_SHIFT) "), " TMP_REG "\n\t"
+        "shl $(" EXPAND_STR(FIR_WIDTH_SHIFT) " + 2), " TMP_REG "\n\t"
+        /* Calculate the FIR address base. */
+        "lea " __ASM_NAME("fir") ADDR_SUFFIX ", " FIR_REG "\n\t"
+        "add " TMP_REG ", " FIR_REG "\n\t"
+
+        /* Calculate rem = 1 - rem_inv. */
+        "movups " ONE_REG ", " REM_REG "\n\t"
+        "subps " REM_INV_REG ", " REM_REG "\n\t"
+
+        /* Initialize j (a byte offset, advanced by 16 per inner iteration). */
+        "xor " TMP_REG ", " TMP_REG "\n\t"
+        /* Initialize the sums. */
+        "xorps " SUMS_REG ", " SUMS_REG "\n\t"
+        /* Inner loop: four FIR taps per iteration. */
+        ".p2align 4,,10\n\t"
+        ".p2align 3\n\t"
+        "2:\n\t"
+        /* Load the FIR coefficients at idx + j and FIR_WIDTH floats further on. */
+        "movaps (" FIR_REG "," TMP_REG "), " FTMP0_REG "\n\t"
+        "movaps " EXPAND_STR(FIR_WIDTH) " * 4(" FIR_REG "," TMP_REG "), " FTMP1_REG "\n\t"
+        /* Load the input values. */
+        "movups (" TMP_REG "," INPUT_REG ",4), " FTMP2_REG "\n\t"
+        "add $16, " TMP_REG "\n\t"
+        /* Interpolate the FIR coefficients: c0 * rem_inv + c1 * rem. */
+        "mulps " REM_INV_REG ", " FTMP0_REG "\n\t"
+        "mulps " REM_REG ", " FTMP1_REG "\n\t"
+        "addps " FTMP0_REG ", " FTMP1_REG "\n\t"
+        /* Multiply the input values by the interpolated coefficients. */
+        "mulps " FTMP2_REG ", " FTMP1_REG "\n\t"
+        /* Accumulate the results. */
+        "addps " FTMP1_REG ", " SUMS_REG "\n\t"
+        "cmp $(" EXPAND_STR(FIR_WIDTH) " * 4), " TMP_REG "\n\t"
+        "jl 2b\n\t"
+
+        /* Update rem_inv, subtracting 1 from it once it reaches 1. */
+        "addps " REM_INV_STEP_REG ", " REM_INV_REG "\n\t"
+        "movups " ONE_REG ", " FTMP0_REG "\n\t"
+        "cmpleps " REM_INV_REG ", " FTMP0_REG "\n\t"
+        "andps " ONE_REG ", " FTMP0_REG "\n\t"
+        "subps " FTMP0_REG ", " REM_INV_REG "\n\t"
+
+        /* Update ipos_num. Use subtraction for the lower half as it is stored
+         * inverted. */
+        "sub " IPOS_NUM_STEP_REG ", " IPOS_NUM_REG "\n\t"
+        "adc $0, " INPUT_REG "\n\t"
+
+        /* Add the even-numbered sums to the odd-numbered ones. */
+        "movups " SUMS_REG ", " FTMP0_REG "\n\t"
+        "shufps $0x31, " FTMP0_REG ", " FTMP0_REG "\n\t"
+        "addps " FTMP0_REG ", " SUMS_REG "\n\t"
+        /* Calculate the final sum and store it to the output array. */
+        "movhlps " SUMS_REG ", " FTMP0_REG "\n\t"
+        "addss " FTMP0_REG ", " SUMS_REG "\n\t"
+        "movss " SUMS_REG ", (" OUTPUT_REG ")\n\t"
+        /* Advance the output pointer. Addresses are unsigned, so compare with jb.
+         * The test is at the bottom: the body runs once even if count is 0. */
+        "add $4, " OUTPUT_REG "\n\t"
+        "cmp " OUTPUT_END_REG ", " OUTPUT_REG "\n\t"
+        "jb 1b\n\t"
+
+        CLEANUP
+        "ret")
+
+#undef IPOS_NUM_ARG
+#undef IPOS_NUM_STEP_ARG
+#undef REM_INV_ARG
+#undef REM_INV_STEP_ARG
+#undef COUNT_ARG
+#undef INPUT_ARG
+#undef OUTPUT_ARG
+#undef IPOS_NUM_REG
+#undef IPOS_NUM_STEP_REG
+#undef INPUT_REG
+#undef OUTPUT_REG
+#undef OUTPUT_END_REG
+#undef FIR_REG
+#undef TMP_L_REG
+#undef TMP_REG
+#undef REM_INV_REG
+#undef REM_INV_STEP_REG
+#undef ONE_REG
+#undef REM_REG
+#undef SUMS_REG
+#undef FTMP0_REG
+#undef FTMP1_REG
+#undef FTMP2_REG
+
+#endif
+
 static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, float *input,
         float *output)
 {
@@ -366,8 +628,19 @@ static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, fl
 
     float rem_inv = FIXED_0_32_TO_FLOAT((DWORD)ipos_num << FIR_STEP_SHIFT);
     float rem_inv_step = FIXED_0_32_TO_FLOAT(ipos_num_step << FIR_STEP_SHIFT);
+
+#if defined(__x86_64__) && !defined(__arm64ec__)
+    upsample_sse((DWORD)ipos_num, ipos_num_step, rem_inv, rem_inv_step, count, input, output);
+#else
     UINT i;
 
+#ifdef __i386__
+    if (sse_supported) {
+        upsample_sse((DWORD)ipos_num, ipos_num_step, rem_inv, rem_inv_step, count, input, output);
+        return;
+    }
+#endif
+
     for(i = 0; i < count; ++i) {
         UINT ipos = ipos_num >> FREQ_ADJUST_SHIFT;
         UINT idx = ~(DWORD)ipos_num >> (FREQ_ADJUST_SHIFT - FIR_STEP_SHIFT) << FIR_WIDTH_SHIFT;
@@ -386,8 +659,13 @@ static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, fl
 
         ipos_num += ipos_num_step;
     }
+#endif
 }
 
+#undef INIT
+#undef CLEANUP
+#undef ADDR_SUFFIX
+
 /**
  * Note that this function will overwrite up to FIR_WIDTH - 1 frames before and
  * after output[].
-- 
GitLab


https://gitlab.winehq.org/wine/wine/-/merge_requests/10716