[PATCH v2 4/5] dsound: Add an SSE version of upsample.

May 3, 2026

From: Anton Baskanov <baskanov@gmail.com>

---
 dlls/dsound/dsound_main.c    |  12 ++
 dlls/dsound/dsound_private.h |   4 +
 dlls/dsound/fir.h            |   4 +-
 dlls/dsound/mixer.c          | 244 +++++++++++++++++++++++++++++++++++
 4 files changed, 262 insertions(+), 2 deletions(-)

diff --git a/dlls/dsound/dsound_main.c b/dlls/dsound/dsound_main.c
index 8936b437ba2..784aac62209 100644
--- a/dlls/dsound/dsound_main.c
+++ b/dlls/dsound/dsound_main.c
@@ -63,6 +63,10 @@
 
 WINE_DEFAULT_DEBUG_CHANNEL(dsound);
 
+#if defined(__i386__) || (defined(__x86_64__) && !defined(__arm64ec__))
+BOOL sse_supported; /* Set by init_cpu_features(); non-zero when SSE instructions are available. */
+#endif
+
 struct list DSOUND_renderers = LIST_INIT(DSOUND_renderers);
 CRITICAL_SECTION DSOUND_renderers_lock;
 static CRITICAL_SECTION_DEBUG DSOUND_renderers_lock_debug =
@@ -82,6 +86,13 @@ GUID *DSOUND_capture_guids;
 /* All default settings, you most likely don't want to touch these, see wiki on UsefulRegistryKeys */
 int ds_hel_buflen = 32768 * 2;
 
+static void init_cpu_features(void) /* Record optional CPU features; invoked from DllMain. */
+{
+#if defined(__i386__) || (defined(__x86_64__) && !defined(__arm64ec__))
+    sse_supported = IsProcessorFeaturePresent(PF_XMMI_INSTRUCTIONS_AVAILABLE); /* "XMMI" is the SSE feature flag */
+#endif /* other targets: nothing to detect, the function body is empty */
+}
+
 /*
  * Get a config key from either the app-specific or the default config
  */
@@ -787,6 +798,7 @@ BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpvReserved)
         DisableThreadLibraryCalls(hInstDLL);
         /* Increase refcount on dsound by 1 */
         GetModuleHandleExW(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS, (LPCWSTR)hInstDLL, &hInstDLL);
+        init_cpu_features();
         break;
     case DLL_PROCESS_DETACH:
         if (lpvReserved) break;
diff --git a/dlls/dsound/dsound_private.h b/dlls/dsound/dsound_private.h
index 0e695698046..f7380f5e426 100644
--- a/dlls/dsound/dsound_private.h
+++ b/dlls/dsound/dsound_private.h
@@ -251,6 +251,10 @@ HRESULT IDirectSoundCaptureImpl_Create(IUnknown *outer_unk, REFIID riid, void **
 #define STATE_CAPTURING 2
 #define STATE_STOPPING  3
 
+#if defined(__i386__) || (defined(__x86_64__) && !defined(__arm64ec__))
+extern BOOL sse_supported; /* Defined in dsound_main.c; tested by upsample() in mixer.c. */
+#endif
+
 extern CRITICAL_SECTION DSOUND_renderers_lock;
 extern struct list DSOUND_renderers;
 
diff --git a/dlls/dsound/fir.h b/dlls/dsound/fir.h
index 76ac521e0f3..39a32af1412 100644
--- a/dlls/dsound/fir.h
+++ b/dlls/dsound/fir.h
@@ -90,7 +90,7 @@ int main()
     printf("#define FIR_WIDTH %d\n", fir_width);
     printf("#define FIR_STEP_SHIFT %d\n", fir_step_shift);
     printf("#define FIR_STEP %d\n", fir_step);
-    printf("static const float fir[] = {");
+    printf("static const float DECLSPEC_ALIGN(16) fir[] = {");
     // Print the FIR array with an additional row at the end. This simplifies
     // calculation of the interpolated value by allowing the index to overflow
     // into the extra row. It just repeats the first row, starting from its
@@ -118,7 +118,7 @@ int main()
 #define FIR_WIDTH 64
 #define FIR_STEP_SHIFT 7
 #define FIR_STEP 128
-static const float fir[] = {
+static const float DECLSPEC_ALIGN(16) fir[] = {
      0.0000000000e+00, -2.4830013102e-06,  1.9318705150e-06,  2.6614854151e-06,
     -1.5313785194e-05,  4.2076214553e-05, -9.1417167945e-05,  1.7455895136e-04,
     -3.0567859821e-04,  5.0191365396e-04, -7.8311909082e-04,  1.1713337628e-03,
diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c
index 1b4b1c7bd7a..de51060db6f 100644
--- a/dlls/dsound/mixer.c
+++ b/dlls/dsound/mixer.c
@@ -34,6 +34,7 @@
 #include "wingdi.h"
 #include "mmreg.h"
 #include "wine/debug.h"
+#include "wine/asm.h"
 #include "dsound.h"
 #include "ks.h"
 #include "ksmedia.h"
@@ -43,8 +44,17 @@
 WINE_DEFAULT_DEBUG_CHANNEL(dsound);
 
 #define FREQ_ADJUST_SHIFT 32
+#define SIZEOF_FLOAT_SHIFT 2 /* log2(sizeof(float)); turns a float index into a byte offset in the asm below */
 #define FIXED_0_32_TO_FLOAT(x) ((int)((x) >> 1) * (1.0f / (1ll << 31)))
 
+#define STR(a) #a /* Stringify the argument exactly as written. */
+#define EXPAND_STR(a) STR(a) /* Stringify after macro expansion, so FIR_WIDTH etc. become digits in asm text. */
+
+static const float __attribute__((used, aligned(16))) one[] = /* Loaded with movaps by upsample_sse. */
+{ /* `used` keeps the array emitted when only assembly text names it; movaps needs 16-byte alignment. */
+    1.0f, 1.0f, 1.0f, 1.0f,
+};
+
 void DSOUND_RecalcVolPan(PDSVOLUMEPAN volpan)
 {
 	double temp;
@@ -339,6 +349,232 @@ static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgai
     }
 }
 
+#if defined(__i386__) || (defined(__x86_64__) && !defined(__arm64ec__))
+
+/* XMM register roles. On x86_64 rem_inv and rem_inv_step are passed in registers
+ * so just leave them there; on i386 INIT loads them from the stack instead. */
+#define REM_INV_REG "%xmm2" /* rem_inv, broadcast to all four lanes */
+#define REM_INV_STEP_REG "%xmm3" /* rem_inv_step, broadcast to all four lanes */
+#define ONE_REG "%xmm1" /* four copies of 1.0f, loaded from one[] */
+#define REM_REG "%xmm0" /* 1 - rem_inv, recomputed for every output sample */
+#define SUM_REG "%xmm4" /* four partial sums of the current output sample */
+#define FTMP0_REG "%xmm5" /* scratch */
+#define FTMP1_REG "%xmm6" /* scratch; saved and restored by INIT/CLEANUP on x86_64 */
+#define FTMP2_REG "%xmm7" /* scratch; saved and restored by INIT/CLEANUP on x86_64 */
+
+#ifdef __i386__
+
+#define IPOS_NUM_INV_REG "%ecx" /* inverted lower 32 bits of ipos_num */
+#define IPOS_NUM_STEP_REG "%edx" /* amount subtracted from IPOS_NUM_INV_REG per output sample */
+#define INPUT_REG "%esi" /* input pointer / sizeof(float); receives the carry from the lower part */
+#define OUTPUT_REG "%edi" /* address of the next output sample */
+#define OUTPUT_END_REG "%ebp" /* address one past the last output sample */
+#define FIR_REG "%ebx" /* address of the fir row selected for the current sample */
+#define TMP_L_REG "%eax" /* 32-bit view of TMP_REG (the same register on i386) */
+#define TMP_REG "%eax" /* idx scratch, then the byte offset j of the inner loop */
+
+#define INIT /* i386 prologue: save the four integer registers used below, then load all arguments. */ \
+        "push %ebx\n\t" \
+        __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \
+        __ASM_CFI(".cfi_offset %ebx,-8\n\t") \
+        "push %ebp\n\t" \
+        __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \
+        __ASM_CFI(".cfi_offset %ebp,-12\n\t") \
+        "push %esi\n\t" \
+        __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \
+        __ASM_CFI(".cfi_offset %esi,-16\n\t") \
+        "push %edi\n\t" \
+        __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \
+        __ASM_CFI(".cfi_offset %edi,-20\n\t") \
+        "mov 0x14(%esp), " IPOS_NUM_INV_REG "\n\t" /* 0x14 = four pushes (16) + return address (4) */ \
+        "mov 0x18(%esp), " IPOS_NUM_STEP_REG "\n\t" \
+        "movss 0x1c(%esp), " REM_INV_REG "\n\t" /* scalar load; broadcast later with shufps */ \
+        "movss 0x20(%esp), " REM_INV_STEP_REG "\n\t" \
+        "mov 0x24(%esp), " INPUT_REG "\n\t" \
+        "mov 0x28(%esp), " OUTPUT_REG "\n\t" \
+        "mov 0x2c(%esp), " OUTPUT_END_REG "\n\t"
+
+#define CLEANUP /* i386 epilogue: pop the saved registers in the reverse order of INIT. */ \
+        "pop %edi\n\t" \
+        __ASM_CFI(".cfi_adjust_cfa_offset -4\n\t") \
+        "pop %esi\n\t" \
+        __ASM_CFI(".cfi_adjust_cfa_offset -4\n\t") \
+        "pop %ebp\n\t" \
+        __ASM_CFI(".cfi_adjust_cfa_offset -4\n\t") \
+        "pop %ebx\n\t" \
+        __ASM_CFI(".cfi_adjust_cfa_offset -4\n\t")
+
+#define ADDR_SUFFIX /* empty on i386: one[] and fir[] are named without a base register */
+
+#else
+
+/* On x86_64 ipos_num_inv and ipos_num_step are passed in registers so just
+ * leave them there. Pointer-sized values use the full 64-bit registers. */
+#define IPOS_NUM_INV_REG "%ecx" /* inverted lower 32 bits of ipos_num */
+#define IPOS_NUM_STEP_REG "%edx" /* amount subtracted from IPOS_NUM_INV_REG per output sample */
+#define INPUT_REG "%rsi" /* input pointer / sizeof(float); receives the carry from the lower part */
+#define OUTPUT_REG "%rdi" /* address of the next output sample */
+#define OUTPUT_END_REG "%rbp" /* address one past the last output sample */
+#define FIR_REG "%rbx" /* address of the fir row selected for the current sample */
+#define TMP_L_REG "%eax" /* 32-bit view of TMP_REG; a 32-bit write zero-extends into %rax */
+#define TMP_REG "%rax" /* idx scratch, then the byte offset j of the inner loop */
+
+#define INIT /* x86_64 prologue: save four integer registers and %xmm6/%xmm7, then load the stack arguments. */ \
+        "push %rbx\n\t" \
+        __ASM_SEH(".seh_pushreg %rbx\n\t") \
+        __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \
+        __ASM_CFI(".cfi_offset %rbx,-16\n\t") \
+        "push %rbp\n\t" \
+        __ASM_SEH(".seh_pushreg %rbp\n\t") \
+        __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \
+        __ASM_CFI(".cfi_offset %rbp,-24\n\t") \
+        "push %rsi\n\t" \
+        __ASM_SEH(".seh_pushreg %rsi\n\t") \
+        __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \
+        __ASM_CFI(".cfi_offset %rsi,-32\n\t") \
+        "push %rdi\n\t" \
+        __ASM_SEH(".seh_pushreg %rdi\n\t") \
+        __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \
+        __ASM_CFI(".cfi_offset %rdi,-40\n\t") \
+        "sub $40, %rsp\n\t" /* 2 x 16 bytes for %xmm6/%xmm7; the extra 8 presumably keeps %rsp 16-byte aligned for movaps */ \
+        __ASM_SEH(".seh_stackalloc 40\n\t") \
+        __ASM_CFI(".cfi_adjust_cfa_offset 40\n\t") \
+        "movaps %xmm6, 16(%rsp)\n\t" \
+        __ASM_SEH(".seh_savexmm %xmm6, 16\n\t") \
+        __ASM_CFI(".cfi_offset %xmm6, -64\n\t") \
+        "movaps %xmm7, (%rsp)\n\t" \
+        __ASM_SEH(".seh_savexmm %xmm7, 0\n\t") \
+        __ASM_SEH(".seh_endprologue\n\t") \
+        __ASM_CFI(".cfi_offset %xmm7, -80\n\t") \
+        "mov 0x70(%rsp), " INPUT_REG "\n\t" /* 0x70 = 72 bytes of frame + return address (8) + 32 (presumably the Win64 shadow space) */ \
+        "mov 0x78(%rsp), " OUTPUT_REG "\n\t" \
+        "mov 0x80(%rsp), " OUTPUT_END_REG "\n\t"
+
+#define CLEANUP /* x86_64 epilogue: restore %xmm7/%xmm6, release the 40 bytes, pop in the reverse order of INIT. */ \
+        "movaps (%rsp), %xmm7\n\t" \
+        "movaps 16(%rsp), %xmm6\n\t" \
+        "add $40, %rsp\n\t" \
+        __ASM_CFI(".cfi_adjust_cfa_offset -40\n\t") \
+        "pop %rdi\n\t" \
+        __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") \
+        "pop %rsi\n\t" \
+        __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") \
+        "pop %rbp\n\t" \
+        __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") \
+        "pop %rbx\n\t" \
+        __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t")
+
+#define ADDR_SUFFIX "(%rip)" /* one[] and fir[] are addressed RIP-relative on x86_64 */
+
+#endif
+
+/* SSE upsampler: each outer iteration stores one filtered sample at output.
+ * ipos_num_inv is the inverted lower part of ipos_num. We store it inverted so
+ * that we don't have to invert it on every iteration of the outer loop.
+ * input is the input pointer divided by sizeof(float) to match the scale of
+ * ipos_num; it doubles as the upper part of ipos_num, which saves a register.
+ * The end test is at the bottom of the loop: output < output_end is required. */
+void upsample_sse(DWORD ipos_num_inv, DWORD ipos_num_step, float rem_inv, float rem_inv_step,
+        DWORD_PTR input, float *output, float *output_end);
+__ASM_GLOBAL_FUNC(upsample_sse,
+        INIT
+
+        "shufps $0, " REM_INV_REG ", " REM_INV_REG "\n\t" /* broadcast lane 0 to all four lanes */
+        "shufps $0, " REM_INV_STEP_REG ", " REM_INV_STEP_REG "\n\t"
+
+        "movaps " __ASM_NAME("one") ADDR_SUFFIX ", " ONE_REG "\n\t"
+
+        ".p2align 4,,10\n\t"
+        ".p2align 3\n\t"
+        "1:\n\t"
+        /* Calculate idx as a byte offset into fir from the top bits of ipos_num_inv. */
+        "mov " IPOS_NUM_INV_REG ", " TMP_L_REG "\n\t"
+        "shr $(" EXPAND_STR(FREQ_ADJUST_SHIFT) " - " EXPAND_STR(FIR_STEP_SHIFT) "), " TMP_REG "\n\t"
+        "shl $(" EXPAND_STR(FIR_WIDTH_SHIFT) " + " EXPAND_STR(SIZEOF_FLOAT_SHIFT) "), " TMP_REG "\n\t"
+        /* Calculate the FIR address base. */
+        "lea " __ASM_NAME("fir") ADDR_SUFFIX ", " FIR_REG "\n\t"
+        "add " TMP_REG ", " FIR_REG "\n\t"
+
+        /* Calculate rem = 1 - rem_inv. */
+        "movups " ONE_REG ", " REM_REG "\n\t"
+        "subps " REM_INV_REG ", " REM_REG "\n\t"
+
+        /* Initialize j. */
+        "xor " TMP_REG ", " TMP_REG "\n\t"
+        /* Initialize the sums. */
+        "xorps " SUM_REG ", " SUM_REG "\n\t"
+
+        ".p2align 4,,10\n\t"
+        ".p2align 3\n\t"
+        "2:\n\t"
+        /* Load the FIR coefficients of this row and of the following one. */
+        "movaps (" FIR_REG "," TMP_REG "), " FTMP0_REG "\n\t"
+        "movaps " EXPAND_STR(FIR_WIDTH) " * 4(" FIR_REG "," TMP_REG "), " FTMP1_REG "\n\t"
+        /* Load the input values; input is pre-divided by sizeof(float), hence the scale of 4. */
+        "movups (" TMP_REG "," INPUT_REG ",4), " FTMP2_REG "\n\t"
+        "add $16, " TMP_REG "\n\t"
+        /* Interpolate the FIR coefficients. */
+        "mulps " REM_INV_REG ", " FTMP0_REG "\n\t"
+        "mulps " REM_REG ", " FTMP1_REG "\n\t"
+        "addps " FTMP0_REG ", " FTMP1_REG "\n\t"
+        /* Multiply the input values by the interpolated coefficients. */
+        "mulps " FTMP2_REG ", " FTMP1_REG "\n\t"
+        /* Accumulate the results. */
+        "addps " FTMP1_REG ", " SUM_REG "\n\t"
+        "cmp $(" EXPAND_STR(FIR_WIDTH) " * 4), " TMP_REG "\n\t"
+        "jl 2b\n\t"
+
+        /* Update rem_inv, subtracting 1 from every lane that reached 1 or more. */
+        "addps " REM_INV_STEP_REG ", " REM_INV_REG "\n\t"
+        "movups " ONE_REG ", " FTMP0_REG "\n\t"
+        "cmpleps " REM_INV_REG ", " FTMP0_REG "\n\t"
+        "andps " ONE_REG ", " FTMP0_REG "\n\t"
+        "subps " FTMP0_REG ", " REM_INV_REG "\n\t"
+
+        /* Update ipos_num. Use subtraction for the lower half as it is stored
+         * inverted. */
+        "sub " IPOS_NUM_STEP_REG ", " IPOS_NUM_INV_REG "\n\t"
+        "adc $0, " INPUT_REG "\n\t" /* a borrow on the inverted lower half is a carry into the upper half */
+
+        /* Add the even-numbered sums to the odd-numbered ones. */
+        "movups " SUM_REG ", " FTMP0_REG "\n\t"
+        "shufps $0x31, " FTMP0_REG ", " FTMP0_REG "\n\t"
+        "addps " FTMP0_REG ", " SUM_REG "\n\t"
+        /* Calculate the final sum and store it to the output array. */
+        "movhlps " SUM_REG ", " FTMP0_REG "\n\t"
+        "addss " FTMP0_REG ", " SUM_REG "\n\t"
+        "movss " SUM_REG ", (" OUTPUT_REG ")\n\t"
+
+        /* Advance the output pointer; addresses must be compared as unsigned values. */
+        "add $4, " OUTPUT_REG "\n\t"
+        "cmp " OUTPUT_END_REG ", " OUTPUT_REG "\n\t"
+        "jb 1b\n\t"
+
+        CLEANUP
+        "ret")
+
+#undef REM_INV_REG /* The register and prologue macros only serve upsample_sse; drop them all here. */
+#undef REM_INV_STEP_REG
+#undef ONE_REG
+#undef REM_REG
+#undef SUM_REG
+#undef FTMP0_REG
+#undef FTMP1_REG
+#undef FTMP2_REG
+#undef IPOS_NUM_INV_REG
+#undef IPOS_NUM_STEP_REG
+#undef INPUT_REG
+#undef OUTPUT_REG
+#undef OUTPUT_END_REG
+#undef FIR_REG
+#undef TMP_L_REG
+#undef TMP_REG
+#undef INIT
+#undef CLEANUP
+#undef ADDR_SUFFIX
+
+#endif
+
 static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, float *input,
         float *output)
 {
@@ -368,6 +604,14 @@ static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, fl
     float rem_inv_step = FIXED_0_32_TO_FLOAT(ipos_num_step << FIR_STEP_SHIFT);
     UINT i;
 
+#if defined(__i386__) || (defined(__x86_64__) && !defined(__arm64ec__))
+    if (sse_supported) {
+        upsample_sse(~(DWORD)ipos_num, ipos_num_step, rem_inv, rem_inv_step,
+                (DWORD_PTR)input / sizeof(float), output, output + count);
+        return;
+    }
+#endif
+
     for(i = 0; i < count; ++i) {
         UINT ipos = ipos_num >> FREQ_ADJUST_SHIFT;
         UINT idx = ~(DWORD)ipos_num >> (FREQ_ADJUST_SHIFT - FIR_STEP_SHIFT) << FIR_WIDTH_SHIFT;
-- 
GitLab


https://gitlab.winehq.org/wine/wine/-/merge_requests/10716