From: Anton Baskanov <baskanov@gmail.com> --- dlls/dsound/dsound_main.c | 12 ++ dlls/dsound/dsound_private.h | 4 + dlls/dsound/fir.h | 4 +- dlls/dsound/mixer.c | 278 +++++++++++++++++++++++++++++++++++ 4 files changed, 296 insertions(+), 2 deletions(-) diff --git a/dlls/dsound/dsound_main.c b/dlls/dsound/dsound_main.c index 8936b437ba2..dbcf5a79899 100644 --- a/dlls/dsound/dsound_main.c +++ b/dlls/dsound/dsound_main.c @@ -63,6 +63,10 @@ WINE_DEFAULT_DEBUG_CHANNEL(dsound); +#ifdef __i386__ +BOOL sse_supported; +#endif + struct list DSOUND_renderers = LIST_INIT(DSOUND_renderers); CRITICAL_SECTION DSOUND_renderers_lock; static CRITICAL_SECTION_DEBUG DSOUND_renderers_lock_debug = @@ -82,6 +86,13 @@ GUID *DSOUND_capture_guids; /* All default settings, you most likely don't want to touch these, see wiki on UsefulRegistryKeys */ int ds_hel_buflen = 32768 * 2; +static void init_cpu_features(void) +{ +#ifdef __i386__ + sse_supported = IsProcessorFeaturePresent(PF_XMMI_INSTRUCTIONS_AVAILABLE); +#endif +} + /* * Get a config key from either the app-specific or the default config */ @@ -787,6 +798,7 @@ BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpvReserved) DisableThreadLibraryCalls(hInstDLL); /* Increase refcount on dsound by 1 */ GetModuleHandleExW(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS, (LPCWSTR)hInstDLL, &hInstDLL); + init_cpu_features(); break; case DLL_PROCESS_DETACH: if (lpvReserved) break; diff --git a/dlls/dsound/dsound_private.h b/dlls/dsound/dsound_private.h index 0e695698046..b13c3858e44 100644 --- a/dlls/dsound/dsound_private.h +++ b/dlls/dsound/dsound_private.h @@ -251,6 +251,10 @@ HRESULT IDirectSoundCaptureImpl_Create(IUnknown *outer_unk, REFIID riid, void ** #define STATE_CAPTURING 2 #define STATE_STOPPING 3 +#ifdef __i386__ +extern BOOL sse_supported; +#endif + extern CRITICAL_SECTION DSOUND_renderers_lock; extern struct list DSOUND_renderers; diff --git a/dlls/dsound/fir.h b/dlls/dsound/fir.h index 76ac521e0f3..3b9bccbdb83 100644 --- 
a/dlls/dsound/fir.h +++ b/dlls/dsound/fir.h @@ -90,7 +90,7 @@ int main() printf("#define FIR_WIDTH %d\n", fir_width); printf("#define FIR_STEP_SHIFT %d\n", fir_step_shift); printf("#define FIR_STEP %d\n", fir_step); - printf("static const float fir[] = {"); + printf("static const float __attribute__((used, aligned(16))) fir[] = {"); // Print the FIR array with an additional row at the end. This simplifies // calculation of the interpolated value by allowing the index to overflow // into the extra row. It just repeats the first row, starting from its @@ -118,7 +118,7 @@ int main() #define FIR_WIDTH 64 #define FIR_STEP_SHIFT 7 #define FIR_STEP 128 -static const float fir[] = { +static const float __attribute__((used, aligned(16))) fir[] = { 0.0000000000e+00, -2.4830013102e-06, 1.9318705150e-06, 2.6614854151e-06, -1.5313785194e-05, 4.2076214553e-05, -9.1417167945e-05, 1.7455895136e-04, -3.0567859821e-04, 5.0191365396e-04, -7.8311909082e-04, 1.1713337628e-03, diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c index 1b4b1c7bd7a..78a1ef2bb33 100644 --- a/dlls/dsound/mixer.c +++ b/dlls/dsound/mixer.c @@ -34,6 +34,7 @@ #include "wingdi.h" #include "mmreg.h" #include "wine/debug.h" +#include "wine/asm.h" #include "dsound.h" #include "ks.h" #include "ksmedia.h" @@ -45,6 +46,14 @@ WINE_DEFAULT_DEBUG_CHANNEL(dsound); #define FREQ_ADJUST_SHIFT 32 #define FIXED_0_32_TO_FLOAT(x) ((int)((x) >> 1) * (1.0f / (1ll << 31))) +#define STR(a) #a +#define EXPAND_STR(a) STR(a) + +static const float __attribute__((used, aligned(16))) one[] = +{ + 1.0f, 1.0f, 1.0f, 1.0f, +}; + void DSOUND_RecalcVolPan(PDSVOLUMEPAN volpan) { double temp; @@ -284,6 +293,82 @@ static inline float get_current_sample(const IDirectSoundBufferImpl *dsb, return dsb->get(dsb, buffer + (mixpos % buflen), channel); } +#ifdef __i386__ + +#define INIT \ + "push %ebx\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \ + __ASM_CFI(".cfi_offset %ebx,-8\n\t") \ + "push %ebp\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset 
4\n\t") \ + __ASM_CFI(".cfi_offset %ebp,-12\n\t") \ + "push %esi\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \ + __ASM_CFI(".cfi_offset %esi,-16\n\t") \ + "push %edi\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \ + __ASM_CFI(".cfi_offset %edi,-20\n\t") + +#define CLEANUP \ + "pop %edi\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -4\n\t") \ + "pop %esi\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -4\n\t") \ + "pop %ebp\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -4\n\t") \ + "pop %ebx\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -4\n\t") + +#define ADDR_SUFFIX + +#elif defined(__x86_64__) && !defined(__arm64ec__) + +#define INIT \ + "push %rbx\n\t" \ + __ASM_SEH(".seh_pushreg %rbx\n\t") \ + __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \ + __ASM_CFI(".cfi_offset %rbx,-16\n\t") \ + "push %rbp\n\t" \ + __ASM_SEH(".seh_pushreg %rbp\n\t") \ + __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \ + __ASM_CFI(".cfi_offset %rbp,-24\n\t") \ + "push %rsi\n\t" \ + __ASM_SEH(".seh_pushreg %rsi\n\t") \ + __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \ + __ASM_CFI(".cfi_offset %rsi,-32\n\t") \ + "push %rdi\n\t" \ + __ASM_SEH(".seh_pushreg %rdi\n\t") \ + __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \ + __ASM_CFI(".cfi_offset %rdi,-40\n\t") \ + "sub $40, %rsp\n\t" \ + __ASM_SEH(".seh_stackalloc 40\n\t") \ + __ASM_CFI(".cfi_adjust_cfa_offset 40\n\t") \ + "movaps %xmm6, 16(%rsp)\n\t" \ + __ASM_SEH(".seh_savexmm %xmm6, 16\n\t") \ + __ASM_CFI(".cfi_offset %xmm6, -64\n\t") \ + "movaps %xmm7, (%rsp)\n\t" \ + __ASM_SEH(".seh_savexmm %xmm7, 0\n\t") \ + __ASM_SEH(".seh_endprologue\n\t") \ + __ASM_CFI(".cfi_offset %xmm7, -80\n\t") + +#define CLEANUP \ + "movaps (%rsp), %xmm7\n\t" \ + "movaps 16(%rsp), %xmm6\n\t" \ + "add $40, %rsp\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -40\n\t") \ + "pop %rdi\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") \ + "pop %rsi\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") \ + "pop %rbp\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") \ + "pop %rbx\n\t" \ + 
__ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") + +#define ADDR_SUFFIX "(%rip)" + +#endif + /** * Note that this function will overwrite up to FIR_WIDTH - 1 frames before and * after output[]. @@ -339,6 +424,183 @@ static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgai } } +#if defined(__i386__) || (defined(__x86_64__) && !defined(__arm64ec__)) + +#ifdef __i386__ + +#define IPOS_NUM_ARG "0x14(%esp)" +#define IPOS_NUM_STEP_ARG "0x18(%esp)" +#define REM_INV_ARG "0x1c(%esp)" +#define REM_INV_STEP_ARG "0x20(%esp)" +#define COUNT_ARG "0x24(%esp)" +#define INPUT_ARG "0x28(%esp)" +#define OUTPUT_ARG "0x2c(%esp)" + +#define IPOS_NUM_REG "%ecx" +#define IPOS_NUM_STEP_REG "%edx" +#define INPUT_REG "%esi" +#define OUTPUT_REG "%edi" +#define OUTPUT_END_REG "%ebp" +#define FIR_REG "%ebx" +#define TMP_L_REG "%eax" +#define TMP_REG "%eax" + +#else + +#define IPOS_NUM_ARG "%ecx" +#define IPOS_NUM_STEP_ARG "%edx" +#define REM_INV_ARG "%xmm2" +#define REM_INV_STEP_ARG "%xmm3" +#define COUNT_ARG "0x70(%rsp)" +#define INPUT_ARG "0x78(%rsp)" +#define OUTPUT_ARG "0x80(%rsp)" + +#define IPOS_NUM_REG "%ecx" +#define IPOS_NUM_STEP_REG "%edx" +#define INPUT_REG "%rsi" +#define OUTPUT_REG "%rdi" +#define OUTPUT_END_REG "%rbp" +#define FIR_REG "%rbx" +#define TMP_L_REG "%eax" +#define TMP_REG "%rax" + +#endif + +#define REM_INV_REG "%xmm2" +#define REM_INV_STEP_REG "%xmm3" +#define ONE_REG "%xmm1" +#define REM_REG "%xmm0" +#define SUMS_REG "%xmm4" +#define FTMP0_REG "%xmm5" +#define FTMP1_REG "%xmm6" +#define FTMP2_REG "%xmm7" + +void upsample_sse(DWORD ipos_num, DWORD ipos_num_step, float rem_inv, float rem_inv_step, + UINT count, float *input, float *output); +__ASM_GLOBAL_FUNC(upsample_sse, + INIT + + "mov " IPOS_NUM_ARG ", " IPOS_NUM_REG "\n\t" + /* Store the lower half of ipos_num inverted so that we don't have to + * invert it on every iteration of the outer loop. 
*/ + "not " IPOS_NUM_REG "\n\t" + "mov " IPOS_NUM_STEP_ARG ", " IPOS_NUM_STEP_REG "\n\t" + + "movss " REM_INV_ARG ", " REM_INV_REG "\n\t" + "shufps $0, " REM_INV_REG ", " REM_INV_REG "\n\t" + "movss " REM_INV_STEP_ARG ", " REM_INV_STEP_REG "\n\t" + "shufps $0, " REM_INV_STEP_REG ", " REM_INV_STEP_REG "\n\t" + + /* Combine the upper half of ipos_num and the input pointer into a + * single value. */ + "mov " INPUT_ARG ", " INPUT_REG "\n\t" + /* Divide the input pointer by 4 to match the scale. We can do this + * because the pointer is at least 4-byte aligned. It will be scaled + * back during the access in the inner loop. */ + "shr $2, " INPUT_REG "\n\t" + + "mov " OUTPUT_ARG ", " OUTPUT_REG "\n\t" + + "mov " COUNT_ARG ", " TMP_L_REG "\n\t" + "lea (" OUTPUT_REG "," TMP_REG ",4), " OUTPUT_END_REG "\n\t" + + "movaps " __ASM_NAME("one") ADDR_SUFFIX ", " ONE_REG "\n\t" + + ".p2align 4,,10\n\t" + ".p2align 3\n\t" + "1:\n\t" + /* Calculate idx. */ + "mov " IPOS_NUM_REG ", " TMP_L_REG "\n\t" + "shr $(32 - " EXPAND_STR(FIR_STEP_SHIFT) "), " TMP_REG "\n\t" + "shl $(" EXPAND_STR(FIR_WIDTH_SHIFT) " + 2), " TMP_REG "\n\t" + /* Calculate the FIR address base. */ + "lea " __ASM_NAME("fir") ADDR_SUFFIX ", " FIR_REG "\n\t" + "add " TMP_REG ", " FIR_REG "\n\t" + + /* Calculate rem. */ + "movups " ONE_REG ", " REM_REG "\n\t" + "subps " REM_INV_REG ", " REM_REG "\n\t" + + /* Initialize j. */ + "xor " TMP_REG ", " TMP_REG "\n\t" + /* Initialize the sums. */ + "xorps " SUMS_REG ", " SUMS_REG "\n\t" + + ".p2align 4,,10\n\t" + ".p2align 3\n\t" + "2:\n\t" + /* Load the FIR coefficients. */ + "movaps (" FIR_REG "," TMP_REG "), " FTMP0_REG "\n\t" + "movaps " EXPAND_STR(FIR_WIDTH) " * 4(" FIR_REG "," TMP_REG "), " FTMP1_REG "\n\t" + /* Load the input values. */ + "movups (" TMP_REG "," INPUT_REG ",4), " FTMP2_REG "\n\t" + "add $16, " TMP_REG "\n\t" + /* Interpolate the FIR coefficients.
*/ + "mulps " REM_INV_REG ", " FTMP0_REG "\n\t" + "mulps " REM_REG ", " FTMP1_REG "\n\t" + "addps " FTMP0_REG ", " FTMP1_REG "\n\t" + /* Multiply the input values by the interpolated coefficients. */ + "mulps " FTMP2_REG ", " FTMP1_REG "\n\t" + /* Accumulate the results. */ + "addps " FTMP1_REG ", " SUMS_REG "\n\t" + "cmp $(" EXPAND_STR(FIR_WIDTH) " * 4), " TMP_REG "\n\t" + "jl 2b\n\t" + + /* Update rem_inv. */ + "addps " REM_INV_STEP_REG ", " REM_INV_REG "\n\t" + "movups " ONE_REG ", " FTMP0_REG "\n\t" + "cmpleps " REM_INV_REG ", " FTMP0_REG "\n\t" + "andps " ONE_REG ", " FTMP0_REG "\n\t" + "subps " FTMP0_REG ", " REM_INV_REG "\n\t" + + /* Update ipos_num. Use subtraction for the lower half as it is stored + * inverted. */ + "sub " IPOS_NUM_STEP_REG ", " IPOS_NUM_REG "\n\t" + "adc $0, " INPUT_REG "\n\t" + + /* Add the even-numbered sums to the odd-numbered ones. */ + "movups " SUMS_REG ", " FTMP0_REG "\n\t" + "shufps $0x31, " FTMP0_REG ", " FTMP0_REG "\n\t" + "addps " FTMP0_REG ", " SUMS_REG "\n\t" + /* Calculate the final sum and store it to the output array. */ + "movhlps " SUMS_REG ", " FTMP0_REG "\n\t" + "addss " FTMP0_REG ", " SUMS_REG "\n\t" + "movss " SUMS_REG ", (" OUTPUT_REG ")\n\t" + + /* Advance the output pointer. 
*/ + "add $4, " OUTPUT_REG "\n\t" + "cmp " OUTPUT_END_REG ", " OUTPUT_REG "\n\t" + "jb 1b\n\t" /* Unsigned branch: both operands are addresses. */ + + CLEANUP + "ret") + +#undef IPOS_NUM_ARG +#undef IPOS_NUM_STEP_ARG +#undef REM_INV_ARG +#undef REM_INV_STEP_ARG +#undef COUNT_ARG +#undef INPUT_ARG +#undef OUTPUT_ARG +#undef IPOS_NUM_REG +#undef IPOS_NUM_STEP_REG +#undef INPUT_REG +#undef OUTPUT_REG +#undef OUTPUT_END_REG +#undef FIR_REG +#undef TMP_L_REG +#undef TMP_REG +#undef REM_INV_REG +#undef REM_INV_STEP_REG +#undef ONE_REG +#undef REM_REG +#undef SUMS_REG +#undef FTMP0_REG +#undef FTMP1_REG +#undef FTMP2_REG + +#endif + static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, float *input, float *output) { @@ -366,8 +628,19 @@ static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, fl float rem_inv = FIXED_0_32_TO_FLOAT((DWORD)ipos_num << FIR_STEP_SHIFT); float rem_inv_step = FIXED_0_32_TO_FLOAT(ipos_num_step << FIR_STEP_SHIFT); + +#if defined(__x86_64__) && !defined(__arm64ec__) + upsample_sse((DWORD)ipos_num, ipos_num_step, rem_inv, rem_inv_step, count, input, output); +#else UINT i; +#ifdef __i386__ + if (sse_supported) { + upsample_sse((DWORD)ipos_num, ipos_num_step, rem_inv, rem_inv_step, count, input, output); + return; + } +#endif + for(i = 0; i < count; ++i) { UINT ipos = ipos_num >> FREQ_ADJUST_SHIFT; UINT idx = ~(DWORD)ipos_num >> (FREQ_ADJUST_SHIFT - FIR_STEP_SHIFT) << FIR_WIDTH_SHIFT; @@ -386,8 +659,13 @@ static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, fl ipos_num += ipos_num_step; } +#endif } +#undef INIT +#undef CLEANUP +#undef ADDR_SUFFIX + /** * Note that this function will overwrite up to FIR_WIDTH - 1 frames before and * after output[]. -- GitLab https://gitlab.winehq.org/wine/wine/-/merge_requests/10716