From: Anton Baskanov <baskanov@gmail.com> --- dlls/dsound/dsound_main.c | 12 ++ dlls/dsound/dsound_private.h | 4 + dlls/dsound/fir.h | 4 +- dlls/dsound/mixer.c | 244 +++++++++++++++++++++++++++++++++++ 4 files changed, 262 insertions(+), 2 deletions(-) diff --git a/dlls/dsound/dsound_main.c b/dlls/dsound/dsound_main.c index 8936b437ba2..784aac62209 100644 --- a/dlls/dsound/dsound_main.c +++ b/dlls/dsound/dsound_main.c @@ -63,6 +63,10 @@ WINE_DEFAULT_DEBUG_CHANNEL(dsound); +#if defined(__i386__) || (defined(__x86_64__) && !defined(__arm64ec__)) +BOOL sse_supported; +#endif + struct list DSOUND_renderers = LIST_INIT(DSOUND_renderers); CRITICAL_SECTION DSOUND_renderers_lock; static CRITICAL_SECTION_DEBUG DSOUND_renderers_lock_debug = @@ -82,6 +86,13 @@ GUID *DSOUND_capture_guids; /* All default settings, you most likely don't want to touch these, see wiki on UsefulRegistryKeys */ int ds_hel_buflen = 32768 * 2; +static void init_cpu_features(void) +{ +#if defined(__i386__) || (defined(__x86_64__) && !defined(__arm64ec__)) + sse_supported = IsProcessorFeaturePresent(PF_XMMI_INSTRUCTIONS_AVAILABLE); +#endif +} + /* * Get a config key from either the app-specific or the default config */ @@ -787,6 +798,7 @@ BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpvReserved) DisableThreadLibraryCalls(hInstDLL); /* Increase refcount on dsound by 1 */ GetModuleHandleExW(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS, (LPCWSTR)hInstDLL, &hInstDLL); + init_cpu_features(); break; case DLL_PROCESS_DETACH: if (lpvReserved) break; diff --git a/dlls/dsound/dsound_private.h b/dlls/dsound/dsound_private.h index 0e695698046..f7380f5e426 100644 --- a/dlls/dsound/dsound_private.h +++ b/dlls/dsound/dsound_private.h @@ -251,6 +251,10 @@ HRESULT IDirectSoundCaptureImpl_Create(IUnknown *outer_unk, REFIID riid, void ** #define STATE_CAPTURING 2 #define STATE_STOPPING 3 +#if defined(__i386__) || (defined(__x86_64__) && !defined(__arm64ec__)) +extern BOOL sse_supported; +#endif + extern 
CRITICAL_SECTION DSOUND_renderers_lock; extern struct list DSOUND_renderers; diff --git a/dlls/dsound/fir.h b/dlls/dsound/fir.h index 76ac521e0f3..39a32af1412 100644 --- a/dlls/dsound/fir.h +++ b/dlls/dsound/fir.h @@ -90,7 +90,7 @@ int main() printf("#define FIR_WIDTH %d\n", fir_width); printf("#define FIR_STEP_SHIFT %d\n", fir_step_shift); printf("#define FIR_STEP %d\n", fir_step); - printf("static const float fir[] = {"); + printf("static const float DECLSPEC_ALIGN(16) fir[] = {"); // Print the FIR array with an additional row at the end. This simplifies // calculation of the interpolated value by allowing the index to overflow // into the extra row. It just repeats the first row, starting from its @@ -118,7 +118,7 @@ int main() #define FIR_WIDTH 64 #define FIR_STEP_SHIFT 7 #define FIR_STEP 128 -static const float fir[] = { +static const float DECLSPEC_ALIGN(16) fir[] = { 0.0000000000e+00, -2.4830013102e-06, 1.9318705150e-06, 2.6614854151e-06, -1.5313785194e-05, 4.2076214553e-05, -9.1417167945e-05, 1.7455895136e-04, -3.0567859821e-04, 5.0191365396e-04, -7.8311909082e-04, 1.1713337628e-03, diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c index 1b4b1c7bd7a..de51060db6f 100644 --- a/dlls/dsound/mixer.c +++ b/dlls/dsound/mixer.c @@ -34,6 +34,7 @@ #include "wingdi.h" #include "mmreg.h" #include "wine/debug.h" +#include "wine/asm.h" #include "dsound.h" #include "ks.h" #include "ksmedia.h" @@ -43,8 +44,17 @@ WINE_DEFAULT_DEBUG_CHANNEL(dsound); #define FREQ_ADJUST_SHIFT 32 +#define SIZEOF_FLOAT_SHIFT 2 #define FIXED_0_32_TO_FLOAT(x) ((int)((x) >> 1) * (1.0f / (1ll << 31))) +#define STR(a) #a +#define EXPAND_STR(a) STR(a) + +static const float __attribute__((used, aligned(16))) one[] = +{ + 1.0f, 1.0f, 1.0f, 1.0f, +}; + void DSOUND_RecalcVolPan(PDSVOLUMEPAN volpan) { double temp; @@ -339,6 +349,232 @@ static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgai } } +#if defined(__i386__) || (defined(__x86_64__) && !defined(__arm64ec__)) + +/* On 
x86_64 rem_inv and rem_inv_step are passed in registers so just leave them + * there. */ +#define REM_INV_REG "%xmm2" +#define REM_INV_STEP_REG "%xmm3" +#define ONE_REG "%xmm1" +#define REM_REG "%xmm0" +#define SUM_REG "%xmm4" +#define FTMP0_REG "%xmm5" +#define FTMP1_REG "%xmm6" +#define FTMP2_REG "%xmm7" + +#ifdef __i386__ + +#define IPOS_NUM_INV_REG "%ecx" +#define IPOS_NUM_STEP_REG "%edx" +#define INPUT_REG "%esi" +#define OUTPUT_REG "%edi" +#define OUTPUT_END_REG "%ebp" +#define FIR_REG "%ebx" +#define TMP_L_REG "%eax" +#define TMP_REG "%eax" + +#define INIT \ + "push %ebx\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \ + __ASM_CFI(".cfi_offset %ebx,-8\n\t") \ + "push %ebp\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \ + __ASM_CFI(".cfi_offset %ebp,-12\n\t") \ + "push %esi\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \ + __ASM_CFI(".cfi_offset %esi,-16\n\t") \ + "push %edi\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \ + __ASM_CFI(".cfi_offset %edi,-20\n\t") \ + "mov 0x14(%esp), " IPOS_NUM_INV_REG "\n\t" \ + "mov 0x18(%esp), " IPOS_NUM_STEP_REG "\n\t" \ + "movss 0x1c(%esp), " REM_INV_REG "\n\t" \ + "movss 0x20(%esp), " REM_INV_STEP_REG "\n\t" \ + "mov 0x24(%esp), " INPUT_REG "\n\t" \ + "mov 0x28(%esp), " OUTPUT_REG "\n\t" \ + "mov 0x2c(%esp), " OUTPUT_END_REG "\n\t" + +#define CLEANUP \ + "pop %edi\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -4\n\t") \ + "pop %esi\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -4\n\t") \ + "pop %ebp\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -4\n\t") \ + "pop %ebx\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -4\n\t") + +#define ADDR_SUFFIX + +#else + +/* On x86_64 ipos_num_inv and ipos_num_step are passed in registers so just + * leave them there. 
*/ +#define IPOS_NUM_INV_REG "%ecx" +#define IPOS_NUM_STEP_REG "%edx" +#define INPUT_REG "%rsi" +#define OUTPUT_REG "%rdi" +#define OUTPUT_END_REG "%rbp" +#define FIR_REG "%rbx" +#define TMP_L_REG "%eax" +#define TMP_REG "%rax" + +#define INIT \ + "push %rbx\n\t" \ + __ASM_SEH(".seh_pushreg %rbx\n\t") \ + __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \ + __ASM_CFI(".cfi_offset %rbx,-16\n\t") \ + "push %rbp\n\t" \ + __ASM_SEH(".seh_pushreg %rbp\n\t") \ + __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \ + __ASM_CFI(".cfi_offset %rbp,-24\n\t") \ + "push %rsi\n\t" \ + __ASM_SEH(".seh_pushreg %rsi\n\t") \ + __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \ + __ASM_CFI(".cfi_offset %rsi,-32\n\t") \ + "push %rdi\n\t" \ + __ASM_SEH(".seh_pushreg %rdi\n\t") \ + __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \ + __ASM_CFI(".cfi_offset %rdi,-40\n\t") \ + "sub $40, %rsp\n\t" \ + __ASM_SEH(".seh_stackalloc 40\n\t") \ + __ASM_CFI(".cfi_adjust_cfa_offset 40\n\t") \ + "movaps %xmm6, 16(%rsp)\n\t" \ + __ASM_SEH(".seh_savexmm %xmm6, 16\n\t") \ + __ASM_CFI(".cfi_offset %xmm6, -64\n\t") \ + "movaps %xmm7, (%rsp)\n\t" \ + __ASM_SEH(".seh_savexmm %xmm7, 0\n\t") \ + __ASM_SEH(".seh_endprologue\n\t") \ + __ASM_CFI(".cfi_offset %xmm7, -80\n\t") \ + "mov 0x70(%rsp), " INPUT_REG "\n\t" \ + "mov 0x78(%rsp), " OUTPUT_REG "\n\t" \ + "mov 0x80(%rsp), " OUTPUT_END_REG "\n\t" + +#define CLEANUP \ + "movaps (%rsp), %xmm7\n\t" \ + "movaps 16(%rsp), %xmm6\n\t" \ + "add $40, %rsp\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -40\n\t") \ + "pop %rdi\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") \ + "pop %rsi\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") \ + "pop %rbp\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") \ + "pop %rbx\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") + +#define ADDR_SUFFIX "(%rip)" + +#endif + +/* ipos_num_inv is the inverted lower part of ipos_num. We store it inverted so + * that we don't have to invert it on every iteration of the outer loop. 
+ *
+ * input is the input pointer divided by sizeof(float) to match the scale of
+ * ipos_num, so the upper part of ipos_num and the input pointer share a single
+ * register. One sample is always written, so output_end must be above output. */
+void upsample_sse(DWORD ipos_num_inv, DWORD ipos_num_step, float rem_inv, float rem_inv_step,
+        DWORD_PTR input, float *output, float *output_end);
+__ASM_GLOBAL_FUNC(upsample_sse,
+    INIT
+
+    "shufps $0, " REM_INV_REG ", " REM_INV_REG "\n\t"
+    "shufps $0, " REM_INV_STEP_REG ", " REM_INV_STEP_REG "\n\t"
+
+    "movaps " __ASM_NAME("one") ADDR_SUFFIX ", " ONE_REG "\n\t"
+
+    ".p2align 4,,10\n\t"
+    ".p2align 3\n\t"
+    "1:\n\t"
+    /* Calculate idx as a byte offset: the top FIR_STEP_SHIFT bits of ipos_num_inv select the FIR row. */
+    "mov " IPOS_NUM_INV_REG ", " TMP_L_REG "\n\t"
+    "shr $(" EXPAND_STR(FREQ_ADJUST_SHIFT) " - " EXPAND_STR(FIR_STEP_SHIFT) "), " TMP_REG "\n\t"
+    "shl $(" EXPAND_STR(FIR_WIDTH_SHIFT) " + " EXPAND_STR(SIZEOF_FLOAT_SHIFT) "), " TMP_REG "\n\t"
+    /* Calculate the FIR address base. */
+    "lea " __ASM_NAME("fir") ADDR_SUFFIX ", " FIR_REG "\n\t"
+    "add " TMP_REG ", " FIR_REG "\n\t"
+
+    /* Calculate rem = 1 - rem_inv. */
+    "movups " ONE_REG ", " REM_REG "\n\t"
+    "subps " REM_INV_REG ", " REM_REG "\n\t"
+
+    /* Initialize j (kept as a byte offset, advanced by 16 per group of four taps). */
+    "xor " TMP_REG ", " TMP_REG "\n\t"
+    /* Initialize the sums. */
+    "xorps " SUM_REG ", " SUM_REG "\n\t"
+
+    ".p2align 4,,10\n\t"
+    ".p2align 3\n\t"
+    "2:\n\t"
+    /* Load four FIR coefficients from the selected row and from the row FIR_WIDTH floats after it. */
+    "movaps (" FIR_REG "," TMP_REG "), " FTMP0_REG "\n\t"
+    "movaps " EXPAND_STR(FIR_WIDTH) " * 4(" FIR_REG "," TMP_REG "), " FTMP1_REG "\n\t"
+    /* Load four input values; INPUT_REG counts floats, hence the scale factor of 4. */
+    "movups (" TMP_REG "," INPUT_REG ",4), " FTMP2_REG "\n\t"
+    "add $16, " TMP_REG "\n\t"
+    /* Interpolate the FIR coefficients: rem_inv * first row + rem * second row. */
+    "mulps " REM_INV_REG ", " FTMP0_REG "\n\t"
+    "mulps " REM_REG ", " FTMP1_REG "\n\t"
+    "addps " FTMP0_REG ", " FTMP1_REG "\n\t"
+    /* Multiply the input values by the interpolated coefficients. */
+    "mulps " FTMP2_REG ", " FTMP1_REG "\n\t"
+    /* Accumulate the results. */
+    "addps " FTMP1_REG ", " SUM_REG "\n\t"
+    "cmp $(" EXPAND_STR(FIR_WIDTH) " * 4), " TMP_REG "\n\t"
+    "jl 2b\n\t"
+
+    /* Update rem_inv, subtracting one whenever it reaches one. */
+    "addps " REM_INV_STEP_REG ", " REM_INV_REG "\n\t"
+    "movups " ONE_REG ", " FTMP0_REG "\n\t"
+    "cmpleps " REM_INV_REG ", " FTMP0_REG "\n\t"
+    "andps " ONE_REG ", " FTMP0_REG "\n\t"
+    "subps " FTMP0_REG ", " REM_INV_REG "\n\t"
+
+    /* Update ipos_num. Use subtraction for the lower half as it is stored
+     * inverted; the borrow is carried into the combined input position. */
+    "sub " IPOS_NUM_STEP_REG ", " IPOS_NUM_INV_REG "\n\t"
+    "adc $0, " INPUT_REG "\n\t"
+
+    /* Add the even-numbered sums to the odd-numbered ones. */
+    "movups " SUM_REG ", " FTMP0_REG "\n\t"
+    "shufps $0x31, " FTMP0_REG ", " FTMP0_REG "\n\t"
+    "addps " FTMP0_REG ", " SUM_REG "\n\t"
+    /* Calculate the final sum and store it to the output array. */
+    "movhlps " SUM_REG ", " FTMP0_REG "\n\t"
+    "addss " FTMP0_REG ", " SUM_REG "\n\t"
+    "movss " SUM_REG ", (" OUTPUT_REG ")\n\t"
+
+    /* Advance the output pointer. These are addresses, so the comparison must be unsigned. */
+    "add $4, " OUTPUT_REG "\n\t"
+    "cmp " OUTPUT_END_REG ", " OUTPUT_REG "\n\t"
+    "jb 1b\n\t"
+
+    CLEANUP
+    "ret")
+
+#undef REM_INV_REG
+#undef REM_INV_STEP_REG
+#undef ONE_REG
+#undef REM_REG
+#undef SUM_REG
+#undef FTMP0_REG
+#undef FTMP1_REG
+#undef FTMP2_REG
+#undef IPOS_NUM_INV_REG
+#undef IPOS_NUM_STEP_REG
+#undef INPUT_REG
+#undef OUTPUT_REG
+#undef OUTPUT_END_REG
+#undef FIR_REG
+#undef TMP_L_REG
+#undef TMP_REG
+#undef INIT
+#undef CLEANUP
+#undef ADDR_SUFFIX
+
+#endif
+
 static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, float *input,
         float *output)
 {
@@ -368,6 +604,14 @@ static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, fl
     float rem_inv_step = FIXED_0_32_TO_FLOAT(ipos_num_step << FIR_STEP_SHIFT);
     UINT i;
 
+#if defined(__i386__) || (defined(__x86_64__) && !defined(__arm64ec__))
+    if (sse_supported && count) { /* upsample_sse always writes at least one sample */
+        upsample_sse(~(DWORD)ipos_num, ipos_num_step, rem_inv, rem_inv_step,
+                (DWORD_PTR)input / sizeof(float), output, output + count);
+        return;
+    }
+#endif + for(i = 0; i < count; ++i) { UINT ipos = ipos_num >> FREQ_ADJUST_SHIFT; UINT idx = ~(DWORD)ipos_num >> (FREQ_ADJUST_SHIFT - FIR_STEP_SHIFT) << FIR_WIDTH_SHIFT; -- GitLab https://gitlab.winehq.org/wine/wine/-/merge_requests/10716