From: Anton Baskanov <baskanov@gmail.com>
---
 dlls/dsound/mixer.c | 230 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 230 insertions(+)

diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c
index de51060db6f..c3ee4aaeef0 100644
--- a/dlls/dsound/mixer.c
+++ b/dlls/dsound/mixer.c
@@ -294,6 +294,227 @@ static inline float get_current_sample(const IDirectSoundBufferImpl *dsb,
     return dsb->get(dsb, buffer + (mixpos % buflen), channel);
 }
 
+#if defined(__i386__) || (defined(__x86_64__) && !defined(__arm64ec__))
+
+/* On x86_64 rem and rem_step are passed in %xmm2 and %xmm3 so just leave them
+ * there. On i386 INIT loads them from the stack into the same registers. */
+#define REM_REG "%xmm2"
+#define REM_STEP_REG "%xmm3"
+#define FIRGAIN_REG "%xmm0"
+#define ONE_REG "%xmm1"
+#define INPUT_VALUE0_REG "%xmm4"
+#define INPUT_VALUE1_REG "%xmm5"
+#define FTMP0_REG "%xmm6"
+#define FTMP1_REG "%xmm7"
+
+#ifdef __i386__
+
+#define OPOS_NUM_INV_REG "%ecx"
+#define OPOS_NUM_STEP_REG "%edx"
+#define INPUT_REG "%esi"
+#define INPUT_END_REG "%ebp"
+#define OUTPUT_REG "%edi"
+#define FIR_REG "%ebx"
+#define TMP_L_REG "%eax"
+#define TMP_REG "%eax"
+
+#define INIT \
+    "push %ebx\n\t" \
+    __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \
+    __ASM_CFI(".cfi_offset %ebx,-8\n\t") \
+    "push %ebp\n\t" \
+    __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \
+    __ASM_CFI(".cfi_offset %ebp,-12\n\t") \
+    "push %esi\n\t" \
+    __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \
+    __ASM_CFI(".cfi_offset %esi,-16\n\t") \
+    "push %edi\n\t" \
+    __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \
+    __ASM_CFI(".cfi_offset %edi,-20\n\t") \
+    "mov 0x14(%esp), " OPOS_NUM_INV_REG "\n\t" \
+    "mov 0x18(%esp), " OPOS_NUM_STEP_REG "\n\t" \
+    "movss 0x1c(%esp), " REM_REG "\n\t" \
+    "movss 0x20(%esp), " REM_STEP_REG "\n\t" \
+    "movss 0x24(%esp), " FIRGAIN_REG "\n\t" \
+    "mov 0x28(%esp), " INPUT_REG "\n\t" \
+    "mov 0x2c(%esp), " INPUT_END_REG "\n\t" \
+    "mov 0x30(%esp), " OUTPUT_REG "\n\t"
+
+#define CLEANUP \
+    "pop %edi\n\t" \
+    __ASM_CFI(".cfi_adjust_cfa_offset -4\n\t") \
+    "pop %esi\n\t" \
+    __ASM_CFI(".cfi_adjust_cfa_offset -4\n\t") \
+    "pop %ebp\n\t" \
+    __ASM_CFI(".cfi_adjust_cfa_offset -4\n\t") \
+    "pop %ebx\n\t" \
+    __ASM_CFI(".cfi_adjust_cfa_offset -4\n\t")
+
+#define ADDR_SUFFIX
+
+#else
+
+/* On x86_64 opos_num_inv and opos_num_step are passed in registers so just
+ * leave them there. INIT loads firgain, input, input_end and output from the stack. */
+#define OPOS_NUM_INV_REG "%ecx"
+#define OPOS_NUM_STEP_REG "%edx"
+#define INPUT_REG "%rsi"
+#define INPUT_END_REG "%rbp"
+#define OUTPUT_REG "%rdi"
+#define FIR_REG "%rbx"
+#define TMP_L_REG "%eax"
+#define TMP_REG "%rax"
+
+#define INIT \
+    "push %rbx\n\t" \
+    __ASM_SEH(".seh_pushreg %rbx\n\t") \
+    __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \
+    __ASM_CFI(".cfi_offset %rbx,-16\n\t") \
+    "push %rbp\n\t" \
+    __ASM_SEH(".seh_pushreg %rbp\n\t") \
+    __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \
+    __ASM_CFI(".cfi_offset %rbp,-24\n\t") \
+    "push %rsi\n\t" \
+    __ASM_SEH(".seh_pushreg %rsi\n\t") \
+    __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \
+    __ASM_CFI(".cfi_offset %rsi,-32\n\t") \
+    "push %rdi\n\t" \
+    __ASM_SEH(".seh_pushreg %rdi\n\t") \
+    __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \
+    __ASM_CFI(".cfi_offset %rdi,-40\n\t") \
+    "sub $40, %rsp\n\t" \
+    __ASM_SEH(".seh_stackalloc 40\n\t") \
+    __ASM_CFI(".cfi_adjust_cfa_offset 40\n\t") \
+    "movaps %xmm6, 16(%rsp)\n\t" \
+    __ASM_SEH(".seh_savexmm %xmm6, 16\n\t") \
+    __ASM_CFI(".cfi_offset %xmm6, -64\n\t") \
+    "movaps %xmm7, (%rsp)\n\t" \
+    __ASM_SEH(".seh_savexmm %xmm7, 0\n\t") \
+    __ASM_SEH(".seh_endprologue\n\t") \
+    __ASM_CFI(".cfi_offset %xmm7, -80\n\t") \
+    "movss 0x70(%rsp), " FIRGAIN_REG "\n\t" \
+    "mov 0x78(%rsp), " INPUT_REG "\n\t" \
+    "mov 0x80(%rsp), " INPUT_END_REG "\n\t" \
+    "mov 0x88(%rsp), " OUTPUT_REG "\n\t"
+
+#define CLEANUP \
+    "movaps (%rsp), %xmm7\n\t" \
+    "movaps 16(%rsp), %xmm6\n\t" \
+    "add $40, %rsp\n\t" \
+    __ASM_CFI(".cfi_adjust_cfa_offset -40\n\t") \
+    "pop %rdi\n\t" \
+    __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") \
+    "pop %rsi\n\t" \
+    __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") \
+    "pop %rbp\n\t" \
+    __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") \
+    "pop %rbx\n\t" \
+    __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t")
+
+#define ADDR_SUFFIX "(%rip)"
+
+#endif
+
+/* SSE version of the downsample() loop: every sample in [input, input_end) is scaled by firgain
+ * and the FIR coefficients and added to the output. The loop body runs at least once.
+ *
+ * opos_num_inv is the inverted lower part of opos_num. We store it inverted so that we don't
+ * have to invert it on every iteration of the outer loop. output is the output pointer divided
+ * by sizeof(float) and combined with the upper part of opos_num to save a register. */
+void downsample_sse(DWORD opos_num_inv, DWORD opos_num_step, float rem, float rem_step,
+        float firgain, float *input, float *input_end, DWORD_PTR output);
+__ASM_GLOBAL_FUNC( downsample_sse,
+    INIT
+
+    "shufps $0, " REM_REG ", " REM_REG "\n\t"
+    "shufps $0, " REM_STEP_REG ", " REM_STEP_REG "\n\t"
+
+    "movaps " __ASM_NAME("one") ADDR_SUFFIX ", " ONE_REG "\n\t"
+
+    ".p2align 4,,10\n\t"
+    ".p2align 3\n\t"
+    "1:\n\t"
+    /* Calculate idx, scaled to a byte offset into fir. */
+    "mov " OPOS_NUM_INV_REG ", " TMP_L_REG "\n\t"
+    "shr $(" EXPAND_STR(FREQ_ADJUST_SHIFT) " - " EXPAND_STR(FIR_STEP_SHIFT) "), " TMP_REG "\n\t"
+    "shl $(" EXPAND_STR(FIR_WIDTH_SHIFT) " + " EXPAND_STR(SIZEOF_FLOAT_SHIFT) "), " TMP_REG "\n\t"
+    /* Calculate the FIR address base. */
+    "lea " __ASM_NAME("fir") ADDR_SUFFIX ", " FIR_REG "\n\t"
+    "add " TMP_REG ", " FIR_REG "\n\t"
+
+    /* Calculate input_value (the input sample times firgain, in all four lanes). */
+    "movss (" INPUT_REG "), " INPUT_VALUE1_REG "\n\t"
+    "mulss " FIRGAIN_REG ", " INPUT_VALUE1_REG "\n\t"
+    "shufps $0, " INPUT_VALUE1_REG ", " INPUT_VALUE1_REG "\n\t"
+    "movups " INPUT_VALUE1_REG ", " INPUT_VALUE0_REG "\n\t"
+    /* Calculate input_value1 = input_value * rem. */
+    "mulps " REM_REG ", " INPUT_VALUE1_REG "\n\t"
+    /* Calculate input_value0 = input_value - input_value1. */
+    "subps " INPUT_VALUE1_REG ", " INPUT_VALUE0_REG "\n\t"
+
+    /* Initialize i (a byte offset that advances by 16 per inner iteration). */
+    "xor " TMP_REG ", " TMP_REG "\n\t"
+
+    ".p2align 4,,10\n\t"
+    ".p2align 3\n\t"
+    "2:\n\t"
+    /* Load the FIR coefficients for idx and for the row FIR_WIDTH floats further. */
+    "movaps (" FIR_REG "," TMP_REG "), " FTMP0_REG "\n\t"
+    "movaps " EXPAND_STR(FIR_WIDTH) " * 4(" FIR_REG "," TMP_REG "), " FTMP1_REG "\n\t"
+    /* Calculate the weighted sums. */
+    "mulps " INPUT_VALUE0_REG ", " FTMP0_REG "\n\t"
+    "mulps " INPUT_VALUE1_REG ", " FTMP1_REG "\n\t"
+    "addps " FTMP0_REG ", " FTMP1_REG "\n\t"
+    /* Add the sums to the output (output is scaled by 4 to get the byte address). */
+    "movups (" TMP_REG "," OUTPUT_REG ",4), " FTMP0_REG "\n\t"
+    "addps " FTMP1_REG ", " FTMP0_REG "\n\t"
+    "movups " FTMP0_REG ", (" TMP_REG "," OUTPUT_REG ",4)\n\t"
+    "add $16, " TMP_REG "\n\t"
+    "cmp $(" EXPAND_STR(FIR_WIDTH) " * 4), " TMP_REG "\n\t"
+    "jl 2b\n\t"
+
+    /* Update rem: add rem_step, then subtract "one" from the lanes where one <= rem. */
+    "addps " REM_STEP_REG ", " REM_REG "\n\t"
+    "movups " ONE_REG ", " FTMP0_REG "\n\t"
+    "cmpleps " REM_REG ", " FTMP0_REG "\n\t"
+    "andps " ONE_REG ", " FTMP0_REG "\n\t"
+    "subps " FTMP0_REG ", " REM_REG "\n\t"
+
+    /* Update opos_num. Use subtraction for the lower half as it is stored
+     * inverted; the resulting borrow is carried into the upper part held in output. */
+    "sub " OPOS_NUM_STEP_REG ", " OPOS_NUM_INV_REG "\n\t"
+    "adc $0, " OUTPUT_REG "\n\t"
+
+    /* Advance the input pointer. Addresses are unsigned, so compare with jb, not jl. */
+    "add $4, " INPUT_REG "\n\t"
+    "cmp " INPUT_END_REG ", " INPUT_REG "\n\t"
+    "jb 1b\n\t"
+
+    CLEANUP
+    "ret" )
+
+#undef REM_REG
+#undef REM_STEP_REG
+#undef FIRGAIN_REG
+#undef ONE_REG
+#undef INPUT_VALUE0_REG
+#undef INPUT_VALUE1_REG
+#undef FTMP0_REG
+#undef FTMP1_REG
+#undef OPOS_NUM_INV_REG
+#undef OPOS_NUM_STEP_REG
+#undef INPUT_REG
+#undef INPUT_END_REG
+#undef OUTPUT_REG
+#undef FIR_REG
+#undef TMP_L_REG
+#undef TMP_REG
+#undef INIT
+#undef CLEANUP
+#undef ADDR_SUFFIX
+
+#endif
+
 /**
  * Note that this function will overwrite up to FIR_WIDTH - 1 frames before and
  * after output[].
@@ -329,6 +550,15 @@ static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgai float rem_step = FIXED_0_32_TO_FLOAT(-opos_num_step << FIR_STEP_SHIFT); int j; +#if defined(__i386__) || (defined(__x86_64__) && !defined(__arm64ec__)) + if (sse_supported) { + downsample_sse(~(DWORD)opos_num, opos_num_step, rem, rem_step, firgain, input, + input + required_input, + (DWORD_PTR)output / sizeof(float) + (opos_num >> FREQ_ADJUST_SHIFT) - FIR_WIDTH); + return; + } +#endif + for (j = 0; j < required_input; ++j) { /* opos is in the range [-(fir_width - 1), count) */ int opos = (int)(opos_num >> FREQ_ADJUST_SHIFT) - FIR_WIDTH; -- GitLab https://gitlab.winehq.org/wine/wine/-/merge_requests/10716