From: Anton Baskanov <baskanov@gmail.com> --- dlls/dsound/mixer.c | 196 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 196 insertions(+) diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c index 78a1ef2bb33..702f84fc08f 100644 --- a/dlls/dsound/mixer.c +++ b/dlls/dsound/mixer.c @@ -369,6 +369,188 @@ static inline float get_current_sample(const IDirectSoundBufferImpl *dsb, #endif +#if defined(__i386__) || (defined(__x86_64__) && !defined(__arm64ec__)) + +#ifdef __i386__ + +#define OPOS_NUM_ARG "0x14(%esp)" +#define OPOS_NUM_STEP_ARG "0x18(%esp)" +#define REM_ARG "0x1c(%esp)" +#define REM_STEP_ARG "0x20(%esp)" +#define FIRGAIN_ARG "0x24(%esp)" +#define REQUIRED_INPUT_ARG "0x28(%esp)" +#define INPUT_ARG "0x2c(%esp)" +#define OUTPUT_ARG "0x30(%esp)" + +#define OPOS_NUM_REG "%ecx" +#define OPOS_NUM_STEP_REG "%edx" +#define INPUT_REG "%esi" +#define OUTPUT_REG "%edi" +#define INPUT_END_REG "%ebp" +#define FIR_REG "%ebx" +#define TMP_L_REG "%eax" +#define TMP_REG "%eax" + +#else + +#define OPOS_NUM_ARG "%ecx" +#define OPOS_NUM_STEP_ARG "%edx" +#define REM_ARG "%xmm2" +#define REM_STEP_ARG "%xmm3" +#define FIRGAIN_ARG "0x70(%rsp)" +#define REQUIRED_INPUT_ARG "0x78(%rsp)" +#define INPUT_ARG "0x80(%rsp)" +#define OUTPUT_ARG "0x88(%rsp)" + +#define OPOS_NUM_REG "%ecx" +#define OPOS_NUM_STEP_REG "%edx" +#define INPUT_REG "%rsi" +#define OUTPUT_REG "%rdi" +#define INPUT_END_REG "%rbp" +#define FIR_REG "%rbx" +#define TMP_L_REG "%eax" +#define TMP_REG "%rax" + +#endif + +#define REM_REG "%xmm2" +#define REM_STEP_REG "%xmm3" +#define FIRGAIN_REG "%xmm0" +#define ONE_REG "%xmm1" +#define INPUT_VALUE0_REG "%xmm4" +#define INPUT_VALUE1_REG "%xmm5" +#define FTMP0_REG "%xmm6" +#define FTMP1_REG "%xmm7" + +/** + * Note that this function will overwrite up to FIR_WIDTH - 1 frames before and + * after output[]. + */ +void downsample_sse(DWORD opos_num, DWORD opos_num_step, float rem, float rem_step, float firgain, + UINT required_input, float *input, float *output); +__ASM_GLOBAL_FUNC( downsample_sse, + INIT + + "mov " OPOS_NUM_ARG ", " OPOS_NUM_REG "\n\t" + /* Store the lower half of opos_num inverted so that we don't have to + * invert it on every iteration of the outer loop. */ + "not " OPOS_NUM_REG "\n\t" + "mov " OPOS_NUM_STEP_ARG ", " OPOS_NUM_STEP_REG "\n\t" + + "movss " REM_ARG ", " REM_REG "\n\t" + "shufps $0, " REM_REG ", " REM_REG "\n\t" + "movss " REM_STEP_ARG ", " REM_STEP_REG "\n\t" + "shufps $0, " REM_STEP_REG ", " REM_STEP_REG "\n\t" + + "movss " FIRGAIN_ARG ", " FIRGAIN_REG "\n\t" + + "mov " INPUT_ARG ", " INPUT_REG "\n\t" + + /* Combine the upper half of opos_num and the output pointer into a + * single value. */ + "mov " OUTPUT_ARG ", " OUTPUT_REG "\n\t" + /* Divide the output pointer by 4 to match the scale. We can do this + * because the pointer is at least 4-byte aligned. It will be scaled + * back during the access in the inner loop. */ + "shr $2, " OUTPUT_REG "\n\t" + /* Subtract FIR_WIDTH so that we don't have to do this on every + * iteration of the outer loop. */ + "sub $" EXPAND_STR(FIR_WIDTH) ", " OUTPUT_REG "\n\t" + + "mov " REQUIRED_INPUT_ARG ", " TMP_L_REG "\n\t" + "lea (" INPUT_REG "," TMP_REG ",4), " INPUT_END_REG "\n\t" + + "movaps " __ASM_NAME("one") ADDR_SUFFIX ", " ONE_REG "\n\t" + + ".p2align 4,,10\n\t" + ".p2align 3\n\t" + "1:\n\t" + /* Calculate idx. */ + "mov " OPOS_NUM_REG ", " TMP_L_REG "\n\t" + "shr $(32 - " EXPAND_STR(FIR_STEP_SHIFT) "), " TMP_REG "\n\t" + "shl $(" EXPAND_STR(FIR_WIDTH_SHIFT) " + 2), " TMP_REG "\n\t" + /* Calculate the FIR address base. */ + "lea " __ASM_NAME("fir") ADDR_SUFFIX ", " FIR_REG "\n\t" + "add " TMP_REG ", " FIR_REG "\n\t" + + /* Calculate input_value. */ + "movss (" INPUT_REG "), " INPUT_VALUE1_REG "\n\t" + "mulss " FIRGAIN_REG ", " INPUT_VALUE1_REG "\n\t" + "shufps $0, " INPUT_VALUE1_REG ", " INPUT_VALUE1_REG "\n\t" + "movups " INPUT_VALUE1_REG ", " INPUT_VALUE0_REG "\n\t" + /* Calculate input_value1. */ + "mulps " REM_REG ", " INPUT_VALUE1_REG "\n\t" + /* Calculate input_value0. */ + "subps " INPUT_VALUE1_REG ", " INPUT_VALUE0_REG "\n\t" + + /* Initialize i. */ + "xor " TMP_REG ", " TMP_REG "\n\t" + + ".p2align 4,,10\n\t" + ".p2align 3\n\t" + "2:\n\t" + /* Load the FIR coefficients. */ + "movaps (" FIR_REG "," TMP_REG "), " FTMP0_REG "\n\t" + "movaps " EXPAND_STR(FIR_WIDTH) " * 4(" FIR_REG "," TMP_REG "), " FTMP1_REG "\n\t" + /* Calculate the weighted sums. */ + "mulps " INPUT_VALUE0_REG ", " FTMP0_REG "\n\t" + "mulps " INPUT_VALUE1_REG ", " FTMP1_REG "\n\t" + "addps " FTMP0_REG ", " FTMP1_REG "\n\t" + /* Add the sums to the output. */ + "movups (" TMP_REG "," OUTPUT_REG ",4), " FTMP0_REG "\n\t" + "addps " FTMP1_REG ", " FTMP0_REG "\n\t" + "movups " FTMP0_REG ", (" TMP_REG "," OUTPUT_REG ",4)\n\t" + "add $16, " TMP_REG "\n\t" + "cmp $(" EXPAND_STR(FIR_WIDTH) " * 4), " TMP_REG "\n\t" + "jl 2b\n\t" + + /* Update rem. */ + "addps " REM_STEP_REG ", " REM_REG "\n\t" + "movups " ONE_REG ", " FTMP0_REG "\n\t" + "cmpleps " REM_REG ", " FTMP0_REG "\n\t" + "andps " ONE_REG ", " FTMP0_REG "\n\t" + "subps " FTMP0_REG ", " REM_REG "\n\t" + + /* Update opos_num. Use subtraction for the lower half as it is stored + * inverted. */ + "sub " OPOS_NUM_STEP_REG ", " OPOS_NUM_REG "\n\t" + "adc $0, " OUTPUT_REG "\n\t" + + /* Advance the input pointer. */ + "add $4, " INPUT_REG "\n\t" + "cmp " INPUT_END_REG ", " INPUT_REG "\n\t" + "jl 1b\n\t" + + CLEANUP + "ret" ) + +#undef OPOS_NUM_ARG +#undef OPOS_NUM_STEP_ARG +#undef REM_ARG +#undef REM_STEP_ARG +#undef FIRGAIN_ARG +#undef REQUIRED_INPUT_ARG +#undef INPUT_ARG +#undef OUTPUT_ARG +#undef OPOS_NUM_REG +#undef OPOS_NUM_STEP_REG +#undef INPUT_REG +#undef OUTPUT_REG +#undef INPUT_END_REG +#undef FIR_REG +#undef TMP_L_REG +#undef TMP_REG +#undef REM_REG +#undef REM_STEP_REG +#undef FIRGAIN_REG +#undef ONE_REG +#undef INPUT_VALUE0_REG +#undef INPUT_VALUE1_REG +#undef FTMP0_REG +#undef FTMP1_REG + +#endif + /** * Note that this function will overwrite up to FIR_WIDTH - 1 frames before and * after output[]. @@ -402,8 +584,21 @@ static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgai * remain cleared. */ float rem = FIXED_0_32_TO_FLOAT(((DWORD)opos_num ^ (DWORD)opos_num_mask) << FIR_STEP_SHIFT); float rem_step = FIXED_0_32_TO_FLOAT(-opos_num_step << FIR_STEP_SHIFT); + +#if defined(__x86_64__) && !defined(__arm64ec__) + downsample_sse((DWORD)opos_num, opos_num_step, rem, rem_step, firgain, required_input, input, + output + (opos_num >> FREQ_ADJUST_SHIFT)); +#else int j; +#ifdef __i386__ + if (sse_supported) { + downsample_sse((DWORD)opos_num, opos_num_step, rem, rem_step, firgain, required_input, + input, output + (opos_num >> FREQ_ADJUST_SHIFT)); + return; + } +#endif + for (j = 0; j < required_input; ++j) { /* opos is in the range [-(fir_width - 1), count) */ int opos = (int)(opos_num >> FREQ_ADJUST_SHIFT) - FIR_WIDTH; @@ -422,6 +617,7 @@ static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgai opos_num += opos_num_step; } +#endif } #if defined(__i386__) || (defined(__x86_64__) && !defined(__arm64ec__)) -- GitLab https://gitlab.winehq.org/wine/wine/-/merge_requests/10716