From: Anton Baskanov <baskanov@gmail.com>

---
 dlls/dsound/dsound_private.h | 2 ++
 dlls/dsound/mixer.c | 8 ++++++++
 dlls/dsound/mixer_sse.c | 39 ++++++++++++++++++++++++++++++++++++
 3 files changed, 49 insertions(+)

diff --git a/dlls/dsound/dsound_private.h b/dlls/dsound/dsound_private.h
index 0ded79055e4..408e0104fb7 100644
--- a/dlls/dsound/dsound_private.h
+++ b/dlls/dsound/dsound_private.h
@@ -270,6 +270,8 @@ HRESULT enumerate_mmdevices(EDataFlow flow, GUID *guids,
 
 /* mixer_sse.c */
 #if defined(__i386__) || (defined(__x86_64__) && !defined(__arm64ec__))
+void downsample_sse(LONG64 opos_num, DWORD opos_num_step, float rem_float, float rem_step_float,
+        float firgain_float, UINT required_input, float *input, float *output);
 void upsample_sse(LONG64 ipos_num, DWORD ipos_num_step, float rem_inv_float, float rem_inv_step_float,
         UINT count, float *input, float *output);
 #endif
diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c
index 7a1eddaf057..a1809993d4e 100644
--- a/dlls/dsound/mixer.c
+++ b/dlls/dsound/mixer.c
@@ -320,6 +320,14 @@ static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgai
     float rem_step = FIXED_0_32_TO_FLOAT(-opos_num_step << FIR_STEP_SHIFT);
     int j;
 
+#if defined(__i386__) || (defined(__x86_64__) && !defined(__arm64ec__))
+    if (sse_supported) {
+        downsample_sse(opos_num, opos_num_step, rem, rem_step, firgain, required_input, input,
+                output);
+        return;
+    }
+#endif
+
     for (j = 0; j < required_input; ++j) {
         /* opos is in the range [-(fir_width - 1), count) */
         int opos = (int)(opos_num >> FREQ_ADJUST_SHIFT) - FIR_WIDTH;
diff --git a/dlls/dsound/mixer_sse.c b/dlls/dsound/mixer_sse.c
index 62957233556..0885051e57e 100644
--- a/dlls/dsound/mixer_sse.c
+++ b/dlls/dsound/mixer_sse.c
@@ -25,6 +25,45 @@
 #include "dsound_private.h"
 #include "fir.h"
 
+/**
+ * Adds input[j] * firgain_float, weighted by two blended fir[] rows, into output[opos + i].
+ * Note that this function will overwrite up to FIR_WIDTH - 1 frames before and after output[].
+ */
+void downsample_sse(LONG64 opos_num, DWORD opos_num_step, float rem_float, float rem_step_float,
+        float firgain_float, UINT required_input, float *input, float *output)
+{
+    __m128 rem = _mm_set1_ps(rem_float);
+    __m128 rem_step = _mm_set1_ps(rem_step_float);
+    __m128 firgain = _mm_set_ss(firgain_float); /* low lane only; used with _mm_mul_ss below */
+    __m128 one = _mm_set1_ps(1.0f);
+    int j;
+
+    for (j = 0; j < required_input; ++j) {
+        /* opos is in the range [-(fir_width - 1), count) */
+        int opos = (int)(opos_num >> FREQ_ADJUST_SHIFT) - FIR_WIDTH;
+        UINT idx = ~(DWORD)opos_num >> (FREQ_ADJUST_SHIFT - FIR_STEP_SHIFT) << FIR_WIDTH_SHIFT;
+        __m128 rem_inv = _mm_sub_ps(one, rem); /* weight of the fir[] row starting at idx */
+
+        __m128 input_value_ss = _mm_mul_ss(_mm_load_ss(&input[j]), firgain);
+        __m128 input_value = _mm_shuffle_ps(input_value_ss, input_value_ss, 0); /* broadcast */
+        __m128 input_value0 = _mm_mul_ps(rem_inv, input_value);
+        __m128 input_value1 = _mm_mul_ps(rem, input_value);
+
+        int i;
+        for (i = 0; i < FIR_WIDTH; i += 4) { /* four taps per iteration */
+            __m128 value0 = _mm_mul_ps(_mm_load_ps(&fir[idx + i]), input_value0);
+            __m128 value1 = _mm_mul_ps(_mm_load_ps(&fir[idx + FIR_WIDTH + i]), input_value1);
+            __m128 value = _mm_add_ps(value0, value1);
+            _mm_storeu_ps(&output[opos + i], _mm_add_ps(_mm_loadu_ps(&output[opos + i]), value));
+        }
+
+        rem = _mm_add_ps(rem, rem_step);
+        rem = _mm_sub_ps(rem, _mm_and_ps(one, _mm_cmple_ps(one, rem))); /* subtract 1 if rem >= 1 */
+
+        opos_num += opos_num_step;
+    }
+}
+
 void upsample_sse(LONG64 ipos_num, DWORD ipos_num_step, float rem_inv_float, float rem_inv_step_float,
         UINT count, float *input, float *output)
 {
-- 
GitLab

https://gitlab.winehq.org/wine/wine/-/merge_requests/10716