[PATCH v6 0/5] MR10716: dsound: Speed up resampling, part 6
-- v6: dsound: Add an SSE version of downsample. dsound: Add an SSE version of upsample. https://gitlab.winehq.org/wine/wine/-/merge_requests/10716
From: Anton Baskanov <baskanov@gmail.com> --- dlls/dsound/dsound_private.h | 4 ++-- dlls/dsound/mixer.c | 24 ++++++++++++++---------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/dlls/dsound/dsound_private.h b/dlls/dsound/dsound_private.h index 75279dacf87..0e695698046 100644 --- a/dlls/dsound/dsound_private.h +++ b/dlls/dsound/dsound_private.h @@ -147,8 +147,8 @@ struct IDirectSoundBufferImpl DSBUFFERDESC dsbd; /* used for frequency conversion (PerfectPitch) */ float firgain; - LONG64 freqAdjustNum,freqAdjustDen; - LONG64 freqAccNum; + DWORD freqAdjustNum,freqAdjustDen; + DWORD freqAccNum; /* used for mixing */ DWORD sec_mixpos; /* Holds a copy of the next 'writelead' bytes, to be used for mixing. This makes it diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c index 45e37c5bcb1..26a35722e50 100644 --- a/dlls/dsound/mixer.c +++ b/dlls/dsound/mixer.c @@ -106,7 +106,7 @@ void DSOUND_RecalcFormat(IDirectSoundBufferImpl *dsb) { DWORD ichannels = dsb->pwfx->nChannels; DWORD ochannels = dsb->device->pwfx->nChannels; - LONG64 oldFreqAdjustDen = dsb->freqAdjustDen; + DWORD oldFreqAdjustDen = dsb->freqAdjustDen; WAVEFORMATEXTENSIBLE *pwfxe; BOOL ieee = FALSE; @@ -131,7 +131,8 @@ void DSOUND_RecalcFormat(IDirectSoundBufferImpl *dsb) dsb->maxwritelead = (DSBFREQUENCY_MAX / 100) * dsb->pwfx->nBlockAlign; if (oldFreqAdjustDen) - dsb->freqAccNum = (dsb->freqAccNum * dsb->freqAdjustDen + oldFreqAdjustDen / 2) / oldFreqAdjustDen; + dsb->freqAccNum = (dsb->freqAccNum * (LONG64)dsb->freqAdjustDen + + oldFreqAdjustDen / 2) / oldFreqAdjustDen; dsb->get_aux = ieee ? getbpp[4] : getbpp[dsb->pwfx->wBitsPerSample/8 - 1]; dsb->put_aux = putieee32; @@ -419,17 +420,18 @@ static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, fl * Note that this function will overwrite up to fir_width - 1 frames before and * after output[]. 
*/ -static void resample(LONG64 freq_adjust_num, LONG64 freq_adjust_den, LONG64 freq_acc_start, +static void resample(DWORD freq_adjust_num, DWORD freq_adjust_den, DWORD freq_acc_start, float firgain, UINT required_input, UINT count, float *input, float *output) { if (freq_adjust_num > freq_adjust_den) { /* Take a reciprocal of the resampling ratio and convert it to a 0.32 * fixed point. Round down to prevent output buffer overflow. */ - DWORD freq_adjust_fixed_den = (freq_adjust_den << FREQ_ADJUST_SHIFT) / freq_adjust_num; + DWORD freq_adjust_fixed_den = ((LONG64)freq_adjust_den << FREQ_ADJUST_SHIFT) + / freq_adjust_num; /* Convert the subsample position to a 0.32 fixed point. Round up to * prevent output buffer overflow. */ - DWORD freq_acc_fixed_start = (freq_acc_start * freq_adjust_fixed_den + freq_adjust_den - 1) - / freq_adjust_den; + DWORD freq_acc_fixed_start = ((LONG64)freq_acc_start * freq_adjust_fixed_den + + freq_adjust_den - 1) / freq_adjust_den; memset(output, 0, count * sizeof(float)); downsample(freq_adjust_fixed_den, freq_acc_fixed_start, firgain, required_input, input, @@ -437,16 +439,18 @@ static void resample(LONG64 freq_adjust_num, LONG64 freq_adjust_den, LONG64 freq } else { /* Convert the resampling ratio to a 0.32 fixed point. Round down to * prevent input buffer overflow. */ - DWORD freq_adjust_fixed_num = (freq_adjust_num << FREQ_ADJUST_SHIFT) / freq_adjust_den; + DWORD freq_adjust_fixed_num = ((LONG64)freq_adjust_num << FREQ_ADJUST_SHIFT) + / freq_adjust_den; /* Convert the subsample position to a 0.32 fixed point. Round down to * prevent input buffer overflow. 
*/ - DWORD freq_acc_fixed_start = (freq_acc_start << FREQ_ADJUST_SHIFT) / freq_adjust_den; + DWORD freq_acc_fixed_start = ((LONG64)freq_acc_start << FREQ_ADJUST_SHIFT) + / freq_adjust_den; upsample(freq_adjust_fixed_num, freq_acc_fixed_start, count, input, output); } } -static UINT cp_fields_resample(IDirectSoundBufferImpl *dsb, UINT count, LONG64 *freqAccNum) +static UINT cp_fields_resample(IDirectSoundBufferImpl *dsb, UINT count, DWORD *freqAccNum) { UINT i, channel; UINT istride = dsb->pwfx->nBlockAlign; @@ -517,7 +521,7 @@ static UINT cp_fields_resample(IDirectSoundBufferImpl *dsb, UINT count, LONG64 * return max_ipos; } -static void cp_fields(IDirectSoundBufferImpl *dsb, UINT count, LONG64 *freqAccNum) +static void cp_fields(IDirectSoundBufferImpl *dsb, UINT count, DWORD *freqAccNum) { DWORD ipos, adv; -- GitLab https://gitlab.winehq.org/wine/wine/-/merge_requests/10716
From: Anton Baskanov <baskanov@gmail.com> --- dlls/dsound/mixer.c | 56 ++++++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c index 26a35722e50..d66dec2ed7e 100644 --- a/dlls/dsound/mixer.c +++ b/dlls/dsound/mixer.c @@ -284,34 +284,6 @@ static inline float get_current_sample(const IDirectSoundBufferImpl *dsb, return dsb->get(dsb, buffer + (mixpos % buflen), channel); } -static UINT cp_fields_noresample(IDirectSoundBufferImpl *dsb, UINT count) -{ - UINT istride = dsb->pwfx->nBlockAlign; - UINT ostride = dsb->device->pwfx->nChannels * sizeof(float); - UINT committed_samples = 0; - DWORD channel, i; - - if (!secondarybuffer_is_audible(dsb)) - return count; - - if(dsb->use_committed) { - committed_samples = (dsb->writelead - dsb->committed_mixpos) / istride; - committed_samples = committed_samples <= count ? committed_samples : count; - } - - for (i = 0; i < committed_samples; i++) - for (channel = 0; channel < dsb->mix_channels; channel++) - dsb->put(dsb, i * ostride, channel, get_current_sample(dsb, dsb->committedbuff, - dsb->writelead, dsb->committed_mixpos + i * istride, channel)); - - for (; i < count; i++) - for (channel = 0; channel < dsb->mix_channels; channel++) - dsb->put(dsb, i * ostride, channel, get_current_sample(dsb, dsb->buffer->memory, - dsb->buflen, dsb->sec_mixpos + i * istride, channel)); - - return count; -} - /** * Note that this function will overwrite up to fir_width - 1 frames before and * after output[]. 
@@ -521,6 +493,34 @@ static UINT cp_fields_resample(IDirectSoundBufferImpl *dsb, UINT count, DWORD *f return max_ipos; } +static UINT cp_fields_noresample(IDirectSoundBufferImpl *dsb, UINT count) +{ + UINT istride = dsb->pwfx->nBlockAlign; + UINT ostride = dsb->device->pwfx->nChannels * sizeof(float); + UINT committed_samples = 0; + DWORD channel, i; + + if (!secondarybuffer_is_audible(dsb)) + return count; + + if(dsb->use_committed) { + committed_samples = (dsb->writelead - dsb->committed_mixpos) / istride; + committed_samples = committed_samples <= count ? committed_samples : count; + } + + for (i = 0; i < committed_samples; i++) + for (channel = 0; channel < dsb->mix_channels; channel++) + dsb->put(dsb, i * ostride, channel, get_current_sample(dsb, dsb->committedbuff, + dsb->writelead, dsb->committed_mixpos + i * istride, channel)); + + for (; i < count; i++) + for (channel = 0; channel < dsb->mix_channels; channel++) + dsb->put(dsb, i * ostride, channel, get_current_sample(dsb, dsb->buffer->memory, + dsb->buflen, dsb->sec_mixpos + i * istride, channel)); + + return count; +} + static void cp_fields(IDirectSoundBufferImpl *dsb, UINT count, DWORD *freqAccNum) { DWORD ipos, adv; -- GitLab https://gitlab.winehq.org/wine/wine/-/merge_requests/10716
From: Anton Baskanov <baskanov@gmail.com> --- dlls/dsound/fir.h | 16 ++++++++-------- dlls/dsound/mixer.c | 42 +++++++++++++++++++++--------------------- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/dlls/dsound/fir.h b/dlls/dsound/fir.h index 45ad65d7398..76ac521e0f3 100644 --- a/dlls/dsound/fir.h +++ b/dlls/dsound/fir.h @@ -86,10 +86,10 @@ int main() fprintf(stderr, "q %f\n", (double)output.q); fprintf(stderr, "status %s\n", get_pm_status_str(output.status)); - printf("static const int fir_width_shift = %d;\n", fir_width_shift); - printf("static const int fir_width = %d;\n", fir_width); - printf("static const int fir_step_shift = %d;\n", fir_step_shift); - printf("static const int fir_step = %d;\n", fir_step); + printf("#define FIR_WIDTH_SHIFT %d\n", fir_width_shift); + printf("#define FIR_WIDTH %d\n", fir_width); + printf("#define FIR_STEP_SHIFT %d\n", fir_step_shift); + printf("#define FIR_STEP %d\n", fir_step); printf("static const float fir[] = {"); // Print the FIR array with an additional row at the end. 
This simplifies // calculation of the interpolated value by allowing the index to overflow @@ -114,10 +114,10 @@ int main() printf("};\n"); } */ -static const int fir_width_shift = 6; -static const int fir_width = 64; -static const int fir_step_shift = 7; -static const int fir_step = 128; +#define FIR_WIDTH_SHIFT 6 +#define FIR_WIDTH 64 +#define FIR_STEP_SHIFT 7 +#define FIR_STEP 128 static const float fir[] = { 0.0000000000e+00, -2.4830013102e-06, 1.9318705150e-06, 2.6614854151e-06, -1.5313785194e-05, 4.2076214553e-05, -9.1417167945e-05, 1.7455895136e-04, diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c index d66dec2ed7e..1b4b1c7bd7a 100644 --- a/dlls/dsound/mixer.c +++ b/dlls/dsound/mixer.c @@ -285,7 +285,7 @@ static inline float get_current_sample(const IDirectSoundBufferImpl *dsb, } /** - * Note that this function will overwrite up to fir_width - 1 frames before and + * Note that this function will overwrite up to FIR_WIDTH - 1 frames before and * after output[]. */ static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgain, @@ -309,28 +309,28 @@ static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgai * Clearing the bits is safe as it has the same effect as rounding up the * resampling ratio and the subsample position and doesn't affect the * initial opos value. */ - LONG64 opos_num_mask = ~0ull << (FREQ_ADJUST_SHIFT - 23 - fir_step_shift); + LONG64 opos_num_mask = ~0ull << (FREQ_ADJUST_SHIFT - 23 - FIR_STEP_SHIFT); LONG64 opos_num = (freq_adjust_den - freq_acc_start + (1ll << FREQ_ADJUST_SHIFT) - 1) & opos_num_mask; DWORD opos_num_step = freq_adjust_den & (DWORD)opos_num_mask; /* Use XOR to invert the lower part of opos_num so that the lower bits * remain cleared. 
*/ - float rem = FIXED_0_32_TO_FLOAT(((DWORD)opos_num ^ (DWORD)opos_num_mask) << fir_step_shift); - float rem_step = FIXED_0_32_TO_FLOAT(-opos_num_step << fir_step_shift); + float rem = FIXED_0_32_TO_FLOAT(((DWORD)opos_num ^ (DWORD)opos_num_mask) << FIR_STEP_SHIFT); + float rem_step = FIXED_0_32_TO_FLOAT(-opos_num_step << FIR_STEP_SHIFT); int j; for (j = 0; j < required_input; ++j) { /* opos is in the range [-(fir_width - 1), count) */ - int opos = (int)(opos_num >> FREQ_ADJUST_SHIFT) - fir_width; - UINT idx = ~(DWORD)opos_num >> (FREQ_ADJUST_SHIFT - fir_step_shift) << fir_width_shift; + int opos = (int)(opos_num >> FREQ_ADJUST_SHIFT) - FIR_WIDTH; + UINT idx = ~(DWORD)opos_num >> (FREQ_ADJUST_SHIFT - FIR_STEP_SHIFT) << FIR_WIDTH_SHIFT; float input_value = input[j] * firgain; float input_value0 = (1.0f - rem) * input_value; float input_value1 = rem * input_value; int i; - for (i = 0; i < fir_width; ++i) - output[opos + i] += fir[idx + i] * input_value0 + fir[idx + fir_width + i] * input_value1; + for (i = 0; i < FIR_WIDTH; ++i) + output[opos + i] += fir[idx + i] * input_value0 + fir[idx + FIR_WIDTH + i] * input_value1; rem += rem_step; rem -= rem >= 1.0f ? 1.0f : 0.0f; @@ -360,25 +360,25 @@ static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, fl * * Clearing the bits is safe as it has the same effect as rounding down the * resampling ratio and the subsample position. 
*/ - DWORD ipos_num_mask = ~0u << (FREQ_ADJUST_SHIFT - 23 - fir_step_shift); + DWORD ipos_num_mask = ~0u << (FREQ_ADJUST_SHIFT - 23 - FIR_STEP_SHIFT); LONG64 ipos_num = freq_acc_start & ipos_num_mask; DWORD ipos_num_step = freq_adjust_num & ipos_num_mask; - float rem_inv = FIXED_0_32_TO_FLOAT((DWORD)ipos_num << fir_step_shift); - float rem_inv_step = FIXED_0_32_TO_FLOAT(ipos_num_step << fir_step_shift); + float rem_inv = FIXED_0_32_TO_FLOAT((DWORD)ipos_num << FIR_STEP_SHIFT); + float rem_inv_step = FIXED_0_32_TO_FLOAT(ipos_num_step << FIR_STEP_SHIFT); UINT i; for(i = 0; i < count; ++i) { UINT ipos = ipos_num >> FREQ_ADJUST_SHIFT; - UINT idx = ~(DWORD)ipos_num >> (FREQ_ADJUST_SHIFT - fir_step_shift) << fir_width_shift; + UINT idx = ~(DWORD)ipos_num >> (FREQ_ADJUST_SHIFT - FIR_STEP_SHIFT) << FIR_WIDTH_SHIFT; float rem = 1.0f - rem_inv; int j; float sum = 0.0; float* cache = &input[ipos]; - for (j = 0; j < fir_width; j++) - sum += (fir[idx + j] * rem_inv + fir[idx + j + fir_width] * rem) * cache[j]; + for (j = 0; j < FIR_WIDTH; j++) + sum += (fir[idx + j] * rem_inv + fir[idx + j + FIR_WIDTH] * rem) * cache[j]; output[i] = sum; rem_inv += rem_inv_step; @@ -389,7 +389,7 @@ static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, fl } /** - * Note that this function will overwrite up to fir_width - 1 frames before and + * Note that this function will overwrite up to FIR_WIDTH - 1 frames before and * after output[]. 
*/ static void resample(DWORD freq_adjust_num, DWORD freq_adjust_den, DWORD freq_acc_start, @@ -435,15 +435,15 @@ static UINT cp_fields_resample(IDirectSoundBufferImpl *dsb, UINT count, DWORD *f UINT max_ipos = (freqAcc_start + count * dsb->freqAdjustNum) / dsb->freqAdjustDen; UINT required_input = max( - (freqAcc_start + (count - 1) * dsb->freqAdjustNum) / dsb->freqAdjustDen + fir_width, - (freqAcc_start + (count - 1 + fir_width) * dsb->freqAdjustNum) / dsb->freqAdjustDen); + (freqAcc_start + (count - 1) * dsb->freqAdjustNum) / dsb->freqAdjustDen + FIR_WIDTH, + (freqAcc_start + (count - 1 + FIR_WIDTH) * dsb->freqAdjustNum) / dsb->freqAdjustDen); float *intermediate, *output, *itmp; DWORD len = required_input * channels; /* Allocate an output buffer for each channel with padding on both ends as * required by the resample function. Padding at the end of one channel * buffer is reused as a start padding for the next channel buffer. */ - len += fir_width - 1 + (count + fir_width - 1) * channels; + len += FIR_WIDTH - 1 + (count + FIR_WIDTH - 1) * channels; len *= sizeof(float); *freqAccNum = freqAcc_end % dsb->freqAdjustDen; @@ -460,7 +460,7 @@ static UINT cp_fields_resample(IDirectSoundBufferImpl *dsb, UINT count, DWORD *f } intermediate = dsb->device->cp_buffer; - output = intermediate + required_input * channels + fir_width - 1; + output = intermediate + required_input * channels + FIR_WIDTH - 1; if(dsb->use_committed) { committed_samples = (dsb->writelead - dsb->committed_mixpos) / istride; @@ -484,11 +484,11 @@ static UINT cp_fields_resample(IDirectSoundBufferImpl *dsb, UINT count, DWORD *f for (channel = 0; channel < channels; channel++) resample(dsb->freqAdjustNum, dsb->freqAdjustDen, freqAcc_start, dsb->firgain, required_input, count, intermediate + channel * required_input, - output + channel * (fir_width - 1 + count)); + output + channel * (FIR_WIDTH - 1 + count)); for(i = 0; i < count; ++i) for (channel = 0; channel < channels; channel++) - dsb->put(dsb, i * 
ostride, channel, output[channel * (fir_width - 1 + count) + i]); + dsb->put(dsb, i * ostride, channel, output[channel * (FIR_WIDTH - 1 + count) + i]); return max_ipos; } -- GitLab https://gitlab.winehq.org/wine/wine/-/merge_requests/10716
From: Anton Baskanov <baskanov@gmail.com> --- dlls/dsound/fir.h | 4 ++-- dlls/dsound/mixer.c | 53 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 54 insertions(+), 3 deletions(-) diff --git a/dlls/dsound/fir.h b/dlls/dsound/fir.h index 76ac521e0f3..39a32af1412 100644 --- a/dlls/dsound/fir.h +++ b/dlls/dsound/fir.h @@ -90,7 +90,7 @@ int main() printf("#define FIR_WIDTH %d\n", fir_width); printf("#define FIR_STEP_SHIFT %d\n", fir_step_shift); printf("#define FIR_STEP %d\n", fir_step); - printf("static const float fir[] = {"); + printf("static const float DECLSPEC_ALIGN(16) fir[] = {"); // Print the FIR array with an additional row at the end. This simplifies // calculation of the interpolated value by allowing the index to overflow // into the extra row. It just repeats the first row, starting from its @@ -118,7 +118,7 @@ int main() #define FIR_WIDTH 64 #define FIR_STEP_SHIFT 7 #define FIR_STEP 128 -static const float fir[] = { +static const float DECLSPEC_ALIGN(16) fir[] = { 0.0000000000e+00, -2.4830013102e-06, 1.9318705150e-06, 2.6614854151e-06, -1.5313785194e-05, 4.2076214553e-05, -9.1417167945e-05, 1.7455895136e-04, -3.0567859821e-04, 5.0191365396e-04, -7.8311909082e-04, 1.1713337628e-03, diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c index 1b4b1c7bd7a..b8a7208ae32 100644 --- a/dlls/dsound/mixer.c +++ b/dlls/dsound/mixer.c @@ -25,6 +25,9 @@ #include <assert.h> #include <stdarg.h> #include <math.h> /* Insomnia - pow() function */ +#ifdef __SSE__ +#include <xmmintrin.h> +#endif #define COBJMACROS @@ -339,6 +342,50 @@ static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgai } } +#ifdef __SSE__ + +void upsample_sse(LONG64 ipos_num, DWORD ipos_num_step, float rem_inv_float, + float rem_inv_step_float, UINT count, float *input, float *output) +{ + __m128 rem_inv = _mm_set1_ps(rem_inv_float); + __m128 rem_inv_step = _mm_set1_ps(rem_inv_step_float); + __m128 one = _mm_set1_ps(1.0f); + + UINT i; + + for(i = 0; i < count; ++i) 
{ + UINT ipos = ipos_num >> FREQ_ADJUST_SHIFT; + UINT idx = ~(DWORD)ipos_num >> (FREQ_ADJUST_SHIFT - FIR_STEP_SHIFT) << FIR_WIDTH_SHIFT; + __m128 rem = _mm_sub_ps(one, rem_inv); + + int j; + __m128 sum = _mm_set1_ps(0.0f); + float* cache = &input[ipos]; + + C_ASSERT(!(FIR_WIDTH % 4)); + for (j = 0; j < FIR_WIDTH; j += 4) { + __m128 fir_value0 = _mm_mul_ps(_mm_load_ps(&fir[idx + j]), rem_inv); + __m128 fir_value1 = _mm_mul_ps(_mm_load_ps(&fir[idx + j + FIR_WIDTH]), rem); + __m128 fir_value = _mm_add_ps(fir_value0, fir_value1); + __m128 input_value = _mm_loadu_ps(&cache[j]); + sum = _mm_add_ps(sum, _mm_mul_ps(fir_value, input_value)); + } + + /* Add the even-numbered sums to the odd-numbered ones. */ + sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 3, 0, 1))); + /* Calculate the final sum and store it to the output array. */ + sum = _mm_add_ss(sum, _mm_movehl_ps(sum, sum)); + _mm_store_ss(&output[i], sum); + + rem_inv = _mm_add_ps(rem_inv, rem_inv_step); + rem_inv = _mm_sub_ps(rem_inv, _mm_and_ps(one, _mm_cmple_ps(one, rem_inv))); + + ipos_num += ipos_num_step; + } +} + +#endif + static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, float *input, float *output) { @@ -366,8 +413,11 @@ static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, fl float rem_inv = FIXED_0_32_TO_FLOAT((DWORD)ipos_num << FIR_STEP_SHIFT); float rem_inv_step = FIXED_0_32_TO_FLOAT(ipos_num_step << FIR_STEP_SHIFT); - UINT i; +#ifdef __SSE__ + upsample_sse(ipos_num, ipos_num_step, rem_inv, rem_inv_step, count, input, output); +#else + UINT i; for(i = 0; i < count; ++i) { UINT ipos = ipos_num >> FREQ_ADJUST_SHIFT; UINT idx = ~(DWORD)ipos_num >> (FREQ_ADJUST_SHIFT - FIR_STEP_SHIFT) << FIR_WIDTH_SHIFT; @@ -386,6 +436,7 @@ static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, fl ipos_num += ipos_num_step; } +#endif } /** -- GitLab https://gitlab.winehq.org/wine/wine/-/merge_requests/10716
From: Anton Baskanov <baskanov@gmail.com> --- dlls/dsound/mixer.c | 50 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c index b8a7208ae32..519de19ff04 100644 --- a/dlls/dsound/mixer.c +++ b/dlls/dsound/mixer.c @@ -287,6 +287,50 @@ static inline float get_current_sample(const IDirectSoundBufferImpl *dsb, return dsb->get(dsb, buffer + (mixpos % buflen), channel); } +#ifdef __SSE__ + +/** + * Note that this function will overwrite up to FIR_WIDTH - 1 frames before and + * after output[]. + */ +void downsample_sse(LONG64 opos_num, DWORD opos_num_step, float rem_float, float rem_step_float, + float firgain_float, UINT required_input, float *input, float *output) +{ + __m128 rem = _mm_set1_ps(rem_float); + __m128 rem_step = _mm_set1_ps(rem_step_float); + __m128 firgain = _mm_set_ss(firgain_float); + __m128 one = _mm_set1_ps(1.0f); + int j; + + for (j = 0; j < required_input; ++j) { + /* opos is in the range [-(fir_width - 1), count) */ + int opos = (int)(opos_num >> FREQ_ADJUST_SHIFT) - FIR_WIDTH; + UINT idx = ~(DWORD)opos_num >> (FREQ_ADJUST_SHIFT - FIR_STEP_SHIFT) << FIR_WIDTH_SHIFT; + __m128 rem_inv = _mm_sub_ps(one, rem); + + __m128 input_value_ss = _mm_mul_ss(_mm_load_ss(&input[j]), firgain); + __m128 input_value = _mm_shuffle_ps(input_value_ss, input_value_ss, 0); + __m128 input_value0 = _mm_mul_ps(rem_inv, input_value); + __m128 input_value1 = _mm_mul_ps(rem, input_value); + + int i; + C_ASSERT(!(FIR_WIDTH % 4)); + for (i = 0; i < FIR_WIDTH; i += 4) { + __m128 value0 = _mm_mul_ps(_mm_load_ps(&fir[idx + i]), input_value0); + __m128 value1 = _mm_mul_ps(_mm_load_ps(&fir[idx + FIR_WIDTH + i]), input_value1); + __m128 value = _mm_add_ps(value0, value1); + _mm_storeu_ps(&output[opos + i], _mm_add_ps(_mm_loadu_ps(&output[opos + i]), value)); + } + + rem = _mm_add_ps(rem, rem_step); + rem = _mm_sub_ps(rem, _mm_and_ps(one, _mm_cmple_ps(one, rem))); + + opos_num += 
opos_num_step; + } +} + +#endif + /** * Note that this function will overwrite up to FIR_WIDTH - 1 frames before and * after output[]. @@ -320,8 +364,11 @@ static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgai * remain cleared. */ float rem = FIXED_0_32_TO_FLOAT(((DWORD)opos_num ^ (DWORD)opos_num_mask) << FIR_STEP_SHIFT); float rem_step = FIXED_0_32_TO_FLOAT(-opos_num_step << FIR_STEP_SHIFT); - int j; +#ifdef __SSE__ + downsample_sse(opos_num, opos_num_step, rem, rem_step, firgain, required_input, input, output); +#else + int j; for (j = 0; j < required_input; ++j) { /* opos is in the range [-(fir_width - 1), count) */ int opos = (int)(opos_num >> FREQ_ADJUST_SHIFT) - FIR_WIDTH; @@ -340,6 +387,7 @@ static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgai opos_num += opos_num_step; } +#endif } #ifdef __SSE__ -- GitLab https://gitlab.winehq.org/wine/wine/-/merge_requests/10716
On Wed May 20 08:53:41 2026 +0000, Matteo Bruni wrote:
> That's also an option, certainly. Not sure it's preferable to not building the SSE+ version when the CFLAGS don't allow it.

Removed the `makedep` changes and changed the code to use `#ifdef __SSE__` instead.
-- https://gitlab.winehq.org/wine/wine/-/merge_requests/10716#note_141116
v6: - Check for SSE support at compile time. -- https://gitlab.winehq.org/wine/wine/-/merge_requests/10716#note_141117
FWIW, my approval still stands. I don't think this has to wait for a resolution to !10953, since the SSE version of both `upsample()` and `downsample()` is already going to be used on x86-64 by default. -- https://gitlab.winehq.org/wine/wine/-/merge_requests/10716#note_141655
This merge request was approved by Huw Davies. -- https://gitlab.winehq.org/wine/wine/-/merge_requests/10716
participants (4)
- Anton Baskanov
- Anton Baskanov (@baskanov)
- Huw Davies (@huw)
- Matteo Bruni (@Mystral)