[PATCH 0/5] MR10716: dsound: Speed up resampling, part 6

newer
[PATCH v2 0/1] MR10575:...

older
[PATCH 0/2] MR10606: mf/session:...

Anton Baskanov (＠baskanov)

April 22, 2026

5:49 a.m.

-- https://gitlab.winehq.org/wine/wine/-/merge_requests/10716

Show replies by date

Anton Baskanov

April 2026

5:49 a.m.

New subject: [PATCH 1/5] dsound: Use DWORD to store freqAdjustNum, freqAdjustDen and freqAccNum.

From: Anton Baskanov <baskanov@gmail.com> --- dlls/dsound/dsound_private.h | 4 ++-- dlls/dsound/mixer.c | 24 ++++++++++++++---------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/dlls/dsound/dsound_private.h b/dlls/dsound/dsound_private.h index 75279dacf87..0e695698046 100644 --- a/dlls/dsound/dsound_private.h +++ b/dlls/dsound/dsound_private.h @@ -147,8 +147,8 @@ struct IDirectSoundBufferImpl DSBUFFERDESC dsbd; /* used for frequency conversion (PerfectPitch) */ float firgain; - LONG64 freqAdjustNum,freqAdjustDen; - LONG64 freqAccNum; + DWORD freqAdjustNum,freqAdjustDen; + DWORD freqAccNum; /* used for mixing */ DWORD sec_mixpos; /* Holds a copy of the next 'writelead' bytes, to be used for mixing. This makes it diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c index 45e37c5bcb1..26a35722e50 100644 --- a/dlls/dsound/mixer.c +++ b/dlls/dsound/mixer.c @@ -106,7 +106,7 @@ void DSOUND_RecalcFormat(IDirectSoundBufferImpl *dsb) { DWORD ichannels = dsb->pwfx->nChannels; DWORD ochannels = dsb->device->pwfx->nChannels; - LONG64 oldFreqAdjustDen = dsb->freqAdjustDen; + DWORD oldFreqAdjustDen = dsb->freqAdjustDen; WAVEFORMATEXTENSIBLE *pwfxe; BOOL ieee = FALSE; @@ -131,7 +131,8 @@ void DSOUND_RecalcFormat(IDirectSoundBufferImpl *dsb) dsb->maxwritelead = (DSBFREQUENCY_MAX / 100) * dsb->pwfx->nBlockAlign; if (oldFreqAdjustDen) - dsb->freqAccNum = (dsb->freqAccNum * dsb->freqAdjustDen + oldFreqAdjustDen / 2) / oldFreqAdjustDen; + dsb->freqAccNum = (dsb->freqAccNum * (LONG64)dsb->freqAdjustDen + + oldFreqAdjustDen / 2) / oldFreqAdjustDen; dsb->get_aux = ieee ? getbpp[4] : getbpp[dsb->pwfx->wBitsPerSample/8 - 1]; dsb->put_aux = putieee32; @@ -419,17 +420,18 @@ static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, fl * Note that this function will overwrite up to fir_width - 1 frames before and * after output[]. 
*/ -static void resample(LONG64 freq_adjust_num, LONG64 freq_adjust_den, LONG64 freq_acc_start, +static void resample(DWORD freq_adjust_num, DWORD freq_adjust_den, DWORD freq_acc_start, float firgain, UINT required_input, UINT count, float *input, float *output) { if (freq_adjust_num > freq_adjust_den) { /* Take a reciprocal of the resampling ratio and convert it to a 0.32 * fixed point. Round down to prevent output buffer overflow. */ - DWORD freq_adjust_fixed_den = (freq_adjust_den << FREQ_ADJUST_SHIFT) / freq_adjust_num; + DWORD freq_adjust_fixed_den = ((LONG64)freq_adjust_den << FREQ_ADJUST_SHIFT) + / freq_adjust_num; /* Convert the subsample position to a 0.32 fixed point. Round up to * prevent output buffer overflow. */ - DWORD freq_acc_fixed_start = (freq_acc_start * freq_adjust_fixed_den + freq_adjust_den - 1) - / freq_adjust_den; + DWORD freq_acc_fixed_start = ((LONG64)freq_acc_start * freq_adjust_fixed_den + + freq_adjust_den - 1) / freq_adjust_den; memset(output, 0, count * sizeof(float)); downsample(freq_adjust_fixed_den, freq_acc_fixed_start, firgain, required_input, input, @@ -437,16 +439,18 @@ static void resample(LONG64 freq_adjust_num, LONG64 freq_adjust_den, LONG64 freq } else { /* Convert the resampling ratio to a 0.32 fixed point. Round down to * prevent input buffer overflow. */ - DWORD freq_adjust_fixed_num = (freq_adjust_num << FREQ_ADJUST_SHIFT) / freq_adjust_den; + DWORD freq_adjust_fixed_num = ((LONG64)freq_adjust_num << FREQ_ADJUST_SHIFT) + / freq_adjust_den; /* Convert the subsample position to a 0.32 fixed point. Round down to * prevent input buffer overflow. 
*/ - DWORD freq_acc_fixed_start = (freq_acc_start << FREQ_ADJUST_SHIFT) / freq_adjust_den; + DWORD freq_acc_fixed_start = ((LONG64)freq_acc_start << FREQ_ADJUST_SHIFT) + / freq_adjust_den; upsample(freq_adjust_fixed_num, freq_acc_fixed_start, count, input, output); } } -static UINT cp_fields_resample(IDirectSoundBufferImpl *dsb, UINT count, LONG64 *freqAccNum) +static UINT cp_fields_resample(IDirectSoundBufferImpl *dsb, UINT count, DWORD *freqAccNum) { UINT i, channel; UINT istride = dsb->pwfx->nBlockAlign; @@ -517,7 +521,7 @@ static UINT cp_fields_resample(IDirectSoundBufferImpl *dsb, UINT count, LONG64 * return max_ipos; } -static void cp_fields(IDirectSoundBufferImpl *dsb, UINT count, LONG64 *freqAccNum) +static void cp_fields(IDirectSoundBufferImpl *dsb, UINT count, DWORD *freqAccNum) { DWORD ipos, adv; -- GitLab https://gitlab.winehq.org/wine/wine/-/merge_requests/10716

Anton Baskanov

5:49 a.m.

New subject: [PATCH 2/5] dsound: Move cp_fields_noresample after cp_fields_resample.

From: Anton Baskanov <baskanov@gmail.com> --- dlls/dsound/mixer.c | 56 ++++++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c index 26a35722e50..d66dec2ed7e 100644 --- a/dlls/dsound/mixer.c +++ b/dlls/dsound/mixer.c @@ -284,34 +284,6 @@ static inline float get_current_sample(const IDirectSoundBufferImpl *dsb, return dsb->get(dsb, buffer + (mixpos % buflen), channel); } -static UINT cp_fields_noresample(IDirectSoundBufferImpl *dsb, UINT count) -{ - UINT istride = dsb->pwfx->nBlockAlign; - UINT ostride = dsb->device->pwfx->nChannels * sizeof(float); - UINT committed_samples = 0; - DWORD channel, i; - - if (!secondarybuffer_is_audible(dsb)) - return count; - - if(dsb->use_committed) { - committed_samples = (dsb->writelead - dsb->committed_mixpos) / istride; - committed_samples = committed_samples <= count ? committed_samples : count; - } - - for (i = 0; i < committed_samples; i++) - for (channel = 0; channel < dsb->mix_channels; channel++) - dsb->put(dsb, i * ostride, channel, get_current_sample(dsb, dsb->committedbuff, - dsb->writelead, dsb->committed_mixpos + i * istride, channel)); - - for (; i < count; i++) - for (channel = 0; channel < dsb->mix_channels; channel++) - dsb->put(dsb, i * ostride, channel, get_current_sample(dsb, dsb->buffer->memory, - dsb->buflen, dsb->sec_mixpos + i * istride, channel)); - - return count; -} - /** * Note that this function will overwrite up to fir_width - 1 frames before and * after output[]. 
@@ -521,6 +493,34 @@ static UINT cp_fields_resample(IDirectSoundBufferImpl *dsb, UINT count, DWORD *f return max_ipos; } +static UINT cp_fields_noresample(IDirectSoundBufferImpl *dsb, UINT count) +{ + UINT istride = dsb->pwfx->nBlockAlign; + UINT ostride = dsb->device->pwfx->nChannels * sizeof(float); + UINT committed_samples = 0; + DWORD channel, i; + + if (!secondarybuffer_is_audible(dsb)) + return count; + + if(dsb->use_committed) { + committed_samples = (dsb->writelead - dsb->committed_mixpos) / istride; + committed_samples = committed_samples <= count ? committed_samples : count; + } + + for (i = 0; i < committed_samples; i++) + for (channel = 0; channel < dsb->mix_channels; channel++) + dsb->put(dsb, i * ostride, channel, get_current_sample(dsb, dsb->committedbuff, + dsb->writelead, dsb->committed_mixpos + i * istride, channel)); + + for (; i < count; i++) + for (channel = 0; channel < dsb->mix_channels; channel++) + dsb->put(dsb, i * ostride, channel, get_current_sample(dsb, dsb->buffer->memory, + dsb->buflen, dsb->sec_mixpos + i * istride, channel)); + + return count; +} + static void cp_fields(IDirectSoundBufferImpl *dsb, UINT count, DWORD *freqAccNum) { DWORD ipos, adv; -- GitLab https://gitlab.winehq.org/wine/wine/-/merge_requests/10716

Anton Baskanov

5:49 a.m.

New subject: [PATCH 3/5] dsound: Use #define for fir.h constants.

From: Anton Baskanov <baskanov@gmail.com> --- dlls/dsound/fir.h | 16 ++++++++-------- dlls/dsound/mixer.c | 42 +++++++++++++++++++++--------------------- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/dlls/dsound/fir.h b/dlls/dsound/fir.h index 45ad65d7398..76ac521e0f3 100644 --- a/dlls/dsound/fir.h +++ b/dlls/dsound/fir.h @@ -86,10 +86,10 @@ int main() fprintf(stderr, "q %f\n", (double)output.q); fprintf(stderr, "status %s\n", get_pm_status_str(output.status)); - printf("static const int fir_width_shift = %d;\n", fir_width_shift); - printf("static const int fir_width = %d;\n", fir_width); - printf("static const int fir_step_shift = %d;\n", fir_step_shift); - printf("static const int fir_step = %d;\n", fir_step); + printf("#define FIR_WIDTH_SHIFT %d\n", fir_width_shift); + printf("#define FIR_WIDTH %d\n", fir_width); + printf("#define FIR_STEP_SHIFT %d\n", fir_step_shift); + printf("#define FIR_STEP %d\n", fir_step); printf("static const float fir[] = {"); // Print the FIR array with an additional row at the end. 
This simplifies // calculation of the interpolated value by allowing the index to overflow @@ -114,10 +114,10 @@ int main() printf("};\n"); } */ -static const int fir_width_shift = 6; -static const int fir_width = 64; -static const int fir_step_shift = 7; -static const int fir_step = 128; +#define FIR_WIDTH_SHIFT 6 +#define FIR_WIDTH 64 +#define FIR_STEP_SHIFT 7 +#define FIR_STEP 128 static const float fir[] = { 0.0000000000e+00, -2.4830013102e-06, 1.9318705150e-06, 2.6614854151e-06, -1.5313785194e-05, 4.2076214553e-05, -9.1417167945e-05, 1.7455895136e-04, diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c index d66dec2ed7e..1b4b1c7bd7a 100644 --- a/dlls/dsound/mixer.c +++ b/dlls/dsound/mixer.c @@ -285,7 +285,7 @@ static inline float get_current_sample(const IDirectSoundBufferImpl *dsb, } /** - * Note that this function will overwrite up to fir_width - 1 frames before and + * Note that this function will overwrite up to FIR_WIDTH - 1 frames before and * after output[]. */ static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgain, @@ -309,28 +309,28 @@ static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgai * Clearing the bits is safe as it has the same effect as rounding up the * resampling ratio and the subsample position and doesn't affect the * initial opos value. */ - LONG64 opos_num_mask = ~0ull << (FREQ_ADJUST_SHIFT - 23 - fir_step_shift); + LONG64 opos_num_mask = ~0ull << (FREQ_ADJUST_SHIFT - 23 - FIR_STEP_SHIFT); LONG64 opos_num = (freq_adjust_den - freq_acc_start + (1ll << FREQ_ADJUST_SHIFT) - 1) & opos_num_mask; DWORD opos_num_step = freq_adjust_den & (DWORD)opos_num_mask; /* Use XOR to invert the lower part of opos_num so that the lower bits * remain cleared. 
*/ - float rem = FIXED_0_32_TO_FLOAT(((DWORD)opos_num ^ (DWORD)opos_num_mask) << fir_step_shift); - float rem_step = FIXED_0_32_TO_FLOAT(-opos_num_step << fir_step_shift); + float rem = FIXED_0_32_TO_FLOAT(((DWORD)opos_num ^ (DWORD)opos_num_mask) << FIR_STEP_SHIFT); + float rem_step = FIXED_0_32_TO_FLOAT(-opos_num_step << FIR_STEP_SHIFT); int j; for (j = 0; j < required_input; ++j) { /* opos is in the range [-(fir_width - 1), count) */ - int opos = (int)(opos_num >> FREQ_ADJUST_SHIFT) - fir_width; - UINT idx = ~(DWORD)opos_num >> (FREQ_ADJUST_SHIFT - fir_step_shift) << fir_width_shift; + int opos = (int)(opos_num >> FREQ_ADJUST_SHIFT) - FIR_WIDTH; + UINT idx = ~(DWORD)opos_num >> (FREQ_ADJUST_SHIFT - FIR_STEP_SHIFT) << FIR_WIDTH_SHIFT; float input_value = input[j] * firgain; float input_value0 = (1.0f - rem) * input_value; float input_value1 = rem * input_value; int i; - for (i = 0; i < fir_width; ++i) - output[opos + i] += fir[idx + i] * input_value0 + fir[idx + fir_width + i] * input_value1; + for (i = 0; i < FIR_WIDTH; ++i) + output[opos + i] += fir[idx + i] * input_value0 + fir[idx + FIR_WIDTH + i] * input_value1; rem += rem_step; rem -= rem >= 1.0f ? 1.0f : 0.0f; @@ -360,25 +360,25 @@ static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, fl * * Clearing the bits is safe as it has the same effect as rounding down the * resampling ratio and the subsample position. 
*/ - DWORD ipos_num_mask = ~0u << (FREQ_ADJUST_SHIFT - 23 - fir_step_shift); + DWORD ipos_num_mask = ~0u << (FREQ_ADJUST_SHIFT - 23 - FIR_STEP_SHIFT); LONG64 ipos_num = freq_acc_start & ipos_num_mask; DWORD ipos_num_step = freq_adjust_num & ipos_num_mask; - float rem_inv = FIXED_0_32_TO_FLOAT((DWORD)ipos_num << fir_step_shift); - float rem_inv_step = FIXED_0_32_TO_FLOAT(ipos_num_step << fir_step_shift); + float rem_inv = FIXED_0_32_TO_FLOAT((DWORD)ipos_num << FIR_STEP_SHIFT); + float rem_inv_step = FIXED_0_32_TO_FLOAT(ipos_num_step << FIR_STEP_SHIFT); UINT i; for(i = 0; i < count; ++i) { UINT ipos = ipos_num >> FREQ_ADJUST_SHIFT; - UINT idx = ~(DWORD)ipos_num >> (FREQ_ADJUST_SHIFT - fir_step_shift) << fir_width_shift; + UINT idx = ~(DWORD)ipos_num >> (FREQ_ADJUST_SHIFT - FIR_STEP_SHIFT) << FIR_WIDTH_SHIFT; float rem = 1.0f - rem_inv; int j; float sum = 0.0; float* cache = &input[ipos]; - for (j = 0; j < fir_width; j++) - sum += (fir[idx + j] * rem_inv + fir[idx + j + fir_width] * rem) * cache[j]; + for (j = 0; j < FIR_WIDTH; j++) + sum += (fir[idx + j] * rem_inv + fir[idx + j + FIR_WIDTH] * rem) * cache[j]; output[i] = sum; rem_inv += rem_inv_step; @@ -389,7 +389,7 @@ static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, fl } /** - * Note that this function will overwrite up to fir_width - 1 frames before and + * Note that this function will overwrite up to FIR_WIDTH - 1 frames before and * after output[]. 
*/ static void resample(DWORD freq_adjust_num, DWORD freq_adjust_den, DWORD freq_acc_start, @@ -435,15 +435,15 @@ static UINT cp_fields_resample(IDirectSoundBufferImpl *dsb, UINT count, DWORD *f UINT max_ipos = (freqAcc_start + count * dsb->freqAdjustNum) / dsb->freqAdjustDen; UINT required_input = max( - (freqAcc_start + (count - 1) * dsb->freqAdjustNum) / dsb->freqAdjustDen + fir_width, - (freqAcc_start + (count - 1 + fir_width) * dsb->freqAdjustNum) / dsb->freqAdjustDen); + (freqAcc_start + (count - 1) * dsb->freqAdjustNum) / dsb->freqAdjustDen + FIR_WIDTH, + (freqAcc_start + (count - 1 + FIR_WIDTH) * dsb->freqAdjustNum) / dsb->freqAdjustDen); float *intermediate, *output, *itmp; DWORD len = required_input * channels; /* Allocate an output buffer for each channel with padding on both ends as * required by the resample function. Padding at the end of one channel * buffer is reused as a start padding for the next channel buffer. */ - len += fir_width - 1 + (count + fir_width - 1) * channels; + len += FIR_WIDTH - 1 + (count + FIR_WIDTH - 1) * channels; len *= sizeof(float); *freqAccNum = freqAcc_end % dsb->freqAdjustDen; @@ -460,7 +460,7 @@ static UINT cp_fields_resample(IDirectSoundBufferImpl *dsb, UINT count, DWORD *f } intermediate = dsb->device->cp_buffer; - output = intermediate + required_input * channels + fir_width - 1; + output = intermediate + required_input * channels + FIR_WIDTH - 1; if(dsb->use_committed) { committed_samples = (dsb->writelead - dsb->committed_mixpos) / istride; @@ -484,11 +484,11 @@ static UINT cp_fields_resample(IDirectSoundBufferImpl *dsb, UINT count, DWORD *f for (channel = 0; channel < channels; channel++) resample(dsb->freqAdjustNum, dsb->freqAdjustDen, freqAcc_start, dsb->firgain, required_input, count, intermediate + channel * required_input, - output + channel * (fir_width - 1 + count)); + output + channel * (FIR_WIDTH - 1 + count)); for(i = 0; i < count; ++i) for (channel = 0; channel < channels; channel++) - dsb->put(dsb, i * 
ostride, channel, output[channel * (fir_width - 1 + count) + i]); + dsb->put(dsb, i * ostride, channel, output[channel * (FIR_WIDTH - 1 + count) + i]); return max_ipos; } -- GitLab https://gitlab.winehq.org/wine/wine/-/merge_requests/10716

Anton Baskanov

5:49 a.m.

New subject: [PATCH 4/5] dsound: Add an SSE version of upsample.

From: Anton Baskanov <baskanov@gmail.com> --- dlls/dsound/dsound_main.c | 12 ++ dlls/dsound/dsound_private.h | 4 + dlls/dsound/fir.h | 4 +- dlls/dsound/mixer.c | 278 +++++++++++++++++++++++++++++++++++ 4 files changed, 296 insertions(+), 2 deletions(-) diff --git a/dlls/dsound/dsound_main.c b/dlls/dsound/dsound_main.c index 8936b437ba2..dbcf5a79899 100644 --- a/dlls/dsound/dsound_main.c +++ b/dlls/dsound/dsound_main.c @@ -63,6 +63,10 @@ WINE_DEFAULT_DEBUG_CHANNEL(dsound); +#ifdef __i386__ +BOOL sse_supported; +#endif + struct list DSOUND_renderers = LIST_INIT(DSOUND_renderers); CRITICAL_SECTION DSOUND_renderers_lock; static CRITICAL_SECTION_DEBUG DSOUND_renderers_lock_debug = @@ -82,6 +86,13 @@ GUID *DSOUND_capture_guids; /* All default settings, you most likely don't want to touch these, see wiki on UsefulRegistryKeys */ int ds_hel_buflen = 32768 * 2; +static void init_cpu_features(void) +{ +#ifdef __i386__ + sse_supported = IsProcessorFeaturePresent(PF_XMMI_INSTRUCTIONS_AVAILABLE); +#endif +} + /* * Get a config key from either the app-specific or the default config */ @@ -787,6 +798,7 @@ BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpvReserved) DisableThreadLibraryCalls(hInstDLL); /* Increase refcount on dsound by 1 */ GetModuleHandleExW(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS, (LPCWSTR)hInstDLL, &hInstDLL); + init_cpu_features(); break; case DLL_PROCESS_DETACH: if (lpvReserved) break; diff --git a/dlls/dsound/dsound_private.h b/dlls/dsound/dsound_private.h index 0e695698046..b13c3858e44 100644 --- a/dlls/dsound/dsound_private.h +++ b/dlls/dsound/dsound_private.h @@ -251,6 +251,10 @@ HRESULT IDirectSoundCaptureImpl_Create(IUnknown *outer_unk, REFIID riid, void ** #define STATE_CAPTURING 2 #define STATE_STOPPING 3 +#ifdef __i386__ +extern BOOL sse_supported; +#endif + extern CRITICAL_SECTION DSOUND_renderers_lock; extern struct list DSOUND_renderers; diff --git a/dlls/dsound/fir.h b/dlls/dsound/fir.h index 76ac521e0f3..3b9bccbdb83 100644 --- 
a/dlls/dsound/fir.h +++ b/dlls/dsound/fir.h @@ -90,7 +90,7 @@ int main() printf("#define FIR_WIDTH %d\n", fir_width); printf("#define FIR_STEP_SHIFT %d\n", fir_step_shift); printf("#define FIR_STEP %d\n", fir_step); - printf("static const float fir[] = {"); + printf("static const float __attribute__((used, aligned(16))) fir[] = {"); // Print the FIR array with an additional row at the end. This simplifies // calculation of the interpolated value by allowing the index to overflow // into the extra row. It just repeats the first row, starting from its @@ -118,7 +118,7 @@ int main() #define FIR_WIDTH 64 #define FIR_STEP_SHIFT 7 #define FIR_STEP 128 -static const float fir[] = { +static const float __attribute__((used, aligned(16))) fir[] = { 0.0000000000e+00, -2.4830013102e-06, 1.9318705150e-06, 2.6614854151e-06, -1.5313785194e-05, 4.2076214553e-05, -9.1417167945e-05, 1.7455895136e-04, -3.0567859821e-04, 5.0191365396e-04, -7.8311909082e-04, 1.1713337628e-03, diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c index 1b4b1c7bd7a..78a1ef2bb33 100644 --- a/dlls/dsound/mixer.c +++ b/dlls/dsound/mixer.c @@ -34,6 +34,7 @@ #include "wingdi.h" #include "mmreg.h" #include "wine/debug.h" +#include "wine/asm.h" #include "dsound.h" #include "ks.h" #include "ksmedia.h" @@ -45,6 +46,14 @@ WINE_DEFAULT_DEBUG_CHANNEL(dsound); #define FREQ_ADJUST_SHIFT 32 #define FIXED_0_32_TO_FLOAT(x) ((int)((x) >> 1) * (1.0f / (1ll << 31))) +#define STR(a) #a +#define EXPAND_STR(a) STR(a) + +static const float __attribute__((used, aligned(16))) one[] = +{ + 1.0f, 1.0f, 1.0f, 1.0f, +}; + void DSOUND_RecalcVolPan(PDSVOLUMEPAN volpan) { double temp; @@ -284,6 +293,82 @@ static inline float get_current_sample(const IDirectSoundBufferImpl *dsb, return dsb->get(dsb, buffer + (mixpos % buflen), channel); } +#ifdef __i386__ + +#define INIT \ + "push %ebx\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \ + __ASM_CFI(".cfi_offset %ebx,-8\n\t") \ + "push %ebp\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset 
4\n\t") \ + __ASM_CFI(".cfi_offset %ebp,-12\n\t") \ + "push %esi\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \ + __ASM_CFI(".cfi_offset %esi,-16\n\t") \ + "push %edi\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \ + __ASM_CFI(".cfi_offset %edi,-20\n\t") + +#define CLEANUP \ + "pop %edi\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -4\n\t") \ + "pop %esi\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -4\n\t") \ + "pop %ebp\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -4\n\t") \ + "pop %ebx\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -4\n\t") + +#define ADDR_SUFFIX + +#elif defined(__x86_64__) && !defined(__arm64ec__) + +#define INIT \ + "push %rbx\n\t" \ + __ASM_SEH(".seh_pushreg %rbx\n\t") \ + __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \ + __ASM_CFI(".cfi_offset %rbx,-16\n\t") \ + "push %rbp\n\t" \ + __ASM_SEH(".seh_pushreg %rbp\n\t") \ + __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \ + __ASM_CFI(".cfi_offset %rbp,-24\n\t") \ + "push %rsi\n\t" \ + __ASM_SEH(".seh_pushreg %rsi\n\t") \ + __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \ + __ASM_CFI(".cfi_offset %rsi,-32\n\t") \ + "push %rdi\n\t" \ + __ASM_SEH(".seh_pushreg %rdi\n\t") \ + __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \ + __ASM_CFI(".cfi_offset %rdi,-40\n\t") \ + "sub $40, %rsp\n\t" \ + __ASM_SEH(".seh_stackalloc 40\n\t") \ + __ASM_CFI(".cfi_adjust_cfa_offset 40\n\t") \ + "movaps %xmm6, 16(%rsp)\n\t" \ + __ASM_SEH(".seh_savexmm %xmm6, 16\n\t") \ + __ASM_CFI(".cfi_offset %xmm6, -64\n\t") \ + "movaps %xmm7, (%rsp)\n\t" \ + __ASM_SEH(".seh_savexmm %xmm7, 0\n\t") \ + __ASM_SEH(".seh_endprologue\n\t") \ + __ASM_CFI(".cfi_offset %xmm7, -80\n\t") + +#define CLEANUP \ + "movaps (%rsp), %xmm7\n\t" \ + "movaps 16(%rsp), %xmm6\n\t" \ + "add $40, %rsp\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -40\n\t") \ + "pop %rdi\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") \ + "pop %rsi\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") \ + "pop %rbp\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") \ + "pop %rbx\n\t" \ + 
__ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") + +#define ADDR_SUFFIX "(%rip)" + +#endif + /** * Note that this function will overwrite up to FIR_WIDTH - 1 frames before and * after output[]. @@ -339,6 +424,183 @@ static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgai } } +#if defined(__i386__) || (defined(__x86_64__) && !defined(__arm64ec__)) + +#ifdef __i386__ + +#define IPOS_NUM_ARG "0x14(%esp)" +#define IPOS_NUM_STEP_ARG "0x18(%esp)" +#define REM_INV_ARG "0x1c(%esp)" +#define REM_INV_STEP_ARG "0x20(%esp)" +#define COUNT_ARG "0x24(%esp)" +#define INPUT_ARG "0x28(%esp)" +#define OUTPUT_ARG "0x2c(%esp)" + +#define IPOS_NUM_REG "%ecx" +#define IPOS_NUM_STEP_REG "%edx" +#define INPUT_REG "%esi" +#define OUTPUT_REG "%edi" +#define OUTPUT_END_REG "%ebp" +#define FIR_REG "%ebx" +#define TMP_L_REG "%eax" +#define TMP_REG "%eax" + +#else + +#define IPOS_NUM_ARG "%ecx" +#define IPOS_NUM_STEP_ARG "%edx" +#define REM_INV_ARG "%xmm2" +#define REM_INV_STEP_ARG "%xmm3" +#define COUNT_ARG "0x70(%rsp)" +#define INPUT_ARG "0x78(%rsp)" +#define OUTPUT_ARG "0x80(%rsp)" + +#define IPOS_NUM_REG "%ecx" +#define IPOS_NUM_STEP_REG "%edx" +#define INPUT_REG "%rsi" +#define OUTPUT_REG "%rdi" +#define OUTPUT_END_REG "%rbp" +#define FIR_REG "%rbx" +#define TMP_L_REG "%eax" +#define TMP_REG "%rax" + +#endif + +#define REM_INV_REG "%xmm2" +#define REM_INV_STEP_REG "%xmm3" +#define ONE_REG "%xmm1" +#define REM_REG "%xmm0" +#define SUMS_REG "%xmm4" +#define FTMP0_REG "%xmm5" +#define FTMP1_REG "%xmm6" +#define FTMP2_REG "%xmm7" + +void upsample_sse(DWORD ipos_num, DWORD ipos_num_step, float rem_inv, float rem_inv_step, + UINT count, float *input, float *output); +__ASM_GLOBAL_FUNC(upsample_sse, + INIT + + "mov " IPOS_NUM_ARG ", " IPOS_NUM_REG "\n\t" + /* Store the lower half of ipos_num inverted so that we don't have to + * invert it on every iteration of the outer loop. 
*/ + "not " IPOS_NUM_REG "\n\t" + "mov " IPOS_NUM_STEP_ARG ", " IPOS_NUM_STEP_REG "\n\t" + + "movss " REM_INV_ARG ", " REM_INV_REG "\n\t" + "shufps $0, " REM_INV_REG ", " REM_INV_REG "\n\t" + "movss " REM_INV_STEP_ARG ", " REM_INV_STEP_REG "\n\t" + "shufps $0, " REM_INV_STEP_REG ", " REM_INV_STEP_REG "\n\t" + + /* Combine the upper half of ipos_num and the input pointer into a + * single value. */ + "mov " INPUT_ARG ", " INPUT_REG "\n\t" + /* Divide the input pointer by 4 to match the scale. We can do this + * because the pointer is at least 4-byte aligned. It will be scaled + * back during the access in the inner loop. */ + "shr $2, " INPUT_REG "\n\t" + + "mov " OUTPUT_ARG ", " OUTPUT_REG "\n\t" + + "mov " COUNT_ARG ", " TMP_L_REG "\n\t" + "lea (" OUTPUT_REG "," TMP_REG ",4), " OUTPUT_END_REG "\n\t" + + "movaps " __ASM_NAME("one") ADDR_SUFFIX ", " ONE_REG "\n\t" + + ".p2align 4,,10\n\t" + ".p2align 3\n\t" + "1:\n\t" + /* Calculate idx. */ + "mov " IPOS_NUM_REG ", " TMP_L_REG "\n\t" + "shr $(32 - " EXPAND_STR(FIR_STEP_SHIFT) "), " TMP_REG "\n\t" + "shl $(" EXPAND_STR(FIR_WIDTH_SHIFT) " + 2), " TMP_REG "\n\t" + /* Calculate the FIR address base. */ + "lea " __ASM_NAME("fir") ADDR_SUFFIX ", " FIR_REG "\n\t" + "add " TMP_REG ", " FIR_REG "\n\t" + + /* Calculate rem. */ + "movups " ONE_REG ", " REM_REG "\n\t" + "subps " REM_INV_REG ", " REM_REG "\n\t" + + /* Initialize j. */ + "xor " TMP_REG ", " TMP_REG "\n\t" + /* Initialize the sums. */ + "xorps " SUMS_REG ", " SUMS_REG "\n\t" + + ".p2align 4,,10\n\t" + ".p2align 3\n\t" + "2:\n\t" + /* Load the FIR coefficients. */ + "movaps (" FIR_REG "," TMP_REG "), " FTMP0_REG "\n\t" + "movaps " EXPAND_STR(FIR_WIDTH) " * 4(" FIR_REG "," TMP_REG "), " FTMP1_REG "\n\t" + /* Load the input values. */ + "movups (" TMP_REG "," INPUT_REG ",4), " FTMP2_REG "\n\t" + "add $16, " TMP_REG "\n\t" + /* Interpolate the FIR coefficients.
*/ + "mulps " REM_INV_REG ", " FTMP0_REG "\n\t" + "mulps " REM_REG ", " FTMP1_REG "\n\t" + "addps " FTMP0_REG ", " FTMP1_REG "\n\t" + /* Multiply the input values by the interpolated coefficients. */ + "mulps " FTMP2_REG ", " FTMP1_REG "\n\t" + /* Accumulate the results. */ + "addps " FTMP1_REG ", " SUMS_REG "\n\t" + "cmp $(" EXPAND_STR(FIR_WIDTH) " * 4), " TMP_REG "\n\t" + "jl 2b\n\t" + + /* Update rem_inv. */ + "addps " REM_INV_STEP_REG ", " REM_INV_REG "\n\t" + "movups " ONE_REG ", " FTMP0_REG "\n\t" + "cmpleps " REM_INV_REG ", " FTMP0_REG "\n\t" + "andps " ONE_REG ", " FTMP0_REG "\n\t" + "subps " FTMP0_REG ", " REM_INV_REG "\n\t" + + /* Update ipos_num. Use subtraction for the lower half as it is stored + * inverted. */ + "sub " IPOS_NUM_STEP_REG ", " IPOS_NUM_REG "\n\t" + "adc $0, " INPUT_REG "\n\t" + + /* Add the even-numbered sums to the odd-numbered ones. */ + "movups " SUMS_REG ", " FTMP0_REG "\n\t" + "shufps $0x31, " FTMP0_REG ", " FTMP0_REG "\n\t" + "addps " FTMP0_REG ", " SUMS_REG "\n\t" + /* Calculate the final sum and store it to the output array. */ + "movhlps " SUMS_REG ", " FTMP0_REG "\n\t" + "addss " FTMP0_REG ", " SUMS_REG "\n\t" + "movss " SUMS_REG ", (" OUTPUT_REG ")\n\t" + + /* Advance the output pointer. 
*/ + "add $4, " OUTPUT_REG "\n\t" + "cmp " OUTPUT_END_REG ", " OUTPUT_REG "\n\t" + "jl 1b\n\t" + + CLEANUP + "ret") + +#undef IPOS_NUM_ARG +#undef IPOS_NUM_STEP_ARG +#undef REM_INV_ARG +#undef REM_INV_STEP_ARG +#undef COUNT_ARG +#undef INPUT_ARG +#undef OUTPUT_ARG +#undef IPOS_NUM_REG +#undef IPOS_NUM_STEP_REG +#undef INPUT_REG +#undef OUTPUT_REG +#undef OUTPUT_END_REG +#undef FIR_REG +#undef TMP_L_REG +#undef TMP_REG +#undef REM_INV_REG +#undef REM_INV_STEP_REG +#undef ONE_REG +#undef REM_REG +#undef SUMS_REG +#undef FTMP0_REG +#undef FTMP1_REG +#undef FTMP2_REG + +#endif + static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, float *input, float *output) { @@ -366,8 +628,19 @@ static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, fl float rem_inv = FIXED_0_32_TO_FLOAT((DWORD)ipos_num << FIR_STEP_SHIFT); float rem_inv_step = FIXED_0_32_TO_FLOAT(ipos_num_step << FIR_STEP_SHIFT); + +#if defined(__x86_64__) && !defined(__arm64ec__) + upsample_sse((DWORD)ipos_num, ipos_num_step, rem_inv, rem_inv_step, count, input, output); +#else UINT i; +#ifdef __i386__ + if (sse_supported) { + upsample_sse((DWORD)ipos_num, ipos_num_step, rem_inv, rem_inv_step, count, input, output); + return; + } +#endif + for(i = 0; i < count; ++i) { UINT ipos = ipos_num >> FREQ_ADJUST_SHIFT; UINT idx = ~(DWORD)ipos_num >> (FREQ_ADJUST_SHIFT - FIR_STEP_SHIFT) << FIR_WIDTH_SHIFT; @@ -386,8 +659,13 @@ static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, fl ipos_num += ipos_num_step; } +#endif } +#undef INIT +#undef CLEANUP +#undef ADDR_SUFFIX + /** * Note that this function will overwrite up to FIR_WIDTH - 1 frames before and * after output[]. -- GitLab https://gitlab.winehq.org/wine/wine/-/merge_requests/10716

Anton Baskanov

5:49 a.m.

New subject: [PATCH 5/5] dsound: Add an SSE version of downsample.

From: Anton Baskanov <baskanov@gmail.com> --- dlls/dsound/mixer.c | 196 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 196 insertions(+) diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c index 78a1ef2bb33..702f84fc08f 100644 --- a/dlls/dsound/mixer.c +++ b/dlls/dsound/mixer.c @@ -369,6 +369,188 @@ static inline float get_current_sample(const IDirectSoundBufferImpl *dsb, #endif +#if defined(__i386__) || (defined(__x86_64__) && !defined(__arm64ec__)) + +#ifdef __i386__ + +#define OPOS_NUM_ARG "0x14(%esp)" +#define OPOS_NUM_STEP_ARG "0x18(%esp)" +#define REM_ARG "0x1c(%esp)" +#define REM_STEP_ARG "0x20(%esp)" +#define FIRGAIN_ARG "0x24(%esp)" +#define REQUIRED_INPUT_ARG "0x28(%esp)" +#define INPUT_ARG "0x2c(%esp)" +#define OUTPUT_ARG "0x30(%esp)" + +#define OPOS_NUM_REG "%ecx" +#define OPOS_NUM_STEP_REG "%edx" +#define INPUT_REG "%esi" +#define OUTPUT_REG "%edi" +#define INPUT_END_REG "%ebp" +#define FIR_REG "%ebx" +#define TMP_L_REG "%eax" +#define TMP_REG "%eax" + +#else + +#define OPOS_NUM_ARG "%ecx" +#define OPOS_NUM_STEP_ARG "%edx" +#define REM_ARG "%xmm2" +#define REM_STEP_ARG "%xmm3" +#define FIRGAIN_ARG "0x70(%rsp)" +#define REQUIRED_INPUT_ARG "0x78(%rsp)" +#define INPUT_ARG "0x80(%rsp)" +#define OUTPUT_ARG "0x88(%rsp)" + +#define OPOS_NUM_REG "%ecx" +#define OPOS_NUM_STEP_REG "%edx" +#define INPUT_REG "%rsi" +#define OUTPUT_REG "%rdi" +#define INPUT_END_REG "%rbp" +#define FIR_REG "%rbx" +#define TMP_L_REG "%eax" +#define TMP_REG "%rax" + +#endif + +#define REM_REG "%xmm2" +#define REM_STEP_REG "%xmm3" +#define FIRGAIN_REG "%xmm0" +#define ONE_REG "%xmm1" +#define INPUT_VALUE0_REG "%xmm4" +#define INPUT_VALUE1_REG "%xmm5" +#define FTMP0_REG "%xmm6" +#define FTMP1_REG "%xmm7" + +/** + * Note that this function will overwrite up to FIR_WIDTH - 1 frames before and + * after output[]. 
+ */ +void downsample_sse(DWORD opos_num, DWORD opos_num_step, float rem, float rem_step, float firgain, + UINT required_input, float *input, float *output); +__ASM_GLOBAL_FUNC( downsample_sse, + INIT + + "mov " OPOS_NUM_ARG ", " OPOS_NUM_REG "\n\t" + /* Store the lower half of opos_num inverted so that we don't have to + * invert it on every iteration of the outer loop. */ + "not " OPOS_NUM_REG "\n\t" + "mov " OPOS_NUM_STEP_ARG ", " OPOS_NUM_STEP_REG "\n\t" + + "movss " REM_ARG ", " REM_REG "\n\t" + "shufps $0, " REM_REG ", " REM_REG "\n\t" + "movss " REM_STEP_ARG ", " REM_STEP_REG "\n\t" + "shufps $0, " REM_STEP_REG ", " REM_STEP_REG "\n\t" + + "movss " FIRGAIN_ARG ", " FIRGAIN_REG "\n\t" + + "mov " INPUT_ARG ", " INPUT_REG "\n\t" + + /* Combine the upper half of opos_num and the output pointer into a + * single value. */ + "mov " OUTPUT_ARG ", " OUTPUT_REG "\n\t" + /* Divide the output pointer by 4 to match the scale. We can do this + * because the pointer is at least 4-byte aligned. It will be scaled + * back during the access in the inner loop. */ + "shr $2, " OUTPUT_REG "\n\t" + /* Subtract FIR_WIDTH so that we don't have to do this on every + * iteration of the outer loop. */ + "sub $" EXPAND_STR(FIR_WIDTH) ", " OUTPUT_REG "\n\t" + + "mov " REQUIRED_INPUT_ARG ", " TMP_L_REG "\n\t" + "lea (" INPUT_REG "," TMP_REG ",4), " INPUT_END_REG "\n\t" + + "movaps " __ASM_NAME("one") ADDR_SUFFIX ", " ONE_REG "\n\t" + + ".p2align 4,,10\n\t" + ".p2align 3\n\t" + "1:\n\t" + /* Calculate idx. */ + "mov " OPOS_NUM_REG ", " TMP_L_REG "\n\t" + "shr $(32 - " EXPAND_STR(FIR_STEP_SHIFT) "), " TMP_REG "\n\t" + "shl $(" EXPAND_STR(FIR_WIDTH_SHIFT) " + 2), " TMP_REG "\n\t" + /* Calculate the FIR address base. */ + "lea " __ASM_NAME("fir") ADDR_SUFFIX ", " FIR_REG "\n\t" + "add " TMP_REG ", " FIR_REG "\n\t" + + /* Calculate input_value. 
*/ + "movss (" INPUT_REG "), " INPUT_VALUE1_REG "\n\t" + "mulss " FIRGAIN_REG ", " INPUT_VALUE1_REG "\n\t" + "shufps $0, " INPUT_VALUE1_REG ", " INPUT_VALUE1_REG "\n\t" + "movups " INPUT_VALUE1_REG ", " INPUT_VALUE0_REG "\n\t" + /* Calculate input_value1. */ + "mulps " REM_REG ", " INPUT_VALUE1_REG "\n\t" + /* Calculate input_value0. */ + "subps " INPUT_VALUE1_REG ", " INPUT_VALUE0_REG "\n\t" + + /* Initialize i. */ + "xor " TMP_REG ", " TMP_REG "\n\t" + + ".p2align 4,,10\n\t" + ".p2align 3\n\t" + "2:\n\t" + /* Load the FIR coefficients. */ + "movaps (" FIR_REG "," TMP_REG "), " FTMP0_REG "\n\t" + "movaps " EXPAND_STR(FIR_WIDTH) " * 4(" FIR_REG "," TMP_REG "), " FTMP1_REG "\n\t" + /* Calculate the weighted sums. */ + "mulps " INPUT_VALUE0_REG ", " FTMP0_REG "\n\t" + "mulps " INPUT_VALUE1_REG ", " FTMP1_REG "\n\t" + "addps " FTMP0_REG ", " FTMP1_REG "\n\t" + /* Add the sums to the output. */ + "movups (" TMP_REG "," OUTPUT_REG ",4), " FTMP0_REG "\n\t" + "addps " FTMP1_REG ", " FTMP0_REG "\n\t" + "movups " FTMP0_REG ", (" TMP_REG "," OUTPUT_REG ",4)\n\t" + "add $16, " TMP_REG "\n\t" + "cmp $(" EXPAND_STR(FIR_WIDTH) " * 4), " TMP_REG "\n\t" + "jl 2b\n\t" + + /* Update rem. */ + "addps " REM_STEP_REG ", " REM_REG "\n\t" + "movups " ONE_REG ", " FTMP0_REG "\n\t" + "cmpleps " REM_REG ", " FTMP0_REG "\n\t" + "andps " ONE_REG ", " FTMP0_REG "\n\t" + "subps " FTMP0_REG ", " REM_REG "\n\t" + + /* Update opos_num. Use subtraction for the lower half as it is stored + * inverted. */ + "sub " OPOS_NUM_STEP_REG ", " OPOS_NUM_REG "\n\t" + "adc $0, " OUTPUT_REG "\n\t" + + /* Advance the input pointer. 
*/ + "add $4, " INPUT_REG "\n\t" + "cmp " INPUT_END_REG ", " INPUT_REG "\n\t" + "jl 1b\n\t" + + CLEANUP + "ret" ) + +#undef OPOS_NUM_ARG +#undef OPOS_NUM_STEP_ARG +#undef REM_ARG +#undef REM_STEP_ARG +#undef FIRGAIN_ARG +#undef REQUIRED_INPUT_ARG +#undef INPUT_ARG +#undef OUTPUT_ARG +#undef OPOS_NUM_REG +#undef OPOS_NUM_STEP_REG +#undef INPUT_REG +#undef OUTPUT_REG +#undef INPUT_END_REG +#undef FIR_REG +#undef TMP_L_REG +#undef TMP_REG +#undef REM_REG +#undef REM_STEP_REG +#undef FIRGAIN_REG +#undef ONE_REG +#undef INPUT_VALUE0_REG +#undef INPUT_VALUE1_REG +#undef FTMP0_REG +#undef FTMP1_REG + +#endif + /** * Note that this function will overwrite up to FIR_WIDTH - 1 frames before and * after output[]. @@ -402,8 +584,21 @@ static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgai * remain cleared. */ float rem = FIXED_0_32_TO_FLOAT(((DWORD)opos_num ^ (DWORD)opos_num_mask) << FIR_STEP_SHIFT); float rem_step = FIXED_0_32_TO_FLOAT(-opos_num_step << FIR_STEP_SHIFT); + +#if defined(__x86_64__) && !defined(__arm64ec__) + downsample_sse((DWORD)opos_num, opos_num_step, rem, rem_step, firgain, required_input, input, + output + (opos_num >> FREQ_ADJUST_SHIFT)); +#else int j; +#ifdef __i386__ + if (sse_supported) { + downsample_sse((DWORD)opos_num, opos_num_step, rem, rem_step, firgain, required_input, + input, output + (opos_num >> FREQ_ADJUST_SHIFT)); + return; + } +#endif + for (j = 0; j < required_input; ++j) { /* opos is in the range [-(fir_width - 1), count) */ int opos = (int)(opos_num >> FREQ_ADJUST_SHIFT) - FIR_WIDTH; @@ -422,6 +617,7 @@ static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgai opos_num += opos_num_step; } +#endif } #if defined(__i386__) || (defined(__x86_64__) && !defined(__arm64ec__)) -- GitLab https://gitlab.winehq.org/wine/wine/-/merge_requests/10716

Age (days ago)

Last active (days ago)

List overview

5 comments

2 participants

participants (2)

Anton Baskanov
Anton Baskanov (＠baskanov)