[PATCH 0/5] MR10716: dsound: Speed up resampling, part 6
From: Anton Baskanov <baskanov@gmail.com> --- dlls/dsound/dsound_private.h | 4 ++-- dlls/dsound/mixer.c | 24 ++++++++++++++---------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/dlls/dsound/dsound_private.h b/dlls/dsound/dsound_private.h index 75279dacf87..0e695698046 100644 --- a/dlls/dsound/dsound_private.h +++ b/dlls/dsound/dsound_private.h @@ -147,8 +147,8 @@ struct IDirectSoundBufferImpl DSBUFFERDESC dsbd; /* used for frequency conversion (PerfectPitch) */ float firgain; - LONG64 freqAdjustNum,freqAdjustDen; - LONG64 freqAccNum; + DWORD freqAdjustNum,freqAdjustDen; + DWORD freqAccNum; /* used for mixing */ DWORD sec_mixpos; /* Holds a copy of the next 'writelead' bytes, to be used for mixing. This makes it diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c index 45e37c5bcb1..26a35722e50 100644 --- a/dlls/dsound/mixer.c +++ b/dlls/dsound/mixer.c @@ -106,7 +106,7 @@ void DSOUND_RecalcFormat(IDirectSoundBufferImpl *dsb) { DWORD ichannels = dsb->pwfx->nChannels; DWORD ochannels = dsb->device->pwfx->nChannels; - LONG64 oldFreqAdjustDen = dsb->freqAdjustDen; + DWORD oldFreqAdjustDen = dsb->freqAdjustDen; WAVEFORMATEXTENSIBLE *pwfxe; BOOL ieee = FALSE; @@ -131,7 +131,8 @@ void DSOUND_RecalcFormat(IDirectSoundBufferImpl *dsb) dsb->maxwritelead = (DSBFREQUENCY_MAX / 100) * dsb->pwfx->nBlockAlign; if (oldFreqAdjustDen) - dsb->freqAccNum = (dsb->freqAccNum * dsb->freqAdjustDen + oldFreqAdjustDen / 2) / oldFreqAdjustDen; + dsb->freqAccNum = (dsb->freqAccNum * (LONG64)dsb->freqAdjustDen + + oldFreqAdjustDen / 2) / oldFreqAdjustDen; dsb->get_aux = ieee ? getbpp[4] : getbpp[dsb->pwfx->wBitsPerSample/8 - 1]; dsb->put_aux = putieee32; @@ -419,17 +420,18 @@ static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, fl * Note that this function will overwrite up to fir_width - 1 frames before and * after output[]. 
*/ -static void resample(LONG64 freq_adjust_num, LONG64 freq_adjust_den, LONG64 freq_acc_start, +static void resample(DWORD freq_adjust_num, DWORD freq_adjust_den, DWORD freq_acc_start, float firgain, UINT required_input, UINT count, float *input, float *output) { if (freq_adjust_num > freq_adjust_den) { /* Take a reciprocal of the resampling ratio and convert it to a 0.32 * fixed point. Round down to prevent output buffer overflow. */ - DWORD freq_adjust_fixed_den = (freq_adjust_den << FREQ_ADJUST_SHIFT) / freq_adjust_num; + DWORD freq_adjust_fixed_den = ((LONG64)freq_adjust_den << FREQ_ADJUST_SHIFT) + / freq_adjust_num; /* Convert the subsample position to a 0.32 fixed point. Round up to * prevent output buffer overflow. */ - DWORD freq_acc_fixed_start = (freq_acc_start * freq_adjust_fixed_den + freq_adjust_den - 1) - / freq_adjust_den; + DWORD freq_acc_fixed_start = ((LONG64)freq_acc_start * freq_adjust_fixed_den + + freq_adjust_den - 1) / freq_adjust_den; memset(output, 0, count * sizeof(float)); downsample(freq_adjust_fixed_den, freq_acc_fixed_start, firgain, required_input, input, @@ -437,16 +439,18 @@ static void resample(LONG64 freq_adjust_num, LONG64 freq_adjust_den, LONG64 freq } else { /* Convert the resampling ratio to a 0.32 fixed point. Round down to * prevent input buffer overflow. */ - DWORD freq_adjust_fixed_num = (freq_adjust_num << FREQ_ADJUST_SHIFT) / freq_adjust_den; + DWORD freq_adjust_fixed_num = ((LONG64)freq_adjust_num << FREQ_ADJUST_SHIFT) + / freq_adjust_den; /* Convert the subsample position to a 0.32 fixed point. Round down to * prevent input buffer overflow. 
*/ - DWORD freq_acc_fixed_start = (freq_acc_start << FREQ_ADJUST_SHIFT) / freq_adjust_den; + DWORD freq_acc_fixed_start = ((LONG64)freq_acc_start << FREQ_ADJUST_SHIFT) + / freq_adjust_den; upsample(freq_adjust_fixed_num, freq_acc_fixed_start, count, input, output); } } -static UINT cp_fields_resample(IDirectSoundBufferImpl *dsb, UINT count, LONG64 *freqAccNum) +static UINT cp_fields_resample(IDirectSoundBufferImpl *dsb, UINT count, DWORD *freqAccNum) { UINT i, channel; UINT istride = dsb->pwfx->nBlockAlign; @@ -517,7 +521,7 @@ static UINT cp_fields_resample(IDirectSoundBufferImpl *dsb, UINT count, LONG64 * return max_ipos; } -static void cp_fields(IDirectSoundBufferImpl *dsb, UINT count, LONG64 *freqAccNum) +static void cp_fields(IDirectSoundBufferImpl *dsb, UINT count, DWORD *freqAccNum) { DWORD ipos, adv; -- GitLab https://gitlab.winehq.org/wine/wine/-/merge_requests/10716
From: Anton Baskanov <baskanov@gmail.com> --- dlls/dsound/mixer.c | 56 ++++++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c index 26a35722e50..d66dec2ed7e 100644 --- a/dlls/dsound/mixer.c +++ b/dlls/dsound/mixer.c @@ -284,34 +284,6 @@ static inline float get_current_sample(const IDirectSoundBufferImpl *dsb, return dsb->get(dsb, buffer + (mixpos % buflen), channel); } -static UINT cp_fields_noresample(IDirectSoundBufferImpl *dsb, UINT count) -{ - UINT istride = dsb->pwfx->nBlockAlign; - UINT ostride = dsb->device->pwfx->nChannels * sizeof(float); - UINT committed_samples = 0; - DWORD channel, i; - - if (!secondarybuffer_is_audible(dsb)) - return count; - - if(dsb->use_committed) { - committed_samples = (dsb->writelead - dsb->committed_mixpos) / istride; - committed_samples = committed_samples <= count ? committed_samples : count; - } - - for (i = 0; i < committed_samples; i++) - for (channel = 0; channel < dsb->mix_channels; channel++) - dsb->put(dsb, i * ostride, channel, get_current_sample(dsb, dsb->committedbuff, - dsb->writelead, dsb->committed_mixpos + i * istride, channel)); - - for (; i < count; i++) - for (channel = 0; channel < dsb->mix_channels; channel++) - dsb->put(dsb, i * ostride, channel, get_current_sample(dsb, dsb->buffer->memory, - dsb->buflen, dsb->sec_mixpos + i * istride, channel)); - - return count; -} - /** * Note that this function will overwrite up to fir_width - 1 frames before and * after output[]. 
@@ -521,6 +493,34 @@ static UINT cp_fields_resample(IDirectSoundBufferImpl *dsb, UINT count, DWORD *f return max_ipos; } +static UINT cp_fields_noresample(IDirectSoundBufferImpl *dsb, UINT count) +{ + UINT istride = dsb->pwfx->nBlockAlign; + UINT ostride = dsb->device->pwfx->nChannels * sizeof(float); + UINT committed_samples = 0; + DWORD channel, i; + + if (!secondarybuffer_is_audible(dsb)) + return count; + + if(dsb->use_committed) { + committed_samples = (dsb->writelead - dsb->committed_mixpos) / istride; + committed_samples = committed_samples <= count ? committed_samples : count; + } + + for (i = 0; i < committed_samples; i++) + for (channel = 0; channel < dsb->mix_channels; channel++) + dsb->put(dsb, i * ostride, channel, get_current_sample(dsb, dsb->committedbuff, + dsb->writelead, dsb->committed_mixpos + i * istride, channel)); + + for (; i < count; i++) + for (channel = 0; channel < dsb->mix_channels; channel++) + dsb->put(dsb, i * ostride, channel, get_current_sample(dsb, dsb->buffer->memory, + dsb->buflen, dsb->sec_mixpos + i * istride, channel)); + + return count; +} + static void cp_fields(IDirectSoundBufferImpl *dsb, UINT count, DWORD *freqAccNum) { DWORD ipos, adv; -- GitLab https://gitlab.winehq.org/wine/wine/-/merge_requests/10716
From: Anton Baskanov <baskanov@gmail.com> --- dlls/dsound/fir.h | 16 ++++++++-------- dlls/dsound/mixer.c | 42 +++++++++++++++++++++--------------------- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/dlls/dsound/fir.h b/dlls/dsound/fir.h index 45ad65d7398..76ac521e0f3 100644 --- a/dlls/dsound/fir.h +++ b/dlls/dsound/fir.h @@ -86,10 +86,10 @@ int main() fprintf(stderr, "q %f\n", (double)output.q); fprintf(stderr, "status %s\n", get_pm_status_str(output.status)); - printf("static const int fir_width_shift = %d;\n", fir_width_shift); - printf("static const int fir_width = %d;\n", fir_width); - printf("static const int fir_step_shift = %d;\n", fir_step_shift); - printf("static const int fir_step = %d;\n", fir_step); + printf("#define FIR_WIDTH_SHIFT %d\n", fir_width_shift); + printf("#define FIR_WIDTH %d\n", fir_width); + printf("#define FIR_STEP_SHIFT %d\n", fir_step_shift); + printf("#define FIR_STEP %d\n", fir_step); printf("static const float fir[] = {"); // Print the FIR array with an additional row at the end. 
This simplifies // calculation of the interpolated value by allowing the index to overflow @@ -114,10 +114,10 @@ int main() printf("};\n"); } */ -static const int fir_width_shift = 6; -static const int fir_width = 64; -static const int fir_step_shift = 7; -static const int fir_step = 128; +#define FIR_WIDTH_SHIFT 6 +#define FIR_WIDTH 64 +#define FIR_STEP_SHIFT 7 +#define FIR_STEP 128 static const float fir[] = { 0.0000000000e+00, -2.4830013102e-06, 1.9318705150e-06, 2.6614854151e-06, -1.5313785194e-05, 4.2076214553e-05, -9.1417167945e-05, 1.7455895136e-04, diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c index d66dec2ed7e..1b4b1c7bd7a 100644 --- a/dlls/dsound/mixer.c +++ b/dlls/dsound/mixer.c @@ -285,7 +285,7 @@ static inline float get_current_sample(const IDirectSoundBufferImpl *dsb, } /** - * Note that this function will overwrite up to fir_width - 1 frames before and + * Note that this function will overwrite up to FIR_WIDTH - 1 frames before and * after output[]. */ static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgain, @@ -309,28 +309,28 @@ static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgai * Clearing the bits is safe as it has the same effect as rounding up the * resampling ratio and the subsample position and doesn't affect the * initial opos value. */ - LONG64 opos_num_mask = ~0ull << (FREQ_ADJUST_SHIFT - 23 - fir_step_shift); + LONG64 opos_num_mask = ~0ull << (FREQ_ADJUST_SHIFT - 23 - FIR_STEP_SHIFT); LONG64 opos_num = (freq_adjust_den - freq_acc_start + (1ll << FREQ_ADJUST_SHIFT) - 1) & opos_num_mask; DWORD opos_num_step = freq_adjust_den & (DWORD)opos_num_mask; /* Use XOR to invert the lower part of opos_num so that the lower bits * remain cleared. 
*/ - float rem = FIXED_0_32_TO_FLOAT(((DWORD)opos_num ^ (DWORD)opos_num_mask) << fir_step_shift); - float rem_step = FIXED_0_32_TO_FLOAT(-opos_num_step << fir_step_shift); + float rem = FIXED_0_32_TO_FLOAT(((DWORD)opos_num ^ (DWORD)opos_num_mask) << FIR_STEP_SHIFT); + float rem_step = FIXED_0_32_TO_FLOAT(-opos_num_step << FIR_STEP_SHIFT); int j; for (j = 0; j < required_input; ++j) { /* opos is in the range [-(fir_width - 1), count) */ - int opos = (int)(opos_num >> FREQ_ADJUST_SHIFT) - fir_width; - UINT idx = ~(DWORD)opos_num >> (FREQ_ADJUST_SHIFT - fir_step_shift) << fir_width_shift; + int opos = (int)(opos_num >> FREQ_ADJUST_SHIFT) - FIR_WIDTH; + UINT idx = ~(DWORD)opos_num >> (FREQ_ADJUST_SHIFT - FIR_STEP_SHIFT) << FIR_WIDTH_SHIFT; float input_value = input[j] * firgain; float input_value0 = (1.0f - rem) * input_value; float input_value1 = rem * input_value; int i; - for (i = 0; i < fir_width; ++i) - output[opos + i] += fir[idx + i] * input_value0 + fir[idx + fir_width + i] * input_value1; + for (i = 0; i < FIR_WIDTH; ++i) + output[opos + i] += fir[idx + i] * input_value0 + fir[idx + FIR_WIDTH + i] * input_value1; rem += rem_step; rem -= rem >= 1.0f ? 1.0f : 0.0f; @@ -360,25 +360,25 @@ static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, fl * * Clearing the bits is safe as it has the same effect as rounding down the * resampling ratio and the subsample position. 
*/ - DWORD ipos_num_mask = ~0u << (FREQ_ADJUST_SHIFT - 23 - fir_step_shift); + DWORD ipos_num_mask = ~0u << (FREQ_ADJUST_SHIFT - 23 - FIR_STEP_SHIFT); LONG64 ipos_num = freq_acc_start & ipos_num_mask; DWORD ipos_num_step = freq_adjust_num & ipos_num_mask; - float rem_inv = FIXED_0_32_TO_FLOAT((DWORD)ipos_num << fir_step_shift); - float rem_inv_step = FIXED_0_32_TO_FLOAT(ipos_num_step << fir_step_shift); + float rem_inv = FIXED_0_32_TO_FLOAT((DWORD)ipos_num << FIR_STEP_SHIFT); + float rem_inv_step = FIXED_0_32_TO_FLOAT(ipos_num_step << FIR_STEP_SHIFT); UINT i; for(i = 0; i < count; ++i) { UINT ipos = ipos_num >> FREQ_ADJUST_SHIFT; - UINT idx = ~(DWORD)ipos_num >> (FREQ_ADJUST_SHIFT - fir_step_shift) << fir_width_shift; + UINT idx = ~(DWORD)ipos_num >> (FREQ_ADJUST_SHIFT - FIR_STEP_SHIFT) << FIR_WIDTH_SHIFT; float rem = 1.0f - rem_inv; int j; float sum = 0.0; float* cache = &input[ipos]; - for (j = 0; j < fir_width; j++) - sum += (fir[idx + j] * rem_inv + fir[idx + j + fir_width] * rem) * cache[j]; + for (j = 0; j < FIR_WIDTH; j++) + sum += (fir[idx + j] * rem_inv + fir[idx + j + FIR_WIDTH] * rem) * cache[j]; output[i] = sum; rem_inv += rem_inv_step; @@ -389,7 +389,7 @@ static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, fl } /** - * Note that this function will overwrite up to fir_width - 1 frames before and + * Note that this function will overwrite up to FIR_WIDTH - 1 frames before and * after output[]. 
*/ static void resample(DWORD freq_adjust_num, DWORD freq_adjust_den, DWORD freq_acc_start, @@ -435,15 +435,15 @@ static UINT cp_fields_resample(IDirectSoundBufferImpl *dsb, UINT count, DWORD *f UINT max_ipos = (freqAcc_start + count * dsb->freqAdjustNum) / dsb->freqAdjustDen; UINT required_input = max( - (freqAcc_start + (count - 1) * dsb->freqAdjustNum) / dsb->freqAdjustDen + fir_width, - (freqAcc_start + (count - 1 + fir_width) * dsb->freqAdjustNum) / dsb->freqAdjustDen); + (freqAcc_start + (count - 1) * dsb->freqAdjustNum) / dsb->freqAdjustDen + FIR_WIDTH, + (freqAcc_start + (count - 1 + FIR_WIDTH) * dsb->freqAdjustNum) / dsb->freqAdjustDen); float *intermediate, *output, *itmp; DWORD len = required_input * channels; /* Allocate an output buffer for each channel with padding on both ends as * required by the resample function. Padding at the end of one channel * buffer is reused as a start padding for the next channel buffer. */ - len += fir_width - 1 + (count + fir_width - 1) * channels; + len += FIR_WIDTH - 1 + (count + FIR_WIDTH - 1) * channels; len *= sizeof(float); *freqAccNum = freqAcc_end % dsb->freqAdjustDen; @@ -460,7 +460,7 @@ static UINT cp_fields_resample(IDirectSoundBufferImpl *dsb, UINT count, DWORD *f } intermediate = dsb->device->cp_buffer; - output = intermediate + required_input * channels + fir_width - 1; + output = intermediate + required_input * channels + FIR_WIDTH - 1; if(dsb->use_committed) { committed_samples = (dsb->writelead - dsb->committed_mixpos) / istride; @@ -484,11 +484,11 @@ static UINT cp_fields_resample(IDirectSoundBufferImpl *dsb, UINT count, DWORD *f for (channel = 0; channel < channels; channel++) resample(dsb->freqAdjustNum, dsb->freqAdjustDen, freqAcc_start, dsb->firgain, required_input, count, intermediate + channel * required_input, - output + channel * (fir_width - 1 + count)); + output + channel * (FIR_WIDTH - 1 + count)); for(i = 0; i < count; ++i) for (channel = 0; channel < channels; channel++) - dsb->put(dsb, i * 
ostride, channel, output[channel * (fir_width - 1 + count) + i]); + dsb->put(dsb, i * ostride, channel, output[channel * (FIR_WIDTH - 1 + count) + i]); return max_ipos; } -- GitLab https://gitlab.winehq.org/wine/wine/-/merge_requests/10716
From: Anton Baskanov <baskanov@gmail.com> --- dlls/dsound/dsound_main.c | 12 ++ dlls/dsound/dsound_private.h | 4 + dlls/dsound/fir.h | 4 +- dlls/dsound/mixer.c | 278 +++++++++++++++++++++++++++++++++++ 4 files changed, 296 insertions(+), 2 deletions(-) diff --git a/dlls/dsound/dsound_main.c b/dlls/dsound/dsound_main.c index 8936b437ba2..dbcf5a79899 100644 --- a/dlls/dsound/dsound_main.c +++ b/dlls/dsound/dsound_main.c @@ -63,6 +63,10 @@ WINE_DEFAULT_DEBUG_CHANNEL(dsound); +#ifdef __i386__ +BOOL sse_supported; +#endif + struct list DSOUND_renderers = LIST_INIT(DSOUND_renderers); CRITICAL_SECTION DSOUND_renderers_lock; static CRITICAL_SECTION_DEBUG DSOUND_renderers_lock_debug = @@ -82,6 +86,13 @@ GUID *DSOUND_capture_guids; /* All default settings, you most likely don't want to touch these, see wiki on UsefulRegistryKeys */ int ds_hel_buflen = 32768 * 2; +static void init_cpu_features(void) +{ +#ifdef __i386__ + sse_supported = IsProcessorFeaturePresent(PF_XMMI_INSTRUCTIONS_AVAILABLE); +#endif +} + /* * Get a config key from either the app-specific or the default config */ @@ -787,6 +798,7 @@ BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpvReserved) DisableThreadLibraryCalls(hInstDLL); /* Increase refcount on dsound by 1 */ GetModuleHandleExW(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS, (LPCWSTR)hInstDLL, &hInstDLL); + init_cpu_features(); break; case DLL_PROCESS_DETACH: if (lpvReserved) break; diff --git a/dlls/dsound/dsound_private.h b/dlls/dsound/dsound_private.h index 0e695698046..b13c3858e44 100644 --- a/dlls/dsound/dsound_private.h +++ b/dlls/dsound/dsound_private.h @@ -251,6 +251,10 @@ HRESULT IDirectSoundCaptureImpl_Create(IUnknown *outer_unk, REFIID riid, void ** #define STATE_CAPTURING 2 #define STATE_STOPPING 3 +#ifdef __i386__ +extern BOOL sse_supported; +#endif + extern CRITICAL_SECTION DSOUND_renderers_lock; extern struct list DSOUND_renderers; diff --git a/dlls/dsound/fir.h b/dlls/dsound/fir.h index 76ac521e0f3..3b9bccbdb83 100644 --- 
a/dlls/dsound/fir.h +++ b/dlls/dsound/fir.h @@ -90,7 +90,7 @@ int main() printf("#define FIR_WIDTH %d\n", fir_width); printf("#define FIR_STEP_SHIFT %d\n", fir_step_shift); printf("#define FIR_STEP %d\n", fir_step); - printf("static const float fir[] = {"); + printf("static const float __attribute__((used, aligned(16))) fir[] = {"); // Print the FIR array with an additional row at the end. This simplifies // calculation of the interpolated value by allowing the index to overflow // into the extra row. It just repeats the first row, starting from its @@ -118,7 +118,7 @@ int main() #define FIR_WIDTH 64 #define FIR_STEP_SHIFT 7 #define FIR_STEP 128 -static const float fir[] = { +static const float __attribute__((used, aligned(16))) fir[] = { 0.0000000000e+00, -2.4830013102e-06, 1.9318705150e-06, 2.6614854151e-06, -1.5313785194e-05, 4.2076214553e-05, -9.1417167945e-05, 1.7455895136e-04, -3.0567859821e-04, 5.0191365396e-04, -7.8311909082e-04, 1.1713337628e-03, diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c index 1b4b1c7bd7a..78a1ef2bb33 100644 --- a/dlls/dsound/mixer.c +++ b/dlls/dsound/mixer.c @@ -34,6 +34,7 @@ #include "wingdi.h" #include "mmreg.h" #include "wine/debug.h" +#include "wine/asm.h" #include "dsound.h" #include "ks.h" #include "ksmedia.h" @@ -45,6 +46,14 @@ WINE_DEFAULT_DEBUG_CHANNEL(dsound); #define FREQ_ADJUST_SHIFT 32 #define FIXED_0_32_TO_FLOAT(x) ((int)((x) >> 1) * (1.0f / (1ll << 31))) +#define STR(a) #a +#define EXPAND_STR(a) STR(a) + +static const float __attribute__((used, aligned(16))) one[] = +{ + 1.0f, 1.0f, 1.0f, 1.0f, +}; + void DSOUND_RecalcVolPan(PDSVOLUMEPAN volpan) { double temp; @@ -284,6 +293,82 @@ static inline float get_current_sample(const IDirectSoundBufferImpl *dsb, return dsb->get(dsb, buffer + (mixpos % buflen), channel); } +#ifdef __i386__ + +#define INIT \ + "push %ebx\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \ + __ASM_CFI(".cfi_offset %ebx,-8\n\t") \ + "push %ebp\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset 
4\n\t") \ + __ASM_CFI(".cfi_offset %ebp,-12\n\t") \ + "push %esi\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \ + __ASM_CFI(".cfi_offset %esi,-16\n\t") \ + "push %edi\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \ + __ASM_CFI(".cfi_offset %edi,-20\n\t") + +#define CLEANUP \ + "pop %edi\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -4\n\t") \ + "pop %esi\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -4\n\t") \ + "pop %ebp\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -4\n\t") \ + "pop %ebx\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -4\n\t") + +#define ADDR_SUFFIX + +#elif defined(__x86_64__) && !defined(__arm64ec__) + +#define INIT \ + "push %rbx\n\t" \ + __ASM_SEH(".seh_pushreg %rbx\n\t") \ + __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \ + __ASM_CFI(".cfi_offset %rbx,-16\n\t") \ + "push %rbp\n\t" \ + __ASM_SEH(".seh_pushreg %rbp\n\t") \ + __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \ + __ASM_CFI(".cfi_offset %rbp,-24\n\t") \ + "push %rsi\n\t" \ + __ASM_SEH(".seh_pushreg %rsi\n\t") \ + __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \ + __ASM_CFI(".cfi_offset %rsi,-32\n\t") \ + "push %rdi\n\t" \ + __ASM_SEH(".seh_pushreg %rdi\n\t") \ + __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \ + __ASM_CFI(".cfi_offset %rdi,-40\n\t") \ + "sub $40, %rsp\n\t" \ + __ASM_SEH(".seh_stackalloc 40\n\t") \ + __ASM_CFI(".cfi_adjust_cfa_offset 40\n\t") \ + "movaps %xmm6, 16(%rsp)\n\t" \ + __ASM_SEH(".seh_savexmm %xmm6, 16\n\t") \ + __ASM_CFI(".cfi_offset %xmm6, -64\n\t") \ + "movaps %xmm7, (%rsp)\n\t" \ + __ASM_SEH(".seh_savexmm %xmm7, 0\n\t") \ + __ASM_SEH(".seh_endprologue\n\t") \ + __ASM_CFI(".cfi_offset %xmm7, -80\n\t") + +#define CLEANUP \ + "movaps (%rsp), %xmm7\n\t" \ + "movaps 16(%rsp), %xmm6\n\t" \ + "add $40, %rsp\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -40\n\t") \ + "pop %rdi\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") \ + "pop %rsi\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") \ + "pop %rbp\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") \ + "pop %rbx\n\t" \ + 
__ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") + +#define ADDR_SUFFIX "(%rip)" + +#endif + /** * Note that this function will overwrite up to FIR_WIDTH - 1 frames before and * after output[]. @@ -339,6 +424,183 @@ static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgai } } +#if defined(__i386__) || (defined(__x86_64__) && !defined(__arm64ec__)) + +#ifdef __i386__ + +#define IPOS_NUM_ARG "0x14(%esp)" +#define IPOS_NUM_STEP_ARG "0x18(%esp)" +#define REM_INV_ARG "0x1c(%esp)" +#define REM_INV_STEP_ARG "0x20(%esp)" +#define COUNT_ARG "0x24(%esp)" +#define INPUT_ARG "0x28(%esp)" +#define OUTPUT_ARG "0x2c(%esp)" + +#define IPOS_NUM_REG "%ecx" +#define IPOS_NUM_STEP_REG "%edx" +#define INPUT_REG "%esi" +#define OUTPUT_REG "%edi" +#define OUTPUT_END_REG "%ebp" +#define FIR_REG "%ebx" +#define TMP_L_REG "%eax" +#define TMP_REG "%eax" + +#else + +#define IPOS_NUM_ARG "%ecx" +#define IPOS_NUM_STEP_ARG "%edx" +#define REM_INV_ARG "%xmm2" +#define REM_INV_STEP_ARG "%xmm3" +#define COUNT_ARG "0x70(%rsp)" +#define INPUT_ARG "0x78(%rsp)" +#define OUTPUT_ARG "0x80(%rsp)" + +#define IPOS_NUM_REG "%ecx" +#define IPOS_NUM_STEP_REG "%edx" +#define INPUT_REG "%rsi" +#define OUTPUT_REG "%rdi" +#define OUTPUT_END_REG "%rbp" +#define FIR_REG "%rbx" +#define TMP_L_REG "%eax" +#define TMP_REG "%rax" + +#endif + +#define REM_INV_REG "%xmm2" +#define REM_INV_STEP_REG "%xmm3" +#define ONE_REG "%xmm1" +#define REM_REG "%xmm0" +#define SUMS_REG "%xmm4" +#define FTMP0_REG "%xmm5" +#define FTMP1_REG "%xmm6" +#define FTMP2_REG "%xmm7" + +void upsample_sse(DWORD ipos_num, DWORD ipos_num_step, float rem_inv, float rem_inv_step, + UINT count, float *input, float *output); +__ASM_GLOBAL_FUNC(upsample_sse, + INIT + + "mov " IPOS_NUM_ARG ", " IPOS_NUM_REG "\n\t" + /* Store the lower half of ipos_num inverted so that we don't have to + * invert it on every iteration of the outer loop. 
*/ + "not " IPOS_NUM_REG "\n\t" + "mov " IPOS_NUM_STEP_ARG ", " IPOS_NUM_STEP_REG "\n\t" + + "movss " REM_INV_ARG ", " REM_INV_REG "\n\t" + "shufps $0, " REM_INV_REG ", " REM_INV_REG "\n\t" + "movss " REM_INV_STEP_ARG ", " REM_INV_STEP_REG "\n\t" + "shufps $0, " REM_INV_STEP_REG ", " REM_INV_STEP_REG "\n\t" + + /* Combine the upper half of ipos_num and the input pointer into a + * single value. */ + "mov " INPUT_ARG ", " INPUT_REG "\n\t" + /* Divide the input pointer by 4 to match the scale. We can do this + * because the pointer is at least 4-byte aligned. It will be scaled + * back during the access in the inner loop. */ + "shr $2, " INPUT_REG "\n\t" + + "mov " OUTPUT_ARG ", " OUTPUT_REG "\n\t" + + "mov " COUNT_ARG ", " TMP_L_REG "\n\t" + "lea (" OUTPUT_REG "," TMP_REG ",4), " OUTPUT_END_REG "\n\t" + + "movaps " __ASM_NAME("one") ADDR_SUFFIX ", " ONE_REG "\n\t" + + ".p2align 4,,10\n\t" + ".p2align 3\n\t" + "1:\n\t" + /* Calculate idx. */ + "mov " IPOS_NUM_REG ", " TMP_L_REG "\n\t" + "shr $(32 - " EXPAND_STR(FIR_STEP_SHIFT) "), " TMP_REG "\n\t" + "shl $(" EXPAND_STR(FIR_WIDTH_SHIFT) " + 2), " TMP_REG "\n\t" + /* Calculate the FIR address base. */ + "lea " __ASM_NAME("fir") ADDR_SUFFIX ", " FIR_REG "\n\t" + "add " TMP_REG ", " FIR_REG "\n\t" + + /* Calculate rem. */ + "movups " ONE_REG ", " REM_REG "\n\t" + "subps " REM_INV_REG ", " REM_REG "\n\t" + + /* Initialize j. */ + "xor " TMP_REG ", " TMP_REG "\n\t" + /* Initialize the sums. */ + "xorps " SUMS_REG ", " SUMS_REG "\n\t" + + ".p2align 4,,10\n\t" + ".p2align 3\n\t" + "2:\n\t" + /* Load the FIR coefficients. */ + "movaps (" FIR_REG "," TMP_REG "), " FTMP0_REG "\n\t" + "movaps " EXPAND_STR(FIR_WIDTH) " * 4(" FIR_REG "," TMP_REG "), " FTMP1_REG "\n\t" + /* Load the input values. */ + "movups (" TMP_REG "," INPUT_REG ",4), " FTMP2_REG "\n\t" + "add $16, " TMP_REG "\n\t" + /* Interpolate the FIR coefficients. 
*/ + "mulps " REM_INV_REG ", " FTMP0_REG "\n\t" + "mulps " REM_REG ", " FTMP1_REG "\n\t" + "addps " FTMP0_REG ", " FTMP1_REG "\n\t" + /* Multiply the input values by the interpolated coefficients. */ + "mulps " FTMP2_REG ", " FTMP1_REG "\n\t" + /* Accumulate the results. */ + "addps " FTMP1_REG ", " SUMS_REG "\n\t" + "cmp $(" EXPAND_STR(FIR_WIDTH) " * 4), " TMP_REG "\n\t" + "jl 2b\n\t" + + /* Update rem_inv. */ + "addps " REM_INV_STEP_REG ", " REM_INV_REG "\n\t" + "movups " ONE_REG ", " FTMP0_REG "\n\t" + "cmpleps " REM_INV_REG ", " FTMP0_REG "\n\t" + "andps " ONE_REG ", " FTMP0_REG "\n\t" + "subps " FTMP0_REG ", " REM_INV_REG "\n\t" + + /* Update ipos_num. Use subtraction for the lower half as it is stored + * inverted. */ + "sub " IPOS_NUM_STEP_REG ", " IPOS_NUM_REG "\n\t" + "adc $0, " INPUT_REG "\n\t" + + /* Add the even-numbered sums to the odd-numbered ones. */ + "movups " SUMS_REG ", " FTMP0_REG "\n\t" + "shufps $0x31, " FTMP0_REG ", " FTMP0_REG "\n\t" + "addps " FTMP0_REG ", " SUMS_REG "\n\t" + /* Calculate the final sum and store it to the output array. */ + "movhlps " SUMS_REG ", " FTMP0_REG "\n\t" + "addss " FTMP0_REG ", " SUMS_REG "\n\t" + "movss " SUMS_REG ", (" OUTPUT_REG ")\n\t" + + /* Advance the output pointer. 
*/ + "add $4, " OUTPUT_REG "\n\t" + "cmp " OUTPUT_END_REG ", " OUTPUT_REG "\n\t" + "jl 1b\n\t" + + CLEANUP + "ret") + +#undef IPOS_NUM_ARG +#undef IPOS_NUM_STEP_ARG +#undef REM_INV_ARG +#undef REM_INV_STEP_ARG +#undef COUNT_ARG +#undef INPUT_ARG +#undef OUTPUT_ARG +#undef IPOS_NUM_REG +#undef IPOS_NUM_STEP_REG +#undef INPUT_REG +#undef OUTPUT_REG +#undef OUTPUT_END_REG +#undef FIR_REG +#undef TMP_L_REG +#undef TMP_REG +#undef REM_INV_REG +#undef REM_INV_STEP_REG +#undef ONE_REG +#undef REM_REG +#undef SUMS_REG +#undef FTMP0_REG +#undef FTMP1_REG +#undef FTMP2_REG + +#endif + static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, float *input, float *output) { @@ -366,8 +628,19 @@ static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, fl float rem_inv = FIXED_0_32_TO_FLOAT((DWORD)ipos_num << FIR_STEP_SHIFT); float rem_inv_step = FIXED_0_32_TO_FLOAT(ipos_num_step << FIR_STEP_SHIFT); + +#if defined(__x86_64__) && !defined(__arm64ec__) + upsample_sse((DWORD)ipos_num, ipos_num_step, rem_inv, rem_inv_step, count, input, output); +#else UINT i; +#ifdef __i386__ + if (sse_supported) { + upsample_sse((DWORD)ipos_num, ipos_num_step, rem_inv, rem_inv_step, count, input, output); + return; + } +#endif + for(i = 0; i < count; ++i) { UINT ipos = ipos_num >> FREQ_ADJUST_SHIFT; UINT idx = ~(DWORD)ipos_num >> (FREQ_ADJUST_SHIFT - FIR_STEP_SHIFT) << FIR_WIDTH_SHIFT; @@ -386,8 +659,13 @@ static void upsample(DWORD freq_adjust_num, DWORD freq_acc_start, UINT count, fl ipos_num += ipos_num_step; } +#endif } +#undef INIT +#undef CLEANUP +#undef ADDR_SUFFIX + /** * Note that this function will overwrite up to FIR_WIDTH - 1 frames before and * after output[]. -- GitLab https://gitlab.winehq.org/wine/wine/-/merge_requests/10716
From: Anton Baskanov <baskanov@gmail.com> --- dlls/dsound/mixer.c | 196 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 196 insertions(+) diff --git a/dlls/dsound/mixer.c b/dlls/dsound/mixer.c index 78a1ef2bb33..702f84fc08f 100644 --- a/dlls/dsound/mixer.c +++ b/dlls/dsound/mixer.c @@ -369,6 +369,188 @@ static inline float get_current_sample(const IDirectSoundBufferImpl *dsb, #endif +#if defined(__i386__) || (defined(__x86_64__) && !defined(__arm64ec__)) + +#ifdef __i386__ + +#define OPOS_NUM_ARG "0x14(%esp)" +#define OPOS_NUM_STEP_ARG "0x18(%esp)" +#define REM_ARG "0x1c(%esp)" +#define REM_STEP_ARG "0x20(%esp)" +#define FIRGAIN_ARG "0x24(%esp)" +#define REQUIRED_INPUT_ARG "0x28(%esp)" +#define INPUT_ARG "0x2c(%esp)" +#define OUTPUT_ARG "0x30(%esp)" + +#define OPOS_NUM_REG "%ecx" +#define OPOS_NUM_STEP_REG "%edx" +#define INPUT_REG "%esi" +#define OUTPUT_REG "%edi" +#define INPUT_END_REG "%ebp" +#define FIR_REG "%ebx" +#define TMP_L_REG "%eax" +#define TMP_REG "%eax" + +#else + +#define OPOS_NUM_ARG "%ecx" +#define OPOS_NUM_STEP_ARG "%edx" +#define REM_ARG "%xmm2" +#define REM_STEP_ARG "%xmm3" +#define FIRGAIN_ARG "0x70(%rsp)" +#define REQUIRED_INPUT_ARG "0x78(%rsp)" +#define INPUT_ARG "0x80(%rsp)" +#define OUTPUT_ARG "0x88(%rsp)" + +#define OPOS_NUM_REG "%ecx" +#define OPOS_NUM_STEP_REG "%edx" +#define INPUT_REG "%rsi" +#define OUTPUT_REG "%rdi" +#define INPUT_END_REG "%rbp" +#define FIR_REG "%rbx" +#define TMP_L_REG "%eax" +#define TMP_REG "%rax" + +#endif + +#define REM_REG "%xmm2" +#define REM_STEP_REG "%xmm3" +#define FIRGAIN_REG "%xmm0" +#define ONE_REG "%xmm1" +#define INPUT_VALUE0_REG "%xmm4" +#define INPUT_VALUE1_REG "%xmm5" +#define FTMP0_REG "%xmm6" +#define FTMP1_REG "%xmm7" + +/** + * Note that this function will overwrite up to FIR_WIDTH - 1 frames before and + * after output[]. 
+ */ +void downsample_sse(DWORD opos_num, DWORD opos_num_step, float rem, float rem_step, float firgain, + UINT required_input, float *input, float *output); +__ASM_GLOBAL_FUNC( downsample_sse, + INIT + + "mov " OPOS_NUM_ARG ", " OPOS_NUM_REG "\n\t" + /* Store the lower half of opos_num inverted so that we don't have to + * invert it on every iteration of the outer loop. */ + "not " OPOS_NUM_REG "\n\t" + "mov " OPOS_NUM_STEP_ARG ", " OPOS_NUM_STEP_REG "\n\t" + + "movss " REM_ARG ", " REM_REG "\n\t" + "shufps $0, " REM_REG ", " REM_REG "\n\t" + "movss " REM_STEP_ARG ", " REM_STEP_REG "\n\t" + "shufps $0, " REM_STEP_REG ", " REM_STEP_REG "\n\t" + + "movss " FIRGAIN_ARG ", " FIRGAIN_REG "\n\t" + + "mov " INPUT_ARG ", " INPUT_REG "\n\t" + + /* Combine the upper half of opos_num and the output pointer into a + * single value. */ + "mov " OUTPUT_ARG ", " OUTPUT_REG "\n\t" + /* Divide the output pointer by 4 to match the scale. We can do this + * because the pointer is at least 4-byte aligned. It will be scaled + * back during the access in the inner loop. */ + "shr $2, " OUTPUT_REG "\n\t" + /* Subtract FIR_WIDTH so that we don't have to do this on every + * iteration of the outer loop. */ + "sub $" EXPAND_STR(FIR_WIDTH) ", " OUTPUT_REG "\n\t" + + "mov " REQUIRED_INPUT_ARG ", " TMP_L_REG "\n\t" + "lea (" INPUT_REG "," TMP_REG ",4), " INPUT_END_REG "\n\t" + + "movaps " __ASM_NAME("one") ADDR_SUFFIX ", " ONE_REG "\n\t" + + ".p2align 4,,10\n\t" + ".p2align 3\n\t" + "1:\n\t" + /* Calculate idx. */ + "mov " OPOS_NUM_REG ", " TMP_L_REG "\n\t" + "shr $(32 - " EXPAND_STR(FIR_STEP_SHIFT) "), " TMP_REG "\n\t" + "shl $(" EXPAND_STR(FIR_WIDTH_SHIFT) " + 2), " TMP_REG "\n\t" + /* Calculate the FIR address base. */ + "lea " __ASM_NAME("fir") ADDR_SUFFIX ", " FIR_REG "\n\t" + "add " TMP_REG ", " FIR_REG "\n\t" + + /* Calculate input_value. 
*/ + "movss (" INPUT_REG "), " INPUT_VALUE1_REG "\n\t" + "mulss " FIRGAIN_REG ", " INPUT_VALUE1_REG "\n\t" + "shufps $0, " INPUT_VALUE1_REG ", " INPUT_VALUE1_REG "\n\t" + "movups " INPUT_VALUE1_REG ", " INPUT_VALUE0_REG "\n\t" + /* Calculate input_value1. */ + "mulps " REM_REG ", " INPUT_VALUE1_REG "\n\t" + /* Calculate input_value0. */ + "subps " INPUT_VALUE1_REG ", " INPUT_VALUE0_REG "\n\t" + + /* Initialize i. */ + "xor " TMP_REG ", " TMP_REG "\n\t" + + ".p2align 4,,10\n\t" + ".p2align 3\n\t" + "2:\n\t" + /* Load the FIR coefficients. */ + "movaps (" FIR_REG "," TMP_REG "), " FTMP0_REG "\n\t" + "movaps " EXPAND_STR(FIR_WIDTH) " * 4(" FIR_REG "," TMP_REG "), " FTMP1_REG "\n\t" + /* Calculate the weighted sums. */ + "mulps " INPUT_VALUE0_REG ", " FTMP0_REG "\n\t" + "mulps " INPUT_VALUE1_REG ", " FTMP1_REG "\n\t" + "addps " FTMP0_REG ", " FTMP1_REG "\n\t" + /* Add the sums to the output. */ + "movups (" TMP_REG "," OUTPUT_REG ",4), " FTMP0_REG "\n\t" + "addps " FTMP1_REG ", " FTMP0_REG "\n\t" + "movups " FTMP0_REG ", (" TMP_REG "," OUTPUT_REG ",4)\n\t" + "add $16, " TMP_REG "\n\t" + "cmp $(" EXPAND_STR(FIR_WIDTH) " * 4), " TMP_REG "\n\t" + "jl 2b\n\t" + + /* Update rem. */ + "addps " REM_STEP_REG ", " REM_REG "\n\t" + "movups " ONE_REG ", " FTMP0_REG "\n\t" + "cmpleps " REM_REG ", " FTMP0_REG "\n\t" + "andps " ONE_REG ", " FTMP0_REG "\n\t" + "subps " FTMP0_REG ", " REM_REG "\n\t" + + /* Update opos_num. Use subtraction for the lower half as it is stored + * inverted. */ + "sub " OPOS_NUM_STEP_REG ", " OPOS_NUM_REG "\n\t" + "adc $0, " OUTPUT_REG "\n\t" + + /* Advance the input pointer. 
*/ + "add $4, " INPUT_REG "\n\t" + "cmp " INPUT_END_REG ", " INPUT_REG "\n\t" + "jl 1b\n\t" + + CLEANUP + "ret" ) + +#undef OPOS_NUM_ARG +#undef OPOS_NUM_STEP_ARG +#undef REM_ARG +#undef REM_STEP_ARG +#undef FIRGAIN_ARG +#undef REQUIRED_INPUT_ARG +#undef INPUT_ARG +#undef OUTPUT_ARG +#undef OPOS_NUM_REG +#undef OPOS_NUM_STEP_REG +#undef INPUT_REG +#undef OUTPUT_REG +#undef INPUT_END_REG +#undef FIR_REG +#undef TMP_L_REG +#undef TMP_REG +#undef REM_REG +#undef REM_STEP_REG +#undef FIRGAIN_REG +#undef ONE_REG +#undef INPUT_VALUE0_REG +#undef INPUT_VALUE1_REG +#undef FTMP0_REG +#undef FTMP1_REG + +#endif + /** * Note that this function will overwrite up to FIR_WIDTH - 1 frames before and * after output[]. @@ -402,8 +584,21 @@ static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgai * remain cleared. */ float rem = FIXED_0_32_TO_FLOAT(((DWORD)opos_num ^ (DWORD)opos_num_mask) << FIR_STEP_SHIFT); float rem_step = FIXED_0_32_TO_FLOAT(-opos_num_step << FIR_STEP_SHIFT); + +#if defined(__x86_64__) && !defined(__arm64ec__) + downsample_sse((DWORD)opos_num, opos_num_step, rem, rem_step, firgain, required_input, input, + output + (opos_num >> FREQ_ADJUST_SHIFT)); +#else int j; +#ifdef __i386__ + if (sse_supported) { + downsample_sse((DWORD)opos_num, opos_num_step, rem, rem_step, firgain, required_input, + input, output + (opos_num >> FREQ_ADJUST_SHIFT)); + return; + } +#endif + for (j = 0; j < required_input; ++j) { /* opos is in the range [-(fir_width - 1), count) */ int opos = (int)(opos_num >> FREQ_ADJUST_SHIFT) - FIR_WIDTH; @@ -422,6 +617,7 @@ static void downsample(DWORD freq_adjust_den, DWORD freq_acc_start, float firgai opos_num += opos_num_step; } +#endif } #if defined(__i386__) || (defined(__x86_64__) && !defined(__arm64ec__)) -- GitLab https://gitlab.winehq.org/wine/wine/-/merge_requests/10716
Matteo Bruni (@Mystral) commented about dlls/dsound/mixer.c:
+ * back during the access in the inner loop. */ + "shr $2, " INPUT_REG "\n\t" + + "mov " OUTPUT_ARG ", " OUTPUT_REG "\n\t" + + "mov " COUNT_ARG ", " TMP_L_REG "\n\t" + "lea (" OUTPUT_REG "," TMP_REG ",4), " OUTPUT_END_REG "\n\t" + + "movaps " __ASM_NAME("one") ADDR_SUFFIX ", " ONE_REG "\n\t" + + ".p2align 4,,10\n\t" + ".p2align 3\n\t" + "1:\n\t" + /* Calculate idx. */ + "mov " IPOS_NUM_REG ", " TMP_L_REG "\n\t" + "shr $(32 - " EXPAND_STR(FIR_STEP_SHIFT) "), " TMP_REG "\n\t" That 32 would be `FREQ_ADJUST_SHIFT`, right?
-- https://gitlab.winehq.org/wine/wine/-/merge_requests/10716#note_137932
Matteo Bruni (@Mystral) commented about dlls/dsound/mixer.c:
+ "shr $2, " INPUT_REG "\n\t" + + "mov " OUTPUT_ARG ", " OUTPUT_REG "\n\t" + + "mov " COUNT_ARG ", " TMP_L_REG "\n\t" + "lea (" OUTPUT_REG "," TMP_REG ",4), " OUTPUT_END_REG "\n\t" + + "movaps " __ASM_NAME("one") ADDR_SUFFIX ", " ONE_REG "\n\t" + + ".p2align 4,,10\n\t" + ".p2align 3\n\t" + "1:\n\t" + /* Calculate idx. */ + "mov " IPOS_NUM_REG ", " TMP_L_REG "\n\t" + "shr $(32 - " EXPAND_STR(FIR_STEP_SHIFT) "), " TMP_REG "\n\t" + "shl $(" EXPAND_STR(FIR_WIDTH_SHIFT) " + 2), " TMP_REG "\n\t" This '2' is the same scale mentioned above I assume.
-- https://gitlab.winehq.org/wine/wine/-/merge_requests/10716#note_137934
Matteo Bruni (@Mystral) commented about dlls/dsound/mixer.c:
+ * invert it on every iteration of the outer loop. */ + "not " IPOS_NUM_REG "\n\t" + "mov " IPOS_NUM_STEP_ARG ", " IPOS_NUM_STEP_REG "\n\t" + + "movss " REM_INV_ARG ", " REM_INV_REG "\n\t" + "shufps $0, " REM_INV_REG ", " REM_INV_REG "\n\t" + "movss " REM_INV_STEP_ARG ", " REM_INV_STEP_REG "\n\t" + "shufps $0, " REM_INV_STEP_REG ", " REM_INV_STEP_REG "\n\t" + + /* Combine the upper half of ipos_num and the input pointer into a + * single value. */ + "mov " INPUT_ARG ", " INPUT_REG "\n\t" + /* Divide the input pointer by 4 to match the scale. We can do this + * because the pointer is at least 4-byte aligned. It will be scaled + * back during the access in the inner loop. */ + "shr $2, " INPUT_REG "\n\t" "Scale" in the sense of `sizeof(float)` I gather?
-- https://gitlab.winehq.org/wine/wine/-/merge_requests/10716#note_137933
Matteo Bruni (@Mystral) commented about dlls/dsound/mixer.c:
float rem_inv = FIXED_0_32_TO_FLOAT((DWORD)ipos_num << FIR_STEP_SHIFT); float rem_inv_step = FIXED_0_32_TO_FLOAT(ipos_num_step << FIR_STEP_SHIFT); + +#if defined(__x86_64__) && !defined(__arm64ec__) + upsample_sse((DWORD)ipos_num, ipos_num_step, rem_inv, rem_inv_step, count, input, output); +#else UINT i;
+#ifdef __i386__ + if (sse_supported) { + upsample_sse((DWORD)ipos_num, ipos_num_step, rem_inv, rem_inv_step, count, input, output); + return; + } +#endif +
It might be worthwhile to consider having a superfluous `sse_supported` and `if (sse_supported)` in the 64-bit case if it simplifies the code. -- https://gitlab.winehq.org/wine/wine/-/merge_requests/10716#note_137935
I don't feel I have enough experience with maintaining larger assembly code chunks to be comfortable with asking for specific suggestions, or giving a confident review / approval here. I'll stick to some general comments of uncertain value. I'm not sure that going to those lengths to have a single asm version for both 32- and 64-bit x86 helps more than it hurts. Those defines make it work, but don't look super nice; e.g. the first line in the asm for 64-bit ends up being `mov %ecx, %ecx`. Just splitting the two versions apart might help. Otherwise using compiler intrinsics could be an option. I guess it would leave some performance on the table, by handing register allocation to the compiler, but maybe not that much? If I had to write it myself now I'd probably write it in the style of https://gitlab.winehq.org/wine/wine/-/merge_requests/9588/diffs?commit_id=2a... but with some comments. FWIW the SSE version of upsample seems to be generally doing what it's supposed to. I haven't looked into it at the level of detail I'd like to. -- https://gitlab.winehq.org/wine/wine/-/merge_requests/10716#note_137936
participants (3)
-
Anton Baskanov -
Anton Baskanov (@baskanov) -
Matteo Bruni (@Mystral)