[PATCH 3/4] sapi/tts: Implement TTS engine audio output resampler.

15 Jul 2025

From: Shaun Ren <sren@codeweavers.com>
This implementation uses the audio resampler DSP from Media Foundation.
---
 dlls/sapi/Makefile.in |   4 +-
 dlls/sapi/tests/tts.c |   8 +-
 dlls/sapi/tts.c       | 183 +++++++++++++++++++++++++++++++++++++++---
 3 files changed, 178 insertions(+), 17 deletions(-)

diff --git a/dlls/sapi/Makefile.in b/dlls/sapi/Makefile.in
index da91eca5b8c..8286ff8adc0 100644
--- a/dlls/sapi/Makefile.in
+++ b/dlls/sapi/Makefile.in
@@ -1,6 +1,6 @@
 MODULE    = sapi.dll
-IMPORTS   = uuid ole32 oleaut32 user32 advapi32
-DELAYIMPORTS = winmm
+IMPORTS   = uuid ole32 oleaut32 user32 advapi32 mfuuid wmcodecdspuuid
+DELAYIMPORTS = winmm mfplat
SOURCES = \
    async.c \
diff --git a/dlls/sapi/tests/tts.c b/dlls/sapi/tests/tts.c
index 4d3f32dc36b..c89bffae5a0 100644
--- a/dlls/sapi/tests/tts.c
+++ b/dlls/sapi/tests/tts.c
@@ -805,8 +805,8 @@ static void test_spvoice(void)
     test_engine.output_len = wave_len;
hr = ISpVoice_Speak(voice, test_text, SPF_DEFAULT, NULL);
-    todo_wine ok(hr == S_OK, "got %#lx.\n", hr);
-    todo_wine ok(test_engine.speak_called, "ISpTTSEngine::Speak was not called.\n");
+    ok(hr == S_OK, "got %#lx.\n", hr);
+    ok(test_engine.speak_called, "ISpTTSEngine::Speak was not called.\n");
hr = ISpVoice_SetOutput(voice, NULL, TRUE);
     ok(hr == S_OK, "got %#lx.\n", hr);
@@ -816,8 +816,8 @@ static void test_spvoice(void)
hr = IStream_Stat(mem_stream, &statstg, STATFLAG_DEFAULT);
     ok(hr == S_OK, "got %#lx.\n", hr);
-    todo_wine ok(fabs((double)statstg.cbSize.QuadPart / wave_len - 1) < 0.02,
-            "got %I64u, expected %Iu (+/-2%%).\n", statstg.cbSize.QuadPart, wave_len);
+    ok(fabs((double)statstg.cbSize.QuadPart / wave_len - 1) < 0.02,
+            "got %I64u, expected %Iu.\n", statstg.cbSize.QuadPart, wave_len);
if (statstg.cbSize.QuadPart > 0) {
         size_t check_len = min((size_t)statstg.cbSize.QuadPart, wave_len) / sizeof(int16_t);
diff --git a/dlls/sapi/tts.c b/dlls/sapi/tts.c
index baf7a74789d..54537d927be 100644
--- a/dlls/sapi/tts.c
+++ b/dlls/sapi/tts.c
@@ -26,6 +26,11 @@
 #include "winbase.h"
 #include "objbase.h"
+#include "mfapi.h"
+#include "mferror.h"
+#include "mftransform.h"
+#include "wmcodecdsp.h"
+
 #include "sapiddk.h"
 #include "sperror.h"
@@ -43,6 +48,7 @@ struct speech_voice
     LONG ref;
ISpStreamFormat *output;
+    IMFTransform *resampler;
     ISpObjectToken *engine_token;
     ISpTTSEngine *engine;
     LONG cur_stream_num;
@@ -76,6 +82,9 @@ struct tts_engine_site
struct speech_voice *voice;
     ULONG stream_num;
+    BOOL use_resampler;
+    IMFSample *out_sample;
+    IMFMediaBuffer *out_buf;
 };
static inline struct tts_engine_site *impl_from_ISpTTSEngineSite(ISpTTSEngineSite *iface)
@@ -83,6 +92,23 @@ static inline struct tts_engine_site *impl_from_ISpTTSEngineSite(ISpTTSEngineSit
     return CONTAINING_RECORD(iface, struct tts_engine_site, ISpTTSEngineSite_iface);
 }
+/* Format a wave format description for debug traces.
+ *
+ * Returns "(null)" for a NULL pointer. WAVE_FORMAT_EXTENSIBLE formats also
+ * print the sub-format GUID and the channel mask. */
+static const char *debugstr_wfx(const WAVEFORMATEX *wfx)
+{
+    const WAVEFORMATEXTENSIBLE *ext;
+
+    if (!wfx)
+        return "(null)";
+
+    /* Plain WAVEFORMATEX: only the base fields are meaningful. */
+    if (wfx->wFormatTag != WAVE_FORMAT_EXTENSIBLE)
+        return wine_dbg_sprintf("tag: %#x, ch: %u, rate: %lu, avgbps: %lu, align: %u, depth: %u",
+                                wfx->wFormatTag, wfx->nChannels, wfx->nSamplesPerSec,
+                                wfx->nAvgBytesPerSec, wfx->nBlockAlign, wfx->wBitsPerSample);
+
+    ext = (const WAVEFORMATEXTENSIBLE *)wfx;
+    return wine_dbg_sprintf(
+            "tag: %#x (%s), ch: %u (mask: %#lx), rate: %lu, avgbps: %lu, align: %u, depth: %u",
+            wfx->wFormatTag, debugstr_guid(&ext->SubFormat), wfx->nChannels, ext->dwChannelMask,
+            wfx->nSamplesPerSec, wfx->nAvgBytesPerSec, wfx->nBlockAlign, wfx->wBitsPerSample);
+}
+
 static HRESULT create_token_category(const WCHAR *cat_id, ISpObjectTokenCategory **cat)
 {
     HRESULT hr;
@@ -170,6 +196,7 @@ static ULONG WINAPI speech_voice_Release(ISpeechVoice *iface)
     {
         async_cancel_queue(&This->queue);
         if (This->output) ISpStreamFormat_Release(This->output);
+        if (This->resampler) IMFTransform_Release(This->resampler);
         if (This->engine_token) ISpObjectToken_Release(This->engine_token);
         if (This->engine) ISpTTSEngine_Release(This->engine);
         DeleteCriticalSection(&This->cs);
@@ -824,16 +851,68 @@ static void free_frag_list(SPVTEXTFRAG *frag)
     }
 }
-static HRESULT set_output_format(ISpStreamFormat *output, ISpTTSEngine *engine, GUID *fmtid, WAVEFORMATEX **wfx)
+/* Prepare voice->resampler to convert from in_wfx to out_wfx.
+ *
+ * The resampler object is created on first use and kept on the voice. Its
+ * input/output media types are only (re)set when the current type cannot be
+ * queried or differs from the requested one.
+ *
+ * Returns S_OK on success, or the first failing HRESULT. */
+static HRESULT setup_resampler(struct speech_voice *voice, const WAVEFORMATEX *in_wfx,
+        const WAVEFORMATEX *out_wfx)
+{
+    IMFMediaType *wanted_in = NULL, *wanted_out = NULL;
+    IMFMediaType *active_in = NULL, *active_out = NULL;
+    DWORD match_flags;
+    BOOL need_set;
+    HRESULT hr;
+
+    TRACE("Resampling TTS engine output\n");
+    TRACE("  in_wfx:  %s\n", debugstr_wfx(in_wfx));
+    TRACE("to\n");
+    TRACE("  out_wfx: %s\n", debugstr_wfx(out_wfx));
+
+    if (!voice->resampler)
+    {
+        hr = CoCreateInstance(&CLSID_CResamplerMediaObject, NULL, CLSCTX_INPROC_SERVER,
+                              &IID_IMFTransform, (void **)&voice->resampler);
+        if (FAILED(hr))
+        {
+            ERR("Failed to create CResamplerMediaObject: %#lx.\n", hr);
+            return hr;
+        }
+    }
+
+    /* Build the requested input and output media types from the wave formats. */
+    hr = MFCreateMediaType(&wanted_in);
+    if (SUCCEEDED(hr))
+        hr = MFInitMediaTypeFromWaveFormatEx(wanted_in, in_wfx, sizeof(WAVEFORMATEX) + in_wfx->cbSize);
+    if (SUCCEEDED(hr))
+        hr = MFCreateMediaType(&wanted_out);
+    if (SUCCEEDED(hr))
+        hr = MFInitMediaTypeFromWaveFormatEx(wanted_out, out_wfx, sizeof(WAVEFORMATEX) + out_wfx->cbSize);
+    if (FAILED(hr))
+        goto cleanup;
+
+    /* Input side: keep the current type when it already matches. */
+    need_set = FAILED(IMFTransform_GetInputCurrentType(voice->resampler, 0, &active_in));
+    if (!need_set)
+        need_set = IMFMediaType_IsEqual(active_in, wanted_in, &match_flags) != S_OK;
+    if (need_set)
+    {
+        hr = IMFTransform_SetInputType(voice->resampler, 0, wanted_in, 0);
+        if (FAILED(hr))
+            goto cleanup;
+    }
+
+    /* Output side: same check as for the input. */
+    need_set = FAILED(IMFTransform_GetOutputCurrentType(voice->resampler, 0, &active_out));
+    if (!need_set)
+        need_set = IMFMediaType_IsEqual(active_out, wanted_out, &match_flags) != S_OK;
+    if (need_set)
+    {
+        hr = IMFTransform_SetOutputType(voice->resampler, 0, wanted_out, 0);
+        if (FAILED(hr))
+            goto cleanup;
+    }
+
+cleanup:
+    if (wanted_in) IMFMediaType_Release(wanted_in);
+    if (wanted_out) IMFMediaType_Release(wanted_out);
+    if (active_in) IMFMediaType_Release(active_in);
+    if (active_out) IMFMediaType_Release(active_out);
+    return hr;
+}
+
+static HRESULT set_output_format(struct speech_voice *voice, GUID *fmtid, WAVEFORMATEX **engine_wfx,
+                                 BOOL *use_resampler)
 {
     GUID output_fmtid;
     WAVEFORMATEX *output_wfx = NULL;
     ISpAudio *audio = NULL;
     HRESULT hr;
-    if (FAILED(hr = ISpStreamFormat_GetFormat(output, &output_fmtid, &output_wfx)))
+    if (FAILED(hr = ISpStreamFormat_GetFormat(voice->output, &output_fmtid, &output_wfx)))
         return hr;
-    if (FAILED(hr = ISpTTSEngine_GetOutputFormat(engine, &output_fmtid, output_wfx, fmtid, wfx)))
+    if (FAILED(hr = ISpTTSEngine_GetOutputFormat(voice->engine, &output_fmtid, output_wfx, fmtid, engine_wfx)))
         goto done;
     if (!IsEqualGUID(fmtid, &SPDFID_WaveFormatEx))
     {
@@ -841,12 +920,22 @@ static HRESULT set_output_format(ISpStreamFormat *output, ISpTTSEngine *engine,
         goto done;
     }
-    if (memcmp(output_wfx, *wfx, sizeof(WAVEFORMATEX)) ||
-        memcmp(output_wfx + 1, *wfx + 1, output_wfx->cbSize))
+    *use_resampler = FALSE;
+
+    if (memcmp(output_wfx, *engine_wfx, sizeof(WAVEFORMATEX)) ||
+        memcmp(output_wfx + 1, *engine_wfx + 1, output_wfx->cbSize))
     {
-        if (FAILED(hr = ISpStreamFormat_QueryInterface(output, &IID_ISpAudio, (void **)&audio)) ||
-            FAILED(hr = ISpAudio_SetFormat(audio, &SPDFID_WaveFormatEx, *wfx)))
-            goto done;
+        if (SUCCEEDED(ISpStreamFormat_QueryInterface(voice->output, &IID_ISpAudio, (void **)&audio)))
+        {
+            hr = ISpAudio_SetFormat(audio, &SPDFID_WaveFormatEx, *engine_wfx);
+            if (hr == SPERR_UNSUPPORTED_FORMAT)
+                *use_resampler = TRUE;
+        }
+        else
+            *use_resampler = TRUE;
+
+        if (*use_resampler)
+            hr = setup_resampler(voice, *engine_wfx, output_wfx);
     }
done:
@@ -859,6 +948,7 @@ static void speak_proc(struct async_task *task)
 {
     struct speak_task *speak_task = (struct speak_task *)task;
     struct speech_voice *This = speak_task->voice;
+    struct tts_engine_site *site = impl_from_ISpTTSEngineSite(speak_task->site);
     GUID fmtid;
     WAVEFORMATEX *wfx = NULL;
     ISpAudio *audio = NULL;
@@ -875,7 +965,7 @@ static void speak_proc(struct async_task *task)
         goto done;
     }
-    if (FAILED(hr = set_output_format(This->output, This->engine, &fmtid, &wfx)))
+    if (FAILED(hr = set_output_format(This, &fmtid, &wfx, &site->use_resampler)))
     {
         LeaveCriticalSection(&This->cs);
         ERR("failed setting output format: %#lx.\n", hr);
@@ -1356,8 +1446,9 @@ static ULONG WINAPI ttsenginesite_Release(ISpTTSEngineSite *iface)
if (!ref)
     {
-        if (This->voice)
-            ISpeechVoice_Release(&This->voice->ISpeechVoice_iface);
+        if (This->voice) ISpeechVoice_Release(&This->voice->ISpeechVoice_iface);
+        if (This->out_sample) IMFSample_Release(This->out_sample);
+        if (This->out_buf) IMFMediaBuffer_Release(This->out_buf);
         free(This);
     }
@@ -1392,6 +1483,69 @@ static DWORD WINAPI ttsenginesite_GetActions(ISpTTSEngineSite *iface)
     return actions;
 }
+/* Push one chunk of TTS engine output through the voice's resampler and write
+ * everything the resampler produces to the voice's output stream.
+ *
+ * This:       engine site; caches the output sample/buffer between calls.
+ * buf, cb:    engine audio data and its size in bytes.
+ * cb_written: optional; receives cb on success and 0 on failure.
+ *
+ * Returns S_OK on success, including when the resampler only needs more input
+ * before it can produce output; otherwise the first failing HRESULT. */
+static HRESULT resample_engine_output(struct tts_engine_site *This, const void *buf, ULONG cb, ULONG *cb_written)
+{
+    MFT_OUTPUT_DATA_BUFFER mft_buf;
+    IMFMediaBuffer *in_buf = NULL;
+    IMFSample *in_sample = NULL;
+    BYTE *in_data, *out_data;
+    DWORD out_len;
+    DWORD status;
+    HRESULT hr;
+
+    /* Nothing to resample; also avoids memcpy() from a possibly NULL buf. */
+    if (!cb)
+    {
+        if (cb_written)
+            *cb_written = 0;
+        return S_OK;
+    }
+
+    if (FAILED(hr = MFCreateSample(&in_sample)) ||
+        FAILED(hr = MFCreateMemoryBuffer(cb, &in_buf)) ||
+        FAILED(hr = IMFSample_AddBuffer(in_sample, in_buf)))
+        goto done;
+
+    /* The output sample and its 16384-byte buffer are created once and then
+     * reused for every call on this site. */
+    if (!This->out_sample)
+    {
+        if (FAILED(hr = MFCreateSample(&This->out_sample)) ||
+            FAILED(hr = MFCreateMemoryBuffer(16384, &This->out_buf)) ||
+            FAILED(hr = IMFSample_AddBuffer(This->out_sample, This->out_buf)))
+        {
+            /* Do not keep a half-built sample around: a later call would skip
+             * this setup and use a missing or unattached output buffer. */
+            if (This->out_buf) IMFMediaBuffer_Release(This->out_buf);
+            if (This->out_sample) IMFSample_Release(This->out_sample);
+            This->out_buf = NULL;
+            This->out_sample = NULL;
+            goto done;
+        }
+    }
+
+    if (FAILED(hr = IMFMediaBuffer_Lock(in_buf, &in_data, NULL, NULL)))
+        goto done;
+    memcpy(in_data, buf, cb);
+    IMFMediaBuffer_Unlock(in_buf);
+
+    if (FAILED(hr = IMFMediaBuffer_SetCurrentLength(in_buf, cb)))
+        goto done;
+
+    if (FAILED(hr = IMFTransform_ProcessInput(This->voice->resampler, 0, in_sample, 0)))
+        goto done;
+
+    /* Drain the resampler until it asks for more input or something fails. */
+    while (SUCCEEDED(hr))
+    {
+        memset(&mft_buf, 0, sizeof(mft_buf));
+        mft_buf.pSample = This->out_sample;
+
+        if (FAILED(hr = IMFTransform_ProcessOutput(This->voice->resampler, 0, 1, &mft_buf, &status)))
+        {
+            /* Running out of input is the normal way to leave the loop. */
+            if (hr == MF_E_TRANSFORM_NEED_MORE_INPUT)
+                hr = S_OK;
+            break;
+        }
+
+        if (FAILED(hr = IMFMediaBuffer_GetCurrentLength(This->out_buf, &out_len)) ||
+            FAILED(hr = IMFMediaBuffer_Lock(This->out_buf, &out_data, NULL, NULL)))
+            break;
+
+        hr = ISpStreamFormat_Write(This->voice->output, out_data, out_len, NULL);
+        IMFMediaBuffer_Unlock(This->out_buf);
+    }
+
+done:
+    if (in_sample) IMFSample_Release(in_sample);
+    if (in_buf) IMFMediaBuffer_Release(in_buf);
+
+    /* The engine is told that all of its input was consumed, not how many
+     * resampled bytes reached the output stream. */
+    if (cb_written)
+        *cb_written = SUCCEEDED(hr) ? cb : 0;
+
+    return hr;
+}
+
 static HRESULT WINAPI ttsenginesite_Write(ISpTTSEngineSite *iface, const void *buf, ULONG cb, ULONG *cb_written)
 {
     struct tts_engine_site *This = impl_from_ISpTTSEngineSite(iface);
@@ -1401,6 +1555,9 @@ static HRESULT WINAPI ttsenginesite_Write(ISpTTSEngineSite *iface, const void *b
     if (!This->voice->output)
         return SPERR_UNINITIALIZED;
+    if (This->use_resampler)
+        return resample_engine_output(This, buf, cb, cb_written);
+
     return ISpStreamFormat_Write(This->voice->output, buf, cb, cb_written);
 }
@@ -1472,6 +1629,9 @@ static HRESULT ttsenginesite_create(struct speech_voice *voice, ULONG stream_num
     This->ref = 1;
     This->voice = voice;
     This->stream_num = stream_num;
+    This->use_resampler = FALSE;
+    This->out_sample = NULL;
+    This->out_buf = NULL;
ISpeechVoice_AddRef(&This->voice->ISpeechVoice_iface);
@@ -1545,6 +1705,7 @@ HRESULT speech_voice_create(IUnknown *outer, REFIID iid, void **obj)
     This->ref = 1;
This->output = NULL;
+    This->resampler = NULL;
     This->engine_token = NULL;
     This->engine = NULL;
     This->cur_stream_num = 0;
-- 
GitLab


https://gitlab.winehq.org/wine/wine/-/merge_requests/8571

    

2025

2024

2023

2022

[PATCH 3/4] sapi/tts: Implement TTS engine audio output resampler.