Used by Diablo IV's screen reader.
-- v9: sapi/tts: Support XML-related flags in ISpVoice::Speak. sapi/tests: Add some SSML tests in tts.
From: Shaun Ren <sren@codeweavers.com>
This simplifies checking fragment field values. --- dlls/sapi/tests/tts.c | 79 ++++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 38 deletions(-)
diff --git a/dlls/sapi/tests/tts.c b/dlls/sapi/tests/tts.c index 38c69e6144a..f523052d451 100644 --- a/dlls/sapi/tests/tts.c +++ b/dlls/sapi/tests/tts.c @@ -151,60 +151,67 @@ struct test_engine BOOL speak_called; DWORD flags; GUID fmtid; - SPVTEXTFRAG *frag_list; + SPVTEXTFRAG *frags; + size_t frag_count; LONG rate; USHORT volume; };
-static void copy_frag_list(const SPVTEXTFRAG *frag_list, SPVTEXTFRAG **ret_frag_list) +/* Copy frag_list into a contiguous array allocated by a single malloc(). + * The texts are allocated at the end of the array. */ +static void copy_frag_list(const SPVTEXTFRAG *frag_list, SPVTEXTFRAG **ret_frags, size_t *frag_count) { - SPVTEXTFRAG *frag, *prev = NULL; + const SPVTEXTFRAG *frag; + SPVTEXTFRAG *cur; + WCHAR *cur_text; + size_t size = 0; + + *frag_count = 0;
if (!frag_list) { - *ret_frag_list = NULL; + *ret_frags = NULL; return; }
- while (frag_list) + for (frag = frag_list; frag; frag = frag->pNext) { - frag = malloc(sizeof(*frag) + frag_list->ulTextLen * sizeof(WCHAR)); - memcpy(frag, frag_list, sizeof(*frag)); + size += sizeof(*frag) + (frag->ulTextLen + 1) * sizeof(WCHAR); + (*frag_count)++; + }
- if (frag_list->pTextStart) - { - frag->pTextStart = (WCHAR *)(frag + 1); - memcpy(frag + 1, frag_list->pTextStart, frag->ulTextLen * sizeof(WCHAR)); - } + *ret_frags = malloc(size); + cur = *ret_frags; + cur_text = (WCHAR *)(*ret_frags + (*frag_count));
- frag->pNext = NULL; + for (frag = frag_list; frag; frag = frag->pNext, ++cur) + { + memcpy(cur, frag, sizeof(*frag));
- if (prev) - prev->pNext = frag; - else - *ret_frag_list = frag; + cur->pNext = frag->pNext ? cur + 1 : NULL;
- prev = frag; - frag_list = frag_list->pNext; + if (frag->pTextStart) + { + memcpy(cur_text, frag->pTextStart, frag->ulTextLen * sizeof(WCHAR)); + cur_text[frag->ulTextLen] = L'\0'; + + cur->pTextStart = (WCHAR *)cur_text; + cur_text += frag->ulTextLen + 1; + } } }
static void reset_engine_params(struct test_engine *engine) { - SPVTEXTFRAG *frag, *next; - engine->speak_called = FALSE; engine->flags = 0xdeadbeef; memset(&engine->fmtid, 0xde, sizeof(engine->fmtid)); engine->rate = 0xdeadbeef; engine->volume = 0xbeef;
- for (frag = engine->frag_list; frag; frag = next) - { - next = frag->pNext; - free(frag); - } - engine->frag_list = NULL; + free(engine->frags); + engine->frags = NULL; + engine->frag_count = 0; }
static inline struct test_engine *impl_from_ISpTTSEngine(ISpTTSEngine *iface) @@ -257,7 +264,7 @@ static HRESULT WINAPI test_engine_Speak(ISpTTSEngine *iface, DWORD flags, REFGUI
engine->flags = flags; engine->fmtid = *fmtid; - copy_frag_list(frag_list, &engine->frag_list); + copy_frag_list(frag_list, &engine->frags, &engine->frag_count); engine->speak_called = TRUE;
actions = ISpTTSEngineSite_GetActions(site); @@ -637,11 +644,9 @@ static void test_spvoice(void) ok(hr == S_OK, "got %#lx.\n", hr); ok(test_engine.speak_called, "ISpTTSEngine::Speak was not called.\n"); ok(test_engine.flags == SPF_DEFAULT, "got %#lx.\n", test_engine.flags); - ok(test_engine.frag_list != NULL, "frag_list is NULL.\n"); - ok(test_engine.frag_list->pNext == NULL, "frag_list->pNext != NULL.\n"); - ok(test_engine.frag_list->ulTextLen == wcslen(test_text), "got %lu.\n", test_engine.frag_list->ulTextLen); - ok(!wcsncmp(test_text, test_engine.frag_list->pTextStart, wcslen(test_text)), - "got %s.\n", wine_dbgstr_w(test_engine.frag_list->pTextStart)); + ok(test_engine.frag_count == 1, "got %Iu.\n", test_engine.frag_count); + ok(!wcscmp(test_engine.frags[0].pTextStart, test_text), + "got %s.\n", wine_dbgstr_w(test_engine.frags[0].pTextStart)); ok(test_engine.rate == 0, "got %ld.\n", test_engine.rate); ok(test_engine.volume == 100, "got %d.\n", test_engine.volume); ok(stream_num == 1, "got %lu.\n", stream_num); @@ -677,11 +682,9 @@ static void test_spvoice(void)
ok(test_engine.speak_called, "ISpTTSEngine::Speak was not called.\n"); ok(test_engine.flags == SPF_NLP_SPEAK_PUNC, "got %#lx.\n", test_engine.flags); - ok(test_engine.frag_list != NULL, "frag_list is NULL.\n"); - ok(test_engine.frag_list->pNext == NULL, "frag_list->pNext != NULL.\n"); - ok(test_engine.frag_list->ulTextLen == wcslen(test_text), "got %lu.\n", test_engine.frag_list->ulTextLen); - ok(!wcsncmp(test_text, test_engine.frag_list->pTextStart, wcslen(test_text)), - "got %s.\n", wine_dbgstr_w(test_engine.frag_list->pTextStart)); + ok(test_engine.frag_count == 1, "got %Iu.\n", test_engine.frag_count); + ok(!wcscmp(test_engine.frags[0].pTextStart, test_text), + "got %s.\n", wine_dbgstr_w(test_engine.frags[0].pTextStart)); ok(test_engine.rate == 0, "got %ld.\n", test_engine.rate); ok(test_engine.volume == 100, "got %d.\n", test_engine.volume);
From: Shaun Ren <sren@codeweavers.com>
--- dlls/sapi/tests/tts.c | 8 ++++++++ 1 file changed, 8 insertions(+)
diff --git a/dlls/sapi/tests/tts.c b/dlls/sapi/tests/tts.c index f523052d451..7b1e49b1b73 100644 --- a/dlls/sapi/tests/tts.c +++ b/dlls/sapi/tests/tts.c @@ -148,6 +148,7 @@ struct test_engine
ISpObjectToken *token;
+ BOOL simulate_output; BOOL speak_called; DWORD flags; GUID fmtid; @@ -203,6 +204,7 @@ static void copy_frag_list(const SPVTEXTFRAG *frag_list, SPVTEXTFRAG **ret_frags
static void reset_engine_params(struct test_engine *engine) { + engine->simulate_output = FALSE; engine->speak_called = FALSE; engine->flags = 0xdeadbeef; memset(&engine->fmtid, 0xde, sizeof(engine->fmtid)); @@ -280,6 +282,9 @@ static HRESULT WINAPI test_engine_Speak(ISpTTSEngine *iface, DWORD flags, REFGUI actions = ISpTTSEngineSite_GetActions(site); ok(actions == SPVES_CONTINUE, "got %#lx.\n", actions);
+ if (!engine->simulate_output) + return S_OK; + buf = calloc(1, 22050 * 2 / 5); for (i = 0; i < 5; i++) { @@ -637,6 +642,7 @@ static void test_spvoice(void) ISpVoice_SetVolume(voice, 100);
reset_engine_params(&test_engine); + test_engine.simulate_output = TRUE; stream_num = 0xdeadbeef; start = GetTickCount(); hr = ISpVoice_Speak(voice, test_text, SPF_DEFAULT, &stream_num); @@ -664,6 +670,7 @@ static void test_spvoice(void) ok(duration < 200, "took %lu ms.\n", duration);
reset_engine_params(&test_engine); + test_engine.simulate_output = TRUE; stream_num = 0xdeadbeef; start = GetTickCount(); hr = ISpVoice_Speak(voice, test_text, SPF_DEFAULT | SPF_ASYNC | SPF_NLP_SPEAK_PUNC, &stream_num); @@ -689,6 +696,7 @@ static void test_spvoice(void) ok(test_engine.volume == 100, "got %d.\n", test_engine.volume);
reset_engine_params(&test_engine); + test_engine.simulate_output = TRUE; hr = ISpVoice_Speak(voice, test_text, SPF_DEFAULT | SPF_ASYNC, NULL); ok(hr == S_OK, "got %#lx.\n", hr);
From: Shaun Ren <sren@codeweavers.com>
--- dlls/sapi/tests/tts.c | 380 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 372 insertions(+), 8 deletions(-)
diff --git a/dlls/sapi/tests/tts.c b/dlls/sapi/tests/tts.c index 7b1e49b1b73..1a63c6986cf 100644 --- a/dlls/sapi/tests/tts.c +++ b/dlls/sapi/tests/tts.c @@ -426,9 +426,24 @@ static const IClassFactoryVtbl ClassFactoryVtbl = {
static IClassFactory test_engine_cf = { &ClassFactoryVtbl };
+static const WCHAR test_token_id[] = L"HKEY_LOCAL_MACHINE\Software\Microsoft\Speech\Voices\Tokens\WinetestVoice"; + +static BOOL test_token_created = FALSE; + +#define check_frag_text(i, exp) \ + ok(!wcscmp(test_engine.frags[i].pTextStart, exp), "frag %d text: got %s.\n", \ + i, wine_dbgstr_w(test_engine.frags[i].pTextStart)) + +#define check_frag_text_src_offset(i, exp) \ + ok(test_engine.frags[i].ulTextSrcOffset == exp, "frag %d text src offset: got %lu.\n", \ + i, test_engine.frags[i].ulTextSrcOffset) + +#define check_frag_state_field(i, name, exp, fmt) \ + ok(test_engine.frags[i].State.name == exp, "frag %d state " #name ": got " fmt ".\n", \ + i, test_engine.frags[i].State.name) + static void test_spvoice(void) { - static const WCHAR test_token_id[] = L"HKEY_LOCAL_MACHINE\Software\Microsoft\Speech\Voices\Tokens\WinetestVoice"; static const WCHAR test_text[] = L"Hello! This is a test sentence."; static const WCHAR *get_voices = L"GetVoices";
@@ -462,8 +477,6 @@ static void test_spvoice(void) return; }
- RegDeleteTreeA(HKEY_LOCAL_MACHINE, "Software\Microsoft\Speech\Voices\WinetestVoice"); - check_apttype(); ok(test_apt_data.type == APTTYPE_UNITIALIZED, "got apt type %d.\n", test_apt_data.type);
@@ -616,6 +629,8 @@ static void test_spvoice(void) ISpDataKey_SetStringValue(attrs_key, L"Vendor", L"Winetest"); ISpDataKey_Release(attrs_key);
+ test_token_created = TRUE; + hr = ISpVoice_SetVoice(voice, token); ok(hr == S_OK, "got %#lx.\n", hr);
@@ -651,8 +666,7 @@ static void test_spvoice(void) ok(test_engine.speak_called, "ISpTTSEngine::Speak was not called.\n"); ok(test_engine.flags == SPF_DEFAULT, "got %#lx.\n", test_engine.flags); ok(test_engine.frag_count == 1, "got %Iu.\n", test_engine.frag_count); - ok(!wcscmp(test_engine.frags[0].pTextStart, test_text), - "got %s.\n", wine_dbgstr_w(test_engine.frags[0].pTextStart)); + check_frag_text(0, test_text); ok(test_engine.rate == 0, "got %ld.\n", test_engine.rate); ok(test_engine.volume == 100, "got %d.\n", test_engine.volume); ok(stream_num == 1, "got %lu.\n", stream_num); @@ -690,8 +704,8 @@ static void test_spvoice(void) ok(test_engine.speak_called, "ISpTTSEngine::Speak was not called.\n"); ok(test_engine.flags == SPF_NLP_SPEAK_PUNC, "got %#lx.\n", test_engine.flags); ok(test_engine.frag_count == 1, "got %Iu.\n", test_engine.frag_count); - ok(!wcscmp(test_engine.frags[0].pTextStart, test_text), - "got %s.\n", wine_dbgstr_w(test_engine.frags[0].pTextStart)); + check_frag_text(0, test_text); + check_frag_text_src_offset(0, 0); ok(test_engine.rate == 0, "got %ld.\n", test_engine.rate); ok(test_engine.volume == 100, "got %d.\n", test_engine.volume);
@@ -811,15 +825,365 @@ done: ISpMMSysAudio_Release(audio_out); SysFreeString(req); SysFreeString(opt); +}
- RegDeleteTreeA(HKEY_LOCAL_MACHINE, "Software\Microsoft\Speech\Voices\WinetestVoice"); +static void test_spvoice_ssml(void) +{ + static const WCHAR text1[] = + L"<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-us'>text1</speak>"; + + /* Only version 1.0 is supported in SAPI. */ + static const WCHAR bad_text1[] = + L"<speak version='1.1' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-us'>text1</speak>"; + + /* version attribute is required in <speak>. */ + static const WCHAR bad_text2[] = + L"<speak xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-us'>text1</speak>"; + + /* xml:lang attribute is required in <speak>. */ + static const WCHAR bad_text3[] = + L"<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis'>text1</speak>"; + + /* xmlns is not required in <speak>. */ + static const WCHAR text2[] = + L"<speak version='1.0' xml:lang='en-US'>text2</speak>"; + + static const WCHAR text3[] = + L"<?xml version='1.0' encoding='utf-8'?>" + L"<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-us'>\n" + L"P1S1. P1S2.\n" + L"\n" + L"P2." + L"<p>P3.</p>" + L"<p><s>P4, S1. 
P4S2.</s><s>P4S3.</s></p>" + L"<p>\u4F0D</p>" + L"<p>\U0001240B</p>" /* Two WCHARs needed for \U0001240B */ + L"<p>P7.</p>" + L"</speak>"; + + static const WCHAR text4[] = + L"<?xml version='1.0' encoding='utf-16'?>" + L"<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-us'>" + L"<s>One, <prosody rate='-85%'>two.</prosody></s>" + L"</speak>"; + + static const WCHAR text5[] = + L"<?xml version=\"1.0\" encoding=\"utf-8\"?>" + L"<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis%5C" xml:lang="en-us">" + L"<prosody rate="50%">50%.</prosody>" + L"<prosody rate='+50%'>+50%.</prosody>" + L"<prosody rate='6'>6.</prosody>" + L"<prosody rate='0.01000001'>0.01000001.</prosody>" + L"<prosody rate='0.01'>0.01.</prosody>" + L"<prosody rate='0'>0.</prosody>" + L"<prosody rate='-1.0'>-1.0.</prosody>" + L"<prosody rate='6'><prosody rate='3'>3.</prosody></prosody>" + L"</speak>"; + + static const WCHAR text6[] = + L"<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-us'>" + L"<prosody rate='x-slow'>x-slow.</prosody>" + L"<prosody rate='slow'>slow.</prosody>" + L"<prosody rate='medium'>medium.</prosody>" + L"<prosody rate='fast'>fast.</prosody>" + L"<prosody rate='x-fast'>x-fast.</prosody>" + L"</speak>"; + + static const WCHAR text7[] = + L"<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-us'>" + L"One,<prosody rate='x-fast'/> Two." /* Empty tags are ignored. */ + L"<prosody rate='fast'>" + L" Three.<prosody rate='x-fast'>Four.</prosody>" + L"</prosody>" + L"Five." 
+ L"</speak>"; + + static const WCHAR text8[] = + L"<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-us'>" + L"<prosody volume='50%'>50%.</prosody>" + L"<prosody volume='-50%'>-50%.</prosody>" + L"<prosody volume='10'>10.</prosody>" + L"<prosody volume='+10'>+10.</prosody>" + L"<prosody volume='-10.1' rate='+200%'>-10.1.</prosody>" + L"<prosody volume='-50%'><prosody volume='-50%'>25.</prosody></prosody>" + L"<prosody volume='-50%'><prosody volume='50%'>75.</prosody></prosody>" + L"<prosody volume='-50%'><prosody volume='+50'>100.</prosody></prosody>" + L"<prosody volume='-50%'><prosody volume='50'>50.</prosody></prosody>" + L"</speak>"; + + static const WCHAR text9[] = + L"<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-us'>" + L"<prosody volume='silent'>silent.</prosody>" + L"<prosody volume='x-soft'>x-soft.</prosody>" + L"<prosody volume='soft'>soft.</prosody>" + L"<prosody volume='medium'>medium.</prosody>" + L"<prosody volume='loud'>loud.</prosody>" + L"<prosody volume='x-loud'>x-loud.</prosody>" + L"<prosody volume='loud'><prosody volume='soft'>soft.</prosody></prosody>" + L"</speak>"; + + + ISpVoice *voice; + ISpObjectToken *token; + HRESULT hr; + + if (waveOutGetNumDevs() == 0) { + skip("no wave out devices.\n"); + return; + } + + if (!test_token_created) { + /* w1064_adm */ + win_skip("Test token not created.\n"); + return; + } + + hr = CoCreateInstance(&CLSID_SpVoice, NULL, CLSCTX_INPROC_SERVER, + &IID_ISpVoice, (void **)&voice); + ok(hr == S_OK, "got %#lx.\n", hr); + + hr = ISpVoice_SetOutput(voice, NULL, TRUE); + ok(hr == S_OK, "got %#lx.\n", hr); + + hr = CoCreateInstance(&CLSID_SpObjectToken, NULL, CLSCTX_INPROC_SERVER, + &IID_ISpObjectToken, (void **)&token); + ok(hr == S_OK, "got %#lx.\n", hr); + + hr = ISpObjectToken_SetId(token, NULL, test_token_id, FALSE); + ok(hr == S_OK, "got %#lx.\n", hr); + + hr = ISpVoice_SetVoice(voice, token); + ok(hr == S_OK, "got %#lx.\n", hr); + + 
reset_engine_params(&test_engine); + + hr = ISpVoice_Speak(voice, text1, SPF_IS_XML | SPF_PARSE_SSML, NULL); + todo_wine ok(hr == S_OK, "got %#lx.\n", hr); + todo_wine ok(test_engine.frag_count == 1, "got %Iu.\n", test_engine.frag_count); + + if (test_engine.frag_count == 1) { + check_frag_text(0, L"text1"); + + check_frag_state_field(0, eAction, SPVA_Speak, "%d"); + ok(test_engine.frags[0].State.LangID == 0x409 || broken(test_engine.frags[0].State.LangID == 0) /* win7 */, + "got %#hx.\n", test_engine.frags[0].State.LangID); + check_frag_state_field(0, EmphAdj, 0, "%ld"); + check_frag_state_field(0, RateAdj, 0, "%ld"); + check_frag_state_field(0, Volume, 100, "%lu"); + check_frag_state_field(0, PitchAdj.MiddleAdj, 0, "%ld"); + check_frag_state_field(0, PitchAdj.RangeAdj, 0, "%ld"); + check_frag_state_field(0, SilenceMSecs, 0, "%lu"); + check_frag_state_field(0, ePartOfSpeech, SPPS_Unknown, "%#x"); + } + + reset_engine_params(&test_engine); + + /* SSML autodetection when SPF_PARSE_SSML is not specified. */ + hr = ISpVoice_Speak(voice, text1, SPF_IS_XML, NULL); + todo_wine ok(hr == S_OK, "got %#lx.\n", hr); + todo_wine ok(test_engine.frag_count == 1, "got %Iu.\n", test_engine.frag_count); + + if (test_engine.frag_count == 1) + check_frag_text(0, L"text1"); + + reset_engine_params(&test_engine); + + /* XML and SSML autodetection when SPF_IS_XML is not specified. 
*/ + hr = ISpVoice_Speak(voice, text1, SPF_DEFAULT, NULL); + ok(hr == S_OK, "got %#lx.\n", hr); + ok(test_engine.frag_count == 1, "got %Iu.\n", test_engine.frag_count); + todo_wine check_frag_text(0, L"text1"); + + reset_engine_params(&test_engine); + + hr = ISpVoice_Speak(voice, bad_text1, SPF_IS_XML | SPF_PARSE_SSML, NULL); + todo_wine ok(hr == SPERR_UNSUPPORTED_FORMAT, "got %#lx.\n", hr); + + hr = ISpVoice_Speak(voice, bad_text2, SPF_IS_XML | SPF_PARSE_SSML, NULL); + todo_wine ok(hr == SPERR_UNSUPPORTED_FORMAT, "got %#lx.\n", hr); + + hr = ISpVoice_Speak(voice, bad_text3, SPF_IS_XML | SPF_PARSE_SSML, NULL); + todo_wine ok(hr == SPERR_UNSUPPORTED_FORMAT || broken(hr == S_OK) /* win7 */, "got %#lx.\n", hr); + + reset_engine_params(&test_engine); + + hr = ISpVoice_Speak(voice, text2, SPF_IS_XML | SPF_PARSE_SSML, NULL); + todo_wine ok(hr == S_OK || broken(hr == SPERR_UNSUPPORTED_FORMAT) /* win7 */, "got %#lx.\n", hr); + + if (hr == S_OK) { + ok(test_engine.frag_count == 1, "got %Iu.\n", test_engine.frag_count); + check_frag_text(0, L"text2"); + check_frag_state_field(0, eAction, SPVA_Speak, "%d"); + check_frag_state_field(0, LangID, 0x409, "%#hx"); + } + + reset_engine_params(&test_engine); + + hr = ISpVoice_Speak(voice, text3, SPF_IS_XML, NULL); + todo_wine ok(hr == S_OK, "got %#lx.\n", hr); + todo_wine ok(test_engine.frag_count == 7 || broken(test_engine.frag_count == 1) /* win7 */, + "got %Iu.\n", test_engine.frag_count); + + if (test_engine.frag_count == 7) { + check_frag_text(0, L"\nP1S1. P1S2.\n\nP2."); + check_frag_text_src_offset(0, 120); + check_frag_state_field(0, eAction, SPVA_Speak, "%d"); + + check_frag_text(1, L"P3."); + check_frag_text_src_offset(1, 140); + check_frag_state_field(1, eAction, SPVA_Speak, "%d"); + + check_frag_text(2, L"P4, S1. 
P4S2."); + check_frag_text_src_offset(2, 153); + check_frag_state_field(2, eAction, SPVA_Speak, "%d"); + + check_frag_text(3, L"P4S3."); + check_frag_text_src_offset(3, 173); + check_frag_state_field(3, eAction, SPVA_Speak, "%d"); + + check_frag_text(4, L"\u4F0D"); + check_frag_text_src_offset(4, 189); + check_frag_state_field(4, eAction, SPVA_Speak, "%d"); + + check_frag_text(5, L"\U0001240B"); + ok(test_engine.frags[5].ulTextSrcOffset == 197 || /* 189 + 8 = 197 */ + broken(test_engine.frags[5].ulTextSrcOffset == 196), /* Windows gives incorrect offset here */ + "got %lu.\n", test_engine.frags[5].ulTextSrcOffset); + + check_frag_text(6, L"P7."); + check_frag_text_src_offset(6, test_engine.frags[5].ulTextSrcOffset + 9); + } + + reset_engine_params(&test_engine); + + hr = ISpVoice_Speak(voice, text4, SPF_DEFAULT, NULL); + ok(hr == S_OK, "got %#lx.\n", hr); + todo_wine ok(test_engine.frag_count == 2, "got %Iu.\n", test_engine.frag_count); + + if (test_engine.frag_count == 2) { + check_frag_text(0, L"One, "); + check_frag_state_field(0, eAction, SPVA_Speak, "%d"); + check_frag_state_field(0, RateAdj, 0, "%ld"); + + check_frag_text(1, L"two."); + check_frag_state_field(1, eAction, SPVA_Speak, "%d"); + check_frag_state_field(1, RateAdj, -17, "%ld"); /* 3^(-17/10) ~= 0.15 */ + } + + reset_engine_params(&test_engine); + + hr = ISpVoice_Speak(voice, text5, SPF_IS_XML | SPF_PARSE_SSML, NULL); + todo_wine ok(hr == S_OK, "got %#lx.\n", hr); + todo_wine ok(test_engine.frag_count == 8 || broken(test_engine.frag_count == 3) /* win7 */, + "got %Iu.\n", test_engine.frag_count); + + if (test_engine.frag_count == 8) { + check_frag_state_field(0, RateAdj, 4, "%ld"); /* 3^(4/10) ~= 1.5 */ + check_frag_state_field(1, RateAdj, 4, "%ld"); /* 3^(4/10) ~= 1.5 */ + check_frag_state_field(2, RateAdj, 16, "%ld"); /* 3^(16/10) ~= 6 */ + check_frag_state_field(3, RateAdj, -42, "%ld"); /* 3^(-42/10) ~= 0.01000001 */ + check_frag_state_field(4, RateAdj, -10, "%ld"); /* rate = 0.01 */ + 
check_frag_state_field(5, RateAdj, -10, "%ld"); /* rate = 0 */ + check_frag_state_field(6, RateAdj, 0, "%ld"); /* negative rates are ignored */ + check_frag_state_field(7, RateAdj, 10, "%ld"); /* 3^(10/10) = 3 */ + } + + reset_engine_params(&test_engine); + + hr = ISpVoice_Speak(voice, text6, SPF_IS_XML | SPF_PARSE_SSML, NULL); + todo_wine ok(hr == S_OK || broken(hr == SPERR_UNSUPPORTED_FORMAT) /* win7 */, "got %#lx.\n", hr); + + if (hr == S_OK) { + ok(test_engine.frag_count == 5, "got %Iu.\n", test_engine.frag_count); + + check_frag_state_field(0, RateAdj, -9, "%ld"); /* x-slow */ + check_frag_state_field(1, RateAdj, -4, "%ld"); /* slow */ + check_frag_state_field(2, RateAdj, 0, "%ld"); /* medium */ + check_frag_state_field(3, RateAdj, 4, "%ld"); /* fast */ + check_frag_state_field(4, RateAdj, 9, "%ld"); /* x-fast */ + } + + reset_engine_params(&test_engine); + + hr = ISpVoice_Speak(voice, text7, SPF_IS_XML | SPF_PARSE_SSML, NULL); + todo_wine ok(hr == S_OK || broken(hr == SPERR_UNSUPPORTED_FORMAT) /* win7 */, "got %#lx.\n", hr); + + if (hr == S_OK) { + ok(test_engine.frag_count == 5, "got %Iu.\n", test_engine.frag_count); + + check_frag_text(0, L"One,"); + check_frag_state_field(0, RateAdj, 0, "%ld"); + + check_frag_text(1, L" Two."); + check_frag_state_field(1, RateAdj, 0, "%ld"); + + check_frag_text(2, L" Three."); + check_frag_state_field(2, RateAdj, 4, "%ld"); + + check_frag_text(3, L"Four."); + check_frag_state_field(3, RateAdj, 9, "%ld"); + + check_frag_text(4, L"Five."); + check_frag_state_field(4, RateAdj, 0, "%ld"); + } + + reset_engine_params(&test_engine); + + hr = ISpVoice_Speak(voice, text8, SPF_IS_XML | SPF_PARSE_SSML, NULL); + todo_wine ok(hr == S_OK, "got %#lx.\n", hr); + + if (hr == S_OK) { + ok(test_engine.frag_count == 9, "got %Iu.\n", test_engine.frag_count); + + ok(test_engine.frags[0].State.Volume == 100 || broken(test_engine.frags[0].State.Volume == 50) /* win7 */, + "got %lu.\n", test_engine.frags[0].State.Volume); + 
check_frag_state_field(1, Volume, 50, "%ld"); + check_frag_state_field(2, Volume, 10, "%lu"); + check_frag_state_field(3, Volume, 100, "%lu"); + + check_frag_state_field(4, Volume, 90, "%lu"); + check_frag_state_field(4, RateAdj, 10, "%ld"); + + check_frag_state_field(5, Volume, 25, "%lu"); + ok(test_engine.frags[6].State.Volume == 75 || broken(test_engine.frags[6].State.Volume == 25) /* win7 */, + "got %lu.\n", test_engine.frags[6].State.Volume); + check_frag_state_field(7, Volume, 100, "%lu"); + check_frag_state_field(8, Volume, 50, "%lu"); + } + + reset_engine_params(&test_engine); + + hr = ISpVoice_Speak(voice, text9, SPF_IS_XML | SPF_PARSE_SSML, NULL); + todo_wine ok(hr == S_OK || broken(hr == SPERR_UNSUPPORTED_FORMAT) /* win7 */, "got %#lx.\n", hr); + + if (hr == S_OK) { + ok(test_engine.frag_count == 7, "got %Iu.\n", test_engine.frag_count); + + check_frag_state_field(0, Volume, 0, "%lu"); /* silent */ + check_frag_state_field(1, Volume, 20, "%lu"); /* x-soft */ + check_frag_state_field(2, Volume, 40, "%lu"); /* soft */ + check_frag_state_field(3, Volume, 60, "%lu"); /* medium */ + check_frag_state_field(4, Volume, 80, "%lu"); /* loud */ + check_frag_state_field(5, Volume, 100, "%lu"); /* x-loud */ + + check_frag_state_field(6, Volume, 40, "%lu"); /* soft */ + } + + reset_engine_params(&test_engine); + ISpVoice_Release(voice); + ISpObjectToken_Release(token); }
START_TEST(tts) { CoInitialize(NULL); + RegDeleteTreeA(HKEY_LOCAL_MACHINE, "Software\Microsoft\Speech\Voices\WinetestVoice"); + /* Run spvoice tests before interface tests so that a MTA won't be created before this test is run. */ test_spvoice(); + test_spvoice_ssml(); test_interfaces(); + + RegDeleteTreeA(HKEY_LOCAL_MACHINE, "Software\Microsoft\Speech\Voices\WinetestVoice"); CoUninitialize(); }
From: Shaun Ren <sren@codeweavers.com>
--- dlls/sapi/Makefile.in | 3 +- dlls/sapi/sapi_private.h | 9 ++++ dlls/sapi/tests/tts.c | 9 ++-- dlls/sapi/tts.c | 103 +++++++++++++++++++++++++++++---------- dlls/sapi/xml.c | 37 ++++++++++++++ 5 files changed, 129 insertions(+), 32 deletions(-) create mode 100644 dlls/sapi/xml.c
diff --git a/dlls/sapi/Makefile.in b/dlls/sapi/Makefile.in index a6c4d86037e..da91eca5b8c 100644 --- a/dlls/sapi/Makefile.in +++ b/dlls/sapi/Makefile.in @@ -14,4 +14,5 @@ SOURCES = \ sapi_typelib.idl \ stream.c \ token.c \ - tts.c + tts.c \ + xml.c diff --git a/dlls/sapi/sapi_private.h b/dlls/sapi/sapi_private.h index 219436f88c1..9abd98093f2 100644 --- a/dlls/sapi/sapi_private.h +++ b/dlls/sapi/sapi_private.h @@ -18,6 +18,8 @@ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA */
+#include "sapiddk.h" + #include "wine/list.h"
struct async_task @@ -63,3 +65,10 @@ enum type_id
HRESULT get_typeinfo( enum type_id tid, ITypeInfo **typeinfo ); void release_typelib( void ); + +HRESULT parse_sapi_xml( const WCHAR *contents, DWORD parse_flag, SPVTEXTFRAG **frag_list ); + +static inline BOOL isxmlspace( WCHAR c ) +{ + return c == ' ' || c == '\r' || c == '\n' || c == '\t'; +} diff --git a/dlls/sapi/tests/tts.c b/dlls/sapi/tests/tts.c index 1a63c6986cf..49d312fed70 100644 --- a/dlls/sapi/tests/tts.c +++ b/dlls/sapi/tests/tts.c @@ -990,9 +990,10 @@ static void test_spvoice_ssml(void)
/* XML and SSML autodetection when SPF_IS_XML is not specified. */ hr = ISpVoice_Speak(voice, text1, SPF_DEFAULT, NULL); - ok(hr == S_OK, "got %#lx.\n", hr); - ok(test_engine.frag_count == 1, "got %Iu.\n", test_engine.frag_count); - todo_wine check_frag_text(0, L"text1"); + todo_wine ok(hr == S_OK, "got %#lx.\n", hr); + todo_wine ok(test_engine.frag_count == 1, "got %Iu.\n", test_engine.frag_count); + if (test_engine.frag_count == 1) + check_frag_text(0, L"text1");
reset_engine_params(&test_engine);
@@ -1057,7 +1058,7 @@ static void test_spvoice_ssml(void) reset_engine_params(&test_engine);
hr = ISpVoice_Speak(voice, text4, SPF_DEFAULT, NULL); - ok(hr == S_OK, "got %#lx.\n", hr); + todo_wine ok(hr == S_OK, "got %#lx.\n", hr); todo_wine ok(test_engine.frag_count == 2, "got %Iu.\n", test_engine.frag_count);
if (test_engine.frag_count == 2) { diff --git a/dlls/sapi/tts.c b/dlls/sapi/tts.c index 80f0298b51c..6765241eec9 100644 --- a/dlls/sapi/tts.c +++ b/dlls/sapi/tts.c @@ -811,6 +811,18 @@ struct speak_task DWORD flags; };
+static void free_frag_list(SPVTEXTFRAG *frag) +{ + SPVTEXTFRAG *next; + + while (frag) + { + next = frag->pNext; + free(frag); + frag = next; + } +} + static HRESULT set_output_format(ISpStreamFormat *output, ISpTTSEngine *engine, GUID *fmtid, WAVEFORMATEX **wfx) { GUID output_fmtid; @@ -894,7 +906,7 @@ done: } CoTaskMemFree(wfx); ISpTTSEngine_Release(speak_task->engine); - free(speak_task->frag_list); + free_frag_list(speak_task->frag_list); ISpTTSEngineSite_Release(speak_task->site);
if (speak_task->result) @@ -911,23 +923,50 @@ static HRESULT WINAPI spvoice_Speak(ISpVoice *iface, const WCHAR *contents, DWOR struct speech_voice *This = impl_from_ISpVoice(iface); ISpTTSEngineSite *site = NULL; ISpTTSEngine *engine = NULL; - SPVTEXTFRAG *frag; + SPVTEXTFRAG *frag_list; + BOOL async, purge; + DWORD parse_flag, nlp_flags; + BOOL xml; struct speak_task *speak_task = NULL; struct async_result *result = NULL; - size_t contents_len, contents_size; + size_t contents_len; ULONG stream_num; HRESULT hr;
TRACE("(%p, %p, %#lx, %p).\n", iface, contents, flags, stream_num_out);
- flags &= ~SPF_IS_NOT_XML; - if (flags & ~(SPF_ASYNC | SPF_PURGEBEFORESPEAK | SPF_NLP_SPEAK_PUNC)) + async = flags & SPF_ASYNC; + purge = flags & SPF_PURGEBEFORESPEAK; + parse_flag = flags & SPF_PARSE_MASK; + nlp_flags = flags & SPF_NLP_MASK; + + xml = FALSE; + if ((flags & SPF_IS_XML) && (flags & SPF_IS_NOT_XML)) + return E_INVALIDARG; + else if (flags & SPF_IS_XML) + xml = TRUE; + else if (!(flags & SPF_IS_NOT_XML)) + { + if (contents) + { + const WCHAR *c = contents; + + while (*c && isxmlspace(*c)) c++; + xml = *c == '<'; + } + } + + if (parse_flag == SPF_PARSE_MASK) + return E_INVALIDARG; + + flags &= ~(SPF_ASYNC | SPF_PURGEBEFORESPEAK | SPF_IS_XML | SPF_IS_NOT_XML | SPF_PARSE_MASK | SPF_NLP_MASK); + if (flags) { - FIXME("flags %#lx not implemented.\n", flags & ~(SPF_ASYNC | SPF_PURGEBEFORESPEAK | SPF_NLP_SPEAK_PUNC)); + FIXME("flags %#lx not implemented.\n", flags); return E_NOTIMPL; }
- if (flags & SPF_PURGEBEFORESPEAK) + if (purge) { ISpAudio *audio;
@@ -954,9 +993,6 @@ static HRESULT WINAPI spvoice_Speak(ISpVoice *iface, const WCHAR *contents, DWOR else if (!contents) return E_POINTER;
- contents_len = wcslen(contents); - contents_size = sizeof(WCHAR) * (contents_len + 1); - if (!This->output) { /* Create a new output stream with the default output. */ @@ -964,6 +1000,28 @@ static HRESULT WINAPI spvoice_Speak(ISpVoice *iface, const WCHAR *contents, DWOR return hr; }
+ if (xml) + { + if (FAILED(hr = parse_sapi_xml(contents, parse_flag, &frag_list))) + return hr; + } + else + { + contents_len = wcslen(contents); + /* calloc, not malloc: only some SPVTEXTFRAG fields are assigned below; the remaining State members must be zero rather than indeterminate. */ + if (!(frag_list = calloc(1, sizeof(*frag_list) + (contents_len + 1) * sizeof(WCHAR)))) + return E_OUTOFMEMORY; + + memcpy(frag_list + 1, contents, (contents_len + 1) * sizeof(WCHAR)); + + frag_list->pNext = NULL; + frag_list->State.eAction = SPVA_Speak; + frag_list->State.Volume = 100; + frag_list->pTextStart = (WCHAR *)(frag_list + 1); + frag_list->ulTextLen = contents_len; + frag_list->ulTextSrcOffset = 0; + } + EnterCriticalSection(&This->cs);
if (!This->engine_token) @@ -972,31 +1030,22 @@ static HRESULT WINAPI spvoice_Speak(ISpVoice *iface, const WCHAR *contents, DWOR if (FAILED(hr = ISpVoice_SetVoice(iface, NULL))) { LeaveCriticalSection(&This->cs); - return hr; + goto fail; } } + if (!This->engine && FAILED(hr = ISpObjectToken_CreateInstance(This->engine_token, NULL, CLSCTX_ALL, &IID_ISpTTSEngine, (void **)&This->engine))) { LeaveCriticalSection(&This->cs); ERR("Failed to create engine: %#lx.\n", hr); - return hr; + goto fail; } engine = This->engine; ISpTTSEngine_AddRef(engine);
LeaveCriticalSection(&This->cs);
- if (!(frag = malloc(sizeof(*frag) + contents_size))) - return E_OUTOFMEMORY; - memset(frag, 0, sizeof(*frag)); - memcpy(frag + 1, contents, contents_size); - frag->State.eAction = SPVA_Speak; - frag->State.Volume = 100; - frag->pTextStart = (WCHAR *)(frag + 1); - frag->ulTextLen = contents_len; - frag->ulTextSrcOffset = 0; - stream_num = InterlockedIncrement(&This->cur_stream_num); if (FAILED(hr = ttsenginesite_create(This, stream_num, &site))) { @@ -1010,11 +1059,11 @@ static HRESULT WINAPI spvoice_Speak(ISpVoice *iface, const WCHAR *contents, DWOR speak_task->result = NULL; speak_task->voice = This; speak_task->engine = engine; - speak_task->frag_list = frag; + speak_task->frag_list = frag_list; speak_task->site = site; - speak_task->flags = flags & SPF_NLP_SPEAK_PUNC; + speak_task->flags = nlp_flags;
- if (!(flags & SPF_ASYNC)) + if (!async) { if (!(result = malloc(sizeof(*result)))) { @@ -1035,7 +1084,7 @@ static HRESULT WINAPI spvoice_Speak(ISpVoice *iface, const WCHAR *contents, DWOR if (stream_num_out) *stream_num_out = stream_num;
- if (flags & SPF_ASYNC) + if (async) return S_OK; else { @@ -1049,7 +1098,7 @@ static HRESULT WINAPI spvoice_Speak(ISpVoice *iface, const WCHAR *contents, DWOR fail: if (site) ISpTTSEngineSite_Release(site); if (engine) ISpTTSEngine_Release(engine); - free(frag); + free_frag_list(frag_list); free(speak_task); if (result) { diff --git a/dlls/sapi/xml.c b/dlls/sapi/xml.c new file mode 100644 index 00000000000..eca8f9810bb --- /dev/null +++ b/dlls/sapi/xml.c @@ -0,0 +1,37 @@ +/* + * Speech API (SAPI) XML parser implementation. + * + * Copyright 2025 Shaun Ren for CodeWeavers + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA + */ + +#define COBJMACROS + +#include "objbase.h" + +#include "sapiddk.h" + +#include "wine/debug.h" + +#include "sapi_private.h" + +WINE_DEFAULT_DEBUG_CHANNEL(sapi); + +HRESULT parse_sapi_xml(const WCHAR *contents, DWORD parse_flag, SPVTEXTFRAG **frag_list) +{ + FIXME("(%p, %#lx, %p): stub.\n", contents, parse_flag, frag_list); + return E_NOTIMPL; +}