From: Bernhard Kölbl besentv@gmail.com
Signed-off-by: Bernhard Kölbl besentv@gmail.com --- dlls/windows.media.speech/Makefile.in | 2 +- dlls/windows.media.speech/private.h | 3 + dlls/windows.media.speech/recognizer.c | 42 ++++++ dlls/windows.media.speech/unixlib.c | 183 ++++++++++++++++++++++++- dlls/windows.media.speech/unixlib.h | 16 +++ 5 files changed, 244 insertions(+), 2 deletions(-)
diff --git a/dlls/windows.media.speech/Makefile.in b/dlls/windows.media.speech/Makefile.in index 64376514d58..455f81c0840 100644 --- a/dlls/windows.media.speech/Makefile.in +++ b/dlls/windows.media.speech/Makefile.in @@ -1,6 +1,6 @@ MODULE = windows.media.speech.dll UNIXLIB = windows.media.speech.so -IMPORTS = combase uuid +IMPORTS = combase uuid user32
C_SRCS = \ async.c \ diff --git a/dlls/windows.media.speech/private.h b/dlls/windows.media.speech/private.h index 2f804fbf1a7..62952478bdf 100644 --- a/dlls/windows.media.speech/private.h +++ b/dlls/windows.media.speech/private.h @@ -31,6 +31,7 @@ #include "windef.h" #include "winbase.h" #include "winstring.h" +#include "winuser.h" #include "objbase.h"
#include "activation.h" @@ -47,6 +48,8 @@
#include "wine/list.h"
+#define SPERR_WINRT_INTERNAL_ERROR 0x800455a0 + /* * * Windows.Media.SpeechRecognition diff --git a/dlls/windows.media.speech/recognizer.c b/dlls/windows.media.speech/recognizer.c index c2f386206b8..06133031d44 100644 --- a/dlls/windows.media.speech/recognizer.c +++ b/dlls/windows.media.speech/recognizer.c @@ -25,6 +25,9 @@
#include "wine/debug.h"
+#include "unixlib.h" +#include "wine/unixlib.h" + WINE_DEFAULT_DEBUG_CHANNEL(speech);
/* @@ -171,6 +174,8 @@ struct session IAudioCaptureClient *capture_client; WAVEFORMATEX capture_wfx;
+ speech_recognizer_handle unix_handle; + HANDLE worker_thread, worker_control_event, audio_buf_event; BOOLEAN worker_running, worker_paused; CRITICAL_SECTION cs; @@ -318,7 +323,9 @@ static ULONG WINAPI session_AddRef( ISpeechContinuousRecognitionSession *iface ) static ULONG WINAPI session_Release( ISpeechContinuousRecognitionSession *iface ) { struct session *impl = impl_from_ISpeechContinuousRecognitionSession(iface); + struct speech_release_recognizer_params release_params; ULONG ref = InterlockedDecrement(&impl->ref); + TRACE("iface %p, ref %lu.\n", iface, ref);
if (!ref) @@ -344,6 +351,9 @@ static ULONG WINAPI session_Release( ISpeechContinuousRecognitionSession *iface impl->cs.DebugInfo->Spare[0] = 0; DeleteCriticalSection(&impl->cs);
+ release_params.handle = impl->unix_handle; + WINE_UNIX_CALL(unix_speech_release_recognizer, &release_params); + IVector_ISpeechRecognitionConstraint_Release(impl->constraints); free(impl); } @@ -1079,6 +1089,35 @@ cleanup: return hr; }
+static HRESULT recognizer_factory_create_vosk_instance( struct session *session ) +{ + struct speech_create_recognizer_params create_params = { 0 }; + WCHAR locale[LOCALE_NAME_MAX_LENGTH]; + NTSTATUS status; + INT len; + + if (!(len = GetUserDefaultLocaleName(locale, LOCALE_NAME_MAX_LENGTH))) + return E_FAIL; + + if (CharLowerBuffW(locale, len) != len) + return E_FAIL; + + if (!WideCharToMultiByte(CP_ACP, 0, locale, len, create_params.locale, ARRAY_SIZE(create_params.locale), NULL, NULL)) + return HRESULT_FROM_WIN32(GetLastError()); + + create_params.sample_rate = (FLOAT)session->capture_wfx.nSamplesPerSec; + + if ((status = WINE_UNIX_CALL(unix_speech_create_recognizer, &create_params))) + { + ERR("Unable to create Vosk instance for locale %s, status %#lx. Speech recognition won't work.\n", debugstr_a(create_params.locale), status); + return SPERR_WINRT_INTERNAL_ERROR; + } + + session->unix_handle = create_params.handle; + + return S_OK; +} + static HRESULT WINAPI recognizer_factory_Create( ISpeechRecognizerFactory *iface, ILanguage *language, ISpeechRecognizer **speechrecognizer ) { struct recognizer *impl; @@ -1125,6 +1164,9 @@ static HRESULT WINAPI recognizer_factory_Create( ISpeechRecognizerFactory *iface if (FAILED(hr = recognizer_factory_create_audio_capture(session))) goto error;
+ if (FAILED(hr = recognizer_factory_create_vosk_instance(session))) + goto error; + InitializeCriticalSection(&session->cs); session->cs.DebugInfo->Spare[0] = (DWORD_PTR)(__FILE__ ": recognition_session.cs");
diff --git a/dlls/windows.media.speech/unixlib.c b/dlls/windows.media.speech/unixlib.c index d6f748b9426..da7fc5ec31b 100644 --- a/dlls/windows.media.speech/unixlib.c +++ b/dlls/windows.media.speech/unixlib.c @@ -26,9 +26,12 @@
#include <stdio.h> #include <stdlib.h> -#include <string.h> #include <stdarg.h> +#include <string.h> +#include <dirent.h> #include <dlfcn.h> +#include <errno.h> +#include <sys/stat.h>
#ifdef SONAME_LIBVOSK #include <vosk_api.h> @@ -94,6 +97,178 @@ static NTSTATUS process_detach( void *args ) return STATUS_SUCCESS; }
+static inline speech_recognizer_handle vosk_recognizer_to_handle( VoskRecognizer *recognizer ) +{ + return (speech_recognizer_handle)(UINT_PTR)recognizer; +} + +static inline VoskRecognizer *vosk_recognizer_from_handle( speech_recognizer_handle handle ) +{ + return (VoskRecognizer *)(UINT_PTR)handle; +} + +static NTSTATUS find_model_by_locale_and_path( const char *path, const char *locale, VoskModel **model ) +{ + static const char *vosk_model_identifier_small = "vosk-model-small-"; + static const char *vosk_model_identifier = "vosk-model-"; + size_t ident_small_len = strlen(vosk_model_identifier_small); + size_t ident_len = strlen(vosk_model_identifier); + char *ent_name, *model_path, *best_match, *delim; + NTSTATUS status = STATUS_UNSUCCESSFUL; + struct dirent *dirent; + size_t path_len, len; + struct stat stat; + DIR *dir; + INT fd; + + TRACE("path %s, locale %s, model %p.\n", path, debugstr_a(locale), model); + + if (!path || !locale || (len = strlen(locale)) < 4) + return STATUS_UNSUCCESSFUL; + + if (!(dir = opendir(path))) + return STATUS_UNSUCCESSFUL; + + if ((fd = dirfd(dir)) == -1) + goto done; + + delim = strchr(locale, '-'); + path_len = strlen(path); + best_match = NULL; + *model = NULL; + + while ((dirent = readdir(dir))) + { + ent_name = dirent->d_name; + + if (!strncmp(ent_name, vosk_model_identifier_small, ident_small_len)) + ent_name += ident_small_len; + else if (!strncmp(ent_name, vosk_model_identifier, ident_len)) + ent_name += ident_len; + else + continue; + + /* First match for lang and region (en-us), then only lang (en). */ + if (strncmp(ent_name, locale, len)) + { + /* + * best_match holds the best matching model folder name, in which a matching lang-region + * combo holds higher precedence over just a matching language. This means if best_match + * is set, we assume either a lang-region or just lang matching folder was already found, + * so we don't try to do another lang match. + */ + if (best_match || strncmp(ent_name, locale, delim - locale)) + continue; + } + + if (fstatat(fd, dirent->d_name, &stat, 0)) + { + WARN("Failed to get stats for entry %s.\n", debugstr_a(dirent->d_name)); + continue; + } + + if (!S_ISDIR(stat.st_mode)) + continue; + + best_match = dirent->d_name; + } + + if (best_match) + { + if (!(model_path = malloc(path_len + 1 /* '/' */ + strlen(best_match) + 1))) + { + status = STATUS_NO_MEMORY; + goto done; + } + + sprintf(model_path, "%s/%s", path, best_match); + + TRACE("Trying to load Vosk model %s.\n", debugstr_a(model_path)); + + *model = p_vosk_model_new(model_path); + free(model_path); + + if (*model) status = STATUS_SUCCESS; + } + +done: + closedir(dir); + + return status; +} + +static NTSTATUS find_model_by_locale( const char *locale, VoskModel **model ) +{ + const char *suffix = NULL; + char *env, *path = NULL; + NTSTATUS status; + + TRACE("locale %s, model %p.\n", debugstr_a(locale), model); + + if (!model) + return STATUS_UNSUCCESSFUL; + + if (!find_model_by_locale_and_path(getenv("VOSK_MODEL_PATH"), locale, model)) + return STATUS_SUCCESS; + if (!find_model_by_locale_and_path("/usr/share/vosk", locale, model)) + return STATUS_SUCCESS; + + if ((env = getenv("XDG_CACHE_HOME"))) + suffix = "/vosk"; + else if ((env = getenv("HOME"))) + suffix = "/.cache/vosk"; + else + return STATUS_UNSUCCESSFUL; + + if (!(path = malloc(strlen(env) + strlen(suffix) + 1))) + return STATUS_NO_MEMORY; + + sprintf(path, "%s%s", env, suffix); + status = find_model_by_locale_and_path(path, locale, model); + free(path); + + return status; +} + +static NTSTATUS speech_create_recognizer( void *args ) +{ + struct speech_create_recognizer_params *params = args; + VoskRecognizer *recognizer = NULL; + VoskModel *model = NULL; + NTSTATUS status = STATUS_SUCCESS; + + TRACE("args %p.\n", args); + + if (!vosk_handle) + return STATUS_NOT_SUPPORTED; + + if ((status = find_model_by_locale(params->locale, &model))) + return status; + + if (!(recognizer = p_vosk_recognizer_new(model, params->sample_rate))) + status = STATUS_UNSUCCESSFUL; + + /* VoskModel is reference-counted. A VoskRecognizer keeps a reference to its model. */ + p_vosk_model_free(model); + + params->handle = vosk_recognizer_to_handle(recognizer); + return status; +} + +static NTSTATUS speech_release_recognizer( void *args ) +{ + struct speech_release_recognizer_params *params = args; + + TRACE("args %p.\n", args); + + if (!vosk_handle) + return STATUS_NOT_SUPPORTED; + + p_vosk_recognizer_free(vosk_recognizer_from_handle(params->handle)); + + return STATUS_SUCCESS; +} + #else /* SONAME_LIBVOSK */
#define MAKE_UNSUPPORTED_FUNC( f ) \ @@ -105,6 +280,8 @@ static NTSTATUS process_detach( void *args )
MAKE_UNSUPPORTED_FUNC(process_attach) MAKE_UNSUPPORTED_FUNC(process_detach) +MAKE_UNSUPPORTED_FUNC(speech_create_recognizer) +MAKE_UNSUPPORTED_FUNC(speech_release_recognizer) #undef MAKE_UNSUPPORTED_FUNC
#endif /* SONAME_LIBVOSK */ @@ -113,10 +290,14 @@ unixlib_entry_t __wine_unix_call_funcs[] = { process_attach, process_detach, + speech_create_recognizer, + speech_release_recognizer, };
unixlib_entry_t __wine_unix_call_wow64_funcs[] = { process_attach, process_detach, + speech_create_recognizer, + speech_release_recognizer, }; diff --git a/dlls/windows.media.speech/unixlib.h b/dlls/windows.media.speech/unixlib.h index 6c337e54511..974e8d5f797 100644 --- a/dlls/windows.media.speech/unixlib.h +++ b/dlls/windows.media.speech/unixlib.h @@ -30,10 +30,26 @@
#include "wine/unixlib.h"
+typedef UINT64 speech_recognizer_handle; + +struct speech_create_recognizer_params +{ + speech_recognizer_handle handle; + CHAR locale[LOCALE_NAME_MAX_LENGTH]; + FLOAT sample_rate; +}; + +struct speech_release_recognizer_params +{ + speech_recognizer_handle handle; +}; + enum unix_funcs { unix_process_attach, unix_process_detach, + unix_speech_create_recognizer, + unix_speech_release_recognizer, };
#endif