[PATCH v20 7/7] windows.media.speech: Implement Vosk create and release functions in the unixlib.

6 Mar 2023

From: Bernhard Kölbl <besentv@gmail.com>
Signed-off-by: Bernhard Kölbl <besentv@gmail.com>
---
 dlls/windows.media.speech/Makefile.in  |   2 +-
 dlls/windows.media.speech/private.h    |   3 +
 dlls/windows.media.speech/recognizer.c |  42 ++++++
 dlls/windows.media.speech/unixlib.c    | 183 ++++++++++++++++++++++++-
 dlls/windows.media.speech/unixlib.h    |  16 +++
 5 files changed, 244 insertions(+), 2 deletions(-)

diff --git a/dlls/windows.media.speech/Makefile.in b/dlls/windows.media.speech/Makefile.in
index 64376514d58..455f81c0840 100644
--- a/dlls/windows.media.speech/Makefile.in
+++ b/dlls/windows.media.speech/Makefile.in
@@ -1,6 +1,6 @@
 MODULE = windows.media.speech.dll
 UNIXLIB = windows.media.speech.so
-IMPORTS = combase uuid
+IMPORTS = combase uuid user32
C_SRCS = \
    async.c \
diff --git a/dlls/windows.media.speech/private.h b/dlls/windows.media.speech/private.h
index 2f804fbf1a7..62952478bdf 100644
--- a/dlls/windows.media.speech/private.h
+++ b/dlls/windows.media.speech/private.h
@@ -31,6 +31,7 @@
 #include "windef.h"
 #include "winbase.h"
 #include "winstring.h"
+#include "winuser.h"
 #include "objbase.h"
#include "activation.h"
@@ -47,6 +48,8 @@
#include "wine/list.h"
+#define SPERR_WINRT_INTERNAL_ERROR 0x800455a0 /* returned by recognizer.c when the unix-side Vosk recognizer cannot be created */
+
 /*
  *
  * Windows.Media.SpeechRecognition
diff --git a/dlls/windows.media.speech/recognizer.c b/dlls/windows.media.speech/recognizer.c
index c2f386206b8..06133031d44 100644
--- a/dlls/windows.media.speech/recognizer.c
+++ b/dlls/windows.media.speech/recognizer.c
@@ -25,6 +25,9 @@
#include "wine/debug.h"
+#include "unixlib.h"
+#include "wine/unixlib.h"
+
 WINE_DEFAULT_DEBUG_CHANNEL(speech);
/*
@@ -171,6 +174,8 @@ struct session
     IAudioCaptureClient *capture_client;
     WAVEFORMATEX capture_wfx;
+    speech_recognizer_handle unix_handle;
+
     HANDLE worker_thread, worker_control_event, audio_buf_event;
     BOOLEAN worker_running, worker_paused;
     CRITICAL_SECTION cs;
@@ -318,7 +323,9 @@ static ULONG WINAPI session_AddRef( ISpeechContinuousRecognitionSession *iface )
 static ULONG WINAPI session_Release( ISpeechContinuousRecognitionSession *iface )
 {
     struct session *impl = impl_from_ISpeechContinuousRecognitionSession(iface);
+    struct speech_release_recognizer_params release_params;
     ULONG ref = InterlockedDecrement(&impl->ref);
+
     TRACE("iface %p, ref %lu.\n", iface, ref);
if (!ref)
@@ -344,6 +351,9 @@ static ULONG WINAPI session_Release( ISpeechContinuousRecognitionSession *iface
         impl->cs.DebugInfo->Spare[0] = 0;
         DeleteCriticalSection(&impl->cs);
+        release_params.handle = impl->unix_handle;
+        WINE_UNIX_CALL(unix_speech_release_recognizer, &release_params);
+
         IVector_ISpeechRecognitionConstraint_Release(impl->constraints);
         free(impl);
     }
@@ -1079,6 +1089,35 @@ cleanup:
     return hr;
 }
+/*
+ * Create the unix-side Vosk recognizer for a session.
+ *
+ * The user's default locale name is lower-cased, converted to a narrow string
+ * and handed to the unixlib together with the capture sample rate. On success
+ * the returned handle is stored in session->unix_handle.
+ *
+ * Returns S_OK on success, E_FAIL or a Win32-derived HRESULT if the locale
+ * cannot be prepared, and SPERR_WINRT_INTERNAL_ERROR if the unix call fails.
+ */
+static HRESULT recognizer_factory_create_vosk_instance( struct session *session )
+{
+    struct speech_create_recognizer_params create_params = { 0 };
+    WCHAR locale[LOCALE_NAME_MAX_LENGTH];
+    NTSTATUS status;
+    INT len;
+
+    /* The unix side compares the locale with strncmp(), so it is lower-cased here first. */
+    len = GetUserDefaultLocaleName(locale, LOCALE_NAME_MAX_LENGTH);
+    if (!len || CharLowerBuffW(locale, len) != len)
+        return E_FAIL;
+
+    /* NOTE(review): len presumably counts the terminating NUL, which would then be converted too - confirm. */
+    if (!WideCharToMultiByte(CP_ACP, 0, locale, len, create_params.locale,
+                             ARRAY_SIZE(create_params.locale), NULL, NULL))
+        return HRESULT_FROM_WIN32(GetLastError());
+
+    create_params.sample_rate = (FLOAT)session->capture_wfx.nSamplesPerSec;
+
+    status = WINE_UNIX_CALL(unix_speech_create_recognizer, &create_params);
+    if (status)
+    {
+        ERR("Unable to create Vosk instance for locale %s, status %#lx. Speech recognition won't work.\n", debugstr_a(create_params.locale), status);
+        return SPERR_WINRT_INTERNAL_ERROR;
+    }
+
+    session->unix_handle = create_params.handle;
+    return S_OK;
+}
+
 static HRESULT WINAPI recognizer_factory_Create( ISpeechRecognizerFactory *iface, ILanguage *language, ISpeechRecognizer **speechrecognizer )
 {
     struct recognizer *impl;
@@ -1125,6 +1164,9 @@ static HRESULT WINAPI recognizer_factory_Create( ISpeechRecognizerFactory *iface
     if (FAILED(hr = recognizer_factory_create_audio_capture(session)))
         goto error;
+    if (FAILED(hr = recognizer_factory_create_vosk_instance(session)))
+        goto error;
+
     InitializeCriticalSection(&session->cs);
     session->cs.DebugInfo->Spare[0] = (DWORD_PTR)(__FILE__ ": recognition_session.cs");
diff --git a/dlls/windows.media.speech/unixlib.c b/dlls/windows.media.speech/unixlib.c
index d6f748b9426..da7fc5ec31b 100644
--- a/dlls/windows.media.speech/unixlib.c
+++ b/dlls/windows.media.speech/unixlib.c
@@ -26,9 +26,12 @@
#include <stdio.h>
 #include <stdlib.h>
-#include <string.h>
 #include <stdarg.h>
+#include <string.h>
+#include <dirent.h>
 #include <dlfcn.h>
+#include <errno.h>
+#include <sys/stat.h>
#ifdef SONAME_LIBVOSK
 #include <vosk_api.h>
@@ -94,6 +97,178 @@ static NTSTATUS process_detach( void *args )
     return STATUS_SUCCESS;
 }
+/* Pack a VoskRecognizer pointer into the fixed-width handle handed back to the caller. */
+static inline speech_recognizer_handle vosk_recognizer_to_handle( VoskRecognizer *recognizer )
+{
+    UINT_PTR value = (UINT_PTR)recognizer;
+
+    return (speech_recognizer_handle)value;
+}
+
+/* Recover the VoskRecognizer pointer stored in a handle by vosk_recognizer_to_handle(). */
+static inline VoskRecognizer *vosk_recognizer_from_handle( speech_recognizer_handle handle )
+{
+    UINT_PTR value = (UINT_PTR)handle;
+
+    return (VoskRecognizer *)value;
+}
+
+/*
+ * Look in the directory `path` for a Vosk model folder matching `locale` and load it.
+ *
+ * Candidates are directories whose name starts with "vosk-model-" or
+ * "vosk-model-small-" followed by the locale. An entry matching the whole
+ * locale ("en-us") takes precedence over one matching only the language
+ * part ("en").
+ *
+ * path    Directory to scan; may be NULL, in which case nothing is searched.
+ * locale  Locale name, at least 4 characters long; compared case-sensitively.
+ * model   Receives the loaded model on success; the caller releases it.
+ *
+ * Returns STATUS_SUCCESS if a model was loaded, STATUS_NO_MEMORY on allocation
+ * failure and STATUS_UNSUCCESSFUL otherwise.
+ */
+static NTSTATUS find_model_by_locale_and_path( const char *path, const char *locale, VoskModel **model )
+{
+    static const char *vosk_model_identifier_small = "vosk-model-small-";
+    static const char *vosk_model_identifier = "vosk-model-";
+    size_t ident_small_len = strlen(vosk_model_identifier_small);
+    size_t ident_len = strlen(vosk_model_identifier);
+    char *model_path, *best_match = NULL, *match;
+    NTSTATUS status = STATUS_UNSUCCESSFUL;
+    size_t path_len, len, lang_len;
+    const char *ent_name, *delim;
+    struct dirent *dirent;
+    struct stat stat;
+    DIR *dir;
+    INT fd;
+
+    /* path comes from getenv() in the caller and may be NULL, so it must not go through a bare %s. */
+    TRACE("path %s, locale %s, model %p.\n", debugstr_a(path), debugstr_a(locale), model);
+
+    if (!path || !locale || (len = strlen(locale)) < 4)
+        return STATUS_UNSUCCESSFUL;
+
+    if (!(dir = opendir(path)))
+        return STATUS_UNSUCCESSFUL;
+
+    if ((fd = dirfd(dir)) == -1)
+        goto done;
+
+    /* Length of the language part; a locale without a region separator is compared as a whole. */
+    delim = strchr(locale, '-');
+    lang_len = delim ? (size_t)(delim - locale) : len;
+    path_len = strlen(path);
+    *model = NULL;
+
+    while ((dirent = readdir(dir)))
+    {
+        ent_name = dirent->d_name;
+
+        if (!strncmp(ent_name, vosk_model_identifier_small, ident_small_len))
+            ent_name += ident_small_len;
+        else if (!strncmp(ent_name, vosk_model_identifier, ident_len))
+            ent_name += ident_len;
+        else
+            continue;
+
+        /* First match for lang and region (en-us), then only lang (en). */
+        if (strncmp(ent_name, locale, len))
+        {
+            /*
+             * best_match holds the best matching model folder name, in which a matching lang-region
+             * combo holds higher precedence over just a matching language. This means if best_match
+             * is set, we assume either a lang-region or just lang matching folder was already found,
+             * so we don't try to do another lang match.
+             */
+            if (best_match || strncmp(ent_name, locale, lang_len))
+                continue;
+        }
+
+        if (fstatat(fd, dirent->d_name, &stat, 0))
+        {
+            WARN("Failed to get stats for entry %s.\n", debugstr_a(dirent->d_name));
+            continue;
+        }
+
+        if (!S_ISDIR(stat.st_mode))
+            continue;
+
+        /*
+         * The dirent storage may be overwritten by the next readdir() call,
+         * so keep a private copy of the name instead of a pointer into it.
+         */
+        if (!(match = strdup(dirent->d_name)))
+        {
+            status = STATUS_NO_MEMORY;
+            goto done;
+        }
+
+        free(best_match);
+        best_match = match;
+    }
+
+    if (best_match)
+    {
+        if (!(model_path = malloc(path_len + 1 /* '/' */ + strlen(best_match) + 1)))
+        {
+            status = STATUS_NO_MEMORY;
+            goto done;
+        }
+
+        sprintf(model_path, "%s/%s", path, best_match);
+
+        TRACE("Trying to load Vosk model %s.\n", debugstr_a(model_path));
+
+        *model = p_vosk_model_new(model_path);
+        free(model_path);
+
+        if (*model) status = STATUS_SUCCESS;
+    }
+
+done:
+    free(best_match);
+    closedir(dir);
+
+    return status;
+}
+
+/*
+ * Locate and load a Vosk model for `locale`.
+ *
+ * Directories are tried in this order: $VOSK_MODEL_PATH, /usr/share/vosk,
+ * then $XDG_CACHE_HOME/vosk or, if XDG_CACHE_HOME is unset, $HOME/.cache/vosk.
+ *
+ * Returns STATUS_SUCCESS with *model set when a model was loaded,
+ * STATUS_NO_MEMORY on allocation failure, otherwise the failure status of the
+ * last lookup (STATUS_UNSUCCESSFUL when neither cache variable is set or
+ * `model` is NULL).
+ */
+static NTSTATUS find_model_by_locale( const char *locale, VoskModel **model )
+{
+    const char *base, *subdir;
+    char *cache_path;
+    NTSTATUS status;
+
+    TRACE("locale %s, model %p.\n", debugstr_a(locale), model);
+
+    if (!model)
+        return STATUS_UNSUCCESSFUL;
+
+    if (find_model_by_locale_and_path(getenv("VOSK_MODEL_PATH"), locale, model) == STATUS_SUCCESS)
+        return STATUS_SUCCESS;
+
+    if (find_model_by_locale_and_path("/usr/share/vosk", locale, model) == STATUS_SUCCESS)
+        return STATUS_SUCCESS;
+
+    /* Fall back to the per-user cache directory. */
+    base = getenv("XDG_CACHE_HOME");
+    if (base)
+    {
+        subdir = "/vosk";
+    }
+    else
+    {
+        base = getenv("HOME");
+        if (!base)
+            return STATUS_UNSUCCESSFUL;
+        subdir = "/.cache/vosk";
+    }
+
+    cache_path = malloc(strlen(base) + strlen(subdir) + 1);
+    if (!cache_path)
+        return STATUS_NO_MEMORY;
+
+    sprintf(cache_path, "%s%s", base, subdir);
+    status = find_model_by_locale_and_path(cache_path, locale, model);
+    free(cache_path);
+
+    return status;
+}
+
+/*
+ * Unix call: create a Vosk recognizer for params->locale and params->sample_rate.
+ *
+ * args points to a struct speech_create_recognizer_params. Once a model has
+ * been found, params->handle is written with the recognizer handle (0 if
+ * creating the recognizer failed).
+ *
+ * Returns STATUS_NOT_SUPPORTED if libvosk is not loaded, the model lookup
+ * status if no model was found, STATUS_UNSUCCESSFUL if the recognizer could
+ * not be created and STATUS_SUCCESS otherwise.
+ */
+static NTSTATUS speech_create_recognizer( void *args )
+{
+    struct speech_create_recognizer_params *params = args;
+    VoskRecognizer *recognizer;
+    VoskModel *model = NULL;
+    NTSTATUS status;
+
+    TRACE("args %p.\n", args);
+
+    if (!vosk_handle)
+        return STATUS_NOT_SUPPORTED;
+
+    status = find_model_by_locale(params->locale, &model);
+    if (status != STATUS_SUCCESS)
+        return status;
+
+    recognizer = p_vosk_recognizer_new(model, params->sample_rate);
+
+    /* VoskModel is reference-counted.  A VoskRecognizer keeps a reference to its model. */
+    p_vosk_model_free(model);
+
+    params->handle = vosk_recognizer_to_handle(recognizer);
+    return recognizer ? STATUS_SUCCESS : STATUS_UNSUCCESSFUL;
+}
+
+/*
+ * Unix call: free the Vosk recognizer referenced by params->handle.
+ *
+ * args points to a struct speech_release_recognizer_params.
+ * Returns STATUS_NOT_SUPPORTED if libvosk is not loaded, STATUS_SUCCESS otherwise.
+ */
+static NTSTATUS speech_release_recognizer( void *args )
+{
+    struct speech_release_recognizer_params *params = args;
+    VoskRecognizer *recognizer;
+
+    TRACE("args %p.\n", args);
+
+    if (!vosk_handle)
+        return STATUS_NOT_SUPPORTED;
+
+    recognizer = vosk_recognizer_from_handle(params->handle);
+    p_vosk_recognizer_free(recognizer);
+
+    return STATUS_SUCCESS;
+}
+
 #else /* SONAME_LIBVOSK */
#define MAKE_UNSUPPORTED_FUNC( f ) \
@@ -105,6 +280,8 @@ static NTSTATUS process_detach( void *args )
MAKE_UNSUPPORTED_FUNC(process_attach)
 MAKE_UNSUPPORTED_FUNC(process_detach)
+MAKE_UNSUPPORTED_FUNC(speech_create_recognizer)
+MAKE_UNSUPPORTED_FUNC(speech_release_recognizer)
 #undef MAKE_UNSUPPORTED_FUNC
#endif /* SONAME_LIBVOSK */
@@ -113,10 +290,14 @@ unixlib_entry_t __wine_unix_call_funcs[] =
 {
     process_attach,
     process_detach,
+    speech_create_recognizer,
+    speech_release_recognizer,
 };
unixlib_entry_t __wine_unix_call_wow64_funcs[] =
 {
     process_attach,
     process_detach,
+    speech_create_recognizer, /* slot unix_speech_create_recognizer; same entry point as the native table */
+    speech_release_recognizer, /* slot unix_speech_release_recognizer; same entry point as the native table */
 };
diff --git a/dlls/windows.media.speech/unixlib.h b/dlls/windows.media.speech/unixlib.h
index 6c337e54511..974e8d5f797 100644
--- a/dlls/windows.media.speech/unixlib.h
+++ b/dlls/windows.media.speech/unixlib.h
@@ -30,10 +30,26 @@
#include "wine/unixlib.h"
+typedef UINT64 speech_recognizer_handle; /* fixed-width recognizer handle; unixlib.c stores a VoskRecognizer pointer in it via UINT_PTR */
+
+struct speech_create_recognizer_params /* arguments for the unix_speech_create_recognizer call */
+{
+    speech_recognizer_handle handle; /* out: handle of the created recognizer */
+    CHAR locale[LOCALE_NAME_MAX_LENGTH]; /* in: locale name used to pick a model directory; the caller passes it lower-cased */
+    FLOAT sample_rate; /* in: sample rate handed to the recognizer; the caller fills it from nSamplesPerSec */
+};
+
+struct speech_release_recognizer_params /* arguments for the unix_speech_release_recognizer call */
+{
+    speech_recognizer_handle handle; /* in: handle of the recognizer to free */
+};
+
 enum unix_funcs
 {
     unix_process_attach,
     unix_process_detach,
+    unix_speech_create_recognizer, /* takes struct speech_create_recognizer_params; position must match the unixlib.c call tables */
+    unix_speech_release_recognizer, /* takes struct speech_release_recognizer_params; position must match the unixlib.c call tables */
 };
#endif
-- 
GitLab

https://gitlab.winehq.org/wine/wine/-/merge_requests/2091

    

2025

2024

2023

2022

[PATCH v20 7/7] windows.media.speech: Implement Vosk create and release functions in the unixlib.