GStreamer uses _SC_NPROCESSORS_CONF to determine 'max-threads'. On the Steam Deck, this is configured to be 16 (which is double its number of logical cores).
_SC_NPROCESSORS_CONF also disregards a process's CPU affinity, so GStreamer can create more threads than are useful, which ultimately wastes memory.
Using affinity to set 'max-threads' addresses both these problems.
-- v10: winegstreamer: Set 'max-threads' to 4 for 32-bit processes.
From: Brendan McGrath bmcgrath@codeweavers.com
--- dlls/winegstreamer/unixlib.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+)
diff --git a/dlls/winegstreamer/unixlib.c b/dlls/winegstreamer/unixlib.c index 175ab92ecdc..9124103cd46 100644 --- a/dlls/winegstreamer/unixlib.c +++ b/dlls/winegstreamer/unixlib.c @@ -47,6 +47,8 @@
GST_DEBUG_CATEGORY(wine);
+static UINT16 thread_count; + GstStreamType stream_type_from_caps(GstCaps *caps) { const gchar *media_type; @@ -244,6 +246,17 @@ bool push_event(GstPad *pad, GstEvent *event) return true; }
+static ULONG popcount(ULONG val) +{ +#if HAVE___BUILTIN_POPCOUNT + return __builtin_popcount(val); +#else + val -= val >> 1 & 0x55555555; + val = (val & 0x33333333) + (val >> 2 & 0x33333333); + return ((val + (val >> 4)) & 0x0f0f0f0f) * 0x01010101 >> 24; +#endif +} + NTSTATUS wg_init_gstreamer(void *arg) { struct wg_init_gstreamer_params *params = arg; @@ -253,6 +266,7 @@ NTSTATUS wg_init_gstreamer(void *arg) int argc = ARRAY_SIZE(args) - 1; char **argv = args; GError *err; + DWORD_PTR process_mask;
if (params->trace_on) setenv("GST_DEBUG", "WINE:9,4", FALSE); @@ -276,6 +290,12 @@ NTSTATUS wg_init_gstreamer(void *arg) return STATUS_UNSUCCESSFUL; }
+ if (!NtQueryInformationProcess(GetCurrentProcess(), + ProcessAffinityMask, &process_mask, sizeof(process_mask), NULL)) + thread_count = popcount(process_mask); + else + thread_count = 0; + GST_DEBUG_CATEGORY_INIT(wine, "WINE", GST_DEBUG_FG_RED, "Wine GStreamer support");
GST_INFO("GStreamer library version %s; wine built with %d.%d.%d.",
From: Brendan McGrath bmcgrath@codeweavers.com
GStreamer uses _SC_NPROCESSORS_CONF to determine 'max-threads'. On the Steam Deck, this is configured to be 16 (which is double its number of logical cores).
_SC_NPROCESSORS_CONF also disregards a process's CPU affinity, so GStreamer can create more threads than are useful, which ultimately wastes memory.
Using thread_count to set 'max-threads' addresses both these problems. --- dlls/winegstreamer/unix_private.h | 1 + dlls/winegstreamer/unixlib.c | 34 +++++++++++++++++++++++++++++++ dlls/winegstreamer/wg_parser.c | 7 +++++++ dlls/winegstreamer/wg_transform.c | 2 ++ 4 files changed, 44 insertions(+)
diff --git a/dlls/winegstreamer/unix_private.h b/dlls/winegstreamer/unix_private.h index 985b70a925c..c62307aca10 100644 --- a/dlls/winegstreamer/unix_private.h +++ b/dlls/winegstreamer/unix_private.h @@ -46,6 +46,7 @@ extern bool link_src_to_sink(GstPad *src_pad, GstPad *sink_pad); extern bool link_src_to_element(GstPad *src_pad, GstElement *element); extern bool link_element_to_sink(GstElement *element, GstPad *sink_pad); extern bool push_event(GstPad *pad, GstEvent *event); +extern void set_max_threads(GstElement *element);
/* wg_format.c */
diff --git a/dlls/winegstreamer/unixlib.c b/dlls/winegstreamer/unixlib.c index 9124103cd46..f085693cae6 100644 --- a/dlls/winegstreamer/unixlib.c +++ b/dlls/winegstreamer/unixlib.c @@ -302,3 +302,37 @@ NTSTATUS wg_init_gstreamer(void *arg) gst_version_string(), GST_VERSION_MAJOR, GST_VERSION_MINOR, GST_VERSION_MICRO); return STATUS_SUCCESS; } + +static inline bool element_has_property(const GstElement *element, const gchar *property) +{ + return !!g_object_class_find_property(G_OBJECT_CLASS(GST_ELEMENT_GET_CLASS(element)), property); +} + +void set_max_threads(GstElement *element) +{ + const char *shortname = NULL; + GstElementFactory *factory = gst_element_get_factory(element); + + if (factory) + shortname = gst_plugin_feature_get_name(GST_PLUGIN_FEATURE(factory)); + + /* By default, GStreamer will use the result of sysconf(_SC_NPROCESSORS_CONF) to determine the number + * of decoder threads to be used by libva. This has two issues: + * 1. It can return an inaccurate result (for example, on the Steam Deck this returns 16); and + * 2. It disregards process affinity + * + * Both of these scenarios result in more threads being allocated than logical cores made available, meaning + * they provide little (or possibly detrimental) performance benefit and for 4K video can occupy 32MB + * of RAM each (w * h * bpp). + * + * So we will instead explictly set 'max-threads' to the minimum of thread_count (process affinity at time of + * initialization) or 16. 
+ */ + + if (shortname && strstr(shortname, "avdec_") && element_has_property(element, "max-threads")) + { + gint32 max_threads = MIN(thread_count, 16); + GST_DEBUG("%s found, setting max-threads to %d.", shortname, max_threads); + g_object_set(element, "max-threads", max_threads, NULL); + } +} diff --git a/dlls/winegstreamer/wg_parser.c b/dlls/winegstreamer/wg_parser.c index 8690e6baf81..710cfe6a0a5 100644 --- a/dlls/winegstreamer/wg_parser.c +++ b/dlls/winegstreamer/wg_parser.c @@ -567,6 +567,12 @@ static void no_more_pads_cb(GstElement *element, gpointer user) pthread_cond_signal(&parser->init_cond); }
+static void deep_element_added_cb(GstBin *self, GstBin *sub_bin, GstElement *element, gpointer user) +{ + if (element) + set_max_threads(element); +} + static gboolean sink_event_cb(GstPad *pad, GstObject *parent, GstEvent *event) { struct wg_parser_stream *stream = gst_pad_get_element_private(pad); @@ -1797,6 +1803,7 @@ static BOOL decodebin_parser_init_gst(struct wg_parser *parser) g_signal_connect(element, "autoplug-continue", G_CALLBACK(autoplug_continue_cb), parser); g_signal_connect(element, "autoplug-select", G_CALLBACK(autoplug_select_cb), parser); g_signal_connect(element, "no-more-pads", G_CALLBACK(no_more_pads_cb), parser); + g_signal_connect(element, "deep-element-added", G_CALLBACK(deep_element_added_cb), parser);
pthread_mutex_lock(&parser->mutex); parser->no_more_pads = false; diff --git a/dlls/winegstreamer/wg_transform.c b/dlls/winegstreamer/wg_transform.c index 614125522a8..70a09eb14e3 100644 --- a/dlls/winegstreamer/wg_transform.c +++ b/dlls/winegstreamer/wg_transform.c @@ -454,6 +454,8 @@ NTSTATUS wg_transform_create(void *args) if (!(element = find_element(GST_ELEMENT_FACTORY_TYPE_DECODER, parsed_caps, sink_caps)) || !append_element(transform->container, element, &first, &last)) goto out; + + set_max_threads(element); }
if (g_str_has_prefix(output_mime, "audio/"))
From: Brendan McGrath bmcgrath@codeweavers.com
The avdec_h264 element can use 32MB per thread when working with 4K video.
With 16 threads, this is 512MB, which is a quarter of the RAM available to a 32-bit application. Setting 'max-threads' to 4 can save 384MB. --- dlls/winegstreamer/unixlib.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/dlls/winegstreamer/unixlib.c b/dlls/winegstreamer/unixlib.c index f085693cae6..165d38909aa 100644 --- a/dlls/winegstreamer/unixlib.c +++ b/dlls/winegstreamer/unixlib.c @@ -326,12 +326,12 @@ void set_max_threads(GstElement *element) * of RAM each (w * h * bpp). * * So we will instead explictly set 'max-threads' to the minimum of thread_count (process affinity at time of - * initialization) or 16. + * initialization) or 16 (4 for 32-bit processors). */
if (shortname && strstr(shortname, "avdec_") && element_has_property(element, "max-threads")) { - gint32 max_threads = MIN(thread_count, 16); + gint32 max_threads = MIN(thread_count, sizeof(void *) == 4 ? 4 : 16); GST_DEBUG("%s found, setting max-threads to %d.", shortname, max_threads); g_object_set(element, "max-threads", max_threads, NULL); }
On Wed Jul 3 18:46:49 2024 +0000, Brendan McGrath wrote:
changed this line in [version 9 of the diff](/wine/wine/-/merge_requests/5923/diffs?diff_id=120480&start_sha=e96def2be0108d332885ba8b7b357e7ea12ccebd#60c57c4ae493c61ac248f463f5b5971603bd4e53_37_35)
I was thinking I should move it out of the header. I've changed this to be a `static inline` within unixlib.c.
On Wed Jul 3 16:16:14 2024 +0000, Elizabeth Figura wrote:
This kind of element-specific workaround really deserves a comment in the code. I also don't think MAX_THREADS is really doing much when it's only used in one place, and just writing 16 directly would be just as clear.
You're right. I've added a comment to explain the code and removed the MAX_THREADS const.
On Wed Jul 3 18:46:51 2024 +0000, Brendan McGrath wrote:
changed this line in [version 9 of the diff](/wine/wine/-/merge_requests/5923/diffs?diff_id=120480&start_sha=e96def2be0108d332885ba8b7b357e7ea12ccebd#cbb9783c1a61c0a7c24e47cfacdc62e9e75218a2_320_334)
Done. Thanks for taking a look, Zeb. Appreciated.
On Wed Jul 3 18:50:20 2024 +0000, Brendan McGrath wrote:
I was thinking I should move it out of the header. I've changed this to be a `static inline` within unixlib.c.
I don't think it needs to be inline, and I tend to avoid that since it means the compiler won't detect when it becomes unused.