On my Steam Deck, this radically reduces wg_parser initialization time (by around 1 second). This helps in the game WILD HEARTS, which doesn't present frames while loading help videos during gameplay, causing large stutters.
Because GStreamer can randomly access the file with no known pattern on our side, I opted to implement this by buffering 4 chunks so that interleaved reads to different areas of the file don't cause us to discard and reload cached data more than we need to.
From: Derek Lesho <dlesho@codeweavers.com>
Cache wg_parser input data, in order to reduce wg_parser initialization time by skipping the round-trip to the PE thread.
Signed-off-by: Derek Lesho <dlesho@codeweavers.com>
---
 dlls/winegstreamer/wg_parser.c | 135 +++++++++++++++++++++++++++++++++
 1 file changed, 135 insertions(+)
diff --git a/dlls/winegstreamer/wg_parser.c b/dlls/winegstreamer/wg_parser.c
index 5bb824f4399..13838b86442 100644
--- a/dlls/winegstreamer/wg_parser.c
+++ b/dlls/winegstreamer/wg_parser.c
@@ -96,7 +96,15 @@ struct wg_parser
     bool unlimited_buffering;

     gchar *sink_caps;
+
+    struct
+    {
+        unsigned int rank;
+        guint64 offset;
+        uint8_t *data;
+    } cached_chunks[4];
 };
+static unsigned int wg_parser_chunk_size = 8 << 20;

 struct wg_parser_stream
 {
@@ -967,10 +975,99 @@ static void pad_removed_cb(GstElement *element, GstPad *pad, gpointer user)
     g_free(name);
 }

+static unsigned int read_parser_cache(struct wg_parser *parser, guint64 offset, guint size, GstBuffer *buffer, guint64 buffer_offset)
+{
+    unsigned int chunk_rank, read_size, bytes_read = 0;
+    int i, k;
+
+    for (i = 0; i < ARRAY_SIZE(parser->cached_chunks); i++)
+    {
+        chunk_rank = parser->cached_chunks[i].rank;
+
+        if (chunk_rank && (offset - parser->cached_chunks[i].offset) < wg_parser_chunk_size)
+        {
+            read_size = min(size, wg_parser_chunk_size - (offset - parser->cached_chunks[i].offset));
+            gst_buffer_fill(buffer, buffer_offset + bytes_read, parser->cached_chunks[i].data + (offset - parser->cached_chunks[i].offset), read_size);
+
+            if (chunk_rank != ARRAY_SIZE(parser->cached_chunks))
+            {
+                parser->cached_chunks[i].rank = ARRAY_SIZE(parser->cached_chunks);
+
+                for (k = 0; k < ARRAY_SIZE(parser->cached_chunks); k++)
+                {
+                    if (k != i && parser->cached_chunks[k].rank > chunk_rank)
+                    {
+                        parser->cached_chunks[k].rank--;
+                    }
+                }
+            }
+
+            size -= read_size;
+            offset += read_size;
+            bytes_read += read_size;
+
+            if (size)
+                i = -1;
+            else
+                break;
+        }
+    }
+
+    return bytes_read;
+}
+
+static GstFlowReturn load_parser_cache(struct wg_parser *parser, guint64 offset)
+{
+    guint64 chunk_start = offset - (offset % wg_parser_chunk_size);
+    GstFlowReturn ret = GST_FLOW_OK;
+    GstBuffer *buffer;
+    int i;
+
+    for (i = 0; i < ARRAY_SIZE(parser->cached_chunks); i++)
+    {
+        if (parser->cached_chunks[i].rank <= 1)
+        {
+            if (!parser->cached_chunks[i].data)
+                parser->cached_chunks[i].data = malloc(wg_parser_chunk_size);
+
+            buffer = gst_buffer_new_wrapped_full(0, parser->cached_chunks[i].data, wg_parser_chunk_size, 0, wg_parser_chunk_size, NULL, NULL);
+
+            pthread_mutex_lock(&parser->mutex);
+
+            assert(!parser->read_request.size);
+            parser->read_request.buffer = buffer;
+            parser->read_request.offset = chunk_start;
+            parser->read_request.size = wg_parser_chunk_size;
+            parser->read_request.done = false;
+            pthread_cond_signal(&parser->read_cond);
+
+            while (!parser->read_request.done)
+                pthread_cond_wait(&parser->read_done_cond, &parser->mutex);
+
+            ret = parser->read_request.ret;
+
+            pthread_mutex_unlock(&parser->mutex);
+
+            gst_buffer_unref(buffer);
+
+            if (ret != GST_FLOW_OK)
+                return ret;
+
+            parser->cached_chunks[i].offset = chunk_start;
+            if (!parser->cached_chunks[i].rank)
+                parser->cached_chunks[i].rank = 1;
+        }
+    }
+
+    return ret;
+}
+
 static GstFlowReturn src_getrange_cb(GstPad *pad, GstObject *parent,
         guint64 offset, guint size, GstBuffer **buffer)
 {
     struct wg_parser *parser = gst_pad_get_element_private(pad);
+    GstBuffer *working_buffer = *buffer;
+    unsigned int bytes_read = 0;
     GstFlowReturn ret;
GST_LOG("pad %p, offset %" G_GINT64_MODIFIER "u, size %u, buffer %p.", pad, offset, size, *buffer); @@ -991,6 +1088,34 @@ static GstFlowReturn src_getrange_cb(GstPad *pad, GstObject *parent, return GST_FLOW_OK; }

+    if (size < wg_parser_chunk_size)
+    {
+        if (offset >= parser->file_size)
+            return GST_FLOW_EOS;
+
+        if ((offset + size) >= parser->file_size)
+            size = parser->file_size - offset;
+
+        if (!working_buffer)
+            working_buffer = gst_buffer_new_and_alloc(size);
+
+        while ((bytes_read += read_parser_cache(parser, offset, size, working_buffer, bytes_read)) < size)
+        {
+            size -= bytes_read;
+            offset += bytes_read;
+
+            if ((ret = load_parser_cache(parser, offset)) != GST_FLOW_OK)
+            {
+                if (!*buffer)
+                    gst_buffer_unref(working_buffer);
+                return ret;
+            }
+        }
+
+        *buffer = working_buffer;
+        return GST_FLOW_OK;
+    }
+
     pthread_mutex_lock(&parser->mutex);

     assert(!parser->read_request.size);
@@ -1562,6 +1687,9 @@ static NTSTATUS wg_parser_disconnect(void *args)
     g_free(parser->sink_caps);
     parser->sink_caps = NULL;

+    for (i = 0; i < ARRAY_SIZE(parser->cached_chunks); i++)
+        parser->cached_chunks[i].rank = 0;
+
     return S_OK;
 }
@@ -1765,6 +1893,7 @@ static NTSTATUS wg_parser_create(void *args)
 static NTSTATUS wg_parser_destroy(void *args)
 {
     struct wg_parser *parser = args;
+    unsigned int i;

     if (parser->bus)
     {
@@ -1772,6 +1901,12 @@ static NTSTATUS wg_parser_destroy(void *args)
         gst_object_unref(parser->bus);
     }

+    for (i = 0; i < ARRAY_SIZE(parser->cached_chunks); i++)
+    {
+        if (parser->cached_chunks[i].data)
+            free(parser->cached_chunks[i].data);
+    }
+
     pthread_mutex_destroy(&parser->mutex);
     pthread_cond_destroy(&parser->init_cond);
     pthread_cond_destroy(&parser->read_cond);
General comments:
* This will use up to 32 MiB per wg_parser, which seems like an awful lot, especially since applications can often create more than one of these at a time. I'm concerned about the memory usage and address space implications. Can the chunk size get any smaller without the performance regressing again?
* The code as written does handle loads that span chunk boundaries, although I feel like it's done in a non-obvious way. Reading through the loop in src_getrange_cb() I'm left wondering "why would read_parser_cache() return less than the requested size?" and "why are read_parser_cache() and load_parser_cache() separate functions?"
* And along similar lines, is it even worth falling back to the old path for larger read requests? If not, we should factor out a helper to actually perform the read pseudo-callback.
* Instead of a "rank" field, I think it would be simpler just to make the index itself be the rank, and memmove the entries every time (see the sketch after this list).
* I don't like the naming of "parser_cache"; it's both redundant (everything in the file has to do with the parser) and not specific enough (caching what?). I'd propose "input_cache" or "read_cache" here.
* The parser mutex doesn't seem to be taken around everything that it should be.
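
To illustrate the rank suggestion: if cached_chunks[] is kept ordered most-recently-used first, a cache hit just rotates the entry to the front, and eviction always targets the last slot. A minimal sketch, assuming the anonymous chunk struct is pulled out under a hypothetical name like input_cache_chunk:

static void promote_chunk(struct wg_parser *parser, unsigned int i)
{
    struct input_cache_chunk chunk = parser->cached_chunks[i];

    /* Entries 0..i-1 each age by one slot; the hit entry becomes index 0. */
    memmove(&parser->cached_chunks[1], &parser->cached_chunks[0], i * sizeof(chunk));
    parser->cached_chunks[0] = chunk;
}

An empty slot can then be represented by data == NULL rather than rank == 0.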
This will use up to 32 MiB per wg_parser
I looked up online what typical SSD read chunk sizes are, and found the figure 512 KiB. Setting the chunk to this size actually seems to _improve_ performance, presumably because we're spending less time reading/copying data we don't need at the moment. Do you think this is small enough (512 KiB per chunk * 4 chunks = 2 MiB per parser)?
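
Concretely, that would just mean shrinking the constant from the patch; the exact value is of course subject to tuning:

static unsigned int wg_parser_chunk_size = 512 << 10; /* 512 KiB; 4 chunks = 2 MiB per parser */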
"why are read_parser_cache() and load_parser_cache() separate functions?
Ack, I'll put these together.
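
Roughly along these lines, as a sketch; find_or_load_chunk() here is a hypothetical helper that would search cached_chunks[] and issue a whole-chunk read request on a miss. This also makes the chunk-boundary handling explicit:

static GstFlowReturn read_input_cache(struct wg_parser *parser, guint64 offset,
        guint size, GstBuffer *buffer)
{
    guint64 buffer_offset = 0;

    /* A request may span several chunks; copy out of each one in turn. */
    while (size)
    {
        guint64 chunk_start = offset - (offset % wg_parser_chunk_size);
        unsigned int chunk_offset = offset - chunk_start;
        unsigned int read_size = min(size, wg_parser_chunk_size - chunk_offset);
        GstFlowReturn ret;
        uint8_t *data;

        if ((ret = find_or_load_chunk(parser, chunk_start, &data)) != GST_FLOW_OK)
            return ret;

        gst_buffer_fill(buffer, buffer_offset, data + chunk_offset, read_size);

        buffer_offset += read_size;
        offset += read_size;
        size -= read_size;
    }

    return GST_FLOW_OK;
}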
And along similar lines, is it even worth falling back to the old path for larger read requests?
The advantage to falling back that I can think of is that it saves a copy: otherwise the data is first copied into the cache only to be read out into the buffer afterwards. How important this is to performance probably depends on how often we hit the read_size > chunk_size path. In my test case I don't think it would hurt too much, but in the case where GStreamer reads the whole file at once, maybe it would make things worse. If it's okay, I'll let you make the judgement call.
I think it would be simpler just to make the index itself be the rank
Ack
If not, we should factor out a helper to actually perform the read pseudo-callback.
I'm a bit confused here: if it's not worth falling back to the old path, then we'd only have one place performing the read pseudo-callback, namely the function reading/loading from the cache into the buffer, right? If you meant the other way around, where we keep both paths, then yeah, I can make that helper.
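
For reference, such a helper would just wrap the request hand-off to the PE thread that load_parser_cache() currently does inline, something like (issue_read_request() being a placeholder name):

static GstFlowReturn issue_read_request(struct wg_parser *parser, guint64 offset,
        guint size, GstBuffer *buffer)
{
    GstFlowReturn ret;

    pthread_mutex_lock(&parser->mutex);

    assert(!parser->read_request.size);
    parser->read_request.buffer = buffer;
    parser->read_request.offset = offset;
    parser->read_request.size = size;
    parser->read_request.done = false;
    pthread_cond_signal(&parser->read_cond);

    /* Wait for the PE-side reader thread to fill the buffer. */
    while (!parser->read_request.done)
        pthread_cond_wait(&parser->read_done_cond, &parser->mutex);

    ret = parser->read_request.ret;

    pthread_mutex_unlock(&parser->mutex);
    return ret;
}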
I'd propose "input_cache" or "read_cache" here.
Ack
The parser mutex doesn't seem to be taken around everything that it should be.
I'm pretty sure getrange doesn't need to be implemented as thread-safe, as GStreamer takes a lock on the pad anyway before invoking the callback:
https://github.com/GStreamer/gstreamer/blob/2db3ddaa9d8e9e0a1659d7e48623c512...