These are all the patches in this area that I think make sense at this point in the release cycle (i.e. a couple of days before code freeze...)
I originally had many more. A few are not really suitable at this time; a couple turned out not to make a real difference, or generally did not feel like they would be an improvement overall. Many ended up getting in the way (or not being in the spirit) of what we probably want to do at some point, which is grouping the whole draw state setup data together with the draw itself. Not that I could find the time to explore that yet...
-- v2: wined3d: Add a bunch of d3d_perf traces. wined3d: Add a frametime debug channel. wined3d: Increase WINED3D_CS_QUERY_POLL_INTERVAL to 100. wined3d: Sleep when waiting for the CS thread. wined3d: Do a blocking wait for CS commands even when there are active queries.
From: Zebediah Figura zfigura@codeweavers.com
--- dlls/wined3d/cs.c | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-)
diff --git a/dlls/wined3d/cs.c b/dlls/wined3d/cs.c index bf171326926..edc2feeaab2 100644 --- a/dlls/wined3d/cs.c +++ b/dlls/wined3d/cs.c @@ -24,6 +24,9 @@ WINE_DECLARE_DEBUG_CHANNEL(d3d_perf); WINE_DECLARE_DEBUG_CHANNEL(d3d_sync); WINE_DECLARE_DEBUG_CHANNEL(fps);
+static NTSTATUS (WINAPI *pNtAlertThreadByThreadId)(HANDLE tid); +static NTSTATUS (WINAPI *pNtWaitForAlertByThreadId)(void *addr, const LARGE_INTEGER *timeout); + #define WINED3D_INITIAL_CS_SIZE 4096
struct wined3d_deferred_upload @@ -3188,7 +3191,12 @@ static void wined3d_cs_queue_submit(struct wined3d_cs_queue *queue, struct wined InterlockedExchange((LONG *)&queue->head, queue->head + packet_size);
if (InterlockedCompareExchange(&cs->waiting_for_event, FALSE, TRUE)) - SetEvent(cs->event); + { + if (pNtAlertThreadByThreadId) + pNtAlertThreadByThreadId((HANDLE)(ULONG_PTR)cs->thread_id); + else + SetEvent(cs->event); + } }
static void wined3d_cs_mt_submit(struct wined3d_device_context *context, enum wined3d_cs_queue_id queue_id) @@ -3328,7 +3336,10 @@ static void wined3d_cs_wait_event(struct wined3d_cs *cs) && InterlockedCompareExchange(&cs->waiting_for_event, FALSE, TRUE)) return;
- WaitForSingleObject(cs->event, INFINITE); + if (pNtWaitForAlertByThreadId) + pNtWaitForAlertByThreadId(NULL, NULL); + else + WaitForSingleObject(cs->event, INFINITE); }
static void wined3d_cs_command_lock(const struct wined3d_cs *cs) @@ -3529,7 +3540,15 @@ struct wined3d_cs *wined3d_cs_create(struct wined3d_device *device, { cs->c.ops = &wined3d_cs_mt_ops;
- if (!(cs->event = CreateEventW(NULL, FALSE, FALSE, NULL))) + if (!pNtAlertThreadByThreadId) + { + HANDLE ntdll = GetModuleHandleW(L"ntdll.dll"); + + pNtAlertThreadByThreadId = (void *)GetProcAddress(ntdll, "NtAlertThreadByThreadId"); + pNtWaitForAlertByThreadId = (void *)GetProcAddress(ntdll, "NtWaitForAlertByThreadId"); + } + + if (!pNtAlertThreadByThreadId && !(cs->event = CreateEventW(NULL, FALSE, FALSE, NULL))) { ERR("Failed to create command stream event.\n"); heap_free(cs->data); @@ -3547,7 +3566,8 @@ struct wined3d_cs *wined3d_cs_create(struct wined3d_device *device, { ERR("Failed to get wined3d module handle.\n"); CloseHandle(cs->present_event); - CloseHandle(cs->event); + if (cs->event) + CloseHandle(cs->event); heap_free(cs->data); goto fail; } @@ -3557,7 +3577,8 @@ struct wined3d_cs *wined3d_cs_create(struct wined3d_device *device, ERR("Failed to create wined3d command stream thread.\n"); FreeLibrary(cs->wined3d_module); CloseHandle(cs->present_event); - CloseHandle(cs->event); + if (cs->event) + CloseHandle(cs->event); heap_free(cs->data); goto fail; } @@ -3581,7 +3602,7 @@ void wined3d_cs_destroy(struct wined3d_cs *cs) CloseHandle(cs->thread); if (!CloseHandle(cs->present_event)) ERR("Closing present event failed.\n"); - if (!CloseHandle(cs->event)) + if (cs->event && !CloseHandle(cs->event)) ERR("Closing event failed.\n"); }
From: Matteo Bruni mbruni@codeweavers.com
Make sure we poll queries before we go to sleep, e.g. to avoid hangs in Rocket League.
With significant contributions by Zebediah Figura. --- dlls/wined3d/cs.c | 16 +++++++++++----- dlls/wined3d/wined3d_private.h | 2 ++ 2 files changed, 13 insertions(+), 5 deletions(-)
diff --git a/dlls/wined3d/cs.c b/dlls/wined3d/cs.c index edc2feeaab2..61ea75c2776 100644 --- a/dlls/wined3d/cs.c +++ b/dlls/wined3d/cs.c @@ -3322,6 +3322,12 @@ static void poll_queries(struct wined3d_cs *cs)
static void wined3d_cs_wait_event(struct wined3d_cs *cs) { + static const LARGE_INTEGER query_timeout = {.QuadPart = WINED3D_CS_COMMAND_WAIT_WITH_QUERIES_TIMEOUT * -10}; + const LARGE_INTEGER *timeout = NULL; + + if (!list_empty(&cs->query_poll_list)) + timeout = &query_timeout; + InterlockedExchange(&cs->waiting_for_event, TRUE);
/* The main thread might have enqueued a command and blocked on it after @@ -3337,9 +3343,9 @@ static void wined3d_cs_wait_event(struct wined3d_cs *cs) return;
if (pNtWaitForAlertByThreadId) - pNtWaitForAlertByThreadId(NULL, NULL); + pNtWaitForAlertByThreadId(NULL, timeout); else - WaitForSingleObject(cs->event, INFINITE); + NtWaitForSingleObject(cs->event, FALSE, timeout); }
static void wined3d_cs_command_lock(const struct wined3d_cs *cs) @@ -3451,10 +3457,10 @@ static DWORD WINAPI wined3d_cs_run(void *ctx) YieldProcessor(); if (++spin_count >= WINED3D_CS_SPIN_COUNT) { - if (list_empty(&cs->query_poll_list)) - wined3d_cs_wait_event(cs); + if (poll) + poll = WINED3D_CS_QUERY_POLL_INTERVAL - 1; else - Sleep(0); + wined3d_cs_wait_event(cs); } continue; } diff --git a/dlls/wined3d/wined3d_private.h b/dlls/wined3d/wined3d_private.h index ea78312fef3..494d26ebd02 100644 --- a/dlls/wined3d/wined3d_private.h +++ b/dlls/wined3d/wined3d_private.h @@ -3564,6 +3564,8 @@ enum wined3d_cs_queue_id #define WINED3D_CS_QUEUE_SIZE 0x400000u #endif #define WINED3D_CS_SPIN_COUNT 2000u +/* How long to wait for commands when there are active queries, in µs. */ +#define WINED3D_CS_COMMAND_WAIT_WITH_QUERIES_TIMEOUT 100 #define WINED3D_CS_QUEUE_MASK (WINED3D_CS_QUEUE_SIZE - 1)
C_ASSERT(!(WINED3D_CS_QUEUE_SIZE & (WINED3D_CS_QUEUE_SIZE - 1)));
From: Zebediah Figura zfigura@codeweavers.com
--- v2: Use separate defines for client side timeout and spin count, adjust values. --- dlls/wined3d/cs.c | 3 ++- dlls/wined3d/wined3d_private.h | 15 ++++++++++++++- 2 files changed, 16 insertions(+), 2 deletions(-)
diff --git a/dlls/wined3d/cs.c b/dlls/wined3d/cs.c index 61ea75c2776..b704b720cd0 100644 --- a/dlls/wined3d/cs.c +++ b/dlls/wined3d/cs.c @@ -3284,12 +3284,13 @@ static void *wined3d_cs_mt_require_space(struct wined3d_device_context *context, static void wined3d_cs_mt_finish(struct wined3d_device_context *context, enum wined3d_cs_queue_id queue_id) { struct wined3d_cs *cs = wined3d_cs_from_context(context); + unsigned int spin_count = 0;
if (cs->thread_id == GetCurrentThreadId()) return wined3d_cs_st_finish(context, queue_id);
while (cs->queue[queue_id].head != *(volatile ULONG *)&cs->queue[queue_id].tail) - YieldProcessor(); + wined3d_pause(&spin_count); }
static const struct wined3d_device_context_ops wined3d_cs_mt_ops = diff --git a/dlls/wined3d/wined3d_private.h b/dlls/wined3d/wined3d_private.h index 494d26ebd02..64b933b2b45 100644 --- a/dlls/wined3d/wined3d_private.h +++ b/dlls/wined3d/wined3d_private.h @@ -3566,6 +3566,8 @@ enum wined3d_cs_queue_id #define WINED3D_CS_SPIN_COUNT 2000u /* How long to wait for commands when there are active queries, in µs. */ #define WINED3D_CS_COMMAND_WAIT_WITH_QUERIES_TIMEOUT 100 +/* How long to wait for the CS from the client thread, in µs. */ +#define WINED3D_CS_CLIENT_WAIT_TIMEOUT 0 #define WINED3D_CS_QUEUE_MASK (WINED3D_CS_QUEUE_SIZE - 1)
C_ASSERT(!(WINED3D_CS_QUEUE_SIZE & (WINED3D_CS_QUEUE_SIZE - 1))); @@ -3758,6 +3760,16 @@ static inline void wined3d_resource_reference(struct wined3d_resource *resource) resource->access_time = cs->queue[WINED3D_CS_QUEUE_DEFAULT].head; }
+#define WINED3D_PAUSE_SPIN_COUNT 200u + +static inline void wined3d_pause(unsigned int *spin_count) +{ + static const LARGE_INTEGER timeout = {.QuadPart = WINED3D_CS_CLIENT_WAIT_TIMEOUT * -10}; + + if (++*spin_count >= WINED3D_PAUSE_SPIN_COUNT) + NtDelayExecution(FALSE, &timeout); +} + static inline BOOL wined3d_ge_wrap(ULONG x, ULONG y) { return (x - y) < UINT_MAX / 2; @@ -3768,6 +3780,7 @@ static inline void wined3d_resource_wait_idle(const struct wined3d_resource *res { const struct wined3d_cs *cs = resource->device->cs; ULONG access_time, tail, head; + unsigned int spin_count = 0;
if (!cs->thread || cs->thread_id == GetCurrentThreadId()) return; @@ -3806,7 +3819,7 @@ static inline void wined3d_resource_wait_idle(const struct wined3d_resource *res if (!wined3d_ge_wrap(access_time, tail) && access_time != tail) break;
- YieldProcessor(); + wined3d_pause(&spin_count); } }
From: Matteo Bruni mbruni@codeweavers.com
--- This is kind of a stopgap; we're still potentially polling very often. We probably want to change the polling scheme entirely, perhaps to poll just after a PRESENT or FLUSH (maybe other commands as well), or to make the polling time-based. --- dlls/wined3d/wined3d_private.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/dlls/wined3d/wined3d_private.h b/dlls/wined3d/wined3d_private.h index 64b933b2b45..26474cecb07 100644 --- a/dlls/wined3d/wined3d_private.h +++ b/dlls/wined3d/wined3d_private.h @@ -3557,7 +3557,7 @@ enum wined3d_cs_queue_id WINED3D_CS_QUEUE_COUNT, };
-#define WINED3D_CS_QUERY_POLL_INTERVAL 10u +#define WINED3D_CS_QUERY_POLL_INTERVAL 100u #if defined(_WIN64) #define WINED3D_CS_QUEUE_SIZE 0x1000000u #else
From: Matteo Bruni mbruni@codeweavers.com
--- v2: Move code to wined3d_cs_exec_present(), data to struct wined3d_swapchain. --- dlls/wined3d/cs.c | 18 ++++++++++++++++++ dlls/wined3d/wined3d_private.h | 4 +++- 2 files changed, 21 insertions(+), 1 deletion(-)
diff --git a/dlls/wined3d/cs.c b/dlls/wined3d/cs.c index b704b720cd0..4d34a939ac7 100644 --- a/dlls/wined3d/cs.c +++ b/dlls/wined3d/cs.c @@ -23,6 +23,7 @@ WINE_DEFAULT_DEBUG_CHANNEL(d3d); WINE_DECLARE_DEBUG_CHANNEL(d3d_perf); WINE_DECLARE_DEBUG_CHANNEL(d3d_sync); WINE_DECLARE_DEBUG_CHANNEL(fps); +WINE_DECLARE_DEBUG_CHANNEL(frametime);
static NTSTATUS (WINAPI *pNtAlertThreadByThreadId)(HANDLE tid); static NTSTATUS (WINAPI *pNtWaitForAlertByThreadId)(void *addr, const LARGE_INTEGER *timeout); @@ -643,11 +644,18 @@ static void wined3d_cs_exec_nop(struct wined3d_cs *cs, const void *data)
static void wined3d_cs_exec_present(struct wined3d_cs *cs, const void *data) { + static LARGE_INTEGER freq; + struct wined3d_texture *logo_texture, *cursor_texture, *back_buffer; struct wined3d_rendertarget_view *dsv = cs->state.fb.depth_stencil; const struct wined3d_cs_present *op = data; const struct wined3d_swapchain_desc *desc; struct wined3d_swapchain *swapchain; + LONGLONG elapsed_time; + LARGE_INTEGER time; + + if (!freq.QuadPart) + QueryPerformanceFrequency(&freq);
swapchain = op->swapchain; desc = &swapchain->state.desc; @@ -703,6 +711,16 @@ static void wined3d_cs_exec_present(struct wined3d_cs *cs, const void *data) wined3d_rendertarget_view_validate_location(dsv, WINED3D_LOCATION_DISCARDED); }
+ if (TRACE_ON(frametime)) + { + QueryPerformanceCounter(&time); + if (swapchain->last_present_time.QuadPart) + { + elapsed_time = time.QuadPart - swapchain->last_present_time.QuadPart; + TRACE_(frametime)("Frame duration %u μs.\n", (unsigned int)(elapsed_time * 1000000 / freq.QuadPart)); + } + swapchain->last_present_time = time; + } if (TRACE_ON(fps)) { DWORD time = GetTickCount(); diff --git a/dlls/wined3d/wined3d_private.h b/dlls/wined3d/wined3d_private.h index 26474cecb07..c17e29cddf2 100644 --- a/dlls/wined3d/wined3d_private.h +++ b/dlls/wined3d/wined3d_private.h @@ -4029,7 +4029,9 @@ struct wined3d_swapchain unsigned int swap_interval; unsigned int max_frame_latency;
- LONG prev_time, frames; /* Performance tracking */ + /* Performance tracking */ + LARGE_INTEGER last_present_time; + LONG prev_time, frames;
struct wined3d_swapchain_state state; HWND win_handle;
From: Matteo Bruni mbruni@codeweavers.com
--- These can be particularly useful in combination with the +timestamp debug channel. --- dlls/wined3d/cs.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/dlls/wined3d/cs.c b/dlls/wined3d/cs.c index 4d34a939ac7..0f5d77d93c9 100644 --- a/dlls/wined3d/cs.c +++ b/dlls/wined3d/cs.c @@ -1128,6 +1128,7 @@ static void wined3d_cs_exec_flush(struct wined3d_cs *cs, const void *data) { struct wined3d_context *context;
+ TRACE_(d3d_perf)("Flushing adapter.\n"); context = context_acquire(cs->c.device, NULL, 0); cs->c.device->adapter->adapter_ops->adapter_flush_context(context); context_release(context); @@ -2475,6 +2476,8 @@ HRESULT wined3d_device_context_emit_map(struct wined3d_device_context *context, return WINED3D_OK; }
+ TRACE_(d3d_perf)("Mapping resource %p (type %u), flags %#x through the CS.\n", resource, resource->type, flags); + wined3d_resource_wait_idle(resource);
/* We might end up invalidating the resource on the CS thread. */ @@ -2527,6 +2530,8 @@ HRESULT wined3d_device_context_emit_unmap(struct wined3d_device_context *context
wined3d_not_from_cs(context->device->cs);
+ TRACE_(d3d_perf)("Unmapping resource %p (type %u) through the CS.\n", resource, resource->type); + if (!(op = wined3d_device_context_require_space(context, sizeof(*op), WINED3D_CS_QUEUE_MAP))) return E_OUTOFMEMORY; op->opcode = WINED3D_CS_OP_UNMAP; @@ -3279,7 +3284,7 @@ static void *wined3d_cs_queue_require_space(struct wined3d_cs_queue *queue, size if (new_pos < tail && new_pos) break;
- TRACE("Waiting for free space. Head %lu, tail %lu, packet size %Iu.\n", + TRACE_(d3d_perf)("Waiting for free space. Head %lu, tail %lu, packet size %Iu.\n", head, tail, packet_size); }
@@ -3307,8 +3312,10 @@ static void wined3d_cs_mt_finish(struct wined3d_device_context *context, enum wi if (cs->thread_id == GetCurrentThreadId()) return wined3d_cs_st_finish(context, queue_id);
+ TRACE_(d3d_perf)("Waiting for queue %u to be empty.\n", queue_id); while (cs->queue[queue_id].head != *(volatile ULONG *)&cs->queue[queue_id].tail) wined3d_pause(&spin_count); + TRACE_(d3d_perf)("Queue is now empty.\n"); }
static const struct wined3d_device_context_ops wined3d_cs_mt_ops =
On Thu Dec 7 18:02:04 2023 +0000, Matteo Bruni wrote:
changed this line in [version 2 of the diff](/wine/wine/-/merge_requests/4625/diffs?diff_id=88745&start_sha=82b4f90bee044508302b7d5317a7deaf752a7660#b654b6b4d7e5ff6c5aeb9e514a6b99ee5eb392e9_3334_3351)
It turns out I had the wined3d_pause() timeout effectively set to 10 µs for a long time, until a very recent cleanup round (aka math is hard). So I restored the old value for now, but I also strongly suspect that a 0 timeout (i.e. yielding) wouldn't make much of a difference either. I'm retesting stuff to confirm but I'd be surprised to find otherwise.
On Thu Dec 7 18:02:07 2023 +0000, Matteo Bruni wrote:
changed this line in [version 2 of the diff](/wine/wine/-/merge_requests/4625/diffs?diff_id=88745&start_sha=82b4f90bee044508302b7d5317a7deaf752a7660#d70e015f44f0966173d9b167c78e603ab0ad15bd_3767_3769)
As above, this was actually much lower originally (200 vs. 2000, the latter being the current value of WINED3D_CS_SPIN_COUNT). I adjusted it back in the current version. I'm going to do some further ad-hoc testing to double-check, but again I doubt it makes much of a difference in practice.
This merge request was approved by Zebediah Figura.
This merge request was approved by Jan Sikorski.