From: Zebediah Figura zfigura@codeweavers.com
--- dlls/wined3d/cs.c | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-)
diff --git a/dlls/wined3d/cs.c b/dlls/wined3d/cs.c index bf171326926..edc2feeaab2 100644 --- a/dlls/wined3d/cs.c +++ b/dlls/wined3d/cs.c @@ -24,6 +24,9 @@ WINE_DECLARE_DEBUG_CHANNEL(d3d_perf); WINE_DECLARE_DEBUG_CHANNEL(d3d_sync); WINE_DECLARE_DEBUG_CHANNEL(fps);
+static NTSTATUS (WINAPI *pNtAlertThreadByThreadId)(HANDLE tid); +static NTSTATUS (WINAPI *pNtWaitForAlertByThreadId)(void *addr, const LARGE_INTEGER *timeout); + #define WINED3D_INITIAL_CS_SIZE 4096
struct wined3d_deferred_upload @@ -3188,7 +3191,12 @@ static void wined3d_cs_queue_submit(struct wined3d_cs_queue *queue, struct wined InterlockedExchange((LONG *)&queue->head, queue->head + packet_size);
if (InterlockedCompareExchange(&cs->waiting_for_event, FALSE, TRUE)) - SetEvent(cs->event); + { + if (pNtAlertThreadByThreadId) + pNtAlertThreadByThreadId((HANDLE)(ULONG_PTR)cs->thread_id); + else + SetEvent(cs->event); + } }
static void wined3d_cs_mt_submit(struct wined3d_device_context *context, enum wined3d_cs_queue_id queue_id) @@ -3328,7 +3336,10 @@ static void wined3d_cs_wait_event(struct wined3d_cs *cs) && InterlockedCompareExchange(&cs->waiting_for_event, FALSE, TRUE)) return;
- WaitForSingleObject(cs->event, INFINITE); + if (pNtWaitForAlertByThreadId) + pNtWaitForAlertByThreadId(NULL, NULL); + else + WaitForSingleObject(cs->event, INFINITE); }
static void wined3d_cs_command_lock(const struct wined3d_cs *cs) @@ -3529,7 +3540,15 @@ struct wined3d_cs *wined3d_cs_create(struct wined3d_device *device, { cs->c.ops = &wined3d_cs_mt_ops;
- if (!(cs->event = CreateEventW(NULL, FALSE, FALSE, NULL))) + if (!pNtAlertThreadByThreadId) + { + HANDLE ntdll = GetModuleHandleW(L"ntdll.dll"); + + pNtAlertThreadByThreadId = (void *)GetProcAddress(ntdll, "NtAlertThreadByThreadId"); + pNtWaitForAlertByThreadId = (void *)GetProcAddress(ntdll, "NtWaitForAlertByThreadId"); + } + + if (!pNtAlertThreadByThreadId && !(cs->event = CreateEventW(NULL, FALSE, FALSE, NULL))) { ERR("Failed to create command stream event.\n"); heap_free(cs->data); @@ -3547,7 +3566,8 @@ struct wined3d_cs *wined3d_cs_create(struct wined3d_device *device, { ERR("Failed to get wined3d module handle.\n"); CloseHandle(cs->present_event); - CloseHandle(cs->event); + if (cs->event) + CloseHandle(cs->event); heap_free(cs->data); goto fail; } @@ -3557,7 +3577,8 @@ struct wined3d_cs *wined3d_cs_create(struct wined3d_device *device, ERR("Failed to create wined3d command stream thread.\n"); FreeLibrary(cs->wined3d_module); CloseHandle(cs->present_event); - CloseHandle(cs->event); + if (cs->event) + CloseHandle(cs->event); heap_free(cs->data); goto fail; } @@ -3581,7 +3602,7 @@ void wined3d_cs_destroy(struct wined3d_cs *cs) CloseHandle(cs->thread); if (!CloseHandle(cs->present_event)) ERR("Closing present event failed.\n"); - if (!CloseHandle(cs->event)) + if (cs->event && !CloseHandle(cs->event)) ERR("Closing event failed.\n"); }
From: Matteo Bruni mbruni@codeweavers.com
Make sure we poll queries before we go to sleep, to e.g. avoid hangs in Rocket League.
With significant contributions by Zebediah Figura. --- dlls/wined3d/cs.c | 16 +++++++++++----- dlls/wined3d/wined3d_private.h | 2 ++ 2 files changed, 13 insertions(+), 5 deletions(-)
diff --git a/dlls/wined3d/cs.c b/dlls/wined3d/cs.c index edc2feeaab2..59bcd67cb2f 100644 --- a/dlls/wined3d/cs.c +++ b/dlls/wined3d/cs.c @@ -3322,6 +3322,12 @@ static void poll_queries(struct wined3d_cs *cs)
static void wined3d_cs_wait_event(struct wined3d_cs *cs) { + static const LARGE_INTEGER query_timeout = {.QuadPart = WINED3D_CS_WAIT_TIMEOUT * -10}; + const LARGE_INTEGER *timeout = NULL; + + if (!list_empty(&cs->query_poll_list)) + timeout = &query_timeout; + InterlockedExchange(&cs->waiting_for_event, TRUE);
/* The main thread might have enqueued a command and blocked on it after @@ -3337,9 +3343,9 @@ static void wined3d_cs_wait_event(struct wined3d_cs *cs) return;
if (pNtWaitForAlertByThreadId) - pNtWaitForAlertByThreadId(NULL, NULL); + pNtWaitForAlertByThreadId(NULL, timeout); else - WaitForSingleObject(cs->event, INFINITE); + NtWaitForSingleObject(cs->event, FALSE, timeout); }
static void wined3d_cs_command_lock(const struct wined3d_cs *cs) @@ -3451,10 +3457,10 @@ static DWORD WINAPI wined3d_cs_run(void *ctx) YieldProcessor(); if (++spin_count >= WINED3D_CS_SPIN_COUNT) { - if (list_empty(&cs->query_poll_list)) - wined3d_cs_wait_event(cs); + if (poll) + poll = WINED3D_CS_QUERY_POLL_INTERVAL - 1; else - Sleep(0); + wined3d_cs_wait_event(cs); } continue; } diff --git a/dlls/wined3d/wined3d_private.h b/dlls/wined3d/wined3d_private.h index ea78312fef3..e812e5846e0 100644 --- a/dlls/wined3d/wined3d_private.h +++ b/dlls/wined3d/wined3d_private.h @@ -3564,6 +3564,8 @@ enum wined3d_cs_queue_id #define WINED3D_CS_QUEUE_SIZE 0x400000u #endif #define WINED3D_CS_SPIN_COUNT 2000u +/* How long to block waiting, in µs. */ +#define WINED3D_CS_WAIT_TIMEOUT 100 #define WINED3D_CS_QUEUE_MASK (WINED3D_CS_QUEUE_SIZE - 1)
C_ASSERT(!(WINED3D_CS_QUEUE_SIZE & (WINED3D_CS_QUEUE_SIZE - 1)));
From: Zebediah Figura zfigura@codeweavers.com
--- dlls/wined3d/cs.c | 3 ++- dlls/wined3d/wined3d_private.h | 11 ++++++++++- 2 files changed, 12 insertions(+), 2 deletions(-)
diff --git a/dlls/wined3d/cs.c b/dlls/wined3d/cs.c index 59bcd67cb2f..0b7cae2d90b 100644 --- a/dlls/wined3d/cs.c +++ b/dlls/wined3d/cs.c @@ -3284,12 +3284,13 @@ static void *wined3d_cs_mt_require_space(struct wined3d_device_context *context, static void wined3d_cs_mt_finish(struct wined3d_device_context *context, enum wined3d_cs_queue_id queue_id) { struct wined3d_cs *cs = wined3d_cs_from_context(context); + unsigned int spin_count = 0;
if (cs->thread_id == GetCurrentThreadId()) return wined3d_cs_st_finish(context, queue_id);
while (cs->queue[queue_id].head != *(volatile ULONG *)&cs->queue[queue_id].tail) - YieldProcessor(); + wined3d_pause(&spin_count); }
static const struct wined3d_device_context_ops wined3d_cs_mt_ops = diff --git a/dlls/wined3d/wined3d_private.h b/dlls/wined3d/wined3d_private.h index e812e5846e0..1826a7ad5ce 100644 --- a/dlls/wined3d/wined3d_private.h +++ b/dlls/wined3d/wined3d_private.h @@ -3758,6 +3758,14 @@ static inline void wined3d_resource_reference(struct wined3d_resource *resource) resource->access_time = cs->queue[WINED3D_CS_QUEUE_DEFAULT].head; }
+static inline void wined3d_pause(unsigned int *spin_count) +{ + static const LARGE_INTEGER timeout = {.QuadPart = WINED3D_CS_WAIT_TIMEOUT * -10}; + + if (++*spin_count >= WINED3D_CS_SPIN_COUNT) + NtDelayExecution(FALSE, &timeout); +} + static inline BOOL wined3d_ge_wrap(ULONG x, ULONG y) { return (x - y) < UINT_MAX / 2; @@ -3768,6 +3776,7 @@ static inline void wined3d_resource_wait_idle(const struct wined3d_resource *res { const struct wined3d_cs *cs = resource->device->cs; ULONG access_time, tail, head; + unsigned int spin_count = 0;
if (!cs->thread || cs->thread_id == GetCurrentThreadId()) return; @@ -3806,7 +3815,7 @@ static inline void wined3d_resource_wait_idle(const struct wined3d_resource *res if (!wined3d_ge_wrap(access_time, tail) && access_time != tail) break;
- YieldProcessor(); + wined3d_pause(&spin_count); } }
From: Matteo Bruni mbruni@codeweavers.com
--- This is kind of a stopgap, we're still potentially polling very often. We probably want to change the polling scheme entirely, perhaps to poll just after a PRESENT or FLUSH (maybe other commands as well), or make the polling time-based. --- dlls/wined3d/wined3d_private.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/dlls/wined3d/wined3d_private.h b/dlls/wined3d/wined3d_private.h index 1826a7ad5ce..2f74ca17e2d 100644 --- a/dlls/wined3d/wined3d_private.h +++ b/dlls/wined3d/wined3d_private.h @@ -3557,7 +3557,7 @@ enum wined3d_cs_queue_id WINED3D_CS_QUEUE_COUNT, };
-#define WINED3D_CS_QUERY_POLL_INTERVAL 10u +#define WINED3D_CS_QUERY_POLL_INTERVAL 100u #if defined(_WIN64) #define WINED3D_CS_QUEUE_SIZE 0x1000000u #else
From: Matteo Bruni mbruni@codeweavers.com
--- dlls/wined3d/cs.c | 18 ++++++++++++++++++ dlls/wined3d/wined3d_private.h | 2 ++ 2 files changed, 20 insertions(+)
diff --git a/dlls/wined3d/cs.c b/dlls/wined3d/cs.c index 0b7cae2d90b..0d5f5578577 100644 --- a/dlls/wined3d/cs.c +++ b/dlls/wined3d/cs.c @@ -23,6 +23,7 @@ WINE_DEFAULT_DEBUG_CHANNEL(d3d); WINE_DECLARE_DEBUG_CHANNEL(d3d_perf); WINE_DECLARE_DEBUG_CHANNEL(d3d_sync); WINE_DECLARE_DEBUG_CHANNEL(fps); +WINE_DECLARE_DEBUG_CHANNEL(frametime);
static NTSTATUS (WINAPI *pNtAlertThreadByThreadId)(HANDLE tid); static NTSTATUS (WINAPI *pNtWaitForAlertByThreadId)(void *addr, const LARGE_INTEGER *timeout); @@ -3363,9 +3364,16 @@ static void wined3d_cs_command_unlock(const struct wined3d_cs *cs)
static inline bool wined3d_cs_execute_next(struct wined3d_cs *cs, struct wined3d_cs_queue *queue) { + static LARGE_INTEGER freq; + struct wined3d_cs_packet *packet; enum wined3d_cs_op opcode; SIZE_T tail; + LARGE_INTEGER time1; + LONGLONG total_time; + + if (!freq.QuadPart) + QueryPerformanceFrequency(&freq);
tail = queue->tail; packet = wined3d_next_cs_packet(queue->data, &tail, WINED3D_CS_QUEUE_MASK); @@ -3386,6 +3394,16 @@ static inline bool wined3d_cs_execute_next(struct wined3d_cs *cs, struct wined3d wined3d_cs_op_handlers[opcode](cs, packet->data); wined3d_cs_command_unlock(cs); TRACE("%s at %p executed.\n", debug_cs_op(opcode), packet); + if (TRACE_ON(frametime) && opcode == WINED3D_CS_OP_PRESENT) + { + QueryPerformanceCounter(&time1); + if (cs->last_present_time.QuadPart) + { + total_time = time1.QuadPart - cs->last_present_time.QuadPart; + TRACE_(frametime)("Frame duration %u μs.\n", (unsigned int)(total_time * 1000000 / freq.QuadPart)); + } + cs->last_present_time = time1; + } }
InterlockedExchange((LONG *)&queue->tail, tail); diff --git a/dlls/wined3d/wined3d_private.h b/dlls/wined3d/wined3d_private.h index 2f74ca17e2d..27c8a09f106 100644 --- a/dlls/wined3d/wined3d_private.h +++ b/dlls/wined3d/wined3d_private.h @@ -3619,6 +3619,8 @@ struct wined3d_cs LONG waiting_for_event; LONG waiting_for_present; LONG pending_presents; + + LARGE_INTEGER last_present_time; };
static inline void wined3d_device_context_lock(struct wined3d_device_context *context)
From: Matteo Bruni mbruni@codeweavers.com
--- These can be particularly useful in combination with the +timestamp debug channel. --- dlls/wined3d/cs.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/dlls/wined3d/cs.c b/dlls/wined3d/cs.c index 0d5f5578577..571aebd1af3 100644 --- a/dlls/wined3d/cs.c +++ b/dlls/wined3d/cs.c @@ -1111,6 +1111,7 @@ static void wined3d_cs_exec_flush(struct wined3d_cs *cs, const void *data) { struct wined3d_context *context;
+ TRACE_(d3d_perf)("Flushing adapter.\n"); context = context_acquire(cs->c.device, NULL, 0); cs->c.device->adapter->adapter_ops->adapter_flush_context(context); context_release(context); @@ -2458,6 +2459,8 @@ HRESULT wined3d_device_context_emit_map(struct wined3d_device_context *context, return WINED3D_OK; }
+ TRACE_(d3d_perf)("Mapping resource %p (type %u), flags %#x through the CS.\n", resource, resource->type, flags); + wined3d_resource_wait_idle(resource);
/* We might end up invalidating the resource on the CS thread. */ @@ -2510,6 +2513,8 @@ HRESULT wined3d_device_context_emit_unmap(struct wined3d_device_context *context
wined3d_not_from_cs(context->device->cs);
+ TRACE_(d3d_perf)("Unmapping resource %p (type %u) through the CS.\n", resource, resource->type); + if (!(op = wined3d_device_context_require_space(context, sizeof(*op), WINED3D_CS_QUEUE_MAP))) return E_OUTOFMEMORY; op->opcode = WINED3D_CS_OP_UNMAP; @@ -3262,7 +3267,7 @@ static void *wined3d_cs_queue_require_space(struct wined3d_cs_queue *queue, size if (new_pos < tail && new_pos) break;
- TRACE("Waiting for free space. Head %lu, tail %lu, packet size %Iu.\n", + TRACE_(d3d_perf)("Waiting for free space. Head %lu, tail %lu, packet size %Iu.\n", head, tail, packet_size); }
@@ -3290,8 +3295,10 @@ static void wined3d_cs_mt_finish(struct wined3d_device_context *context, enum wi if (cs->thread_id == GetCurrentThreadId()) return wined3d_cs_st_finish(context, queue_id);
+ TRACE_(d3d_perf)("Waiting for queue %u to be empty.\n", queue_id); while (cs->queue[queue_id].head != *(volatile ULONG *)&cs->queue[queue_id].tail) wined3d_pause(&spin_count); + TRACE_(d3d_perf)("Queue is now empty.\n"); }
static const struct wined3d_device_context_ops wined3d_cs_mt_ops =
Zebediah Figura (@zfigura) commented about dlls/wined3d/cs.c:
wined3d_cs_op_handlers[opcode](cs, packet->data); wined3d_cs_command_unlock(cs); TRACE("%s at %p executed.\n", debug_cs_op(opcode), packet);
+    if (TRACE_ON(frametime) && opcode == WINED3D_CS_OP_PRESENT)
Wait, why not in wined3d_cs_exec_present(), where the fps code is?
(Also, shouldn't this information be tied to the swapchain, not the CS?)
On Wed Dec 6 21:20:20 2023 +0000, Zebediah Figura wrote:
Wait, why not in wined3d_cs_exec_present(), where the fps code is? (Also, shouldn't this information be tied to the swapchain, not the CS?)
Good question(s). Probably because it was originally part of a much larger trace which included stuff that only made sense inside wined3d_cs_run().
I'll move everything to the proper place :sweat_smile:
Zebediah Figura (@zfigura) commented about dlls/wined3d/cs.c:
static void wined3d_cs_wait_event(struct wined3d_cs *cs) {
+    static const LARGE_INTEGER query_timeout = {.QuadPart = WINED3D_CS_WAIT_TIMEOUT * -10};
It may be worth being aware that this won't actually work on Windows, or, well, it'll give you a wait at least as long as one interrupt timer tick, which is no shorter than 1 ms. I... *think* Linux can do shorter waits, although it's not particularly easy to find this out from research, and I don't know how much of a concern it is if this is only a problem on Windows.
I'm also not sure how much of a concern it is to poll for queries only every 1 ms. I guess cases where there are active queries but the CS is sleeping for more than 100 µs are going to be rare anyway (or symptomatic of some client-side overhead that should be fixed)?
Zebediah Figura (@zfigura) commented about dlls/wined3d/wined3d_private.h:
resource->access_time = cs->queue[WINED3D_CS_QUEUE_DEFAULT].head;
}
+static inline void wined3d_pause(unsigned int *spin_count) +{
+    static const LARGE_INTEGER timeout = {.QuadPart = WINED3D_CS_WAIT_TIMEOUT * -10};
+    if (++*spin_count >= WINED3D_CS_SPIN_COUNT)
Using WINED3D_CS_SPIN_COUNT here is a bit weird, since it's a wait on the other thread and I don't immediately see why we'd want to spin a comparable number of times. Defining a new symbolic constant with the same value may honestly be better in terms of clarity?
On Wed Dec 6 21:26:53 2023 +0000, Zebediah Figura wrote:
Using WINED3D_CS_SPIN_COUNT here is a bit weird, since it's a wait on the other thread and I don't immediately see why we'd want to spin a comparable number of times. Defining a new symbolic constant with the same value may honestly be better in terms of clarity?
I had the same doubt, in fact. Will do.
On Wed Dec 6 21:25:09 2023 +0000, Zebediah Figura wrote:
It may be worth being aware that this won't actually work on Windows, or, well, it'll give you a wait at least as long as one interrupt timer tick, which is no shorter than 1 ms. I... *think* Linux can do shorter waits, although it's not particularly easy to find this out from research, and I don't know how much of a concern it is if this is only a problem on Windows. I'm also not sure how much of a concern it is to poll for queries only every 1 ms. I guess cases where there are active queries but the CS is sleeping for more than 100 µs are going to be rare anyway (or symptomatic of some client-side overhead that should be fixed)?
I'm pretty sure that Linux can do these shorter sleeps just fine (with the understanding that it's only going to be a lower bound anyway). AFAIK recent Windows can actually go lower than 1 ms as well, although it's not that simple because of course it isn't. I remember reading some blog post with all the interesting details but I can't find it now (I'll keep looking).
> I'm also not sure how much of a concern it is to poll for queries only every 1 ms. I guess cases where there are active queries but the CS is sleeping for more than 100 µs are going to be rare anyway (or symptomatic of some client-side overhead that should be fixed)?
I think that's usually the case. The timeout was originally 1 ms and I didn't encounter obvious issues while testing (of course my testing was also necessarily limited). I guess the question is whether trying to go with this and possibly finding out the hard way is what we want to do for this one. There's also the option of yielding instead (i.e. 0 timeout), which maybe is safer in this sense, but also potentially not as good.
I should also mention that the following patch uses the same timeout but for client-side waits, and for those yielding is probably less of a problem (i.e. there's a good chance that the CS thread or some other thread would use the CPU time and we won't actually end up spinning). I could: (1) use a separate timeout for the client side; (2) set it to 0. I think I'll do (1) regardless; let me know how you feel about (2).
> I think that's usually the case. The timeout was originally 1 ms and I didn't encounter obvious issues while testing (of course my testing was also necessarily limited). I guess the question is whether trying to go with this and possibly finding out the hard way is what we want to do for this one.
Perhaps this is rash of me, but the way I see it, if we don't find out the hard way, we're probably not going to find out at all. If there's a clear benefit on the other side, and the patch is bisectable, and we've tested some applications (especially if we know applications that are sensitive to this kind of thing) then I'm inclined to accept it.