That seems to fix some (difficult to reproduce) crashes in EA Desktop / CEF, usually on shutdown but sometimes during starting up.
Currently TpSetWait can set (or clear) the event, while waitqueue_thread_proc() gets woken from NtWaitForMultipleObjects by previously set wait object and call the callback as if new set (or cleared) wait object is signaled. The crashes I was reproducing were always happening when RtlDeregisterWaitEx call is racing with waking the wait and calling the callback.
-- v2: ntdll: Make sure wakeups from already unset events are ignored in waitqueue_thread_proc().
From: Paul Gofman pgofman@codeweavers.com
--- dlls/ntdll/threadpool.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/dlls/ntdll/threadpool.c b/dlls/ntdll/threadpool.c index 4f22114a55e..9a8f380bf72 100644 --- a/dlls/ntdll/threadpool.c +++ b/dlls/ntdll/threadpool.c @@ -1265,6 +1265,7 @@ static void CALLBACK waitqueue_thread_proc( void *param ) u.wait.wait_entry ) { assert( wait->type == TP_OBJECT_TYPE_WAIT ); + assert( wait->u.wait.wait_pending ); if (wait->u.wait.timeout <= now.QuadPart) { /* Wait object timed out. */ @@ -1272,6 +1273,7 @@ static void CALLBACK waitqueue_thread_proc( void *param ) { list_remove( &wait->u.wait.wait_entry ); list_add_tail( &bucket->reserved, &wait->u.wait.wait_entry ); + wait->u.wait.wait_pending = FALSE; } if ((wait->u.wait.flags & (WT_EXECUTEINWAITTHREAD | WT_EXECUTEINIOTHREAD))) { @@ -1329,6 +1331,7 @@ static void CALLBACK waitqueue_thread_proc( void *param ) { list_remove( &wait->u.wait.wait_entry ); list_add_tail( &bucket->reserved, &wait->u.wait.wait_entry ); + wait->u.wait.wait_pending = FALSE; } if ((wait->u.wait.flags & (WT_EXECUTEINWAITTHREAD | WT_EXECUTEINIOTHREAD))) {
From: Paul Gofman pgofman@codeweavers.com
--- dlls/ntdll/threadpool.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-)
diff --git a/dlls/ntdll/threadpool.c b/dlls/ntdll/threadpool.c index 9a8f380bf72..2887e84b12c 100644 --- a/dlls/ntdll/threadpool.c +++ b/dlls/ntdll/threadpool.c @@ -160,6 +160,7 @@ struct threadpool_object LONG num_pending_callbacks; LONG num_running_callbacks; LONG num_associated_callbacks; + LONG update_serial; /* arguments for callback */ union { @@ -1243,6 +1244,7 @@ static void tp_timerqueue_unlock( struct threadpool_object *timer ) static void CALLBACK waitqueue_thread_proc( void *param ) { struct threadpool_object *objects[MAXIMUM_WAITQUEUE_OBJECTS]; + LONG update_serials[MAXIMUM_WAITQUEUE_OBJECTS]; HANDLE handles[MAXIMUM_WAITQUEUE_OBJECTS + 1]; struct waitqueue_bucket *bucket = param; struct threadpool_object *wait, *next; @@ -1295,6 +1297,7 @@ static void CALLBACK waitqueue_thread_proc( void *param ) InterlockedIncrement( &wait->refcount ); objects[num_handles] = wait; handles[num_handles] = wait->u.wait.handle; + update_serials[num_handles] = wait->update_serial; num_handles++; } } @@ -1323,7 +1326,7 @@ static void CALLBACK waitqueue_thread_proc( void *param ) { wait = objects[status - STATUS_WAIT_0]; assert( wait->type == TP_OBJECT_TYPE_WAIT ); - if (wait->u.wait.bucket) + if (wait->u.wait.bucket && wait->update_serial == update_serials[status - STATUS_WAIT_0]) { /* Wait object signaled. */ assert( wait->u.wait.bucket == bucket ); @@ -1344,7 +1347,10 @@ static void CALLBACK waitqueue_thread_proc( void *param ) else tp_object_submit( wait, TRUE ); } else - WARN("wait object %p triggered while object was destroyed\n", wait); + { + WARN("wait object %p triggered while object was %s.\n", + wait, wait->u.wait.bucket ? "updated" : "destroyed"); + } }
/* Release temporary references to wait objects. */ @@ -1917,6 +1923,7 @@ static void tp_object_initialize( struct threadpool_object *object, struct threa object->num_pending_callbacks = 0; object->num_running_callbacks = 0; object->num_associated_callbacks = 0; + object->update_serial = 0;
if (environment) { @@ -3051,12 +3058,15 @@ VOID WINAPI TpSetWait( TP_WAIT *wait, HANDLE handle, LARGE_INTEGER *timeout ) { struct threadpool_object *this = impl_from_TP_WAIT( wait ); ULONGLONG timestamp = MAXLONGLONG; + BOOL same_handle;
TRACE( "%p %p %p\n", wait, handle, timeout );
RtlEnterCriticalSection( &waitqueue.cs );
assert( this->u.wait.bucket ); + + same_handle = this->u.wait.handle == handle; this->u.wait.handle = handle;
if (handle || this->u.wait.wait_pending) @@ -3090,6 +3100,8 @@ VOID WINAPI TpSetWait( TP_WAIT *wait, HANDLE handle, LARGE_INTEGER *timeout ) }
/* Wake up the wait queue thread. */ + if (!same_handle) + ++this->update_serial; NtSetEvent( bucket->update_event, NULL ); }
Sorry, I have no idea how I missed this, despite frequently looking over my list of assigned patches...
It looks correct to me. I don't see any tests, but I assume you tested this behaviour in isolation?
This merge request was approved by Elizabeth Figura.
IIRC I did, Windows doesn't seem to call the wait callbacks after they were actually removed. The race is eagerly reproducible with Proton with esync / fsync because there is no warranty that WaitForMultipleObjects() will be reported woken by the handle which was signaled first (and while object signal could happen after we set update event we'll see the object signaled and execute callback). If there is such warranty with correctly working sync primitives this currently present race is very hard to reproduce because object signal has to happen while we are in TpSetWait(), before NtSetEvent( bucket->update_event, NULL );. And then we should return from TpSetWait app should do something before the callback is called. Still, there is the race and also probably depending on wake up order in multiple wait is not great both for functional and readability reasons.