From: Paul Gofman pgofman@codeweavers.com
https://bugs.winehq.org/show_bug.cgi?id=59017 --- dlls/ntdll/sync.c | 33 ++++++++++++++++++++++++++++----- dlls/ntdll/tests/sync.c | 2 +- 2 files changed, 29 insertions(+), 6 deletions(-)
diff --git a/dlls/ntdll/sync.c b/dlls/ntdll/sync.c index 66d376b0154..18e61e47368 100644 --- a/dlls/ntdll/sync.c +++ b/dlls/ntdll/sync.c @@ -1383,6 +1383,7 @@ struct barrier_impl LONG total_thread_count; volatile LONG reached_thread_count; volatile LONG structure_lock_count; + volatile LONG waiting_thread_count; volatile LONG wait_barrier_complete; };
@@ -1402,6 +1403,7 @@ NTSTATUS WINAPI RtlInitBarrier( RTL_BARRIER *barrier, LONG thread_count, LONG sp b->spin_count = spin_count; b->reached_thread_count = 0; b->structure_lock_count = 0; + b->waiting_thread_count = 0; b->wait_barrier_complete = 0; return STATUS_SUCCESS; } @@ -1446,11 +1448,28 @@ BOOLEAN WINAPI RtlBarrier( RTL_BARRIER *barrier, ULONG flags ) if (flags & ~0x10000 && !once++) FIXME( "Unknown flags %#lx.\n", flags ); if (!barrier) return FALSE;
- if (ReadAcquire( &b->reached_thread_count ) >= b->total_thread_count) return TRUE; - /* Incrementing reached_thread_count may trigger RTL_BARRIER data desrtuction from another thread, * so lock RtlDeleteBarrier with waiting_thread_count before that. */ if (flags & 0x10000) InterlockedIncrement( &b->structure_lock_count ); + + /* On Windows the long wait doesn't consume CPU with any spin count, so probably the spin count + * is limited or not used at all. */ + spin_count = min( 2000, (unsigned int)b->spin_count ); + + /* Wait for previous wait iteration to complete. */ + count = 0; + while (ReadAcquire( &b->reached_thread_count ) == b->total_thread_count) + { + if (count < spin_count) + { + ++count; + YieldProcessor(); + continue; + } + RtlWaitOnAddress( (void *)&b->reached_thread_count, &b->total_thread_count, + sizeof(b->reached_thread_count), NULL ); + } + InterlockedIncrement( &b->waiting_thread_count ); if (InterlockedIncrement( &b->reached_thread_count ) == b->total_thread_count) { WriteRelease( &b->wait_barrier_complete, 1 ); @@ -1458,9 +1477,6 @@ BOOLEAN WINAPI RtlBarrier( RTL_BARRIER *barrier, ULONG flags ) ret = TRUE; goto done; } - /* On Windows the long wait doesn't consume CPU with any spin count, so probably the spin count - * is limited or not used at all. */ - spin_count = min( 2000, (unsigned int)b->spin_count ); count = 0; while (ReadAcquire( &b->reached_thread_count ) < b->total_thread_count ) { @@ -1474,6 +1490,13 @@ BOOLEAN WINAPI RtlBarrier( RTL_BARRIER *barrier, ULONG flags ) }
done: + if (!InterlockedDecrement( &b->waiting_thread_count )) + { + WriteRelease( &b->wait_barrier_complete, 0 ); + WriteRelease( &b->reached_thread_count, 0 ); + RtlWakeAddressAll( (const void *)&b->reached_thread_count ); + } + if (flags & 0x10000 && !InterlockedDecrement( &b->structure_lock_count )) { /* Now RTL_BARRIER structure contents may become invalid. Signaling on its address should be fine, the worst diff --git a/dlls/ntdll/tests/sync.c b/dlls/ntdll/tests/sync.c index 1d5a3c7ccd7..dcbb8b18c3f 100644 --- a/dlls/ntdll/tests/sync.c +++ b/dlls/ntdll/tests/sync.c @@ -1348,7 +1348,7 @@ static void test_barrier(void) WaitForSingleObject( threads[i], INFINITE ); CloseHandle( threads[i] ); } - todo_wine ok( true_ret_count == p.iter_count, "got %ld.\n", true_ret_count ); + ok( true_ret_count == p.iter_count, "got %ld.\n", true_ret_count );
/* Normal case. */ status = pRtlInitBarrier( &barrier, p.thread_count, -1 );