This comes from a behavioral study of Windows, which doesn't seem to check whether the lock is actually exclusively held, and simply decrements the value stored in the lock.
This fixes a deadlock that prevents WeCom from starting up.
-- v13:
ntdll: An implementation of SRWLOCK that more closely matches Windows'.
include: Add atomic read/write of pointers.
From: Yuxuan Shui <yshui@codeweavers.com>
---
 include/winnt.h | 48 +++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 45 insertions(+), 3 deletions(-)
diff --git a/include/winnt.h b/include/winnt.h
index c04f25b29bd..73c8775c460 100644
--- a/include/winnt.h
+++ b/include/winnt.h
@@ -4244,7 +4244,7 @@ typedef struct _ACL {
 typedef enum _ACL_INFORMATION_CLASS {
-    AclRevisionInformation = 1,
+    AclRevisionInformation = 1,
     AclSizeInformation
 } ACL_INFORMATION_CLASS;
@@ -6911,12 +6911,20 @@ static FORCEINLINE void MemoryBarrier(void)
  */
 #if _MSC_VER >= 1700
 #pragma intrinsic(__iso_volatile_load32)
+#pragma intrinsic(__iso_volatile_load64)
 #pragma intrinsic(__iso_volatile_store32)
+#pragma intrinsic(__iso_volatile_store64)
 #define __WINE_LOAD32_NO_FENCE(src) (__iso_volatile_load32(src))
+#define __WINE_LOAD64_NO_FENCE(src) (__iso_volatile_load64(src))
 #define __WINE_STORE32_NO_FENCE(dest, value) (__iso_volatile_store32(dest, value))
+#define __WINE_STORE64_NO_FENCE(dest, value) (__iso_volatile_store64(dest, value))
 #else  /* _MSC_VER >= 1700 */
-#define __WINE_LOAD32_NO_FENCE(src) (*(src))
-#define __WINE_STORE32_NO_FENCE(dest, value) ((void)(*(dest) = (value)))
+#define __WINE_LOAD_NO_FENCE(src) (*(src))
+#define __WINE_STORE_NO_FENCE(dest, value) ((void)(*(dest) = (value)))
+#define __WINE_LOAD32_NO_FENCE(src) __WINE_LOAD_NO_FENCE(src)
+#define __WINE_LOAD64_NO_FENCE(src) __WINE_LOAD_NO_FENCE(src)
+#define __WINE_STORE32_NO_FENCE(dest, value) __WINE_STORE_NO_FENCE(dest, value)
+#define __WINE_STORE64_NO_FENCE(dest, value) __WINE_STORE_NO_FENCE(dest, value)
 #endif /* _MSC_VER >= 1700 */
 #if defined(__i386__) || defined(__x86_64__)
@@ -6943,6 +6951,18 @@ static FORCEINLINE LONG ReadAcquire( LONG const volatile *src )
     return value;
 }
+static FORCEINLINE void* ReadPointerAcquire( void* const volatile *src )
+{
+    void *value;
+#ifdef _WIN64
+    value = (void *)__WINE_LOAD64_NO_FENCE( (INT64 const volatile *)src );
+#else
+    value = (void *)__WINE_LOAD32_NO_FENCE( (INT32 const volatile *)src );
+#endif
+    __wine_memory_barrier_acq_rel();
+    return value;
+}
+
 static FORCEINLINE LONG ReadNoFence( LONG const volatile *src )
 {
     LONG value = __WINE_LOAD32_NO_FENCE( (int const volatile *)src );
@@ -6955,6 +6975,16 @@ static FORCEINLINE void WriteRelease( LONG volatile *dest, LONG value )
     __WINE_STORE32_NO_FENCE( (int volatile *)dest, value );
 }
+static FORCEINLINE void WritePointerRelease( void* volatile *dest, void* value )
+{
+    __wine_memory_barrier_acq_rel();
+#ifdef _WIN64
+    __WINE_STORE64_NO_FENCE( (INT64 volatile *)dest, (INT64)value );
+#else
+    __WINE_STORE32_NO_FENCE( (INT32 volatile *)dest, (INT32)value );
+#endif
+}
+
 static FORCEINLINE void WriteNoFence( LONG volatile *dest, LONG value )
 {
     __WINE_STORE32_NO_FENCE( (int volatile *)dest, value );
@@ -7123,6 +7153,13 @@ static FORCEINLINE LONG ReadAcquire( LONG const volatile *src )
     return value;
 }
+static FORCEINLINE void *ReadPointerAcquire( void *const volatile *src )
+{
+    void *value;
+    __WINE_ATOMIC_LOAD_ACQUIRE( src, &value );
+    return value;
+}
+
 static FORCEINLINE LONG ReadNoFence( LONG const volatile *src )
 {
     LONG value;
@@ -7140,6 +7177,11 @@ static FORCEINLINE void WriteNoFence( LONG volatile *dest, LONG value )
     __WINE_ATOMIC_STORE_RELAXED( dest, &value );
 }
+static FORCEINLINE void WritePointerRelease( void *volatile *src, void *value )
+{
+    __WINE_ATOMIC_STORE_RELEASE( src, &value );
+}
+
 static FORCEINLINE DECLSPEC_NORETURN void __fastfail(unsigned int code)
 {
 #if defined(__x86_64__) || defined(__i386__)
From: Yuxuan Shui <yshui@codeweavers.com>
---
 dlls/kernel32/tests/sync.c |  24 ++
 dlls/ntdll/sync.c          | 534 +++++++++++++++++++++++++++----------
 2 files changed, 420 insertions(+), 138 deletions(-)
diff --git a/dlls/kernel32/tests/sync.c b/dlls/kernel32/tests/sync.c
index 60180194b7a..4eaec87c52f 100644
--- a/dlls/kernel32/tests/sync.c
+++ b/dlls/kernel32/tests/sync.c
@@ -2533,6 +2533,29 @@ static void test_srwlock_example(void)
     trace("number of total exclusive accesses is %ld\n", srwlock_protected_value);
 }
+static void test_srwlock_quirk(void)
+{
+    union { SRWLOCK *s; LONG *l; } u = { &srwlock_example };
+
+    if (!pInitializeSRWLock) {
+        /* function is not yet in XP, only in newer Windows */
+        win_skip("no srw lock support.\n");
+        return;
+    }
+
+    *u.l = 0;
+    pReleaseSRWLockExclusive(&srwlock_example);
+    ok(*u.l == 0xffffffff, "expected 0xffffffff, got %lx\n", *u.l);
+
+    *u.l = 1;
+    pReleaseSRWLockExclusive(&srwlock_example);
+    ok(*u.l == 0, "expected 0x0, got %lx\n", *u.l);
+
+    *u.l = 0x10000;
+    pReleaseSRWLockExclusive(&srwlock_example);
+    ok(*u.l == 0xffff, "expected 0xffff, got %lx\n", *u.l);
+}
+
 static DWORD WINAPI alertable_wait_thread(void *param)
 {
     HANDLE *semaphores = param;
@@ -2887,6 +2910,7 @@ START_TEST(sync)
     test_srwlock_base(&aligned_srwlock);
     test_srwlock_base(&unaligned_srwlock.lock);
     test_srwlock_example();
+    test_srwlock_quirk();
     test_alertable_wait();
     test_apc_deadlock();
     test_crit_section();
diff --git a/dlls/ntdll/sync.c b/dlls/ntdll/sync.c
index fa64917029a..4547ba900e0 100644
--- a/dlls/ntdll/sync.c
+++ b/dlls/ntdll/sync.c
@@ -26,6 +26,8 @@
 #include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <stdint.h>
+#include <assert.h>
 #include <time.h>
#include "ntstatus.h" @@ -471,24 +473,85 @@ DWORD WINAPI RtlRunOnceExecuteOnce( RTL_RUN_ONCE *once, PRTL_RUN_ONCE_INIT_FN fu return RtlRunOnceComplete( once, 0, context ? *context : NULL ); }
-struct srw_lock
-{
-    short exclusive_waiters;
+/**
+ * The SRWLOCK is a tagged pointer, with the last 4 bits being the tag.
+ * The pointer part serves 2 roles: if there are no waiters on the lock,
+ * it counts the number of shared owners of the lock; otherwise, it
+ * points to the end of the waiter queue, which is a linked list of waiters.
+ * This also implies the pointer must be 16-byte aligned.
+ *
+ * There are a couple of helper functions below to make accessing different parts
+ * of the SRWLOCK easier.
+ */
+
+enum srwlock_tag {
+    SRWLOCK_TAG_LOCKED = 1,
+    SRWLOCK_TAG_HAS_WAITERS = 2,
+    /* Unknown purpose on Windows, but we use it to indicate the list
+     * is being modified. */
+    SRWLOCK_TAG_LIST_LOCKED = 4,
+    /* Unclear purpose on Windows, might be indicating this lock has
+     * > 1 owners. */
+    SRWLOCK_TAG_HAS_MULTIPLE_OWNERS = 8,
+};
-    /* Number of shared owners, or -1 if owned exclusive.
-     *
-     * Sadly Windows has no equivalent to FUTEX_WAIT_BITSET, so in order to wake
-     * up *only* exclusive or *only* shared waiters (and thus avoid spurious
-     * wakeups), we need to wait on two different addresses.
-     * RtlAcquireSRWLockShared() needs to know the values of "exclusive_waiters"
-     * and "owners", but RtlAcquireSRWLockExclusive() only needs to know the
-     * value of "owners", so the former can wait on the entire structure, and
-     * the latter waits only on the "owners" member. Note then that "owners"
-     * must not be the first element in the structure.
+enum srwlock_waiter_state {
+    SRWLOCK_WAITER_STATE_IS_EXCLUSIVE = 1,
+    /* ???? = 2, */
+    SRWLOCK_WAITER_STATE_NOTIFIED = 4,
+};
+struct DECLSPEC_ALIGN(16) srwlock_waiter {
+    /* Note the prev pointer is uninitialized for the list head. */
+    struct srwlock_waiter *prev;
+    struct srwlock_waiter *head;
+    struct srwlock_waiter *next;
+    ULONG_PTR thread_id;
+    /* When the first waiter appears, the shared owner counter stored in
+     * the SRWLOCK is transferred here, however only the lower 28 bits are kept.
+     * A special value 0xffffffff indicates this waiter is not at the
+     * head of the list, and 0xfffffffe indicates this waiter is at the head
+     * of the list, but the lock is held exclusively. */
-    short owners;
+    DWORD num_owners;
+
+    /* A bitflag of states, see `enum srwlock_waiter_state`. */
+    DWORD state;
 };
-C_ASSERT( sizeof(struct srw_lock) == 4 );
+
+#define SRWLOCK_WAITER_IS_NOT_HEAD 0xffffffff
+#define SRWLOCK_WAITER_EXCLUSIVELY_LOCKED 0xfffffffe
+
+static inline struct srwlock_waiter *srwlock_get_waiter(RTL_SRWLOCK lock)
+{
+    ULONG_PTR ptr = (ULONG_PTR)lock.Ptr;
+    return (void *)(ptr & ~(ULONG_PTR)0xf);
+}
+
+static inline ULONG_PTR srwlock_get_shared_count(RTL_SRWLOCK lock)
+{
+    ULONG_PTR ptr = (ULONG_PTR)lock.Ptr;
+    return ((ptr & (ULONG_PTR)0xfffffff0) >> 4);
+}
+
+static inline USHORT srwlock_get_tag(RTL_SRWLOCK lock)
+{
+    ULONG_PTR ptr = (ULONG_PTR)lock.Ptr;
+    return (USHORT)(ptr & (ULONG_PTR)0xf);
+}
+
+static inline RTL_SRWLOCK srwlock_from_waiter(struct srwlock_waiter *waiter, USHORT tag)
+{
+    RTL_SRWLOCK lock;
+    lock.Ptr = (void *)((ULONG_PTR)waiter | (ULONG_PTR)tag);
+    return lock;
+}
+
+static inline RTL_SRWLOCK srwlock_from_shared_owner_count(ULONG_PTR count, USHORT tag)
+{
+    RTL_SRWLOCK lock;
+    lock.Ptr = (void *)(((ULONG_PTR)count << 4) | (ULONG_PTR)tag);
+    return lock;
+}
 /***********************************************************************
  *           RtlInitializeSRWLock (NTDLL.@)
@@ -496,141 +559,233 @@ C_ASSERT( sizeof(struct srw_lock) == 4 );
  * NOTES
  *  Please note that SRWLocks do not keep track of the owner of a lock.
  *  It doesn't make any difference which thread for example unlocks an
- *  SRWLock (see corresponding tests). This implementation uses two
- *  keyed events (one for the exclusive waiters and one for the shared
- *  waiters) and is limited to 2^15-1 waiting threads.
+ *  SRWLock (see corresponding tests).
  */
 void WINAPI RtlInitializeSRWLock( RTL_SRWLOCK *lock )
 {
     lock->Ptr = NULL;
 }
-/***********************************************************************
- *           RtlAcquireSRWLockExclusive (NTDLL.@)
- *
- * NOTES
- *  Unlike RtlAcquireResourceExclusive this function doesn't allow
- *  nested calls from the same thread. "Upgrading" a shared access lock
- *  to an exclusive access lock also doesn't seem to be supported.
- */
-void WINAPI RtlAcquireSRWLockExclusive( RTL_SRWLOCK *lock )
+/* Transfer the ownership of the lock to the waiters at the front of the list,
+ * remove them from the list, and wake them up. Only one thread can call this
+ * function at any given time. */
+static void srwlock_transfer_ownership(RTL_SRWLOCK *lock)
 {
-    union { RTL_SRWLOCK *rtl; struct srw_lock *s; LONG *l; } u = { lock };
+    RTL_SRWLOCK old, new, read;
+    USHORT tag;
+    struct srwlock_waiter *last, *head, *last_up_to_date, *last_to_remove,
+        *new_head;
-    InterlockedIncrement16( &u.s->exclusive_waiters );
+    TRACE("%p\n", lock);
-    for (;;)
+    do
+    {
+        old.Ptr = ReadPointerAcquire(&lock->Ptr);
+        tag = srwlock_get_tag(old);
+        last = srwlock_get_waiter(old);
+
+        assert(!(tag & SRWLOCK_TAG_LIST_LOCKED));
+        tag |= SRWLOCK_TAG_LIST_LOCKED;
+        new = srwlock_from_waiter(last, tag);
+    } while (InterlockedCompareExchangePointer(&lock->Ptr, new.Ptr, old.Ptr) !=
+             old.Ptr);
+
+    assert(tag & SRWLOCK_TAG_HAS_WAITERS);
+    old = new;
+    last = srwlock_get_waiter(old);
+    assert(last != NULL);
+    head = last->head;
+    assert(head != NULL);
+    last_up_to_date = last; /* the last waiter whose head pointer is up-to-date. */
+    new_head = NULL;
+    if (head->num_owners != 0 &&
+        head->num_owners != SRWLOCK_WAITER_EXCLUSIVELY_LOCKED)
     {
-        union { struct srw_lock s; LONG l; } old, new;
-        BOOL wait;
+        FIXME("head num_owners %lx\n", head->num_owners);
+        assert(FALSE);
+    }
+    head->num_owners = 0;
+    last_to_remove = head;
+    do
+    {
+        tag = srwlock_get_tag(old);
+        last = srwlock_get_waiter(old);
-        do
+        /* Every time cmpxchg failed, new waiter could have been added
+         * to the end of the list. Which means we might have more waiters
+         * to wake, and more head pointers to update. */
+        while (TRUE)
         {
-            old.s = *u.s;
-            new.s = old.s;
-
-            if (!old.s.owners)
+            struct srwlock_waiter *tmp = NULL;
+            if (last_to_remove != last)
+                tmp = ReadPointerAcquire((void **)&last_to_remove->next);
+            if (tmp != new_head)
             {
-                /* Not locked exclusive or shared. We can try to grab it. */
-                new.s.owners = -1;
-                --new.s.exclusive_waiters;
-                wait = FALSE;
+                new_head = tmp;
+                last_up_to_date = last_to_remove;
             }
-            else
+            if (!new_head)
+                break;
+            if (last_to_remove->state & SRWLOCK_WAITER_STATE_IS_EXCLUSIVE)
             {
-                wait = TRUE;
+                new_head->num_owners = SRWLOCK_WAITER_EXCLUSIVELY_LOCKED;
+                break;
             }
-        } while (InterlockedCompareExchange( u.l, new.l, old.l ) != old.l);
+            new_head->num_owners = last_to_remove->num_owners + 1;
+            if (new_head->num_owners > 1)
+                tag |= SRWLOCK_TAG_HAS_MULTIPLE_OWNERS;
+            if (new_head->state & SRWLOCK_WAITER_STATE_IS_EXCLUSIVE)
+                break;
+            last_to_remove = new_head;
+        }
+
+        while (last_up_to_date != last)
+        {
+            last_up_to_date = ReadPointerAcquire((void **)&last_up_to_date->next);
+            assert(last_up_to_date != NULL);
+            last_up_to_date->head = new_head;
+        }
+
+        tag = SRWLOCK_TAG_LOCKED;
+        if (new_head)
+        {
+            tag |= SRWLOCK_TAG_HAS_WAITERS;
+            if (new_head->num_owners > 1 &&
+                new_head->num_owners != SRWLOCK_WAITER_EXCLUSIVELY_LOCKED)
+                tag |= SRWLOCK_TAG_HAS_MULTIPLE_OWNERS;
+            new = srwlock_from_waiter(last, tag);
+        }
+        else
+        {
+            if (last_to_remove->state & SRWLOCK_WAITER_STATE_IS_EXCLUSIVE)
+                new = srwlock_from_shared_owner_count(0, tag);
+            else
+                new = srwlock_from_shared_owner_count(last_to_remove->num_owners + 1, tag);
+        }
+
+        read.Ptr = InterlockedCompareExchangePointer(&lock->Ptr, new.Ptr, old.Ptr);
+        if (read.Ptr == old.Ptr)
+            break;
+        old = read;
+    } while (TRUE);
-        if (!wait) return;
-        RtlWaitOnAddress( &u.s->owners, &new.s.owners, sizeof(short), NULL );
+    while (TRUE)
+    {
+        /* waking up the waiter will invalidate the waiter's entry in
+         * the list. */
+        struct srwlock_waiter *next = head->next;
+        InterlockedOr((LONG *)&head->state, SRWLOCK_WAITER_STATE_NOTIFIED);
+        RtlWakeAddressSingle(&head->state);
+        if (head == last_to_remove)
+            break;
+        head = next;
     }
 }
-/***********************************************************************
- *           RtlAcquireSRWLockShared (NTDLL.@)
+/**
+ * Add a new waiter to the SRWLOCK, and wait on it. The wait will be interrupted
+ * if the value stored in `lock` changes to something other than `expected`.
  *
- * NOTES
- *   Do not call this function recursively - it will only succeed when
- *   there are no threads waiting for an exclusive lock!
+ * Returns TRUE if the lock is acquired, FALSE otherwise.
  */
-void WINAPI RtlAcquireSRWLockShared( RTL_SRWLOCK *lock )
+#ifdef __GNUC__
+__attribute__((force_align_arg_pointer))
+#endif
+static BOOLEAN srwlock_wait(RTL_SRWLOCK *lock, DWORD waiter_state,
+                            RTL_SRWLOCK expected)
 {
-    union { RTL_SRWLOCK *rtl; struct srw_lock *s; LONG *l; } u = { lock };
+    RTL_SRWLOCK new;
+    USHORT tag;
+    struct srwlock_waiter *prev = NULL, waiter = {0};
-    for (;;)
+    tag = srwlock_get_tag(expected);
+    TRACE("%p %p %p\n", lock, &waiter, expected.Ptr);
+
+    waiter.state = waiter_state;
+    if (tag & SRWLOCK_TAG_HAS_WAITERS)
+    {
+        prev = srwlock_get_waiter(expected);
+        if (!(tag & SRWLOCK_TAG_LIST_LOCKED))
+            waiter.head = prev->head;
+        else
+            /* some other thread is modifying the list, they
+             * are responsible for setting our head pointer. */
+            waiter.head = NULL;
+        waiter.prev = prev;
+        waiter.num_owners = SRWLOCK_WAITER_IS_NOT_HEAD;
+    }
+    else
     {
-        union { struct srw_lock s; LONG l; } old, new;
-        BOOL wait;
+        waiter.head = &waiter;
+        waiter.num_owners = srwlock_get_shared_count(expected);
+        if (waiter.num_owners > 1)
+            tag |= SRWLOCK_TAG_HAS_MULTIPLE_OWNERS;
+        if (waiter.num_owners == 0)
+            waiter.num_owners = SRWLOCK_WAITER_EXCLUSIVELY_LOCKED;
+        tag |= SRWLOCK_TAG_HAS_WAITERS;
+    }
+    waiter.next = NULL;
-    do
+    if (prev)
+    {
+        if (InterlockedCompareExchangePointer((void **)&prev->next,
+                                              (void *)&waiter, NULL) != NULL)
         {
-            old.s = *u.s;
-            new = old;
-
-            if (old.s.owners != -1 && !old.s.exclusive_waiters)
-            {
-                /* Not locked exclusive, and no exclusive waiters.
-                 * We can try to grab it. */
-                ++new.s.owners;
-                wait = FALSE;
-            }
-            else
-            {
-                wait = TRUE;
-            }
-        } while (InterlockedCompareExchange( u.l, new.l, old.l ) != old.l);
+            /* Someone else is also appending to the list, they have linked their
+             * node to `prev`, but haven't changed lock->Ptr yet. So we wait for
+             * the lock to change and retry. */
+            RtlWaitOnAddress(&lock->Ptr, &expected.Ptr, sizeof(expected.Ptr),
+                             NULL);
+            return FALSE;
+        }
+    }
-        if (!wait) return;
-        RtlWaitOnAddress( u.s, &new.s, sizeof(struct srw_lock), NULL );
+    /* Once we reach here, `prev->next` can't be changed by anyone but us. So
+     * if we fail to change lock->Ptr, we have to reset it so others can append
+     * to the list. */
+    new = srwlock_from_waiter(&waiter, tag);
+    if (InterlockedCompareExchangePointer(&lock->Ptr, new.Ptr, expected.Ptr) !=
+        expected.Ptr)
+    {
+        if (prev)
+            WritePointerRelease((void **)&prev->next, NULL);
+        return FALSE;
     }
-}
-/***********************************************************************
- *           RtlReleaseSRWLockExclusive (NTDLL.@)
- */
-void WINAPI RtlReleaseSRWLockExclusive( RTL_SRWLOCK *lock )
-{
-    union { RTL_SRWLOCK *rtl; struct srw_lock *s; LONG *l; } u = { lock };
-    union { struct srw_lock s; LONG l; } old, new;
+    RtlWakeAddressSingle(&lock->Ptr);
+    TRACE("waiting: %p, %p, %lu\n", lock, &waiter, waiter.num_owners);
     do
     {
-        old.s = *u.s;
-        new = old;
-
-        if (old.s.owners != -1) ERR("Lock %p is not owned exclusive!\n", lock);
-
-        new.s.owners = 0;
-    } while (InterlockedCompareExchange( u.l, new.l, old.l ) != old.l);
+        waiter_state = ReadAcquire((LONG *)&waiter.state);
+        if (waiter_state & SRWLOCK_WAITER_STATE_NOTIFIED)
+            break;
+        RtlWaitOnAddress(&waiter.state, &waiter_state, sizeof(DWORD), NULL);
+    } while (TRUE);
-    if (new.s.exclusive_waiters)
-        RtlWakeAddressSingle( &u.s->owners );
-    else
-        RtlWakeAddressAll( u.s );
+    TRACE("acquired: %p\n", lock);
+    return TRUE;
 }
-/***********************************************************************
- *           RtlReleaseSRWLockShared (NTDLL.@)
- */
-void WINAPI RtlReleaseSRWLockShared( RTL_SRWLOCK *lock )
+static BOOLEAN srwlock_try_acquire_exclusive(RTL_SRWLOCK *lock, RTL_SRWLOCK *old)
 {
-    union { RTL_SRWLOCK *rtl; struct srw_lock *s; LONG *l; } u = { lock };
-    union { struct srw_lock s; LONG l; } old, new;
+    RTL_SRWLOCK new;
+    USHORT tag;
     do
     {
-        old.s = *u.s;
-        new = old;
+        old->Ptr = ReadPointerAcquire(&lock->Ptr);
+        tag = srwlock_get_tag(*old);
-        if (old.s.owners == -1) ERR("Lock %p is owned exclusive!\n", lock);
-        else if (!old.s.owners) ERR("Lock %p is not owned shared!\n", lock);
+        if (tag & SRWLOCK_TAG_LOCKED)
+            return FALSE;
-        --new.s.owners;
-    } while (InterlockedCompareExchange( u.l, new.l, old.l ) != old.l);
+        new = srwlock_from_shared_owner_count(0, SRWLOCK_TAG_LOCKED);
+    } while (InterlockedCompareExchangePointer(&lock->Ptr, new.Ptr, old->Ptr) !=
+             old->Ptr);
-    if (!new.s.owners)
-        RtlWakeAddressSingle( &u.s->owners );
+    RtlWakeAddressSingle(&lock->Ptr);
+    return TRUE;
 }
 /***********************************************************************
@@ -642,28 +797,57 @@ void WINAPI RtlReleaseSRWLockShared( RTL_SRWLOCK *lock )
  */
 BOOLEAN WINAPI RtlTryAcquireSRWLockExclusive( RTL_SRWLOCK *lock )
 {
-    union { RTL_SRWLOCK *rtl; struct srw_lock *s; LONG *l; } u = { lock };
-    union { struct srw_lock s; LONG l; } old, new;
-    BOOLEAN ret;
+    RTL_SRWLOCK old;
+    TRACE("%p\n", lock);
+    return srwlock_try_acquire_exclusive(lock, &old);
+}
+
+/***********************************************************************
+ *           RtlAcquireSRWLockExclusive (NTDLL.@)
+ *
+ * NOTES
+ *  Unlike RtlAcquireResourceExclusive this function doesn't allow
+ *  nested calls from the same thread. "Upgrading" a shared access lock
+ *  to an exclusive access lock also doesn't seem to be supported.
+ */
+void WINAPI RtlAcquireSRWLockExclusive( RTL_SRWLOCK *lock )
+{
+    RTL_SRWLOCK old;
+
+    TRACE("%p\n", lock);
+
+    while (!srwlock_try_acquire_exclusive(lock, &old))
+        if (srwlock_wait(lock, SRWLOCK_WAITER_STATE_IS_EXCLUSIVE, old))
+            break;
+}
+
+static BOOLEAN srwlock_try_acquire_shared(RTL_SRWLOCK *lock, RTL_SRWLOCK *old)
+{
+    RTL_SRWLOCK new;
+    USHORT tag;
+    ULONG_PTR shared_count;
     do
     {
-        old.s = *u.s;
-        new.s = old.s;
+        old->Ptr = ReadPointerAcquire(&lock->Ptr);
+        tag = srwlock_get_tag(*old);
+        shared_count = srwlock_get_shared_count(*old);
-        if (!old.s.owners)
-        {
-            /* Not locked exclusive or shared. We can try to grab it. */
-            new.s.owners = -1;
-            ret = TRUE;
-        }
-        else
-        {
-            ret = FALSE;
-        }
-    } while (InterlockedCompareExchange( u.l, new.l, old.l ) != old.l);
+        if (tag & SRWLOCK_TAG_HAS_WAITERS)
+            return FALSE;
-    return ret;
+        if ((tag & SRWLOCK_TAG_LOCKED) && shared_count == 0)
+            return FALSE;
+
+        /* No waiters to check, and the lock is either locked shared, or not
+         * locked at all. We can try to grab it. */
+        new = srwlock_from_shared_owner_count(shared_count + 1,
+                                              tag | SRWLOCK_TAG_LOCKED);
+    } while (InterlockedCompareExchangePointer(&lock->Ptr, new.Ptr,
+                                               old->Ptr ) != old->Ptr);
+
+    RtlWakeAddressSingle(&lock->Ptr);
+    return TRUE;
 }
 /***********************************************************************
@@ -671,29 +855,103 @@ BOOLEAN WINAPI RtlTryAcquireSRWLockExclusive( RTL_SRWLOCK *lock )
  */
 BOOLEAN WINAPI RtlTryAcquireSRWLockShared( RTL_SRWLOCK *lock )
 {
-    union { RTL_SRWLOCK *rtl; struct srw_lock *s; LONG *l; } u = { lock };
-    union { struct srw_lock s; LONG l; } old, new;
-    BOOLEAN ret;
+    RTL_SRWLOCK old;
+    TRACE("%p\n", lock);
+    return srwlock_try_acquire_shared(lock, &old);
+}
+
+/***********************************************************************
+ *           RtlAcquireSRWLockShared (NTDLL.@)
+ *
+ * NOTES
+ *   Do not call this function recursively - it will only succeed when
+ *   there are no threads waiting for an exclusive lock!
+ */
+void WINAPI RtlAcquireSRWLockShared( RTL_SRWLOCK *lock )
+{
+    RTL_SRWLOCK old;
+
+    TRACE("%p\n", lock);
+
+    while (!srwlock_try_acquire_shared(lock, &old))
+        if (srwlock_wait(lock, 0, old))
+            break;
+}
+
+/***********************************************************************
+ *           RtlReleaseSRWLockExclusive (NTDLL.@)
+ */
+void WINAPI RtlReleaseSRWLockExclusive( RTL_SRWLOCK *lock )
+{
+    RTL_SRWLOCK old, new;
+    USHORT tag;
+
+    TRACE("%p\n", lock);
     do
     {
-        old.s = *u.s;
-        new.s = old.s;
+        old.Ptr = ReadPointerAcquire(&lock->Ptr);
+        tag = srwlock_get_tag(old);
+        if (tag & SRWLOCK_TAG_HAS_WAITERS)
+            return srwlock_transfer_ownership(lock);
+
+        /* Looks weird but this is how Windows does it. */
+        new.Ptr = (void *)((ULONG_PTR)old.Ptr - 1);
+    } while (InterlockedCompareExchangePointer(&lock->Ptr, new.Ptr, old.Ptr) !=
+             old.Ptr);
+
+    RtlWakeAddressSingle(&lock->Ptr);
+}
-        if (old.s.owners != -1 && !old.s.exclusive_waiters)
+/***********************************************************************
+ *           RtlReleaseSRWLockShared (NTDLL.@)
+ */
+void WINAPI RtlReleaseSRWLockShared( RTL_SRWLOCK *lock )
+{
+    RTL_SRWLOCK old, new;
+    USHORT tag;
+    ULONG_PTR shared_count;
+    struct srwlock_waiter *last;
+
+    TRACE("%p\n", lock);
+
+    do
+    {
+        old.Ptr = ReadPointerAcquire(&lock->Ptr);
+        tag = srwlock_get_tag(old);
+        shared_count = srwlock_get_shared_count(old);
+        last = srwlock_get_waiter(old);
+
+        if (!(tag & SRWLOCK_TAG_LOCKED))
         {
-            /* Not locked exclusive, and no exclusive waiters.
-             * We can try to grab it. */
-            ++new.s.owners;
-            ret = TRUE;
+            /* interestingly this check is only done for shared locks. */
+            EXCEPTION_RECORD record;
+            record.ExceptionAddress = RtlReleaseSRWLockShared;
+            record.ExceptionCode = STATUS_RESOURCE_NOT_OWNED;
+            record.ExceptionFlags = 0;
+            record.ExceptionRecord = NULL;
+            record.NumberParameters = 0;
+            return RtlRaiseException(&record);
         }
-        else
+
+        if (tag & SRWLOCK_TAG_HAS_WAITERS)
         {
-            ret = FALSE;
+            shared_count = InterlockedDecrement((LONG *)&last->head->num_owners);
+            if (shared_count == 0)
+                /* Only one thread reaches here. */
+                srwlock_transfer_ownership(lock);
+            return;
         }
-    } while (InterlockedCompareExchange( u.l, new.l, old.l ) != old.l);
-    return ret;
+        if (shared_count - 1 < 2)
+            tag &= ~SRWLOCK_TAG_HAS_MULTIPLE_OWNERS;
+        if (shared_count - 1 == 0)
+            tag &= ~SRWLOCK_TAG_LOCKED;
+        new = srwlock_from_shared_owner_count(shared_count - 1, tag);
+    } while (InterlockedCompareExchangePointer(&lock->Ptr, new.Ptr, old.Ptr) !=
+             old.Ptr);
+
+    RtlWakeAddressSingle(&lock->Ptr);
 }
/***********************************************************************
Just for an update, I *am* reviewing this, but it's a (necessarily) large and complex patch, so it's taking some time. [I was also out last week, so that's at least part of the delay.]
On Tue Aug 22 17:45:36 2023 +0000, Zebediah Figura wrote:
> Just for an update, I *am* reviewing this, but it's a (necessarily) large and complex patch, so it's taking some time. [I was also out last week, so that's at least part of the delay.]
hey, no problem. thanks for looking at it!
let me know if there is anything unclear, or weird, i'll try to explain.
Zebediah Figura (@zfigura) commented about dlls/ntdll/sync.c:
> - * NOTES
> - *  Unlike RtlAcquireResourceExclusive this function doesn't allow
> - *  nested calls from the same thread. "Upgrading" a shared access lock
> - *  to an exclusive access lock also doesn't seem to be supported.
> - */
> -void WINAPI RtlAcquireSRWLockExclusive( RTL_SRWLOCK *lock )
> +/* Transfer the ownership of the lock to the waiters at the front of the list,
> + * remove them from the list, and wake them up. Only one thread can call this
> + * function at any given time. */
> +static void srwlock_transfer_ownership(RTL_SRWLOCK *lock)
>  {
> -    union { RTL_SRWLOCK *rtl; struct srw_lock *s; LONG *l; } u = { lock };
> +    RTL_SRWLOCK old, new, read;
> +    USHORT tag;
> +    struct srwlock_waiter *last, *head, *last_up_to_date, *last_to_remove,
> +        *new_head;
Let's just let this line flow past 80 characters. There are some other cases in this patch that feel similarly awkward; in general we don't stick fast to 80 characters, at least in ntdll, and having the occasional line that's 90-100 characters feels better to me than wrapping like this.
Zebediah Figura (@zfigura) commented about dlls/ntdll/sync.c:
> +    } while (InterlockedCompareExchangePointer(&lock->Ptr, new.Ptr, old.Ptr) !=
> +             old.Ptr);
> +
> +    assert(tag & SRWLOCK_TAG_HAS_WAITERS);
> +    old = new;
> +    last = srwlock_get_waiter(old);
> +    assert(last != NULL);
> +    head = last->head;
> +    assert(head != NULL);
> +    last_up_to_date = last; /* the last waiter whose head pointer is up-to-date. */
> +    new_head = NULL;
> +    if (head->num_owners != 0 &&
> +        head->num_owners != SRWLOCK_WAITER_EXCLUSIVELY_LOCKED)
>      {
> -        union { struct srw_lock s; LONG l; } old, new;
> -        BOOL wait;
> +        FIXME("head num_owners %lx\n", head->num_owners);
I'd call this an ERR. Also, if you're using %x, please use %#x.
Zebediah Figura (@zfigura) commented about dlls/ntdll/sync.c:
> +        old.Ptr = ReadPointerAcquire(&lock->Ptr);
> +        tag = srwlock_get_tag(old);
> +        last = srwlock_get_waiter(old);
> +
> +        assert(!(tag & SRWLOCK_TAG_LIST_LOCKED));
> +        tag |= SRWLOCK_TAG_LIST_LOCKED;
> +        new = srwlock_from_waiter(last, tag);
> +    } while (InterlockedCompareExchangePointer(&lock->Ptr, new.Ptr, old.Ptr) !=
> +             old.Ptr);
> +
> +    assert(tag & SRWLOCK_TAG_HAS_WAITERS);
> +    old = new;
> +    last = srwlock_get_waiter(old);
> +    assert(last != NULL);
> +    head = last->head;
> +    assert(head != NULL);
It's not a big deal, but I think these assertions are superfluous if you're going to immediately dereference the variables. And, well, that means they kind of clutter the code.
Zebediah Figura (@zfigura) commented about dlls/ntdll/sync.c:
> +    assert(last != NULL);
> +    head = last->head;
> +    assert(head != NULL);
> +    last_up_to_date = last; /* the last waiter whose head pointer is up-to-date. */
> +    new_head = NULL;
> +    if (head->num_owners != 0 &&
> +        head->num_owners != SRWLOCK_WAITER_EXCLUSIVELY_LOCKED)
>      {
> -        union { struct srw_lock s; LONG l; } old, new;
> -        BOOL wait;
> +        FIXME("head num_owners %lx\n", head->num_owners);
> +        assert(FALSE);
> +    }
> +    head->num_owners = 0;
> +    last_to_remove = head;
> +    do
do {} while (TRUE) is, while perfectly fine, arguably a bit less readable than just while (TRUE) {}. I suspect this was originally written with the cmpxchg in the condition, but then that didn't quite work out :-)
Zebediah Figura (@zfigura) commented about dlls/ntdll/sync.c:
> -            if (!old.s.owners)
> +            struct srwlock_waiter *tmp = NULL;
> +            if (last_to_remove != last)
> +                tmp = ReadPointerAcquire((void **)&last_to_remove->next);
> +            if (tmp != new_head)
>              {
> -                /* Not locked exclusive or shared. We can try to grab it. */
> -                new.s.owners = -1;
> -                --new.s.exclusive_waiters;
> -                wait = FALSE;
> +                new_head = tmp;
> +                last_up_to_date = last_to_remove;
>              }
> -            else
> +            if (!new_head)
> +                break;
last_to_remove should always be the entry right before new_head, right? And if last_to_remove == last then new_head == NULL, and vice versa, right? And if new_head is NULL, then we're removing everything from the list and we don't need to update any head pointers.
So unless I'm missing something, we can simplify this part a bit, to:
```c
if (last_to_remove == last)
    break;
tmp = ReadAcquire(&last_to_remove->next);
if (tmp != new_head)
{
    new_head = tmp;
    last_up_to_date = last_to_remove;
}
```
Somewhat relatedly, does the head pointer actually need to be accurate for every waiter? It's only ever read from the tail.
Even if so, I wonder if we can just simplify the "last_up_to_date" tracking in general. After playing with it a bit I came up with the following code structure; maybe this is a bit easier to follow?
```c
new_head = NULL;
last_up_to_date = last;
for (;;) /* cmpxchg loop */
{
    if (!new_head)
    {
        for (;;)
        {
            /* walk the list, update last_to_remove */
        }
        last_up_to_date = last_to_remove;
    }

    while (last_up_to_date != last)
        /* update heads */
}
```
Zebediah Figura (@zfigura) commented about dlls/ntdll/sync.c:
>      return RtlRunOnceComplete( once, 0, context ? *context : NULL );
>  }
>  
> -struct srw_lock
> -{
> -    short exclusive_waiters;
> +/**
> + * The SRWLOCK is a tagged pointer, with the last 4 bits being the tag.
> + * The pointer part serves 2 roles: if there are no waiters on the lock,
> + * it counts the number of shared owners of the lock; otherwise, it
> + * points to the end of the waiter queue, which is a linked list of waiters.
For maximum clarity, can we say "tail" rather than "end"?
Zebediah Figura (@zfigura) commented about dlls/ntdll/sync.c:
> -                --new.s.exclusive_waiters;
> -                wait = FALSE;
> +                new_head = tmp;
> +                last_up_to_date = last_to_remove;
>              }
> -            else
> +            if (!new_head)
> +                break;
> +            if (last_to_remove->state & SRWLOCK_WAITER_STATE_IS_EXCLUSIVE)
>              {
> -                wait = TRUE;
> +                new_head->num_owners = SRWLOCK_WAITER_EXCLUSIVELY_LOCKED;
> +                break;
>              }
> -        } while (InterlockedCompareExchange( u.l, new.l, old.l ) != old.l);
> +            new_head->num_owners = last_to_remove->num_owners + 1;
Depending on how and if this is restructured, making this a local variable may be a good idea. It should be more performant, at least.
Zebediah Figura (@zfigura) commented about dlls/ntdll/sync.c:
>          }
> +
> +        while (last_up_to_date != last)
> +        {
> +            last_up_to_date = ReadPointerAcquire((void **)&last_up_to_date->next);
> +            assert(last_up_to_date != NULL);
> +            last_up_to_date->head = new_head;
> +        }
> +
> +        tag = SRWLOCK_TAG_LOCKED;
> +        if (new_head)
> +        {
> +            tag |= SRWLOCK_TAG_HAS_WAITERS;
> +            if (new_head->num_owners > 1 &&
> +                new_head->num_owners != SRWLOCK_WAITER_EXCLUSIVELY_LOCKED)
> +                tag |= SRWLOCK_TAG_HAS_MULTIPLE_OWNERS;
I hate this indentation, because the continuation is at the same level as the body and looks the same. I'd rather either not wrap the if, or add braces. (In d3d we use eight-space indentation and put the && at the beginning of the line, which avoids the problem, but sadly that's not what we do in ntdll.)
Actually, isn't this "if (new_head != last)"? Which should be a cheaper comparison anyway.
Zebediah Figura (@zfigura) commented about dlls/ntdll/sync.c:
>          }
> +
> +        read.Ptr = InterlockedCompareExchangePointer(&lock->Ptr, new.Ptr, old.Ptr);
> +        if (read.Ptr == old.Ptr)
> +            break;
> +        old = read;
> +    } while (TRUE);
>  
> -        if (!wait) return;
> -        RtlWaitOnAddress( &u.s->owners, &new.s.owners, sizeof(short), NULL );
> +    while (TRUE)
> +    {
> +        /* waking up the waiter will invalidate the waiter's entry in
> +         * the list. */
> +        struct srwlock_waiter *next = head->next;
> +        InterlockedOr((LONG *)&head->state, SRWLOCK_WAITER_STATE_NOTIFIED);
Does this actually need to be interlocked?
And, if so, we should just define "state" as a LONG type, avoiding the need to cast.
Zebediah Figura (@zfigura) commented about dlls/ntdll/sync.c:
> -/***********************************************************************
> - *           RtlAcquireSRWLockShared (NTDLL.@)
> +/**
> + * Add a new waiter to the SRWLOCK, and wait on it. The wait will be interrupted
> + * if the value stored in `lock` changes to something other than `expected`.
>   *
> - * NOTES
> - *   Do not call this function recursively - it will only succeed when
> - *   there are no threads waiting for an exclusive lock!
> + * Returns TRUE if the lock is acquired, FALSE otherwise.
>   */
> -void WINAPI RtlAcquireSRWLockShared( RTL_SRWLOCK *lock )
> +#ifdef __GNUC__
> +__attribute__((force_align_arg_pointer))
> +#endif
Even with the current bugs in i686-w64-mingw32 around alignment, I think we shouldn't need this? The WINAPI in the API functions includes force_align_arg_pointer already, and i686-w64-mingw32 should align the stack before calling this function. Are you encountering a situation where gcc isn't properly aligning the waiter structure?
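(Editorial aside: a minimal sketch of the failure mode under discussion, with hypothetical names, not code from the patch. gcc for i686-w64-mingw32 emits code assuming the stack is 16-byte-aligned at entry, so if a caller doesn't preserve that, a `DECLSPEC_ALIGN(16)` local can land misaligned unless the function realigns its own frame:)

```c
#include <assert.h>
#include <windows.h>

struct DECLSPEC_ALIGN(16) tagged_node  /* shaped like struct srwlock_waiter */
{
    void *next;
};

#if defined(__GNUC__) && defined(__i386__)
__attribute__((force_align_arg_pointer))  /* realign the frame on entry */
#endif
static void push_node(void)
{
    struct tagged_node node = { NULL };
    /* The tagged-pointer scheme keeps flags in the low 4 bits of the
     * address, so this must hold or the tag bits get corrupted: */
    assert(((ULONG_PTR)&node & 0xf) == 0);
}
```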
Zebediah Figura (@zfigura) commented about dlls/ntdll/sync.c:
> +    struct srwlock_waiter *prev = NULL, waiter = {0};
>  
> -    for (;;)
> +    tag = srwlock_get_tag(expected);
> +    TRACE("%p %p %p\n", lock, &waiter, expected.Ptr);
> +
> +    waiter.state = waiter_state;
> +    if (tag & SRWLOCK_TAG_HAS_WAITERS)
> +    {
> +        prev = srwlock_get_waiter(expected);
> +        if (!(tag & SRWLOCK_TAG_LIST_LOCKED))
> +            waiter.head = prev->head;
> +        else
> +            /* some other thread is modifying the list, they
> +             * are responsible for setting our head pointer. */
> +            waiter.head = NULL;
This is pretty clearly broken, because the flag can get set immediately after we read it. But looking at the implementation, do we actually need to worry about the "locked" case? As far as I can tell srwlock_transfer_ownership() already correctly handles any case where the lock is concurrently updated.
(And if so, we can just throw out *all* of the "locking" code, right?)
Zebediah Figura (@zfigura) commented about dlls/ntdll/sync.c:
> +        read.Ptr = InterlockedCompareExchangePointer(&lock->Ptr, new.Ptr, old.Ptr);
> +        if (read.Ptr == old.Ptr)
> +            break;
> +        old = read;
> +    } while (TRUE);
>  
> -        if (!wait) return;
> -        RtlWaitOnAddress( &u.s->owners, &new.s.owners, sizeof(short), NULL );
> +    while (TRUE)
> +    {
> +        /* waking up the waiter will invalidate the waiter's entry in
> +         * the list. */
> +        struct srwlock_waiter *next = head->next;
> +        InterlockedOr((LONG *)&head->state, SRWLOCK_WAITER_STATE_NOTIFIED);
> +        RtlWakeAddressSingle(&head->state);
It's no surprise that the waiter struct includes the TID, because I notice that we should be able to bypass win32 futexes and use TID alerts directly here.
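(A rough sketch of what waking by TID instead of by address could look like, assuming the NtAlertThreadByThreadId()/NtWaitForAlertByThreadId() pair that RtlWaitOnAddress() is itself built on; illustrative only, not part of the patch:)

```c
/* Waker side: mark the waiter notified, then alert its thread directly,
 * skipping the address-hash queue behind RtlWakeAddressSingle(). */
InterlockedOr( (LONG *)&head->state, SRWLOCK_WAITER_STATE_NOTIFIED );
NtAlertThreadByThreadId( (HANDLE)head->thread_id );

/* Waiter side: block until alerted, rechecking the flag in a loop to
 * tolerate a stale alert buffered from an earlier wait. */
while (!(ReadAcquire( (LONG *)&waiter.state ) & SRWLOCK_WAITER_STATE_NOTIFIED))
    NtWaitForAlertByThreadId( &waiter.state, NULL );
```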
Zebediah Figura (@zfigura) commented about dlls/ntdll/sync.c:
> -            {
> -                /* Not locked exclusive, and no exclusive waiters.
> -                 * We can try to grab it. */
> -                ++new.s.owners;
> -                wait = FALSE;
> -            }
> -            else
> -            {
> -                wait = TRUE;
> -            }
> -        } while (InterlockedCompareExchange( u.l, new.l, old.l ) != old.l);
> +            /* Someone else is also appending to the list, they have linked their
> +             * node to `prev`, but haven't changed lock->Ptr yet. So we wait for
> +             * the lock to change and retry. */
> +            RtlWaitOnAddress(&lock->Ptr, &expected.Ptr, sizeof(expected.Ptr),
> +                             NULL);
I worry about the prospect of making this a futex wait, because as cheap as futexes are, we are in an extremely hot path here. Considering that what we're racing against should be two consecutive cmpxchg instructions, with no intervening calculations or memory accesses, I think spinning may be the right answer here.
I don't know if that means that we explicitly spin until the lock changes, or we just return FALSE and go through srwlock_wait() again.
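(To make the explicit-spin variant concrete, it would be something like the following in place of the futex wait; a sketch only, since the right spin policy is exactly the open question:)

```c
/* The competing appender only has to execute one more cmpxchg before
 * lock->Ptr changes, so spin until the value moves, then retry the
 * whole acquire path. */
while (ReadPointerAcquire( &lock->Ptr ) == expected.Ptr)
    YieldProcessor();  /* "pause" on x86; keeps the spin polite */
return FALSE;
```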
Zebediah Figura (@zfigura) commented about dlls/ntdll/sync.c:
> -        } while (InterlockedCompareExchange( u.l, new.l, old.l ) != old.l);
> +            /* Someone else is also appending to the list, they have linked their
> +             * node to `prev`, but haven't changed lock->Ptr yet. So we wait for
> +             * the lock to change and retry. */
> +            RtlWaitOnAddress(&lock->Ptr, &expected.Ptr, sizeof(expected.Ptr),
> +                             NULL);
> +            return FALSE;
> +        }
> +    }
>  
> -        if (!wait) return;
> -        RtlWaitOnAddress( u.s, &new.s, sizeof(struct srw_lock), NULL );
> +    /* Once we reach here, `prev->next` can't be changed by anyone but us. So
> +     * if we fail to change lock->Ptr, we have to reset it so others can append
> +     * to the list. */
> +    new = srwlock_from_waiter(&waiter, tag);
But this can't be possible, can it? Only the thread that successfully updated prev->next (i.e. this thread) can reach here. So that should just be assert(!prev), or am I missing something?
This patch uses several instances of ReadAcquire(). I firmly believe that any instance of a barrier needs to be justified with a comment, and it should be explained where the other half of the barrier is. If a location doesn't need a barrier, it should be ReadNoFence().
I believe there are two places where we need to care about acquire/release:
(1) Because we are implementing a lock, we need RtlAcquire* to have acquire semantics, and RtlRelease* to have release semantics. This should be covered by the cmpxchg of the lock ptr value itself, in all relevant paths. We may consider weakening those to InterlockedCompareExchangePointerAcquire() / InterlockedCompareExchangePointerRelease(), but I'm slightly less concerned about that, if only because there's precedent.
(2) When we fill the srwlock_waiter structure, we need the writes to be visible before we link it. Therefore the cmpxchgs to the lock Ptr and to the "next" pointer need to have release semantics, and on the other side, we need any reads of those members to have acquire semantics. Which they do, currently, but justification is a good idea.
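(To illustrate (2) with the names the patch uses; a sketch of the intended pairing, not new code:)

```c
/* srwlock_wait(), publishing side: fill in the waiter before linking it.
 * The interlocked operation is a full barrier, so it provides the needed
 * release semantics: whoever observes the link observes the fields. */
waiter.state = waiter_state;
waiter.next = NULL;
InterlockedCompareExchangePointer( (void **)&prev->next, (void *)&waiter, NULL );

/* srwlock_transfer_ownership(), consuming side: this acquire load pairs
 * with the release above, making the waiter's fields safe to dereference. */
tmp = ReadPointerAcquire( (void **)&last_to_remove->next );
```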
On Tue Aug 22 23:51:38 2023 +0000, Zebediah Figura wrote:
> Even with the current bugs in i686-w64-mingw32 around alignment, I think we shouldn't need this? The WINAPI in the API functions includes force_align_arg_pointer already, and i686-w64-mingw32 should align the stack before calling this function. Are you encountering a situation where gcc isn't properly aligning the waiter structure?
So I only added this because I observed the pointer being misaligned.
> because the flag can get set immediately after we read it.
but the waiter is not added until we cmpxchg the lock ptr, and we would notice the flag got set and will retry.
On Wed Aug 23 06:11:00 2023 +0000, Yuxuan Shui wrote:
> > because the flag can get set immediately after we read it.
>
> but the waiter is not added until we cmpxchg the lock ptr, and we would notice the flag got set and will retry.
when the list is in the "locked" state, `prev->head` isn't guaranteed to be up-to-date. but as you have also noted in another comment, we might not need all of the `head` pointers to be up-to-date. i haven't really thought that through, but if we do need all head pointers to be up-to-date, then we do need to care about the locked state.
On Tue Aug 22 23:51:39 2023 +0000, Zebediah Figura wrote:
> I worry about the prospect of making this a futex wait, because as cheap as futexes are, we are in an extremely hot path here. Considering that what we're racing against should be two consecutive cmpxchg instructions, with no intervening calculations or memory accesses, I think spinning may be the right answer here.
>
> I don't know if that means that we explicitly spin until the lock changes, or we just return FALSE and go through srwlock_wait() again.
i am unsure about this one. if the thread that got through is de-scheduled in between the cmpxchgs, wouldn't all the spinning threads prevent it from being scheduled again in a timely manner?
does `RtlWaitOnAddress` not have a short busy waiting loop?
On Tue Aug 22 23:51:39 2023 +0000, Zebediah Figura wrote:
> But this can't be possible, can it? Only the thread that successfully updated prev->next (i.e. this thread) can reach here. So that should just be assert(!prev), or am I missing something?
it's not possible among threads trying to add waiters, but some other thread might come along and change the lock pointer for something else. e.g. a thread releasing the lock could set the list_locked bit.
On Wed Aug 23 06:09:00 2023 +0000, Yuxuan Shui wrote:
> So I only added this because I observed the pointer being misaligned.
Oh, I see, WINAPI doesn't actually include force_align_arg_pointer anymore, not after 62173699. We should probably revert that...
> > because the flag can get set immediately after we read it.
>
> but the waiter is not added until we cmpxchg the lock ptr, and we would notice the flag got set and will retry.
Then we run into an ABA problem, though—the spinlock can get released again by that point.
In fact, yes, I think doing this lock-free isn't going to work. Regardless of whether we need all head pointers to be correct or just the tail's head pointer, I don't see a way to avoid the head pointer being stale in this function.
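For concreteness, one interleaving that exhibits it (an editorial illustration; thread names arbitrary):

```
thread A (srwlock_wait)            thread B (srwlock_transfer_ownership)
------------------------------------------------------------------------
read lock->Ptr: LIST_LOCKED clear,
  copy prev->head
                                   set LIST_LOCKED, walk the list,
                                   rewrite head pointers,
                                   clear LIST_LOCKED; lock->Ptr happens
                                   to end up with the same bits
cmpxchg lock->Ptr: succeeds, but
  the head copied above is stale
```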
> i am unsure about this one. if the thread that got through is de-scheduled in between the cmpxchgs, wouldn't all the spinning threads prevent it from being scheduled again in a timely manner?
I'm afraid I'm not familiar enough with kernel scheduling to know the answer to this one.
It's worth pointing out, though, that RtlWaitOnAddress() has its own spinlock. If there's more than two threads trying to concurrently add to the list, they're going to run into that spinlock anyway.
> does `RtlWaitOnAddress` not have a short busy waiting loop?
It does not. Perhaps it should; I don't know, but given how much effort it took to get that function correct and performant, I'm wary of touching it.
On Wed Aug 23 17:19:46 2023 +0000, Zebediah Figura wrote:
> > i am unsure about this one. if the thread that got through is de-scheduled in between the cmpxchgs, wouldn't all the spinning threads prevent it from being scheduled again in a timely manner?
>
> I'm afraid I'm not familiar enough with kernel scheduling to know the answer to this one.
>
> It's worth pointing out, though, that RtlWaitOnAddress() has its own spinlock. If there's more than two threads trying to concurrently add to the list, they're going to run into that spinlock anyway.
>
> > does `RtlWaitOnAddress` not have a short busy waiting loop?
>
> It does not. Perhaps it should; I don't know, but given how much effort it took to get that function correct and performant, I'm wary of touching it.
Should it? Busy waits are a big issue already when apps do that themselves, shouldn't we avoid just any busy waits and heuristics around that (apart from locking something which is virtually guaranteed not to result in a real wait)?
Not related to this aspect, but I think the current implementation has a different issue with the spinlock around control structures. Unless I am missing anything here, if we get SIGUSR for a system APC while holding that lock, and that system APC will suspend the thread, the whole process will get locked up. I think this is a very unlikely condition so we are probably not hitting this in practice. Not sure which is a good way out of here, but if there are any ideas...
> Should it? Busy waits are a big issue already when apps do that themselves, shouldn't we avoid just any busy waits and heuristics around that (apart from locking something which is virtually guaranteed not to result in a real wait)?
Well, the question is whether that wait should be made "hybrid", i.e. try for some small number of cycles before going to sleep. That may be a bit different from a typical busy wait.
It is worth mentioning, though, that if the wait *is* made hybrid, a subsequent wake will leave a buffered thread-ID alert, which may waste time.
[I can't imagine how Windows thought it was a good idea to implement futexes like this. Then again, maybe they figured out some magic faster implementation that I couldn't figure out.]
> Not related to this aspect, but I think the current implementation has a different issue with the spinlock around control structures. Unless I am missing anything here, if we get SIGUSR for a system APC while holding that lock, and that system APC will suspend the thread, the whole process will get locked up. I think this is a very unlikely condition so we are probably not hitting this in practice. Not sure which is a good way out of here, but if there are any ideas...
Yes. It won't deadlock, because a system APC should only execute system code, but it will leave the other thread spinning. Unfortunately I don't think this is avoidable, but if anyone *does* know a way to code a wait queue that is lock-free, fair, supports timeouts, and doesn't allocate, I would love to hear it.
> Well, the question is whether that wait should be made "hybrid", i.e. try for some small number of cycles before going to sleep. That may be a bit different from a typical busy wait.
It is still a busy wait, even a small one, involving interlocked memory access, and some guesses about what amount of spin is beneficial without having much reliable data to decide on that. I think such solutions are also most often problematic in user space, as what counts as a small number and whether it makes sense highly depends on the app's workload and overall load.
> Unfortunately I don't think this is avoidable, but if anyone *does* know a way to code a wait queue that is lock-free, fair, supports timeouts, and doesn't allocate, I would love to hear it.
It will de facto deadlock if the system APC is going to suspend the thread (as the lock is going to be held until the thread is unsuspended, which may be forever). If it is some other system APC it won't deadlock, although it will block all SRW lock operations process-wide for the duration of its execution.

I don't know if there is a viable lock-free wait queue solution; it's likely a dead end complexity-wise in practice. What I could think of is some indication of an "uninterruptible" section, with either some rollback procedure to be called from the sigusr handler, or delayed sigusr processing to be initiated after exiting the lock from the functions taking such a lock. Not great complexity-wise as well, but probably technically feasible. Or, well, maybe at least indicating that the lock is taken and printing a fixme in the signal handler for a start, so we at least know when we hit that in practice?
On Wed Aug 23 17:15:26 2023 +0000, Zebediah Figura wrote:
> > > because the flag can get set immediately after we read it.
> >
> > but the waiter is not added until we cmpxchg the lock ptr, and we would notice the flag got set and will retry.
>
> Then we run into an ABA problem, though—the spinlock can get released again by that point.
>
> In fact, yes, I think doing this lock-free isn't going to work. Regardless of whether we need all head pointers to be correct or just the tail's head pointer, I don't see a way to avoid the head pointer being stale in this function.
ah yeah there is indeed ABA problem. let me have a think about this.
On Wed Aug 23 22:05:23 2023 +0000, Yuxuan Shui wrote:
> ah yeah there is indeed ABA problem. let me have a think about this.
but on windows acquiring the lock does not block on this list locked bit. either they also have this ABA problem, or they have some magical secret sauce... :thinking:
On Wed Aug 23 22:27:38 2023 +0000, Yuxuan Shui wrote:
> but on windows acquiring the lock does not block on this list locked bit. either they also have this ABA problem, or they have some magical secret sauce... :thinking:
what if we let `srwlock_transfer_ownership` go beyond the tail of the list when updating the head pointer? (there is at most one node beyond the tail.)
and we cmpxchg when updating the head pointer on `srwlock_wait`'s side, if that fails then we know it's up-to-date.
it's still possible for `transfer_ownership` to start and finish in between `we successfully wrote to the head pointer` ~ `we update lock ptr`, but in that case `transfer_ownership` would overwrite the head pointer with the updated one.
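(If I follow, the waiter side would look roughly like this; an untested editorial sketch of the idea:)

```c
/* Install our head pointer only if a concurrent transfer_ownership,
 * walking one node past the old tail, hasn't already set it; if the
 * cmpxchg fails, the value already there is the more up-to-date one. */
struct srwlock_waiter *head = prev->head;
InterlockedCompareExchangePointer( (void **)&waiter.head, (void *)head, NULL );
```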
On Wed Aug 23 17:07:23 2023 +0000, Zebediah Figura wrote:
> Oh, I see, WINAPI doesn't actually include force_align_arg_pointer anymore, not after 62173699. We should probably revert that...
It was added because of calling into unix side back when there was no separation between it and PE via "syscalls". It makes no sense to add it back, especially because you can compile PE modules with a preferred stack boundary of 2 (4 bytes as on Windows) and then force_align_arg_pointer won't do anything.
What's this compiler bug about? Did you try annotating the variable declaration with DECLSPEC_ALIGN(16), not just the struct type?
> It was added because of calling into unix side back when there was no separation between it and PE via "syscalls". It makes no sense to add it back, especially because you can compile PE modules with a preferred stack boundary of 2 (4 bytes as on Windows) and then force_align_arg_pointer won't do anything.
Sure, we could, and that'd be another way to solve the problem, but currently we don't. The fact is that gcc assumes the stack boundary is 16-byte-aligned, period, and that means that not only do we need to force alignment so we don't break the Unix ABI, but we also need to force alignment so that aligned types and variables actually will be aligned.
There may be motion upstream to change that, and assume 4-byte-alignment for the i686-w64-mingw32 target, but that will take time to propagate. We should introduce some solution before then.
> What's this compiler bug about? Did you try annotating the variable declaration with DECLSPEC_ALIGN(16), not just the struct type?
That shouldn't make a difference.
On Thu Aug 24 06:08:47 2023 +0000, Yuxuan Shui wrote:
> what if we let `srwlock_transfer_ownership` go beyond the tail of the list when updating the head pointer? (there is at most one node beyond the tail.)
>
> and we cmpxchg when updating the head pointer on `srwlock_wait`'s side, if that fails then we know it's up-to-date.
>
> it's still possible for `transfer_ownership` to start and finish in between `we successfully wrote to the head pointer` ~ `we update lock ptr`, but in that case `transfer_ownership` would overwrite the head pointer with the updated one.
Currently there is a gap between reading the head, and pushing our node to the list at all. If a wake happens during that gap, there is no way for it to know that the new waiter needs its head updated. I.e. we have this race:
```
thread Q                   thread S
------------------------------
read head
                           go through list, change head
                           wake
cmpxchg next
cmpxchg tail
```
I think you're onto a solution, although I'm not quite sure I correctly understand what you're saying, but in order for it to work, we *also* need to change the order of queueing:
* cmpxchg the "prev->next" pointer (compare to NULL), retry if failure
* read prev->head
* cmpxchg the "waiter.head" pointer (compare to NULL), ignore if failure
* cmpxchg the lock tail pointer; failure should be impossible if there was a prev
Then, when waking, we have to, as you say, not trust the tail to be the actual tail.
On Thu Aug 24 17:35:33 2023 +0000, Zebediah Figura wrote:
> Currently there is a gap between reading the head, and pushing our node to the list at all. If a wake happens during that gap, there is no way for it to know that the new waiter needs its head updated. I.e. we have this race:
>
> ```
> thread Q                   thread S
> ------------------------------
> read head
>                            go through list, change head
>                            wake
> cmpxchg next
> cmpxchg tail
> ```
>
> I think you're onto a solution, although I'm not quite sure I correctly understand what you're saying, but in order for it to work, we *also* need to change the order of queueing:
>
> - cmpxchg the "prev->next" pointer (compare to NULL), retry if failure
> - read prev->head
> - cmpxchg the "waiter.head" pointer (compare to NULL), ignore if failure
> - cmpxchg the lock tail pointer; failure should be impossible if there was a prev
>
> Then, when waking, we have to, as you say, not trust the tail to be the actual tail.
Ugh, no. *Now* I remember the unfixable race that I was seeing when I was trying to do this sort of thing with critical sections. I knew there was one, it was nagging at the back of my mind the entire time I was reviewing this series, but I couldn't remember what it was.
The race is very simple, and very fundamental. The problem is that in order to queue, you have to do *something* to the previous tail, whether that's a read or write. In this case it's both—you need to read the previous head from the tail, and write the "next" pointer to it.
The problem with this is that the tail can be woken, and removed from the list, at any time. As soon as you grab the address of the tail from the srwlock pointer, that memory can become invalid. At an unlikely extreme, the memory on that stack can simply be freed. At a more likely extreme, cmpxchg'ing the next pointer can scribble over a stack local in a different function.
> Sure, we could, and that'd be another way to solve the problem, but currently we don't. The fact is that gcc assumes the stack boundary is 16-byte-aligned, period, and that means that not only do we need to force alignment so we don't break the Unix ABI, but we also need to force alignment so that aligned types and variables actually will be aligned.
That's not true, at least not with MinGW. If it were, this would never have been removed because it would break **every** optimization with anything > 8 bytes (technically, `double` doesn't require alignment), such as vectors. And I compiled that for ages, and it does realign if needed for vectors. If it doesn't, it's a compiler bug.
What you're essentially saying is that every library built as PE with MinGW is utterly broken if it uses auto vectorization or types that need 16 byte alignment on the stack without using this attribute? (and most of them don't)
BTW by "every library built as PE" I do not mean Wine's. I mean native DLLs compiled using MinGW and used on native Windows…
> That shouldn't make a difference.
Right, that's why I was wondering what the bug was. Maybe it's a bug with types only, so it's a workaround.
> > Sure, we could, and that'd be another way to solve the problem, but currently we don't. The fact is that gcc assumes the stack boundary is 16-byte-aligned, period, and that means that not only do we need to force alignment so we don't break the Unix ABI, but we also need to force alignment so that aligned types and variables actually will be aligned.
>
> That's not true, at least not with MinGW. If it were, this would never have been removed because it would break **every** optimization with anything > 8 bytes (technically, `double` doesn't require alignment), such as vectors. And I compiled that for ages, and it does realign if needed for vectors. If it doesn't, it's a compiler bug.
If vector optimizations are used (e.g. -msse2), gcc will manually align the stack for i686-w64-mingw32. This was intentionally done [1]. But that doesn't extend to manually aligned types; those are currently just broken on Wine, as this patch set shows.
I originally thought it was a compiler bug too, but the current behaviour *is* intentional, albeit perhaps poorly thought out. See also [2].
[1] https://inbox.sourceware.org/gcc-patches/5969976.Bvae8NF9fS@polaris/
[2] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111107
> What you're essentially saying is that every library built as PE with MinGW is utterly broken if it uses auto vectorization or types that need 16 byte alignment on the stack without using this attribute? (and most of them don't)
>
> BTW by "every library built as PE" I do not mean Wine's. I mean native DLLs compiled using MinGW and used on native Windows…
It's not a problem if the entire stack is built with mingw, because mingw manually aligns the stack in its crt0, and gcc builds functions that expect and preserve 16-byte alignment. But if you ever mix MSVC and MinGW compiled libraries, yes, you will likely hit a misaligned stack.
On Thu Aug 24 17:41:53 2023 +0000, Zebediah Figura wrote:
> Ugh, no. *Now* I remember the unfixable race that I was seeing when I was trying to do this sort of thing with critical sections. I knew there was one, it was nagging at the back of my mind the entire time I was reviewing this series, but I couldn't remember what it was.
>
> The race is very simple, and very fundamental. The problem is that in order to queue, you have to do *something* to the previous tail, whether that's a read or write. In this case it's both—you need to read the previous head from the tail, and write the "next" pointer to it.
>
> The problem with this is that the tail can be woken, and removed from the list, at any time. As soon as you grab the address of the tail from the srwlock pointer, that memory can become invalid. At an unlikely extreme, the memory on that stack can simply be freed. At a more likely extreme, cmpxchg'ing the next pointer can scribble over a stack local in a different function.
hmm, you are entirely right. now i wonder how microsoft does it.
maybe the multiple owners bit is meaningful in some way.
below is just **me thinking out loud:**
so it seems to me the `transfer_ownership` process (let's call this the TO thread for brevity) has to be done in two steps: first it scans the list and finds out who needs to be woken up, without actually waking them up. then it needs to detect if some thread has started acquiring the lock, in which case it should bail; if not, it updates the lock pointer to reflect the new tail, then wakes up the waiters and thus frees the nodes.
this means the waiter side needs to coordinate to let the TO thread know its intention to append to the list. and because of the problem you pointed out, this coordination has to be done through the lock ptr itself, because list node could be freed before we write into them.
the easiest way is using the list_locked bit as a spin lock, which i did consider, but thought the potential of making releasing side block is bad. at least this would work as a last resort.
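A very rough transcription of that two-step idea, with hypothetical names, covering only the simplest case of waking the whole list; the follow-up comments below find holes in this scheme, so treat it as a thought experiment rather than a design:

```c
#include <windows.h>
#include <stdbool.h>

struct waiter
{
    struct waiter *next;
    struct waiter *head;
};

bool transfer_ownership_sketch( void *volatile *lock_ptr )
{
    struct waiter *tail = ReadPointerAcquire( lock_ptr ), *w;

    if (!tail) return true;

    /* step 1: walk the list and decide who needs waking, without
     * actually waking anyone; nodes are only freed by being woken,
     * so they stay valid during the scan. */
    for (w = tail->head; w; w = w->next)
        ;   /* e.g. collect wakeable waiters on a local list */

    /* step 2: if some thread started acquiring in the meantime, the
     * tail has moved, the exchange fails, and we bail.  Otherwise
     * the lock pointer now reflects the new tail (NULL here, since
     * this sketch wakes the entire list). */
    if (InterlockedCompareExchangePointer( lock_ptr, NULL, tail ) != (void *)tail)
        return false;

    /* only now wake the waiters chosen in step 1; they may free
     * their nodes as soon as they run. */
    return true;
}
```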
I guess this also means the waiter must update the tail (the lock ptr; we should name it something clearer) before trying to update `tail->next`.
But if we do that, then there will be acquirer-acquirer contention, and `tail->head` could be out of date even without TO.
So I noticed another problem. I said I was worried about the releasing side (TO) being blocked, but my approach isn't better: if TO is in contention with many acquirers, it has to keep chasing the tail of the list to update its head pointer, and the list can grow indefinitely. This would behave almost the same as having a spinlock on the list.
@zfigura I think this is still salvageable; I have a rough idea now, and I will try to flesh it out and update the patch (unless I find holes before I finish it).
On Thu Aug 24 20:31:16 2023 +0000, Zebediah Figura wrote:
> It's not a problem if the entire stack is built with MinGW, because MinGW manually aligns the stack in its crt0, and gcc builds functions that expect and preserve 16-byte alignment. But if you ever mix MSVC- and MinGW-compiled libraries, yes, you will likely hit a misaligned stack.
I meant if you build a library with MinGW with the intent to have it loadable by any code on Windows, natively. You don't control the caller's alignment or whoever uses your library. Wouldn't this be a much more widespread issue in that case?
But it looks like -mincoming-stack-boundary=2 is the solution. We could add that somewhere (I don't know anything about configure/winegcc though). I had assumed it *was* already the default on MinGW i686; if not, it's probably a bug in MinGW rather than GCC as a whole, since that's the incoming boundary for the 32-bit Win32 ABI.
BTW, as far as I can see, -mincoming-stack-boundary means the incoming boundary (i.e. what "external" callers of your exported functions provide), so that's what matters here. You can only assume such a stack is 4-byte aligned on 32-bit Windows.
-mpreferred-stack-boundary is how you want to keep the stack in your own functions, i.e. how they "prefer" it. I think it should also be set to 2 to prevent pointless realignments unless necessary. (Note that realignment should already happen if the function requires it.)
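For concreteness, a hypothetical compile line (the file and function names are made up; the flag semantics are as documented by gcc):

```c
/* aligned_local.c (hypothetical), built with:
 *
 *   i686-w64-mingw32-gcc -O2 -c aligned_local.c \
 *       -mincoming-stack-boundary=2 -mpreferred-stack-boundary=2
 *
 * incoming=2: callers are only assumed to provide 2^2 = 4-byte
 * stack alignment, matching the actual 32-bit Win32 ABI.
 * preferred=2: our own frames are kept at 4 bytes, avoiding
 * gratuitous realignment.
 *
 * gcc must then realign the stack in this function, because the
 * local below needs 16 bytes, more than the assumed incoming 4: */
int with_aligned_local( void )
{
    int buffer[4] __attribute__((aligned(16))) = { 1, 2, 3, 4 };
    return buffer[0] + buffer[3];
}
```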
> Wouldn't this be a much more widespread issue in that case?
Only if you have objects with alignment requirements on the stack, which is probably reasonably rare.
I am not against setting `incoming-stack-boundary/preferred-stack-boundary` though.
> I meant if you build a library with MinGW with the intent to have it loadable by any code on Windows, natively. You don't control the caller's alignment or whoever uses your library. Wouldn't this be a much more widespread issue in that case?
It would require the library to (a) distribute binaries prebuilt with MinGW (which I get the impression is rare compared to building them with MSVC, even in the Free Software world), (b) manually use SSE or aligned types, (c) *not* build their libraries with -msse2 or anything.
And for all we know, maybe it *is* a widespread issue, and all of those libraries just end up using -mstackrealign or something.
> -mpreferred-stack-boundary is how you want to keep the stack in your own functions, i.e. how they "prefer" it. I think it should also be set to 2 to prevent pointless realignments unless necessary. (Note that realignment should already happen if the function requires it.)
See, that's what it sounds like just from the name, but then why is there a warning about ABI in the documentation? Why would setting it, as Richard Biener suggests, make a difference? And for that matter, a truly static function that isn't exposed as a function pointer doesn't need to *have* an ABI; the compiler can and will pass the parameters however it feels like.
On Fri Aug 25 17:18:18 2023 +0000, Zebediah Figura wrote:
> See, that's what it sounds like just from the name, but then why is there a warning about ABI in the documentation? Why would setting it, as Richard Biener suggests, make a difference?
Well, it can make a difference if you prefer "less" alignment than the ABI: if you call a function that expects 16-byte alignment while your preferred alignment is 4 bytes, you won't align the stack to 16 before the call, and the callee will probably crash.
Of course this shouldn't apply to Win32 PE DLLs. GCC seems a bit stubborn about that: just because Linux enforced 16-byte alignment in its ABI at some point doesn't mean the same holds on Windows.
I don't even understand why the GCC devs have so many workarounds for this. Why does -msse2 or SSE matter at all? The simple fact is that incoming alignment is only guaranteed to be 4 bytes on some platforms, such as i686 MinGW (32-bit Windows PE), and GCC should realign the stack *if* it needs more alignment than that, SSE or not. **Any** variable requiring more alignment should enforce that.
Why complicate this with SSE checks? It's not like SSE is special here; it's all about variable alignment. They should simply set the default incoming and preferred boundaries to 2 (i.e. 4 bytes) on MinGW i686, but I think we should also set them in Wine to deal with old compilers, if upstream ever patches this in the first place.
BTW, here's an example of incoming=2 and preferred=4: gcc will assume the incoming stack is 4-byte aligned (if the function is not static, that is, since it doesn't know the caller), and it will realign it, since it "prefers" 16 bytes, whether the function actually needs 16-byte alignment or not.
incoming=2 and preferred=2 won't realign unless a variable explicitly requires more than 4-byte alignment, so it's the ideal setting, and how MSVC behaves when building 32-bit code.
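A hypothetical illustration of the contrast (compile lines made up; the behaviour in the comments is as described above, not independently verified):

```c
/* flags_demo.c (hypothetical) */
void callee( int * );

int plain( int x )
{
    int local = x;      /* nothing here needs more than 4-byte alignment */
    callee( &local );
    return local;
}

/* incoming=2, preferred=4:
 *   i686-w64-mingw32-gcc -O2 -mincoming-stack-boundary=2 \
 *       -mpreferred-stack-boundary=4 -c flags_demo.c
 *   gcc assumes callers provide only 4-byte alignment but "prefers"
 *   2^4 = 16 bytes, so it realigns the frame of plain() even though
 *   nothing in it needs that.
 *
 * incoming=2, preferred=2:
 *   i686-w64-mingw32-gcc -O2 -mincoming-stack-boundary=2 \
 *       -mpreferred-stack-boundary=2 -c flags_demo.c
 *   no realignment: nothing requires more than 4 bytes, matching
 *   how MSVC builds 32-bit code. */
```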
Why do we have an unaligned srwlock test?