The `RtlRunOnce` family of functions are implemented using (a variant of) the Double Checked Locking Pattern (DCLP). The DCLP requires memory fences for correctness, on both the writer and reader sides. It's pretty obvious why it's needed on the writer side, but quite surprising that any is needed on the reader side! On strong memory model architectures (x86 and x86_64), only compiler-level fences are required. On weak memory model architectures like ARM64, instead, you need both CPU and compiler fences.
That's explained well in books like _Concurrent Programming on Windows_ by Joe Duffy and in online resources like [1].
The Wine implementation has fences on the writing side (`RtlRunOnceComplete`). That's because `InterlockedCompareExchangePointer` inserts a full memory fence. However some code paths on the reader side (`RtlRunOnceBeginInitialize`) are missing fences, specifically the (`RTL_RUN_ONCE_CHECK_ONLY`) branch and the (`!RTL_RUN_ONCE_CHECK_ONLY && (val & 3) == 2`) branch.
~~Add the missing fences using GCC's atomic builtins [2]~~ **EDIT:** using `ReadPointerAcquire` now
Note: with this MR, the generated code should change only for ARM64
### References:
1. [Double-Checked Locking is Fixed In C++11](https://preshing.com/20130930/double-checked-locking-is-fixed-in-cpp11/) 2. [GCC's Built-in Functions for Memory Model Aware Atomic Operations](https://gcc.gnu.org/onlinedocs/gcc/_005f_005fatomic-Builtins.html)
-- v2: ntdll: Use ReadPointerAcquire to get pointer value in RtlRunOnceBeginInitialize include: add atomic read/write of pointers
From: Yuxuan Shui yshui@codeweavers.com
--- include/winnt.h | 68 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 67 insertions(+), 1 deletion(-)
diff --git a/include/winnt.h b/include/winnt.h index e8a47fec963..c94d1e497ad 100644 --- a/include/winnt.h +++ b/include/winnt.h @@ -4334,7 +4334,7 @@ typedef struct _ACL {
typedef enum _ACL_INFORMATION_CLASS { - AclRevisionInformation = 1, + AclRevisionInformation = 1, AclSizeInformation } ACL_INFORMATION_CLASS;
@@ -7149,6 +7149,18 @@ static FORCEINLINE LONG64 ReadAcquire64( LONG64 const volatile *src ) return value; }
+static FORCEINLINE void* ReadPointerAcquire( void* const volatile *src ) +{ + void *value; +#ifdef _WIN64 + value = (void *)__WINE_LOAD64_NO_FENCE( (INT64 const volatile *)src ); +#else + value = (void *)__WINE_LOAD32_NO_FENCE( (INT32 const volatile *)src ); +#endif + __wine_memory_barrier_acq_rel(); + return value; +} + static FORCEINLINE LONG ReadNoFence( LONG const volatile *src ) { LONG value = __WINE_LOAD32_NO_FENCE( (int const volatile *)src ); @@ -7170,6 +7182,17 @@ static FORCEINLINE LONG64 ReadNoFence64( LONG64 const volatile *src ) return value; }
+static FORCEINLINE void* ReadPointerNoFence( void* const volatile *src ) +{ + void *value; +#ifdef _WIN64 + value = (void *)__WINE_LOAD64_NO_FENCE( (INT64 const volatile *)src ); +#else + value = (void *)__WINE_LOAD32_NO_FENCE( (INT32 const volatile *)src ); +#endif + return value; +} + static FORCEINLINE void WriteRelease( LONG volatile *dest, LONG value ) { __wine_memory_barrier_acq_rel(); @@ -7190,11 +7213,30 @@ static FORCEINLINE void WriteRelease64( LONG64 volatile *dest, LONG64 value ) #endif }
+static FORCEINLINE void WritePointerRelease( void* volatile *dest, void* value ) +{ + __wine_memory_barrier_acq_rel(); +#ifdef _WIN64 + __WINE_STORE64_NO_FENCE( (INT64 volatile *)dest, (INT64)value ); +#else + __WINE_STORE32_NO_FENCE( (INT32 volatile *)dest, (INT32)value ); +#endif +} + static FORCEINLINE void WriteNoFence( LONG volatile *dest, LONG value ) { __WINE_STORE32_NO_FENCE( (int volatile *)dest, value ); }
+static FORCEINLINE void WritePointerNoFence( void* volatile *dest, void* value ) +{ +#ifdef _WIN64 + __WINE_STORE64_NO_FENCE( (INT64 volatile *)dest, (INT64)value ); +#else + __WINE_STORE32_NO_FENCE( (INT32 volatile *)dest, (INT32)value ); +#endif +} + #elif defined(__GNUC__)
static FORCEINLINE BOOLEAN WINAPI BitScanForward(DWORD *index, DWORD mask) @@ -7374,6 +7416,13 @@ static FORCEINLINE LONG64 ReadAcquire64( LONG64 const volatile *src ) return value; }
+static FORCEINLINE void *ReadPointerAcquire( void *const volatile *src ) +{ + void *value; + __WINE_ATOMIC_LOAD_ACQUIRE( src, &value ); + return value; +} + static FORCEINLINE LONG ReadNoFence( LONG const volatile *src ) { LONG value; @@ -7392,6 +7441,13 @@ static FORCEINLINE LONG64 ReadNoFence64( LONG64 const volatile *src ) return value; }
+static FORCEINLINE void *ReadPointerNoFence( void *const volatile *src ) +{ + void *value; + __WINE_ATOMIC_LOAD_RELAXED( src, &value ); + return value; +} + static FORCEINLINE void WriteRelease( LONG volatile *dest, LONG value ) { __WINE_ATOMIC_STORE_RELEASE( dest, &value ); @@ -7411,6 +7467,16 @@ static FORCEINLINE void WriteNoFence( LONG volatile *dest, LONG value ) __WINE_ATOMIC_STORE_RELAXED( dest, &value ); }
+static FORCEINLINE void WritePointerRelease( void *volatile *src, void *value ) +{ + __WINE_ATOMIC_STORE_RELEASE( src, &value ); +} + +static FORCEINLINE void WritePointerNoFence( void *volatile *src, void *value ) +{ + __WINE_ATOMIC_STORE_RELAXED( src, &value ); +} + static FORCEINLINE DECLSPEC_NORETURN void __fastfail(unsigned int code) { #if defined(__x86_64__) || defined(__i386__)
From: Luca Bacci luca.bacci982@gmail.com
This is needed on architectures with weak memory models like ARM64. --- dlls/ntdll/sync.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/dlls/ntdll/sync.c b/dlls/ntdll/sync.c index 522ce0a2142..3f8aa0b399d 100644 --- a/dlls/ntdll/sync.c +++ b/dlls/ntdll/sync.c @@ -61,7 +61,7 @@ DWORD WINAPI RtlRunOnceBeginInitialize( RTL_RUN_ONCE *once, ULONG flags, void ** { if (flags & RTL_RUN_ONCE_CHECK_ONLY) { - ULONG_PTR val = (ULONG_PTR)once->Ptr; + ULONG_PTR val = (ULONG_PTR)ReadPointerAcquire( &once->Ptr );
if (flags & RTL_RUN_ONCE_ASYNC) return STATUS_INVALID_PARAMETER; if ((val & 3) != 2) return STATUS_UNSUCCESSFUL; @@ -71,7 +71,7 @@ DWORD WINAPI RtlRunOnceBeginInitialize( RTL_RUN_ONCE *once, ULONG flags, void **
for (;;) { - ULONG_PTR next, val = (ULONG_PTR)once->Ptr; + ULONG_PTR next, val = (ULONG_PTR)ReadPointerAcquire( &once->Ptr );
switch (val & 3) {
Jinoh Kang (@iamahuman) commented about dlls/ntdll/sync.c:
for (;;) { ULONG_PTR val = (ULONG_PTR)once->Ptr;
```suggestion:-0+0 ULONG_PTR val = (ULONG_PTR)ReadPointerAcquire( &once->Ptr ); ```
`RTL_RUN_ONCE_ASYNC` permits multiple threads to race to completion. Non-atomic read here may lead to data race, and lack of ordering if other thread has already completed the initialization (run once).
Also, documentation for acquire-release pairing would be helpful. (Here, the InterlockedCompareExchangePointer is doing the release.)