64-bit volatile accesses are not atomic on i386. Both i686-w64-mingw32-gcc and x86 MSVC split 64-bit accesses into a pair of load/store ops.
Fix this by using a FILD/FISTP pair, which is also used to implement C11 atomics by i686-w64-mingw32-gcc.
Fixes: f82b1c1fcf770a5d6fa02c3f286282be79a201b8
-- v5: include: Prevent misuse of __WINE_ATOMIC_* helper macros for non-atomic large accesses. server: Fix incorrect usage of __WINE_ATOMIC_STORE_RELEASE in SHARED_WRITE_BEGIN/SHARED_WRITE_END. include: Fix ReadNoFence64 on i386.
From: Jinoh Kang jinoh.kang.kr@gmail.com
64-bit volatile loads are not atomic on i386. Both i686-w64-mingw32-gcc and x86 MSVC split 64-bit loads into a pair of load ops.
Fix this by using a FILD/FISTP pair, which is also used to implement C11 atomic_load() by i686-w64-mingw32-gcc.
Fixes: f82b1c1fcf770a5d6fa02c3f286282be79a201b8 --- include/winnt.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-)
diff --git a/include/winnt.h b/include/winnt.h index 2547f841b7c..ef9e7bf6713 100644 --- a/include/winnt.h +++ b/include/winnt.h @@ -7122,7 +7122,16 @@ static FORCEINLINE LONG ReadNoFence( LONG const volatile *src )
static FORCEINLINE LONG64 ReadNoFence64( LONG64 const volatile *src ) { - LONG64 value = __WINE_LOAD64_NO_FENCE( (__int64 const volatile *)src ); + LONG64 value; +#if defined(__i386__) && _MSC_VER < 1700 + __asm { + mov eax, src + fild qword ptr [eax] + fistp value + } +#else + value = __WINE_LOAD64_NO_FENCE( (__int64 const volatile *)src ); +#endif return value; }
@@ -7315,7 +7324,11 @@ static FORCEINLINE LONG ReadNoFence( LONG const volatile *src ) static FORCEINLINE LONG64 ReadNoFence64( LONG64 const volatile *src ) { LONG64 value; +#ifdef __i386__ + __asm__ __volatile__( "fildq %1\n\tfistpq %0" : "=m" (value) : "m" (*src) : "memory", "st" ); +#else __WINE_ATOMIC_LOAD_RELAXED( src, &value ); +#endif return value; }
From: Jinoh Kang jinoh.kang.kr@gmail.com
64-bit volatile stores are not atomic on i386. Both i686-w64-mingw32-gcc and x86 MSVC split 64-bit stores into a pair of store ops.
Fix this by using a FILD/FISTP pair, which is also used to implement C11 atomic_store() by i686-w64-mingw32-gcc.
Fixes: fac940dfac314c2b1c120cf9ff8503259153c5a0 --- include/winnt.h | 26 ++++++++++++++++++++++++++ server/file.h | 4 ++-- 2 files changed, 28 insertions(+), 2 deletions(-)
diff --git a/include/winnt.h b/include/winnt.h index ef9e7bf6713..7e8ebdb6ca7 100644 --- a/include/winnt.h +++ b/include/winnt.h @@ -7081,13 +7081,16 @@ static FORCEINLINE void MemoryBarrier(void) #pragma intrinsic(__iso_volatile_load32) #pragma intrinsic(__iso_volatile_load64) #pragma intrinsic(__iso_volatile_store32) +#pragma intrinsic(__iso_volatile_store64) #define __WINE_LOAD32_NO_FENCE(src) (__iso_volatile_load32(src)) #define __WINE_LOAD64_NO_FENCE(src) (__iso_volatile_load64(src)) #define __WINE_STORE32_NO_FENCE(dest, value) (__iso_volatile_store32(dest, value)) +#define __WINE_STORE64_NO_FENCE(dest, value) (__iso_volatile_store64(dest, value)) #else /* _MSC_VER >= 1700 */ #define __WINE_LOAD32_NO_FENCE(src) (*(src)) #define __WINE_LOAD64_NO_FENCE(src) (*(src)) #define __WINE_STORE32_NO_FENCE(dest, value) ((void)(*(dest) = (value))) +#define __WINE_STORE64_NO_FENCE(dest, value) ((void)(*(dest) = (value))) #endif /* _MSC_VER >= 1700 */
#if defined(__i386__) || defined(__x86_64__) @@ -7141,6 +7144,20 @@ static FORCEINLINE void WriteRelease( LONG volatile *dest, LONG value ) __WINE_STORE32_NO_FENCE( (int volatile *)dest, value ); }
+static FORCEINLINE void WriteRelease64( LONG64 volatile *dest, LONG64 value ) +{ +#if defined(__i386__) && _MSC_VER < 1700 + __asm { + mov eax, dest + fild value + fistp qword ptr [eax] + } +#else + __wine_memory_barrier_acq_rel(); + __WINE_STORE64_NO_FENCE( (__int64 volatile *)dest, value ); +#endif +} + static FORCEINLINE void WriteNoFence( LONG volatile *dest, LONG value ) { __WINE_STORE32_NO_FENCE( (int volatile *)dest, value ); @@ -7337,6 +7354,15 @@ static FORCEINLINE void WriteRelease( LONG volatile *dest, LONG value ) __WINE_ATOMIC_STORE_RELEASE( dest, &value ); }
+static FORCEINLINE void WriteRelease64( LONG64 volatile *dest, LONG64 value ) +{ +#ifdef __i386__ + __asm__ __volatile__( "fildq %1\n\tfistpq %0" : "=m" (*dest) : "m" (value) : "memory", "st" ); +#else + __WINE_ATOMIC_STORE_RELEASE( dest, &value ); +#endif +} + static FORCEINLINE void WriteNoFence( LONG volatile *dest, LONG value ) { __WINE_ATOMIC_STORE_RELAXED( dest, &value ); diff --git a/server/file.h b/server/file.h index 4f5fc7b26f1..7368640a674 100644 --- a/server/file.h +++ b/server/file.h @@ -205,13 +205,13 @@ extern struct obj_locator get_shared_object_locator( const volatile void *object shared_object_t *__obj = CONTAINING_RECORD( shared, shared_object_t, shm ); \ LONG64 __seq = __obj->seq + 1, __end = __seq + 1; \ assert( (__seq & 1) != 0 ); \ - __WINE_ATOMIC_STORE_RELEASE( &__obj->seq, &__seq ); \ + WriteRelease64( &__obj->seq, __seq ); \ do
#define SHARED_WRITE_END \ while(0); \ assert( __seq == __obj->seq ); \ - __WINE_ATOMIC_STORE_RELEASE( &__obj->seq, &__end ); \ + WriteRelease64( &__obj->seq, __end ); \ } while(0)
/* device functions */
From: Jinoh Kang jinoh.kang.kr@gmail.com
--- include/winnt.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/include/winnt.h b/include/winnt.h index 7e8ebdb6ca7..f671b7d64a5 100644 --- a/include/winnt.h +++ b/include/winnt.h @@ -7313,10 +7313,10 @@ static FORCEINLINE void MemoryBarrier(void)
#if defined(__x86_64__) || defined(__i386__) /* On x86, Support old GCC with either no or buggy (GCC BZ#81316) __atomic_* support */ -#define __WINE_ATOMIC_LOAD_ACQUIRE(ptr, ret) do { *(ret) = *(ptr); __asm__ __volatile__( "" ::: "memory" ); } while (0) -#define __WINE_ATOMIC_LOAD_RELAXED(ptr, ret) do { *(ret) = *(ptr); } while (0) -#define __WINE_ATOMIC_STORE_RELEASE(ptr, val) do { __asm__ __volatile__( "" ::: "memory" ); *(ptr) = *(val); } while (0) -#define __WINE_ATOMIC_STORE_RELAXED(ptr, val) do { *(ptr) = *(val); } while (0) +#define __WINE_ATOMIC_LOAD_ACQUIRE(ptr, ret) do { C_ASSERT(sizeof(*(ptr)) <= sizeof(void *)); *(ret) = *(ptr); __asm__ __volatile__( "" ::: "memory" ); } while (0) +#define __WINE_ATOMIC_LOAD_RELAXED(ptr, ret) do { C_ASSERT(sizeof(*(ptr)) <= sizeof(void *)); *(ret) = *(ptr); } while (0) +#define __WINE_ATOMIC_STORE_RELEASE(ptr, val) do { C_ASSERT(sizeof(*(ptr)) <= sizeof(void *)); __asm__ __volatile__( "" ::: "memory" ); *(ptr) = *(val); } while (0) +#define __WINE_ATOMIC_STORE_RELAXED(ptr, val) do { C_ASSERT(sizeof(*(ptr)) <= sizeof(void *)); *(ptr) = *(val); } while (0) #else #define __WINE_ATOMIC_LOAD_ACQUIRE(ptr, ret) __atomic_load(ptr, ret, __ATOMIC_ACQUIRE) #define __WINE_ATOMIC_LOAD_RELAXED(ptr, ret) __atomic_load(ptr, ret, __ATOMIC_RELAXED)
v5: Fix more incorrect 64-bit atomic ops, reword commit messages a bit
William Horvath (@whrvt) commented about include/winnt.h:
#if defined(__x86_64__) || defined(__i386__)
```suggestion:-0+0 #if !defined(__clang__) && (defined(__GNUC__) && (__GNUC__ < 8 || (__GNUC__ == 8 && __GNUC_MINOR__ < 1)) && (defined(__x86_64__) || defined(__i386__))) ```
since this was [fixed](https://gcc.gnu.org/git/?p=gcc.git&a=commit;h=d8c40eff56f69877b33c697ded...) quite a while ago? This would make the include guards that your patch adds elsewhere quite messy, but I think the intrinsics should be preferred if they're not broken. Is there another reason we avoid them completely, besides what's in the GCC bugzilla link?
On Thu Feb 6 15:03:08 2025 +0000, William Horvath wrote:
#if !defined(__clang__) && (defined(__GNUC__) && (__GNUC__ < 8 || (__GNUC__ == 8 && __GNUC_MINOR__ < 1)) && (defined(__x86_64__) || defined(__i386__)))
since this was [fixed](https://gcc.gnu.org/git/?p=gcc.git&a=commit;h=d8c40eff56f69877b33c697ded...) quite a while ago? This would make the include guards that your patch adds elsewhere quite messy, but I think the intrinsics should be preferred if they're not broken. Is there another reason we avoid them completely, besides what's in the GCC bugzilla link?
This change is out of scope. Please open a new MR instead.
On Thu Feb 6 15:03:08 2025 +0000, Jinoh Kang wrote:
This change is out of scope. Please open a new MR instead.
How is it "out of scope" if it would also "Fix ReadNoFence64 on i386"? Anyways, it was just a suggestion, not a strong opinion. The current version is also fine.
On Thu Feb 6 15:05:49 2025 +0000, William Horvath wrote:
How is it "out of scope" if it would also "Fix ReadNoFence64 on i386"? Anyways, it was just a suggestion, not a strong opinion. The current version is also fine.
Note that Wine still supports old GCC versions. This suggestion is neither necessary nor sufficient to fix ReadNoFence64 on i386 on older GCC versions. Even if this suggestion is applied, other parts of the MR cannot be simplified.
"... on older GCC versions"
Yes, and I understand that this suggestion is neither *necessary* **nor** *sufficient* to fix the fact that it's currently broken on any GCC/Clang version. I didn't mean to imply this would obviate the need for the FILD/FISTP fix; I only meant that it might be clearer to primarily use the builtins (which are fixed in any relevant GCC/Clang version), and only use this as a fallback. You could simply have a macro like e.g. `BROKEN_ATOMIC_BUILTINS`, and check for `defined(__i386__) && defined(BROKEN_ATOMIC_BUILTINS)` instead of just `defined(__i386__)` here.
This can of course be done after this MR is accepted, which I'm willing to do.
On Thu Feb 6 15:42:17 2025 +0000, William Horvath wrote:
"... on older GCC versions"
Yes, and I understand that this suggestion neither *necessary* **nor** *sufficient* to fix the fact that it's currently broken on any GCC/Clang version. I didn't mean to imply this would obviate the need for the FILD/FISTP fix, I only meant that it might be clearer to primarily use the builtins (which are fixed in any relevant GCC/Clang version), and only use this as a fallback. You could simply have a macro like e.g. `BROKEN_ATOMIC_BUILTINS`, and check for `defined(__i386__) && defined(BROKEN_ATOMIC_BUILTINS)` instead of just `defined(__i386__)` here. This can of course can be done after this MR is accepted, which I'm willing to do.
Feel free to open a new MR now, regardless of this patch; it's just a matter of resolving merge conflicts when either gets to the tree first. Probably better since the discussion can be started early.
The `__iso_volatile_` intrinsics still don't generate the correct instructions in MSVC-mode Clang. You can force the `_MSC_VER<1700` path by adding `-fmsc-version=1699`, which gives the correct result.
Bottom left quadrant here: https://godbolt.org/z/j9c4EdvoT
On Mon Feb 10 15:14:05 2025 +0000, William Horvath wrote:
The `__iso_volatile_` intrinsics still don't generate the correct instructions in MSVC-mode Clang. You can force the `_MSC_VER<1700` path by adding `-fmsc-version=1699`, which gives the correct result. Bottom left quadrant here: https://godbolt.org/z/j9c4EdvoT
Not a blocker, though: !7303
This merge request was approved by Elizabeth Figura.