Based on [a patch](https://www.winehq.org/mailman3/hyperkitty/list/wine-devel@winehq.org/messag...) by Jinoh Kang (@iamahuman) from February 2022.
I removed the need for the event object and implemented fast paths for Linux and macOS. On macOS 10.14+, `thread_get_register_pointer_values` is called on every thread of the process. On Linux 4.14+, `membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, ...)` is used. On x86 Linux <= 4.13, `madvise(..., MADV_DONTNEED)` is used, which sends IPIs to all cores, causing them to execute a memory barrier. On non-x86 Linux <= 4.13 and on other platforms, the fallback path using APCs is used.
-- v9: ntdll: Add thread_get_register_pointer_values-based implementation of NtFlushProcessWriteBuffers. ntdll: Add sys_membarrier-based implementation of NtFlushProcessWriteBuffers. ntdll: Add MADV_DONTNEED-based implementation of NtFlushProcessWriteBuffers.
From: Torge Matthies tmatthies@codeweavers.com
Credits to Avi Kivity (scylladb) and Aliaksei Kandratsenka (gperftools) for this trick, see [1].
[1] https://github.com/scylladb/seastar/commit/77a58e4dc020233f66fccb8d9e8f7a8b7... --- dlls/ntdll/unix/virtual.c | 52 +++++++++++++++++++++++++++++++++++++- tools/winapi/nativeapi.dat | 1 + 2 files changed, 52 insertions(+), 1 deletion(-)
diff --git a/dlls/ntdll/unix/virtual.c b/dlls/ntdll/unix/virtual.c index 8087a12785c..de8f8b6ebc1 100644 --- a/dlls/ntdll/unix/virtual.c +++ b/dlls/ntdll/unix/virtual.c @@ -215,6 +215,11 @@ struct range_entry static struct range_entry *free_ranges; static struct range_entry *free_ranges_end;
+#if defined(__linux__) && (defined(__i386__) || defined(__x86_64__)) +static void *dontneed_page; +static pthread_mutex_t dontneed_page_mutex = PTHREAD_MUTEX_INITIALIZER; +#endif +
static inline BOOL is_beyond_limit( const void *addr, size_t size, const void *limit ) { @@ -5174,13 +5179,58 @@ NTSTATUS WINAPI NtFlushInstructionCache( HANDLE handle, const void *addr, SIZE_T }
+static BOOL try_madvise( void ) +{ + BOOL success = FALSE; + char *mem; + + pthread_mutex_lock(&dontneed_page_mutex); + /* Credits to Avi Kivity (scylladb) and Aliaksei Kandratsenka (gperftools) for this trick, + see https://github.com/scylladb/seastar/commit/77a58e4dc020233f66fccb8d9e8f7a8b7... */ + mem = dontneed_page; + if (!mem) + { + int ret; + /* Allocate one page of memory that we can call madvise() on */ + mem = anon_mmap_alloc( page_size, PROT_READ | PROT_WRITE ); + if (mem == MAP_FAILED) + goto failed; + /* If the memory is locked, e.g. by a call to mlockall(MCL_FUTURE), the madvise() call below + will fail with error EINVAL, so unlock it here */ + ret = munlock( mem, page_size ); + /* munlock() may fail on old kernels if we don't have sufficient permissions, but that is not + a problem since in that case we didn't have permission to lock the memory either */ + if (ret && errno != EPERM) + goto failed; + dontneed_page = mem; + } + /* Force the page into memory to make madvise() have real work to do */ + *mem = 3; + /* Evict the page from memory to force the kernel to send an IPI to all threads of this process, + which has the side effect of executing a memory barrier in those threads */ + success = !madvise( mem, page_size, MADV_DONTNEED ); +failed: + pthread_mutex_unlock(&dontneed_page_mutex); + return success; +} + + /********************************************************************** * NtFlushProcessWriteBuffers (NTDLL.@) */ void WINAPI NtFlushProcessWriteBuffers(void) { static int once = 0; - if (!once++) FIXME( "stub\n" ); + if (try_madvise()) + { +#ifdef __aarch64__ + /* Some ARMv8 processors can broadcast TLB invalidations using the TLBI instruction, + the madvise trick does not work on those */ + if (!once++) FIXME( "memory barrier may not work on this platform\n" ); +#endif + return; + } + if (!once++) FIXME( "no implementation available on this platform\n" ); }
diff --git a/tools/winapi/nativeapi.dat b/tools/winapi/nativeapi.dat index ade20b5ee68..5512c4f1833 100644 --- a/tools/winapi/nativeapi.dat +++ b/tools/winapi/nativeapi.dat @@ -134,6 +134,7 @@ log10 logb longjmp lseek +madvise malloc mblen memccpy
From: Torge Matthies tmatthies@codeweavers.com
Uses the MEMBARRIER_CMD_PRIVATE_EXPEDITED membarrier command introduced in Linux 4.14. --- dlls/ntdll/unix/virtual.c | 49 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-)
diff --git a/dlls/ntdll/unix/virtual.c b/dlls/ntdll/unix/virtual.c index de8f8b6ebc1..e90bdb3abfb 100644 --- a/dlls/ntdll/unix/virtual.c +++ b/dlls/ntdll/unix/virtual.c @@ -39,6 +39,9 @@ #ifdef HAVE_SYS_SYSINFO_H # include <sys/sysinfo.h> #endif +#ifdef HAVE_SYS_SYSCALL_H +# include <sys/syscall.h> +#endif #ifdef HAVE_SYS_SYSCTL_H # include <sys/sysctl.h> #endif @@ -215,10 +218,16 @@ struct range_entry static struct range_entry *free_ranges; static struct range_entry *free_ranges_end;
-#if defined(__linux__) && (defined(__i386__) || defined(__x86_64__)) +#ifdef __linux__ +#ifdef __NR_membarrier +static BOOL membarrier_exp_available; +static pthread_once_t membarrier_init_once = PTHREAD_ONCE_INIT; +#endif +#if defined(__i386__) || defined(__x86_64__) static void *dontneed_page; static pthread_mutex_t dontneed_page_mutex = PTHREAD_MUTEX_INITIALIZER; #endif +#endif
static inline BOOL is_beyond_limit( const void *addr, size_t size, const void *limit ) @@ -5179,6 +5188,42 @@ NTSTATUS WINAPI NtFlushInstructionCache( HANDLE handle, const void *addr, SIZE_T }
+#if defined(__linux__) && defined(__NR_membarrier) +#define MEMBARRIER_CMD_QUERY 0x00 +#define MEMBARRIER_CMD_PRIVATE_EXPEDITED 0x08 +#define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED 0x10 + + +static int membarrier( int cmd, unsigned int flags, int cpu_id ) +{ + return syscall( __NR_membarrier, cmd, flags, cpu_id ); +} + + +static void membarrier_init( void ) +{ + static const int exp_required_cmds = + MEMBARRIER_CMD_PRIVATE_EXPEDITED | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED; + int available_cmds = membarrier( MEMBARRIER_CMD_QUERY, 0, 0 ); + if (available_cmds == -1) + return; + if ((available_cmds & exp_required_cmds) == exp_required_cmds) + membarrier_exp_available = !membarrier( MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0, 0 ); +} + + +static BOOL try_exp_membarrier( void ) +{ + pthread_once(&membarrier_init_once, membarrier_init); + if (!membarrier_exp_available) + return FALSE; + return !membarrier( MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0 ); +} +#else +static BOOL try_exp_membarrier( void ) { return 0; } +#endif + + static BOOL try_madvise( void ) { BOOL success = FALSE; @@ -5221,6 +5266,8 @@ failed: void WINAPI NtFlushProcessWriteBuffers(void) { static int once = 0; + if (try_exp_membarrier()) + return; if (try_madvise()) { #ifdef __aarch64__
From: Torge Matthies tmatthies@codeweavers.com
--- dlls/ntdll/unix/virtual.c | 68 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+)
diff --git a/dlls/ntdll/unix/virtual.c b/dlls/ntdll/unix/virtual.c index e90bdb3abfb..c5a2f878e3b 100644 --- a/dlls/ntdll/unix/virtual.c +++ b/dlls/ntdll/unix/virtual.c @@ -65,6 +65,9 @@ #if defined(__APPLE__) # include <mach/mach_init.h> # include <mach/mach_vm.h> +# include <mach/task.h> +# include <mach/thread_state.h> +# include <mach/vm_map.h> #endif
#include "ntstatus.h" @@ -218,6 +221,11 @@ struct range_entry static struct range_entry *free_ranges; static struct range_entry *free_ranges_end;
+#ifdef __APPLE__ +static kern_return_t (*p_thread_get_register_pointer_values)( thread_t, uintptr_t*, size_t*, uintptr_t* ); +static pthread_once_t tgrpvs_init_once = PTHREAD_ONCE_INIT; +#endif + #ifdef __linux__ #ifdef __NR_membarrier static BOOL membarrier_exp_available; @@ -5188,6 +5196,64 @@ NTSTATUS WINAPI NtFlushInstructionCache( HANDLE handle, const void *addr, SIZE_T }
+#ifdef __APPLE__ + +static void tgrpvs_init( void ) +{ + p_thread_get_register_pointer_values = dlsym( RTLD_DEFAULT, "thread_get_register_pointer_values" ); +} + +static BOOL try_mach_tgrpvs( void ) +{ + /* Taken from https://github.com/dotnet/runtime/blob/7be37908e5a1cbb83b1062768c1649827eeac... */ + mach_msg_type_number_t count, i = 0; + thread_act_array_t threads; + kern_return_t kret; + BOOL success = FALSE; + + pthread_once(&tgrpvs_init_once, tgrpvs_init); + if (!p_thread_get_register_pointer_values) + return FALSE; + + /* Get references to all threads of this process */ + kret = task_threads( mach_task_self(), &threads, &count ); + if (kret) + return FALSE; + + /* Iterate through the threads in the list */ + while (i < count) + { + uintptr_t reg_values[128]; + size_t reg_count = ARRAY_SIZE( reg_values ); + uintptr_t sp; + + /* Request the thread's register pointer values to force the thread to go through a memory barrier */ + kret = p_thread_get_register_pointer_values( threads[i], &sp, &reg_count, reg_values ); + /* This function always fails when querying Rosetta's exception handling thread, so we only treat + KERN_INSUFFICIENT_BUFFER_SIZE as an error, like .NET core does. 
*/ + if (kret == KERN_INSUFFICIENT_BUFFER_SIZE) + goto fail; + + /* Deallocate thread reference once we're done with it */ + kret = mach_port_deallocate( mach_task_self(), threads[i++] ); + if (kret) + goto fail; + } + success = TRUE; +fail: + /* Deallocate remaining thread references */ + while (i < count) + mach_port_deallocate( mach_task_self(), threads[i++] ); + /* Deallocate thread list */ + vm_deallocate( mach_task_self(), (vm_address_t)threads, count * sizeof(threads[0]) ); + return success; +} + +#else +static BOOL try_mach_tgrpvs( void ) { return 0; } +#endif + + #if defined(__linux__) && defined(__NR_membarrier) #define MEMBARRIER_CMD_QUERY 0x00 #define MEMBARRIER_CMD_PRIVATE_EXPEDITED 0x08 @@ -5266,6 +5332,8 @@ failed: void WINAPI NtFlushProcessWriteBuffers(void) { static int once = 0; + if (try_mach_tgrpvs()) + return; if (try_exp_membarrier()) return; if (try_madvise())
Hi,
It looks like your patch introduced the new failures shown below. Please investigate and fix them before resubmitting your patch. If they are not new, fixing them anyway would help a lot. Otherwise please ask for the known failures list to be updated.
The tests also ran into some preexisting test failures. If you know how to fix them that would be helpful. See the TestBot job for the details:
The full results can be found at: https://testbot.winehq.org/JobDetails.pl?Key=126751
Your paranoid android.
=== debian11 (32 bit report) ===
ole32: clipboard.c:1067: Test failed: OleIsCurrentClipboard returned 0 clipboard.c:1135: Test failed: 1 WM_DRAWCLIPBOARD received