Based on [a patch](https://www.winehq.org/mailman3/hyperkitty/list/wine-devel@winehq.org/messag...) by Jinoh Kang (@iamahuman) from February 2022.
I removed the need for the event object and implemented fast paths for Linux and macOS. On macOS 10.14+, `thread_get_register_pointer_values` is called on every thread of the process. On Linux 4.14+, `membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, ...)` is used. On x86 Linux <= 4.13, `madvise(..., MADV_DONTNEED)` is used, which sends IPIs to all cores, causing them to execute a memory barrier. On non-x86 Linux <= 4.13 and on other platforms, the fallback path using APCs is used.
-- v8:
ntdll: Add thread_get_register_pointer_values-based implementation of NtFlushProcessWriteBuffers.
ntdll: Add sys_membarrier-based implementation of NtFlushProcessWriteBuffers.
ntdll: Add MADV_DONTNEED-based implementation of NtFlushProcessWriteBuffers.
From: Torge Matthies <tmatthies@codeweavers.com>
Credits to Avi Kivity (scylladb) and Aliaksei Kandratsenka (gperftools) for this trick, see [1].
[1] https://github.com/scylladb/seastar/commit/77a58e4dc020233f66fccb8d9e8f7a8b7... --- dlls/ntdll/unix/virtual.c | 42 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-)
diff --git a/dlls/ntdll/unix/virtual.c b/dlls/ntdll/unix/virtual.c index 8087a12785c..b5d7ba39d99 100644 --- a/dlls/ntdll/unix/virtual.c +++ b/dlls/ntdll/unix/virtual.c @@ -215,6 +215,11 @@ struct range_entry static struct range_entry *free_ranges; static struct range_entry *free_ranges_end;
+#if defined(__linux__) && (defined(__i386__) || defined(__x86_64__)) +static void *dontneed_page; +static pthread_mutex_t dontneed_page_mutex = PTHREAD_MUTEX_INITIALIZER; +#endif +
static inline BOOL is_beyond_limit( const void *addr, size_t size, const void *limit ) { @@ -5174,13 +5179,48 @@ NTSTATUS WINAPI NtFlushInstructionCache( HANDLE handle, const void *addr, SIZE_T }
+#if defined(__linux__) && (defined(__i386__) || defined(__x86_64__)) +static int try_madvise( void ) +{ + int ret = 0; + char *mem; + + pthread_mutex_lock(&dontneed_page_mutex); + /* Credits to Avi Kivity (scylladb) and Aliaksei Kandratsenka (gperftools) for this trick, + * see https://github.com/scylladb/seastar/commit/77a58e4dc020233f66fccb8d9e8f7a8b7... */ + mem = dontneed_page; + if (!mem) + { + mem = anon_mmap_alloc( page_size, PROT_READ | PROT_WRITE ); + if (mem == MAP_FAILED) + goto failed; + if (mlock( mem, page_size )) + { + munmap( mem, page_size ); + goto failed; + } + dontneed_page = mem; + } + *mem = 3; + ret = !madvise( mem, page_size, MADV_DONTNEED ); +failed: + pthread_mutex_unlock(&dontneed_page_mutex); + return ret; +} +#else +static int try_madvise( void ) { return 0; } +#endif + + /********************************************************************** * NtFlushProcessWriteBuffers (NTDLL.@) */ void WINAPI NtFlushProcessWriteBuffers(void) { static int once = 0; - if (!once++) FIXME( "stub\n" ); + if (try_madvise()) + return; + if (!once++) FIXME( "no implementation available on this platform\n" ); }
From: Torge Matthies <tmatthies@codeweavers.com>
Uses the MEMBARRIER_CMD_PRIVATE_EXPEDITED membarrier command introduced in Linux 4.14. --- dlls/ntdll/unix/virtual.c | 49 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-)
diff --git a/dlls/ntdll/unix/virtual.c b/dlls/ntdll/unix/virtual.c index b5d7ba39d99..10dd5a997fe 100644 --- a/dlls/ntdll/unix/virtual.c +++ b/dlls/ntdll/unix/virtual.c @@ -39,6 +39,9 @@ #ifdef HAVE_SYS_SYSINFO_H # include <sys/sysinfo.h> #endif +#ifdef HAVE_SYS_SYSCALL_H +# include <sys/syscall.h> +#endif #ifdef HAVE_SYS_SYSCTL_H # include <sys/sysctl.h> #endif @@ -215,10 +218,16 @@ struct range_entry static struct range_entry *free_ranges; static struct range_entry *free_ranges_end;
-#if defined(__linux__) && (defined(__i386__) || defined(__x86_64__)) +#ifdef __linux__ +#ifdef __NR_membarrier +static BOOL membarrier_exp_available; +static pthread_once_t membarrier_init_once = PTHREAD_ONCE_INIT; +#endif +#if defined(__i386__) || defined(__x86_64__) static void *dontneed_page; static pthread_mutex_t dontneed_page_mutex = PTHREAD_MUTEX_INITIALIZER; #endif +#endif
static inline BOOL is_beyond_limit( const void *addr, size_t size, const void *limit ) @@ -5179,6 +5188,42 @@ NTSTATUS WINAPI NtFlushInstructionCache( HANDLE handle, const void *addr, SIZE_T }
+#if defined(__linux__) && defined(__NR_membarrier) +#define MEMBARRIER_CMD_QUERY 0x00 +#define MEMBARRIER_CMD_PRIVATE_EXPEDITED 0x08 +#define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED 0x10 + + +static int membarrier( int cmd, unsigned int flags, int cpu_id ) +{ + return syscall( __NR_membarrier, cmd, flags, cpu_id ); +} + + +static void membarrier_init( void ) +{ + static const int exp_required_cmds = + MEMBARRIER_CMD_PRIVATE_EXPEDITED | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED; + int available_cmds = membarrier( MEMBARRIER_CMD_QUERY, 0, 0 ); + if (available_cmds == -1) + return; + if ((available_cmds & exp_required_cmds) == exp_required_cmds) + membarrier_exp_available = !membarrier( MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0, 0 ); +} + + +static int try_exp_membarrier( void ) +{ + pthread_once(&membarrier_init_once, membarrier_init); + if (!membarrier_exp_available) + return 0; + return !membarrier( MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0 ); +} +#else +static int try_exp_membarrier( void ) { return 0; } +#endif + + #if defined(__linux__) && (defined(__i386__) || defined(__x86_64__)) static int try_madvise( void ) { @@ -5218,6 +5263,8 @@ static int try_madvise( void ) { return 0; } void WINAPI NtFlushProcessWriteBuffers(void) { static int once = 0; + if (try_exp_membarrier()) + return; if (try_madvise()) return; if (!once++) FIXME( "no implementation available on this platform\n" );
From: Torge Matthies <tmatthies@codeweavers.com>
--- dlls/ntdll/unix/virtual.c | 62 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+)
diff --git a/dlls/ntdll/unix/virtual.c b/dlls/ntdll/unix/virtual.c index 10dd5a997fe..f2c5bd92de9 100644 --- a/dlls/ntdll/unix/virtual.c +++ b/dlls/ntdll/unix/virtual.c @@ -65,6 +65,9 @@ #if defined(__APPLE__) # include <mach/mach_init.h> # include <mach/mach_vm.h> +# include <mach/task.h> +# include <mach/thread_state.h> +# include <mach/vm_map.h> #endif
#include "ntstatus.h" @@ -218,6 +221,11 @@ struct range_entry static struct range_entry *free_ranges; static struct range_entry *free_ranges_end;
+#ifdef __APPLE__ +static kern_return_t (*p_thread_get_register_pointer_values)( thread_t, uintptr_t*, size_t*, uintptr_t* ); +static pthread_once_t tgrpvs_init_once = PTHREAD_ONCE_INIT; +#endif + #ifdef __linux__ #ifdef __NR_membarrier static BOOL membarrier_exp_available; @@ -5188,6 +5196,58 @@ NTSTATUS WINAPI NtFlushInstructionCache( HANDLE handle, const void *addr, SIZE_T }
+#ifdef __APPLE__ + +static void tgrpvs_init( void ) +{ + p_thread_get_register_pointer_values = dlsym( RTLD_DEFAULT, "thread_get_register_pointer_values" ); +} + +static int try_mach_tgrpvs( void ) +{ + /* Taken from https://github.com/dotnet/runtime/blob/7be37908e5a1cbb83b1062768c1649827eeac... */ + mach_msg_type_number_t count, i; + thread_act_array_t threads; + kern_return_t kret; + int ret = 0; + + pthread_once(&tgrpvs_init_once, tgrpvs_init); + if (!p_thread_get_register_pointer_values) + return 0; + + kret = task_threads( mach_task_self(), &threads, &count ); + if (kret) + return 0; + + for (i = 0; i < count; i++) + { + uintptr_t reg_values[128]; + size_t reg_count = ARRAY_SIZE( reg_values ); + uintptr_t sp; + + /* This function always fails when querying Rosetta's exception handling thread, so we will only handle + KERN_INSUFFICIENT_BUFFER_SIZE as an error, like .NET core does. */ + kret = p_thread_get_register_pointer_values( threads[i], &sp, &reg_count, reg_values ); + if (kret == KERN_INSUFFICIENT_BUFFER_SIZE) + goto fail; + + kret = mach_port_deallocate( mach_task_self(), threads[i] ); + if (kret) + goto fail; + } + ret = 1; +fail: + for (; i < count; i++) + mach_port_deallocate( mach_task_self(), threads[i] ); + vm_deallocate( mach_task_self(), (vm_address_t)threads, count * sizeof(threads[0]) ); + return ret; +} + +#else +static int try_mach_tgrpvs( void ) { return 0; } +#endif + + #if defined(__linux__) && defined(__NR_membarrier) #define MEMBARRIER_CMD_QUERY 0x00 #define MEMBARRIER_CMD_PRIVATE_EXPEDITED 0x08 @@ -5263,6 +5323,8 @@ static int try_madvise( void ) { return 0; } void WINAPI NtFlushProcessWriteBuffers(void) { static int once = 0; + if (try_mach_tgrpvs()) + return; if (try_exp_membarrier()) return; if (try_madvise())
Jinoh Kang (@iamahuman) commented about dlls/ntdll/unix/virtual.c:
}
+#if defined(__linux__) && (defined(__i386__) || defined(__x86_64__))
Why restrict to x86?
Jinoh Kang (@iamahuman) commented about dlls/ntdll/unix/virtual.c:
+        if (mem == MAP_FAILED)
+            goto failed;
+        if (mlock( mem, page_size ))
+        {
+            munmap( mem, page_size );
+            goto failed;
+        }
+        dontneed_page = mem;
+    }
+    *mem = 3;
+    ret = !madvise( mem, page_size, MADV_DONTNEED );
+failed:
+    pthread_mutex_unlock(&dontneed_page_mutex);
+    return ret;
+}
+#else
Condition comments? (Also below)