Based on [a patch](https://www.winehq.org/mailman3/hyperkitty/list/wine-devel@winehq.org/messag...) by Jinoh Kang (@iamahuman) from February 2022.
I removed the need for the event object and implemented fast paths for Linux and macOS. On macOS 10.14+, `thread_get_register_pointer_values` is called on every thread of the process. On Linux 4.14+, `membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, ...)` is used. On x86 Linux <= 4.13, `madvise(..., MADV_DONTNEED)` is used, which sends IPIs to all cores, causing them to execute a memory barrier. On non-x86 Linux <= 4.13 and on other platforms, the APC-based fallback path is used.
-- v7: ntdll: Add thread_get_register_pointer_values-based fast path for NtFlushProcessWriteBuffers. ntdll: Add sys_membarrier-based fast path to NtFlushProcessWriteBuffers. ntdll: Add MADV_DONTNEED-based fast path for NtFlushProcessWriteBuffers. ntdll: Implement NtFlushProcessWriteBuffers.
From: Torge Matthies tmatthies@codeweavers.com
Based on a patch by Jinoh Kang from February 2022 [1]. The following description is copied from said patch:
NtFlushProcessWriteBuffers is the NT equivalent of the Linux membarrier() system call. The .NET Framework garbage collector uses it to synchronize with other threads, so implementing it is required to avoid silent memory corruption.
[1] https://www.winehq.org/mailman3/hyperkitty/list/wine-devel@winehq.org/messag... --- dlls/ntdll/unix/server.c | 6 ++ dlls/ntdll/unix/virtual.c | 27 ++++++++- server/protocol.def | 19 ++++++- server/thread.c | 117 ++++++++++++++++++++++++++++++++++++++ server/thread.h | 1 + 5 files changed, 166 insertions(+), 4 deletions(-)
diff --git a/dlls/ntdll/unix/server.c b/dlls/ntdll/unix/server.c index 07d9c8d3acd..70639ed7a72 100644 --- a/dlls/ntdll/unix/server.c +++ b/dlls/ntdll/unix/server.c @@ -604,6 +604,12 @@ static void invoke_system_apc( const apc_call_t *call, apc_result_t *result, BOO if (!self) NtClose( wine_server_ptr_handle(call->dup_handle.dst_process) ); break; } + case APC_MEMORY_BARRIER: + { + MemoryBarrier(); + result->type = call->type; + break; + } default: server_protocol_error( "get_apc_request: bad type %d\n", call->type ); break; diff --git a/dlls/ntdll/unix/virtual.c b/dlls/ntdll/unix/virtual.c index 5dd482c0dbb..0bf0e67cb09 100644 --- a/dlls/ntdll/unix/virtual.c +++ b/dlls/ntdll/unix/virtual.c @@ -5181,8 +5181,31 @@ NTSTATUS WINAPI NtFlushInstructionCache( HANDLE handle, const void *addr, SIZE_T */ void WINAPI NtFlushProcessWriteBuffers(void) { - static int once = 0; - if (!once++) FIXME( "stub\n" ); + static pthread_mutex_t apc_memorybarrier_mutex = PTHREAD_MUTEX_INITIALIZER; + NTSTATUS status; + + pthread_mutex_lock( &apc_memorybarrier_mutex ); + + do + { + SERVER_START_REQ( flush_process_write_buffers ) + { + status = wine_server_call( req ); + } + SERVER_END_REQ; + } + while (status); + + do + { + select_op_t select_op; + select_op.membarrier.op = SELECT_MEMBARRIER; + status = server_select( &select_op, sizeof(select_op.membarrier), SELECT_INTERRUPTIBLE, + TIMEOUT_INFINITE, NULL, NULL ); + } + while (status); + + pthread_mutex_unlock( &apc_memorybarrier_mutex ); }
diff --git a/server/protocol.def b/server/protocol.def index 8c2fbeb4afe..ccaf6b40d8d 100644 --- a/server/protocol.def +++ b/server/protocol.def @@ -462,7 +462,8 @@ enum select_op SELECT_WAIT_ALL, SELECT_SIGNAL_AND_WAIT, SELECT_KEYED_EVENT_WAIT, - SELECT_KEYED_EVENT_RELEASE + SELECT_KEYED_EVENT_RELEASE, + SELECT_MEMBARRIER };
typedef union @@ -485,6 +486,10 @@ typedef union obj_handle_t handle; client_ptr_t key; } keyed_event; + struct + { + enum select_op op; /* SELECT_MEMBARRIER */ + } membarrier; } select_op_t;
enum apc_type @@ -503,7 +508,8 @@ enum apc_type APC_MAP_VIEW, APC_UNMAP_VIEW, APC_CREATE_THREAD, - APC_DUP_HANDLE + APC_DUP_HANDLE, + APC_MEMORY_BARRIER };
typedef struct @@ -622,6 +628,10 @@ typedef union unsigned int attributes; /* object attributes */ unsigned int options; /* duplicate options */ } dup_handle; + struct + { + enum apc_type type; /* APC_MEMORY_BARRIER */ + } memory_barrier; } apc_call_t;
typedef union @@ -1628,6 +1638,11 @@ enum server_fd_type @END
+/* Issue a memory barrier on other threads in the same process */ +@REQ(flush_process_write_buffers) +@END + + struct thread_info { timeout_t start_time; diff --git a/server/thread.c b/server/thread.c index 5f8493dd309..3aa81632df1 100644 --- a/server/thread.c +++ b/server/thread.c @@ -112,6 +112,41 @@ static const struct object_ops thread_apc_ops = thread_apc_destroy /* destroy */ };
+/* process-wide memory barriers */ + +struct memory_barrier +{ + struct object obj; /* object header */ +}; + +static void dump_memory_barrier( struct object *obj, int verbose ); +static int memory_barrier_signaled( struct object *obj, struct wait_queue_entry *entry ); + +static const struct object_ops memory_barrier_ops = +{ + sizeof(struct memory_barrier), /* size */ + &no_type, /* type */ + dump_memory_barrier, /* dump */ + add_queue, /* add_queue */ + remove_queue, /* remove_queue */ + memory_barrier_signaled, /* signaled */ + no_satisfied, /* satisfied */ + no_signal, /* signal */ + no_get_fd, /* get_fd */ + default_map_access, /* map_access */ + default_get_sd, /* get_sd */ + default_set_sd, /* set_sd */ + no_get_full_name, /* get_full_name */ + no_lookup_name, /* lookup_name */ + no_link_name, /* link_name */ + NULL, /* unlink_name */ + no_open_file, /* open_file */ + no_kernel_obj_list, /* get_kernel_obj_list */ + no_close_handle, /* close_handle */ + no_destroy /* destroy */ +}; + +struct memory_barrier *memory_barrier_obj;
/* thread CPU context */
@@ -249,6 +284,7 @@ static inline void init_thread_structure( struct thread *thread ) thread->token = NULL; thread->desc = NULL; thread->desc_len = 0; + thread->mb_apcs_pending = 0;
thread->creation_time = current_time; thread->exit_time = 0; @@ -306,6 +342,11 @@ struct thread *create_thread( int fd, struct process *process, const struct secu struct thread *thread; int request_pipe[2];
+ if (memory_barrier_obj) + grab_object( &memory_barrier_obj->obj ); + else if (!(memory_barrier_obj = alloc_object( &memory_barrier_ops ))) + return NULL; + if (fd == -1) { if (pipe( request_pipe ) == -1) @@ -441,12 +482,14 @@ static void cleanup_thread( struct thread *thread ) thread->desktop = 0; thread->desc = NULL; thread->desc_len = 0; + thread->mb_apcs_pending = 0; }
/* destroy a thread when its refcount is 0 */ static void destroy_thread( struct object *obj ) { struct thread *thread = (struct thread *)obj; + struct memory_barrier *mb = memory_barrier_obj; assert( obj->ops == &thread_ops );
list_remove( &thread->entry ); @@ -454,6 +497,9 @@ static void destroy_thread( struct object *obj ) release_object( thread->process ); if (thread->id) free_ptid( thread->id ); if (thread->token) release_object( thread->token ); + if (mb->obj.refcount == 1) + memory_barrier_obj = NULL; + release_object( &mb->obj ); }
/* dump a thread on stdout for debugging purposes */ @@ -526,6 +572,18 @@ static struct thread_apc *create_apc( struct object *owner, const apc_call_t *ca return apc; }
+static void dump_memory_barrier( struct object *obj, int verbose ) +{ + assert( obj->ops == &memory_barrier_ops ); + fprintf( stderr, "Memory barrier\n" ); +} + +static int memory_barrier_signaled( struct object *obj, struct wait_queue_entry *entry ) +{ + struct thread *thread = entry->wait->thread; + return !thread->mb_apcs_pending; +} + /* get a thread pointer from a thread id (and increment the refcount) */ struct thread *get_thread_from_id( thread_id_t id ) { @@ -1029,6 +1087,13 @@ static int select_on( const select_op_t *select_op, data_size_t op_size, client_ current->wait->key = select_op->keyed_event.key; break;
+ case SELECT_MEMBARRIER: + object = &memory_barrier_obj->obj; + if (!object) return 1; + ret = wait_on( select_op, 1, &object, flags, when ); + if (!ret) return 1; + break; + default: set_error( STATUS_INVALID_PARAMETER ); return 1; @@ -1165,6 +1230,16 @@ int thread_queue_apc( struct process *process, struct thread *thread, struct obj return ret; }
+static void finish_membarrier_apc( struct thread_apc *apc ) +{ + struct thread *thread = (struct thread *)apc->owner; + + assert( thread ); + assert( thread->mb_apcs_pending > 0 ); + if (--thread->mb_apcs_pending) + wake_up( &memory_barrier_obj->obj, 1 ); +} + /* cancel the async procedure call owned by a specific object */ void thread_cancel_apc( struct thread *thread, struct object *owner, enum apc_type type ) { @@ -1176,6 +1251,8 @@ void thread_cancel_apc( struct thread *thread, struct object *owner, enum apc_ty if (apc->owner != owner) continue; list_remove( &apc->entry ); apc->executed = 1; + if (apc->call.type == APC_MEMORY_BARRIER) + finish_membarrier_apc( apc ); wake_up( &apc->obj, 0 ); release_object( apc ); return; @@ -1206,6 +1283,8 @@ static void clear_apc_queue( struct list *queue ) struct thread_apc *apc = LIST_ENTRY( ptr, struct thread_apc, entry ); list_remove( &apc->entry ); apc->executed = 1; + if (apc->call.type == APC_MEMORY_BARRIER) + finish_membarrier_apc( apc ); wake_up( &apc->obj, 0 ); release_object( apc ); } @@ -1652,6 +1731,8 @@ DECL_HANDLER(select) apc->result.create_thread.handle = handle; clear_error(); /* ignore errors from the above calls */ } + if (apc->call.type == APC_MEMORY_BARRIER) /* wake up caller if membarriers done */ + finish_membarrier_apc( apc ); wake_up( &apc->obj, 0 ); close_handle( current->process, req->prev_apc ); release_object( apc ); @@ -1673,6 +1754,8 @@ DECL_HANDLER(select) else { apc->executed = 1; + if (apc->call.type == APC_MEMORY_BARRIER) + finish_membarrier_apc( apc ); wake_up( &apc->obj, 0 ); } release_object( apc ); @@ -2015,3 +2098,37 @@ DECL_HANDLER(get_next_thread) set_error( STATUS_NO_MORE_ENTRIES ); release_object( process ); } + +DECL_HANDLER(flush_process_write_buffers) +{ + struct process *process = current->process; + struct thread *thread; + apc_call_t call; + + assert( memory_barrier_obj ); + + memset( &call, 0, sizeof(call) ); + call.memory_barrier.type = APC_MEMORY_BARRIER; + + 
LIST_FOR_EACH_ENTRY( thread, &process->thread_list, struct thread, proc_entry ) + { + struct thread_apc *apc; + int success; + + /* Do not send a memory barrier APC to the current thread or a terminated thread. */ + if (thread == current || thread->state == TERMINATED) continue; + + if (!(apc = create_apc( &current->obj, &call ))) break; + + if ((success = queue_apc( NULL, thread, apc ))) + thread->mb_apcs_pending++; + + release_object( apc ); + + if (!success) + { + set_error( STATUS_UNSUCCESSFUL ); + break; + } + } +} diff --git a/server/thread.h b/server/thread.h index 8dcf966a90a..c9040704700 100644 --- a/server/thread.h +++ b/server/thread.h @@ -90,6 +90,7 @@ struct thread struct list kernel_object; /* list of kernel object pointers */ data_size_t desc_len; /* thread description length in bytes */ WCHAR *desc; /* thread description string */ + int mb_apcs_pending; /* number of APCs left for the current memory barrier */ };
extern struct thread *current;
From: Torge Matthies tmatthies@codeweavers.com
Credits to Avi Kivity (scylladb) and Aliaksei Kandratsenka (gperftools) for this trick, see [1].
[1] https://github.com/scylladb/seastar/commit/77a58e4dc020233f66fccb8d9e8f7a8b7... --- dlls/ntdll/unix/virtual.c | 54 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 4 deletions(-)
diff --git a/dlls/ntdll/unix/virtual.c b/dlls/ntdll/unix/virtual.c index 0bf0e67cb09..39871195462 100644 --- a/dlls/ntdll/unix/virtual.c +++ b/dlls/ntdll/unix/virtual.c @@ -215,6 +215,11 @@ struct range_entry static struct range_entry *free_ranges; static struct range_entry *free_ranges_end;
+#if defined(__linux__) && (defined(__i386__) || defined(__x86_64__)) +static void *dontneed_page; +static pthread_mutex_t dontneed_page_mutex = PTHREAD_MUTEX_INITIALIZER; +#endif +
static inline BOOL is_beyond_limit( const void *addr, size_t size, const void *limit ) { @@ -5176,10 +5181,40 @@ NTSTATUS WINAPI NtFlushInstructionCache( HANDLE handle, const void *addr, SIZE_T }
-/********************************************************************** - * NtFlushProcessWriteBuffers (NTDLL.@) - */ -void WINAPI NtFlushProcessWriteBuffers(void) +#if defined(__linux__) && (defined(__i386__) || defined(__x86_64__)) +static int try_madvise( void ) +{ + int ret = 0; + char *mem; + + pthread_mutex_lock(&dontneed_page_mutex); + /* Credits to Avi Kivity (scylladb) and Aliaksei Kandratsenka (gperftools) for this trick, + * see https://github.com/scylladb/seastar/commit/77a58e4dc020233f66fccb8d9e8f7a8b7... */ + mem = dontneed_page; + if (!mem) + { + mem = anon_mmap_alloc( page_size, PROT_READ | PROT_WRITE ); + if (mem == MAP_FAILED) + goto failed; + if (mlock( mem, page_size )) + { + munmap( mem, page_size ); + goto failed; + } + dontneed_page = mem; + } + *mem = 3; + ret = !madvise( mem, page_size, MADV_DONTNEED ); +failed: + pthread_mutex_unlock(&dontneed_page_mutex); + return ret; +} +#else +static int try_madvise( void ) { return 0; } +#endif + + +static void do_apc_memorybarrier( void ) { static pthread_mutex_t apc_memorybarrier_mutex = PTHREAD_MUTEX_INITIALIZER; NTSTATUS status; @@ -5209,6 +5244,17 @@ void WINAPI NtFlushProcessWriteBuffers(void) }
+/********************************************************************** + * NtFlushProcessWriteBuffers (NTDLL.@) + */ +void WINAPI NtFlushProcessWriteBuffers(void) +{ + if (try_madvise()) + return; + do_apc_memorybarrier(); +} + + /********************************************************************** * NtCreatePagingFile (NTDLL.@) */
From: Torge Matthies tmatthies@codeweavers.com
Uses the MEMBARRIER_CMD_PRIVATE_EXPEDITED membarrier command introduced in Linux 4.14. --- dlls/ntdll/unix/virtual.c | 49 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-)
diff --git a/dlls/ntdll/unix/virtual.c b/dlls/ntdll/unix/virtual.c index 39871195462..3c7a4c9f800 100644 --- a/dlls/ntdll/unix/virtual.c +++ b/dlls/ntdll/unix/virtual.c @@ -39,6 +39,9 @@ #ifdef HAVE_SYS_SYSINFO_H # include <sys/sysinfo.h> #endif +#ifdef HAVE_SYS_SYSCALL_H +# include <sys/syscall.h> +#endif #ifdef HAVE_SYS_SYSCTL_H # include <sys/sysctl.h> #endif @@ -215,10 +218,16 @@ struct range_entry static struct range_entry *free_ranges; static struct range_entry *free_ranges_end;
-#if defined(__linux__) && (defined(__i386__) || defined(__x86_64__)) +#ifdef __linux__ +#ifdef __NR_membarrier +static BOOL membarrier_exp_available; +static pthread_once_t membarrier_init_once = PTHREAD_ONCE_INIT; +#endif +#if defined(__i386__) || defined(__x86_64__) static void *dontneed_page; static pthread_mutex_t dontneed_page_mutex = PTHREAD_MUTEX_INITIALIZER; #endif +#endif
static inline BOOL is_beyond_limit( const void *addr, size_t size, const void *limit ) @@ -5181,6 +5190,42 @@ NTSTATUS WINAPI NtFlushInstructionCache( HANDLE handle, const void *addr, SIZE_T }
+#if defined(__linux__) && defined(__NR_membarrier) +#define MEMBARRIER_CMD_QUERY 0x00 +#define MEMBARRIER_CMD_PRIVATE_EXPEDITED 0x08 +#define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED 0x10 + + +static int membarrier( int cmd, unsigned int flags, int cpu_id ) +{ + return syscall( __NR_membarrier, cmd, flags, cpu_id ); +} + + +static void membarrier_init( void ) +{ + static const int exp_required_cmds = + MEMBARRIER_CMD_PRIVATE_EXPEDITED | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED; + int available_cmds = membarrier( MEMBARRIER_CMD_QUERY, 0, 0 ); + if (available_cmds == -1) + return; + if ((available_cmds & exp_required_cmds) == exp_required_cmds) + membarrier_exp_available = !membarrier( MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0, 0 ); +} + + +static int try_exp_membarrier( void ) +{ + pthread_once(&membarrier_init_once, membarrier_init); + if (!membarrier_exp_available) + return 0; + return !membarrier( MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0 ); +} +#else +static int try_exp_membarrier( void ) { return 0; } +#endif + + #if defined(__linux__) && (defined(__i386__) || defined(__x86_64__)) static int try_madvise( void ) { @@ -5249,6 +5294,8 @@ static void do_apc_memorybarrier( void ) */ void WINAPI NtFlushProcessWriteBuffers(void) { + if (try_exp_membarrier()) + return; if (try_madvise()) return; do_apc_memorybarrier();
From: Torge Matthies tmatthies@codeweavers.com
--- dlls/ntdll/unix/virtual.c | 62 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+)
diff --git a/dlls/ntdll/unix/virtual.c b/dlls/ntdll/unix/virtual.c index 3c7a4c9f800..91c84508f44 100644 --- a/dlls/ntdll/unix/virtual.c +++ b/dlls/ntdll/unix/virtual.c @@ -65,6 +65,9 @@ #if defined(__APPLE__) # include <mach/mach_init.h> # include <mach/mach_vm.h> +# include <mach/task.h> +# include <mach/thread_state.h> +# include <mach/vm_map.h> #endif
#include "ntstatus.h" @@ -218,6 +221,11 @@ struct range_entry static struct range_entry *free_ranges; static struct range_entry *free_ranges_end;
+#ifdef __APPLE__ +static kern_return_t (*p_thread_get_register_pointer_values)( thread_t, uintptr_t*, size_t*, uintptr_t* ); +static pthread_once_t tgrpvs_init_once = PTHREAD_ONCE_INIT; +#endif + #ifdef __linux__ #ifdef __NR_membarrier static BOOL membarrier_exp_available; @@ -5190,6 +5198,58 @@ NTSTATUS WINAPI NtFlushInstructionCache( HANDLE handle, const void *addr, SIZE_T }
+#ifdef __APPLE__ + +static void tgrpvs_init( void ) +{ + p_thread_get_register_pointer_values = dlsym( RTLD_DEFAULT, "thread_get_register_pointer_values" ); +} + +static int try_mach_tgrpvs( void ) +{ + /* Taken from https://github.com/dotnet/runtime/blob/7be37908e5a1cbb83b1062768c1649827eeac... */ + mach_msg_type_number_t count, i; + thread_act_array_t threads; + kern_return_t kret; + int ret = 0; + + pthread_once(&tgrpvs_init_once, tgrpvs_init); + if (!p_thread_get_register_pointer_values) + return 0; + + kret = task_threads( mach_task_self(), &threads, &count ); + if (kret) + return 0; + + for (i = 0; i < count; i++) + { + uintptr_t reg_values[128]; + size_t reg_count = ARRAY_SIZE( reg_values ); + uintptr_t sp; + + /* This function always fails when querying Rosetta's exception handling thread, so we will only handle + KERN_INSUFFICIENT_BUFFER_SIZE as an error, like .NET core does. */ + kret = p_thread_get_register_pointer_values( threads[i], &sp, &reg_count, reg_values ); + if (kret == KERN_INSUFFICIENT_BUFFER_SIZE) + goto fail; + + kret = mach_port_deallocate( mach_task_self(), threads[i] ); + if (kret) + goto fail; + } + ret = 1; +fail: + for (; i < count; i++) + mach_port_deallocate( mach_task_self(), threads[i] ); + vm_deallocate( mach_task_self(), (vm_address_t)threads, count * sizeof(threads[0]) ); + return ret; +} + +#else +static int try_mach_tgrpvs( void ) { return 0; } +#endif + + #if defined(__linux__) && defined(__NR_membarrier) #define MEMBARRIER_CMD_QUERY 0x00 #define MEMBARRIER_CMD_PRIVATE_EXPEDITED 0x08 @@ -5294,6 +5354,8 @@ static void do_apc_memorybarrier( void ) */ void WINAPI NtFlushProcessWriteBuffers(void) { + if (try_mach_tgrpvs()) + return; if (try_exp_membarrier()) return; if (try_madvise())
This is still too complicated. membarrier() and the macOS equivalent should be enough for most use cases. We can add more complex fallbacks later if they turn out to be truly necessary.
On Wed Oct 19 13:33:59 2022 +0000, Torge Matthies wrote:
Is this true for all architectures though? Or at least all that Wine intends to support? If so then yeah we can throw out the APC path.
According to the paper by Dave et al. that [I mentioned earlier here](https://gitlab.winehq.org/wine/wine/-/merge_requests/741#note_7626), the answer seems to be yes.
https://doi.org/10.1145/3241624.2926699
I don't see a reason to believe otherwise. Maybe just leave a fixme-once if you have doubts about it?