Supersedes !741.
On macOS 10.14+ `thread_get_register_pointer_values` is called on every thread of the process. On Linux 4.14+ `membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, ...)` is used. On x86 Linux <= 4.13 and on other platforms it falls back to calling `NtGetContextThread()` on each thread.
The fast path patches from @tmatthies are slightly modified in the following ways:
1. On unsupported platforms, the `try_*()` functions return `FALSE` instead of `0`. 2. `try_exp_membarrier()` is called first, then `try_mach_tgrpvs()`.
---
Known applications fixed by this MR:
- osu! (rhythm game) song selection menu stuttering - .NET CoreCLR GC - HotSpot JVM (-XX:+UseSystemMemoryBarrier) safepoints
-- v2: kernel32/tests: Add a store buffering litmus test involving FlushProcessWriteBuffers. ntdll: Add slow fallback implementation of NtFlushProcessWriteBuffers.
From: Jinoh Kang jinoh.kang.kr@gmail.com
--- dlls/kernel32/tests/virtual.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+)
diff --git a/dlls/kernel32/tests/virtual.c b/dlls/kernel32/tests/virtual.c index cfbfcac0950..466badf0433 100644 --- a/dlls/kernel32/tests/virtual.c +++ b/dlls/kernel32/tests/virtual.c @@ -54,6 +54,7 @@ static NTSTATUS (WINAPI *pNtProtectVirtualMemory)(HANDLE, PVOID *, SIZE_T *, ULO static NTSTATUS (WINAPI *pNtReadVirtualMemory)(HANDLE,const void *,void *,SIZE_T, SIZE_T *); static NTSTATUS (WINAPI *pNtWriteVirtualMemory)(HANDLE, void *, const void *, SIZE_T, SIZE_T *); static BOOL (WINAPI *pPrefetchVirtualMemory)(HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG); +static void (WINAPI *pFlushProcessWriteBuffers)(void);
/* ############################### */
@@ -4408,6 +4409,23 @@ static void test_ReadProcessMemory(void) free(buf); }
+static void test_FlushProcessWriteBuffers(void) +{ + unsigned int i; + + if (!pFlushProcessWriteBuffers) + { + win_skip("no FlushProcessWriteBuffers in kernel32\n"); + return; + } + + /* simple stress test */ + for (i = 0; i < 128; i++) + { + pFlushProcessWriteBuffers(); + } +} + START_TEST(virtual) { int argc; @@ -4465,6 +4483,7 @@ START_TEST(virtual) pNtReadVirtualMemory = (void *)GetProcAddress( hntdll, "NtReadVirtualMemory" ); pNtWriteVirtualMemory = (void *)GetProcAddress( hntdll, "NtWriteVirtualMemory" ); pPrefetchVirtualMemory = (void *)GetProcAddress( hkernelbase, "PrefetchVirtualMemory" ); + pFlushProcessWriteBuffers = (void *)GetProcAddress( hkernel32, "FlushProcessWriteBuffers" );
GetSystemInfo(&si); trace("system page size %#lx\n", si.dwPageSize); @@ -4490,6 +4509,7 @@ START_TEST(virtual) test_write_watch(); test_PrefetchVirtualMemory(); test_ReadProcessMemory(); + test_FlushProcessWriteBuffers(); #if defined(__i386__) || defined(__x86_64__) test_stack_commit(); #endif
From: Torge Matthies tmatthies@codeweavers.com
--- dlls/ntdll/tests/virtual.c | 11 +++++++++++ 1 file changed, 11 insertions(+)
diff --git a/dlls/ntdll/tests/virtual.c b/dlls/ntdll/tests/virtual.c index f74c078b420..b0bd86a2c48 100644 --- a/dlls/ntdll/tests/virtual.c +++ b/dlls/ntdll/tests/virtual.c @@ -46,6 +46,7 @@ static NTSTATUS (WINAPI *pNtMapViewOfSectionEx)(HANDLE, HANDLE, PVOID *, const L static NTSTATUS (WINAPI *pNtSetInformationVirtualMemory)(HANDLE, VIRTUAL_MEMORY_INFORMATION_CLASS, ULONG_PTR, PMEMORY_RANGE_ENTRY, PVOID, ULONG); +static NTSTATUS (WINAPI *pNtFlushProcessWriteBuffers)(void);
static const BOOL is_win64 = sizeof(void*) != sizeof(int); static BOOL is_wow64; @@ -2985,6 +2986,14 @@ static void test_exec_memory_writes(void) RtlRemoveVectoredExceptionHandler( handler ); }
+static void test_flush_write_buffers(void) +{ + NTSTATUS status; + + status = pNtFlushProcessWriteBuffers(); + ok( status == STATUS_SUCCESS, "NtFlushProcessWriteBuffers returned %08lx\n", status ); +} + START_TEST(virtual) { HMODULE mod; @@ -3017,6 +3026,7 @@ START_TEST(virtual) pNtAllocateVirtualMemoryEx = (void *)GetProcAddress(mod, "NtAllocateVirtualMemoryEx"); pNtMapViewOfSectionEx = (void *)GetProcAddress(mod, "NtMapViewOfSectionEx"); pNtSetInformationVirtualMemory = (void *)GetProcAddress(mod, "NtSetInformationVirtualMemory"); + pNtFlushProcessWriteBuffers = (void *)GetProcAddress(mod, "NtFlushProcessWriteBuffers");
NtQuerySystemInformation(SystemBasicInformation, &sbi, sizeof(sbi), NULL); trace("system page size %#lx\n", sbi.PageSize); @@ -3036,4 +3046,5 @@ START_TEST(virtual) test_query_region_information(); test_query_image_information(); test_exec_memory_writes(); + test_flush_write_buffers(); }
From: Torge Matthies tmatthies@codeweavers.com
Uses the MEMBARRIER_CMD_PRIVATE_EXPEDITED membarrier command introduced in Linux 4.14. --- dlls/ntdll/unix/virtual.c | 48 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+)
diff --git a/dlls/ntdll/unix/virtual.c b/dlls/ntdll/unix/virtual.c index 0c0a12c394d..258bf9f943d 100644 --- a/dlls/ntdll/unix/virtual.c +++ b/dlls/ntdll/unix/virtual.c @@ -39,6 +39,9 @@ #ifdef HAVE_SYS_SYSINFO_H # include <sys/sysinfo.h> #endif +#ifdef HAVE_SYS_SYSCALL_H +# include <sys/syscall.h> +#endif #ifdef HAVE_SYS_SYSCTL_H # include <sys/sysctl.h> #endif @@ -220,6 +223,11 @@ struct range_entry static struct range_entry *free_ranges; static struct range_entry *free_ranges_end;
+#if defined(__linux__) && defined(__NR_membarrier) +static BOOL membarrier_exp_available; +static pthread_once_t membarrier_init_once = PTHREAD_ONCE_INIT; +#endif +
static inline BOOL is_beyond_limit( const void *addr, size_t size, const void *limit ) { @@ -6317,12 +6325,52 @@ NTSTATUS WINAPI NtFlushInstructionCache( HANDLE handle, const void *addr, SIZE_T }
+#if defined(__linux__) && defined(__NR_membarrier) + +#define MEMBARRIER_CMD_QUERY 0x00 +#define MEMBARRIER_CMD_PRIVATE_EXPEDITED 0x08 +#define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED 0x10 + +static int membarrier( int cmd, unsigned int flags, int cpu_id ) +{ + return syscall( __NR_membarrier, cmd, flags, cpu_id ); +} + +static void membarrier_init( void ) +{ + static const int exp_required_cmds = + MEMBARRIER_CMD_PRIVATE_EXPEDITED | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED; + int available_cmds = membarrier( MEMBARRIER_CMD_QUERY, 0, 0 ); + if (available_cmds == -1) + return; + if ((available_cmds & exp_required_cmds) == exp_required_cmds) + membarrier_exp_available = !membarrier( MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0, 0 ); +} + +static BOOL try_exp_membarrier( void ) +{ + pthread_once(&membarrier_init_once, membarrier_init); + if (!membarrier_exp_available) + return FALSE; + return !membarrier( MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0 ); +} + +#else /* defined(__linux__) && defined(__NR_membarrier) */ + +static BOOL try_exp_membarrier( void ) { return FALSE; } + +#endif /* defined(__linux__) && defined(__NR_membarrier) */ + + /********************************************************************** * NtFlushProcessWriteBuffers (NTDLL.@) */ NTSTATUS WINAPI NtFlushProcessWriteBuffers(void) { static int once = 0; + + if (try_exp_membarrier()) return STATUS_SUCCESS; + if (!once++) FIXME( "stub\n" ); return STATUS_SUCCESS; }
From: Torge Matthies tmatthies@codeweavers.com
--- dlls/ntdll/unix/virtual.c | 63 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+)
diff --git a/dlls/ntdll/unix/virtual.c b/dlls/ntdll/unix/virtual.c index 258bf9f943d..90a8af3437e 100644 --- a/dlls/ntdll/unix/virtual.c +++ b/dlls/ntdll/unix/virtual.c @@ -65,6 +65,9 @@ #if defined(__APPLE__) # include <mach/mach_init.h> # include <mach/mach_vm.h> +# include <mach/task.h> +# include <mach/thread_state.h> +# include <mach/vm_map.h> #endif
#include "ntstatus.h" @@ -223,6 +226,11 @@ struct range_entry static struct range_entry *free_ranges; static struct range_entry *free_ranges_end;
+#ifdef __APPLE__ +static kern_return_t (*p_thread_get_register_pointer_values)( thread_t, uintptr_t*, size_t*, uintptr_t* ); +static pthread_once_t tgrpvs_init_once = PTHREAD_ONCE_INIT; +#endif + #if defined(__linux__) && defined(__NR_membarrier) static BOOL membarrier_exp_available; static pthread_once_t membarrier_init_once = PTHREAD_ONCE_INIT; @@ -6325,6 +6333,60 @@ NTSTATUS WINAPI NtFlushInstructionCache( HANDLE handle, const void *addr, SIZE_T }
+#ifdef __APPLE__ + +static void tgrpvs_init( void ) +{ + p_thread_get_register_pointer_values = dlsym( RTLD_DEFAULT, "thread_get_register_pointer_values" ); +} + +static BOOL try_mach_tgrpvs( void ) +{ + /* Taken from https://github.com/dotnet/runtime/blob/7be37908e5a1cbb83b1062768c1649827eeac... */ + mach_msg_type_number_t count, i = 0; + thread_act_array_t threads; + BOOL success = TRUE; + kern_return_t kret; + + pthread_once(&tgrpvs_init_once, tgrpvs_init); + if (!p_thread_get_register_pointer_values) + return FALSE; + + /* Get references to all threads of this process */ + kret = task_threads( mach_task_self(), &threads, &count ); + if (kret) + return FALSE; + + /* Iterate through the threads in the list */ + while (i < count) + { + uintptr_t reg_values[128]; + size_t reg_count = ARRAY_SIZE( reg_values ); + uintptr_t sp; + + /* Request the thread's register pointer values to force the thread to go through a memory barrier */ + kret = p_thread_get_register_pointer_values( threads[i], &sp, ®_count, reg_values ); + /* This function always fails when querying Rosetta's exception handling thread, so we only treat + KERN_INSUFFICIENT_BUFFER_SIZE as an error, like .NET core does. */ + if (kret == KERN_INSUFFICIENT_BUFFER_SIZE) + success = FALSE; + + /* Deallocate thread reference once we're done with it */ + mach_port_deallocate( mach_task_self(), threads[i++] ); + } + + /* Deallocate thread list */ + vm_deallocate( mach_task_self(), (vm_address_t)threads, count * sizeof(threads[0]) ); + return success; +} + +#else /* defined(__APPLE__) */ + +static BOOL try_mach_tgrpvs( void ) { return FALSE; } + +#endif /* defined(__APPLE__) */ + + #if defined(__linux__) && defined(__NR_membarrier)
#define MEMBARRIER_CMD_QUERY 0x00 @@ -6370,6 +6432,7 @@ NTSTATUS WINAPI NtFlushProcessWriteBuffers(void) static int once = 0;
if (try_exp_membarrier()) return STATUS_SUCCESS; + if (try_mach_tgrpvs()) return STATUS_SUCCESS;
if (!once++) FIXME( "stub\n" ); return STATUS_SUCCESS;
From: Jinoh Kang jinoh.kang.kr@gmail.com
--- dlls/ntdll/unix/virtual.c | 69 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 67 insertions(+), 2 deletions(-)
diff --git a/dlls/ntdll/unix/virtual.c b/dlls/ntdll/unix/virtual.c index 90a8af3437e..f1f81507a47 100644 --- a/dlls/ntdll/unix/virtual.c +++ b/dlls/ntdll/unix/virtual.c @@ -6424,17 +6424,82 @@ static BOOL try_exp_membarrier( void ) { return FALSE; } #endif /* defined(__linux__) && defined(__NR_membarrier) */
+static BOOL try_flush_process_write_buffers_slow_fallback(void) +{ + HANDLE thread = NULL; + BOOL success = FALSE; + + for (;;) + { + CONTEXT context; + NTSTATUS status; + HANDLE next; + + status = NtGetNextThread( GetCurrentProcess(), thread, + THREAD_GET_CONTEXT | THREAD_QUERY_LIMITED_INFORMATION, + 0, 0, &next ); + if (status) + { + if (status == STATUS_NO_MORE_ENTRIES) + { + success = TRUE; + } + else + { + ERR( "Failed to get handle to next thread of %p: %#x\n", thread, (int)status ); + } + break; + } + if (thread) NtClose( thread ); + thread = next; + + context.ContextFlags = CONTEXT_CONTROL; + status = NtGetContextThread( thread, &context ); + if (status) + { + ULONG terminated; + NTSTATUS qstatus; + + qstatus = NtQueryInformationThread( thread, ThreadIsTerminated, &terminated, sizeof(terminated), NULL ); + if (qstatus) + { + ERR( "Failed to get termination status of thread %p: %#x\n", thread, (int)qstatus ); + break; + } + + if (!terminated) + { + ERR( "Failed to execute heavy barrier on thread %p: %#x\n", thread, (int)status ); + break; + } + } + } + if (thread) NtClose( thread ); + return success; +} + + /********************************************************************** * NtFlushProcessWriteBuffers (NTDLL.@) */ NTSTATUS WINAPI NtFlushProcessWriteBuffers(void) { - static int once = 0; + static LONG volatile once;
if (try_exp_membarrier()) return STATUS_SUCCESS; if (try_mach_tgrpvs()) return STATUS_SUCCESS;
- if (!once++) FIXME( "stub\n" ); + if (!once && !InterlockedExchange( &once, TRUE )) + { + FIXME( "Falling back to slow implementation\n" ); + } + + while (!try_flush_process_write_buffers_slow_fallback()) + { + ERR( "Failed to execute slow fallback, retrying\n" ); + } + + /* NtFlushProcessWriteBuffers() cannot fail */ return STATUS_SUCCESS; }
From: Jinoh Kang jinoh.kang.kr@gmail.com
--- dlls/kernel32/tests/virtual.c | 148 ++++++++++++++++++++++++++++++++++ 1 file changed, 148 insertions(+)
diff --git a/dlls/kernel32/tests/virtual.c b/dlls/kernel32/tests/virtual.c index 466badf0433..461e5bacfe0 100644 --- a/dlls/kernel32/tests/virtual.c +++ b/dlls/kernel32/tests/virtual.c @@ -4409,8 +4409,144 @@ static void test_ReadProcessMemory(void) free(buf); }
+struct sbtestshared +{ + /* Number of slots per thread */ + unsigned int num_slots; + + /* Number of generations (iterations per each slot) */ + unsigned int num_generations; + + /* Stores from threads */ + LONG *wrote; + + /* Observations from threads */ + LONG *read; + + /* Number of observed reorderings (witnesses) in the SB litmus test. + * Note: unobservable reorderings are not counted. */ + LONG *reorderings; +}; + +struct sbtestparams +{ + struct sbtestshared *shared; + unsigned int th_id; + void (WINAPI *barrier)(void); +}; + +static DWORD CALLBACK sbtest_thread_proc( void *arg ) +{ + const struct sbtestparams *params = arg; + const struct sbtestshared s = *params->shared; + const unsigned int th_id = params->th_id; + void (WINAPI *const barrier)(void) = params->barrier; + unsigned int slot = 0, gen = 0; + + for (;;) + { + LONG read_l, read_r; + + /* Access slots in reverse order to avoid prefetching */ + if (!slot--) + { + if (gen++ >= s.num_generations) break; + slot = s.num_slots - 1; + } + + WriteRelease(&s.wrote[th_id * s.num_slots + slot], gen); + (*barrier)(); + read_l = (gen << 1) | (ReadAcquire(&s.wrote[(th_id ^ 1) * s.num_slots + slot]) == gen); + + WriteRelease(&s.read[th_id * s.num_slots + slot], read_l); + + while (((read_r = ReadAcquire(&s.read[(th_id ^ 1) * s.num_slots + slot])) & ~1) != (gen << 1)) + YieldProcessor(); + + /* fairly distribute testing overhead across threads */ + if ((slot & 1) == th_id) + { + if (!(read_l & 1) && !(read_r & 1)) + InterlockedIncrement( s.reorderings ); + } + } + + return 0; +} + +#ifdef _MSC_VER + +#pragma intrinsic(_ReadWriteBarrier) +void _ReadWriteBarrier(void); + +static void WINAPI compiler_barrier(void) +{ +#pragma warning(suppress:4996) + _ReadWriteBarrier(); +} + +#else /* _MSC_VER */ + +static void WINAPI compiler_barrier(void) +{ +#if defined(__x86_64__) || defined(__i386__) + __asm__ __volatile__("" ::: "memory"); +#else + __atomic_thread_fence( __ATOMIC_RELAXED ); +#endif +} + +#endif /* _MSC_VER */ + +static LONG store_buffer_litmus_test( void (*WINAPI barrier0)(void), void (*WINAPI barrier1)(void) ) +{ + LONG reorderings = 0; + struct sbtestshared shared = { + /* Should be big enough to avoid false sharing */ + .num_slots = 64, + + /* Increase if flaky, decrease if slow */ + .num_generations = 32768, + + .reorderings = &reorderings, + }; + struct sbtestparams pars[2]; + HANDLE threads[2]; + unsigned int i; + DWORD ret; + + shared.wrote = VirtualAlloc( NULL, ARRAY_SIZE(threads) * shared.num_slots * sizeof(*shared.wrote), + MEM_COMMIT, PAGE_READWRITE ); + ok( shared.wrote != NULL, "VirtualAlloc failed: %lu\n", GetLastError() ); + + shared.read = VirtualAlloc( NULL, ARRAY_SIZE(threads) * shared.num_slots * sizeof(*shared.read), + MEM_COMMIT, PAGE_READWRITE ); + ok( shared.read != NULL, "VirtualAlloc failed: %lu\n", GetLastError() ); + + for (i = 0; i < ARRAY_SIZE(threads); i++) + { + pars[i].shared = &shared; + pars[i].th_id = i; + pars[i].barrier = i == 0 ? barrier0 : barrier1; + threads[i] = CreateThread( NULL, 0, sbtest_thread_proc, &pars[i], 0, NULL ); + ok( threads[i] != NULL, "CreateThread failed: %lu\n", GetLastError() ); + } + + ret = WaitForMultipleObjects( ARRAY_SIZE(threads), threads, TRUE, INFINITE ); + ok( ret == WAIT_OBJECT_0, "WaitForMultipleObjects failed: %lu\n", GetLastError() ); + + ret = VirtualFree( shared.read, 0, MEM_RELEASE ); + ok( ret, "VirtualFree failed: %lu\n", GetLastError() ); + + ret = VirtualFree( shared.wrote, 0, MEM_RELEASE ); + ok( ret, "VirtualFree failed: %lu\n", GetLastError() ); + + return reorderings; +} + static void test_FlushProcessWriteBuffers(void) { + LONG reorderings; unsigned int i;
if (!pFlushProcessWriteBuffers) @@ -4424,6 +4560,18 @@ static void test_FlushProcessWriteBuffers(void) { pFlushProcessWriteBuffers(); } + + if (si.dwNumberOfProcessors == 1) + { + skip( "single-processor system, cannot test store buffering behavior\n" ); + return; + } + + reorderings = store_buffer_litmus_test( compiler_barrier, compiler_barrier ); + ok( reorderings, "expected write-read reordering with compiler barrier only (got %ld reorderings)\n", reorderings ); + + reorderings = store_buffer_litmus_test( compiler_barrier, pFlushProcessWriteBuffers ); + ok( !reorderings, "expected sequential consistency with FlushProcessWriteBuffers (got %ld reorderings)\n", reorderings ); }
START_TEST(virtual)