Supersedes !741.
On macOS 10.14+ `thread_get_register_pointer_values` is called on every thread of the process. On Linux 4.14+ `membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, ...)` is used. On x86 Linux <= 4.13 and on other platforms it falls back to calling `NtGetContextThread()` on each thread.
The fast path patches from @tmatthies are slightly modified in the following ways:
1. On unsupported platforms, the `try_*()` functions return `FALSE` instead of `0`. 2. `try_exp_membarrier()` is called first, then `try_mach_tgrpvs()`.
---
Known applications fixed by this MR:
- osu! (rhythm game) song selection menu stuttering - .NET CoreCLR GC - HotSpot JVM (-XX:+UseSystemMemoryBarrier) safepoints
-- v3: kernel32/tests: Add a store buffering litmus test involving FlushProcessWriteBuffers. ntdll: Add slow fallback implementation of NtFlushProcessWriteBuffers.
From: Jinoh Kang jinoh.kang.kr@gmail.com
--- dlls/kernel32/tests/virtual.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+)
diff --git a/dlls/kernel32/tests/virtual.c b/dlls/kernel32/tests/virtual.c index cfbfcac0950..466badf0433 100644 --- a/dlls/kernel32/tests/virtual.c +++ b/dlls/kernel32/tests/virtual.c @@ -54,6 +54,7 @@ static NTSTATUS (WINAPI *pNtProtectVirtualMemory)(HANDLE, PVOID *, SIZE_T *, ULO static NTSTATUS (WINAPI *pNtReadVirtualMemory)(HANDLE,const void *,void *,SIZE_T, SIZE_T *); static NTSTATUS (WINAPI *pNtWriteVirtualMemory)(HANDLE, void *, const void *, SIZE_T, SIZE_T *); static BOOL (WINAPI *pPrefetchVirtualMemory)(HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG); +static void (WINAPI *pFlushProcessWriteBuffers)(void);
/* ############################### */
@@ -4408,6 +4409,23 @@ static void test_ReadProcessMemory(void) free(buf); }
+static void test_FlushProcessWriteBuffers(void) +{ + unsigned int i; + + if (!pFlushProcessWriteBuffers) + { + win_skip("no FlushProcessWriteBuffers in kernel32\n"); + return; + } + + /* simple stress test */ + for (i = 0; i < 128; i++) + { + pFlushProcessWriteBuffers(); + } +} + START_TEST(virtual) { int argc; @@ -4465,6 +4483,7 @@ START_TEST(virtual) pNtReadVirtualMemory = (void *)GetProcAddress( hntdll, "NtReadVirtualMemory" ); pNtWriteVirtualMemory = (void *)GetProcAddress( hntdll, "NtWriteVirtualMemory" ); pPrefetchVirtualMemory = (void *)GetProcAddress( hkernelbase, "PrefetchVirtualMemory" ); + pFlushProcessWriteBuffers = (void *)GetProcAddress( hkernel32, "FlushProcessWriteBuffers" );
GetSystemInfo(&si); trace("system page size %#lx\n", si.dwPageSize); @@ -4490,6 +4509,7 @@ START_TEST(virtual) test_write_watch(); test_PrefetchVirtualMemory(); test_ReadProcessMemory(); + test_FlushProcessWriteBuffers(); #if defined(__i386__) || defined(__x86_64__) test_stack_commit(); #endif
From: Torge Matthies tmatthies@codeweavers.com
--- dlls/ntdll/tests/virtual.c | 11 +++++++++++ 1 file changed, 11 insertions(+)
diff --git a/dlls/ntdll/tests/virtual.c b/dlls/ntdll/tests/virtual.c index f74c078b420..b0bd86a2c48 100644 --- a/dlls/ntdll/tests/virtual.c +++ b/dlls/ntdll/tests/virtual.c @@ -46,6 +46,7 @@ static NTSTATUS (WINAPI *pNtMapViewOfSectionEx)(HANDLE, HANDLE, PVOID *, const L static NTSTATUS (WINAPI *pNtSetInformationVirtualMemory)(HANDLE, VIRTUAL_MEMORY_INFORMATION_CLASS, ULONG_PTR, PMEMORY_RANGE_ENTRY, PVOID, ULONG); +static NTSTATUS (WINAPI *pNtFlushProcessWriteBuffers)(void);
static const BOOL is_win64 = sizeof(void*) != sizeof(int); static BOOL is_wow64; @@ -2985,6 +2986,14 @@ static void test_exec_memory_writes(void) RtlRemoveVectoredExceptionHandler( handler ); }
+static void test_flush_write_buffers(void) +{ + NTSTATUS status; + + status = pNtFlushProcessWriteBuffers(); + ok( status == STATUS_SUCCESS, "NtFlushProcessWriteBuffers returned %08lx\n", status ); +} + START_TEST(virtual) { HMODULE mod; @@ -3017,6 +3026,7 @@ START_TEST(virtual) pNtAllocateVirtualMemoryEx = (void *)GetProcAddress(mod, "NtAllocateVirtualMemoryEx"); pNtMapViewOfSectionEx = (void *)GetProcAddress(mod, "NtMapViewOfSectionEx"); pNtSetInformationVirtualMemory = (void *)GetProcAddress(mod, "NtSetInformationVirtualMemory"); + pNtFlushProcessWriteBuffers = (void *)GetProcAddress(mod, "NtFlushProcessWriteBuffers");
NtQuerySystemInformation(SystemBasicInformation, &sbi, sizeof(sbi), NULL); trace("system page size %#lx\n", sbi.PageSize); @@ -3036,4 +3046,5 @@ START_TEST(virtual) test_query_region_information(); test_query_image_information(); test_exec_memory_writes(); + test_flush_write_buffers(); }
From: Torge Matthies tmatthies@codeweavers.com
Uses the MEMBARRIER_CMD_PRIVATE_EXPEDITED membarrier command introduced in Linux 4.14. --- dlls/ntdll/unix/virtual.c | 48 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+)
diff --git a/dlls/ntdll/unix/virtual.c b/dlls/ntdll/unix/virtual.c index 0c0a12c394d..258bf9f943d 100644 --- a/dlls/ntdll/unix/virtual.c +++ b/dlls/ntdll/unix/virtual.c @@ -39,6 +39,9 @@ #ifdef HAVE_SYS_SYSINFO_H # include <sys/sysinfo.h> #endif +#ifdef HAVE_SYS_SYSCALL_H +# include <sys/syscall.h> +#endif #ifdef HAVE_SYS_SYSCTL_H # include <sys/sysctl.h> #endif @@ -220,6 +223,11 @@ struct range_entry static struct range_entry *free_ranges; static struct range_entry *free_ranges_end;
+#if defined(__linux__) && defined(__NR_membarrier) +static BOOL membarrier_exp_available; +static pthread_once_t membarrier_init_once = PTHREAD_ONCE_INIT; +#endif +
static inline BOOL is_beyond_limit( const void *addr, size_t size, const void *limit ) { @@ -6317,12 +6325,52 @@ NTSTATUS WINAPI NtFlushInstructionCache( HANDLE handle, const void *addr, SIZE_T }
+#if defined(__linux__) && defined(__NR_membarrier) + +#define MEMBARRIER_CMD_QUERY 0x00 +#define MEMBARRIER_CMD_PRIVATE_EXPEDITED 0x08 +#define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED 0x10 + +static int membarrier( int cmd, unsigned int flags, int cpu_id ) +{ + return syscall( __NR_membarrier, cmd, flags, cpu_id ); +} + +static void membarrier_init( void ) +{ + static const int exp_required_cmds = + MEMBARRIER_CMD_PRIVATE_EXPEDITED | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED; + int available_cmds = membarrier( MEMBARRIER_CMD_QUERY, 0, 0 ); + if (available_cmds == -1) + return; + if ((available_cmds & exp_required_cmds) == exp_required_cmds) + membarrier_exp_available = !membarrier( MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0, 0 ); +} + +static BOOL try_exp_membarrier( void ) +{ + pthread_once(&membarrier_init_once, membarrier_init); + if (!membarrier_exp_available) + return FALSE; + return !membarrier( MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0 ); +} + +#else /* defined(__linux__) && defined(__NR_membarrier) */ + +static BOOL try_exp_membarrier( void ) { return FALSE; } + +#endif /* defined(__linux__) && defined(__NR_membarrier) */ + + /********************************************************************** * NtFlushProcessWriteBuffers (NTDLL.@) */ NTSTATUS WINAPI NtFlushProcessWriteBuffers(void) { static int once = 0; + + if (try_exp_membarrier()) return STATUS_SUCCESS; + if (!once++) FIXME( "stub\n" ); return STATUS_SUCCESS; }
From: Torge Matthies tmatthies@codeweavers.com
--- dlls/ntdll/unix/virtual.c | 63 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+)
diff --git a/dlls/ntdll/unix/virtual.c b/dlls/ntdll/unix/virtual.c index 258bf9f943d..90a8af3437e 100644 --- a/dlls/ntdll/unix/virtual.c +++ b/dlls/ntdll/unix/virtual.c @@ -65,6 +65,9 @@ #if defined(__APPLE__) # include <mach/mach_init.h> # include <mach/mach_vm.h> +# include <mach/task.h> +# include <mach/thread_state.h> +# include <mach/vm_map.h> #endif
#include "ntstatus.h" @@ -223,6 +226,11 @@ struct range_entry static struct range_entry *free_ranges; static struct range_entry *free_ranges_end;
+#ifdef __APPLE__ +static kern_return_t (*p_thread_get_register_pointer_values)( thread_t, uintptr_t*, size_t*, uintptr_t* ); +static pthread_once_t tgrpvs_init_once = PTHREAD_ONCE_INIT; +#endif + #if defined(__linux__) && defined(__NR_membarrier) static BOOL membarrier_exp_available; static pthread_once_t membarrier_init_once = PTHREAD_ONCE_INIT; @@ -6325,6 +6333,60 @@ NTSTATUS WINAPI NtFlushInstructionCache( HANDLE handle, const void *addr, SIZE_T }
+#ifdef __APPLE__ + +static void tgrpvs_init( void ) +{ + p_thread_get_register_pointer_values = dlsym( RTLD_DEFAULT, "thread_get_register_pointer_values" ); +} + +static BOOL try_mach_tgrpvs( void ) +{ + /* Taken from https://github.com/dotnet/runtime/blob/7be37908e5a1cbb83b1062768c1649827eeac... */ + mach_msg_type_number_t count, i = 0; + thread_act_array_t threads; + BOOL success = TRUE; + kern_return_t kret; + + pthread_once(&tgrpvs_init_once, tgrpvs_init); + if (!p_thread_get_register_pointer_values) + return FALSE; + + /* Get references to all threads of this process */ + kret = task_threads( mach_task_self(), &threads, &count ); + if (kret) + return FALSE; + + /* Iterate through the threads in the list */ + while (i < count) + { + uintptr_t reg_values[128]; + size_t reg_count = ARRAY_SIZE( reg_values ); + uintptr_t sp; + + /* Request the thread's register pointer values to force the thread to go through a memory barrier */ + kret = p_thread_get_register_pointer_values( threads[i], &sp, ®_count, reg_values ); + /* This function always fails when querying Rosetta's exception handling thread, so we only treat + KERN_INSUFFICIENT_BUFFER_SIZE as an error, like .NET core does. */ + if (kret == KERN_INSUFFICIENT_BUFFER_SIZE) + success = FALSE; + + /* Deallocate thread reference once we're done with it */ + mach_port_deallocate( mach_task_self(), threads[i++] ); + } + + /* Deallocate thread list */ + vm_deallocate( mach_task_self(), (vm_address_t)threads, count * sizeof(threads[0]) ); + return success; +} + +#else /* defined(__APPLE__) */ + +static BOOL try_mach_tgrpvs( void ) { return FALSE; } + +#endif /* defined(__APPLE__) */ + + #if defined(__linux__) && defined(__NR_membarrier)
#define MEMBARRIER_CMD_QUERY 0x00 @@ -6370,6 +6432,7 @@ NTSTATUS WINAPI NtFlushProcessWriteBuffers(void) static int once = 0;
if (try_exp_membarrier()) return STATUS_SUCCESS; + if (try_mach_tgrpvs()) return STATUS_SUCCESS;
if (!once++) FIXME( "stub\n" ); return STATUS_SUCCESS;
From: Jinoh Kang jinoh.kang.kr@gmail.com
--- dlls/ntdll/unix/virtual.c | 71 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 69 insertions(+), 2 deletions(-)
diff --git a/dlls/ntdll/unix/virtual.c b/dlls/ntdll/unix/virtual.c index 90a8af3437e..8c15cf48b94 100644 --- a/dlls/ntdll/unix/virtual.c +++ b/dlls/ntdll/unix/virtual.c @@ -6424,17 +6424,84 @@ static BOOL try_exp_membarrier( void ) { return FALSE; } #endif /* defined(__linux__) && defined(__NR_membarrier) */
+static BOOL try_flush_process_write_buffers_slow_fallback(void) +{ + HANDLE thread = NULL; + BOOL success = FALSE; + + MemoryBarrier(); /* barrier for current thread */ + + for (;;) + { + CONTEXT context; + NTSTATUS status; + HANDLE next; + + status = NtGetNextThread( GetCurrentProcess(), thread, + THREAD_GET_CONTEXT | THREAD_QUERY_LIMITED_INFORMATION, + 0, 0, &next ); + if (status) + { + if (status == STATUS_NO_MORE_ENTRIES) + { + success = TRUE; + } + else + { + ERR( "Failed to get handle to next thread of %p: %#x\n", thread, (int)status ); + } + break; + } + if (thread) NtClose( thread ); + thread = next; + + context.ContextFlags = CONTEXT_CONTROL; + status = NtGetContextThread( thread, &context ); + if (status) + { + ULONG terminated; + NTSTATUS qstatus; + + qstatus = NtQueryInformationThread( thread, ThreadIsTerminated, &terminated, sizeof(terminated), NULL ); + if (qstatus) + { + ERR( "Failed to get termination status of thread %p: %#x\n", thread, (int)qstatus ); + break; + } + + if (!terminated) + { + ERR( "Failed to execute heavy barrier on thread %p: %#x\n", thread, (int)status ); + break; + } + } + } + if (thread) NtClose( thread ); + return success; +} + + /********************************************************************** * NtFlushProcessWriteBuffers (NTDLL.@) */ NTSTATUS WINAPI NtFlushProcessWriteBuffers(void) { - static int once = 0; + static LONG volatile once;
if (try_exp_membarrier()) return STATUS_SUCCESS; if (try_mach_tgrpvs()) return STATUS_SUCCESS;
- if (!once++) FIXME( "stub\n" ); + if (!once && !InterlockedExchange( &once, TRUE )) + { + FIXME( "Falling back to slow implementation\n" ); + } + + while (!try_flush_process_write_buffers_slow_fallback()) + { + ERR( "Failed to execute slow fallback, retrying\n" ); + } + + /* NtFlushProcessWriteBuffers() cannot fail */ return STATUS_SUCCESS; }
From: Jinoh Kang jinoh.kang.kr@gmail.com
--- dlls/kernel32/tests/virtual.c | 148 ++++++++++++++++++++++++++++++++++ 1 file changed, 148 insertions(+)
diff --git a/dlls/kernel32/tests/virtual.c b/dlls/kernel32/tests/virtual.c index 466badf0433..461e5bacfe0 100644 --- a/dlls/kernel32/tests/virtual.c +++ b/dlls/kernel32/tests/virtual.c @@ -4409,8 +4409,144 @@ static void test_ReadProcessMemory(void) free(buf); }
+struct sbtestshared +{ + /* Number of slots per thread */ + unsigned int num_slots; + + /* Number of generations (iterations per each slot) */ + unsigned int num_generations; + + /* Stores from threads */ + LONG *wrote; + + /* Observations from threads */ + LONG *read; + + /* Number of observed reorderings (witnesses) in the SB litmus test. + * Note: unobservable reorderings are not counted. */ + LONG *reorderings; +}; + +struct sbtestparams +{ + struct sbtestshared *shared; + unsigned int th_id; + void (WINAPI *barrier)(void); +}; + +static DWORD CALLBACK sbtest_thread_proc( void *arg ) +{ + const struct sbtestparams *params = arg; + const struct sbtestshared s = *params->shared; + const unsigned int th_id = params->th_id; + void (WINAPI *const barrier)(void) = params->barrier; + unsigned int slot = 0, gen = 0; + + for (;;) + { + LONG read_l, read_r; + + /* Access slots in reverse order to avoid prefetching */ + if (!slot--) + { + if (gen++ >= s.num_generations) break; + slot = s.num_slots - 1; + } + + WriteRelease(&s.wrote[th_id * s.num_slots + slot], gen); + (*barrier)(); + read_l = (gen << 1) | (ReadAcquire(&s.wrote[(th_id ^ 1) * s.num_slots + slot]) == gen); + + WriteRelease(&s.read[th_id * s.num_slots + slot], read_l); + + while (((read_r = ReadAcquire(&s.read[(th_id ^ 1) * s.num_slots + slot])) & ~1) != (gen << 1)) + YieldProcessor(); + + /* fairly distribute testing overhead across threads */ + if ((slot & 1) == th_id) + { + if (!(read_l & 1) && !(read_r & 1)) + InterlockedIncrement( s.reorderings ); + } + } + + return 0; +} + +#ifdef _MSC_VER + +#pragma intrinsic(_ReadWriteBarrier) +void _ReadWriteBarrier(void); + +static void WINAPI compiler_barrier(void) +{ +#pragma warning(suppress:4996) + _ReadWriteBarrier(); +} + +#else /* _MSC_VER */ + +static void WINAPI compiler_barrier(void) +{ +#if defined(__x86_64__) || defined(__i386__) + __asm__ __volatile__("" ::: "memory"); +#else + __atomic_thread_fence( __ATOMIC_RELAXED ); +#endif +} + +#endif /* _MSC_VER */ + +static LONG store_buffer_litmus_test( void (*WINAPI barrier0)(void), void (*WINAPI barrier1)(void) ) +{ + LONG reorderings = 0; + struct sbtestshared shared = { + /* Should be big enough to avoid false sharing */ + .num_slots = 64, + + /* Increase if flaky, decrease if slow */ + .num_generations = 32768, + + .reorderings = &reorderings, + }; + struct sbtestparams pars[2]; + HANDLE threads[2]; + unsigned int i; + DWORD ret; + + shared.wrote = VirtualAlloc( NULL, ARRAY_SIZE(threads) * shared.num_slots * sizeof(*shared.wrote), + MEM_COMMIT, PAGE_READWRITE ); + ok( shared.wrote != NULL, "VirtualAlloc failed: %lu\n", GetLastError() ); + + shared.read = VirtualAlloc( NULL, ARRAY_SIZE(threads) * shared.num_slots * sizeof(*shared.read), + MEM_COMMIT, PAGE_READWRITE ); + ok( shared.read != NULL, "VirtualAlloc failed: %lu\n", GetLastError() ); + + for (i = 0; i < ARRAY_SIZE(threads); i++) + { + pars[i].shared = &shared; + pars[i].th_id = i; + pars[i].barrier = i == 0 ? barrier0 : barrier1; + threads[i] = CreateThread( NULL, 0, sbtest_thread_proc, &pars[i], 0, NULL ); + ok( threads[i] != NULL, "CreateThread failed: %lu\n", GetLastError() ); + } + + ret = WaitForMultipleObjects( ARRAY_SIZE(threads), threads, TRUE, INFINITE ); + ok( ret == WAIT_OBJECT_0, "WaitForMultipleObjects failed: %lu\n", GetLastError() ); + + ret = VirtualFree( shared.read, 0, MEM_RELEASE ); + ok( ret, "VirtualFree failed: %lu\n", GetLastError() ); + + ret = VirtualFree( shared.wrote, 0, MEM_RELEASE ); + ok( ret, "VirtualFree failed: %lu\n", GetLastError() ); + + return reorderings; +} + static void test_FlushProcessWriteBuffers(void) { + LONG reorderings; unsigned int i;
if (!pFlushProcessWriteBuffers) @@ -4424,6 +4560,18 @@ static void test_FlushProcessWriteBuffers(void) { pFlushProcessWriteBuffers(); } + + if (si.dwNumberOfProcessors == 1) + { + skip( "single-processor system, cannot test store buffering behavior\n" ); + return; + } + + reorderings = store_buffer_litmus_test( compiler_barrier, compiler_barrier ); + ok( reorderings, "expected write-read reordering with compiler barrier only (got %ld reorderings)\n", reorderings ); + + reorderings = store_buffer_litmus_test( compiler_barrier, pFlushProcessWriteBuffers ); + ok( !reorderings, "expected sequential consistency with FlushProcessWriteBuffers (got %ld reorderings)\n", reorderings ); }
START_TEST(virtual)
Hi,
It looks like your patch introduced the new failures shown below. Please investigate and fix them before resubmitting your patch. If they are not new, fixing them anyway would help a lot. Otherwise please ask for the known failures list to be updated.
The tests also ran into some preexisting test failures. If you know how to fix them that would be helpful. See the TestBot job for the details:
The full results can be found at: https://testbot.winehq.org/JobDetails.pl?Key=151208
Your paranoid android.
=== debian11b (64 bit WoW report) ===
user32: input.c:4306: Test succeeded inside todo block: button_down_hwnd_todo 1: got MSG_TEST_WIN hwnd 0000000001F10100, msg WM_LBUTTONDOWN, wparam 0x1, lparam 0x320032
v2: Rename `try_heavy_membarrier_fallback` to `try_flush_process_write_buffers_slow_fallback`
v3: Insert an explicit `MemoryBarrier()` in slow fallback path