This MR uses the Mach copy-on-write (COW) mechanism to implement write-watch functionality.
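To illustrate the underlying idea, here is a minimal standalone sketch (not Wine code, untested): arm a page by re-protecting it with `VM_PROT_COPY`, then detect writes by checking whether the region's share mode is still `SM_COW`. It skips the memory-entry step the patch performs and assumes the COW state survives until the first write.

```c
#include <assert.h>
#include <stdio.h>
#include <mach/mach.h>
#include <mach/mach_vm.h>

int main(void)
{
    mach_vm_address_t addr = 0, region_addr;
    mach_vm_size_t size = vm_page_size, region_size;
    vm_region_extended_info_data_t info;
    mach_msg_type_number_t info_count = VM_REGION_EXTENDED_INFO_COUNT;
    mach_port_t object_name;
    kern_return_t kr;

    kr = mach_vm_allocate( mach_task_self(), &addr, size, VM_FLAGS_ANYWHERE );
    assert( kr == KERN_SUCCESS );

    /* arm the watch: VM_PROT_COPY forces the page into a COW mapping */
    kr = mach_vm_protect( mach_task_self(), addr, size, 0,
                          VM_PROT_READ | VM_PROT_WRITE | VM_PROT_COPY );
    assert( kr == KERN_SUCCESS );

    *(volatile char *)addr = 1;  /* the first write resolves the COW */

    /* a page whose share mode is no longer SM_COW has been written to */
    region_addr = addr;
    kr = mach_vm_region( mach_task_self(), &region_addr, &region_size,
                         VM_REGION_EXTENDED_INFO, (vm_region_info_t)&info,
                         &info_count, &object_name );
    assert( kr == KERN_SUCCESS );
    printf( "page dirty: %s\n", info.share_mode == SM_COW ? "no" : "yes" );
    return 0;
}
```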
Below is the same micro-benchmark @gofman used in his [UFFD MR](https://gitlab.winehq.org/wine/wine/-/merge_requests/7871).
Parameters:
- number of concurrent threads;
- number of pages;
- delay between reading / resetting write watches (ms);
- random (1) or sequential (0) page write access;
- reset with `WRITE_WATCH_FLAG_RESET` in `GetWriteWatch()` (1) or in a separate `ResetWriteWatch()` call (0).

Results are in the form `<average write to page time, ns> / <average GetWriteWatch() time, µs>`:

| Parameters (threads, pages, delay, random, reset flag) | Windows | Mach COW | Fallback |
| --- | --- | --- | --- |
| 6 1080 3 1 1 | 897 / 80 | 371 / 12634 | 66202 / 186 |
| 6 1080 3 1 0 | 855 / 87 | 369 / 12637 | 66766 / 187 |
| 8 8192 3 1 1 | 6526 / 268 | 627 / 113263 | 111053 / 485 |
| 8 8192 3 1 0 | 1197 / 509 | 623 / 113810 | 122921 / 489 |
| 8 8192 1 1 1 | 1227 / 412 | 636 / 118930 | 150628 / 388 |
| 8 8192 1 1 0 | 5721 / 144 | 631 / 120538 | 146392 / 384 |
| 8 64 1 1 1 | 572 / 7 | 490 / 1078 | 1000 / 89 |
| 8 64 1 1 0 | 530 / 13 | 500 / 1075 | 1167 / 77 |
All measurements were taken on the same M2 Max machine: the Windows column is Windows 11 on ARM in a VM running the x64 binary under emulation, and the other columns are Wine through Rosetta without and with this MR.
Unlike UFFD, which is always faster than the fallback and comparable to Windows performance, here a good average write-to-page time is traded for a bad average `GetWriteWatch()` time (in roughly equal ratios).
However, in real-world applications (like the FFXIV + Dalamud mod framework/loader use case), this change reduces the cold-start time from about 25.5 s to 23.6 s, including loading a modern .NET 9 runtime into the game process and initializing a complex mod collection with fairly high GC pressure.
This is probably because the `GetWriteWatch()` calls made by the GC mostly happen concurrently, whereas in Wine's fallback implementation running threads are interrupted and often wait on the global virtual lock while the segfault is handled, blocking parallel accesses to write-watched memory and other VM operations.
Another advantage is that `VPROT_WRITEWATCH` can then be used for other purposes in the future. Rosetta is also sometimes a bit finicky about reported protections with the current implementation, but has so far always behaved as expected in my testing with the new one.
On ARM64 the `VM_PROT_COPY`/`SM_COW` mechanism also works as expected with native 16k pages (not that this matters much at the moment).
`GetWriteWatch()` with the reset flag also does not need to be transactional (unlike with UFFD), since only the marked pages are reset here, not the entire range.
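For context, this is the Win32 surface the MR backs; a minimal usage sketch of the standard API (hypothetical toy program, most error handling omitted):

```c
#include <windows.h>
#include <stdio.h>

int main(void)
{
    SYSTEM_INFO si;
    void *addresses[16];
    ULONG_PTR count = 16;
    DWORD granularity;
    char *mem;

    GetSystemInfo( &si );
    mem = VirtualAlloc( NULL, 16 * si.dwPageSize,
                        MEM_RESERVE | MEM_COMMIT | MEM_WRITE_WATCH, PAGE_READWRITE );
    if (!mem) return 1;

    mem[0] = 1;                  /* dirties page 0 */
    mem[5 * si.dwPageSize] = 1;  /* dirties page 5 */

    /* retrieve and reset in one call; with the Mach COW backend only the
     * pages actually returned get re-armed, so no transaction is needed */
    if (!GetWriteWatch( WRITE_WATCH_FLAG_RESET, mem, 16 * si.dwPageSize,
                        addresses, &count, &granularity ))
        printf( "%lu dirty pages\n", (unsigned long)count );
    return 0;
}
```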
From: Marc-Aurel Zent <mzent@codeweavers.com>
```diff
---
 dlls/ntdll/unix/virtual.c | 185 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 185 insertions(+)

diff --git a/dlls/ntdll/unix/virtual.c b/dlls/ntdll/unix/virtual.c
index 994f76fb72a..b3a45837acd 100644
--- a/dlls/ntdll/unix/virtual.c
+++ b/dlls/ntdll/unix/virtual.c
@@ -414,6 +414,191 @@ static void kernel_get_write_watches( void *base, SIZE_T size, void **buffer, UL
         addr = next_addr;
     }
 }
+#elif defined(__APPLE__)
+static vm_prot_t get_mach_prot( mach_vm_address_t addr )
+{
+    size_t i, idx = (size_t)ROUND_ADDR( addr, host_page_mask ) >> page_shift;
+    const BYTE *vprot_ptr;
+    BYTE vprot = 0;
+    vm_prot_t mach_prot = VM_PROT_NONE;
+
+    if ((idx >> pages_vprot_shift) >= pages_vprot_size) return VM_PROT_NONE;
+    if (!pages_vprot[idx >> pages_vprot_shift]) return VM_PROT_NONE;
+    assert( host_page_mask >> page_shift <= pages_vprot_mask );
+    vprot_ptr = pages_vprot[idx >> pages_vprot_shift] + (idx & pages_vprot_mask);
+    for (i = 0; i < host_page_size / page_size; i++) vprot |= vprot_ptr[i];
+    if ((vprot & VPROT_COMMITTED) && !(vprot & VPROT_GUARD))
+    {
+        if (vprot & VPROT_READ) mach_prot |= VM_PROT_READ;
+        if (vprot & VPROT_WRITE) mach_prot |= VM_PROT_WRITE | VM_PROT_READ;
+        if (vprot & VPROT_WRITECOPY) mach_prot |= VM_PROT_WRITE | VM_PROT_READ;
+        if (vprot & VPROT_EXEC) mach_prot |= VM_PROT_EXECUTE | VM_PROT_READ;
+        if (vprot & VPROT_WRITEWATCH) mach_prot &= ~VM_PROT_WRITE;
+    }
+
+    return mach_prot;
+}
+
+static void kernel_writewatch_init(void)
+{
+    use_kernel_writewatch = 1;
+    TRACE( "Using mach write watches.\n" );
+}
+
+static void kernel_writewatch_reset( void *start, SIZE_T len )
+{
+    mach_vm_address_t current_address = (mach_vm_address_t)ROUND_ADDR( start, host_page_mask );
+    SIZE_T end = current_address + ROUND_SIZE( start, len, host_page_mask );
+    kern_return_t kr;
+
+    while (current_address < end)
+    {
+        vm_prot_t mach_prot = get_mach_prot( current_address );
+
+        kr = mach_vm_protect( mach_task_self(), current_address, host_page_size, 0,
+                              mach_prot | VM_PROT_COPY );
+
+        if (kr != KERN_SUCCESS)
+        {
+            ERR( "mach_vm_protect failed on address %p: %d\n", (void *)current_address, kr );
+            break;
+        }
+
+        current_address += host_page_size;
+    }
+}
+
+static void kernel_writewatch_register_range( struct file_view *view, void *base, size_t size )
+{
+    mach_vm_address_t current_address = (mach_vm_address_t)ROUND_ADDR( base, host_page_mask );
+    mach_vm_address_t region_address;
+    mach_vm_size_t region_size;
+    mach_msg_type_number_t info_count;
+    mach_port_t object_name;
+    vm_region_extended_info_data_t info;
+    SIZE_T end = current_address + ROUND_SIZE( base, size, host_page_mask );
+    kern_return_t kr;
+
+    if (!(view->protect & VPROT_WRITEWATCH) || !use_kernel_writewatch) return;
+
+    while (current_address < end)
+    {
+        vm_prot_t mach_prot = get_mach_prot( current_address );
+
+        region_address = current_address;
+        info_count = VM_REGION_EXTENDED_INFO_COUNT;
+        kr = mach_vm_region( mach_task_self(), &region_address, &region_size, VM_REGION_EXTENDED_INFO,
+                             (vm_region_info_t)&info, &info_count, &object_name );
+
+        if (kr != KERN_SUCCESS)
+        {
+            ERR( "mach_vm_region failed: %d\n", kr );
+            break;
+        }
+
+        if (region_address > current_address)
+        {
+            ERR( "trying to register unmapped region\n" );
+            break;
+        }
+
+        assert( info.protection == mach_prot );
+
+        region_size = (mach_vm_size_t)host_page_size;
+        kr = mach_vm_protect( mach_task_self(), current_address, region_size, 0,
+                              mach_prot | VM_PROT_COPY );
+
+        if (kr != KERN_SUCCESS)
+        {
+            ERR( "mach_vm_protect failed: %d\n", kr );
+            break;
+        }
+
+        kr = mach_make_memory_entry_64( mach_task_self(), &region_size, current_address, mach_prot,
+                                        &object_name, MACH_PORT_NULL );
+
+        if (kr != KERN_SUCCESS)
+        {
+            ERR( "mach_make_memory_entry_64 failed: %d\n", kr );
+            current_address += host_page_size;
+            continue;
+        }
+
+        assert( region_size == host_page_size );
+        mach_port_deallocate( mach_task_self(), object_name );
+        current_address += host_page_size;
+    }
+}
+
+static void kernel_get_write_watches( void *base, SIZE_T size, void **buffer, ULONG_PTR *count, BOOL reset )
+{
+    mach_vm_address_t current_address;
+    mach_vm_address_t region_address;
+    mach_vm_size_t region_size;
+    mach_msg_type_number_t info_count;
+    mach_port_t object_name;
+    vm_region_extended_info_data_t info;
+    data_size_t remaining_size;
+    SIZE_T buffer_len = *count;
+    size_t end;
+    kern_return_t kr;
+
+    assert( !(size & page_mask) );
+
+    end = (size_t)((char *)base + size);
+    remaining_size = ROUND_SIZE( base, size, host_page_mask );
+    current_address = (mach_vm_address_t)ROUND_ADDR( base, host_page_mask );
+    *count = 0;
+
+    while (remaining_size && buffer_len)
+    {
+        region_address = current_address;
+        info_count = VM_REGION_EXTENDED_INFO_COUNT;
+        kr = mach_vm_region( mach_task_self(), &region_address, &region_size, VM_REGION_EXTENDED_INFO,
+                             (vm_region_info_t)&info, &info_count, &object_name );
+
+        if (kr != KERN_SUCCESS)
+        {
+            ERR( "mach_vm_region failed: %d\n", kr );
+            break;
+        }
+
+        if (region_address > min( current_address, (mach_vm_address_t)end )) break;
+
+        if (info.share_mode != SM_COW)
+        {
+            size_t c_addr = max( (size_t)current_address, (size_t)base );
+            size_t region_end = min( (size_t)(region_address + region_size), end );
+
+            while (buffer_len && c_addr < region_end)
+            {
+                buffer[(*count)++] = (void *)c_addr;
+                --buffer_len;
+                c_addr += page_size;
+            }
+        }
+
+        current_address += region_size;
+        remaining_size -= region_size;
+    }
+
+    if (reset)
+    {
+        ULONG_PTR i;
+        vm_prot_t mach_prot;
+
+        for (i = 0; i < *count; i++)
+        {
+            current_address = (mach_vm_address_t)buffer[i];
+            mach_prot = get_mach_prot( current_address );
+            kr = mach_vm_protect( mach_task_self(), current_address, page_size, 0,
+                                  mach_prot | VM_PROT_COPY );
+
+            if (kr != KERN_SUCCESS)
+                ERR( "mach_vm_protect failed: %d\n", kr );
+        }
+    }
+}
 #else
 static void kernel_writewatch_init(void)
 {
```
Tim Clem (@tclem) commented about dlls/ntdll/unix/virtual.c:
```diff
         addr = next_addr;
     }
 }
+#elif defined(__APPLE__)
+static vm_prot_t get_mach_prot( mach_vm_address_t addr )
+{
+    size_t i, idx = (size_t)ROUND_ADDR( addr, host_page_mask ) >> page_shift;
+    const BYTE *vprot_ptr;
+    BYTE vprot = 0;
+    vm_prot_t mach_prot = VM_PROT_NONE;
+
+    if ((idx >> pages_vprot_shift) >= pages_vprot_size) return VM_PROT_NONE;
+    if (!pages_vprot[idx >> pages_vprot_shift]) return VM_PROT_NONE;
+    assert( host_page_mask >> page_shift <= pages_vprot_mask );
+    vprot_ptr = pages_vprot[idx >> pages_vprot_shift] + (idx & pages_vprot_mask);
+    for (i = 0; i < host_page_size / page_size; i++) vprot |= vprot_ptr[i];
```
Is it worth factoring this out of `get_host_page_vprot` and reusing that here? Also, the code in `get_host_page_vprot` has a separate case for `!_WIN64`; is that relevant here?
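For reference, one possible shape of that refactor (hypothetical sketch against the file's context; it assumes `get_host_page_vprot()` returns the OR of the per-4k vprot bytes covering one host page, as the open-coded loop above does):

```c
static vm_prot_t get_mach_prot( mach_vm_address_t addr )
{
    /* hypothetical: reuse the existing lookup instead of open-coding it */
    BYTE vprot = get_host_page_vprot( (void *)addr );
    vm_prot_t mach_prot = VM_PROT_NONE;

    if (!(vprot & VPROT_COMMITTED) || (vprot & VPROT_GUARD)) return VM_PROT_NONE;
    if (vprot & VPROT_READ) mach_prot |= VM_PROT_READ;
    if (vprot & (VPROT_WRITE | VPROT_WRITECOPY)) mach_prot |= VM_PROT_READ | VM_PROT_WRITE;
    if (vprot & VPROT_EXEC) mach_prot |= VM_PROT_READ | VM_PROT_EXECUTE;
    if (vprot & VPROT_WRITEWATCH) mach_prot &= ~VM_PROT_WRITE;
    return mach_prot;
}
```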
Tim Clem (@tclem) commented about dlls/ntdll/unix/virtual.c:
```diff
+        {
+            ERR( "mach_vm_region failed: %d\n", kr );
+            break;
+        }
+
+        if (region_address > current_address)
+        {
+            ERR( "trying to register unmapped region\n" );
+            break;
+        }
+
+        assert( info.protection == mach_prot );
+
+        region_size = (mach_vm_size_t)host_page_size;
+        kr = mach_vm_protect( mach_task_self(), current_address, region_size, 0,
+                              mach_prot | VM_PROT_COPY );
```
Is there a reason to do this page by page? Can you not just do one `mach_vm_protect` for the whole range?
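One way that could look, for reference (untested sketch using the patch's own helpers, `get_mach_prot()` and `host_page_size`): coalesce runs of pages whose effective Mach protection is identical and re-arm each run with a single call, since per-page calls are only required where the protection actually changes.

```c
while (current_address < end)
{
    vm_prot_t run_prot = get_mach_prot( current_address );
    mach_vm_address_t run_start = current_address;

    /* extend the run while the effective Mach protection stays the same */
    do current_address += host_page_size;
    while (current_address < end && get_mach_prot( current_address ) == run_prot);

    kr = mach_vm_protect( mach_task_self(), run_start, current_address - run_start,
                          0, run_prot | VM_PROT_COPY );
    if (kr != KERN_SUCCESS)
    {
        ERR( "mach_vm_protect failed: %d\n", kr );
        break;
    }
}
```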
Tim Clem (@tclem) commented about dlls/ntdll/unix/virtual.c:
ERR( "mach_vm_protect failed: %d\n", kr );
break;
}
kr = mach_make_memory_entry_64( mach_task_self(), ®ion_size, current_address, mach_prot,
&object_name, MACH_PORT_NULL );
if (kr != KERN_SUCCESS)
{
ERR( "mach_make_memory_entry_64 failed: %d\n", kr );
current_address += host_page_size;
continue;
}
assert( region_size == host_page_size );
mach_port_deallocate( mach_task_self(), object_name );
What's the point of making this memory entry?