Signed-off-by: Paul Gofman <pgofman@codeweavers.com> --- This patch greatly reduces the overhead of scanning huge ranges for a change in vprot. It improves Deathloop performance: the game reserves a huge memory area, then commits some pages within it, and frequently calls VirtualQuery() for addresses inside the allocated area.
dlls/ntdll/unix/virtual.c | 80 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 75 insertions(+), 5 deletions(-)
diff --git a/dlls/ntdll/unix/virtual.c b/dlls/ntdll/unix/virtual.c index 14ea3b11143..96ecce3daf4 100644 --- a/dlls/ntdll/unix/virtual.c +++ b/dlls/ntdll/unix/virtual.c @@ -947,17 +947,87 @@ static BYTE get_page_vprot( const void *addr ) * get_vprot_range_size * * Return the size of the region with equal masked protection byte. + * base and size should be page aligned. * The function assumes that base and size are page aligned and * base + size does not wrap around. */ static SIZE_T get_vprot_range_size( BYTE *base, SIZE_T size, BYTE mask, BYTE *vprot ) { - BYTE *addr; + SIZE_T i, start_idx, end_idx, aligned_start_idx, aligned_end_idx, count; + static const UINT_PTR index_align_mask = 7; + UINT64 vprot_word, mask_word, changed_word; + const BYTE *vprot_ptr; +#ifdef _WIN64 + size_t idx_page; +#endif + unsigned int j; + size_t idx; + + TRACE("base %p, size %p, mask %#x.\n", base, (void *)size, mask); + + start_idx = (size_t)base >> page_shift; + end_idx = start_idx + (size >> page_shift); + idx = start_idx; +#ifdef _WIN64 + end_idx = min( end_idx, pages_vprot_size << pages_vprot_shift ); + if (end_idx <= start_idx) + { + *vprot = 0; + return size; + } + idx_page = idx >> pages_vprot_shift; + idx &= pages_vprot_mask; + vprot_ptr = pages_vprot[idx_page++]; +#else + vprot_ptr = pages_vprot; +#endif + + aligned_start_idx = (start_idx + index_align_mask) & ~index_align_mask; + if (aligned_start_idx > end_idx) aligned_start_idx = end_idx; + + aligned_end_idx = end_idx & ~index_align_mask; + if (aligned_end_idx < aligned_start_idx) aligned_end_idx = aligned_start_idx; + + /* Page count in zero level page table on x64 is at least the multiples of 8 + * so we don't have to worry about crossing the boundary on unaligned idx values. */ + *vprot = vprot_ptr[idx]; + count = aligned_start_idx - start_idx; + + for (i = 0; i < count; ++i) + if ((*vprot ^ vprot_ptr[idx++]) & mask) return i << page_shift;
- *vprot = get_page_vprot( base ); - for (addr = base + page_size; addr != base + size; addr += page_size) - if ((*vprot ^ get_page_vprot( addr )) & mask) break; + count += aligned_end_idx - aligned_start_idx; + vprot_word = 0x101010101010101ull * *vprot; + mask_word = 0x101010101010101ull * mask; + for (; i < count; i += 8) + { +#ifdef _WIN64 + if (idx >> pages_vprot_shift) + { + idx = 0; + vprot_ptr = pages_vprot[idx_page++]; + } +#endif + changed_word = (vprot_word ^ *(UINT64 *)(vprot_ptr + idx)) & mask_word; + if (changed_word) + { + for (j = 0; !((BYTE *)&changed_word)[j]; ++j) ++i; + return i << page_shift; + } + idx += 8; + } + +#ifdef _WIN64 + if (aligned_end_idx != end_idx && (idx >> pages_vprot_shift)) + { + idx = 0; + vprot_ptr = pages_vprot[idx_page]; + } +#endif + count += end_idx - aligned_end_idx; + for (; i < count; ++i) + if ((*vprot ^ vprot_ptr[idx++]) & mask) return i << page_shift;
- return addr - base; + return *vprot & mask ? count << page_shift : size; }
/***********************************************************************