Not completely sure if it's worth having for 8.0, but opening this to share the target I'm trying to reach.
-- 
v5: ntdll: Add a thread-specific category group cache.
    ntdll: Use atomics and lock-free list for category groups.
    ntdll: Implement Low Fragmentation Heap frontend.
    ntdll: Count allocations and automatically enable LFH.
    ntdll: Implement HeapCompatibilityInformation.
    ntdll: Fix HeapWalk with empty uncommitted consecutive subheaps.
From: Rémi Bernon <rbernon@codeweavers.com>
---
 dlls/ntdll/heap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/dlls/ntdll/heap.c b/dlls/ntdll/heap.c
index 9403c83e92c..a2410957a44 100644
--- a/dlls/ntdll/heap.c
+++ b/dlls/ntdll/heap.c
@@ -653,7 +653,7 @@ static SUBHEAP *find_subheap( const struct heap *heap, const struct block *block
         if (!check_subheap( subheap, heap )) return NULL;
         if (contains( first_block( subheap ), blocks_size, block, sizeof(*block) )) return subheap;
         /* outside of blocks region, possible corruption or heap_walk */
-        if (contains( subheap_base( subheap ), subheap_size( subheap ), block, 0 )) return heap_walk ? subheap : NULL;
+        if (contains( subheap_base( subheap ), subheap_size( subheap ), block, 1 )) return heap_walk ? subheap : NULL;
     }
 
     return NULL;
From: Rémi Bernon <rbernon@codeweavers.com>
---
 dlls/kernel32/tests/heap.c | 10 -------
 dlls/ntdll/heap.c          | 57 ++++++++++++++++++++++++++++++++------
 2 files changed, 48 insertions(+), 19 deletions(-)
diff --git a/dlls/kernel32/tests/heap.c b/dlls/kernel32/tests/heap.c index a81cb1b7a15..29faf8c902e 100644 --- a/dlls/kernel32/tests/heap.c +++ b/dlls/kernel32/tests/heap.c @@ -827,11 +827,8 @@ static void test_HeapCreate(void) size = 0; SetLastError( 0xdeadbeef ); ret = pHeapQueryInformation( 0, HeapCompatibilityInformation, &compat_info, sizeof(compat_info), &size ); - todo_wine ok( !ret, "HeapQueryInformation succeeded\n" ); - todo_wine ok( GetLastError() == ERROR_NOACCESS, "got error %lu\n", GetLastError() ); - todo_wine ok( size == 0, "got size %Iu\n", size );
size = 0; @@ -871,7 +868,6 @@ static void test_HeapCreate(void) ok( ret, "HeapSetInformation failed, error %lu\n", GetLastError() ); ret = pHeapQueryInformation( heap, HeapCompatibilityInformation, &compat_info, sizeof(compat_info), &size ); ok( ret, "HeapQueryInformation failed, error %lu\n", GetLastError() ); - todo_wine ok( compat_info == 2, "got HeapCompatibilityInformation %lu\n", compat_info );
/* cannot be undone */ @@ -879,20 +875,15 @@ static void test_HeapCreate(void) compat_info = 0; SetLastError( 0xdeadbeef ); ret = pHeapSetInformation( heap, HeapCompatibilityInformation, &compat_info, sizeof(compat_info) ); - todo_wine ok( !ret, "HeapSetInformation succeeded\n" ); - todo_wine ok( GetLastError() == ERROR_GEN_FAILURE, "got error %lu\n", GetLastError() ); compat_info = 1; SetLastError( 0xdeadbeef ); ret = pHeapSetInformation( heap, HeapCompatibilityInformation, &compat_info, sizeof(compat_info) ); - todo_wine ok( !ret, "HeapSetInformation succeeded\n" ); - todo_wine ok( GetLastError() == ERROR_GEN_FAILURE, "got error %lu\n", GetLastError() ); ret = pHeapQueryInformation( heap, HeapCompatibilityInformation, &compat_info, sizeof(compat_info), &size ); ok( ret, "HeapQueryInformation failed, error %lu\n", GetLastError() ); - todo_wine ok( compat_info == 2, "got HeapCompatibilityInformation %lu\n", compat_info );
ret = HeapDestroy( heap ); @@ -931,7 +922,6 @@ static void test_HeapCreate(void) ok( ret, "HeapSetInformation failed, error %lu\n", GetLastError() ); ret = pHeapQueryInformation( heap, HeapCompatibilityInformation, &compat_info, sizeof(compat_info), &size ); ok( ret, "HeapQueryInformation failed, error %lu\n", GetLastError() ); - todo_wine ok( compat_info == 2, "got HeapCompatibilityInformation %lu\n", compat_info );
for (i = 0; i < 0x11; i++) ptrs[i] = pHeapAlloc( heap, 0, 24 + 2 * sizeof(void *) ); diff --git a/dlls/ntdll/heap.c b/dlls/ntdll/heap.c index a2410957a44..b300c7700af 100644 --- a/dlls/ntdll/heap.c +++ b/dlls/ntdll/heap.c @@ -39,6 +39,13 @@
WINE_DEFAULT_DEBUG_CHANNEL(heap);
+/* HeapCompatibilityInformation values */ + +#define HEAP_STD 0 +#define HEAP_LAL 1 +#define HEAP_LFH 2 + + /* undocumented RtlWalkHeap structure */
struct rtl_heap_entry @@ -194,6 +201,7 @@ struct heap DWORD force_flags; /* 0044/0074 */ /* end of the Windows 10 compatible struct layout */
+ LONG compat_info; /* HeapCompatibilityInformation / heap frontend type */ struct list entry; /* Entry in process heap list */ struct list subheap_list; /* Sub-heap list */ struct list large_list; /* Large blocks list */ @@ -1378,6 +1386,7 @@ HANDLE WINAPI RtlCreateHeap( ULONG flags, void *addr, SIZE_T total_size, SIZE_T heap->ffeeffee = 0xffeeffee; heap->auto_flags = (flags & HEAP_GROWABLE); heap->flags = (flags & ~HEAP_SHARED); + heap->compat_info = HEAP_STD; heap->magic = HEAP_MAGIC; heap->grow_size = max( HEAP_DEF_SIZE, total_size ); heap->min_size = commit_size; @@ -2039,21 +2048,24 @@ ULONG WINAPI RtlGetProcessHeaps( ULONG count, HANDLE *heaps ) * RtlQueryHeapInformation (NTDLL.@) */ NTSTATUS WINAPI RtlQueryHeapInformation( HANDLE handle, HEAP_INFORMATION_CLASS info_class, - void *info, SIZE_T size_in, PSIZE_T size_out ) + void *info, SIZE_T size_in, SIZE_T *size_out ) { + struct heap *heap; + ULONG flags; + + TRACE( "handle %p, info_class %u, info %p, size_in %Iu, size_out %p.\n", handle, info_class, info, size_in, size_out ); + switch (info_class) { case HeapCompatibilityInformation: + if (!(heap = unsafe_heap_from_handle( handle, 0, &flags ))) return STATUS_ACCESS_VIOLATION; if (size_out) *size_out = sizeof(ULONG); - - if (size_in < sizeof(ULONG)) - return STATUS_BUFFER_TOO_SMALL; - - *(ULONG *)info = 0; /* standard heap */ + if (size_in < sizeof(ULONG)) return STATUS_BUFFER_TOO_SMALL; + *(ULONG *)info = heap->compat_info; return STATUS_SUCCESS;
default: - FIXME("Unknown heap information class %u\n", info_class); + FIXME( "HEAP_INFORMATION_CLASS %u not implemented!\n", info_class ); return STATUS_INVALID_INFO_CLASS; } } @@ -2063,8 +2075,35 @@ NTSTATUS WINAPI RtlQueryHeapInformation( HANDLE handle, HEAP_INFORMATION_CLASS i */ NTSTATUS WINAPI RtlSetHeapInformation( HANDLE handle, HEAP_INFORMATION_CLASS info_class, void *info, SIZE_T size ) { - FIXME( "handle %p, info_class %d, info %p, size %Id stub!\n", handle, info_class, info, size ); - return STATUS_SUCCESS; + struct heap *heap; + ULONG flags; + + TRACE( "handle %p, info_class %u, info %p, size %Iu.\n", handle, info_class, info, size ); + + switch (info_class) + { + case HeapCompatibilityInformation: + { + ULONG compat_info; + + if (size < sizeof(ULONG)) return STATUS_BUFFER_TOO_SMALL; + if (!(heap = unsafe_heap_from_handle( handle, 0, &flags ))) return STATUS_INVALID_HANDLE; + if (heap->flags & HEAP_NO_SERIALIZE) return STATUS_SUCCESS; + + compat_info = *(ULONG *)info; + if (compat_info != HEAP_STD && compat_info != HEAP_LFH) + { + FIXME( "HeapCompatibilityInformation %lu not implemented!\n", compat_info ); + return STATUS_UNSUCCESSFUL; + } + if (InterlockedCompareExchange( &heap->compat_info, compat_info, HEAP_STD )) return STATUS_UNSUCCESSFUL; + return STATUS_SUCCESS; + } + + default: + FIXME( "HEAP_INFORMATION_CLASS %u not implemented!\n", info_class ); + return STATUS_SUCCESS; + } }
/***********************************************************************
From: Rémi Bernon <rbernon@codeweavers.com>
---
 dlls/kernel32/tests/heap.c |   2 -
 dlls/ntdll/heap.c          | 116 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 115 insertions(+), 3 deletions(-)
diff --git a/dlls/kernel32/tests/heap.c b/dlls/kernel32/tests/heap.c index 29faf8c902e..ed84441efe0 100644 --- a/dlls/kernel32/tests/heap.c +++ b/dlls/kernel32/tests/heap.c @@ -904,7 +904,6 @@ static void test_HeapCreate(void)
ret = pHeapQueryInformation( heap, HeapCompatibilityInformation, &compat_info, sizeof(compat_info), &size ); ok( ret, "HeapQueryInformation failed, error %lu\n", GetLastError() ); - todo_wine ok( compat_info == 2, "got HeapCompatibilityInformation %lu\n", compat_info );
ret = HeapDestroy( heap ); @@ -1195,7 +1194,6 @@ static void test_HeapCreate(void)
ret = pHeapQueryInformation( heap, HeapCompatibilityInformation, &compat_info, sizeof(compat_info), &size ); ok( ret, "HeapQueryInformation failed, error %lu\n", GetLastError() ); - todo_wine ok( compat_info == 2, "got HeapCompatibilityInformation %lu\n", compat_info );
/* locking is serialized */ diff --git a/dlls/ntdll/heap.c b/dlls/ntdll/heap.c index b300c7700af..4b1d9dfea20 100644 --- a/dlls/ntdll/heap.c +++ b/dlls/ntdll/heap.c @@ -189,6 +189,53 @@ typedef struct DECLSPEC_ALIGN(BLOCK_ALIGN) tagSUBHEAP C_ASSERT( sizeof(SUBHEAP) == offsetof(SUBHEAP, block) + sizeof(struct block) ); C_ASSERT( sizeof(SUBHEAP) == 4 * BLOCK_ALIGN );
+ +/* LFH block size categories */ +#define BLOCK_SIZE_SMALL_STEP (16) +#define BLOCK_SIZE_SMALL_MAX (1024) +#define BLOCK_SIZE_MEDIUM_STEP (192) +#define BLOCK_SIZE_MEDIUM_MAX (32768) + +#define BLOCK_SIZE_CATEGORY_SMALL( size ) ((size) > 0 ? ((size) - 1) / BLOCK_SIZE_SMALL_STEP : 0) +#define BLOCK_SIZE_CATEGORY_MEDIUM( size ) (BLOCK_SIZE_CATEGORY_SMALL( BLOCK_SIZE_SMALL_MAX ) + 1 + \ + (((size) > BLOCK_SIZE_SMALL_MAX) ? ((size) - BLOCK_SIZE_SMALL_MAX - 1) / BLOCK_SIZE_MEDIUM_STEP : 0)) + +#define BLOCK_SIZE_CATEGORY_COUNT (BLOCK_SIZE_CATEGORY_MEDIUM( BLOCK_SIZE_MEDIUM_MAX ) + 1) +#define BLOCK_SIZE_CATEGORY( size ) (((size) >= BLOCK_SIZE_MEDIUM_MAX) \ + ? (BLOCK_SIZE_CATEGORY_COUNT - 1) \ + : ((size) > BLOCK_SIZE_SMALL_MAX) \ + ? BLOCK_SIZE_CATEGORY_MEDIUM( size ) \ + : BLOCK_SIZE_CATEGORY_SMALL( size )) +#define BLOCK_CATEGORY_SIZE( category ) (category > BLOCK_SIZE_CATEGORY_SMALL( BLOCK_SIZE_SMALL_MAX ) \ + ? (BLOCK_SIZE_SMALL_MAX + (category - BLOCK_SIZE_CATEGORY_SMALL( BLOCK_SIZE_SMALL_MAX )) * BLOCK_SIZE_MEDIUM_STEP) \ + : ((SIZE_T)16 + category * BLOCK_SIZE_SMALL_STEP)) + +/* macro sanity checks */ +C_ASSERT( BLOCK_SIZE_CATEGORY( 0 ) == 0 ); +C_ASSERT( BLOCK_CATEGORY_SIZE( 0 ) == 16 ); +C_ASSERT( BLOCK_SIZE_CATEGORY( 16 ) == 0 ); +C_ASSERT( BLOCK_SIZE_CATEGORY( BLOCK_SIZE_SMALL_MAX ) == 63 ); +C_ASSERT( BLOCK_CATEGORY_SIZE( 63 ) == BLOCK_SIZE_SMALL_MAX ); +C_ASSERT( BLOCK_SIZE_CATEGORY( BLOCK_SIZE_SMALL_MAX + 1 ) == 64 ); +C_ASSERT( BLOCK_CATEGORY_SIZE( 64 ) == BLOCK_SIZE_SMALL_MAX + BLOCK_SIZE_MEDIUM_STEP ); +C_ASSERT( BLOCK_SIZE_CATEGORY( BLOCK_SIZE_SMALL_MAX + BLOCK_SIZE_MEDIUM_STEP ) == 64 ); + +/* keep the block size category count reasonable */ +C_ASSERT( BLOCK_SIZE_CATEGORY_COUNT <= 256 ); + +/* difference between block classes and all possible validation overhead must fit into block tail_size */ +C_ASSERT( BLOCK_SIZE_MEDIUM_STEP + 3 * BLOCK_ALIGN <= FIELD_MAX( struct block, tail_size ) ); + +/* a category of heap blocks of a certain size */ + +struct category +{ + /* counters for LFH activation */ + volatile LONG blocks_alive; + volatile LONG blocks_total; + volatile BOOL enabled; +}; + struct heap { /* win32/win64 */ DWORD_PTR unknown1[2]; /* 0000/0000 */ @@ -212,6 +259,7 @@ struct heap struct block **pending_free; /* Ring buffer for pending free requests */ RTL_CRITICAL_SECTION cs; struct entry free_lists[HEAP_NB_FREE_LISTS]; + struct category *categories; SUBHEAP subheap; };
@@ -544,6 +592,15 @@ static void heap_dump( const struct heap *heap ) TRACE( "heap: %p\n", heap ); TRACE( " next %p\n", LIST_ENTRY( heap->entry.next, struct heap, entry ) );
+ TRACE( " categories:\n" ); + for (i = 0; heap->categories && i < BLOCK_SIZE_CATEGORY_COUNT; i++) + { + struct category *category = heap->categories + i; + if (!category->blocks_alive && !category->blocks_total) continue; + TRACE( " %3u: size %#4Ix, blocks alive %ld, total %ld\n", i, BLOCK_CATEGORY_SIZE(i), + category->blocks_alive, category->blocks_total ); + } + TRACE( " free_lists: %p\n", heap->free_lists ); for (i = 0; i < HEAP_NB_FREE_LISTS; i++) TRACE( " %p: size %#8Ix, prev %p, next %p\n", heap->free_lists + i, get_free_list_block_size( i ), @@ -1430,6 +1487,14 @@ HANDLE WINAPI RtlCreateHeap( ULONG flags, void *addr, SIZE_T total_size, SIZE_T
heap_set_debug_flags( heap );
+ if (heap->flags & HEAP_GROWABLE) + { + /* allocate a large block to keep LFH block categories */ + SIZE_T size = sizeof(struct category) * BLOCK_SIZE_CATEGORY_COUNT; + NtAllocateVirtualMemory( NtCurrentProcess(), (void *)&heap->categories, + 0, &size, MEM_COMMIT, PAGE_READWRITE ); + } + /* link it into the per-process heap list */ if (process_heap) { @@ -1515,6 +1580,11 @@ HANDLE WINAPI RtlDestroyHeap( HANDLE handle ) NtFreeVirtualMemory( NtCurrentProcess(), &addr, &size, MEM_RELEASE ); } valgrind_notify_free_all( &heap->subheap, heap ); + if ((addr = heap->categories)) + { + size = 0; + NtFreeVirtualMemory( NtCurrentProcess(), &addr, &size, MEM_RELEASE ); + } size = 0; addr = heap; NtFreeVirtualMemory( NtCurrentProcess(), &addr, &size, MEM_RELEASE ); @@ -1572,6 +1642,28 @@ static NTSTATUS heap_allocate_block( struct heap *heap, ULONG flags, SIZE_T bloc return STATUS_SUCCESS; }
+static NTSTATUS heap_allocate_block_lfh( struct heap *heap, ULONG flags, SIZE_T block_size, + SIZE_T size, void **ret ) +{ + struct category *category, *last = heap->categories + BLOCK_SIZE_CATEGORY_COUNT - 1; + + category = heap->categories + BLOCK_SIZE_CATEGORY( block_size ); + if (!heap->categories || category == last) return STATUS_UNSUCCESSFUL; + + if (!category->enabled && category->blocks_alive <= 0x10 && category->blocks_total <= 0x800) + return STATUS_UNSUCCESSFUL; + category->enabled = TRUE; + + if (heap->compat_info != HEAP_LFH) + { + ULONG info = HEAP_LFH; + RtlSetHeapInformation( heap, HeapCompatibilityInformation, &info, sizeof(info) ); + return STATUS_UNSUCCESSFUL; + } + + return STATUS_NOT_IMPLEMENTED; +} + /*********************************************************************** * RtlAllocateHeap (NTDLL.@) */ @@ -1589,11 +1681,20 @@ void *WINAPI DECLSPEC_HOTPATCH RtlAllocateHeap( HANDLE handle, ULONG flags, SIZE status = STATUS_NO_MEMORY; else if (block_size >= HEAP_MIN_LARGE_BLOCK_SIZE) status = heap_allocate_large( heap, heap_flags, block_size, size, &ptr ); + else if (!heap_allocate_block_lfh( heap, heap_flags, block_size, size, &ptr )) + status = STATUS_SUCCESS; else { heap_lock( heap, heap_flags ); status = heap_allocate_block( heap, heap_flags, block_size, size, &ptr ); heap_unlock( heap, heap_flags ); + + if (!status && heap->categories) + { + block_size = block_get_size( (struct block *)ptr - 1 ); + heap->categories[BLOCK_SIZE_CATEGORY( block_size )].blocks_alive++; + heap->categories[BLOCK_SIZE_CATEGORY( block_size )].blocks_total++; + } }
if (!status) valgrind_notify_alloc( ptr, size, flags & HEAP_ZERO_MEMORY ); @@ -1628,9 +1729,14 @@ BOOLEAN WINAPI DECLSPEC_HOTPATCH RtlFreeHeap( HANDLE handle, ULONG flags, void * status = STATUS_SUCCESS; else { + SIZE_T block_size = block_get_size( block ); + heap_lock( heap, heap_flags ); status = heap_free_block( heap, heap_flags, block ); heap_unlock( heap, heap_flags ); + + if (!status && heap->categories) + heap->categories[BLOCK_SIZE_CATEGORY( block_size )].blocks_alive--; }
TRACE( "handle %p, flags %#lx, ptr %p, return %u, status %#lx.\n", handle, flags, ptr, !status, status ); @@ -1642,7 +1748,7 @@ static NTSTATUS heap_resize_block( struct heap *heap, ULONG flags, struct block SIZE_T size, SIZE_T *old_size, void **ret ) { SUBHEAP *subheap = block_get_subheap( heap, block ); - SIZE_T old_block_size; + SIZE_T old_category, old_block_size; struct entry *entry; struct block *next;
@@ -1669,6 +1775,7 @@ static NTSTATUS heap_resize_block( struct heap *heap, ULONG flags, struct block
old_block_size = block_get_size( block ); *old_size = old_block_size - block_get_overhead( block ); + old_category = BLOCK_SIZE_CATEGORY( old_block_size );
if (block_size >= HEAP_MIN_LARGE_BLOCK_SIZE) return STATUS_NO_MEMORY; /* growing small block to large block */
@@ -1708,6 +1815,13 @@ static NTSTATUS heap_resize_block( struct heap *heap, ULONG flags, struct block
heap_unlock( heap, flags );
+ if (heap->categories) + { + heap->categories[old_category].blocks_alive--; + heap->categories[BLOCK_SIZE_CATEGORY( block_size )].blocks_alive++; + heap->categories[BLOCK_SIZE_CATEGORY( block_size )].blocks_total++; + } + *ret = block + 1; return STATUS_SUCCESS; }
From: Rémi Bernon <rbernon@codeweavers.com>
This implements the reduced-fragmentation heap frontend by carving smaller blocks out of larger allocated blocks.
The super block and each sub-block are all flagged with BLOCK_FLAG_LFH.
The super-block (struct group) uses a standard struct block header, as well as a list entry to link it into a free list, and a free bitmap to track its free sub-blocks.
Sub-blocks reference their super-block through base_offset, using the block size as radix, instead of referencing the subheap.
---
 dlls/kernel32/tests/heap.c |  19 +--
 dlls/ntdll/heap.c          | 274 +++++++++++++++++++++++++++++++++++--
 2 files changed, 270 insertions(+), 23 deletions(-)
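For illustration, the base_offset mapping boils down to the following (a condensed restatement of the block_get_group/block_set_group helpers added below, not additional code):

```c
/* sketch: sub-blocks follow the group header back to back, each block_size bytes,
 * and base_offset holds the sub-block index rather than a byte offset to the subheap */
static struct group *sketch_block_get_group( const struct block *block, SIZE_T block_size )
{
    char *blocks = (char *)block - block->base_offset * block_size; /* back to the first sub-block */
    return (struct group *)blocks - 1;                              /* the group header sits right before it */
}
```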
diff --git a/dlls/kernel32/tests/heap.c b/dlls/kernel32/tests/heap.c index ed84441efe0..b3c3dc1eebc 100644 --- a/dlls/kernel32/tests/heap.c +++ b/dlls/kernel32/tests/heap.c @@ -1042,6 +1042,7 @@ static void test_HeapCreate(void) for (i = 1; i < count - 2; i++) { if (entries[i].wFlags != PROCESS_HEAP_ENTRY_BUSY) continue; + todo_wine /* Wine currently reports the LFH group as a single block */ ok( entries[i].cbData == 0x18 + 2 * sizeof(void *), "got cbData %#lx\n", entries[i].cbData ); ok( entries[i].cbOverhead == 0x8, "got cbOverhead %#x\n", entries[i].cbOverhead ); } @@ -1667,9 +1668,7 @@ static void test_GlobalAlloc(void) ok( size == small_size, "GlobalSize returned %Iu\n", size ); SetLastError( 0xdeadbeef ); tmp_mem = GlobalReAlloc( mem, small_size, 0 ); - todo_wine ok( !tmp_mem, "GlobalReAlloc succeeded\n" ); - todo_wine ok( GetLastError() == ERROR_NOT_ENOUGH_MEMORY, "got error %lu\n", GetLastError() ); if (tmp_mem) mem = tmp_mem; tmp_mem = GlobalReAlloc( mem, 1024 * 1024, GMEM_MODIFY ); @@ -1685,7 +1684,6 @@ static void test_GlobalAlloc(void) ok( !!mem, "GlobalAlloc failed, error %lu\n", GetLastError() ); tmp_mem = GlobalReAlloc( mem, small_size + 1, GMEM_MOVEABLE ); ok( !!tmp_mem, "GlobalReAlloc failed, error %lu\n", GetLastError() ); - todo_wine ok( tmp_mem != mem, "GlobalReAlloc didn't relocate memory\n" ); ptr = GlobalLock( tmp_mem ); ok( !!ptr, "GlobalLock failed, error %lu\n", GetLastError() ); @@ -1832,8 +1830,8 @@ static void test_GlobalAlloc(void) { ok( !is_mem_entry( tmp_mem ), "unexpected moveable %p\n", tmp_mem ); if (flags == GMEM_MODIFY) ok( tmp_mem == mem, "GlobalReAlloc returned %p\n", tmp_mem ); - else if (flags != GMEM_MOVEABLE) todo_wine_if(!flags) ok( !tmp_mem, "GlobalReAlloc succeeded\n" ); - else todo_wine ok( tmp_mem != mem, "GlobalReAlloc returned %p\n", tmp_mem ); + else if (flags != GMEM_MOVEABLE) ok( !tmp_mem, "GlobalReAlloc succeeded\n" ); + else ok( tmp_mem != mem, "GlobalReAlloc returned %p\n", tmp_mem ); } else { @@ -1847,7 +1845,7 @@ static void test_GlobalAlloc(void)
size = GlobalSize( mem ); if (flags == GMEM_MOVEABLE) ok( size == 0 || broken( size == 1 ) /* w7 */, "GlobalSize returned %Iu\n", size ); - else todo_wine_if(!flags) ok( size == small_size, "GlobalSize returned %Iu\n", size ); + else ok( size == small_size, "GlobalSize returned %Iu\n", size );
mem = GlobalFree( mem ); ok( !mem, "GlobalFree failed, error %lu\n", GetLastError() ); @@ -2405,9 +2403,7 @@ static void test_LocalAlloc(void) ok( size == small_size, "LocalSize returned %Iu\n", size ); SetLastError( 0xdeadbeef ); tmp_mem = LocalReAlloc( mem, small_size, 0 ); - todo_wine ok( !tmp_mem, "LocalReAlloc succeeded\n" ); - todo_wine ok( GetLastError() == ERROR_NOT_ENOUGH_MEMORY, "got error %lu\n", GetLastError() ); if (tmp_mem) mem = tmp_mem; tmp_mem = LocalReAlloc( mem, 1024 * 1024, LMEM_MODIFY ); @@ -2423,7 +2419,6 @@ static void test_LocalAlloc(void) ok( !!mem, "LocalAlloc failed, error %lu\n", GetLastError() ); tmp_mem = LocalReAlloc( mem, small_size + 1, LMEM_MOVEABLE ); ok( !!tmp_mem, "LocalReAlloc failed, error %lu\n", GetLastError() ); - todo_wine ok( tmp_mem != mem, "LocalReAlloc didn't relocate memory\n" ); ptr = LocalLock( tmp_mem ); ok( !!ptr, "LocalLock failed, error %lu\n", GetLastError() ); @@ -2516,13 +2511,13 @@ static void test_LocalAlloc(void) tmp_mem = LocalReAlloc( mem, 0, flags ); ok( !is_mem_entry( tmp_mem ), "unexpected moveable %p\n", tmp_mem ); if (flags & LMEM_MODIFY) ok( tmp_mem == mem, "LocalReAlloc returned %p\n", tmp_mem ); - else if (flags != LMEM_MOVEABLE) todo_wine_if(!flags) ok( !tmp_mem, "LocalReAlloc succeeded\n" ); - else todo_wine ok( tmp_mem != mem, "LocalReAlloc returned %p\n", tmp_mem ); + else if (flags != LMEM_MOVEABLE) ok( !tmp_mem, "LocalReAlloc succeeded\n" ); + else ok( tmp_mem != mem, "LocalReAlloc returned %p\n", tmp_mem ); if (tmp_mem) mem = tmp_mem;
size = LocalSize( mem ); if (flags == LMEM_MOVEABLE) ok( size == 0 || broken( size == 1 ) /* w7 */, "LocalSize returned %Iu\n", size ); - else todo_wine_if(!flags) ok( size == small_size, "LocalSize returned %Iu\n", size ); + else ok( size == small_size, "LocalSize returned %Iu\n", size );
mem = LocalFree( mem ); ok( !mem, "LocalFree failed, error %lu\n", GetLastError() ); diff --git a/dlls/ntdll/heap.c b/dlls/ntdll/heap.c index 4b1d9dfea20..e3800cad6af 100644 --- a/dlls/ntdll/heap.c +++ b/dlls/ntdll/heap.c @@ -101,6 +101,7 @@ C_ASSERT( sizeof(struct block) == 8 ); #define BLOCK_FLAG_PREV_FREE 0x02 #define BLOCK_FLAG_FREE_LINK 0x03 #define BLOCK_FLAG_LARGE 0x04 +#define BLOCK_FLAG_LFH 0x08 /* block is handled by the LFH frontend */ #define BLOCK_FLAG_USER_INFO 0x10 /* user flags up to 0xf0 */ #define BLOCK_FLAG_USER_MASK 0xf0
@@ -227,13 +228,15 @@ C_ASSERT( BLOCK_SIZE_CATEGORY_COUNT <= 256 ); C_ASSERT( BLOCK_SIZE_MEDIUM_STEP + 3 * BLOCK_ALIGN <= FIELD_MAX( struct block, tail_size ) );
/* a category of heap blocks of a certain size */ - struct category { /* counters for LFH activation */ volatile LONG blocks_alive; volatile LONG blocks_total; volatile BOOL enabled; + + /* list of groups with free blocks */ + struct list groups; };
struct heap @@ -776,19 +779,22 @@ static inline BOOL subheap_decommit( const struct heap *heap, SUBHEAP *subheap,
static void block_init_free( struct block *block, ULONG flags, SUBHEAP *subheap, SIZE_T block_size ) { - const char *end = (char *)block + block_size, *commit_end = subheap_commit_end( subheap ); struct entry *entry = (struct entry *)block;
valgrind_make_writable( block, sizeof(*entry) ); block_set_type( block, BLOCK_TYPE_FREE ); - block_set_base( block, subheap_base( subheap ) ); block_set_flags( block, ~0, BLOCK_FLAG_FREE ); block_set_size( block, block_size );
- /* If debugging, erase the freed block content */ - - if (end > commit_end) end = commit_end; - if (end > (char *)(entry + 1)) mark_block_free( entry + 1, end - (char *)(entry + 1), flags ); + if (!subheap) /* LFH block initialization, just clear its data */ + mark_block_free( entry + 1, (char *)block + block_size - (char *)(entry + 1), flags ); + else + { + const char *end = (char *)block + block_size, *commit_end; + block_set_base( block, subheap_base( subheap ) ); + if (end > (commit_end = subheap_commit_end( subheap ))) end = commit_end; + if (end > (char *)(entry + 1)) mark_block_free( entry + 1, end - (char *)(entry + 1), flags ); + } }
static void insert_free_block( struct heap *heap, ULONG flags, SUBHEAP *subheap, struct block *block ) @@ -1104,7 +1110,9 @@ static BOOL validate_free_block( const struct heap *heap, const SUBHEAP *subheap err = "invalid previous free block pointer"; else if (!(block_get_flags( prev ) & BLOCK_FLAG_FREE) || block_get_type( prev ) != BLOCK_TYPE_FREE) err = "invalid previous free block header"; - else if ((next = next_block( subheap, block ))) + else if ((next = next_block( subheap, block )) && + /* LFH blocks do not use BLOCK_FLAG_PREV_FREE or back pointer */ + !(block_get_flags( block ) & BLOCK_FLAG_LFH)) { if (!(block_get_flags( next ) & BLOCK_FLAG_PREV_FREE)) err = "invalid next block flags"; @@ -1154,7 +1162,10 @@ static BOOL validate_used_block( const struct heap *heap, const SUBHEAP *subheap err = "invalid block size"; else if (block->tail_size > block_get_size( block ) - sizeof(*block)) err = "invalid block unused size"; - else if ((next = next_block( subheap, block )) && (block_get_flags( next ) & BLOCK_FLAG_PREV_FREE)) + else if ((next = next_block( subheap, block )) && + (block_get_flags( next ) & BLOCK_FLAG_PREV_FREE) && + /* LFH blocks do not use BLOCK_FLAG_PREV_FREE or back pointer */ + !(block_get_flags( block ) & BLOCK_FLAG_LFH)) err = "invalid next block flags"; else if (block_get_flags( block ) & BLOCK_FLAG_PREV_FREE) { @@ -1291,6 +1302,16 @@ static inline struct block *unsafe_block_from_ptr( struct heap *heap, ULONG flag
if ((ULONG_PTR)ptr % BLOCK_ALIGN) err = "invalid ptr alignment"; + else if (block_get_flags( block ) & BLOCK_FLAG_LFH) + { + /* LFH blocks base points to the group, not the subheap */ + if (block_get_type( block ) == BLOCK_TYPE_DEAD) + err = "delayed freed block"; + else if (block_get_type( block ) == BLOCK_TYPE_FREE) + err = "already freed block"; + else if (block_get_type( block ) != BLOCK_TYPE_USED) + err = "invalid block type"; + } else if ((subheap = block_get_subheap( heap, block )) >= (SUBHEAP *)block) err = "invalid base offset"; else if (block_get_type( block ) == BLOCK_TYPE_USED) @@ -1493,6 +1514,8 @@ HANDLE WINAPI RtlCreateHeap( ULONG flags, void *addr, SIZE_T total_size, SIZE_T SIZE_T size = sizeof(struct category) * BLOCK_SIZE_CATEGORY_COUNT; NtAllocateVirtualMemory( NtCurrentProcess(), (void *)&heap->categories, 0, &size, MEM_COMMIT, PAGE_READWRITE ); + for (i = 0; i < BLOCK_SIZE_CATEGORY_COUNT; ++i) + list_init( &heap->categories[i].groups ); }
/* link it into the per-process heap list */ @@ -1642,10 +1665,170 @@ static NTSTATUS heap_allocate_block( struct heap *heap, ULONG flags, SIZE_T bloc return STATUS_SUCCESS; }
+/* Low Fragmentation Heap frontend */ + +/* header for every LFH block group */ + +struct DECLSPEC_ALIGN(BLOCK_ALIGN) group +{ + struct block block; + struct list entry; + /* one bit for each free block and the highest bit as unlinked flag */ + LONG free_bits; +}; + +#define GROUP_FLAG_FREE (1u << (sizeof(((struct group *)0)->free_bits) * 8 - 1)) + +static inline UINT block_get_group_index( const struct block *block ) +{ + return block->base_offset; +} + +static inline struct group *block_get_group( const struct block *block ) +{ + SIZE_T block_size = block_get_size( block ); + void *blocks = (char *)block - block_get_group_index( block ) * block_size; + return (struct group *)blocks - 1; +} + +static inline void block_set_group( struct block *block, SIZE_T block_size, const struct group *group ) +{ + const char *blocks = (char *)(group + 1); + block->base_offset = ((char *)block - blocks) / block_size; +} + +static inline struct block *group_get_block( struct group *group, SIZE_T block_size, UINT index ) +{ + char *blocks = (char *)(group + 1); + return (struct block *)(blocks + index * block_size); +} + +/* lookup a free block using the group free_bits, the current thread must own the group */ +static inline LONG group_find_free_block( struct group *group, SIZE_T block_size, struct block **block ) +{ +#if defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 3))) + int i = __builtin_ffs( group->free_bits ) - 1; +#else + int i = sizeof(group->free_bits) * 8; + while (i--) if (group->free_bits & (1 << i)) break; +#endif + /* we remove the group from the free list once all its blocks are used, i will never be -1 */ + *block = group_get_block( group, block_size, i ); + return group->free_bits &= ~(1 << i); +} + +/* allocate a new group block using non-LFH allocation, returns a group owned by current thread */ +static struct group *group_allocate( struct heap *heap, ULONG flags, SIZE_T block_size, struct category *category ) +{ + SIZE_T i, size, group_size; + struct group *group; + NTSTATUS status; + char *ptr; + + size = sizeof(*group) + sizeof(group->free_bits) * 8 * block_size - sizeof(struct block); + group_size = heap_get_block_size( heap, flags, size ); + + if (group_size >= HEAP_MIN_LARGE_BLOCK_SIZE) + status = heap_allocate_large( heap, flags & ~HEAP_ZERO_MEMORY, group_size, size, (void **)&ptr ); + else + status = heap_allocate_block( heap, flags & ~HEAP_ZERO_MEMORY, group_size, size, (void **)&ptr ); + + if (status) return NULL; + + group = CONTAINING_RECORD( (struct block *)ptr - 1, struct group, block ); + block_set_flags( &group->block, 0, BLOCK_FLAG_LFH ); + group->free_bits = ~0; + + for (i = 0; i < sizeof(group->free_bits) * 8; ++i) + { + struct block *block = group_get_block( group, block_size, i ); + block_init_free( block, flags, NULL, block_size ); + block_set_flags( block, 0, BLOCK_FLAG_LFH ); + block_set_group( block, block_size, group ); + } + + return group; +} + +/* release a fully freed group to the non-LFH subheap, group must be owned by current thread */ +static NTSTATUS group_release( struct heap *heap, ULONG flags, struct category *category, struct group *group ) +{ + NTSTATUS status; + + block_set_flags( &group->block, BLOCK_FLAG_LFH, 0 ); + + if (block_get_flags( &group->block ) & BLOCK_FLAG_LARGE) + status = heap_free_large( heap, flags, &group->block ); + else + status = heap_free_block( heap, flags, &group->block ); + + return status; +} + +/* acquire a group from the category, thread takes ownership of a shared group or allocates a 
new one */ +static struct group *heap_acquire_category_group( struct heap *heap, ULONG flags, SIZE_T block_size, + struct category *category ) +{ + struct group *group; + struct list *entry; + + if (!(entry = list_head( &category->groups ))) + group = group_allocate( heap, flags, block_size, category ); + else + { + group = LIST_ENTRY( entry, struct group, entry ); + list_remove( &group->entry ); + } + + return group; +} + +/* release a thread owned and fully freed group to the category shared group, or free its memory */ +static NTSTATUS heap_release_category_group( struct heap *heap, ULONG flags, struct category *category, + struct group *group ) +{ + NTSTATUS status = STATUS_SUCCESS; + + /* try re-using the block group instead of releasing it */ + if (list_empty( &category->groups )) + list_add_tail( &category->groups, &group->entry ); + else + status = group_release( heap, flags, category, group ); + + return status; +} + +static struct block *find_free_block_lfh( struct heap *heap, ULONG flags, SIZE_T block_size, + struct category *category ) +{ + struct block *block; + struct group *group; + + /* acquire a group, the thread will own it and no other thread can clear free bits. + * some other thread might still set the free bits if they are freeing blocks. + */ + if (!(group = heap_acquire_category_group( heap, flags, block_size, category ))) return NULL; + + /* serialize with heap_free_block_lfh: atomically set GROUP_FLAG_FREE when the free bits are all 0. */ + if (group_find_free_block( group, block_size, &block )) + group->free_bits &= ~GROUP_FLAG_FREE; + else if (!group->free_bits) + group->free_bits = GROUP_FLAG_FREE; + + /* if GROUP_FLAG_FREE was set, thread released its ownership. */ + if (group->free_bits & GROUP_FLAG_FREE) return block; + + /* otherwise there is still some free blocks, put the group back into the category */ + list_add_tail( &category->groups, &group->entry ); + + return block; +} + static NTSTATUS heap_allocate_block_lfh( struct heap *heap, ULONG flags, SIZE_T block_size, SIZE_T size, void **ret ) { struct category *category, *last = heap->categories + BLOCK_SIZE_CATEGORY_COUNT - 1; + struct block *block;
category = heap->categories + BLOCK_SIZE_CATEGORY( block_size ); if (!heap->categories || category == last) return STATUS_UNSUCCESSFUL; @@ -1661,7 +1844,53 @@ static NTSTATUS heap_allocate_block_lfh( struct heap *heap, ULONG flags, SIZE_T return STATUS_UNSUCCESSFUL; }
- return STATUS_NOT_IMPLEMENTED; + block_size = BLOCK_CATEGORY_SIZE( BLOCK_SIZE_CATEGORY( block_size ) ); + + heap_lock( heap, flags ); + if ((block = find_free_block_lfh( heap, flags, block_size, category ))) + { + block_set_type( block, BLOCK_TYPE_USED ); + block_set_flags( block, ~BLOCK_FLAG_LFH, BLOCK_USER_FLAGS( flags ) ); + block_set_size( block, block_size ); + block->tail_size = block_size - sizeof(*block) - size; + initialize_block( block, 0, size, flags ); + mark_block_tail( block, flags ); + *ret = block + 1; + } + heap_unlock( heap, flags ); + + return block ? STATUS_SUCCESS : STATUS_NO_MEMORY; +} + +static NTSTATUS heap_free_block_lfh( struct heap *heap, ULONG flags, struct block *block ) +{ + struct category *category, *last = heap->categories + BLOCK_SIZE_CATEGORY_COUNT - 1; + SIZE_T i, block_size = block_get_size( block ); + struct group *group = block_get_group( block ); + NTSTATUS status = STATUS_SUCCESS; + + if (!(block_get_flags( block ) & BLOCK_FLAG_LFH)) return STATUS_UNSUCCESSFUL; + + category = heap->categories + BLOCK_SIZE_CATEGORY( block_size ); + if (category == last) return STATUS_UNSUCCESSFUL; + + heap_lock( heap, flags ); + + i = block_get_group_index( block ); + block_init_free( block, flags, NULL, block_size ); + block_set_flags( block, 0, BLOCK_FLAG_LFH ); + block_set_group( block, block_size, group ); + + /* if this was the last used block in a group and GROUP_FLAG_FREE was set */ + if ((group->free_bits |= (1 << i)) == ~0) + { + /* thread now owns the group, and can release it to its category */ + status = heap_release_category_group( heap, flags, category, group ); + } + + heap_unlock( heap, flags ); + + return status; }
/*********************************************************************** @@ -1727,6 +1956,8 @@ BOOLEAN WINAPI DECLSPEC_HOTPATCH RtlFreeHeap( HANDLE handle, ULONG flags, void * status = heap_free_large( heap, heap_flags, block ); else if (!(block = heap_delay_free( heap, heap_flags, block ))) status = STATUS_SUCCESS; + else if (!heap_free_block_lfh( heap, heap_flags, block )) + status = STATUS_SUCCESS; else { SIZE_T block_size = block_get_size( block ); @@ -1747,10 +1978,10 @@ BOOLEAN WINAPI DECLSPEC_HOTPATCH RtlFreeHeap( HANDLE handle, ULONG flags, void * static NTSTATUS heap_resize_block( struct heap *heap, ULONG flags, struct block *block, SIZE_T block_size, SIZE_T size, SIZE_T *old_size, void **ret ) { - SUBHEAP *subheap = block_get_subheap( heap, block ); SIZE_T old_category, old_block_size; struct entry *entry; struct block *next; + SUBHEAP *subheap;
if (block_get_flags( block ) & BLOCK_FLAG_LARGE) { @@ -1779,8 +2010,29 @@ static NTSTATUS heap_resize_block( struct heap *heap, ULONG flags, struct block
if (block_size >= HEAP_MIN_LARGE_BLOCK_SIZE) return STATUS_NO_MEMORY; /* growing small block to large block */
+ if (block_get_flags( block ) & BLOCK_FLAG_LFH) + { + /* as native LFH does it with different block size: refuse to resize even though we could */ + if (ROUND_SIZE( *old_size, BLOCK_ALIGN - 1) != ROUND_SIZE( size, BLOCK_ALIGN - 1)) return STATUS_NO_MEMORY; + if (size >= *old_size) return STATUS_NO_MEMORY; + + heap_lock( heap, flags ); + + block_set_flags( block, BLOCK_FLAG_USER_MASK & ~BLOCK_FLAG_USER_INFO, BLOCK_USER_FLAGS( flags ) ); + block->tail_size = old_block_size - sizeof(*block) - size; + initialize_block( block, *old_size, size, flags ); + mark_block_tail( block, flags ); + + heap_unlock( heap, flags ); + + *ret = block + 1; + return STATUS_SUCCESS; + } + heap_lock( heap, flags );
+ subheap = block_get_subheap( heap, block ); + if (block_size > old_block_size) { if (!(next = next_block( subheap, block )) || !(block_get_flags( next ) & BLOCK_FLAG_FREE) ||
From: Rémi Bernon <rbernon@codeweavers.com>
---
 dlls/kernel32/tests/heap.c |  1 -
 dlls/ntdll/heap.c          | 56 ++++++++++++++++++++------------------
 2 files changed, 29 insertions(+), 28 deletions(-)
diff --git a/dlls/kernel32/tests/heap.c b/dlls/kernel32/tests/heap.c index b3c3dc1eebc..0d71669cebb 100644 --- a/dlls/kernel32/tests/heap.c +++ b/dlls/kernel32/tests/heap.c @@ -1221,7 +1221,6 @@ static void test_HeapCreate(void) thread_params.flags = 0; SetEvent( thread_params.start_event ); res = WaitForSingleObject( thread_params.ready_event, 100 ); - todo_wine ok( !res, "WaitForSingleObject returned %#lx, error %lu\n", res, GetLastError() ); ret = HeapUnlock( heap ); ok( ret, "HeapUnlock failed, error %lu\n", GetLastError() ); diff --git a/dlls/ntdll/heap.c b/dlls/ntdll/heap.c index e3800cad6af..4d6c018dddd 100644 --- a/dlls/ntdll/heap.c +++ b/dlls/ntdll/heap.c @@ -236,7 +236,7 @@ struct category volatile BOOL enabled;
/* list of groups with free blocks */ - struct list groups; + SLIST_HEADER groups; };
struct heap @@ -1515,7 +1515,7 @@ HANDLE WINAPI RtlCreateHeap( ULONG flags, void *addr, SIZE_T total_size, SIZE_T NtAllocateVirtualMemory( NtCurrentProcess(), (void *)&heap->categories, 0, &size, MEM_COMMIT, PAGE_READWRITE ); for (i = 0; i < BLOCK_SIZE_CATEGORY_COUNT; ++i) - list_init( &heap->categories[i].groups ); + RtlInitializeSListHead( &heap->categories[i].groups ); }
/* link it into the per-process heap list */ @@ -1672,13 +1672,16 @@ static NTSTATUS heap_allocate_block( struct heap *heap, ULONG flags, SIZE_T bloc struct DECLSPEC_ALIGN(BLOCK_ALIGN) group { struct block block; - struct list entry; + SINGLE_LIST_ENTRY entry; /* one bit for each free block and the highest bit as unlinked flag */ LONG free_bits; };
#define GROUP_FLAG_FREE (1u << (sizeof(((struct group *)0)->free_bits) * 8 - 1))
+/* entry actually is a SLIST_ENTRY and needs to be 16B aligned on 64bit */ +C_ASSERT( offsetof(struct group, entry) % 8 == 0 ); + static inline UINT block_get_group_index( const struct block *block ) { return block->base_offset; @@ -1714,7 +1717,7 @@ static inline LONG group_find_free_block( struct group *group, SIZE_T block_size #endif /* we remove the group from the free list once all its blocks are used, i will never be -1 */ *block = group_get_block( group, block_size, i ); - return group->free_bits &= ~(1 << i); + return InterlockedAnd( &group->free_bits, ~(1 << i) ) & ~(1 << i); }
/* allocate a new group block using non-LFH allocation, returns a group owned by current thread */ @@ -1728,11 +1731,15 @@ static struct group *group_allocate( struct heap *heap, ULONG flags, SIZE_T bloc size = sizeof(*group) + sizeof(group->free_bits) * 8 * block_size - sizeof(struct block); group_size = heap_get_block_size( heap, flags, size );
+ heap_lock( heap, flags ); + if (group_size >= HEAP_MIN_LARGE_BLOCK_SIZE) status = heap_allocate_large( heap, flags & ~HEAP_ZERO_MEMORY, group_size, size, (void **)&ptr ); else status = heap_allocate_block( heap, flags & ~HEAP_ZERO_MEMORY, group_size, size, (void **)&ptr );
+ heap_unlock( heap, flags ); + if (status) return NULL;
group = CONTAINING_RECORD( (struct block *)ptr - 1, struct group, block ); @@ -1755,6 +1762,8 @@ static NTSTATUS group_release( struct heap *heap, ULONG flags, struct category * { NTSTATUS status;
+ heap_lock( heap, flags ); + block_set_flags( &group->block, BLOCK_FLAG_LFH, 0 );
if (block_get_flags( &group->block ) & BLOCK_FLAG_LARGE) @@ -1762,6 +1771,8 @@ static NTSTATUS group_release( struct heap *heap, ULONG flags, struct category * else status = heap_free_block( heap, flags, &group->block );
+ heap_unlock( heap, flags ); + return status; }
@@ -1770,15 +1781,12 @@ static struct group *heap_acquire_category_group( struct heap *heap, ULONG flags struct category *category ) { struct group *group; - struct list *entry; + SLIST_ENTRY *entry;
- if (!(entry = list_head( &category->groups ))) + if (!(entry = RtlInterlockedPopEntrySList( &category->groups ))) group = group_allocate( heap, flags, block_size, category ); else - { - group = LIST_ENTRY( entry, struct group, entry ); - list_remove( &group->entry ); - } + group = CONTAINING_RECORD( entry, struct group, entry );
return group; } @@ -1790,8 +1798,12 @@ static NTSTATUS heap_release_category_group( struct heap *heap, ULONG flags, str NTSTATUS status = STATUS_SUCCESS;
/* try re-using the block group instead of releasing it */ - if (list_empty( &category->groups )) - list_add_tail( &category->groups, &group->entry ); +#ifdef _WIN64 + if (category->groups.Header16.Depth <= 16) +#else + if (category->groups.Depth <= 16) +#endif + RtlInterlockedPushEntrySList( &category->groups, (SLIST_ENTRY *)&group->entry ); else status = group_release( heap, flags, category, group );
@@ -1811,15 +1823,15 @@ static struct block *find_free_block_lfh( struct heap *heap, ULONG flags, SIZE_T
/* serialize with heap_free_block_lfh: atomically set GROUP_FLAG_FREE when the free bits are all 0. */ if (group_find_free_block( group, block_size, &block )) - group->free_bits &= ~GROUP_FLAG_FREE; - else if (!group->free_bits) - group->free_bits = GROUP_FLAG_FREE; + InterlockedAnd( &group->free_bits, ~GROUP_FLAG_FREE ); + else + InterlockedCompareExchange( &group->free_bits, GROUP_FLAG_FREE, 0 );
/* if GROUP_FLAG_FREE was set, thread released its ownership. */ if (group->free_bits & GROUP_FLAG_FREE) return block;
/* otherwise there is still some free blocks, put the group back into the category */ - list_add_tail( &category->groups, &group->entry ); + RtlInterlockedPushEntrySList( &category->groups, (SLIST_ENTRY *)&group->entry );
return block; } @@ -1846,7 +1858,6 @@ static NTSTATUS heap_allocate_block_lfh( struct heap *heap, ULONG flags, SIZE_T
block_size = BLOCK_CATEGORY_SIZE( BLOCK_SIZE_CATEGORY( block_size ) );
- heap_lock( heap, flags ); if ((block = find_free_block_lfh( heap, flags, block_size, category ))) { block_set_type( block, BLOCK_TYPE_USED ); @@ -1857,7 +1868,6 @@ static NTSTATUS heap_allocate_block_lfh( struct heap *heap, ULONG flags, SIZE_T mark_block_tail( block, flags ); *ret = block + 1; } - heap_unlock( heap, flags );
return block ? STATUS_SUCCESS : STATUS_NO_MEMORY; } @@ -1874,22 +1884,18 @@ static NTSTATUS heap_free_block_lfh( struct heap *heap, ULONG flags, struct bloc category = heap->categories + BLOCK_SIZE_CATEGORY( block_size ); if (category == last) return STATUS_UNSUCCESSFUL;
- heap_lock( heap, flags ); - i = block_get_group_index( block ); block_init_free( block, flags, NULL, block_size ); block_set_flags( block, 0, BLOCK_FLAG_LFH ); block_set_group( block, block_size, group );
/* if this was the last used block in a group and GROUP_FLAG_FREE was set */ - if ((group->free_bits |= (1 << i)) == ~0) + if (InterlockedOr( &group->free_bits, 1 << i ) == ~(1 << i)) { /* thread now owns the group, and can release it to its category */ status = heap_release_category_group( heap, flags, category, group ); }
- heap_unlock( heap, flags ); - return status; }
@@ -2016,15 +2022,11 @@ static NTSTATUS heap_resize_block( struct heap *heap, ULONG flags, struct block if (ROUND_SIZE( *old_size, BLOCK_ALIGN - 1) != ROUND_SIZE( size, BLOCK_ALIGN - 1)) return STATUS_NO_MEMORY; if (size >= *old_size) return STATUS_NO_MEMORY;
- heap_lock( heap, flags ); - block_set_flags( block, BLOCK_FLAG_USER_MASK & ~BLOCK_FLAG_USER_INFO, BLOCK_USER_FLAGS( flags ) ); block->tail_size = old_block_size - sizeof(*block) - size; initialize_block( block, *old_size, size, flags ); mark_block_tail( block, flags );
- heap_unlock( heap, flags ); - *ret = block + 1; return STATUS_SUCCESS; }
From: Rémi Bernon <rbernon@codeweavers.com>
---
 dlls/ntdll/heap.c       | 64 +++++++++++++++++++++++++++++++++++++++--
 dlls/ntdll/loader.c     |  2 ++
 dlls/ntdll/ntdll_misc.h |  1 +
 3 files changed, 65 insertions(+), 2 deletions(-)
diff --git a/dlls/ntdll/heap.c b/dlls/ntdll/heap.c index 4d6c018dddd..aaa914b28e5 100644 --- a/dlls/ntdll/heap.c +++ b/dlls/ntdll/heap.c @@ -227,6 +227,11 @@ C_ASSERT( BLOCK_SIZE_CATEGORY_COUNT <= 256 ); /* difference between block classes and all possible validation overhead must fit into block tail_size */ C_ASSERT( BLOCK_SIZE_MEDIUM_STEP + 3 * BLOCK_ALIGN <= FIELD_MAX( struct block, tail_size ) );
+/* affinity to tid mapping array, limits the number of thread-local caches, + * and additional threads will fight for the global categories groups + */ +static LONG next_thread_affinity; + /* a category of heap blocks of a certain size */ struct category { @@ -237,6 +242,7 @@ struct category
/* list of groups with free blocks */ SLIST_HEADER groups; + struct group *affinity_group[32]; };
struct heap @@ -1672,7 +1678,11 @@ static NTSTATUS heap_allocate_block( struct heap *heap, ULONG flags, SIZE_T bloc struct DECLSPEC_ALIGN(BLOCK_ALIGN) group { struct block block; - SINGLE_LIST_ENTRY entry; + union + { + SINGLE_LIST_ENTRY entry; + struct list thread_entry; + }; /* one bit for each free block and the highest bit as unlinked flag */ LONG free_bits; }; @@ -1780,9 +1790,13 @@ static NTSTATUS group_release( struct heap *heap, ULONG flags, struct category * static struct group *heap_acquire_category_group( struct heap *heap, ULONG flags, SIZE_T block_size, struct category *category ) { + DWORD affinity = NtCurrentTeb()->HeapVirtualAffinity % ARRAY_SIZE(category->affinity_group); struct group *group; SLIST_ENTRY *entry;
+ if ((group = InterlockedExchangePointer( (void *)&category->affinity_group[affinity], NULL ))) + return group; + if (!(entry = RtlInterlockedPopEntrySList( &category->groups ))) group = group_allocate( heap, flags, block_size, category ); else @@ -1795,8 +1809,15 @@ static struct group *heap_acquire_category_group( struct heap *heap, ULONG flags static NTSTATUS heap_release_category_group( struct heap *heap, ULONG flags, struct category *category, struct group *group ) { + DWORD affinity = NtCurrentTeb()->HeapVirtualAffinity % ARRAY_SIZE(category->affinity_group); NTSTATUS status = STATUS_SUCCESS;
+ /* we cannot use InterlockedExchangePointer here because to the contrary to our current group, + * the current affinity_group might be only partially free. + */ + if (!InterlockedCompareExchangePointer( (void *)&category->affinity_group[affinity], group, NULL )) + return STATUS_SUCCESS; + /* try re-using the block group instead of releasing it */ #ifdef _WIN64 if (category->groups.Header16.Depth <= 16) @@ -1815,6 +1836,14 @@ static struct block *find_free_block_lfh( struct heap *heap, ULONG flags, SIZE_T { struct block *block; struct group *group; + DWORD affinity; + + if (!(affinity = NtCurrentTeb()->HeapVirtualAffinity)) + { + affinity = InterlockedIncrement( &next_thread_affinity ); + NtCurrentTeb()->HeapVirtualAffinity = affinity; + } + affinity %= ARRAY_SIZE(category->affinity_group);
/* acquire a group, the thread will own it and no other thread can clear free bits. * some other thread might still set the free bits if they are freeing blocks. @@ -1831,7 +1860,8 @@ static struct block *find_free_block_lfh( struct heap *heap, ULONG flags, SIZE_T if (group->free_bits & GROUP_FLAG_FREE) return block;
/* otherwise there is still some free blocks, put the group back into the category */ - RtlInterlockedPushEntrySList( &category->groups, (SLIST_ENTRY *)&group->entry ); + if ((group = InterlockedExchangePointer( (void *)&category->affinity_group[affinity], group ))) + RtlInterlockedPushEntrySList( &category->groups, (SLIST_ENTRY *)&group->entry );
return block; } @@ -1899,6 +1929,36 @@ static NTSTATUS heap_free_block_lfh( struct heap *heap, ULONG flags, struct bloc return status; }
+static void heap_thread_detach_category_groups( struct heap *heap ) +{ + DWORD affinity = NtCurrentTeb()->HeapVirtualAffinity % ARRAY_SIZE(heap->categories->affinity_group); + ULONG i; + + if (!heap->categories) return; + + for (i = 0; i < BLOCK_SIZE_CATEGORY_COUNT; ++i) + { + struct category *category = heap->categories + i; + struct group *group; + if (!(group = InterlockedExchangePointer( (void *)&category->affinity_group[affinity], NULL ))) continue; + RtlInterlockedPushEntrySList( &category->groups, (SLIST_ENTRY *)&group->entry ); + } +} + +void heap_thread_detach(void) +{ + struct heap *heap; + + RtlEnterCriticalSection( &process_heap->cs ); + + LIST_FOR_EACH_ENTRY( heap, &process_heap->entry, struct heap, entry ) + heap_thread_detach_category_groups( heap ); + + heap_thread_detach_category_groups( process_heap ); + + RtlLeaveCriticalSection( &process_heap->cs ); +} + /*********************************************************************** * RtlAllocateHeap (NTDLL.@) */ diff --git a/dlls/ntdll/loader.c b/dlls/ntdll/loader.c index 01a30742678..6c643575f8d 100644 --- a/dlls/ntdll/loader.c +++ b/dlls/ntdll/loader.c @@ -3737,6 +3737,8 @@ void WINAPI LdrShutdownThread(void) /* don't call DbgUiGetThreadDebugObject as some apps hook it and terminate if called */ if (NtCurrentTeb()->DbgSsReserved[1]) NtClose( NtCurrentTeb()->DbgSsReserved[1] ); RtlFreeThreadActivationContextStack(); + + heap_thread_detach(); }
diff --git a/dlls/ntdll/ntdll_misc.h b/dlls/ntdll/ntdll_misc.h index d1a7790991b..f6b77b79cde 100644 --- a/dlls/ntdll/ntdll_misc.h +++ b/dlls/ntdll/ntdll_misc.h @@ -127,5 +127,6 @@ static inline void ascii_to_unicode( WCHAR *dst, const char *src, size_t len )
/* FLS data */ extern TEB_FLS_DATA *fls_alloc_data(void) DECLSPEC_HIDDEN; +extern void heap_thread_detach(void) DECLSPEC_HIDDEN;
#endif
Hi,
It looks like your patch introduced the new failures shown below. Please investigate and fix them before resubmitting your patch. If they are not new, fixing them anyway would help a lot. Otherwise please ask for the known failures list to be updated.
The tests also ran into some preexisting test failures. If you know how to fix them that would be helpful. See the TestBot job for the details:
The full results can be found at: https://testbot.winehq.org/JobDetails.pl?Key=128994
Your paranoid android.
=== debian11 (32 bit report) ===
dxgi:
dxgi.c:4501: Test succeeded inside todo block: Got usage 146ee8, expected 60, test 0.
dxgi.c:4501: Test succeeded inside todo block: Got usage 146ee8, expected 60, test 1.
dxgi.c:4501: Test succeeded inside todo block: Got usage 146ee8, expected 60, test 2.
dxgi.c:4501: Test succeeded inside todo block: Got usage 146ee8, expected 60, test 3.
dxgi.c:4501: Test succeeded inside todo block: Got usage 146ee8, expected 60, test 4.
dxgi.c:4501: Test succeeded inside todo block: Got usage 146ee8, expected 60, test 10.
dxgi.c:4501: Test succeeded inside todo block: Got usage 146ee8, expected 60, test 11.
dxgi.c:4501: Test succeeded inside todo block: Got usage 146ee8, expected 60, test 14.
dxgi.c:4501: Test succeeded inside todo block: Got usage 146ee8, expected 60, test 18.
dxgi.c:4501: Test succeeded inside todo block: Got usage 146ee8, expected 60, test 19.
dxgi.c:4501: Test succeeded inside todo block: Got usage 146ee8, expected 60, test 20.
dxgi.c:4501: Test succeeded inside todo block: Got usage 146ee8, expected 60, test 21.
dxgi.c:4501: Test succeeded inside todo block: Got usage 146ee8, expected 60, test 23.
dxgi.c:4501: Test succeeded inside todo block: Got usage 146ee8, expected 60, test 24.
dxgi.c:4501: Test succeeded inside todo block: Got usage 146ee8, expected 60, test 25.
dxgi.c:4501: Test succeeded inside todo block: Got usage 146ee8, expected 60, test 26.
dxgi.c:4501: Test succeeded inside todo block: Got usage 146ee8, expected 60, test 27.
dxgi.c:4501: Test succeeded inside todo block: Got usage 146ee8, expected 60, test 33.
dxgi.c:4501: Test succeeded inside todo block: Got usage 146ee8, expected 60, test 34.
dxgi.c:4501: Test succeeded inside todo block: Got usage 146ee8, expected 60, test 37.
dxgi.c:4501: Test succeeded inside todo block: Got usage 146ee8, expected 60, test 41.
dxgi.c:4501: Test succeeded inside todo block: Got usage 146ee8, expected 60, test 42.
dxgi.c:4501: Test succeeded inside todo block: Got usage 146ee8, expected 60, test 43.
I think this version should be much better, less clumsy and more performant.
I dropped the idea of thread-local subheaps entirely, as more testing shows that native LFH just creates LFH blocks in the middle of the other blocks. It creates them a bunch at a time, so I'm assuming it carves them out of a bigger non-LFH block, and I'm doing the same here, allocating (currently) 31 blocks at a time.
I don't really know where native keeps the block group metadata; each LFH block seems to reference the others, but there's apparently still some out-of-band metadata somewhere. For this implementation I've introduced block `group`s, which are simply standard blocks, kept internally in per-block-size-category linked lists.
Each `group` has a small header to keep out-of-band data such as its group list entry and a free block bitmap, and its data is then split into smaller LFH blocks, each with the usual heap block header but with a specific LFH flag.
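Roughly, a group ends up looking like this (a condensed sketch of the code in the patches, with list handling, flags and error paths omitted):

```c
/* one ordinary heap block whose data area holds a small header followed by
 * equally sized LFH sub-blocks, tracked by a free bitmap */
struct group
{
    struct block block;   /* standard heap block header, flagged BLOCK_FLAG_LFH */
    struct list entry;    /* linked into its size category while it has free blocks
                           * (turned into an interlocked SList entry in the later patches) */
    LONG free_bits;       /* one bit per free sub-block, highest bit used as unlinked flag */
};

/* taking a free sub-block: find the lowest set bit, clear it, compute the block address */
static struct block *sketch_take_block( struct group *group, SIZE_T block_size )
{
    int i = __builtin_ffs( group->free_bits ) - 1;  /* never -1 while the group is listed */
    group->free_bits &= ~(1 << i);
    return (struct block *)((char *)(group + 1) + i * block_size);
}
```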
Instead of large and complex thread-local data and per-thread subheaps, I'm only giving each thread that tries to allocate a block an affinity, modulo some fixed limit. Each category uses an interlocked list to keep shared groups with free blocks, as well as affinity-indexed group pointers as an optimization over the interlocked list.
With both the affinity slots and the interlocked shared list there should not be too much contention, and the heap lock only needs to be taken when new groups have to be allocated or are fully released. This avoids entering the CS as much as possible, using atomic interlocked operations and careful thread ownership of the block groups.
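Concretely, the group acquisition fast path ends up looking something like this (simplified from the last two patches):

```c
/* each thread gets a small affinity index; each category caches one group per
 * affinity slot in front of the shared interlocked list, so the common case
 * needs neither the heap lock nor an SList operation */
static struct group *sketch_acquire_group( struct category *category, DWORD affinity )
{
    SLIST_ENTRY *entry;
    struct group *group;

    affinity %= ARRAY_SIZE(category->affinity_group);

    /* fast path: steal the group cached for this thread's affinity slot */
    if ((group = InterlockedExchangePointer( (void *)&category->affinity_group[affinity], NULL )))
        return group;

    /* shared path: pop a group with free blocks from the lock-free list */
    if ((entry = RtlInterlockedPopEntrySList( &category->groups )))
        return CONTAINING_RECORD( entry, struct group, entry );

    return NULL; /* caller falls back to allocating a new group under the heap lock */
}
```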
On Wed Feb 1 15:26:26 2023 +0000, **** wrote:
Marvin replied on the mailing list:
Hi, It looks like your patch introduced the new failures shown below. [...] === debian11 (32 bit report) === dxgi: dxgi.c:4501: Test succeeded inside todo block: Got usage 146ee8, expected 60 (tests 0-4, 10, 11, 14, 18-21, 23-27, 33, 34, 37, 41-43).
I believe this is because of a use-after-free bug that is fixed with the LFH, as we now avoid touching the freed blocks. Currently the `dxgi:dxgi` test crashes silently and only the TestBot catches it, with:
```
The main process has no test summary line
The test has unaccounted for failure messages
The test has unaccounted for todo messages
The test returned success despite having failures
```
Zebediah Figura (@zfigura) commented about dlls/ntdll/heap.c:
 struct DECLSPEC_ALIGN(BLOCK_ALIGN) group
 {
     struct block block;
-    struct list entry;
+    SINGLE_LIST_ENTRY entry;
     /* one bit for each free block and the highest bit as unlinked flag */
     LONG free_bits;
 };

 #define GROUP_FLAG_FREE (1u << (sizeof(((struct group *)0)->free_bits) * 8 - 1))

+/* entry actually is a SLIST_ENTRY and needs to be 16B aligned on 64bit */
+C_ASSERT( offsetof(struct group, entry) % 8 == 0 );
Wait, why can't we just define it as an SLIST_ENTRY then?
Zebediah Figura (@zfigura) commented about dlls/ntdll/heap.c:
     NTSTATUS status = STATUS_SUCCESS;

     /* try re-using the block group instead of releasing it */
-    if (list_empty( &category->groups ))
-        list_add_tail( &category->groups, &group->entry );
+#ifdef _WIN64
+    if (category->groups.Header16.Depth <= 16)
+#else
+    if (category->groups.Depth <= 16)
+#endif
This is RtlQueryDepthSList().
On Wed Feb 1 20:34:28 2023 +0000, Zebediah Figura wrote:
This is RtlQueryDepthSList().
Also, this appears to change behaviour in a way orthogonal to the rest of the patch (i.e. changing the threshold for reuse from 0 to 16), so maybe deserves a separate patch?
On Wed Feb 1 20:33:59 2023 +0000, Zebediah Figura wrote:
Wait, why can't we just define it as an SLIST_ENTRY then?
Because the group address will be offset by 8, due to how blocks are laid out in memory. I could alternatively make the `struct group` not include the block header, but that makes it annoying to access it and to compute offsets to inner blocks.
Zebediah Figura (@zfigura) commented about dlls/ntdll/heap.c:
if (ROUND_SIZE( *old_size, BLOCK_ALIGN - 1) != ROUND_SIZE( size, BLOCK_ALIGN - 1)) return STATUS_NO_MEMORY;
if (size >= *old_size) return STATUS_NO_MEMORY;
heap_lock( heap, flags );
block_set_flags( block, BLOCK_FLAG_USER_MASK & ~BLOCK_FLAG_USER_INFO, BLOCK_USER_FLAGS( flags ) );
block->tail_size = old_block_size - sizeof(*block) - size;
initialize_block( block, *old_size, size, flags );
mark_block_tail( block, flags );
heap_unlock( heap, flags );
Why did this have locks to begin with? What are they protecting? (Concurrent access to this block, I guess, but that seems like it's a meaningless race, right?)
On Wed Feb 1 20:38:33 2023 +0000, Zebediah Figura wrote:
Also, this appears to change behaviour in a way orthogonal to the rest of the patch (i.e. changing the threshold for reuse from 0 to 16), so maybe deserves a separate patch?
`RtlQueryDepthSList` takes a bit of time to call, and I wanted to avoid that. With the affinity cache it will matter a bit less, but still.
I think the threshold might be better moved and merged into the next change, as its purpose is also to avoid starving the shared list (I had this change before implementing the affinity list, so it helped at that point).
On Wed Feb 1 20:50:33 2023 +0000, Zebediah Figura wrote:
Why did this have locks to begin with? What are they protecting? (Concurrent access to this block, I guess, but that seems like it's a meaningless race, right?)
Just making sure, but yeah I think there's no race here even initially.
On Wed Feb 1 20:51:21 2023 +0000, Rémi Bernon wrote:
`RtlQueryDepthSList` takes a bit of time to call, and I wanted to avoid that. With the affinity cache it will matter a bit less, but still. I think the threshold might be better moved and merged into the next change, as its purpose is also to avoid starving the shared list (I had this change before implementing the affinity list, so it helped at that point).
If we're concerned about call overhead, maybe we should define an inline helper in winnt.h like for other functions? Doesn't seem like a big deal, but it'd probably make this a bit more readable.
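A minimal sketch of such a helper, assuming the SLIST_HEADER field names used in the quoted hunk; the name is illustrative, the real thing would presumably be the `RtlQueryDepthSList` inline itself in winnt.h:
```
/* hypothetical inline helper wrapping the open-coded depth check from the patch */
static FORCEINLINE WORD query_slist_depth( const SLIST_HEADER *header )
{
#ifdef _WIN64
    return header->Header16.Depth;
#else
    return header->Depth;
#endif
}
```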
On Wed Feb 1 20:47:44 2023 +0000, Rémi Bernon wrote:
Because the group address will be offset by 8, due to how blocks are laid out in memory. I could alternatively make the `struct group` not include the block header, but that makes it annoying to access it and to compute offsets to inner blocks.
Oh, I see. Maybe pshpack8.h would work? Or put a dummy 8 bytes before "block"? Not sure if it's worth it to avoid the casts, but worth considering maybe.
On Wed Feb 1 20:58:26 2023 +0000, Zebediah Figura wrote:
Oh, I see. Maybe pshpack8.h would work? Or put a dummy 8 bytes before "block"? Not sure if it's worth it to avoid the casts, but worth considering maybe.
The dummy will make it overlap the preceding block, and we want the `struct group` to be aligned like `struct block`. It's a bit implicit, but I think the assert is enough.
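For readers following along, the approach the patch actually takes (per the hunks quoted in this thread) keeps `entry` as a plain `SINGLE_LIST_ENTRY`, asserts its alignment, and casts at the lock-free list call sites:
```
/* from the quoted hunks: document and assert the SLIST alignment requirement */
C_ASSERT( offsetof(struct group, entry) % 8 == 0 );

/* ...and cast where the entry is handed to the lock-free list */
RtlInterlockedPushEntrySList( &category->groups, (SLIST_ENTRY *)&group->entry );
```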
Zebediah Figura (@zfigura) commented about dlls/ntdll/heap.c:
category = heap->categories + BLOCK_SIZE_CATEGORY( block_size ); if (category == last) return STATUS_UNSUCCESSFUL;
- heap_lock( heap, flags );
Maybe not worth doing at this point, but I think this can get moved below the block initialization.
On Wed Feb 1 20:52:24 2023 +0000, Rémi Bernon wrote:
Just making sure, but yeah I think there's no race here even initially.
Yeah, in general it's probably a lot easier to be confident about this patch if we restrict the locks to where they're definitely needed in earlier patches.
Looking at the rest of this patch, I'm obviously not familiar with the heap code at all, but it *looks* like it makes sense, assuming I can correctly read what's possibly-concurrent and what isn't. And the lock-free parts look correct as well.
On Wed Feb 1 21:03:14 2023 +0000, Rémi Bernon wrote:
The dummy will make it overlap the preceding block, and we want the `struct group` to be aligned like `struct block`. It's a bit implicit, but I think the assert is enough.
Right, but in theory you'd never actually access the dummy bytes, they'd just exist to correct the alignment. (Again, not arguing too hard for it, I just don't like casts personally.)
Jinoh Kang (@iamahuman) commented about dlls/ntdll/heap.c:
- {
- case HeapCompatibilityInformation:
- {
ULONG compat_info;
if (size < sizeof(ULONG)) return STATUS_BUFFER_TOO_SMALL;
if (!(heap = unsafe_heap_from_handle( handle, 0, &flags ))) return STATUS_INVALID_HANDLE;
if (heap->flags & HEAP_NO_SERIALIZE) return STATUS_SUCCESS;
compat_info = *(ULONG *)info;
if (compat_info != HEAP_STD && compat_info != HEAP_LFH)
{
FIXME( "HeapCompatibilityInformation %lu not implemented!\n", compat_info );
return STATUS_UNSUCCESSFUL;
}
if (InterlockedCompareExchange( &heap->compat_info, compat_info, HEAP_STD )) return STATUS_UNSUCCESSFUL;
How about this?
```suggestion:-0+0
    if (InterlockedCompareExchange( &heap->compat_info, compat_info, HEAP_STD ) != HEAP_STD)
    {
        return STATUS_UNSUCCESSFUL;
    }
```
Jinoh Kang (@iamahuman) commented about dlls/ntdll/heap.c:
+/* keep the block size category count reasonable */
+C_ASSERT( BLOCK_SIZE_CATEGORY_COUNT <= 256 );
+/* difference between block classes and all possible validation overhead must fit into block tail_size */
+C_ASSERT( BLOCK_SIZE_MEDIUM_STEP + 3 * BLOCK_ALIGN <= FIELD_MAX( struct block, tail_size ) );
+/* affinity to tid mapping array, limits the number of thread-local caches,
- and additional threads will fight for the global categories groups
- */
+static LONG next_thread_affinity;
+/* a category of heap blocks of a certain size */
+struct category
+{
- /* counters for LFH activation */
- volatile LONG blocks_alive;
How about counting cumulative total _freed_ blocks instead? This will reduce two writes into one.
`(ULONG)category->blocks_total - (ULONG)category->blocks_dead` will still evaluate to the (approximate) alive block count even if an individual variable overflows.
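A sketch of what that could look like, keeping the `blocks_dead` name from above (the helper name is made up):
```
struct category
{
    /* ... */
    volatile LONG blocks_total;   /* incremented on allocation */
    volatile LONG blocks_dead;    /* incremented on free */
};

/* derived alive count, still approximately correct if either counter wraps */
static inline ULONG category_blocks_alive( const struct category *category )
{
    return (ULONG)category->blocks_total - (ULONG)category->blocks_dead;
}
```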
Jinoh Kang (@iamahuman) commented about dlls/ntdll/heap.c:
status = STATUS_NO_MEMORY;
else if (block_size >= HEAP_MIN_LARGE_BLOCK_SIZE)
    status = heap_allocate_large( heap, heap_flags, block_size, size, &ptr );
- else if (!heap_allocate_block_lfh( heap, heap_flags, block_size, size, &ptr ))
else
{
    heap_lock( heap, heap_flags );
    status = heap_allocate_block( heap, heap_flags, block_size, size, &ptr );
    heap_unlock( heap, heap_flags );
    status = STATUS_SUCCESS;
if (!status && heap->categories)
{
block_size = block_get_size( (struct block *)ptr - 1 );
heap->categories[BLOCK_SIZE_CATEGORY( block_size )].blocks_alive++;
heap->categories[BLOCK_SIZE_CATEGORY( block_size )].blocks_total++;
I think we can move this inside the heap critical section. Same for the other one.
Jinoh Kang (@iamahuman) commented about dlls/ntdll/heap.c:
+C_ASSERT( BLOCK_CATEGORY_SIZE( 64 ) == BLOCK_SIZE_SMALL_MAX + BLOCK_SIZE_MEDIUM_STEP );
+C_ASSERT( BLOCK_SIZE_CATEGORY( BLOCK_SIZE_SMALL_MAX + BLOCK_SIZE_MEDIUM_STEP ) == 64 );
+/* keep the block size category count reasonable */
+C_ASSERT( BLOCK_SIZE_CATEGORY_COUNT <= 256 );
+/* difference between block classes and all possible validation overhead must fit into block tail_size */
+C_ASSERT( BLOCK_SIZE_MEDIUM_STEP + 3 * BLOCK_ALIGN <= FIELD_MAX( struct block, tail_size ) );
+/* affinity to tid mapping array, limits the number of thread-local caches,
- and additional threads will fight for the global categories groups
- */
+static LONG next_thread_affinity;
+/* a category of heap blocks of a certain size */
+struct category
Unless I'm mistaken, "category" sounds like a non-standard term for a heap implementation. Maybe "bin" would be more appropriate?
Jinoh Kang (@iamahuman) commented about dlls/ntdll/heap.c:
block_set_base( block, subheap_base( subheap ) );
block_set_flags( block, ~0, BLOCK_FLAG_FREE );
block_set_size( block, block_size );
/* If debugging, erase the freed block content */
if (end > commit_end) end = commit_end;
if (end > (char *)(entry + 1)) mark_block_free( entry + 1, end - (char *)(entry + 1), flags );
- if (!subheap) /* LFH block initialization, just clear its data */
mark_block_free( entry + 1, (char *)block + block_size - (char *)(entry + 1), flags );
- else
- {
const char *end = (char *)block + block_size, *commit_end;
block_set_base( block, subheap_base( subheap ) );
if (end > (commit_end = subheap_commit_end( subheap ))) end = commit_end;
if (end > (char *)(entry + 1)) mark_block_free( entry + 1, end - (char *)(entry + 1), flags );
Can we somehow delete the `if (end > (char *)(entry + 1))` guard and merge with `!subheap` case?
Jinoh Kang (@iamahuman) commented about dlls/ntdll/heap.c:
+{
- const char *blocks = (char *)(group + 1);
- block->base_offset = ((char *)block - blocks) / block_size;
+}
+static inline struct block *group_get_block( struct group *group, SIZE_T block_size, UINT index )
+{
- char *blocks = (char *)(group + 1);
- return (struct block *)(blocks + index * block_size);
+}
+/* lookup a free block using the group free_bits, the current thread must own the group */
+static inline LONG group_find_free_block( struct group *group, SIZE_T block_size, struct block **block )
+{
+#if defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 3)))
- int i = __builtin_ffs( group->free_bits ) - 1;
Use `_BitScanForward` instead.
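A sketch of that, assuming the portable `BitScanForward()` helper is usable here and that `free_bits` is known to be nonzero (which the existing comment relies on):
```
DWORD index;
/* find the lowest set bit; the group leaves the free list before free_bits can
 * reach zero, so the scan always finds one */
BitScanForward( &index, group->free_bits );
*block = group_get_block( group, block_size, index );
return InterlockedAnd( &group->free_bits, ~(1 << index) ) & ~(1 << index);
```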
Jinoh Kang (@iamahuman) commented about dlls/ntdll/heap.c:
+{
- const char *blocks = (char *)(group + 1);
- block->base_offset = ((char *)block - blocks) / block_size;
+}
+static inline struct block *group_get_block( struct group *group, SIZE_T block_size, UINT index )
+{
- char *blocks = (char *)(group + 1);
- return (struct block *)(blocks + index * block_size);
+}
+/* lookup a free block using the group free_bits, the current thread must own the group */
+static inline LONG group_find_free_block( struct group *group, SIZE_T block_size, struct block **block )
+{
+#if defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 3)))
- int i = __builtin_ffs( group->free_bits ) - 1;
Technically, this constitutes a data race since we're accessing a shared variable (concurrent access from `heap_free_block_lfh`) without volatile semantics. You should at least mark it volatile and load it to a local variable explicitly.
Jinoh Kang (@iamahuman) commented about dlls/ntdll/heap.c:
- while (i--) if (group->free_bits & (1 << i)) break;
+#endif
- /* we remove the group from the free list once all its blocks are used, i will never be -1 */
- *block = group_get_block( group, block_size, i );
- return InterlockedAnd( &group->free_bits, ~(1 << i) ) & ~(1 << i);
+}
+/* allocate a new group block using non-LFH allocation, returns a group owned by current thread */
+static struct group *group_allocate( struct heap *heap, ULONG flags, SIZE_T block_size, struct category *category )
+{
- SIZE_T i, size, group_size;
- struct group *group;
- NTSTATUS status;
- char *ptr;
- size = sizeof(*group) + sizeof(group->free_bits) * 8 * block_size - sizeof(struct block);
If my reading is correct, a group has exactly 31 LFH blocks due to `GROUP_FLAG_FREE`. In that case, we should probably subtract 1 from the count.
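If so, a sketch of the adjusted computation, with one `free_bits` bit reserved for `GROUP_FLAG_FREE`:
```
/* 31 usable blocks instead of 32: the top free_bits bit is GROUP_FLAG_FREE */
size = sizeof(*group) + (sizeof(group->free_bits) * 8 - 1) * block_size - sizeof(struct block);
```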
Jinoh Kang (@iamahuman) commented about dlls/ntdll/heap.c:
+}
+static inline struct block *group_get_block( struct group *group, SIZE_T block_size, UINT index )
+{
- char *blocks = (char *)(group + 1);
- return (struct block *)(blocks + index * block_size);
+}
+/* lookup a free block using the group free_bits, the current thread must own the group */
+static inline LONG group_find_free_block( struct group *group, SIZE_T block_size, struct block **block )
+{
+#if defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 3)))
- int i = __builtin_ffs( group->free_bits ) - 1;
+#else
- int i = sizeof(group->free_bits) * 8;
- while (i--) if (group->free_bits & (1 << i)) break;
The direction is wrong: `__builtin_ffs()` returns the lowest set bit, while this fallback scans down from the highest one.
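For illustration, a fallback that matches the `__builtin_ffs()` direction could look like this (a sketch, not the patch's code):
```
int i;
/* scan from the lowest bit up, equivalent to __builtin_ffs( group->free_bits ) - 1 */
for (i = 0; i < (int)sizeof(group->free_bits) * 8; i++)
    if (group->free_bits & (1 << i)) break;
```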
Jinoh Kang (@iamahuman) commented about dlls/ntdll/heap.c:
+/* lookup a free block using the group free_bits, the current thread must own the group */
+static inline LONG group_find_free_block( struct group *group, SIZE_T block_size, struct block **block )
+{
+#if defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 3)))
- int i = __builtin_ffs( group->free_bits ) - 1;
+#else
- int i = sizeof(group->free_bits) * 8;
- while (i--) if (group->free_bits & (1 << i)) break;
+#endif
- /* we remove the group from the free list once all its blocks are used, i will never be -1 */
- *block = group_get_block( group, block_size, i );
- return InterlockedAnd( &group->free_bits, ~(1 << i) ) & ~(1 << i);
+}
+/* allocate a new group block using non-LFH allocation, returns a group owned by current thread */
+static struct group *group_allocate( struct heap *heap, ULONG flags, SIZE_T block_size, struct category *category )
The `category` parameter is unused.
Jinoh Kang (@iamahuman) commented about dlls/ntdll/heap.c:
return STATUS_SUCCESS;
}
+/* Low Fragmentation Heap frontend */
+/* header for every LFH block group */
+struct DECLSPEC_ALIGN(BLOCK_ALIGN) group
+{
- struct block block;
This field is never used outside `group_allocate` and `group_release`, so it makes sense to consider `block` as not logically part of `struct group`. This would allow us to use `SLIST_ENTRY` directly.
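A rough sketch of that layout, assuming the backing block header would then be reached relative to the group rather than embedded in it (the accessor name is made up):
```
struct DECLSPEC_ALIGN(16) group
{
    SLIST_ENTRY entry;   /* a real SLIST_ENTRY, so no cast or alignment assert needed */
    LONG free_bits;
};

/* hypothetical accessor: the block header immediately precedes the group */
static inline struct block *group_block( struct group *group )
{
    return (struct block *)group - 1;
}
```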
Jinoh Kang (@iamahuman) commented about dlls/ntdll/heap.c:
+/* affinity to tid mapping array, limits the number of thread-local caches,
- and additional threads will fight for the global categories groups
- */
+static LONG next_thread_affinity;
+/* a category of heap blocks of a certain size */
+struct category
+{
- /* counters for LFH activation */
- volatile LONG blocks_alive;
- volatile LONG blocks_total;
- volatile BOOL enabled;
- /* list of groups with free blocks */
- SLIST_HEADER groups;
- struct group *affinity_group[32];
This array is intended to be accessed atomically by multiple threads, but a substantial number of elements reside in a cache line (assuming 64 byte cache line size). This results in false sharing, which may impact performance compared to purely threaded heaps.
Jinoh Kang (@iamahuman) commented about dlls/ntdll/heap.c:
+/* acquire a group from the category, thread takes ownership of a shared group or allocates a new one */
+static struct group *heap_acquire_category_group( struct heap *heap, ULONG flags, SIZE_T block_size,
struct category *category )
+{
- DWORD affinity = NtCurrentTeb()->HeapVirtualAffinity % ARRAY_SIZE(category->affinity_group);
- struct group *group;
- SLIST_ENTRY *entry;
- if ((group = InterlockedExchangePointer( (void *)&category->affinity_group[affinity], NULL )))
return group;
- if (!(entry = RtlInterlockedPopEntrySList( &category->groups )))
group = group_allocate( heap, flags, block_size, category );
- else
group = CONTAINING_RECORD( entry, struct group, entry );
```suggestion:-3+0
    if ((entry = RtlInterlockedPopEntrySList( &category->groups )))
        return CONTAINING_RECORD( entry, struct group, entry );

    return group_allocate( heap, flags, block_size, category );
```
Since we already use `return` above, making this consistent improves readability. Also, I think it's a good idea to bring the assignment closer to its use.
Jinoh Kang (@iamahuman) commented about dlls/ntdll/heap.c:
* the current affinity_group might be only partially free.
*/
- if (!InterlockedCompareExchangePointer( (void *)&category->affinity_group[affinity], group, NULL ))
return STATUS_SUCCESS;
- /* try re-using the block group instead of releasing it */
+#ifdef _WIN64
- if (category->groups.Header16.Depth <= 16)
+#else
- if (category->groups.Depth <= 16)
+#endif
RtlInterlockedPushEntrySList( &category->groups, (SLIST_ENTRY *)&group->entry );
- else
status = group_release( heap, flags, category, group );
- return status;
```suggestion:-4+0
    {
        RtlInterlockedPushEntrySList( &category->groups, (SLIST_ENTRY *)&group->entry );
        return STATUS_SUCCESS;
    }

    return group_release( heap, flags, category, group );
```
Jinoh Kang (@iamahuman) commented about dlls/ntdll/heap.c:
NtCurrentTeb()->HeapVirtualAffinity = affinity;
- }
- affinity %= ARRAY_SIZE(category->affinity_group);
- /* acquire a group, the thread will own it and no other thread can clear free bits.
* some other thread might still set the free bits if they are freeing blocks.
*/
- if (!(group = heap_acquire_category_group( heap, flags, block_size, category ))) return NULL;
- /* serialize with heap_free_block_lfh: atomically set GROUP_FLAG_FREE when the free bits are all 0. */
- if (group_find_free_block( group, block_size, &block ))
InterlockedAnd( &group->free_bits, ~GROUP_FLAG_FREE );
- else
InterlockedCompareExchange( &group->free_bits, GROUP_FLAG_FREE, 0 );
- /* if GROUP_FLAG_FREE was set, thread released its ownership. */
This comment is misleading since the current thread relinquishes ownership of the group in either case. Note that, in the current design, a thread does not own its affinity group; the group slot can be shared by an arbitrary number of threads.
Jinoh Kang (@iamahuman) commented about dlls/ntdll/heap.c:
- }
- affinity %= ARRAY_SIZE(category->affinity_group);
- /* acquire a group, the thread will own it and no other thread can clear free bits.
* some other thread might still set the free bits if they are freeing blocks.
*/
- if (!(group = heap_acquire_category_group( heap, flags, block_size, category ))) return NULL;
- /* serialize with heap_free_block_lfh: atomically set GROUP_FLAG_FREE when the free bits are all 0. */
- if (group_find_free_block( group, block_size, &block ))
InterlockedAnd( &group->free_bits, ~GROUP_FLAG_FREE );
- else
InterlockedCompareExchange( &group->free_bits, GROUP_FLAG_FREE, 0 );
- /* if GROUP_FLAG_FREE was set, thread released its ownership. */
- if (group->free_bits & GROUP_FLAG_FREE) return block;
I think it's better to remove the `return block` and embed the subsequent `if` instead. The `GROUP_FLAG_FREE` special case is not special enough to return a different block, so it's clearer to indicate that `GROUP_FLAG_FREE` has nothing to do with the returned block itself.
Jinoh Kang (@iamahuman) commented about dlls/ntdll/heap.c:
group = group_allocate( heap, flags, block_size, category );
- else
group = CONTAINING_RECORD( entry, struct group, entry );
- return group;
+}
+/* release a thread owned and fully freed group to the category shared group, or free its memory */
+static NTSTATUS heap_release_category_group( struct heap *heap, ULONG flags, struct category *category,
struct group *group )
+{
- DWORD affinity = NtCurrentTeb()->HeapVirtualAffinity % ARRAY_SIZE(category->affinity_group);
- NTSTATUS status = STATUS_SUCCESS;
- /* we cannot use InterlockedExchangePointer here because to the contrary to our current group,
* the current affinity_group might be only partially free.
This explanation is not exactly right, since there is a way to use `InterlockedExchangePointer` here without a problem: we can simply move the old group into the `category->groups` list, just like `find_free_block_lfh`. A more useful comment would explain _why_ we choose not to replace the affinity group in some cases.
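For reference, a rough sketch of the `InterlockedExchangePointer` variant alluded to above, assuming a partially free group may legitimately sit on the shared list:
```
/* install the fully freed group unconditionally, and push whatever group was
 * previously cached for this affinity back onto the shared list */
struct group *old = InterlockedExchangePointer( (void *)&category->affinity_group[affinity], group );
if (old) RtlInterlockedPushEntrySList( &category->groups, (SLIST_ENTRY *)&old->entry );
return STATUS_SUCCESS;
```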
Jinoh Kang (@iamahuman) commented about dlls/ntdll/heap.c:
C_ASSERT( sizeof(SUBHEAP) == 4 * BLOCK_ALIGN );
+/* LFH block size categories */
+#define BLOCK_SIZE_SMALL_STEP (16)
+#define BLOCK_SIZE_SMALL_MAX (1024)
+#define BLOCK_SIZE_MEDIUM_STEP (192)
+#define BLOCK_SIZE_MEDIUM_MAX (32768)
+#define BLOCK_SIZE_CATEGORY_SMALL( size ) ((size) > 0 ? ((size) - 1) / BLOCK_SIZE_SMALL_STEP : 0)
+#define BLOCK_SIZE_CATEGORY_MEDIUM( size ) (BLOCK_SIZE_CATEGORY_SMALL( BLOCK_SIZE_SMALL_MAX ) + 1 + \
(((size) > BLOCK_SIZE_SMALL_MAX) ? ((size) - BLOCK_SIZE_SMALL_MAX - 1) / BLOCK_SIZE_MEDIUM_STEP : 0))
+#define BLOCK_SIZE_CATEGORY_COUNT (BLOCK_SIZE_CATEGORY_MEDIUM( BLOCK_SIZE_MEDIUM_MAX ) + 1)
+#define BLOCK_SIZE_CATEGORY( size ) (((size) >= BLOCK_SIZE_MEDIUM_MAX) \
? (BLOCK_SIZE_CATEGORY_COUNT - 1) \
Why not make the "no category" case an invalid index?
```suggestion:-1+0
#define BLOCK_SIZE_CATEGORY( size ) (((size) > BLOCK_SIZE_MEDIUM_MAX) \
        ? BLOCK_SIZE_CATEGORY_COUNT \
```
(This should be accompanied by changes to other code making assumptions on the invalid category index.)
Jinoh Kang (@iamahuman) commented about dlls/ntdll/heap.c:
RtlQueryHeapInformation (NTDLL.@)
 */
NTSTATUS WINAPI RtlQueryHeapInformation( HANDLE handle, HEAP_INFORMATION_CLASS info_class,
void *info, SIZE_T size_in, PSIZE_T size_out )
void *info, SIZE_T size_in, SIZE_T *size_out )
{
- struct heap *heap;
- ULONG flags;
- TRACE( "handle %p, info_class %u, info %p, size_in %Iu, size_out %p.\n", handle, info_class, info, size_in, size_out );
- switch (info_class)
  {
  case HeapCompatibilityInformation:
if (!(heap = unsafe_heap_from_handle( handle, 0, &flags ))) return STATUS_ACCESS_VIOLATION;
Since `RtlQueryHeapInformation` is not an NT system service (syscall), native might just propagate the exception instead of handling it.
On Thu Feb 2 15:45:11 2023 +0000, Jinoh Kang wrote:
This array is intended to be accessed atomically by multiple threads, but a substantial number of elements reside in a cache line (assuming 64 byte cache line size). This results in false sharing, which may impact performance compared to purely threaded heaps.
A possible solution is to have an array of slots for each affinity. This would also reduce the frequency of affinity groups running out.
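A sketch of one possible layout, with made-up names and a 64-byte cache line assumption:
```
/* each affinity index gets its own cache line of group slots, so lock-free
 * updates from different affinities never touch the same line, and a thread
 * has several slots to try before falling back to the shared list */
struct DECLSPEC_ALIGN(64) affinity_slots
{
    struct group *groups[64 / sizeof(struct group *)];
};

struct affinity_slots affinity_group[32];
```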
On Thu Feb 2 14:14:24 2023 +0000, Jinoh Kang wrote:
Technically, this constitutes a data race since we're accessing a shared variable (concurrent access from `heap_free_block_lfh`) without volatile semantics. You should at least mark it volatile and load it to a local variable explicitly.
"volatile semantics" aren't really correct here. This is what ReadNoFence() is for.
"volatile semantics" aren't really correct here. This is what ReadNoFence() is for.
I was trying to argue for that, but realized that existing code isn't really conforming either. I'm not sure that volatile as a substitute for relaxed atomic access would be a problem in practice, but I do agree that atomic helpers are designed for this sort of thing.
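For illustration, the relaxed-read variant being discussed might look like this, assuming `ReadNoFence()` is available in this code:
```
/* take a relaxed snapshot of the shared bitmap before scanning it; concurrent
 * frees may still flip bits, which is fine since the clear goes through
 * InterlockedAnd anyway */
LONG free_bits = ReadNoFence( &group->free_bits );
int i = __builtin_ffs( free_bits ) - 1;
```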