Not completely sure if it's worth having for 8.0, but opening this to share the target I'm trying to reach.
-- v6:
ntdll: Add a thread-specific category group cache.
ntdll: Use atomics and lock-free list for category groups.
ntdll: Implement Low Fragmentation Heap frontend.
ntdll: Count allocations and automatically enable LFH.
ntdll: Increase heap block tail_size capacity to 16 bits.
ntdll: Implement HeapCompatibilityInformation.
ntdll: Fix HeapWalk with empty uncommitted consecutive subheaps.
From: Rémi Bernon <rbernon@codeweavers.com>
---
 dlls/ntdll/heap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dlls/ntdll/heap.c b/dlls/ntdll/heap.c
index 9403c83e92c..a2410957a44 100644
--- a/dlls/ntdll/heap.c
+++ b/dlls/ntdll/heap.c
@@ -653,7 +653,7 @@ static SUBHEAP *find_subheap( const struct heap *heap, const struct block *block
         if (!check_subheap( subheap, heap )) return NULL;
         if (contains( first_block( subheap ), blocks_size, block, sizeof(*block) )) return subheap;
         /* outside of blocks region, possible corruption or heap_walk */
-        if (contains( subheap_base( subheap ), subheap_size( subheap ), block, 0 )) return heap_walk ? subheap : NULL;
+        if (contains( subheap_base( subheap ), subheap_size( subheap ), block, 1 )) return heap_walk ? subheap : NULL;
     }

     return NULL;
From: Rémi Bernon <rbernon@codeweavers.com>
---
 dlls/kernel32/tests/heap.c | 34 +++++++++++++++-------
 dlls/ntdll/heap.c          | 58 ++++++++++++++++++++++++++++++++------
 2 files changed, 73 insertions(+), 19 deletions(-)
diff --git a/dlls/kernel32/tests/heap.c b/dlls/kernel32/tests/heap.c index a81cb1b7a15..307b11771b3 100644 --- a/dlls/kernel32/tests/heap.c +++ b/dlls/kernel32/tests/heap.c @@ -827,11 +827,8 @@ static void test_HeapCreate(void) size = 0; SetLastError( 0xdeadbeef ); ret = pHeapQueryInformation( 0, HeapCompatibilityInformation, &compat_info, sizeof(compat_info), &size ); - todo_wine ok( !ret, "HeapQueryInformation succeeded\n" ); - todo_wine ok( GetLastError() == ERROR_NOACCESS, "got error %lu\n", GetLastError() ); - todo_wine ok( size == 0, "got size %Iu\n", size );
size = 0; @@ -871,7 +868,6 @@ static void test_HeapCreate(void) ok( ret, "HeapSetInformation failed, error %lu\n", GetLastError() ); ret = pHeapQueryInformation( heap, HeapCompatibilityInformation, &compat_info, sizeof(compat_info), &size ); ok( ret, "HeapQueryInformation failed, error %lu\n", GetLastError() ); - todo_wine ok( compat_info == 2, "got HeapCompatibilityInformation %lu\n", compat_info );
/* cannot be undone */ @@ -879,25 +875,44 @@ static void test_HeapCreate(void) compat_info = 0; SetLastError( 0xdeadbeef ); ret = pHeapSetInformation( heap, HeapCompatibilityInformation, &compat_info, sizeof(compat_info) ); - todo_wine ok( !ret, "HeapSetInformation succeeded\n" ); - todo_wine ok( GetLastError() == ERROR_GEN_FAILURE, "got error %lu\n", GetLastError() ); compat_info = 1; SetLastError( 0xdeadbeef ); ret = pHeapSetInformation( heap, HeapCompatibilityInformation, &compat_info, sizeof(compat_info) ); - todo_wine ok( !ret, "HeapSetInformation succeeded\n" ); - todo_wine ok( GetLastError() == ERROR_GEN_FAILURE, "got error %lu\n", GetLastError() ); ret = pHeapQueryInformation( heap, HeapCompatibilityInformation, &compat_info, sizeof(compat_info), &size ); ok( ret, "HeapQueryInformation failed, error %lu\n", GetLastError() ); - todo_wine ok( compat_info == 2, "got HeapCompatibilityInformation %lu\n", compat_info );
ret = HeapDestroy( heap ); ok( ret, "HeapDestroy failed, error %lu\n", GetLastError() );
+ + /* cannot set LFH with HEAP_NO_SERIALIZE */ + + heap = HeapCreate( HEAP_NO_SERIALIZE, 0, 0 ); + ok( !!heap, "HeapCreate failed, error %lu\n", GetLastError() ); + ok( !((ULONG_PTR)heap & 0xffff), "wrong heap alignment\n" ); + + ret = pHeapQueryInformation( heap, HeapCompatibilityInformation, &compat_info, sizeof(compat_info), &size ); + ok( ret, "HeapQueryInformation failed, error %lu\n", GetLastError() ); + ok( compat_info == 0, "got HeapCompatibilityInformation %lu\n", compat_info ); + + compat_info = 2; + SetLastError( 0xdeadbeef ); + ret = pHeapSetInformation( heap, HeapCompatibilityInformation, &compat_info, sizeof(compat_info) ); + ok( !ret, "HeapSetInformation succeeded\n" ); + ok( GetLastError() == ERROR_INVALID_PARAMETER, "got error %lu\n", GetLastError() ); + ret = pHeapQueryInformation( heap, HeapCompatibilityInformation, &compat_info, sizeof(compat_info), &size ); + ok( ret, "HeapQueryInformation failed, error %lu\n", GetLastError() ); + ok( compat_info == 0, "got HeapCompatibilityInformation %lu\n", compat_info ); + + ret = HeapDestroy( heap ); + ok( ret, "HeapDestroy failed, error %lu\n", GetLastError() ); + + /* some allocation pattern automatically enables LFH */
heap = HeapCreate( 0, 0, 0 ); @@ -931,7 +946,6 @@ static void test_HeapCreate(void) ok( ret, "HeapSetInformation failed, error %lu\n", GetLastError() ); ret = pHeapQueryInformation( heap, HeapCompatibilityInformation, &compat_info, sizeof(compat_info), &size ); ok( ret, "HeapQueryInformation failed, error %lu\n", GetLastError() ); - todo_wine ok( compat_info == 2, "got HeapCompatibilityInformation %lu\n", compat_info );
for (i = 0; i < 0x11; i++) ptrs[i] = pHeapAlloc( heap, 0, 24 + 2 * sizeof(void *) ); diff --git a/dlls/ntdll/heap.c b/dlls/ntdll/heap.c index a2410957a44..3b716b76126 100644 --- a/dlls/ntdll/heap.c +++ b/dlls/ntdll/heap.c @@ -39,6 +39,13 @@
WINE_DEFAULT_DEBUG_CHANNEL(heap);
+/* HeapCompatibilityInformation values */
+
+#define HEAP_STD 0
+#define HEAP_LAL 1
+#define HEAP_LFH 2
+
+
 /* undocumented RtlWalkHeap structure */
struct rtl_heap_entry @@ -194,6 +201,7 @@ struct heap DWORD force_flags; /* 0044/0074 */ /* end of the Windows 10 compatible struct layout */
+ LONG compat_info; /* HeapCompatibilityInformation / heap frontend type */ struct list entry; /* Entry in process heap list */ struct list subheap_list; /* Sub-heap list */ struct list large_list; /* Large blocks list */ @@ -1378,6 +1386,7 @@ HANDLE WINAPI RtlCreateHeap( ULONG flags, void *addr, SIZE_T total_size, SIZE_T heap->ffeeffee = 0xffeeffee; heap->auto_flags = (flags & HEAP_GROWABLE); heap->flags = (flags & ~HEAP_SHARED); + WriteNoFence( &heap->compat_info, HEAP_STD ); heap->magic = HEAP_MAGIC; heap->grow_size = max( HEAP_DEF_SIZE, total_size ); heap->min_size = commit_size; @@ -2039,21 +2048,24 @@ ULONG WINAPI RtlGetProcessHeaps( ULONG count, HANDLE *heaps ) * RtlQueryHeapInformation (NTDLL.@) */ NTSTATUS WINAPI RtlQueryHeapInformation( HANDLE handle, HEAP_INFORMATION_CLASS info_class, - void *info, SIZE_T size_in, PSIZE_T size_out ) + void *info, SIZE_T size_in, SIZE_T *size_out ) { + struct heap *heap; + ULONG flags; + + TRACE( "handle %p, info_class %u, info %p, size_in %Iu, size_out %p.\n", handle, info_class, info, size_in, size_out ); + switch (info_class) { case HeapCompatibilityInformation: + if (!(heap = unsafe_heap_from_handle( handle, 0, &flags ))) return STATUS_ACCESS_VIOLATION; if (size_out) *size_out = sizeof(ULONG); - - if (size_in < sizeof(ULONG)) - return STATUS_BUFFER_TOO_SMALL; - - *(ULONG *)info = 0; /* standard heap */ + if (size_in < sizeof(ULONG)) return STATUS_BUFFER_TOO_SMALL; + *(ULONG *)info = ReadNoFence( &heap->compat_info ); return STATUS_SUCCESS;
default: - FIXME("Unknown heap information class %u\n", info_class); + FIXME( "HEAP_INFORMATION_CLASS %u not implemented!\n", info_class ); return STATUS_INVALID_INFO_CLASS; } } @@ -2063,8 +2075,36 @@ NTSTATUS WINAPI RtlQueryHeapInformation( HANDLE handle, HEAP_INFORMATION_CLASS i */ NTSTATUS WINAPI RtlSetHeapInformation( HANDLE handle, HEAP_INFORMATION_CLASS info_class, void *info, SIZE_T size ) { - FIXME( "handle %p, info_class %d, info %p, size %Id stub!\n", handle, info_class, info, size ); - return STATUS_SUCCESS; + struct heap *heap; + ULONG flags; + + TRACE( "handle %p, info_class %u, info %p, size %Iu.\n", handle, info_class, info, size ); + + switch (info_class) + { + case HeapCompatibilityInformation: + { + ULONG compat_info; + + if (size < sizeof(ULONG)) return STATUS_BUFFER_TOO_SMALL; + if (!(heap = unsafe_heap_from_handle( handle, 0, &flags ))) return STATUS_INVALID_HANDLE; + if (heap->flags & HEAP_NO_SERIALIZE) return STATUS_INVALID_PARAMETER; + + compat_info = *(ULONG *)info; + if (compat_info != HEAP_STD && compat_info != HEAP_LFH) + { + FIXME( "HeapCompatibilityInformation %lu not implemented!\n", compat_info ); + return STATUS_UNSUCCESSFUL; + } + if (InterlockedCompareExchange( &heap->compat_info, compat_info, HEAP_STD ) != HEAP_STD) + return STATUS_UNSUCCESSFUL; + return STATUS_SUCCESS; + } + + default: + FIXME( "HEAP_INFORMATION_CLASS %u not implemented!\n", info_class ); + return STATUS_SUCCESS; + } }
/***********************************************************************
From: Rémi Bernon <rbernon@codeweavers.com>
We need this for larger LFH block bin steps. Use smaller block types to get more space; they are somewhat redundant with the block flags.
---
 dlls/ntdll/heap.c | 24 ++++++++++++++----
 1 file changed, 14 insertions(+), 10 deletions(-)
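Not part of the patch, but to make the size constraint concrete, here is a tiny standalone check of the arithmetic; the BIN_SIZE_STEP_7 and BLOCK_ALIGN values are copied from the bin definitions added later in this series:

```c
/* Sketch: why an 8-bit tail_size is no longer enough once LFH bins round
 * requests up by as much as one bin step (values taken from later patches). */
#include <assert.h>
#include <stddef.h>

#define BLOCK_ALIGN     (2 * sizeof(void *))  /* 16 on 64-bit, 8 on 32-bit */
#define BIN_SIZE_STEP_7 (0x4000 >> 4)         /* 0x400, the largest bin step */

int main(void)
{
    /* worst-case unused tail: one bin step plus a few BLOCK_ALIGN-sized debug/tail areas */
    size_t worst_tail = BIN_SIZE_STEP_7 + 3 * BLOCK_ALIGN;

    assert( worst_tail > 0xff );    /* does not fit the old BYTE tail_size */
    assert( worst_tail <= 0xffff ); /* fits the new WORD tail_size */
    return 0;
}
```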
diff --git a/dlls/ntdll/heap.c b/dlls/ntdll/heap.c
index 3b716b76126..5b67ac53cca 100644
--- a/dlls/ntdll/heap.c
+++ b/dlls/ntdll/heap.c
@@ -86,11 +86,14 @@ struct rtl_heap_entry
 struct block
 {
-    WORD block_size;    /* block size in multiple of BLOCK_ALIGN */
+    /* block size in multiple of BLOCK_ALIGN */
+    WORD block_size;
+    /* unused size (used block) / high size bits (free block) */
+    WORD tail_size;
+    /* offset to region base */
+    WORD base_offset;
+    BYTE block_type;
     BYTE block_flags;
-    BYTE tail_size;     /* unused size (used block) / high size bits (free block) */
-    WORD base_offset;   /* offset to the region base in multiple of REGION_ALIGN */
-    WORD block_type;
 };

 C_ASSERT( sizeof(struct block) == 8 );

@@ -131,10 +134,10 @@ typedef struct
 C_ASSERT( sizeof(ARENA_LARGE) == offsetof(ARENA_LARGE, block) + sizeof(struct block) );
 C_ASSERT( sizeof(ARENA_LARGE) == 4 * BLOCK_ALIGN );

-#define BLOCK_TYPE_USED  0x5355
-#define BLOCK_TYPE_DEAD  0xdead
-#define BLOCK_TYPE_FREE  0x5246
-#define BLOCK_TYPE_LARGE 0x614c
+#define BLOCK_TYPE_USED  'u'
+#define BLOCK_TYPE_DEAD  'D'
+#define BLOCK_TYPE_FREE  'F'
+#define BLOCK_TYPE_LARGE 'L'

 #define BLOCK_FILL_USED  0x55
 #define BLOCK_FILL_TAIL  0xab
@@ -157,8 +160,9 @@ C_ASSERT( sizeof(struct entry) <= HEAP_MIN_BLOCK_SIZE );
 #define HEAP_MAX_BLOCK_REGION_SIZE (FIELD_MAX( struct block, base_offset ) * REGION_ALIGN)

 C_ASSERT( HEAP_MAX_USED_BLOCK_SIZE == 512 * 1024 * (sizeof(void *) / 4) - BLOCK_ALIGN );
-C_ASSERT( HEAP_MAX_FREE_BLOCK_SIZE == 128 * 1024 * 1024 * (sizeof(void *) / 4) - BLOCK_ALIGN );
-C_ASSERT( HEAP_MAX_BLOCK_REGION_SIZE >= HEAP_MAX_FREE_BLOCK_SIZE );
+C_ASSERT( HEAP_MAX_FREE_BLOCK_SIZE >= 128 * 1024 * 1024 * (sizeof(void *) / 4) - BLOCK_ALIGN );
+C_ASSERT( HEAP_MAX_BLOCK_REGION_SIZE >= 128 * 1024 * 1024 * (sizeof(void *) / 4) - BLOCK_ALIGN );
+C_ASSERT( HEAP_MAX_FREE_BLOCK_SIZE >= HEAP_MAX_BLOCK_REGION_SIZE );

 /* minimum size to start allocating large blocks */
 #define HEAP_MIN_LARGE_BLOCK_SIZE (HEAP_MAX_USED_BLOCK_SIZE - 0x1000)
From: Rémi Bernon <rbernon@codeweavers.com>
---
 dlls/kernel32/tests/heap.c |   2 -
 dlls/ntdll/heap.c          | 136 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 134 insertions(+), 4 deletions(-)
diff --git a/dlls/kernel32/tests/heap.c b/dlls/kernel32/tests/heap.c
index 307b11771b3..cf4b4fa7058 100644
--- a/dlls/kernel32/tests/heap.c
+++ b/dlls/kernel32/tests/heap.c
@@ -928,7 +928,6 @@ static void test_HeapCreate(void)

     ret = pHeapQueryInformation( heap, HeapCompatibilityInformation, &compat_info, sizeof(compat_info), &size );
     ok( ret, "HeapQueryInformation failed, error %lu\n", GetLastError() );
-    todo_wine
     ok( compat_info == 2, "got HeapCompatibilityInformation %lu\n", compat_info );

     ret = HeapDestroy( heap );
@@ -1219,7 +1218,6 @@ static void test_HeapCreate(void)

     ret = pHeapQueryInformation( heap, HeapCompatibilityInformation, &compat_info, sizeof(compat_info), &size );
     ok( ret, "HeapQueryInformation failed, error %lu\n", GetLastError() );
-    todo_wine
     ok( compat_info == 2, "got HeapCompatibilityInformation %lu\n", compat_info );

     /* locking is serialized */
diff --git a/dlls/ntdll/heap.c b/dlls/ntdll/heap.c
index 5b67ac53cca..eac4d84c78d 100644
--- a/dlls/ntdll/heap.c
+++ b/dlls/ntdll/heap.c
@@ -90,7 +90,7 @@ struct block
     WORD block_size;
     /* unused size (used block) / high size bits (free block) */
     WORD tail_size;
-    /* offset to region base */
+    /* offset to region base / first group block (LFH block) */
     WORD base_offset;
     BYTE block_type;
     BYTE block_flags;
@@ -193,6 +193,76 @@ typedef struct DECLSPEC_ALIGN(BLOCK_ALIGN) tagSUBHEAP
 C_ASSERT( sizeof(SUBHEAP) == offsetof(SUBHEAP, block) + sizeof(struct block) );
 C_ASSERT( sizeof(SUBHEAP) == 4 * BLOCK_ALIGN );
+ +/* LFH block size bins */ + +#define BIN_SIZE_MIN_0 0 +#define BIN_SIZE_MIN_1 0x100 +#define BIN_SIZE_MIN_2 0x200 +#define BIN_SIZE_MIN_3 0x400 +#define BIN_SIZE_MIN_4 0x800 +#define BIN_SIZE_MIN_5 0x1000 +#define BIN_SIZE_MIN_6 0x2000 +#define BIN_SIZE_MIN_7 0x4000 +#define BIN_SIZE_MAX 0x8000 + +#define BIN_SIZE_STEP_0 (16) +#define BIN_SIZE_STEP_1 (BIN_SIZE_MIN_1 >> 4) +#define BIN_SIZE_STEP_2 (BIN_SIZE_MIN_2 >> 4) +#define BIN_SIZE_STEP_3 (BIN_SIZE_MIN_3 >> 4) +#define BIN_SIZE_STEP_4 (BIN_SIZE_MIN_4 >> 4) +#define BIN_SIZE_STEP_5 (BIN_SIZE_MIN_5 >> 4) +#define BIN_SIZE_STEP_6 (BIN_SIZE_MIN_6 >> 4) +#define BIN_SIZE_STEP_7 (BIN_SIZE_MIN_7 >> 4) + +#define BLOCK_BIN_SIZE_N( n, bin ) (BIN_SIZE_MIN_##n + (bin + 1) * BIN_SIZE_STEP_##n) +#define BLOCK_SIZE_BIN_N( n, size ) ((size) > BIN_SIZE_MIN_##n ? ((size) - 1 - BIN_SIZE_MIN_##n) / BIN_SIZE_STEP_##n : 0) + +#define BLOCK_BIN_SIZE( bin ) ((bin) >= 0x80 ? ~0u : \ + (bin) >= 0x70 ? BLOCK_BIN_SIZE_N( 7, bin - 0x70 ) : \ + (bin) >= 0x60 ? BLOCK_BIN_SIZE_N( 6, bin - 0x60 ) : \ + (bin) >= 0x50 ? BLOCK_BIN_SIZE_N( 5, bin - 0x50 ) : \ + (bin) >= 0x40 ? BLOCK_BIN_SIZE_N( 4, bin - 0x40 ) : \ + (bin) >= 0x30 ? BLOCK_BIN_SIZE_N( 3, bin - 0x30 ) : \ + (bin) >= 0x20 ? BLOCK_BIN_SIZE_N( 2, bin - 0x20 ) : \ + (bin) >= 0x10 ? BLOCK_BIN_SIZE_N( 1, bin - 0x10 ) : \ + BLOCK_BIN_SIZE_N( 0, bin - 0x00 )) + +#define BLOCK_SIZE_BIN( size ) ((size) > BIN_SIZE_MAX ? 0x80 : \ + (size) > BIN_SIZE_MIN_7 ? 0x70 + BLOCK_SIZE_BIN_N( 7, size ) : \ + (size) > BIN_SIZE_MIN_6 ? 0x60 + BLOCK_SIZE_BIN_N( 6, size ) : \ + (size) > BIN_SIZE_MIN_5 ? 0x50 + BLOCK_SIZE_BIN_N( 5, size ) : \ + (size) > BIN_SIZE_MIN_4 ? 0x40 + BLOCK_SIZE_BIN_N( 4, size ) : \ + (size) > BIN_SIZE_MIN_3 ? 0x30 + BLOCK_SIZE_BIN_N( 3, size ) : \ + (size) > BIN_SIZE_MIN_2 ? 0x20 + BLOCK_SIZE_BIN_N( 2, size ) : \ + (size) > BIN_SIZE_MIN_1 ? 0x10 + BLOCK_SIZE_BIN_N( 1, size ) : \ + (size) > BIN_SIZE_MIN_0 ? 0x0 + BLOCK_SIZE_BIN_N( 0, size ) : 0) + +#define BLOCK_SIZE_BIN_COUNT (BLOCK_SIZE_BIN( BIN_SIZE_MAX + 1 ) + 1) + +/* macros sanity checks */ +C_ASSERT( BLOCK_SIZE_BIN( 0 ) == 0 ); +C_ASSERT( BLOCK_SIZE_BIN( 0x10 ) == 0 ); +C_ASSERT( BLOCK_BIN_SIZE( 0 ) == BIN_SIZE_MIN_0 + 1 * BIN_SIZE_STEP_0 ); +C_ASSERT( BLOCK_SIZE_BIN( 0x11 ) == 1 ); +C_ASSERT( BLOCK_BIN_SIZE( 1 ) == BIN_SIZE_MIN_0 + 2 * BIN_SIZE_STEP_0 ); +C_ASSERT( BLOCK_SIZE_BIN( BIN_SIZE_MAX ) == 0x7f ); +C_ASSERT( BLOCK_BIN_SIZE( 0x7f ) == BIN_SIZE_MAX ); +C_ASSERT( BLOCK_SIZE_BIN( BIN_SIZE_MAX + 1 ) == 0x80 ); +C_ASSERT( BLOCK_BIN_SIZE( 0x80 ) == ~0u ); + +/* difference between block classes and all possible validation overhead must fit into block tail_size */ +C_ASSERT( BIN_SIZE_STEP_7 + 3 * BLOCK_ALIGN <= FIELD_MAX( struct block, tail_size ) ); + +/* a bin, tracking heap blocks of a certain size */ +struct bin +{ + /* counters for LFH activation */ + LONG count_alloc; + LONG count_freed; + LONG enabled; +}; + struct heap { /* win32/win64 */ DWORD_PTR unknown1[2]; /* 0000/0000 */ @@ -216,6 +286,7 @@ struct heap struct block **pending_free; /* Ring buffer for pending free requests */ RTL_CRITICAL_SECTION cs; struct entry free_lists[HEAP_NB_FREE_LISTS]; + struct bin *bins; SUBHEAP subheap; };
@@ -548,6 +619,16 @@ static void heap_dump( const struct heap *heap ) TRACE( "heap: %p\n", heap ); TRACE( " next %p\n", LIST_ENTRY( heap->entry.next, struct heap, entry ) );
+ TRACE( " bins:\n" ); + for (i = 0; heap->bins && i < BLOCK_SIZE_BIN_COUNT; i++) + { + const struct bin *bin = heap->bins + i; + ULONG alloc = ReadNoFence( &bin->count_alloc ), freed = ReadNoFence( &bin->count_freed ); + if (!alloc && !freed) continue; + TRACE( " %3u: size %#4x, alloc %ld, freed %ld, enabled %lu\n", i, BLOCK_BIN_SIZE( i ), + alloc, freed, ReadNoFence( &bin->enabled ) ); + } + TRACE( " free_lists: %p\n", heap->free_lists ); for (i = 0; i < HEAP_NB_FREE_LISTS; i++) TRACE( " %p: size %#8Ix, prev %p, next %p\n", heap->free_lists + i, get_free_list_block_size( i ), @@ -1434,6 +1515,13 @@ HANDLE WINAPI RtlCreateHeap( ULONG flags, void *addr, SIZE_T total_size, SIZE_T
heap_set_debug_flags( heap );
+ if (heap->flags & HEAP_GROWABLE) + { + SIZE_T size = sizeof(struct bin) * BLOCK_SIZE_BIN_COUNT; + NtAllocateVirtualMemory( NtCurrentProcess(), (void *)&heap->bins, + 0, &size, MEM_COMMIT, PAGE_READWRITE ); + } + /* link it into the per-process heap list */ if (process_heap) { @@ -1519,6 +1607,11 @@ HANDLE WINAPI RtlDestroyHeap( HANDLE handle ) NtFreeVirtualMemory( NtCurrentProcess(), &addr, &size, MEM_RELEASE ); } valgrind_notify_free_all( &heap->subheap, heap ); + if ((addr = heap->bins)) + { + size = 0; + NtFreeVirtualMemory( NtCurrentProcess(), &addr, &size, MEM_RELEASE ); + } size = 0; addr = heap; NtFreeVirtualMemory( NtCurrentProcess(), &addr, &size, MEM_RELEASE ); @@ -1576,6 +1669,25 @@ static NTSTATUS heap_allocate_block( struct heap *heap, ULONG flags, SIZE_T bloc return STATUS_SUCCESS; }
+static void bin_try_enable( struct heap *heap, struct bin *bin ) +{ + ULONG alloc = ReadNoFence( &bin->count_alloc ), freed = ReadNoFence( &bin->count_freed ); + SIZE_T block_size = BLOCK_BIN_SIZE( bin - heap->bins ); + BOOL enable = FALSE; + + if (bin == heap->bins && alloc > 0x10) enable = TRUE; + else if (bin - heap->bins < 0x30 && alloc > 0x800) enable = TRUE; + else if (bin - heap->bins < 0x30 && alloc - freed > 0x10) enable = TRUE; + else if (alloc - freed > 0x400000 / block_size) enable = TRUE; + + if (enable) InterlockedExchange( &bin->enabled, TRUE ); + if (enable && ReadNoFence( &heap->compat_info ) != HEAP_LFH) + { + ULONG info = HEAP_LFH; + RtlSetHeapInformation( heap, HeapCompatibilityInformation, &info, sizeof(info) ); + } +} + /*********************************************************************** * RtlAllocateHeap (NTDLL.@) */ @@ -1598,6 +1710,13 @@ void *WINAPI DECLSPEC_HOTPATCH RtlAllocateHeap( HANDLE handle, ULONG flags, SIZE heap_lock( heap, heap_flags ); status = heap_allocate_block( heap, heap_flags, block_size, size, &ptr ); heap_unlock( heap, heap_flags ); + + if (!status && heap->bins) + { + SIZE_T bin = BLOCK_SIZE_BIN( block_get_size( (struct block *)ptr - 1 ) ); + InterlockedIncrement( &heap->bins[bin].count_alloc ); + if (!ReadNoFence( &heap->bins[bin].enabled )) bin_try_enable( heap, &heap->bins[bin] ); + } }
if (!status) valgrind_notify_alloc( ptr, size, flags & HEAP_ZERO_MEMORY ); @@ -1632,9 +1751,13 @@ BOOLEAN WINAPI DECLSPEC_HOTPATCH RtlFreeHeap( HANDLE handle, ULONG flags, void * status = STATUS_SUCCESS; else { + SIZE_T block_size = block_get_size( block ), bin = BLOCK_SIZE_BIN( block_size ); + heap_lock( heap, heap_flags ); status = heap_free_block( heap, heap_flags, block ); heap_unlock( heap, heap_flags ); + + if (!status && heap->bins) InterlockedIncrement( &heap->bins[bin].count_freed ); }
TRACE( "handle %p, flags %#lx, ptr %p, return %u, status %#lx.\n", handle, flags, ptr, !status, status ); @@ -1646,7 +1769,7 @@ static NTSTATUS heap_resize_block( struct heap *heap, ULONG flags, struct block SIZE_T size, SIZE_T *old_size, void **ret ) { SUBHEAP *subheap = block_get_subheap( heap, block ); - SIZE_T old_block_size; + SIZE_T old_bin, old_block_size; struct entry *entry; struct block *next;
@@ -1673,6 +1796,7 @@ static NTSTATUS heap_resize_block( struct heap *heap, ULONG flags, struct block
old_block_size = block_get_size( block ); *old_size = old_block_size - block_get_overhead( block ); + old_bin = BLOCK_SIZE_BIN( old_block_size );
if (block_size >= HEAP_MIN_LARGE_BLOCK_SIZE) return STATUS_NO_MEMORY; /* growing small block to large block */
@@ -1712,6 +1836,14 @@ static NTSTATUS heap_resize_block( struct heap *heap, ULONG flags, struct block
heap_unlock( heap, flags );
+ if (heap->bins) + { + SIZE_T new_bin = BLOCK_SIZE_BIN( block_size ); + InterlockedIncrement( &heap->bins[old_bin].count_freed ); + InterlockedIncrement( &heap->bins[new_bin].count_alloc ); + if (!ReadNoFence( &heap->bins[new_bin].enabled )) bin_try_enable( heap, &heap->bins[new_bin] ); + } + *ret = block + 1; return STATUS_SUCCESS; }
From: Rémi Bernon <rbernon@codeweavers.com>
This implements the reduced-fragmentation heap frontend, carving smaller blocks out of larger allocated blocks.
The super-block and all sub-blocks are flagged with BLOCK_FLAG_LFH.
The super-block (struct group) uses a standard struct block header, as well as a list entry to be linked in the free list, and a free bitmap to track free sub-blocks.
Sub-blocks reference their super-block through the base_offset, instead of the subheap, using the block size as radix.
---
 dlls/kernel32/tests/heap.c |  20 ++-
 dlls/ntdll/heap.c          | 262 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 266 insertions(+), 16 deletions(-)
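As a side note, here is a minimal standalone sketch of the base_offset-as-radix lookup described above (simplified types; the patch's actual helpers follow in the diff below):

```c
/* Sketch: an LFH sub-block stores its index within the group in base_offset,
 * so the group header is found by stepping back index * block_size bytes. */
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct block { unsigned short base_offset; };          /* index of this sub-block in its group */
struct group { int free_bits; struct block blocks; };  /* blocks: header of the first sub-block */

static struct group *block_get_group( struct block *block, size_t block_size )
{
    char *blocks = (char *)block - block->base_offset * block_size;
    return (struct group *)(blocks - offsetof(struct group, blocks));
}

static void block_set_group( struct block *block, size_t block_size, struct group *group )
{
    size_t offset = (char *)block - (char *)&group->blocks;
    block->base_offset = (unsigned short)(offset / block_size);
}

int main(void)
{
    size_t block_size = 0x20;
    struct group *group = malloc( offsetof(struct group, blocks) + 4 * block_size );
    struct block *third = (struct block *)((char *)&group->blocks + 2 * block_size);

    block_set_group( third, block_size, group );
    printf( "round-trip ok: %d\n", block_get_group( third, block_size ) == group );
    free( group );
    return 0;
}
```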
diff --git a/dlls/kernel32/tests/heap.c b/dlls/kernel32/tests/heap.c index cf4b4fa7058..8c06e9d5f53 100644 --- a/dlls/kernel32/tests/heap.c +++ b/dlls/kernel32/tests/heap.c @@ -986,6 +986,7 @@ static void test_HeapCreate(void) ok( entries[0].cbData <= 0x1000 /* sizeof(*heap) */, "got cbData %#lx\n", entries[0].cbData ); ok( entries[0].cbOverhead == 0, "got cbOverhead %#x\n", entries[0].cbOverhead ); ok( entries[0].iRegionIndex == 0, "got iRegionIndex %d\n", entries[0].iRegionIndex ); + todo_wine /* Wine currently reports the LFH group as a single block here */ ok( entries[1].wFlags == 0, "got wFlags %#x\n", entries[1].wFlags );
for (i = 0; i < 0x12; i++) @@ -1066,6 +1067,7 @@ static void test_HeapCreate(void) for (i = 1; i < count - 2; i++) { if (entries[i].wFlags != PROCESS_HEAP_ENTRY_BUSY) continue; + todo_wine /* Wine currently reports the LFH group as a single block */ ok( entries[i].cbData == 0x18 + 2 * sizeof(void *), "got cbData %#lx\n", entries[i].cbData ); ok( entries[i].cbOverhead == 0x8, "got cbOverhead %#x\n", entries[i].cbOverhead ); } @@ -1691,9 +1693,7 @@ static void test_GlobalAlloc(void) ok( size == small_size, "GlobalSize returned %Iu\n", size ); SetLastError( 0xdeadbeef ); tmp_mem = GlobalReAlloc( mem, small_size, 0 ); - todo_wine ok( !tmp_mem, "GlobalReAlloc succeeded\n" ); - todo_wine ok( GetLastError() == ERROR_NOT_ENOUGH_MEMORY, "got error %lu\n", GetLastError() ); if (tmp_mem) mem = tmp_mem; tmp_mem = GlobalReAlloc( mem, 1024 * 1024, GMEM_MODIFY ); @@ -1709,7 +1709,6 @@ static void test_GlobalAlloc(void) ok( !!mem, "GlobalAlloc failed, error %lu\n", GetLastError() ); tmp_mem = GlobalReAlloc( mem, small_size + 1, GMEM_MOVEABLE ); ok( !!tmp_mem, "GlobalReAlloc failed, error %lu\n", GetLastError() ); - todo_wine ok( tmp_mem != mem, "GlobalReAlloc didn't relocate memory\n" ); ptr = GlobalLock( tmp_mem ); ok( !!ptr, "GlobalLock failed, error %lu\n", GetLastError() ); @@ -1856,8 +1855,8 @@ static void test_GlobalAlloc(void) { ok( !is_mem_entry( tmp_mem ), "unexpected moveable %p\n", tmp_mem ); if (flags == GMEM_MODIFY) ok( tmp_mem == mem, "GlobalReAlloc returned %p\n", tmp_mem ); - else if (flags != GMEM_MOVEABLE) todo_wine_if(!flags) ok( !tmp_mem, "GlobalReAlloc succeeded\n" ); - else todo_wine ok( tmp_mem != mem, "GlobalReAlloc returned %p\n", tmp_mem ); + else if (flags != GMEM_MOVEABLE) ok( !tmp_mem, "GlobalReAlloc succeeded\n" ); + else ok( tmp_mem != mem, "GlobalReAlloc returned %p\n", tmp_mem ); } else { @@ -1871,7 +1870,7 @@ static void test_GlobalAlloc(void)
size = GlobalSize( mem ); if (flags == GMEM_MOVEABLE) ok( size == 0 || broken( size == 1 ) /* w7 */, "GlobalSize returned %Iu\n", size ); - else todo_wine_if(!flags) ok( size == small_size, "GlobalSize returned %Iu\n", size ); + else ok( size == small_size, "GlobalSize returned %Iu\n", size );
mem = GlobalFree( mem ); ok( !mem, "GlobalFree failed, error %lu\n", GetLastError() ); @@ -2429,9 +2428,7 @@ static void test_LocalAlloc(void) ok( size == small_size, "LocalSize returned %Iu\n", size ); SetLastError( 0xdeadbeef ); tmp_mem = LocalReAlloc( mem, small_size, 0 ); - todo_wine ok( !tmp_mem, "LocalReAlloc succeeded\n" ); - todo_wine ok( GetLastError() == ERROR_NOT_ENOUGH_MEMORY, "got error %lu\n", GetLastError() ); if (tmp_mem) mem = tmp_mem; tmp_mem = LocalReAlloc( mem, 1024 * 1024, LMEM_MODIFY ); @@ -2447,7 +2444,6 @@ static void test_LocalAlloc(void) ok( !!mem, "LocalAlloc failed, error %lu\n", GetLastError() ); tmp_mem = LocalReAlloc( mem, small_size + 1, LMEM_MOVEABLE ); ok( !!tmp_mem, "LocalReAlloc failed, error %lu\n", GetLastError() ); - todo_wine ok( tmp_mem != mem, "LocalReAlloc didn't relocate memory\n" ); ptr = LocalLock( tmp_mem ); ok( !!ptr, "LocalLock failed, error %lu\n", GetLastError() ); @@ -2540,13 +2536,13 @@ static void test_LocalAlloc(void) tmp_mem = LocalReAlloc( mem, 0, flags ); ok( !is_mem_entry( tmp_mem ), "unexpected moveable %p\n", tmp_mem ); if (flags & LMEM_MODIFY) ok( tmp_mem == mem, "LocalReAlloc returned %p\n", tmp_mem ); - else if (flags != LMEM_MOVEABLE) todo_wine_if(!flags) ok( !tmp_mem, "LocalReAlloc succeeded\n" ); - else todo_wine ok( tmp_mem != mem, "LocalReAlloc returned %p\n", tmp_mem ); + else if (flags != LMEM_MOVEABLE) ok( !tmp_mem, "LocalReAlloc succeeded\n" ); + else ok( tmp_mem != mem, "LocalReAlloc returned %p\n", tmp_mem ); if (tmp_mem) mem = tmp_mem;
size = LocalSize( mem ); if (flags == LMEM_MOVEABLE) ok( size == 0 || broken( size == 1 ) /* w7 */, "LocalSize returned %Iu\n", size ); - else todo_wine_if(!flags) ok( size == small_size, "LocalSize returned %Iu\n", size ); + else ok( size == small_size, "LocalSize returned %Iu\n", size );
mem = LocalFree( mem ); ok( !mem, "LocalFree failed, error %lu\n", GetLastError() ); diff --git a/dlls/ntdll/heap.c b/dlls/ntdll/heap.c index eac4d84c78d..26a873aae87 100644 --- a/dlls/ntdll/heap.c +++ b/dlls/ntdll/heap.c @@ -84,7 +84,7 @@ struct rtl_heap_entry #define REGION_ALIGN 0x10000 #define BLOCK_ALIGN (2 * sizeof(void *))
-struct block +struct DECLSPEC_ALIGN(8) block { /* block size in multiple of BLOCK_ALIGN */ WORD block_size; @@ -104,6 +104,7 @@ C_ASSERT( sizeof(struct block) == 8 ); #define BLOCK_FLAG_PREV_FREE 0x02 #define BLOCK_FLAG_FREE_LINK 0x03 #define BLOCK_FLAG_LARGE 0x04 +#define BLOCK_FLAG_LFH 0x08 /* block is handled by the LFH frontend */ #define BLOCK_FLAG_USER_INFO 0x10 /* user flags up to 0xf0 */ #define BLOCK_FLAG_USER_MASK 0xf0
@@ -261,6 +262,9 @@ struct bin LONG count_alloc; LONG count_freed; LONG enabled; + + /* list of groups with free blocks */ + struct list groups; };
struct heap @@ -1132,7 +1136,9 @@ static BOOL validate_free_block( const struct heap *heap, const SUBHEAP *subheap err = "invalid previous free block pointer"; else if (!(block_get_flags( prev ) & BLOCK_FLAG_FREE) || block_get_type( prev ) != BLOCK_TYPE_FREE) err = "invalid previous free block header"; - else if ((next = next_block( subheap, block ))) + else if ((next = next_block( subheap, block )) && + /* LFH blocks do not use BLOCK_FLAG_PREV_FREE or back pointer */ + !(block_get_flags( block ) & BLOCK_FLAG_LFH)) { if (!(block_get_flags( next ) & BLOCK_FLAG_PREV_FREE)) err = "invalid next block flags"; @@ -1182,7 +1188,9 @@ static BOOL validate_used_block( const struct heap *heap, const SUBHEAP *subheap err = "invalid block size"; else if (block->tail_size > block_get_size( block ) - sizeof(*block)) err = "invalid block unused size"; - else if ((next = next_block( subheap, block )) && (block_get_flags( next ) & BLOCK_FLAG_PREV_FREE)) + else if ((next = next_block( subheap, block )) && (block_get_flags( next ) & BLOCK_FLAG_PREV_FREE) && + /* LFH blocks do not use BLOCK_FLAG_PREV_FREE or back pointer */ + !(block_get_flags( block ) & BLOCK_FLAG_LFH)) err = "invalid next block flags"; else if (block_get_flags( block ) & BLOCK_FLAG_PREV_FREE) { @@ -1319,6 +1327,16 @@ static inline struct block *unsafe_block_from_ptr( struct heap *heap, ULONG flag
if ((ULONG_PTR)ptr % BLOCK_ALIGN) err = "invalid ptr alignment"; + else if (block_get_flags( block ) & BLOCK_FLAG_LFH) + { + /* LFH blocks base points to the group, not the subheap */ + if (block_get_type( block ) == BLOCK_TYPE_DEAD) + err = "delayed freed block"; + else if (block_get_type( block ) == BLOCK_TYPE_FREE) + err = "already freed block"; + else if (block_get_type( block ) != BLOCK_TYPE_USED) + err = "invalid block type"; + } else if ((subheap = block_get_subheap( heap, block )) >= (SUBHEAP *)block) err = "invalid base offset"; else if (block_get_type( block ) == BLOCK_TYPE_USED) @@ -1520,6 +1538,8 @@ HANDLE WINAPI RtlCreateHeap( ULONG flags, void *addr, SIZE_T total_size, SIZE_T SIZE_T size = sizeof(struct bin) * BLOCK_SIZE_BIN_COUNT; NtAllocateVirtualMemory( NtCurrentProcess(), (void *)&heap->bins, 0, &size, MEM_COMMIT, PAGE_READWRITE ); + for (i = 0; i < BLOCK_SIZE_BIN_COUNT; ++i) + list_init( &heap->bins[i].groups ); }
/* link it into the per-process heap list */ @@ -1669,6 +1689,219 @@ static NTSTATUS heap_allocate_block( struct heap *heap, ULONG flags, SIZE_T bloc return STATUS_SUCCESS; }
+/* Low Fragmentation Heap frontend */ + +/* header for every LFH block group */ +struct DECLSPEC_ALIGN(BLOCK_ALIGN) group +{ + struct list entry; + /* one bit for each free block and the highest bit for GROUP_FLAG_FREE */ + LONG free_bits; + /* first block of a group, required for alignment */ + struct block blocks; +}; + +#define GROUP_BLOCK_COUNT (sizeof(((struct group *)0)->free_bits) * 8 - 1) +#define GROUP_FLAG_FREE (1u << GROUP_BLOCK_COUNT) + +static inline UINT block_get_group_index( const struct block *block ) +{ + return block->base_offset; +} + +static inline struct group *block_get_group( const struct block *block ) +{ + SIZE_T block_size = block_get_size( block ); + void *blocks = (char *)block - block_get_group_index( block ) * block_size; + return CONTAINING_RECORD( blocks, struct group, blocks ); +} + +static inline void block_set_group( struct block *block, SIZE_T block_size, const struct group *group ) +{ + SIZE_T offset = (char *)block - (char *)&group->blocks; + block->base_offset = offset / block_size; +} + +static inline struct block *group_get_block( struct group *group, SIZE_T block_size, UINT index ) +{ + char *blocks = (char *)&group->blocks; + return (struct block *)(blocks + index * block_size); +} + +/* lookup a free block using the group free_bits, the current thread must own the group */ +static inline LONG group_find_free_block( struct group *group, SIZE_T block_size, struct block **block ) +{ + ULONG i, free_bits = group->free_bits; + /* free_bits & ~GROUP_FLAG_FREE will never be 0 as the group is unlinked when it's fully used */ + BitScanForward( &i, free_bits & ~GROUP_FLAG_FREE ); + *block = group_get_block( group, block_size, i ); + return group->free_bits &= ~(1 << i); +} + +/* allocate a new group block using non-LFH allocation, returns a group owned by current thread */ +static struct group *group_allocate( struct heap *heap, ULONG flags, SIZE_T block_size ) +{ + SIZE_T i, group_size, group_block_size; + struct group *group; + NTSTATUS status; + + group_size = offsetof(struct group, blocks) + GROUP_BLOCK_COUNT * block_size; + group_block_size = heap_get_block_size( heap, flags, group_size ); + + if (group_block_size >= HEAP_MIN_LARGE_BLOCK_SIZE) + status = heap_allocate_large( heap, flags & ~HEAP_ZERO_MEMORY, group_block_size, group_size, (void **)&group ); + else + status = heap_allocate_block( heap, flags & ~HEAP_ZERO_MEMORY, group_block_size, group_size, (void **)&group ); + + if (status) return NULL; + + block_set_flags( (struct block *)group - 1, 0, BLOCK_FLAG_LFH ); + group->free_bits = ~0; + + for (i = 0; i < GROUP_BLOCK_COUNT; ++i) + { + struct block *block = group_get_block( group, block_size, i ); + valgrind_make_writable( block, sizeof(*block) ); + block_set_type( block, BLOCK_TYPE_FREE ); + block_set_flags( block, ~0, BLOCK_FLAG_FREE | BLOCK_FLAG_LFH ); + block_set_group( block, block_size, group ); + block_set_size( block, block_size ); + mark_block_free( block + 1, (char *)block + block_size - (char *)(block + 1), flags ); + } + + return group; +} + +/* release a fully freed group to the non-LFH subheap, group must be owned by current thread */ +static NTSTATUS group_release( struct heap *heap, ULONG flags, struct bin *bin, struct group *group ) +{ + struct block *block = (struct block *)group - 1; + NTSTATUS status; + + block_set_flags( block, BLOCK_FLAG_LFH, 0 ); + + if (block_get_flags( block ) & BLOCK_FLAG_LARGE) + status = heap_free_large( heap, flags, block ); + else + status = heap_free_block( heap, flags, block ); + + return 
status; +} + +/* acquire a group from the bin, thread takes ownership of a shared group or allocates a new one */ +static struct group *heap_acquire_bin_group( struct heap *heap, ULONG flags, SIZE_T block_size, struct bin *bin ) +{ + struct group *group; + struct list *entry; + + if ((entry = list_head( &bin->groups ))) + { + group = LIST_ENTRY( entry, struct group, entry ); + list_remove( &group->entry ); + return group; + } + + return group_allocate( heap, flags, block_size ); +} + +/* release a thread owned and fully freed group to the bin shared group, or free its memory */ +static NTSTATUS heap_release_bin_group( struct heap *heap, ULONG flags, struct bin *bin, struct group *group ) +{ + /* try re-using the block group instead of releasing it */ + if (list_empty( &bin->groups )) + { + list_add_tail( &bin->groups, &group->entry ); + return STATUS_SUCCESS; + } + + return group_release( heap, flags, bin, group ); +} + +static struct block *find_free_bin_block( struct heap *heap, ULONG flags, SIZE_T block_size, struct bin *bin ) +{ + struct block *block; + struct group *group; + + /* acquire a group, the thread will own it and no other thread can clear free bits. + * some other thread might still set the free bits if they are freeing blocks. + */ + if (!(group = heap_acquire_bin_group( heap, flags, block_size, bin ))) return NULL; + + /* serialize with heap_free_block_lfh: atomically set GROUP_FLAG_FREE when the free bits are all 0. */ + if (group_find_free_block( group, block_size, &block )) + group->free_bits &= ~GROUP_FLAG_FREE; + else if (!group->free_bits) + group->free_bits = GROUP_FLAG_FREE; + + if (!(group->free_bits & GROUP_FLAG_FREE)) + { + /* if GROUP_FLAG_FREE isn't set, thread is responsible for putting it back into group list. */ + list_add_tail( &bin->groups, &group->entry ); + } + + return block; +} + +static NTSTATUS heap_allocate_block_lfh( struct heap *heap, ULONG flags, SIZE_T block_size, + SIZE_T size, void **ret ) +{ + struct bin *bin, *last = heap->bins + BLOCK_SIZE_BIN_COUNT - 1; + struct block *block; + + bin = heap->bins + BLOCK_SIZE_BIN( block_size ); + if (ReadNoFence( &heap->compat_info ) != HEAP_LFH) return STATUS_UNSUCCESSFUL; + if (!heap->bins || bin == last) return STATUS_UNSUCCESSFUL; + if (!ReadNoFence( &bin->enabled )) return STATUS_UNSUCCESSFUL; + + block_size = BLOCK_BIN_SIZE( BLOCK_SIZE_BIN( block_size ) ); + + heap_lock( heap, flags ); + if ((block = find_free_bin_block( heap, flags, block_size, bin ))) + { + block_set_type( block, BLOCK_TYPE_USED ); + block_set_flags( block, ~BLOCK_FLAG_LFH, BLOCK_USER_FLAGS( flags ) ); + block->tail_size = block_size - sizeof(*block) - size; + initialize_block( block, 0, size, flags ); + mark_block_tail( block, flags ); + *ret = block + 1; + } + heap_unlock( heap, flags ); + + return block ? 
STATUS_SUCCESS : STATUS_NO_MEMORY; +} + +static NTSTATUS heap_free_block_lfh( struct heap *heap, ULONG flags, struct block *block ) +{ + struct bin *bin, *last = heap->bins + BLOCK_SIZE_BIN_COUNT - 1; + SIZE_T i, block_size = block_get_size( block ); + struct group *group = block_get_group( block ); + NTSTATUS status = STATUS_SUCCESS; + + if (!(block_get_flags( block ) & BLOCK_FLAG_LFH)) return STATUS_UNSUCCESSFUL; + + bin = heap->bins + BLOCK_SIZE_BIN( block_size ); + if (bin == last) return STATUS_UNSUCCESSFUL; + + i = block_get_group_index( block ); + valgrind_make_writable( block, sizeof(*block) ); + block_set_type( block, BLOCK_TYPE_FREE ); + block_set_flags( block, ~BLOCK_FLAG_LFH, BLOCK_FLAG_FREE ); + mark_block_free( block + 1, (char *)block + block_size - (char *)(block + 1), flags ); + + heap_lock( heap, flags ); + + /* if this was the last used block in a group and GROUP_FLAG_FREE was set */ + if ((group->free_bits |= (1 << i)) == ~0) + { + /* thread now owns the group, and can release it to its bin */ + status = heap_release_bin_group( heap, flags, bin, group ); + } + + heap_unlock( heap, flags ); + + return status; +} + static void bin_try_enable( struct heap *heap, struct bin *bin ) { ULONG alloc = ReadNoFence( &bin->count_alloc ), freed = ReadNoFence( &bin->count_freed ); @@ -1705,6 +1938,8 @@ void *WINAPI DECLSPEC_HOTPATCH RtlAllocateHeap( HANDLE handle, ULONG flags, SIZE status = STATUS_NO_MEMORY; else if (block_size >= HEAP_MIN_LARGE_BLOCK_SIZE) status = heap_allocate_large( heap, heap_flags, block_size, size, &ptr ); + else if (!heap_allocate_block_lfh( heap, heap_flags, block_size, size, &ptr )) + status = STATUS_SUCCESS; else { heap_lock( heap, heap_flags ); @@ -1749,6 +1984,8 @@ BOOLEAN WINAPI DECLSPEC_HOTPATCH RtlFreeHeap( HANDLE handle, ULONG flags, void * status = heap_free_large( heap, heap_flags, block ); else if (!(block = heap_delay_free( heap, heap_flags, block ))) status = STATUS_SUCCESS; + else if (!heap_free_block_lfh( heap, heap_flags, block )) + status = STATUS_SUCCESS; else { SIZE_T block_size = block_get_size( block ), bin = BLOCK_SIZE_BIN( block_size ); @@ -1768,10 +2005,10 @@ BOOLEAN WINAPI DECLSPEC_HOTPATCH RtlFreeHeap( HANDLE handle, ULONG flags, void * static NTSTATUS heap_resize_block( struct heap *heap, ULONG flags, struct block *block, SIZE_T block_size, SIZE_T size, SIZE_T *old_size, void **ret ) { - SUBHEAP *subheap = block_get_subheap( heap, block ); SIZE_T old_bin, old_block_size; struct entry *entry; struct block *next; + SUBHEAP *subheap;
if (block_get_flags( block ) & BLOCK_FLAG_LARGE) { @@ -1800,6 +2037,23 @@ static NTSTATUS heap_resize_block( struct heap *heap, ULONG flags, struct block
if (block_size >= HEAP_MIN_LARGE_BLOCK_SIZE) return STATUS_NO_MEMORY; /* growing small block to large block */
+ if (block_get_flags( block ) & BLOCK_FLAG_LFH) + { + /* as native LFH does it with different block size: refuse to resize even though we could */ + if (ROUND_SIZE( *old_size, BLOCK_ALIGN - 1) != ROUND_SIZE( size, BLOCK_ALIGN - 1)) return STATUS_NO_MEMORY; + if (size >= *old_size) return STATUS_NO_MEMORY; + + block_set_flags( block, BLOCK_FLAG_USER_MASK & ~BLOCK_FLAG_USER_INFO, BLOCK_USER_FLAGS( flags ) ); + block->tail_size = old_block_size - sizeof(*block) - size; + initialize_block( block, *old_size, size, flags ); + mark_block_tail( block, flags ); + + *ret = block + 1; + return STATUS_SUCCESS; + } + + subheap = block_get_subheap( heap, block ); + heap_lock( heap, flags );
if (block_size > old_block_size)
From: Rémi Bernon <rbernon@codeweavers.com>
---
 dlls/kernel32/tests/heap.c |  1 -
 dlls/ntdll/heap.c          | 53 ++++++++++++++++++--------------------
 2 files changed, 25 insertions(+), 29 deletions(-)
diff --git a/dlls/kernel32/tests/heap.c b/dlls/kernel32/tests/heap.c index 8c06e9d5f53..b0b56132393 100644 --- a/dlls/kernel32/tests/heap.c +++ b/dlls/kernel32/tests/heap.c @@ -1246,7 +1246,6 @@ static void test_HeapCreate(void) thread_params.flags = 0; SetEvent( thread_params.start_event ); res = WaitForSingleObject( thread_params.ready_event, 100 ); - todo_wine ok( !res, "WaitForSingleObject returned %#lx, error %lu\n", res, GetLastError() ); ret = HeapUnlock( heap ); ok( ret, "HeapUnlock failed, error %lu\n", GetLastError() ); diff --git a/dlls/ntdll/heap.c b/dlls/ntdll/heap.c index 26a873aae87..d88581fb171 100644 --- a/dlls/ntdll/heap.c +++ b/dlls/ntdll/heap.c @@ -264,7 +264,7 @@ struct bin LONG enabled;
/* list of groups with free blocks */ - struct list groups; + SLIST_HEADER groups; };
struct heap @@ -1539,7 +1539,7 @@ HANDLE WINAPI RtlCreateHeap( ULONG flags, void *addr, SIZE_T total_size, SIZE_T NtAllocateVirtualMemory( NtCurrentProcess(), (void *)&heap->bins, 0, &size, MEM_COMMIT, PAGE_READWRITE ); for (i = 0; i < BLOCK_SIZE_BIN_COUNT; ++i) - list_init( &heap->bins[i].groups ); + RtlInitializeSListHead( &heap->bins[i].groups ); }
/* link it into the per-process heap list */ @@ -1694,7 +1694,7 @@ static NTSTATUS heap_allocate_block( struct heap *heap, ULONG flags, SIZE_T bloc /* header for every LFH block group */ struct DECLSPEC_ALIGN(BLOCK_ALIGN) group { - struct list entry; + SLIST_ENTRY entry; /* one bit for each free block and the highest bit for GROUP_FLAG_FREE */ LONG free_bits; /* first block of a group, required for alignment */ @@ -1731,11 +1731,11 @@ static inline struct block *group_get_block( struct group *group, SIZE_T block_s /* lookup a free block using the group free_bits, the current thread must own the group */ static inline LONG group_find_free_block( struct group *group, SIZE_T block_size, struct block **block ) { - ULONG i, free_bits = group->free_bits; + ULONG i, free_bits = ReadNoFence( &group->free_bits ); /* free_bits & ~GROUP_FLAG_FREE will never be 0 as the group is unlinked when it's fully used */ BitScanForward( &i, free_bits & ~GROUP_FLAG_FREE ); *block = group_get_block( group, block_size, i ); - return group->free_bits &= ~(1 << i); + return InterlockedAnd( &group->free_bits, ~(1 << i) ) & ~(1 << i); }
/* allocate a new group block using non-LFH allocation, returns a group owned by current thread */ @@ -1748,15 +1748,19 @@ static struct group *group_allocate( struct heap *heap, ULONG flags, SIZE_T bloc group_size = offsetof(struct group, blocks) + GROUP_BLOCK_COUNT * block_size; group_block_size = heap_get_block_size( heap, flags, group_size );
+ heap_lock( heap, flags ); + if (group_block_size >= HEAP_MIN_LARGE_BLOCK_SIZE) status = heap_allocate_large( heap, flags & ~HEAP_ZERO_MEMORY, group_block_size, group_size, (void **)&group ); else status = heap_allocate_block( heap, flags & ~HEAP_ZERO_MEMORY, group_block_size, group_size, (void **)&group );
+ heap_unlock( heap, flags ); + if (status) return NULL;
block_set_flags( (struct block *)group - 1, 0, BLOCK_FLAG_LFH ); - group->free_bits = ~0; + WriteNoFence( &group->free_bits, ~0 );
for (i = 0; i < GROUP_BLOCK_COUNT; ++i) { @@ -1778,6 +1782,8 @@ static NTSTATUS group_release( struct heap *heap, ULONG flags, struct bin *bin, struct block *block = (struct block *)group - 1; NTSTATUS status;
+ heap_lock( heap, flags ); + block_set_flags( block, BLOCK_FLAG_LFH, 0 );
if (block_get_flags( block ) & BLOCK_FLAG_LARGE) @@ -1785,21 +1791,18 @@ static NTSTATUS group_release( struct heap *heap, ULONG flags, struct bin *bin, else status = heap_free_block( heap, flags, block );
+ heap_unlock( heap, flags ); + return status; }
/* acquire a group from the bin, thread takes ownership of a shared group or allocates a new one */ static struct group *heap_acquire_bin_group( struct heap *heap, ULONG flags, SIZE_T block_size, struct bin *bin ) { - struct group *group; - struct list *entry; + SLIST_ENTRY *entry;
- if ((entry = list_head( &bin->groups ))) - { - group = LIST_ENTRY( entry, struct group, entry ); - list_remove( &group->entry ); - return group; - } + if ((entry = RtlInterlockedPopEntrySList( &bin->groups ))) + return CONTAINING_RECORD( entry, struct group, entry );
return group_allocate( heap, flags, block_size ); } @@ -1808,9 +1811,9 @@ static struct group *heap_acquire_bin_group( struct heap *heap, ULONG flags, SIZ static NTSTATUS heap_release_bin_group( struct heap *heap, ULONG flags, struct bin *bin, struct group *group ) { /* try re-using the block group instead of releasing it */ - if (list_empty( &bin->groups )) + if (!RtlQueryDepthSList( &bin->groups )) { - list_add_tail( &bin->groups, &group->entry ); + RtlInterlockedPushEntrySList( &bin->groups, (SLIST_ENTRY *)&group->entry ); return STATUS_SUCCESS; }
@@ -1829,14 +1832,14 @@ static struct block *find_free_bin_block( struct heap *heap, ULONG flags, SIZE_T
/* serialize with heap_free_block_lfh: atomically set GROUP_FLAG_FREE when the free bits are all 0. */ if (group_find_free_block( group, block_size, &block )) - group->free_bits &= ~GROUP_FLAG_FREE; - else if (!group->free_bits) - group->free_bits = GROUP_FLAG_FREE; + InterlockedAnd( &group->free_bits, ~GROUP_FLAG_FREE ); + else + InterlockedCompareExchange( &group->free_bits, GROUP_FLAG_FREE, 0 );
- if (!(group->free_bits & GROUP_FLAG_FREE)) + if (!(ReadNoFence( &group->free_bits ) & GROUP_FLAG_FREE)) { /* if GROUP_FLAG_FREE isn't set, thread is responsible for putting it back into group list. */ - list_add_tail( &bin->groups, &group->entry ); + RtlInterlockedPushEntrySList( &bin->groups, (SLIST_ENTRY *)&group->entry ); }
return block; @@ -1855,7 +1858,6 @@ static NTSTATUS heap_allocate_block_lfh( struct heap *heap, ULONG flags, SIZE_T
block_size = BLOCK_BIN_SIZE( BLOCK_SIZE_BIN( block_size ) );
- heap_lock( heap, flags ); if ((block = find_free_bin_block( heap, flags, block_size, bin ))) { block_set_type( block, BLOCK_TYPE_USED ); @@ -1865,7 +1867,6 @@ static NTSTATUS heap_allocate_block_lfh( struct heap *heap, ULONG flags, SIZE_T mark_block_tail( block, flags ); *ret = block + 1; } - heap_unlock( heap, flags );
return block ? STATUS_SUCCESS : STATUS_NO_MEMORY; } @@ -1888,17 +1889,13 @@ static NTSTATUS heap_free_block_lfh( struct heap *heap, ULONG flags, struct bloc block_set_flags( block, ~BLOCK_FLAG_LFH, BLOCK_FLAG_FREE ); mark_block_free( block + 1, (char *)block + block_size - (char *)(block + 1), flags );
- heap_lock( heap, flags ); - /* if this was the last used block in a group and GROUP_FLAG_FREE was set */ - if ((group->free_bits |= (1 << i)) == ~0) + if (InterlockedOr( &group->free_bits, 1 << i ) == ~(1 << i)) { /* thread now owns the group, and can release it to its bin */ status = heap_release_bin_group( heap, flags, bin, group ); }
- heap_unlock( heap, flags ); - return status; }
From: Rémi Bernon <rbernon@codeweavers.com>
---
 dlls/ntdll/heap.c       | 69 +++++++++++++++++++++++++++++++++++++++--
 dlls/ntdll/loader.c     |  2 ++
 dlls/ntdll/ntdll_misc.h |  1 +
 3 files changed, 70 insertions(+), 2 deletions(-)
diff --git a/dlls/ntdll/heap.c b/dlls/ntdll/heap.c index d88581fb171..5ce29ce87ad 100644 --- a/dlls/ntdll/heap.c +++ b/dlls/ntdll/heap.c @@ -255,6 +255,9 @@ C_ASSERT( BLOCK_BIN_SIZE( 0x80 ) == ~0u ); /* difference between block classes and all possible validation overhead must fit into block tail_size */ C_ASSERT( BIN_SIZE_STEP_7 + 3 * BLOCK_ALIGN <= FIELD_MAX( struct block, tail_size ) );
+static ULONG affinity_mapping[] = {20,6,31,15,14,29,27,4,18,24,26,13,0,9,2,30,17,7,23,25,10,19,12,3,22,21,5,16,1,28,11,8}; +static LONG next_thread_affinity; + /* a bin, tracking heap blocks of a certain size */ struct bin { @@ -265,6 +268,7 @@ struct bin
/* list of groups with free blocks */ SLIST_HEADER groups; + struct group *affinity_group[ARRAY_SIZE(affinity_mapping)]; };
struct heap @@ -1697,6 +1701,8 @@ struct DECLSPEC_ALIGN(BLOCK_ALIGN) group SLIST_ENTRY entry; /* one bit for each free block and the highest bit for GROUP_FLAG_FREE */ LONG free_bits; + /* affinity of the thread which last allocated from this group */ + LONG affinity; /* first block of a group, required for alignment */ struct block blocks; }; @@ -1796,11 +1802,30 @@ static NTSTATUS group_release( struct heap *heap, ULONG flags, struct bin *bin, return status; }
+static inline ULONG heap_current_thread_affinity(void) +{ + ULONG affinity; + + if (!(affinity = NtCurrentTeb()->HeapVirtualAffinity)) + { + affinity = InterlockedIncrement( &next_thread_affinity ); + affinity = affinity_mapping[affinity % ARRAY_SIZE(affinity_mapping)]; + NtCurrentTeb()->HeapVirtualAffinity = affinity; + } + + return affinity; +} + /* acquire a group from the bin, thread takes ownership of a shared group or allocates a new one */ static struct group *heap_acquire_bin_group( struct heap *heap, ULONG flags, SIZE_T block_size, struct bin *bin ) { + ULONG affinity = NtCurrentTeb()->HeapVirtualAffinity; + struct group *group; SLIST_ENTRY *entry;
+ if ((group = InterlockedExchangePointer( (void *)&bin->affinity_group[affinity], NULL ))) + return group; + if ((entry = RtlInterlockedPopEntrySList( &bin->groups ))) return CONTAINING_RECORD( entry, struct group, entry );
@@ -1810,8 +1835,16 @@ static struct group *heap_acquire_bin_group( struct heap *heap, ULONG flags, SIZ /* release a thread owned and fully freed group to the bin shared group, or free its memory */ static NTSTATUS heap_release_bin_group( struct heap *heap, ULONG flags, struct bin *bin, struct group *group ) { + ULONG affinity = group->affinity; + + /* using InterlockedExchangePointer here would possibly return a group that has used blocks, + * we prefer keeping our fully freed group instead for reduced memory consumption. + */ + if (!InterlockedCompareExchangePointer( (void *)&bin->affinity_group[affinity], group, NULL )) + return STATUS_SUCCESS; + /* try re-using the block group instead of releasing it */ - if (!RtlQueryDepthSList( &bin->groups )) + if (RtlQueryDepthSList( &bin->groups ) <= ARRAY_SIZE(bin->affinity_group)) { RtlInterlockedPushEntrySList( &bin->groups, (SLIST_ENTRY *)&group->entry ); return STATUS_SUCCESS; @@ -1822,6 +1855,7 @@ static NTSTATUS heap_release_bin_group( struct heap *heap, ULONG flags, struct b
static struct block *find_free_bin_block( struct heap *heap, ULONG flags, SIZE_T block_size, struct bin *bin ) { + ULONG affinity = heap_current_thread_affinity(); struct block *block; struct group *group;
@@ -1829,6 +1863,7 @@ static struct block *find_free_bin_block( struct heap *heap, ULONG flags, SIZE_T * some other thread might still set the free bits if they are freeing blocks. */ if (!(group = heap_acquire_bin_group( heap, flags, block_size, bin ))) return NULL; + group->affinity = affinity;
/* serialize with heap_free_block_lfh: atomically set GROUP_FLAG_FREE when the free bits are all 0. */ if (group_find_free_block( group, block_size, &block )) @@ -1839,7 +1874,8 @@ static struct block *find_free_bin_block( struct heap *heap, ULONG flags, SIZE_T if (!(ReadNoFence( &group->free_bits ) & GROUP_FLAG_FREE)) { /* if GROUP_FLAG_FREE isn't set, thread is responsible for putting it back into group list. */ - RtlInterlockedPushEntrySList( &bin->groups, (SLIST_ENTRY *)&group->entry ); + if ((group = InterlockedExchangePointer( (void *)&bin->affinity_group[affinity], group ))) + RtlInterlockedPushEntrySList( &bin->groups, (SLIST_ENTRY *)&group->entry ); }
return block; @@ -1918,6 +1954,35 @@ static void bin_try_enable( struct heap *heap, struct bin *bin ) } }
+static void heap_thread_detach_bin_groups( struct heap *heap ) +{ + ULONG i, affinity = NtCurrentTeb()->HeapVirtualAffinity; + + if (!heap->bins) return; + + for (i = 0; i < BLOCK_SIZE_BIN_COUNT; ++i) + { + struct bin *bin = heap->bins + i; + struct group *group; + if (!(group = InterlockedExchangePointer( (void *)&bin->affinity_group[affinity], NULL ))) continue; + RtlInterlockedPushEntrySList( &bin->groups, (SLIST_ENTRY *)&group->entry ); + } +} + +void heap_thread_detach(void) +{ + struct heap *heap; + + RtlEnterCriticalSection( &process_heap->cs ); + + LIST_FOR_EACH_ENTRY( heap, &process_heap->entry, struct heap, entry ) + heap_thread_detach_bin_groups( heap ); + + heap_thread_detach_bin_groups( process_heap ); + + RtlLeaveCriticalSection( &process_heap->cs ); +} + /*********************************************************************** * RtlAllocateHeap (NTDLL.@) */ diff --git a/dlls/ntdll/loader.c b/dlls/ntdll/loader.c index 01a30742678..6c643575f8d 100644 --- a/dlls/ntdll/loader.c +++ b/dlls/ntdll/loader.c @@ -3737,6 +3737,8 @@ void WINAPI LdrShutdownThread(void) /* don't call DbgUiGetThreadDebugObject as some apps hook it and terminate if called */ if (NtCurrentTeb()->DbgSsReserved[1]) NtClose( NtCurrentTeb()->DbgSsReserved[1] ); RtlFreeThreadActivationContextStack(); + + heap_thread_detach(); }
diff --git a/dlls/ntdll/ntdll_misc.h b/dlls/ntdll/ntdll_misc.h index d1a7790991b..f6b77b79cde 100644 --- a/dlls/ntdll/ntdll_misc.h +++ b/dlls/ntdll/ntdll_misc.h @@ -127,5 +127,6 @@ static inline void ascii_to_unicode( WCHAR *dst, const char *src, size_t len )
/* FLS data */ extern TEB_FLS_DATA *fls_alloc_data(void) DECLSPEC_HIDDEN; +extern void heap_thread_detach(void) DECLSPEC_HIDDEN;
#endif
On Thu Feb 2 15:52:10 2023 +0000, Jinoh Kang wrote:
Since `RtlQueryHeapInformation` is not an NT system service (syscall), native might just propagate the exception instead of handling it.
According to the tests, calling `HeapQueryInformation( 0, ... )` fails and sets the last error to `ERROR_NOACCESS`, so I think the exception is trapped somewhere.
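For reference, the behaviour being discussed is roughly the following (mirrors the test change above, only calling kernel32's HeapQueryInformation directly instead of through the test's function pointer):

```c
/* Sketch of the NULL-handle case: on Windows this fails and the last error
 * is ERROR_NOACCESS rather than an exception reaching the caller. */
#include <windows.h>
#include <stdio.h>

int main(void)
{
    ULONG compat_info = 0xdeadbeef;
    SIZE_T size = 0;
    BOOL ret;

    SetLastError( 0xdeadbeef );
    ret = HeapQueryInformation( 0, HeapCompatibilityInformation, &compat_info, sizeof(compat_info), &size );
    printf( "ret %d, error %lu, size %Iu\n", ret, GetLastError(), size );
    return 0;
}
```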
On Fri Feb 3 07:41:57 2023 +0000, Rémi Bernon wrote:
changed this line in [version 6 of the diff](/wine/wine/-/merge_requests/1628/diffs?diff_id=30762&start_sha=6190611728b9a3a81f510597496bb565b5902551#ce84fe3b548046500368ac49e84ae7006719c2d4_802_824)
I didn't much like having to restore the flags after the call, and I didn't want to pass another parameter either. The helper also does a bit more than necessary for LFH, so I'm just not using it anymore.
On Fri Feb 3 07:41:56 2023 +0000, Rémi Bernon wrote:
changed this line in [version 6 of the diff](/wine/wine/-/merge_requests/1628/diffs?diff_id=30762&start_sha=6190611728b9a3a81f510597496bb565b5902551#ce84fe3b548046500368ac49e84ae7006719c2d4_2527_2551)
Done.
On Fri Feb 3 07:41:56 2023 +0000, Rémi Bernon wrote:
changed this line in [version 6 of the diff](/wine/wine/-/merge_requests/1628/diffs?diff_id=30762&start_sha=6190611728b9a3a81f510597496bb565b5902551#ce84fe3b548046500368ac49e84ae7006719c2d4_239_265)
Right, good idea. I also replaced the access with atomic operations.
On Fri Feb 3 07:41:57 2023 +0000, Rémi Bernon wrote:
changed this line in [version 6 of the diff](/wine/wine/-/merge_requests/1628/diffs?diff_id=30762&start_sha=6190611728b9a3a81f510597496bb565b5902551#ce84fe3b548046500368ac49e84ae7006719c2d4_1991_2013)
I'm using `InterlockedIncrement` now, so this is probably unnecessary.
On Fri Feb 3 07:41:59 2023 +0000, Rémi Bernon wrote:
changed this line in [version 6 of the diff](/wine/wine/-/merge_requests/1628/diffs?diff_id=30762&start_sha=6190611728b9a3a81f510597496bb565b5902551#ce84fe3b548046500368ac49e84ae7006719c2d4_1741_1754)
Right, less memory usage! I added a macro to make it more readable.
On Fri Feb 3 07:41:59 2023 +0000, Rémi Bernon wrote:
changed this line in [version 6 of the diff](/wine/wine/-/merge_requests/1628/diffs?diff_id=30762&start_sha=6190611728b9a3a81f510597496bb565b5902551#ce84fe3b548046500368ac49e84ae7006719c2d4_1726_1740)
I'm not sure it did matter, but anyway I've replaced this with `BitScanForward`.
On Fri Feb 3 07:41:58 2023 +0000, Rémi Bernon wrote:
changed this line in [version 6 of the diff](/wine/wine/-/merge_requests/1628/diffs?diff_id=30762&start_sha=6190611728b9a3a81f510597496bb565b5902551#ce84fe3b548046500368ac49e84ae7006719c2d4_1723_1740)
Yeah, doing that now.
On Fri Feb 3 07:41:58 2023 +0000, Rémi Bernon wrote:
changed this line in [version 6 of the diff](/wine/wine/-/merge_requests/1628/diffs?diff_id=30762&start_sha=6190611728b9a3a81f510597496bb565b5902551#ce84fe3b548046500368ac49e84ae7006719c2d4_1723_1740)
Done.
On Wed Feb 1 21:06:44 2023 +0000, Zebediah Figura wrote:
Maybe not worth doing at this point, but I think this can get moved below the block initialization.
Done.
On Wed Feb 1 21:06:44 2023 +0000, Zebediah Figura wrote:
Yeah, in general it's probably a lot easier to be confident about this patch if we restrict the locks to where they're definitely needed in earlier patches. Looking at the rest of this patch, I'm obviously not familiar with the heap code at all, but it *looks* like it makes sense, assuming I can correctly read what's possibly-concurrent and what isn't. And the lock-free parts look correct as well.
I removed these locks.
On Fri Feb 3 07:41:56 2023 +0000, Rémi Bernon wrote:
changed this line in [version 6 of the diff](/wine/wine/-/merge_requests/1628/diffs?diff_id=30762&start_sha=6190611728b9a3a81f510597496bb565b5902551#ce84fe3b548046500368ac49e84ae7006719c2d4_1826_1847)
Well, all the `Rtl*SList` functions would benefit from being inlined here, I think, although the Pop function is really bad with its exception handler. Anyway I think the affinity mechanism covers the performance problem with the linked list.
On Wed Feb 1 21:10:46 2023 +0000, Zebediah Figura wrote:
Right, but in theory you'd never actually access the dummy bytes; they'd just exist to correct the alignment. (Again, not arguing too hard for it, I just don't like casts personally.)
That should be resolved now.
On Fri Feb 3 07:42:00 2023 +0000, Rémi Bernon wrote:
changed this line in [version 6 of the diff](/wine/wine/-/merge_requests/1628/diffs?diff_id=30762&start_sha=6190611728b9a3a81f510597496bb565b5902551#ce84fe3b548046500368ac49e84ae7006719c2d4_1734_1748)
Right thanks! I had stats added for blocks at some point, could be something to have later but I've removed it for now.
On Fri Feb 3 07:42:00 2023 +0000, Rémi Bernon wrote:
changed this line in [version 6 of the diff](/wine/wine/-/merge_requests/1628/diffs?diff_id=30762&start_sha=6190611728b9a3a81f510597496bb565b5902551#ce84fe3b548046500368ac49e84ae7006719c2d4_1680_1701)
I've done that, although it now requires the first block header to be explicit. It's the same with `subheap`s though, so probably fine.
On Fri Feb 3 07:42:02 2023 +0000, Rémi Bernon wrote:
changed this line in [version 6 of the diff](/wine/wine/-/merge_requests/1628/diffs?diff_id=30762&start_sha=6190611728b9a3a81f510597496bb565b5902551#ce84fe3b548046500368ac49e84ae7006719c2d4_1860_1874)
Okay.
On Fri Feb 3 07:42:01 2023 +0000, Rémi Bernon wrote:
changed this line in [version 6 of the diff](/wine/wine/-/merge_requests/1628/diffs?diff_id=30762&start_sha=6190611728b9a3a81f510597496bb565b5902551#ce84fe3b548046500368ac49e84ae7006719c2d4_1831_1853)
Looks good, thanks.
On Fri Feb 3 07:42:01 2023 +0000, Rémi Bernon wrote:
changed this line in [version 6 of the diff](/wine/wine/-/merge_requests/1628/diffs?diff_id=30762&start_sha=6190611728b9a3a81f510597496bb565b5902551#ce84fe3b548046500368ac49e84ae7006719c2d4_1803_1829)
Done.
On Fri Feb 3 07:42:02 2023 +0000, Rémi Bernon wrote:
changed this line in [version 6 of the diff](/wine/wine/-/merge_requests/1628/diffs?diff_id=30762&start_sha=6190611728b9a3a81f510597496bb565b5902551#ce84fe3b548046500368ac49e84ae7006719c2d4_1859_1874)
Hmm, I'm not sure I understand what you mean. The group ownership here is only in the sense of "who's responsible for putting it back into the group list".
If we've set GROUP_FLAG_FREE, we delegated that to the future last block free-er. If not, then we still own the group and need to put it back into the list ourselves.
Anyway, I reworded the comment a bit.
On Fri Feb 3 07:42:03 2023 +0000, Rémi Bernon wrote:
changed this line in [version 6 of the diff](/wine/wine/-/merge_requests/1628/diffs?diff_id=30762&start_sha=6190611728b9a3a81f510597496bb565b5902551#ce84fe3b548046500368ac49e84ae7006719c2d4_1816_1840)
Right.
On Fri Feb 3 07:42:02 2023 +0000, Rémi Bernon wrote:
changed this line in [version 6 of the diff](/wine/wine/-/merge_requests/1628/diffs?diff_id=30762&start_sha=6190611728b9a3a81f510597496bb565b5902551#ce84fe3b548046500368ac49e84ae7006719c2d4_206_198)
I think it simplifies some of the code, and for instance it allows keeping counters for the "no" category.
I've also reworked the size classification and enable heuristics, to better spread them out and better match native (not exactly, but that shouldn't matter).
On Fri Feb 3 07:42:01 2023 +0000, Rémi Bernon wrote:
changed this line in [version 6 of the diff](/wine/wine/-/merge_requests/1628/diffs?diff_id=30762&start_sha=6190611728b9a3a81f510597496bb565b5902551#ce84fe3b548046500368ac49e84ae7006719c2d4_245_271)
I've added a static affinity shuffle, which helps quite a bit. I'm also now keeping track of the group affinity, and releasing groups to their original affinity slot.
That also helps memory consumption and improves group re-use if one thread allocates and another one frees. Only allocating threads get an affinity now.
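For illustration, here is a minimal sketch of what such a per-thread slot assignment with a static shuffle could look like; the `affinity_mapping` table, the `next_thread_affinity` counter and the use of the TEB's `HeapVirtualAffinity` field are assumptions of the example, not necessarily what the patch does.

```c
/* hypothetical sketch: spread allocating threads over affinity slots using a
 * statically shuffled table, assigning a slot lazily on first allocation */
static const BYTE affinity_mapping[] = {20, 6, 31, 15, 14, 29, 27, 4, 18, 24, 26, 13, 0, 9, 2, 30,
                                        17, 7, 23, 25, 10, 19, 12, 8, 22, 28, 16, 11, 3, 1, 21, 5};
static LONG next_thread_affinity;

static ULONG heap_current_thread_affinity(void)
{
    ULONG affinity = NtCurrentTeb()->HeapVirtualAffinity;

    if (!affinity) /* 0 means "no slot assigned yet", so slots are stored off by one */
    {
        ULONG seq = InterlockedIncrement( &next_thread_affinity );
        affinity = affinity_mapping[seq % ARRAY_SIZE(affinity_mapping)] + 1;
        NtCurrentTeb()->HeapVirtualAffinity = affinity;
    }

    return affinity - 1;
}
```

Since only the allocation path would call something like this, threads that only free blocks never consume a slot.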
On Fri Feb 3 07:41:57 2023 +0000, Rémi Bernon wrote:
changed this line in [version 6 of the diff](/wine/wine/-/merge_requests/1628/diffs?diff_id=30762&start_sha=6190611728b9a3a81f510597496bb565b5902551#ce84fe3b548046500368ac49e84ae7006719c2d4_236_261)
Yeah, I thought bin was a bit small, but it's probably alright. I've renamed category to bin.
On Fri Feb 3 07:42:16 2023 +0000, Rémi Bernon wrote:
Well, all the `Rtl*SList` functions would benefit from being inlined here, I think, although the Pop function is really bad with its exception handler. Anyway I think the affinity mechanism covers the performance problem with the linked list.
If we provide inline versions I think we want to just skip the exception handler, like for lstrcpyn().
On Fri Feb 3 18:07:07 2023 +0000, Zebediah Figura wrote:
If we provide inline versions I think we want to just skip the exception handler, like for lstrcpyn().
You can't just skip it, it makes it incorrect. It needs to be written differently.
On Fri Feb 3 19:35:22 2023 +0000, Rémi Bernon wrote:
You can't just skip it, it makes it incorrect. It needs to be written differently.
Oh, hmm, you're right, I hadn't looked at how that was done.
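For context, here is a simplified sketch of why that handler is there at all; this is an illustrative 32-bit-style pop, not the real `RtlInterlockedPopEntrySList`, and it omits the Depth/Sequence handling the real one needs against ABA.

```c
/* illustrative only: a naive lock-free pop showing where the fault can happen */
static SLIST_ENTRY *sketch_slist_pop( SLIST_ENTRY **list_head )
{
    SLIST_ENTRY *first, *next;

    for (;;)
    {
        if (!(first = *list_head)) return NULL;
        /* another thread may pop "first" and free its memory between the read above
         * and this dereference, so the load of first->Next can fault; the real
         * implementation catches that fault and retries, which is why an inline
         * version cannot simply drop the handler, it has to avoid the hazard in
         * some other way instead */
        next = first->Next;
        if (InterlockedCompareExchangePointer( (void **)list_head, next, first ) == first)
            return first;
    }
}
```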
On Wed Feb 1 16:56:07 2023 +0000, Rémi Bernon wrote:
I believe this is because of a use-after-free bug that is better handled with the LFH as we avoid touching the blocks now. Currently the `dxgi:dxgi` test crashes silently and only testbot catches that with:
The main process has no test summary line
The test has unaccounted for failure messages
The test has unaccounted for todo messages
The test returned success despite having failures
That failure is actually due to calling IDXGIResource::GetUsage() with an uninitialized stack parameter. Our stub doesn't initialize the variable. I've sent a patch for this: https://gitlab.winehq.org/wine/wine/-/merge_requests/2107/diffs
Not sure why the LFH triggers it, since it should just be uninitialized stack data?
Currently the `dxgi:dxgi` test crashes silently and only testbot catches that with:
The main process has no test summary line
The test has unaccounted for failure messages
The test has unaccounted for todo messages
The test returned success despite having failures
I don't see this on the testbot patterns page, nor in this specific failure. Where are you seeing this?
On Fri Feb 3 23:18:01 2023 +0000, Zebediah Figura wrote:
That failure is actually due to calling IDXGIResource::GetUsage() with an uninitialized stack parameter. Our stub doesn't initialize the variable. I've sent a patch for this: https://gitlab.winehq.org/wine/wine/-/merge_requests/2107/diffs Not sure why the LFH triggers it, since it should just be uninitialized stack data?
Currently the `dxgi:dxgi` test crashes silently and only testbot catches that with:
The main process has no test summary line
The test has unaccounted for failure messages
The test has unaccounted for todo messages
The test returned success despite having failures
I don't see this on the testbot patterns page, nor in this specific failure. Where are you seeing this?
Here for instance https://test.winehq.org/data/a4ed65d57721adade08440b9cabf44e3394fc3ad/linux_...
To clarify, `group->flags & GROUP_FLAG_FREE` is set _if and only if_ the group was full at the time of the _last_ satisfied allocation request for the group, right?
On Sat Feb 4 17:25:00 2023 +0000, Jinoh Kang wrote:
To clarify, `group->flags & GROUP_FLAG_FREE` is set _if and only if_ the group was full at the time of the _last_ satisfied allocation request for the group, right?
Yes, although it's not strictly synchronized with the block allocation: the block allocation may complete using the last block of the group, but then when we try to set the `GROUP_FLAG_FREE` flag, we find that another thread has freed a block, in which case the flag isn't set of course.
It is also possible to think of it as being equivalent to *the group is not linked anymore and the last thread to release a block to it gets ownership and is responsible for either putting the group back into a free list, or releasing it*.
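To make that handoff concrete, here is a minimal sketch of the free side under those rules; `block_get_group`, `block_get_group_index`, `GROUP_BLOCK_COUNT` and `heap_release_bin_group` are assumed names, and this is not the actual patch code.

```c
/* hypothetical sketch: the thread whose free completes the "unlinked and fully
 * free" state is the one that must relink or release the group */
static void sketch_free_block_lfh( struct heap *heap, ULONG flags, struct block *block )
{
    struct group *group = block_get_group( block );
    LONG bit = 1 << block_get_group_index( group, block );
    LONG all_free = GROUP_FLAG_FREE | ((1 << GROUP_BLOCK_COUNT) - 1);

    /* InterlockedOr returns the previous value; only one thread can observe the
     * transition to all_free, because allocators cannot touch an unlinked group
     * and no other free bit is left to set afterwards */
    if ((InterlockedOr( &group->free_bits, bit ) | bit) == all_free)
        heap_release_bin_group( heap, flags, group );
}
```

If GROUP_FLAG_FREE was never set, the comparison can never match and relinking stays the allocator's job, which is exactly the delegation described above.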
On Sat Feb 4 08:27:02 2023 +0000, Rémi Bernon wrote:
Here for instance https://test.winehq.org/data/a4ed65d57721adade08440b9cabf44e3394fc3ad/linux_...
Interesting, thanks. I guess the patterns page doesn't track that machine.
I don't think it's related to this; I guess it *could* be memory corruption but it's really impossible to tell without more data.
On Sat Feb 4 17:25:37 2023 +0000, Rémi Bernon wrote:
Yes, although it's not strictly synchronized with the block allocation: the block allocation may complete using the last block of the group, but then when we try to set the `GROUP_FLAG_FREE` flag, we find that another thread has freed a block. In which case the flag isn't set of course. It is also possible to think about it being equivalent to *the group is not linked anymore and the last thread to release a block to it gets ownership and is responsible of either putting the group back into a free list, or release it*.
Thanks for your remark! Is there a reason that we do not _immediately_ put the group back to the group list after a block has been freed and the group now has block slots available?
On Sun Feb 5 15:24:53 2023 +0000, Jinoh Kang wrote:
Thanks for your remark! Is there a reason that we do not _immediately_ put the group back to the group list after a block has been freed and the group now has block slots available?
The reason was that I think it's harder to avoid races. You need to synchronize all potential free-ers, whereas doing that on the last one you can be sure it's the only one still doing something with the group (as it's also unlinked).
Jinoh Kang (@iamahuman) commented about dlls/ntdll/heap.c:
if ((ULONG_PTR)ptr % BLOCK_ALIGN) err = "invalid ptr alignment";
- else if (block_get_flags( block ) & BLOCK_FLAG_LFH)
- {
/* LFH blocks base points to the group, not the subheap */
if (block_get_type( block ) == BLOCK_TYPE_DEAD)
err = "delayed freed block";
else if (block_get_type( block ) == BLOCK_TYPE_FREE)
err = "already freed block";
else if (block_get_type( block ) != BLOCK_TYPE_USED)
err = "invalid block type";
- }
I think it helps readability to hoist the type checks (dead and free) common to both LFH and non-LFH.
```suggestion:-9+0
    else if (block_get_type( block ) == BLOCK_TYPE_DEAD)
        err = "delayed freed block";
    else if (block_get_type( block ) == BLOCK_TYPE_FREE)
        err = "already freed block";
    else if (block_get_flags( block ) & BLOCK_FLAG_LFH)
    {
        if (block_get_type( block ) != BLOCK_TYPE_USED)
            err = "invalid block type";

        /* NOTE: A LFH block does not have a subheap (its base points to a group instead) */
    }
```
Jinoh Kang (@iamahuman) commented about dlls/ntdll/heap.c:
return STATUS_SUCCESS;
}
+/* Low Fragmentation Heap frontend */
+/* header for every LFH block group */ +struct DECLSPEC_ALIGN(BLOCK_ALIGN) group +{
- SLIST_ENTRY entry;
- /* one bit for each free block and the highest bit for GROUP_FLAG_FREE */
- LONG free_bits;
- /* affinity of the thread which last allocated from this group */
- LONG affinity;
- /* first block of a group, required for alignment */
- struct block blocks;
Since the comment calls it the first block, maybe it's a good idea to make it clear in the name as well.
```suggestion:-0+0
    struct block first_block;
```
I think `blocks` would be more appropriate if it was declared as a flexible array member (or at least as a 1-sized array) and the block itself was fixed-size.
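For comparison, a sketch of what that flexible-array-member variant could look like; it only works under the assumption that `struct block` covered the whole fixed block size, which isn't the case here.

```c
/* hypothetical layout, only meaningful if blocks had a fixed size */
struct DECLSPEC_ALIGN(BLOCK_ALIGN) group
{
    SLIST_ENTRY entry;
    /* one bit for each free block and the highest bit for GROUP_FLAG_FREE */
    LONG free_bits;
    /* affinity of the thread which last allocated from this group */
    LONG affinity;
    /* all blocks of the group, directly indexable as blocks[i] */
    struct block blocks[];
};
```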
Jinoh Kang (@iamahuman) commented about dlls/ntdll/heap.c:
- return block;
+}
+static NTSTATUS heap_allocate_block_lfh( struct heap *heap, ULONG flags, SIZE_T block_size,
SIZE_T size, void **ret )
+{
- struct bin *bin, *last = heap->bins + BLOCK_SIZE_BIN_COUNT - 1;
- struct block *block;
- bin = heap->bins + BLOCK_SIZE_BIN( block_size );
- if (ReadNoFence( &heap->compat_info ) != HEAP_LFH) return STATUS_UNSUCCESSFUL;
- if (!heap->bins || bin == last) return STATUS_UNSUCCESSFUL;
- if (!ReadNoFence( &bin->enabled )) return STATUS_UNSUCCESSFUL;
- block_size = BLOCK_BIN_SIZE( BLOCK_SIZE_BIN( block_size ) );
Nit: extraneous space.
```suggestion:-0+0
    block_size = BLOCK_BIN_SIZE( BLOCK_SIZE_BIN( block_size ));
```
Jinoh Kang (@iamahuman) commented about dlls/ntdll/heap.c:
+{
- ULONG alloc = ReadNoFence( &bin->count_alloc ), freed = ReadNoFence( &bin->count_freed );
- SIZE_T block_size = BLOCK_BIN_SIZE( bin - heap->bins );
- BOOL enable = FALSE;
- if (bin == heap->bins && alloc > 0x10) enable = TRUE;
- else if (bin - heap->bins < 0x30 && alloc > 0x800) enable = TRUE;
- else if (bin - heap->bins < 0x30 && alloc - freed > 0x10) enable = TRUE;
- else if (alloc - freed > 0x400000 / block_size) enable = TRUE;
- if (enable) InterlockedExchange( &bin->enabled, TRUE );
- if (enable && ReadNoFence( &heap->compat_info ) != HEAP_LFH)
- {
ULONG info = HEAP_LFH;
RtlSetHeapInformation( heap, HeapCompatibilityInformation, &info, sizeof(info) );
- }
I've described some possible room for improvement below. Feel free to dismiss them if you believe them not to be significant.
1. **State simplification**: This can be improved so that LFH activation comes before LFH bin activation. This lets us maintain the invariant that a LFH bin is enabled _only if_ LFH is activated for the entire heap. This invariant gives us an option (not mandatory) to simplify the guard clauses in `heap_allocate_block_lfh` by removing the `compat_info == HEAP_LFH` check, which will make the LFH code path a little faster.
Perhaps your intention was to abide by a "transactional" code pattern, where you first make changes to the bin _and then_ commit it by flipping the LFH enable switch. While I indeed favor this code style in general, I think it's more useful to keep the invariant above since it reduces the number of states to reason about (e.g. a standard heap cannot ever have any LFH-enabled bins) and simplifies checks.
2. **Performance**: This does not need the sequentially consistent memory ordering guarantee imposed by `InterlockedExchange`. A sequentially consistent access implies a full barrier, which may have a nontrivial performance impact compared to weaker orderings, especially on weakly memory-ordered architectures such as ARM. How about downgrading it to `WriteRelease`?
3. **Control flow simplification**: Two conditional control structures, one an `if` statement and the other a ternary expression, are controlled by the same predicate: `enable`. We can merge them (jump threading, in compiler terms) to remove duplicate expressions.
The suggestion below uses an early return statement. You can use nested `if`s as well, but I think guard clauses without visible side effects can help readability in C (which lacks many control structure syntactic sugars found in many functional programming languages).
```suggestion:-4+0
    if (!enable) return;

    if (ReadNoFence( &heap->compat_info ) != HEAP_LFH)
    {
        ULONG info = HEAP_LFH;
        /* NOTE: assume that failure means HEAP_LFH has been set concurrently */
        RtlSetHeapInformation( heap, HeapCompatibilityInformation, &info, sizeof(info) );
    }

    /* paired with ReadAcquire in heap_allocate_block_lfh. */
    WriteRelease( &bin->enabled, TRUE );
```
Jinoh Kang (@iamahuman) commented about dlls/ntdll/heap.c:
        status = STATUS_NO_MEMORY;
    else if (block_size >= HEAP_MIN_LARGE_BLOCK_SIZE)
        status = heap_allocate_large( heap, heap_flags, block_size, size, &ptr );
    else if (!heap_allocate_block_lfh( heap, heap_flags, block_size, size, &ptr ))
        status = STATUS_SUCCESS;
    else
    {
        heap_lock( heap, heap_flags );
        status = heap_allocate_block( heap, heap_flags, block_size, size, &ptr );
        heap_unlock( heap, heap_flags );
    }

    if (!status && heap->bins)
    {
        SIZE_T bin = BLOCK_SIZE_BIN( block_get_size( (struct block *)ptr - 1 ) );
        InterlockedIncrement( &heap->bins[bin].count_alloc );
Just to make sure: do we want to keep tallying even after the bin is enabled?
Jinoh Kang (@iamahuman) commented about dlls/ntdll/heap.c:
- group_size = offsetof(struct group, blocks) + GROUP_BLOCK_COUNT * block_size;
- group_block_size = heap_get_block_size( heap, flags, group_size );
- heap_lock( heap, flags );
- if (group_block_size >= HEAP_MIN_LARGE_BLOCK_SIZE)
status = heap_allocate_large( heap, flags & ~HEAP_ZERO_MEMORY, group_block_size, group_size, (void **)&group );
- else
status = heap_allocate_block( heap, flags & ~HEAP_ZERO_MEMORY, group_block_size, group_size, (void **)&group );
- heap_unlock( heap, flags );
- if (status) return NULL;
- block_set_flags( (struct block *)group - 1, 0, BLOCK_FLAG_LFH );
Note: we may want to distinguish LFH groups from actual LFH blocks, for heap-walking and validation purposes.
Jinoh Kang (@iamahuman) commented about dlls/ntdll/heap.c:
RtlInterlockedPushEntrySList( &bin->groups, (SLIST_ENTRY *)&group->entry );
- }
- return block;
+}
+static NTSTATUS heap_allocate_block_lfh( struct heap *heap, ULONG flags, SIZE_T block_size,
SIZE_T size, void **ret )
+{
- struct bin *bin, *last = heap->bins + BLOCK_SIZE_BIN_COUNT - 1;
- struct block *block;
- bin = heap->bins + BLOCK_SIZE_BIN( block_size );
- if (ReadNoFence( &heap->compat_info ) != HEAP_LFH) return STATUS_UNSUCCESSFUL;
- if (!heap->bins || bin == last) return STATUS_UNSUCCESSFUL;
- if (!ReadNoFence( &bin->enabled )) return STATUS_UNSUCCESSFUL;
We should ensure that the bin use _happens after_ its activation.
```suggestion:-0+0
    /* paired with WriteRelease in bin_try_enable. */
    if (!ReadAcquire( &bin->enabled )) return STATUS_UNSUCCESSFUL;
```
Jinoh Kang (@iamahuman) commented about dlls/ntdll/heap.c:
old_block_size = block_get_size( block ); *old_size = old_block_size - block_get_overhead( block );
old_bin = BLOCK_SIZE_BIN( old_block_size );
if (block_size >= HEAP_MIN_LARGE_BLOCK_SIZE) return STATUS_NO_MEMORY; /* growing small block to large block */
if (block_get_flags( block ) & BLOCK_FLAG_LFH)
{
/* as native LFH does it with different block size: refuse to resize even though we could */
if (ROUND_SIZE( *old_size, BLOCK_ALIGN - 1) != ROUND_SIZE( size, BLOCK_ALIGN - 1)) return STATUS_NO_MEMORY;
if (size >= *old_size) return STATUS_NO_MEMORY;
block_set_flags( block, BLOCK_FLAG_USER_MASK & ~BLOCK_FLAG_USER_INFO, BLOCK_USER_FLAGS( flags ) );
block->tail_size = old_block_size - sizeof(*block) - size;
initialize_block( block, *old_size, size, flags );
mark_block_tail( block, flags );
`heap_resize_block` is already too complex. How about factoring this out into a helper function like the following?
```c
/* Finish resizing a block of memory and re-initialize it. */
static void commit_block_resize( struct block *block, SIZE_T old_size, SIZE_T size, DWORD flags )
{
    valgrind_notify_resize( block + 1, old_size, size );
    block_set_flags( block, BLOCK_FLAG_USER_MASK & ~BLOCK_FLAG_USER_INFO, BLOCK_USER_FLAGS( flags ) );
    block->tail_size = block_get_size( block ) - sizeof(*block) - size;
    initialize_block( block, old_size, size, flags );
    mark_block_tail( block, flags );
}
```
Feel free to rename it if the "commit" part feels like it will be easily confused with virtual memory commitment.
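For what it's worth, the LFH branch above could then shrink to something like this; the `*ret` handling and the exact `heap_resize_block` signature are assumptions of the sketch.

```c
    if (block_get_flags( block ) & BLOCK_FLAG_LFH)
    {
        /* as native LFH does it with different block size: refuse to resize even though we could */
        if (ROUND_SIZE( *old_size, BLOCK_ALIGN - 1 ) != ROUND_SIZE( size, BLOCK_ALIGN - 1 )) return STATUS_NO_MEMORY;
        if (size >= *old_size) return STATUS_NO_MEMORY;

        commit_block_resize( block, *old_size, size, flags );
        *ret = block + 1;
        return STATUS_SUCCESS;
    }
```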
On Sun Feb 5 18:02:10 2023 +0000, Rémi Bernon wrote:
The reason was that I think it's harder to avoid races. You need to synchronize all potential free-ers, whereas doing that on the last one you can be sure it's the only one still doing something with the group (as it's also unlinked).
Personally, I find the `GROUP_FLAG_FREE` flag a little unintuitive due to the following reasons:
- Spatial dependence of semantics. The `GROUP_FLAG_FREE` flag in `free_bits` indicates many things depending on the value of the entire `free_bits` bitfield:
  - If `GROUP_FLAG_FREE` is set and all other bits in `free_bits` are set, it means that the group is not being referenced anywhere else.
  - If `GROUP_FLAG_FREE` is set and some other bits in `free_bits` are unset, it means that the group is still being shared with other allocations and any subsequent or concurrent `heap_free_block_lfh` call may free the group.
  - If `GROUP_FLAG_FREE` is unset, it means that the group is owned by a thread (a stack variable), the affinity group array, or the group lookaside list.
- Temporal dependence of computation. Whether `GROUP_FLAG_FREE` is set or not is a function of not only the `free_bits` bitfield, but also the history of previous `heap_allocate_block_lfh` and `heap_free_block_lfh` calls.
Instead, I propose reusing the `group->entry.Next` field to indicate whether the group is contained in an affinity group array or the group lookaside list. The `Next` field will be interpreted as follows:
- `Next == (SLIST_ENTRY *)-1`: The group is currently unlinked from `affinity_group` and any group lookaside list. The next call to `heap_free_block_lfh` will release the group.
- `Next == NULL`: The group is owned by a thread (a stack variable), is the last element of a group lookaside list, or is referenced by `affinity_group`.
- `Next` is any other value: The group is an element of a group lookaside list.
With the above rules in mind, the algorithm will be changed as follows:
- If `find_free_bin_block` determines that the free bits are all 0 (i.e. the block is full), it sets `group->entry.Next = (void *)-1` instead of putting it back to the group list.
- `heap_free_block_lfh` calls `InterlockedCompareExchange( &group->entry.Next, NULL, (void *)-1 )` to claim ownership of the group _before_ manipulating `free_bits`. If the CAS is successful, it calls `heap_release_bin_group` _after_ setting the corresponding bit in `free_bits`.
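A rough sketch of the free side under this proposal, just to make the ordering explicit; the `heap_release_bin_group` helper and the already-computed bit index are assumptions here.

```c
/* hypothetical: claim the full, unlinked group before touching free_bits */
static void sketch_free_block_lfh_next_field( struct heap *heap, ULONG flags, struct group *group, ULONG index )
{
    BOOL claimed = InterlockedCompareExchangePointer( (void **)&group->entry.Next, NULL,
                                                      (void *)-1 ) == (void *)-1;

    /* mark the block free; if the CAS above succeeded we are the only thread that
     * may relink or release the group, everyone else sees Next == NULL */
    InterlockedOr( &group->free_bits, 1 << index );
    if (claimed) heap_release_bin_group( heap, flags, group );
}
```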
Jinoh Kang (@iamahuman) commented about dlls/ntdll/heap.c:
+{
- ULONG affinity = heap_current_thread_affinity();
- struct block *block;
- struct group *group;
- /* acquire a group, the thread will own it and no other thread can clear free bits.
* some other thread might still set the free bits if they are freeing blocks.
*/
- if (!(group = heap_acquire_bin_group( heap, flags, block_size, bin ))) return NULL;
- group->affinity = affinity;
- /* serialize with heap_free_block_lfh: atomically set GROUP_FLAG_FREE when the free bits are all 0. */
- if (group_find_free_block( group, block_size, &block ))
InterlockedAnd( &group->free_bits, ~GROUP_FLAG_FREE );
- else
InterlockedCompareExchange( &group->free_bits, GROUP_FLAG_FREE, 0 );
This is correct but a little hard to verify. Multiple isolated atomic operations update the same variable `free_bits` in sequence, and they can be concurrently interleaved with accesses from other threads. Verifying their correctness requires knowledge of _partial_ ownership, that all set (1) bits in `free_bits` are owned by `find_free_bin_block` (i.e. cannot be transitioned by `InterlockedOr`) while all unset (0) bits in `free_bits` are owned by `heap_free_block_lfh` (i.e. can be transitioned by `InterlockedOr`).
Instead, how about merging all four atomic operations into one like this?
```c
/* look up and acquire a free block using the group free_bits */
static inline LONG group_find_free_block( struct group *group, SIZE_T block_size, struct block **block )
{
    ULONG i;
    LONG old_free_bits, new_free_bits, prev_free_bits;

    prev_free_bits = ReadNoFence( &group->free_bits );
    do
    {
        old_free_bits = prev_free_bits;

        /* unset the least set bit */
        new_free_bits = old_free_bits & (old_free_bits - 1);

        /* set GROUP_FLAG_FREE if and only if other bits are 0. */
        new_free_bits &= ~GROUP_FLAG_FREE;
        if (new_free_bits == 0) new_free_bits = GROUP_FLAG_FREE;

        prev_free_bits = InterlockedCompareExchange( &group->free_bits, new_free_bits, old_free_bits );
    } while (prev_free_bits != old_free_bits);

    if (BitScanForward( &i, old_free_bits & ~GROUP_FLAG_FREE )) *block = group_get_block( group, block_size, i );
    else *block = NULL; /* heap corrupted */

    return new_free_bits;
}
```
On Mon Feb 6 15:54:47 2023 +0000, Jinoh Kang wrote:
This is correct but a little hard to verify. Multiple isolated atomic operations update the same variable `free_bits` in sequence, and they can be concurrently interleaved with accesses from other threads. Verifying their correctness requires knowledge of _partial_ ownership, that all set (1) bits in `free_bits` are owned by `find_free_bin_block` (i.e. cannot be transitioned by `InterlockedOr`) while all unset (0) bits in `free_bits` are owned by `heap_free_block_lfh` (i.e. can be transitioned by `InterlockedOr`). Instead, how about merging all four atomic operations into one like this?
/* look up and acquire a free block using the group free_bits */ static inline LONG group_find_free_block( struct group *group, SIZE_T block_size, struct block **block ) { ULONG i; LONG old_free_bits, new_free_bits, prev_free_bits; prev_free_bits = ReadNoFence( &group->free_bits ); do { old_free_bits = prev_free_bits; /* unset the least set bit */ new_free_bits = old_free_bits & (old_free_bits - 1); /* set GROUP_FLAG_FREE if and only if other bits are 0. */ new_free_bits &= ~GROUP_FLAG_FREE; if (new_free_bits == 0) new_free_bits = GROUP_FLAG_FREE; prev_free_bits = InterlockedCompareExchange( &group->free_bits, new_free_bits, old_free_bits ); } while (prev_free_bits != old_free_bits); if (BitScanForward( &i, old_free_bits & ~GROUP_FLAG_FREE )) *block = group_get_block( group, block_size, i ); else *block = NULL; /* heap corrupted */ return new_free_bits; }
(Note that this comment is no longer necessary if you're going to switch to my proposal above.)
On Mon Feb 6 16:15:26 2023 +0000, Jinoh Kang wrote:
Personally, I find the `GROUP_FLAG_FREE` flag a little unintuitive due to the following reasons:
- Spatial dependence of semantics. The `GROUP_FLAG_FREE` flag in `free_bits` indicates many things depending on the value of the entire `free_bits` bitfield:
  - If `GROUP_FLAG_FREE` is set and all other bits in `free_bits` are set, it means that the group is not being referenced anywhere else.
  - If `GROUP_FLAG_FREE` is set and some other bits in `free_bits` are unset, it means that the group is still being shared with other allocations and any subsequent or concurrent `heap_free_block_lfh` call may free the group.
  - If `GROUP_FLAG_FREE` is unset, it means that the group is owned by a thread (a stack variable), the affinity group array, or the group lookaside list.
- Temporal dependence of computation. Whether `GROUP_FLAG_FREE` is set or not is a function of not only the `free_bits` bitfield, but also the history of previous `heap_allocate_block_lfh` and `heap_free_block_lfh` calls.

Instead, I propose reusing the `group->entry.Next` field to indicate whether the group is contained in an affinity group array or the group lookaside list. The `Next` field will be interpreted as follows:
- `Next == (SLIST_ENTRY *)-1`: The group is currently unlinked from `affinity_group` and any group lookaside list. The next call to `heap_free_block_lfh` will release the group.
- `Next == NULL`: The group is owned by a thread (a stack variable), is the last element of a group lookaside list, or is referenced by `affinity_group`.
- `Next` is any other value: The group is an element of a group lookaside list.

With the above rules in mind, the algorithm will be changed as follows:
- If `find_free_bin_block` determines that the free bits are all 0 (i.e. the block is full), it sets `group->entry.Next = (void *)-1` instead of putting it back to the group list. Otherwise, it sets `group->entry.Next = NULL` before putting it back to the group list.
- `heap_free_block_lfh` calls `InterlockedCompareExchange( &group->entry.Next, NULL, (void *)-1 )` to claim ownership of the group _before_ manipulating `free_bits`. If the CAS is successful, it calls `heap_release_bin_group` _after_ setting the corresponding bit in `free_bits`.

The advantages of this approach are twofold:
- We have one extra block per group.
- Groups are immediately available for use after one of their blocks is freed.

Both contribute to a smaller memory footprint.
Hm, the approach might not be viable unless we roll our own SList, since the RTL SList operations do not perform atomic access on `entry->Next`.
On Mon Feb 6 16:19:06 2023 +0000, Jinoh Kang wrote:
Hm, the approach might not be viable unless we roll our own SList, since the RTL SList operations do not perform atomic access on `entry->Next`.
Fwiw I think this also creates another problem with when fully empty groups are released. Right now, as we wait for a group to go through a fully used -> fully freed cycle before deciding what to do, we have the option of releasing the group entirely to the subheap.
Putting groups into the linked lists earlier removes that option and I think it becomes harder to act on fully freed groups.
Putting groups into the linked lists earlier removes that option and I think it becomes harder to act on fully freed groups.
Thanks, I get it. However, this implies that the existing implementation isn't exactly free from this problem either: a group that has never been fully allocated cannot be released to the subheap even if it is now fully free.
I think this problem can be overcome by forbidding partially free groups from being added to a group list at all. We still allow partially free groups for `affinity_group`, since a group knows which `affinity_group` slot it belongs to. This also means that `heap_free_block_lfh` can never encounter a block from a group that belongs to a lookaside SList, which eliminates the RTL SList atomicity problem as well.
On Fri Feb 3 07:42:20 2023 +0000, Rémi Bernon wrote:
I've added a static affinity shuffle, which helps quite a bit. I'm also now keeping track of the group affinity, and release them to their original affinity slot. That also helps memory consumption and improves group re-use if one thread allocates and another one frees. Only allocating thread get an affinity now.
I have another idea: we can transpose the affinity group pointer array. The array will be taken out of `struct bin`, and its new type will be `struct group *affinity_group[32][BLOCK_SIZE_BIN_COUNT];`. This takes advantage of the fact that the number of bins is constant, or at least bounded.
On Tue Feb 7 11:01:28 2023 +0000, Jinoh Kang wrote:
I have another idea: we can transpose the affinity group pointer array. The array will be taken out of `struct bin`, and its new type will be `struct group *affinity_group[32][BLOCK_SIZE_BIN_COUNT];`. This takes advantage of the fact that the number of bins is constant, or at least bounded.
For example:
```c
/* LFH group cache. Holds at most 1 group per bin. */
struct group_cache
{
    struct group *bin_groups[BLOCK_SIZE_BIN_COUNT];
};

/* LFH state. Only allocated for growable heaps. */
struct lfh_state
{
    struct bin bins[BLOCK_SIZE_BIN_COUNT];
    /* each group_cache should reside in a different cache line if possible. */
    struct group_cache *parallel_cache[ARRAY_SIZE(affinity_mapping)];
};
```
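With that layout, a lookup might go affinity-first and then bin, along these lines; the `lfh_state` member and lazy allocation of the caches are assumptions of the sketch.

```c
/* hypothetical lookup with the transposed layout: one cache per affinity slot,
 * so a thread's hot group pointers for all bins live close together */
static struct group **group_cache_slot( struct lfh_state *lfh, ULONG affinity, SIZE_T bin_index )
{
    struct group_cache *cache = lfh->parallel_cache[affinity % ARRAY_SIZE(lfh->parallel_cache)];
    return cache ? &cache->bin_groups[bin_index] : NULL; /* caches may be allocated lazily */
}
```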