Currently, the free lists consist of a "small list" for block sizes up to and including 0x100 (256) bytes, which is linearly spaced, and a "large list" which is manually split into a handful of size classes (0x200, 0x400, 0x1000, and a catch-all).
This patch replaces both with a single log-linear bucketing policy, while expanding the range of sizes the free lists cover.
The old implementation performed poorly when many large allocations happened: they all went into the last catch-all bucket of the "large list", with two consequences:
1. The linked list grew in size over time, causing the search cost to skyrocket.
2. With the first-fit allocation policy, fragmentation made the problem worse.
The new bucketing covers the entire range up to the threshold where we start allocating large blocks, which never enter the free lists. It also moves the allocation policy closer to best-fit (although not exactly), reducing fragmentation.
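For illustration, here is a minimal standalone sketch of the mapping (not the patch code itself): it mirrors the patch's get_free_list_block() with 3 linear bits, works in units of BLOCK_ALIGN, and omits the clamping of oversized values.

```c
#include <stdio.h>

#define LINEAR_BITS 3
#define LINEAR_MASK ((1 << LINEAR_BITS) - 1)

/* index of the highest set bit; portable stand-in for BitScanReverse */
static unsigned int highest_bit( unsigned int v )
{
    unsigned int b = 0;
    while (v >>= 1) b++;
    return b;
}

/* map a block size (in units of BLOCK_ALIGN) to its free list bucket */
static unsigned int bucket_index( unsigned int bsize )
{
    unsigned int b = highest_bit( bsize );
    if (b < LINEAR_BITS) return bsize; /* small sizes: one bucket per size */
    /* keep the LINEAR_BITS bits below the leading one as a linear scale */
    return ((b - LINEAR_BITS + 1) << LINEAR_BITS)
           + ((bsize >> (b - LINEAR_BITS)) & LINEAR_MASK);
}

int main( void )
{
    /* sizes up to 16 map one-to-one; above that they round down:
     * 17 -> bucket for 16, 33 -> bucket for 32, 129 -> bucket for 128 */
    unsigned int sizes[] = { 1, 8, 9, 16, 17, 18, 32, 33, 36, 128, 129 };
    unsigned int i;
    for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
        printf( "bsize %3u -> bucket %u\n", sizes[i], bucket_index( sizes[i] ) );
    return 0;
}
```

Sizes up to 16 units get one bucket each; above that, every power-of-two range is split into 8 linearly spaced buckets, and in-between sizes round down to the nearest bucket boundary.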
The increased number of free lists does incur some cost when empty lists have to be skipped over during a search, but the improvement in allocation performance outweighs it.
For future work, these ideas (mostly from glibc) might or might not benefit performance:
- Use an exact best-fit allocation policy.
- Add a bitmap over the free lists, allowing empty lists to be skipped with a single bit scan (see the sketch below).
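A rough sketch of what such a bitmap could look like (hypothetical, not part of this patch; the names are illustrative, and it assumes at most 128 free lists tracked in two 64-bit words):

```c
typedef struct
{
    unsigned long long bits[2];  /* one bit per free list; set = non-empty */
} freelist_map;

static void map_set( freelist_map *map, unsigned int i )
{
    map->bits[i / 64] |= 1ULL << (i % 64);     /* list i became non-empty */
}

static void map_clear( freelist_map *map, unsigned int i )
{
    map->bits[i / 64] &= ~(1ULL << (i % 64));  /* list i became empty */
}

/* find the first non-empty list with index >= from, or -1 if none;
 * assumes from < 128; __builtin_ctzll is the gcc/clang intrinsic */
static int first_nonempty( const freelist_map *map, unsigned int from )
{
    unsigned int w = from / 64;
    unsigned long long word = map->bits[w] & (~0ULL << (from % 64));

    while (!word && ++w < 2) word = map->bits[w];
    if (!word) return -1;
    return w * 64 + __builtin_ctzll( word );
}
```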
Signed-off-by: Tatsuyuki Ishi <ishitatsuyuki@gmail.com>
-- v2: ntdll: Use log-linear bucketing for free lists.
From: Tatsuyuki Ishi <ishitatsuyuki@gmail.com>
Signed-off-by: Tatsuyuki Ishi <ishitatsuyuki@gmail.com>
---
 dlls/kernel32/tests/heap.c |   2 -
 dlls/ntdll/heap.c          | 102 ++++++++++++++++++++++++-------------
 2 files changed, 66 insertions(+), 38 deletions(-)

```diff
diff --git a/dlls/kernel32/tests/heap.c b/dlls/kernel32/tests/heap.c
index b0b56132393..d2742b55495 100644
--- a/dlls/kernel32/tests/heap.c
+++ b/dlls/kernel32/tests/heap.c
@@ -452,9 +452,7 @@ static void test_HeapCreate(void)
 
     SetLastError( 0xdeadbeef );
     ptr1 = HeapAlloc( heap, 0, alloc_size - (0x200 + 0x80 * sizeof(void *)) );
-    todo_wine
     ok( !ptr1, "HeapAlloc succeeded\n" );
-    todo_wine
     ok( GetLastError() == ERROR_NOT_ENOUGH_MEMORY, "got error %lu\n", GetLastError() );
     ret = HeapFree( heap, 0, ptr1 );
     ok( ret, "HeapFree failed, error %lu\n", GetLastError() );
diff --git a/dlls/ntdll/heap.c b/dlls/ntdll/heap.c
index 6688fab9690..9d58b410207 100644
--- a/dlls/ntdll/heap.c
+++ b/dlls/ntdll/heap.c
@@ -146,7 +146,8 @@ C_ASSERT( sizeof(ARENA_LARGE) == 4 * BLOCK_ALIGN );
 
 #define ROUND_ADDR(addr, mask) ((void *)((UINT_PTR)(addr) & ~(UINT_PTR)(mask)))
 #define ROUND_SIZE(size, mask) ((((SIZE_T)(size) + (mask)) & ~(SIZE_T)(mask)))
-#define FIELD_MAX(type, field) (((SIZE_T)1 << (sizeof(((type *)0)->field) * 8)) - 1)
+#define FIELD_BITS(type, field) (sizeof(((type *)0)->field) * 8)
+#define FIELD_MAX(type, field) (((SIZE_T)1 << FIELD_BITS(type, field)) - 1)
 
 #define HEAP_MIN_BLOCK_SIZE  ROUND_SIZE(sizeof(struct entry) + BLOCK_ALIGN, BLOCK_ALIGN - 1)
 
@@ -168,17 +169,10 @@ C_ASSERT( HEAP_MAX_FREE_BLOCK_SIZE >= HEAP_MAX_BLOCK_REGION_SIZE );
 /* minimum size to start allocating large blocks */
 #define HEAP_MIN_LARGE_BLOCK_SIZE  (HEAP_MAX_USED_BLOCK_SIZE - 0x1000)
 
-/* There will be a free list bucket for every arena size up to and including this value */
-#define HEAP_MAX_SMALL_FREE_LIST 0x100
-C_ASSERT( HEAP_MAX_SMALL_FREE_LIST % BLOCK_ALIGN == 0 );
-#define HEAP_NB_SMALL_FREE_LISTS (((HEAP_MAX_SMALL_FREE_LIST - HEAP_MIN_BLOCK_SIZE) / BLOCK_ALIGN) + 1)
-
-/* Max size of the blocks on the free lists above HEAP_MAX_SMALL_FREE_LIST */
-static const SIZE_T free_list_sizes[] =
-{
-    0x200, 0x400, 0x1000, ~(SIZE_T)0
-};
-#define HEAP_NB_FREE_LISTS (ARRAY_SIZE(free_list_sizes) + HEAP_NB_SMALL_FREE_LISTS)
+#define FREE_LIST_LINEAR_BITS 3
+#define FREE_LIST_LINEAR_MASK ((1 << FREE_LIST_LINEAR_BITS) - 1)
+#define FREE_LIST_MAX_LOG FIELD_BITS( struct block, block_size )
+#define NB_FREE_LISTS ((FREE_LIST_MAX_LOG - FREE_LIST_LINEAR_BITS + 1) * (1 << FREE_LIST_LINEAR_BITS) + 1)
 
 typedef struct DECLSPEC_ALIGN(BLOCK_ALIGN) tagSUBHEAP
 {
@@ -304,7 +298,7 @@ struct heap
     DWORD            pending_pos;   /* Position in pending free requests ring */
     struct block   **pending_free;  /* Ring buffer for pending free requests */
     RTL_CRITICAL_SECTION cs;
-    struct entry     free_lists[HEAP_NB_FREE_LISTS];
+    struct entry     free_lists[NB_FREE_LISTS];
     struct bin      *bins;
     SUBHEAP          subheap;
 };
@@ -567,23 +561,6 @@ static void valgrind_notify_free_all( SUBHEAP *subheap, const struct heap *heap
 #endif
 }
 
-/* locate a free list entry of the appropriate size */
-/* size is the size of the whole block including the arena header */
-static inline struct entry *find_free_list( struct heap *heap, SIZE_T block_size, BOOL last )
-{
-    struct entry *list, *end = heap->free_lists + ARRAY_SIZE(heap->free_lists);
-    unsigned int i;
-
-    if (block_size <= HEAP_MAX_SMALL_FREE_LIST)
-        i = (block_size - HEAP_MIN_BLOCK_SIZE) / BLOCK_ALIGN;
-    else for (i = HEAP_NB_SMALL_FREE_LISTS; i < HEAP_NB_FREE_LISTS - 1; i++)
-        if (block_size <= free_list_sizes[i - HEAP_NB_SMALL_FREE_LISTS]) break;
-
-    list = heap->free_lists + i;
-    if (last && ++list == end) list = heap->free_lists;
-    return list;
-}
-
 /* get the memory protection type to use for a given heap */
 static inline ULONG get_protection_type( DWORD flags )
 {
@@ -622,10 +599,63 @@ static void heap_set_status( const struct heap *heap, ULONG flags, NTSTATUS stat
     if (status) RtlSetLastWin32ErrorAndNtStatusFromNtStatus( status );
 }
 
-static size_t get_free_list_block_size( unsigned int index )
+/*
+ * Given a size, return its index in the block size list for freelists.
+ *
+ * With FREE_LIST_LINEAR_BITS=3, the list looks like this
+ * (with respect to size / BLOCK_ALIGN):
+ * 0,
+ * 1, 2, 3, 4, 5, 6, 7, 8,
+ * 9, 10, 11, 12, 13, 14, 15, 16,
+ * 18, 20, 22, 24, 26, 28, 30, 32,
+ * 36, 40, 44, 48, 52, 56, 60, 64,
+ * 72, 80, 88, 96, 104, 112, 120, 128,
+ * ...
+ */
+static unsigned long get_free_list_block( size_t size )
 {
-    if (index < HEAP_NB_SMALL_FREE_LISTS) return HEAP_MIN_BLOCK_SIZE + index * BLOCK_ALIGN;
-    return free_list_sizes[index - HEAP_NB_SMALL_FREE_LISTS];
+    size_t bsize = size / BLOCK_ALIGN;
+    DWORD b;
+    unsigned long block, log, linear;
+
+    if ( bsize > UINT_MAX )
+        return NB_FREE_LISTS - 1;
+
+    /* find the highest bit */
+    if ( !BitScanReverse( &b, bsize ) || b < FREE_LIST_LINEAR_BITS )
+        return bsize;
+
+    /* use the top FREE_LIST_LINEAR_BITS + 1 bits to determine block index */
+    /* the topmost bit is always 1 and discarded */
+    linear = ( bsize >> ( b - FREE_LIST_LINEAR_BITS ) ) & FREE_LIST_LINEAR_MASK;
+
+    log = b - FREE_LIST_LINEAR_BITS + 1;
+    block = ( log << FREE_LIST_LINEAR_BITS ) + linear;
+
+    return min( block, NB_FREE_LISTS - 1 );
+}
+
+static size_t get_free_list_block_size( unsigned long index )
+{
+    unsigned long log = 1 << FREE_LIST_LINEAR_BITS;
+    unsigned long linear = index & FREE_LIST_LINEAR_MASK;
+
+    if ( ( index >> FREE_LIST_LINEAR_BITS ) == 0 )
+        return index * BLOCK_ALIGN;
+
+    return ( ( log + linear ) << ( ( index >> FREE_LIST_LINEAR_BITS ) - 1 ) ) * BLOCK_ALIGN;
+}
+
+/* locate a free list entry of the appropriate size */
+/* size is the size of the whole block including the arena header */
+static inline struct entry *find_free_list( struct heap *heap, SIZE_T block_size, BOOL last )
+{
+    unsigned long block = get_free_list_block( block_size );
+
+    if ( last && ++block == NB_FREE_LISTS )
+        block = 0;
+
+    return &heap->free_lists[block];
 }
 
 static void heap_dump( const struct heap *heap )
@@ -649,7 +679,7 @@ static void heap_dump( const struct heap *heap )
     }
 
     TRACE( "  free_lists: %p\n", heap->free_lists );
-    for (i = 0; i < HEAP_NB_FREE_LISTS; i++)
+    for (i = 0; i < NB_FREE_LISTS; i++)
         TRACE( "    %p: size %#8Ix, prev %p, next %p\n", heap->free_lists + i, get_free_list_block_size( i ),
                LIST_ENTRY( heap->free_lists[i].entry.prev, struct entry, entry ),
                LIST_ENTRY( heap->free_lists[i].entry.next, struct entry, entry ) );
@@ -1124,7 +1154,7 @@ static BOOL is_valid_free_block( const struct heap *heap, const struct block *bl
     unsigned int i;
 
     if ((subheap = find_subheap( heap, block, FALSE ))) return TRUE;
-    for (i = 0; i < HEAP_NB_FREE_LISTS; i++) if (block == &heap->free_lists[i].block) return TRUE;
+    for (i = 0; i < NB_FREE_LISTS; i++) if (block == &heap->free_lists[i].block) return TRUE;
     return FALSE;
 }
 
@@ -1508,7 +1538,7 @@ HANDLE WINAPI RtlCreateHeap( ULONG flags, void *addr, SIZE_T total_size, SIZE_T
     list_init( &heap->large_list );
 
     list_init( &heap->free_lists[0].entry );
-    for (i = 0, entry = heap->free_lists; i < HEAP_NB_FREE_LISTS; i++, entry++)
+    for (i = 0, entry = heap->free_lists; i < NB_FREE_LISTS; i++, entry++)
     {
         block_set_flags( &entry->block, ~0, BLOCK_FLAG_FREE_LINK );
         block_set_size( &entry->block, 0 );
```
Hi,
It looks like your patch introduced the new failures shown below. Please investigate and fix them before resubmitting your patch. If they are not new, fixing them anyway would help a lot. Otherwise please ask for the known failures list to be updated.
The tests also ran into some preexisting test failures. If you know how to fix them that would be helpful. See the TestBot job for the details:
The full results can be found at: https://testbot.winehq.org/JobDetails.pl?Key=131716
Your paranoid android.
=== debian11b (64 bit WoW report) ===
kernel32:
heap.c:446: Test failed: HeapAlloc failed, error 8
Unhandled exception: page fault on read access to 0xfffffffffffffffe in 64-bit code (0x0000017002b0f0).
Jinoh Kang (@iamahuman) commented about dlls/ntdll/heap.c:
```diff
-static size_t get_free_list_block_size( unsigned int index )
+/*
+ * Given a size, return its index in the block size list for freelists.
+ *
+ * With FREE_LIST_LINEAR_BITS=3, the list looks like this
+ * (with respect to size / BLOCK_ALIGN):
+ * 0,
+ * 1, 2, 3, 4, 5, 6, 7, 8,
+ * 9, 10, 11, 12, 13, 14, 15, 16,
+ * 18, 20, 22, 24, 26, 28, 30, 32,
+ * 36, 40, 44, 48, 52, 56, 60, 64,
+ * 72, 80, 88, 96, 104, 112, 120, 128,
+ * ...
+ */
+static unsigned long get_free_list_block( size_t size )
```
Wine tries to avoid `unsigned long`. `long` differs in width between the LP64 model (64-bit Unix side, where `long` is 64-bit) and the LLP64 model (64-bit PE code, where `long` is 32-bit), which makes it confusing[^aj-ulong] and incompatible with `-mno-cygwin` builds.
```suggestion:-0+0
static unsigned int get_free_list_block( size_t size )
```

On the other hand, `ULONG` is guaranteed to mean an unsigned 32-bit integer and is thus fine to use:
```suggestion:-0+0
static ULONG get_free_list_block( size_t size )
```
Ditto for other `unsigned long` uses.
[^aj-ulong]: See https://www.winehq.org/mailman3/hyperkitty/list/wine-devel@winehq.org/message/N6NC2HPTJKFPKQHZMCKT6QZ3XIXYS22C/.
Jinoh Kang (@iamahuman) commented about dlls/ntdll/heap.c:
```diff
+ * 72, 80, 88, 96, 104, 112, 120, 128,
+ * ...
+ */
+static unsigned long get_free_list_block( size_t size )
 {
-    if (index < HEAP_NB_SMALL_FREE_LISTS) return HEAP_MIN_BLOCK_SIZE + index * BLOCK_ALIGN;
-    return free_list_sizes[index - HEAP_NB_SMALL_FREE_LISTS];
+    size_t bsize = size / BLOCK_ALIGN;
+    DWORD b;
+    unsigned long block, log, linear;
+
+    if ( bsize > UINT_MAX )
+        return NB_FREE_LISTS - 1;
+
+    /* find the highest bit */
+    if ( !BitScanReverse( &b, bsize ) || b < FREE_LIST_LINEAR_BITS )
```
How about shifting `bsize` right by `FREE_LIST_LINEAR_BITS` before passing it to `BitScanReverse`? That would eliminate the extra comparison as well as the subtraction in `b - FREE_LIST_LINEAR_BITS`.
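If I understand correctly, the hot path would then become something like this (an untested sketch of the proposed change, not the patch as submitted):

```c
    /* scanning bsize >> FREE_LIST_LINEAR_BITS fails for all sizes in the
     * linear region, folding the b < FREE_LIST_LINEAR_BITS comparison into
     * the BitScanReverse failure case and dropping the subtractions */
    if (!BitScanReverse( &b, bsize >> FREE_LIST_LINEAR_BITS ))
        return bsize;

    linear = (bsize >> b) & FREE_LIST_LINEAR_MASK;
    block = ((b + 1) << FREE_LIST_LINEAR_BITS) + linear;

    return min( block, NB_FREE_LISTS - 1 );
```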
Rémi Bernon (@rbernon) commented about dlls/ntdll/heap.c:
```diff
-/* There will be a free list bucket for every arena size up to and including this value */
-#define HEAP_MAX_SMALL_FREE_LIST 0x100
-C_ASSERT( HEAP_MAX_SMALL_FREE_LIST % BLOCK_ALIGN == 0 );
-#define HEAP_NB_SMALL_FREE_LISTS (((HEAP_MAX_SMALL_FREE_LIST - HEAP_MIN_BLOCK_SIZE) / BLOCK_ALIGN) + 1)
-
-/* Max size of the blocks on the free lists above HEAP_MAX_SMALL_FREE_LIST */
-static const SIZE_T free_list_sizes[] =
-{
-    0x200, 0x400, 0x1000, ~(SIZE_T)0
-};
-#define HEAP_NB_FREE_LISTS (ARRAY_SIZE(free_list_sizes) + HEAP_NB_SMALL_FREE_LISTS)
+#define FREE_LIST_LINEAR_BITS 3
+#define FREE_LIST_LINEAR_MASK ((1 << FREE_LIST_LINEAR_BITS) - 1)
+#define FREE_LIST_MAX_LOG FIELD_BITS( struct block, block_size )
+#define NB_FREE_LISTS ((FREE_LIST_MAX_LOG - FREE_LIST_LINEAR_BITS + 1) * (1 << FREE_LIST_LINEAR_BITS) + 1)
```
```suggestion:-4+0
#define FREE_LIST_LINEAR_BITS 3
#define FREE_LIST_LINEAR_MASK ((1 << FREE_LIST_LINEAR_BITS) - 1)
#define FREE_LIST_COUNT ((FIELD_BITS( struct block, block_size ) - FREE_LIST_LINEAR_BITS + 1) * (1 << FREE_LIST_LINEAR_BITS) + 1)
C_ASSERT( FREE_LIST_COUNT <= 0x80 );
```
Something like that: I'd use a common `FREE_LIST_` prefix for all the related defines, remove `FREE_LIST_MAX_LOG`, which isn't very useful, and add a `C_ASSERT` to better show the actual number of free lists.
Note that the induced heap size increase apparently makes some tests fail. Using only 2 bits seems to be an easy fix, as it reduces the number of free lists to ~64, but I don't know if that's still good enough?
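For reference, a quick check of the counts implied by the formula above, assuming `block_size` is a 16-bit field (so `FIELD_BITS( struct block, block_size ) == 16`):

```c
/* (field bits - linear bits + 1) log classes, each split into
 * 2^linear_bits linear steps, plus one extra list for index 0 */
#define COUNT(linear_bits) ((16 - (linear_bits) + 1) * (1 << (linear_bits)) + 1)

C_ASSERT( COUNT(3) == 113 ); /* FREE_LIST_LINEAR_BITS == 3 */
C_ASSERT( COUNT(2) == 61 );  /* FREE_LIST_LINEAR_BITS == 2, i.e. ~64 */
```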
Rémi Bernon (@rbernon) commented about dlls/ntdll/heap.c:
```diff
+        return index * BLOCK_ALIGN;
+
+    return ( ( log + linear ) << ( ( index >> FREE_LIST_LINEAR_BITS ) - 1 ) ) * BLOCK_ALIGN;
+}
+
+/* locate a free list entry of the appropriate size */
+/* size is the size of the whole block including the arena header */
+static inline struct entry *find_free_list( struct heap *heap, SIZE_T block_size, BOOL last )
+{
+    unsigned long block = get_free_list_block( block_size );
+
+    if ( last && ++block == NB_FREE_LISTS )
+        block = 0;
+
+    return &heap->free_lists[block];
 }
```
```suggestion:-10+0
/* locate a free list entry of the appropriate size */
static inline struct entry *find_free_list( struct heap *heap, SIZE_T block_size, BOOL last )
{
    UINT index = get_free_list_index( block_size );
    if (last && ++index == FREE_LIST_COUNT) index = 0;
    return &heap->free_lists[index];
}
```
I think we can remove the block_size comments at this point: `block_size` is used consistently now and implies that the size includes the block header.
I'd also prefer to use "index" rather than "block", which already has a different meaning.
Rémi Bernon (@rbernon) commented about dlls/ntdll/heap.c:
```diff
+    log = b - FREE_LIST_LINEAR_BITS + 1;
+    block = ( log << FREE_LIST_LINEAR_BITS ) + linear;
+
+    return min( block, NB_FREE_LISTS - 1 );
+}
+
+static size_t get_free_list_block_size( unsigned long index )
+{
+    unsigned long log = 1 << FREE_LIST_LINEAR_BITS;
+    unsigned long linear = index & FREE_LIST_LINEAR_MASK;
+
+    if ( ( index >> FREE_LIST_LINEAR_BITS ) == 0 )
+        return index * BLOCK_ALIGN;
+
+    return ( ( log + linear ) << ( ( index >> FREE_LIST_LINEAR_BITS ) - 1 ) ) * BLOCK_ALIGN;
+}
```
```suggestion:-32+0
static SIZE_T get_free_list_block_size( UINT index )
{
    UINT linear = index & FREE_LIST_LINEAR_MASK, shift = index >> FREE_LIST_LINEAR_BITS;
    if (shift) linear = (linear + (1 << FREE_LIST_LINEAR_BITS)) << (shift - 1);
    return linear * BLOCK_ALIGN;
}

static UINT get_free_list_index( SIZE_T block_size )
{
    UINT linear, shift;
    DWORD bit;

    if (block_size >= get_free_list_block_size( FREE_LIST_COUNT - 1 )) return FREE_LIST_COUNT - 1;

    block_size /= BLOCK_ALIGN;
    if (!BitScanReverse( &bit, block_size ) || bit < FREE_LIST_LINEAR_BITS) return block_size;

    /* the highest bit is always set, ignore it and encode the next FREE_LIST_LINEAR_BITS bits
     * as a linear scale, combined with the shift as a log scale, in the free list index. */
    shift = bit - FREE_LIST_LINEAR_BITS + 1;
    linear = (block_size - (1 << bit)) >> (shift - 1);

    return (shift << FREE_LIST_LINEAR_BITS) + (linear & FREE_LIST_LINEAR_MASK);
}
```
We can use `get_free_list_block_size` to reject the large cases early, and make the rest a little bit simpler.
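As a sanity check for this suggested pair, a hypothetical test loop (not part of the MR) could verify the round trip: each list's nominal size maps back to its own index, and sizes just below the next list's size round down:

```c
#include <assert.h>

static void check_free_list_round_trip( void )
{
    UINT i;
    for (i = 0; i < FREE_LIST_COUNT; i++)
    {
        /* each list's nominal size maps back to its own index */
        assert( get_free_list_index( get_free_list_block_size( i ) ) == i );
        /* anything below the next list's size rounds down to this index */
        if (i + 1 < FREE_LIST_COUNT)
            assert( get_free_list_index( get_free_list_block_size( i + 1 ) - BLOCK_ALIGN ) == i );
    }
}
```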
Also note that the style here has spaces only inside function and macro parentheses. And as @iamahuman suggested, the surrounding code uses Win32 types, so prefer `ULONG` / `UINT` over `unsigned long`, and `SIZE_T` instead of `size_t`.
I know, after discussing this elsewhere, that this is supposed to improve shader compilation performance in Overwatch 2 even more, but if you have some actual numbers it would be nice to mention them here.