Signed-off-by: Jinoh Kang jinoh.kang.kr@gmail.com --- dlls/win32u/Makefile.in | 1 + dlls/win32u/alloc.c | 173 ++++++++++++++++++++++++++++++++++++ dlls/win32u/ntgdi_private.h | 4 + 3 files changed, 178 insertions(+) create mode 100644 dlls/win32u/alloc.c
diff --git a/dlls/win32u/Makefile.in b/dlls/win32u/Makefile.in index 2dea3682064..6c7eb383553 100644 --- a/dlls/win32u/Makefile.in +++ b/dlls/win32u/Makefile.in @@ -9,6 +9,7 @@ IMPORTS = ntdll winecrt0 EXTRADLLFLAGS = -nodefaultlibs -Wb,--syscall-table,1
C_SRCS = \ + alloc.c \ bitblt.c \ bitmap.c \ brush.c \ diff --git a/dlls/win32u/alloc.c b/dlls/win32u/alloc.c new file mode 100644 index 00000000000..350f0ea0c87 --- /dev/null +++ b/dlls/win32u/alloc.c @@ -0,0 +1,173 @@ +/* + * simple freelist cache allocator + * + * Copyright 2022 Jinoh Kang + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA + */ + +#if 0 +#pragma makedep unix +#endif + +#include <stdarg.h> +#include <stdlib.h> +#include <string.h> +#include <assert.h> +#include <pthread.h> + +#include "windef.h" +#include "winbase.h" +#include "wingdi.h" +#include "ntgdi_private.h" +#include "wine/debug.h" + +WINE_DEFAULT_DEBUG_CHANNEL(gdi); + + +#define MEM_CACHE_NR_BUCKETS 1024 +#define MEM_CACHE_MIN_SIZE 16384 +#define MEM_CACHE_STEP 4096 +#ifdef _WIN64 +#define MEM_CACHE_THRESHOLD (32UL << 20) /* 32MB */ +#else +#define MEM_CACHE_THRESHOLD ( 8UL << 20) /* 8MB */ +#endif + +static pthread_mutex_t mem_cache_lock = PTHREAD_MUTEX_INITIALIZER; +static void **mem_cache_buckets[MEM_CACHE_NR_BUCKETS]; +static SIZE_T mem_cache_total_size; + +static SIZE_T get_bucket_index( SIZE_T size ) +{ + if (size < MEM_CACHE_MIN_SIZE) return (SIZE_T)-1; + return (size - MEM_CACHE_MIN_SIZE + MEM_CACHE_STEP - 1) / MEM_CACHE_STEP; +} + +static SIZE_T get_bucket_chunk_size( SIZE_T index ) +{ + return index * MEM_CACHE_STEP + MEM_CACHE_MIN_SIZE; +} + +static void *bucket_pop_chunk( SIZE_T i ) +{ + SIZE_T real_size = get_bucket_chunk_size( i ); + void *mem; + + if (!(mem = mem_cache_buckets[i])) + return NULL; + + assert(mem_cache_total_size >= real_size); + mem_cache_buckets[i] = *(void **)mem; + mem_cache_total_size -= real_size; + return mem; +} + +static void bucket_push_chunk( SIZE_T i, void *mem ) +{ + SIZE_T real_size = get_bucket_chunk_size( i ); + + mem_cache_total_size += real_size; + *(void **)mem = mem_cache_buckets[i]; + mem_cache_buckets[i] = mem; +} + +void *alloc_gdi_cache_memory( SIZE_T size, BOOL zero_mem ) +{ + SIZE_T i, real_size = size; + SIZE_T real_bucket_i; + void *mem = NULL; + + i = get_bucket_index( real_size ); + if (i < MEM_CACHE_NR_BUCKETS) + { + real_size = get_bucket_chunk_size( i ); + assert(real_size >= size); + + pthread_mutex_lock( &mem_cache_lock ); + + real_bucket_i = i; + while (!(mem = bucket_pop_chunk( real_bucket_i ))) + { + if (++real_bucket_i >= MEM_CACHE_NR_BUCKETS) break; + } + + pthread_mutex_unlock( &mem_cache_lock ); + + if (mem) + { + if (i != real_bucket_i) + { + void *realloc_mem = realloc( mem, real_size ); + if (realloc_mem) + mem = realloc_mem; + } + if (zero_mem) + memset(mem, 0, size); + } + else TRACE("no cache for %lu\n", real_size); + } + + if (!mem) + { + if (zero_mem) + mem = calloc( 1, real_size ); + else + mem = malloc( real_size ); + } + + return mem; +} + +void free_gdi_cache_memory( void *mem, SIZE_T size ) +{ + SIZE_T i, real_size; + SIZE_T free_bucket_i; + + i = get_bucket_index( size ); + if (i < MEM_CACHE_NR_BUCKETS) + { + real_size = get_bucket_chunk_size( i ); + + pthread_mutex_lock( &mem_cache_lock ); + + free_bucket_i = 0; + while (real_size > MEM_CACHE_THRESHOLD - mem_cache_total_size) + { + void *old_mem = bucket_pop_chunk( free_bucket_i ); + if (old_mem) + { + TRACE("pop cache %p (%lu)\n", old_mem, real_size); + free( old_mem ); + } + if (++free_bucket_i >= MEM_CACHE_NR_BUCKETS) break; + } + + if (real_size <= MEM_CACHE_THRESHOLD - mem_cache_total_size) + { + bucket_push_chunk( i, mem ); + mem = NULL; + } + else + { + TRACE("discard memory %p (%lu)\n", mem, real_size); + } + + pthread_mutex_unlock( &mem_cache_lock ); + } + + if (mem) + free( mem ); +} diff --git a/dlls/win32u/ntgdi_private.h b/dlls/win32u/ntgdi_private.h index 75cddc00fb3..5e5f6041865 100644 --- a/dlls/win32u/ntgdi_private.h +++ b/dlls/win32u/ntgdi_private.h @@ -427,6 +427,10 @@ extern HRGN create_polypolygon_region( const POINT *pts, const INT *count, INT n extern BOOL delete_dce( struct dce *dce ) DECLSPEC_HIDDEN; extern void update_dc( DC *dc ) DECLSPEC_HIDDEN;
+/* alloc.c */ +extern void *alloc_gdi_cache_memory( SIZE_T size, BOOL zero_mem ) DECLSPEC_HIDDEN; +extern void free_gdi_cache_memory( void *mem, SIZE_T size ) DECLSPEC_HIDDEN; + #define RGN_DEFAULT_RECTS 4 typedef struct {
Signed-off-by: Jinoh Kang jinoh.kang.kr@gmail.com --- dlls/win32u/bitmap.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-)
diff --git a/dlls/win32u/bitmap.c b/dlls/win32u/bitmap.c index 2be88715948..dd0dfce4069 100644 --- a/dlls/win32u/bitmap.c +++ b/dlls/win32u/bitmap.c @@ -159,7 +159,7 @@ HBITMAP WINAPI NtGdiCreateBitmap( INT width, INT height, UINT planes, bmpobj->dib.dsBm.bmWidthBytes = get_bitmap_stride( width, bpp ); bmpobj->dib.dsBm.bmPlanes = planes; bmpobj->dib.dsBm.bmBitsPixel = bpp; - bmpobj->dib.dsBm.bmBits = calloc( 1, size ); + bmpobj->dib.dsBm.bmBits = alloc_gdi_cache_memory( size, TRUE ); if (!bmpobj->dib.dsBm.bmBits) { free( bmpobj ); @@ -169,7 +169,7 @@ HBITMAP WINAPI NtGdiCreateBitmap( INT width, INT height, UINT planes,
if (!(hbitmap = alloc_gdi_handle( &bmpobj->obj, NTGDI_OBJ_BITMAP, &bitmap_funcs ))) { - free( bmpobj->dib.dsBm.bmBits ); + free_gdi_cache_memory( bmpobj->dib.dsBm.bmBits, size ); free( bmpobj ); return 0; } @@ -444,9 +444,15 @@ HGDIOBJ WINAPI NtGdiSelectBitmap( HDC hdc, HGDIOBJ handle ) static BOOL BITMAP_DeleteObject( HGDIOBJ handle ) { BITMAPOBJ *bmp = free_gdi_handle( handle ); + INT dib_stride; + SIZE_T size;
if (!bmp) return FALSE; - free( bmp->dib.dsBm.bmBits ); + + dib_stride = get_dib_stride( bmp->dib.dsBm.bmWidth, bmp->dib.dsBm.bmBitsPixel ); + size = dib_stride * bmp->dib.dsBm.bmHeight; + + free_gdi_cache_memory( bmp->dib.dsBm.bmBits, size ); free( bmp ); return TRUE; }
Signed-off-by: Jinoh Kang jinoh.kang.kr@gmail.com --- dlls/win32u/bitblt.c | 19 +++++++++++++++++++ dlls/win32u/ntgdi_private.h | 4 ++++ 2 files changed, 23 insertions(+)
diff --git a/dlls/win32u/bitblt.c b/dlls/win32u/bitblt.c index d7bbd353b2d..44bee3ef80a 100644 --- a/dlls/win32u/bitblt.c +++ b/dlls/win32u/bitblt.c @@ -168,6 +168,25 @@ void CDECL free_heap_bits( struct gdi_image_bits *bits ) free( bits->ptr ); }
+static void CDECL free_gdi_cache_bits( struct gdi_image_bits *bits ) +{ + free_gdi_cache_memory( bits->ptr, (SIZE_T)bits->param ); +} + +BOOL alloc_gdi_cache_bits( struct gdi_image_bits *bits, SIZE_T size, int flags ) +{ + void *mem = alloc_gdi_cache_memory( size, flags & ALLOC_ZERO_MEMORY ); + + if (!mem) + return FALSE; + + bits->ptr = mem; + bits->is_copy = !!(flags & ALLOC_IS_COPY); + bits->free = free_gdi_cache_bits; + bits->param = (void *)size; + return TRUE; +} + DWORD convert_bits( const BITMAPINFO *src_info, struct bitblt_coords *src, BITMAPINFO *dst_info, struct gdi_image_bits *bits ) { diff --git a/dlls/win32u/ntgdi_private.h b/dlls/win32u/ntgdi_private.h index 5e5f6041865..bcd9a55785c 100644 --- a/dlls/win32u/ntgdi_private.h +++ b/dlls/win32u/ntgdi_private.h @@ -676,6 +676,10 @@ static inline void copy_bitmapinfo( BITMAPINFO *dst, const BITMAPINFO *src ) memcpy( dst, src, get_dib_info_size( src, DIB_RGB_COLORS )); }
+#define ALLOC_ZERO_MEMORY 1 +#define ALLOC_IS_COPY 2 + +extern BOOL alloc_gdi_cache_bits( struct gdi_image_bits *bits, SIZE_T size, int flags ) DECLSPEC_HIDDEN; extern void CDECL free_heap_bits( struct gdi_image_bits *bits ) DECLSPEC_HIDDEN;
void set_gdi_client_ptr( HGDIOBJ handle, void *ptr ) DECLSPEC_HIDDEN;
Signed-off-by: Jinoh Kang jinoh.kang.kr@gmail.com --- dlls/win32u/bitblt.c | 34 +++++++++++++--------------------- dlls/win32u/bitmap.c | 4 +--- dlls/win32u/brush.c | 3 +-- dlls/win32u/dib.c | 12 ++++++------ dlls/win32u/dibdrv/bitblt.c | 18 +++++------------- dlls/win32u/dibdrv/objects.c | 5 ++--- dlls/win32u/font.c | 25 +++++++------------------ 7 files changed, 35 insertions(+), 66 deletions(-)
diff --git a/dlls/win32u/bitblt.c b/dlls/win32u/bitblt.c index 44bee3ef80a..d9abe243b5a 100644 --- a/dlls/win32u/bitblt.c +++ b/dlls/win32u/bitblt.c @@ -190,7 +190,7 @@ BOOL alloc_gdi_cache_bits( struct gdi_image_bits *bits, SIZE_T size, int flags ) DWORD convert_bits( const BITMAPINFO *src_info, struct bitblt_coords *src, BITMAPINFO *dst_info, struct gdi_image_bits *bits ) { - void *ptr; + struct gdi_image_bits new_bits; DWORD err; BOOL top_down = dst_info->bmiHeader.biHeight < 0;
@@ -199,14 +199,12 @@ DWORD convert_bits( const BITMAPINFO *src_info, struct bitblt_coords *src, dst_info->bmiHeader.biSizeImage = get_dib_image_size( dst_info ); if (top_down) dst_info->bmiHeader.biHeight = -dst_info->bmiHeader.biHeight;
- if (!(ptr = malloc( dst_info->bmiHeader.biSizeImage ))) + if (!alloc_gdi_cache_bits( &new_bits, dst_info->bmiHeader.biSizeImage, ALLOC_IS_COPY )) return ERROR_OUTOFMEMORY;
- err = convert_bitmapinfo( src_info, bits->ptr, src, dst_info, ptr ); + err = convert_bitmapinfo( src_info, bits->ptr, src, dst_info, new_bits.ptr ); if (bits->free) bits->free( bits ); - bits->ptr = ptr; - bits->is_copy = TRUE; - bits->free = free_heap_bits; + *bits = new_bits; return err; }
@@ -214,7 +212,7 @@ DWORD stretch_bits( const BITMAPINFO *src_info, struct bitblt_coords *src, BITMAPINFO *dst_info, struct bitblt_coords *dst, struct gdi_image_bits *bits, int mode ) { - void *ptr; + struct gdi_image_bits new_bits; DWORD err;
dst_info->bmiHeader.biWidth = dst->visrect.right - dst->visrect.left; @@ -222,14 +220,12 @@ DWORD stretch_bits( const BITMAPINFO *src_info, struct bitblt_coords *src, dst_info->bmiHeader.biSizeImage = get_dib_image_size( dst_info );
if (src_info->bmiHeader.biHeight < 0) dst_info->bmiHeader.biHeight = -dst_info->bmiHeader.biHeight; - if (!(ptr = malloc( dst_info->bmiHeader.biSizeImage ))) + if (!alloc_gdi_cache_bits( &new_bits, dst_info->bmiHeader.biSizeImage, ALLOC_IS_COPY )) return ERROR_OUTOFMEMORY;
- err = stretch_bitmapinfo( src_info, bits->ptr, src, dst_info, ptr, dst, mode ); + err = stretch_bitmapinfo( src_info, bits->ptr, src, dst_info, new_bits.ptr, dst, mode ); if (bits->free) bits->free( bits ); - bits->ptr = ptr; - bits->is_copy = TRUE; - bits->free = free_heap_bits; + *bits = new_bits; return err; }
@@ -240,13 +236,11 @@ static DWORD blend_bits( const BITMAPINFO *src_info, const struct gdi_image_bits if (!dst_bits->is_copy) { int size = dst_info->bmiHeader.biSizeImage; - void *ptr = malloc( size ); - if (!ptr) return ERROR_OUTOFMEMORY; - memcpy( ptr, dst_bits->ptr, size ); + struct gdi_image_bits new_bits; + if (!alloc_gdi_cache_bits( &new_bits, size, ALLOC_IS_COPY )) return ERROR_OUTOFMEMORY; + memcpy( new_bits.ptr, dst_bits->ptr, size ); if (dst_bits->free) dst_bits->free( dst_bits ); - dst_bits->ptr = ptr; - dst_bits->is_copy = TRUE; - dst_bits->free = free_heap_bits; + *dst_bits = new_bits; } return blend_bitmapinfo( src_info, src_bits->ptr, src, dst_info, dst_bits->ptr, dst, blend ); } @@ -489,10 +483,8 @@ BOOL CDECL nulldrv_GradientFill( PHYSDEV dev, TRIVERTEX *vert_array, ULONG nvert if (err && err != ERROR_BAD_FORMAT) goto done;
info->bmiHeader.biSizeImage = get_dib_image_size( info ); - if (!(bits.ptr = calloc( 1, info->bmiHeader.biSizeImage ))) + if (!alloc_gdi_cache_bits( &bits, info->bmiHeader.biSizeImage, ALLOC_ZERO_MEMORY | ALLOC_IS_COPY )) goto done; - bits.is_copy = TRUE; - bits.free = free_heap_bits;
/* make src and points relative to the bitmap */ src = dst; diff --git a/dlls/win32u/bitmap.c b/dlls/win32u/bitmap.c index dd0dfce4069..172e708178c 100644 --- a/dlls/win32u/bitmap.c +++ b/dlls/win32u/bitmap.c @@ -322,13 +322,11 @@ LONG WINAPI NtGdiSetBitmapBits( } else { - if (!(src_bits.ptr = malloc( dst.height * dst_stride ))) + if (!alloc_gdi_cache_bits( &src_bits, dst.height * dst_stride, ALLOC_IS_COPY )) { GDI_ReleaseObj( hbitmap ); return 0; } - src_bits.is_copy = TRUE; - src_bits.free = free_heap_bits; for (i = 0; i < count / src_stride; i++) memcpy( (char *)src_bits.ptr + i * dst_stride, (char *)bits + i * src_stride, src_stride ); if (count % src_stride) diff --git a/dlls/win32u/brush.c b/dlls/win32u/brush.c index f9b718b4d2e..b12d88d25a4 100644 --- a/dlls/win32u/brush.c +++ b/dlls/win32u/brush.c @@ -73,9 +73,8 @@ static BOOL copy_bitmap( struct brush_pattern *brush, HBITMAP bitmap ) brush->bits = bits; if (!bits.free) { - if (!(brush->bits.ptr = malloc( info->bmiHeader.biSizeImage ))) goto done; + if (!alloc_gdi_cache_bits( &brush->bits, info->bmiHeader.biSizeImage, 0 )) goto done; memcpy( brush->bits.ptr, bits.ptr, info->bmiHeader.biSizeImage ); - brush->bits.free = free_heap_bits; }
if (!(brush->info = malloc( get_dib_info_size( info, DIB_RGB_COLORS )))) diff --git a/dlls/win32u/dib.c b/dlls/win32u/dib.c index ee82d1a8ed8..7150908b2ab 100644 --- a/dlls/win32u/dib.c +++ b/dlls/win32u/dib.c @@ -67,6 +67,7 @@ #include <stdlib.h> #include <string.h> #include <assert.h> +#include <sys/mman.h>
#include "ntstatus.h" #define WIN32_NO_STATUS @@ -323,14 +324,15 @@ static BOOL build_rle_bitmap( BITMAPINFO *info, struct gdi_image_bits *bits, HRG int x, y, width = info->bmiHeader.biWidth, height = info->bmiHeader.biHeight; HRGN run = NULL; BYTE skip, num, data; - BYTE *out_bits, *in_bits = bits->ptr; + BYTE *out_bits = NULL, *in_bits = bits->ptr; + struct gdi_image_bits new_bits;
if (clip) *clip = NULL;
assert( info->bmiHeader.biBitCount == 4 || info->bmiHeader.biBitCount == 8 );
- out_bits = calloc( 1, get_dib_image_size( info ) ); - if (!out_bits) goto fail; + if (!alloc_gdi_cache_bits( &new_bits, get_dib_image_size( info ), ALLOC_ZERO_MEMORY | ALLOC_IS_COPY)) goto fail; + out_bits = new_bits.ptr;
if (clip) { @@ -446,9 +448,7 @@ done: if (run) NtGdiDeleteObjectApp( run ); if (bits->free) bits->free( bits );
- bits->ptr = out_bits; - bits->is_copy = TRUE; - bits->free = free_heap_bits; + *bits = new_bits; info->bmiHeader.biSizeImage = get_dib_image_size( info );
return TRUE; diff --git a/dlls/win32u/dibdrv/bitblt.c b/dlls/win32u/dibdrv/bitblt.c index 76330679e59..3a9c28bf9bf 100644 --- a/dlls/win32u/dibdrv/bitblt.c +++ b/dlls/win32u/dibdrv/bitblt.c @@ -699,22 +699,19 @@ static DWORD copy_src_bits( dib_info *src, RECT *src_rect ) { int y, stride = get_dib_stride( src->width, src->bit_count ); int height = src_rect->bottom - src_rect->top; - void *ptr = malloc( stride * height ); + struct gdi_image_bits new_bits;
- if (!ptr) return ERROR_OUTOFMEMORY; + if (!alloc_gdi_cache_bits( &new_bits, stride * height, ALLOC_IS_COPY )) return ERROR_OUTOFMEMORY;
for (y = 0; y < height; y++) - memcpy( (char *)ptr + y * stride, + memcpy( (char *)new_bits.ptr + y * stride, (char *)src->bits.ptr + (src->rect.top + src_rect->top + y) * src->stride, stride ); src->stride = stride; src->height = height; src->rect.top = 0; src->rect.bottom = height; if (src->bits.free) src->bits.free( &src->bits ); - src->bits.is_copy = TRUE; - src->bits.ptr = ptr; - src->bits.free = free_heap_bits; - src->bits.param = NULL; + src->bits = new_bits;
offset_rect( src_rect, 0, -src_rect->top ); return ERROR_SUCCESS; @@ -730,12 +727,7 @@ static DWORD create_tmp_dib( const dib_info *copy, int width, int height, dib_in ret->rect.top = 0; ret->rect.right = width; ret->rect.bottom = height; - ret->bits.ptr = malloc( ret->height * ret->stride ); - ret->bits.is_copy = TRUE; - ret->bits.free = free_heap_bits; - ret->bits.param = NULL; - - return ret->bits.ptr ? ERROR_SUCCESS : ERROR_OUTOFMEMORY; + return alloc_gdi_cache_bits( &ret->bits, ret->height * ret->stride, ALLOC_IS_COPY ) ? ERROR_SUCCESS : ERROR_OUTOFMEMORY; }
static DWORD execute_rop( dibdrv_physdev *pdev, const RECT *dst_rect, dib_info *src, diff --git a/dlls/win32u/dibdrv/objects.c b/dlls/win32u/dibdrv/objects.c index d137daa1f33..28ddea8c3da 100644 --- a/dlls/win32u/dibdrv/objects.c +++ b/dlls/win32u/dibdrv/objects.c @@ -2022,9 +2022,8 @@ static BOOL select_pattern_brush( dibdrv_physdev *pdev, dib_brush *brush, BOOL * } else { - brush->dib.bits.ptr = malloc( brush->dib.height * brush->dib.stride ); - brush->dib.bits.is_copy = TRUE; - brush->dib.bits.free = free_heap_bits; + memset( &brush->dib.bits, 0, sizeof(brush->dib.bits) ); + alloc_gdi_cache_bits( &brush->dib.bits, brush->dib.height * brush->dib.stride, ALLOC_IS_COPY ); brush->dib.funcs->convert_to(&brush->dib, &pattern, &pattern.rect, dither); } return TRUE; diff --git a/dlls/win32u/font.c b/dlls/win32u/font.c index 87ef75f04af..9746798dc84 100644 --- a/dlls/win32u/font.c +++ b/dlls/win32u/font.c @@ -5151,15 +5151,13 @@ static DWORD get_glyph_bitmap( HDC hdc, UINT index, UINT flags, UINT aa_flags, stride = get_dib_stride( metrics->gmBlackBoxX, 1 ); size = metrics->gmBlackBoxY * stride;
- if (!(image->ptr = malloc( size ))) return ERROR_OUTOFMEMORY; - image->is_copy = TRUE; - image->free = free_heap_bits; + if (!alloc_gdi_cache_bits( image, size, ALLOC_IS_COPY )) return ERROR_OUTOFMEMORY;
ret = NtGdiGetGlyphOutline( hdc, index, aa_flags, metrics, size, image->ptr, &identity, FALSE ); if (ret == GDI_ERROR) { - free( image->ptr ); + if (image->free) image->free( image ); return ERROR_NOT_FOUND; } return ERROR_SUCCESS; @@ -5322,10 +5320,7 @@ BOOL CDECL nulldrv_ExtTextOut( PHYSDEV dev, INT x, INT y, UINT flags, const RECT src.visrect.right = src.width; src.visrect.bottom = src.height;
- bits.ptr = malloc( info->bmiHeader.biSizeImage ); - if (!bits.ptr) return ERROR_OUTOFMEMORY; - bits.is_copy = TRUE; - bits.free = free_heap_bits; + if (!alloc_gdi_cache_bits( &bits, info->bmiHeader.biSizeImage, ALLOC_IS_COPY )) return ERROR_OUTOFMEMORY; err = ERROR_SUCCESS; } } @@ -5335,17 +5330,11 @@ BOOL CDECL nulldrv_ExtTextOut( PHYSDEV dev, INT x, INT y, UINT flags, const RECT err = src_dev->funcs->pGetImage( src_dev, info, &bits, &src ); if (!err && !bits.is_copy) { - void *ptr = malloc( info->bmiHeader.biSizeImage ); - if (!ptr) - { - if (bits.free) bits.free( &bits ); - return ERROR_OUTOFMEMORY; - } - memcpy( ptr, bits.ptr, info->bmiHeader.biSizeImage ); + struct gdi_image_bits new_bits; + if (!alloc_gdi_cache_bits( &new_bits, info->bmiHeader.biSizeImage, ALLOC_IS_COPY )) return ERROR_OUTOFMEMORY; + memcpy( new_bits.ptr, bits.ptr, info->bmiHeader.biSizeImage ); if (bits.free) bits.free( &bits ); - bits.ptr = ptr; - bits.is_copy = TRUE; - bits.free = free_heap_bits; + bits = new_bits; } } if (!err)
On Mon, Mar 21, 2022 at 05:41:48AM +0900, Jinoh Kang wrote:
Signed-off-by: Jinoh Kang jinoh.kang.kr@gmail.com
dlls/win32u/Makefile.in | 1 + dlls/win32u/alloc.c | 173 ++++++++++++++++++++++++++++++++++++ dlls/win32u/ntgdi_private.h | 4 + 3 files changed, 178 insertions(+) create mode 100644 dlls/win32u/alloc.c
Hi Jinoh,
Could you provide some justification for why we should include this? Additionally, there are a few magic numbers in this patch, some discussion about their value would be good.
Also note, [1/4] is adding dead (unused) code. You'd want to combine it with [2/4] say.
Huw.
On Mon, Mar 21, 2022, 7:56 PM Huw Davies huw@codeweavers.com wrote:
On Mon, Mar 21, 2022 at 05:41:48AM +0900, Jinoh Kang wrote:
Signed-off-by: Jinoh Kang jinoh.kang.kr@gmail.com
dlls/win32u/Makefile.in | 1 + dlls/win32u/alloc.c | 173 ++++++++++++++++++++++++++++++++++++ dlls/win32u/ntgdi_private.h | 4 + 3 files changed, 178 insertions(+) create mode 100644 dlls/win32u/alloc.c
Hi Jinoh,
Could you provide some justification for why we should include this?
Well, I forgot to mark this as RFC. I still haven't decided how to benchmark this change effectively; feedbacks are welcome.
The rationale was that some applications were seeing frequent page faults due to writing to demand-zero pages in GDI bit blitting routines.
For a bitmap that extends over several demand-zero pages, each page access would incur a fault (in the worst scenario), resulting in significant performance drop and/or high CPU usage. This may be aggravated by non-sequential access (which usually is the case with bottom-up DIBs) or with transparent huge pages disabled.
Specifically, KakaoTalk renders GIF animations via GDI, calling CreateCompatibleBitmap() a number of times per each frame. NtCreateBitmap() uses calloc() to allocate bitmap memory, and calloc() uses freshly mmap()-ed area above certain size threshold. Apparantly GLibc ptmalloc assumes the use pattern where bigger objects are allocated and freed much rarer.
Additionally, there are a few magic numbers in this patch, some
discussion about their value would be good.
They are pretty arbitrary as you might have guessed. It certainly needs more benchmark work so it can be tuned more appropriately.
Also note, [1/4] is adding dead (unused) code. You'd want to combine it with [2/4] say.
Ack.
Huw.
On Tue, Mar 22, 2022 at 01:47:59AM +0900, Jin-oh Kang wrote:
On Mon, Mar 21, 2022, 7:56 PM Huw Davies huw@codeweavers.com wrote: Could you provide some justification for why we should include this?
Well, I forgot to mark this as RFC. I still haven't decided how to benchmark this change effectively; feedbacks are welcome.
The rationale was that some applications were seeing frequent page faults due to writing to demand-zero pages in GDI bit blitting routines.
For a bitmap that extends over several demand-zero pages, each page access would incur a fault (in the worst scenario), resulting in significant performance drop and/or high CPU usage. This may be aggravated by non- sequential access (which usually is the case with bottom-up DIBs) or with transparent huge pages disabled.
We'd want to see some actual data on perf / CPU usage to go forward.
Huw.
On 3/21/22 17:47, Jin-oh Kang wrote:
On Mon, Mar 21, 2022, 7:56 PM Huw Davies huw@codeweavers.com wrote:
On Mon, Mar 21, 2022 at 05:41:48AM +0900, Jinoh Kang wrote:
Signed-off-by: Jinoh Kang jinoh.kang.kr@gmail.com
dlls/win32u/Makefile.in | 1 + dlls/win32u/alloc.c | 173 ++++++++++++++++++++++++++++++++++++ dlls/win32u/ntgdi_private.h | 4 + 3 files changed, 178 insertions(+) create mode 100644 dlls/win32u/alloc.c
Hi Jinoh,
Could you provide some justification for why we should include this?
Well, I forgot to mark this as RFC. I still haven't decided how to benchmark this change effectively; feedbacks are welcome.
The rationale was that some applications were seeing frequent page faults due to writing to demand-zero pages in GDI bit blitting routines.
For a bitmap that extends over several demand-zero pages, each page access would incur a fault (in the worst scenario), resulting in significant performance drop and/or high CPU usage. This may be aggravated by non-sequential access (which usually is the case with bottom-up DIBs) or with transparent huge pages disabled.
In my experience transparent huge pages solve this, unless the access pattern is bad (like pages touched backwards). In which case, memsetting the pages would be sufficient (for large blocks we rely on VirtualAlloc allocating zeroed pages and we do not write to them).
I believe most Linux distributions enable THP by default now, so if you disable them it's pretty much that you know what performance hit it can induce and I don't think Wine should try to mitigate it.
Then there's the question about platforms without THP-like feature, which I'm not sure how common they are. If there's a portable (-ish) way to request pages larger than the default, when allocating virtual memory, that may be a better and more generic fix?
On 3/22/22 17:43, Rémi Bernon wrote:
On 3/21/22 17:47, Jin-oh Kang wrote:
On Mon, Mar 21, 2022, 7:56 PM Huw Davies huw@codeweavers.com wrote:
On Mon, Mar 21, 2022 at 05:41:48AM +0900, Jinoh Kang wrote:
Signed-off-by: Jinoh Kang jinoh.kang.kr@gmail.com
dlls/win32u/Makefile.in | 1 + dlls/win32u/alloc.c | 173 ++++++++++++++++++++++++++++++++++++ dlls/win32u/ntgdi_private.h | 4 + 3 files changed, 178 insertions(+) create mode 100644 dlls/win32u/alloc.c
Hi Jinoh,
Could you provide some justification for why we should include this?
Well, I forgot to mark this as RFC. I still haven't decided how to benchmark this change effectively; feedbacks are welcome.
The rationale was that some applications were seeing frequent page faults due to writing to demand-zero pages in GDI bit blitting routines.
For a bitmap that extends over several demand-zero pages, each page access would incur a fault (in the worst scenario), resulting in significant performance drop and/or high CPU usage. This may be aggravated by non-sequential access (which usually is the case with bottom-up DIBs) or with transparent huge pages disabled.
In my experience transparent huge pages solve this, unless the access pattern is bad (like pages touched backwards). In which case, memsetting the pages would be sufficient (for large blocks we rely on VirtualAlloc allocating zeroed pages and we do not write to them).
I believe most Linux distributions enable THP by default now, so if you disable them it's pretty much that you know what performance hit it can induce and I don't think Wine should try to mitigate it.
Then there's the question about platforms without THP-like feature, which I'm not sure how common they are.
The platform needs to either support THP, or the malloc() usage pattern of allocating and freeing large objects frequently.
If there's a portable (-ish) way to request pages larger than the default, when allocating virtual memory, that may be a better and more generic fix?
I don't think such tunable would be portable, but I agree it's the best way forward if there is one.
Did you compare the actual performance on some test cases to just memsetting the whole area before doing the blit? In my experience on Linux pre-faulting the memory this way reduce the overhead greatly, while the benefit of explicit huge pages management over this is not apparent. It is not obvious that caching large memory areas in gdi32 worth it given VM address exhaustion issues we have with a bunch of 32 bit apps. IMO if something like this is absolutely necessary that should be somehow considered on ntdll VM level so such a memory can be reclaimed.
On 3/22/22 12:06, Jinoh Kang wrote:
In my experience transparent huge pages solve this, unless the access pattern is bad (like pages touched backwards). In which case, memsetting the pages would be sufficient (for large blocks we rely on VirtualAlloc allocating zeroed pages and we do not write to them).
I believe most Linux distributions enable THP by default now, so if you disable them it's pretty much that you know what performance hit it can induce and I don't think Wine should try to mitigate it.
Then there's the question about platforms without THP-like feature, which I'm not sure how common they are.
The platform needs to either support THP, or the malloc() usage pattern of allocating and freeing large objects frequently.
If there's a portable (-ish) way to request pages larger than the default, when allocating virtual memory, that may be a better and more generic fix?
I don't think such tunable would be portable, but I agree it's the best way forward if there is one.
On 3/22/22 18:28, Paul Gofman wrote:
Did you compare the actual performance on some test cases to just memsetting the whole area before doing the blit?
That sounds like a good idea. I'll try turning existing tests into benchmarks and see how it works.
In my experience on Linux pre-faulting the memory this way reduce the overhead greatly, while the benefit of explicit huge pages management over this is not apparent.
That shows stark difference from my experience. In my case, memset()-ing yielded no significant improvement in performance (the faults moves from blitting to zeroing).
It is not obvious that caching large memory areas in gdi32 worth it given VM address exhaustion issues we have with a bunch of 32 bit apps.
Yes, this alone is certainly a showstopper.
IMO if something like this is absolutely necessary that should be somehow considered on ntdll VM level so such a memory can be reclaimed.
Thanks!
On 3/22/22 19:37, Jinoh Kang wrote:
In my experience on Linux pre-faulting the memory this way reduce the overhead greatly, while the benefit of explicit huge pages management over this is not apparent. That shows stark difference from my experience. In my case, memset()-ing yielded no significant improvement in performance (the faults moves from blitting to zeroing).
Did you have transparent huge pages enabled?
On 3/23/22 01:41, Paul Gofman wrote:
On 3/22/22 19:37, Jinoh Kang wrote:
In my experience on Linux pre-faulting the memory this way reduce the overhead greatly, while the benefit of explicit huge pages management over this is not apparent. That shows stark difference from my experience. In my case, memset()-ing yielded no significant improvement in performance (the faults moves from blitting to zeroing).
Did you have transparent huge pages enabled?
Sorry, I somehow misread "explicit HP" as "transparent HP." It now makes total sense. To answer your question, no (for some reason). I think it's safe to treat myself as a special case here.
Not all platforms have THP though, so I'll have to evaluate whether my patchset could bring more advantage at less VM overhead on such platforms.