This is to prepare so we don't recalculate the lookup cache map for color tables on every clipped rect (which is expensive).
Signed-off-by: Gabriel Ivăncescu gabrielopcode@gmail.com ---
This is a no-op patch, it's needed for next patch.
dlls/gdi32/dibdrv/bitblt.c | 14 +- dlls/gdi32/dibdrv/dibdrv.h | 18 +-- dlls/gdi32/dibdrv/primitives.c | 284 +++++++++++++++++++-------------- 3 files changed, 177 insertions(+), 139 deletions(-)
diff --git a/dlls/gdi32/dibdrv/bitblt.c b/dlls/gdi32/dibdrv/bitblt.c index 8f67535..045c969 100644 --- a/dlls/gdi32/dibdrv/bitblt.c +++ b/dlls/gdi32/dibdrv/bitblt.c @@ -584,17 +584,15 @@ static void mask_rect( dib_info *dst, const RECT *dst_rect, const dib_info *src, static DWORD blend_rect( dib_info *dst, const RECT *dst_rect, const dib_info *src, const RECT *src_rect, HRGN clip, BLENDFUNCTION blend ) { - POINT origin; struct clipped_rects clipped_rects; - int i; + POINT offset;
if (!get_clipped_rects( dst, dst_rect, clip, &clipped_rects )) return ERROR_SUCCESS; - for (i = 0; i < clipped_rects.count; i++) - { - origin.x = src_rect->left + clipped_rects.rects[i].left - dst_rect->left; - origin.y = src_rect->top + clipped_rects.rects[i].top - dst_rect->top; - dst->funcs->blend_rect( dst, &clipped_rects.rects[i], src, &origin, blend ); - } + + offset.x = src_rect->left - dst_rect->left; + offset.y = src_rect->top - dst_rect->top; + dst->funcs->blend_rect( dst, src, &offset, &clipped_rects, blend ); + free_clipped_rects( &clipped_rects ); return ERROR_SUCCESS; } diff --git a/dlls/gdi32/dibdrv/dibdrv.h b/dlls/gdi32/dibdrv/dibdrv.h index 88b4c62..7db825c 100644 --- a/dlls/gdi32/dibdrv/dibdrv.h +++ b/dlls/gdi32/dibdrv/dibdrv.h @@ -165,6 +165,13 @@ static inline dibdrv_physdev *get_dibdrv_pdev( PHYSDEV dev ) return (dibdrv_physdev *)dev; }
+struct clipped_rects +{ + RECT *rects; + int count; + RECT buffer[32]; +}; + struct line_params { int err_start, err_add_1, err_add_2, bias; @@ -189,8 +196,8 @@ typedef struct primitive_funcs const dib_info *brush, const rop_mask_bits *bits); void (* copy_rect)(const dib_info *dst, const RECT *rc, const dib_info *src, const POINT *origin, int rop2, int overlap); - void (* blend_rect)(const dib_info *dst, const RECT *rc, const dib_info *src, - const POINT *origin, BLENDFUNCTION blend); + void (* blend_rect)(const dib_info *dst, const dib_info *src, const POINT *offset, + const struct clipped_rects *clipped_rects, BLENDFUNCTION blend); BOOL (* gradient_rect)(const dib_info *dib, const RECT *rc, const TRIVERTEX *v, int mode); void (* mask_rect)(const dib_info *dst, const RECT *rc, const dib_info *src, const POINT *origin, int rop2); @@ -240,13 +247,6 @@ typedef struct DWORD octant; } bres_params;
-struct clipped_rects -{ - RECT *rects; - int count; - RECT buffer[32]; -}; - extern void get_rop_codes(INT rop, struct rop_codes *codes) DECLSPEC_HIDDEN; extern void reset_dash_origin(dibdrv_physdev *pdev) DECLSPEC_HIDDEN; extern void init_dib_info_from_bitmapinfo(dib_info *dib, const BITMAPINFO *info, void *bits) DECLSPEC_HIDDEN; diff --git a/dlls/gdi32/dibdrv/primitives.c b/dlls/gdi32/dibdrv/primitives.c index 01a1c7c..0a5f7e5 100644 --- a/dlls/gdi32/dibdrv/primitives.c +++ b/dlls/gdi32/dibdrv/primitives.c @@ -4651,199 +4651,239 @@ static inline DWORD blend_rgb( BYTE dst_r, BYTE dst_g, BYTE dst_b, DWORD src, BL blend_color( dst_r, src >> 16, blend.SourceConstantAlpha ) << 16); }
-static void blend_rect_8888(const dib_info *dst, const RECT *rc, - const dib_info *src, const POINT *origin, BLENDFUNCTION blend) +static void blend_rect_8888(const dib_info *dst, const dib_info *src, const POINT *offset, + const struct clipped_rects *clipped_rects, BLENDFUNCTION blend) { - DWORD *src_ptr = get_pixel_ptr_32( src, origin->x, origin->y ); - DWORD *dst_ptr = get_pixel_ptr_32( dst, rc->left, rc->top ); - int x, y; + int i, x, y;
- if (blend.AlphaFormat & AC_SRC_ALPHA) + for (i = 0; i < clipped_rects->count; i++) { - if (blend.SourceConstantAlpha == 255) - for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride / 4, src_ptr += src->stride / 4) - for (x = 0; x < rc->right - rc->left; x++) - dst_ptr[x] = blend_argb( dst_ptr[x], src_ptr[x] ); + const RECT *rc = &clipped_rects->rects[i]; + DWORD *src_ptr = get_pixel_ptr_32( src, rc->left + offset->x, rc->top + offset->y ); + DWORD *dst_ptr = get_pixel_ptr_32( dst, rc->left, rc->top ); + + if (blend.AlphaFormat & AC_SRC_ALPHA) + { + if (blend.SourceConstantAlpha == 255) + for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride / 4, src_ptr += src->stride / 4) + for (x = 0; x < rc->right - rc->left; x++) + dst_ptr[x] = blend_argb( dst_ptr[x], src_ptr[x] ); + else + for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride / 4, src_ptr += src->stride / 4) + for (x = 0; x < rc->right - rc->left; x++) + dst_ptr[x] = blend_argb_alpha( dst_ptr[x], src_ptr[x], blend.SourceConstantAlpha ); + } + else if (src->compression == BI_RGB) + for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride / 4, src_ptr += src->stride / 4) + for (x = 0; x < rc->right - rc->left; x++) + dst_ptr[x] = blend_argb_constant_alpha( dst_ptr[x], src_ptr[x], blend.SourceConstantAlpha ); else - for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride / 4, src_ptr += src->stride / 4) - for (x = 0; x < rc->right - rc->left; x++) - dst_ptr[x] = blend_argb_alpha( dst_ptr[x], src_ptr[x], blend.SourceConstantAlpha ); - } - else if (src->compression == BI_RGB) - for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride / 4, src_ptr += src->stride / 4) - for (x = 0; x < rc->right - rc->left; x++) - dst_ptr[x] = blend_argb_constant_alpha( dst_ptr[x], src_ptr[x], blend.SourceConstantAlpha ); - else - for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride / 4, src_ptr += src->stride / 4) - for (x = 0; x < rc->right - rc->left; x++) - dst_ptr[x] = blend_argb_no_src_alpha( dst_ptr[x], src_ptr[x], blend.SourceConstantAlpha ); + for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride / 4, src_ptr += src->stride / 4) + for (x = 0; x < rc->right - rc->left; x++) + dst_ptr[x] = blend_argb_no_src_alpha( dst_ptr[x], src_ptr[x], blend.SourceConstantAlpha ); + } }
-static void blend_rect_32(const dib_info *dst, const RECT *rc, - const dib_info *src, const POINT *origin, BLENDFUNCTION blend) +static void blend_rect_32(const dib_info *dst, const dib_info *src, const POINT *offset, + const struct clipped_rects *clipped_rects, BLENDFUNCTION blend) { - DWORD *src_ptr = get_pixel_ptr_32( src, origin->x, origin->y ); - DWORD *dst_ptr = get_pixel_ptr_32( dst, rc->left, rc->top ); - int x, y; + int i, x, y;
- if (dst->red_len == 8 && dst->green_len == 8 && dst->blue_len == 8) + for (i = 0; i < clipped_rects->count; i++) { - for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride / 4, src_ptr += src->stride / 4) + const RECT *rc = &clipped_rects->rects[i]; + DWORD *src_ptr = get_pixel_ptr_32( src, rc->left + offset->x, rc->top + offset->y ); + DWORD *dst_ptr = get_pixel_ptr_32( dst, rc->left, rc->top ); + + if (dst->red_len == 8 && dst->green_len == 8 && dst->blue_len == 8) { - for (x = 0; x < rc->right - rc->left; x++) + for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride / 4, src_ptr += src->stride / 4) { - DWORD val = blend_rgb( dst_ptr[x] >> dst->red_shift, - dst_ptr[x] >> dst->green_shift, - dst_ptr[x] >> dst->blue_shift, - src_ptr[x], blend ); - dst_ptr[x] = ((( val & 0xff) << dst->blue_shift) | - (((val >> 8) & 0xff) << dst->green_shift) | - (((val >> 16) & 0xff) << dst->red_shift)); + for (x = 0; x < rc->right - rc->left; x++) + { + DWORD val = blend_rgb( dst_ptr[x] >> dst->red_shift, + dst_ptr[x] >> dst->green_shift, + dst_ptr[x] >> dst->blue_shift, + src_ptr[x], blend ); + dst_ptr[x] = ((( val & 0xff) << dst->blue_shift) | + (((val >> 8) & 0xff) << dst->green_shift) | + (((val >> 16) & 0xff) << dst->red_shift)); + } } } - } - else - { - for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride / 4, src_ptr += src->stride / 4) + else { - for (x = 0; x < rc->right - rc->left; x++) + for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride / 4, src_ptr += src->stride / 4) { - DWORD val = blend_rgb( get_field( dst_ptr[x], dst->red_shift, dst->red_len ), - get_field( dst_ptr[x], dst->green_shift, dst->green_len ), - get_field( dst_ptr[x], dst->blue_shift, dst->blue_len ), - src_ptr[x], blend ); - dst_ptr[x] = rgb_to_pixel_masks( dst, val >> 16, val >> 8, val ); + for (x = 0; x < rc->right - rc->left; x++) + { + DWORD val = blend_rgb( get_field( dst_ptr[x], dst->red_shift, dst->red_len ), + get_field( dst_ptr[x], dst->green_shift, dst->green_len ), + get_field( dst_ptr[x], dst->blue_shift, dst->blue_len ), + src_ptr[x], blend ); + dst_ptr[x] = rgb_to_pixel_masks( dst, val >> 16, val >> 8, val ); + } } } } }
-static void blend_rect_24(const dib_info *dst, const RECT *rc, - const dib_info *src, const POINT *origin, BLENDFUNCTION blend) +static void blend_rect_24(const dib_info *dst, const dib_info *src, const POINT *offset, + const struct clipped_rects *clipped_rects, BLENDFUNCTION blend) { - DWORD *src_ptr = get_pixel_ptr_32( src, origin->x, origin->y ); - BYTE *dst_ptr = get_pixel_ptr_24( dst, rc->left, rc->top ); - int x, y; + int i, x, y;
- for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride, src_ptr += src->stride / 4) + for (i = 0; i < clipped_rects->count; i++) { - for (x = 0; x < rc->right - rc->left; x++) + const RECT *rc = &clipped_rects->rects[i]; + DWORD *src_ptr = get_pixel_ptr_32( src, rc->left + offset->x, rc->top + offset->y ); + BYTE *dst_ptr = get_pixel_ptr_24( dst, rc->left, rc->top ); + + for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride, src_ptr += src->stride / 4) { - DWORD val = blend_rgb( dst_ptr[x * 3 + 2], dst_ptr[x * 3 + 1], dst_ptr[x * 3], - src_ptr[x], blend ); - dst_ptr[x * 3] = val; - dst_ptr[x * 3 + 1] = val >> 8; - dst_ptr[x * 3 + 2] = val >> 16; + for (x = 0; x < rc->right - rc->left; x++) + { + DWORD val = blend_rgb( dst_ptr[x * 3 + 2], dst_ptr[x * 3 + 1], dst_ptr[x * 3], + src_ptr[x], blend ); + dst_ptr[x * 3] = val; + dst_ptr[x * 3 + 1] = val >> 8; + dst_ptr[x * 3 + 2] = val >> 16; + } } } }
-static void blend_rect_555(const dib_info *dst, const RECT *rc, - const dib_info *src, const POINT *origin, BLENDFUNCTION blend) +static void blend_rect_555(const dib_info *dst, const dib_info *src, const POINT *offset, + const struct clipped_rects *clipped_rects, BLENDFUNCTION blend) { - DWORD *src_ptr = get_pixel_ptr_32( src, origin->x, origin->y ); - WORD *dst_ptr = get_pixel_ptr_16( dst, rc->left, rc->top ); - int x, y; + int i, x, y;
- for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride / 2, src_ptr += src->stride / 4) + for (i = 0; i < clipped_rects->count; i++) { - for (x = 0; x < rc->right - rc->left; x++) + const RECT *rc = &clipped_rects->rects[i]; + DWORD *src_ptr = get_pixel_ptr_32( src, rc->left + offset->x, rc->top + offset->y ); + WORD *dst_ptr = get_pixel_ptr_16( dst, rc->left, rc->top ); + + for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride / 2, src_ptr += src->stride / 4) { - DWORD val = blend_rgb( ((dst_ptr[x] >> 7) & 0xf8) | ((dst_ptr[x] >> 12) & 0x07), - ((dst_ptr[x] >> 2) & 0xf8) | ((dst_ptr[x] >> 7) & 0x07), - ((dst_ptr[x] << 3) & 0xf8) | ((dst_ptr[x] >> 2) & 0x07), - src_ptr[x], blend ); - dst_ptr[x] = ((val >> 9) & 0x7c00) | ((val >> 6) & 0x03e0) | ((val >> 3) & 0x001f); + for (x = 0; x < rc->right - rc->left; x++) + { + DWORD val = blend_rgb( ((dst_ptr[x] >> 7) & 0xf8) | ((dst_ptr[x] >> 12) & 0x07), + ((dst_ptr[x] >> 2) & 0xf8) | ((dst_ptr[x] >> 7) & 0x07), + ((dst_ptr[x] << 3) & 0xf8) | ((dst_ptr[x] >> 2) & 0x07), + src_ptr[x], blend ); + dst_ptr[x] = ((val >> 9) & 0x7c00) | ((val >> 6) & 0x03e0) | ((val >> 3) & 0x001f); + } } } }
-static void blend_rect_16(const dib_info *dst, const RECT *rc, - const dib_info *src, const POINT *origin, BLENDFUNCTION blend) +static void blend_rect_16(const dib_info *dst, const dib_info *src, const POINT *offset, + const struct clipped_rects *clipped_rects, BLENDFUNCTION blend) { - DWORD *src_ptr = get_pixel_ptr_32( src, origin->x, origin->y ); - WORD *dst_ptr = get_pixel_ptr_16( dst, rc->left, rc->top ); - int x, y; + int i, x, y;
- for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride / 2, src_ptr += src->stride / 4) + for (i = 0; i < clipped_rects->count; i++) { - for (x = 0; x < rc->right - rc->left; x++) + const RECT *rc = &clipped_rects->rects[i]; + DWORD *src_ptr = get_pixel_ptr_32( src, rc->left + offset->x, rc->top + offset->y ); + WORD *dst_ptr = get_pixel_ptr_16( dst, rc->left, rc->top ); + + for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride / 2, src_ptr += src->stride / 4) { - DWORD val = blend_rgb( get_field( dst_ptr[x], dst->red_shift, dst->red_len ), - get_field( dst_ptr[x], dst->green_shift, dst->green_len ), - get_field( dst_ptr[x], dst->blue_shift, dst->blue_len ), - src_ptr[x], blend ); - dst_ptr[x] = rgb_to_pixel_masks( dst, val >> 16, val >> 8, val ); + for (x = 0; x < rc->right - rc->left; x++) + { + DWORD val = blend_rgb( get_field( dst_ptr[x], dst->red_shift, dst->red_len ), + get_field( dst_ptr[x], dst->green_shift, dst->green_len ), + get_field( dst_ptr[x], dst->blue_shift, dst->blue_len ), + src_ptr[x], blend ); + dst_ptr[x] = rgb_to_pixel_masks( dst, val >> 16, val >> 8, val ); + } } } }
-static void blend_rect_8(const dib_info *dst, const RECT *rc, - const dib_info *src, const POINT *origin, BLENDFUNCTION blend) +static void blend_rect_8(const dib_info *dst, const dib_info *src, const POINT *offset, + const struct clipped_rects *clipped_rects, BLENDFUNCTION blend) { const RGBQUAD *color_table = get_dib_color_table( dst ); - DWORD *src_ptr = get_pixel_ptr_32( src, origin->x, origin->y ); - BYTE *dst_ptr = get_pixel_ptr_8( dst, rc->left, rc->top ); - int x, y; + int i, x, y;
- for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride, src_ptr += src->stride / 4) + for (i = 0; i < clipped_rects->count; i++) { - for (x = 0; x < rc->right - rc->left; x++) + const RECT *rc = &clipped_rects->rects[i]; + DWORD *src_ptr = get_pixel_ptr_32( src, rc->left + offset->x, rc->top + offset->y ); + BYTE *dst_ptr = get_pixel_ptr_8( dst, rc->left, rc->top ); + + for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride, src_ptr += src->stride / 4) { - RGBQUAD rgb = color_table[dst_ptr[x]]; - DWORD val = blend_rgb( rgb.rgbRed, rgb.rgbGreen, rgb.rgbBlue, src_ptr[x], blend ); - dst_ptr[x] = rgb_lookup_colortable( dst, val >> 16, val >> 8, val ); + for (x = 0; x < rc->right - rc->left; x++) + { + RGBQUAD rgb = color_table[dst_ptr[x]]; + DWORD val = blend_rgb( rgb.rgbRed, rgb.rgbGreen, rgb.rgbBlue, src_ptr[x], blend ); + dst_ptr[x] = rgb_lookup_colortable( dst, val >> 16, val >> 8, val ); + } } } }
-static void blend_rect_4(const dib_info *dst, const RECT *rc, - const dib_info *src, const POINT *origin, BLENDFUNCTION blend) +static void blend_rect_4(const dib_info *dst, const dib_info *src, const POINT *offset, + const struct clipped_rects *clipped_rects, BLENDFUNCTION blend) { const RGBQUAD *color_table = get_dib_color_table( dst ); - DWORD *src_ptr = get_pixel_ptr_32( src, origin->x, origin->y ); - BYTE *dst_ptr = get_pixel_ptr_4( dst, rc->left, rc->top ); - int i, x, y; + int i, j, x, y;
- for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride, src_ptr += src->stride / 4) + for (i = 0; i < clipped_rects->count; i++) { - for (i = 0, x = (dst->rect.left + rc->left) & 1; i < rc->right - rc->left; i++, x++) + const RECT *rc = &clipped_rects->rects[i]; + DWORD *src_ptr = get_pixel_ptr_32( src, rc->left + offset->x, rc->top + offset->y ); + BYTE *dst_ptr = get_pixel_ptr_4( dst, rc->left, rc->top ); + + for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride, src_ptr += src->stride / 4) { - DWORD val = ((x & 1) ? dst_ptr[x / 2] : (dst_ptr[x / 2] >> 4)) & 0x0f; - RGBQUAD rgb = color_table[val]; - val = blend_rgb( rgb.rgbRed, rgb.rgbGreen, rgb.rgbBlue, src_ptr[i], blend ); - val = rgb_lookup_colortable( dst, val >> 16, val >> 8, val ); - if (x & 1) - dst_ptr[x / 2] = val | (dst_ptr[x / 2] & 0xf0); - else - dst_ptr[x / 2] = (val << 4) | (dst_ptr[x / 2] & 0x0f); + for (j = 0, x = (dst->rect.left + rc->left) & 1; j < rc->right - rc->left; j++, x++) + { + DWORD val = ((x & 1) ? dst_ptr[x / 2] : (dst_ptr[x / 2] >> 4)) & 0x0f; + RGBQUAD rgb = color_table[val]; + val = blend_rgb( rgb.rgbRed, rgb.rgbGreen, rgb.rgbBlue, src_ptr[j], blend ); + val = rgb_lookup_colortable( dst, val >> 16, val >> 8, val ); + if (x & 1) + dst_ptr[x / 2] = val | (dst_ptr[x / 2] & 0xf0); + else + dst_ptr[x / 2] = (val << 4) | (dst_ptr[x / 2] & 0x0f); + } } } }
-static void blend_rect_1(const dib_info *dst, const RECT *rc, - const dib_info *src, const POINT *origin, BLENDFUNCTION blend) +static void blend_rect_1(const dib_info *dst, const dib_info *src, const POINT *offset, + const struct clipped_rects *clipped_rects, BLENDFUNCTION blend) { const RGBQUAD *color_table = get_dib_color_table( dst ); - DWORD *src_ptr = get_pixel_ptr_32( src, origin->x, origin->y ); - BYTE *dst_ptr = get_pixel_ptr_1( dst, rc->left, rc->top ); - int i, x, y; + int i, j, x, y;
- for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride, src_ptr += src->stride / 4) + for (i = 0; i < clipped_rects->count; i++) { - for (i = 0, x = (dst->rect.left + rc->left) & 7; i < rc->right - rc->left; i++, x++) + const RECT *rc = &clipped_rects->rects[i]; + DWORD *src_ptr = get_pixel_ptr_32( src, rc->left + offset->x, rc->top + offset->y ); + BYTE *dst_ptr = get_pixel_ptr_1( dst, rc->left, rc->top ); + + for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride, src_ptr += src->stride / 4) { - DWORD val = (dst_ptr[x / 8] & pixel_masks_1[x % 8]) ? 1 : 0; - RGBQUAD rgb = color_table[val]; - val = blend_rgb( rgb.rgbRed, rgb.rgbGreen, rgb.rgbBlue, src_ptr[i], blend ); - val = rgb_to_pixel_colortable(dst, val >> 16, val >> 8, val) ? 0xff : 0; - dst_ptr[x / 8] = (dst_ptr[x / 8] & ~pixel_masks_1[x % 8]) | (val & pixel_masks_1[x % 8]); + for (j = 0, x = (dst->rect.left + rc->left) & 7; j < rc->right - rc->left; j++, x++) + { + DWORD val = (dst_ptr[x / 8] & pixel_masks_1[x % 8]) ? 1 : 0; + RGBQUAD rgb = color_table[val]; + val = blend_rgb( rgb.rgbRed, rgb.rgbGreen, rgb.rgbBlue, src_ptr[j], blend ); + val = rgb_to_pixel_colortable(dst, val >> 16, val >> 8, val) ? 0xff : 0; + dst_ptr[x / 8] = (dst_ptr[x / 8] & ~pixel_masks_1[x % 8]) | (val & pixel_masks_1[x % 8]); + } } } }
-static void blend_rect_null(const dib_info *dst, const RECT *rc, - const dib_info *src, const POINT *origin, BLENDFUNCTION blend) +static void blend_rect_null(const dib_info *dst, const dib_info *src, const POINT *offset, + const struct clipped_rects *clipped_rects, BLENDFUNCTION blend) { }
For now the generating algorithm is straightforward, but can be improved in the future.
Wine-Bug: https://bugs.winehq.org/show_bug.cgi?id=50898 Signed-off-by: Gabriel Ivăncescu gabrielopcode@gmail.com --- dlls/gdi32/dibdrv/primitives.c | 61 +++++++++++++++++++++++++--------- 1 file changed, 45 insertions(+), 16 deletions(-)
diff --git a/dlls/gdi32/dibdrv/primitives.c b/dlls/gdi32/dibdrv/primitives.c index 0a5f7e5..8947c60 100644 --- a/dlls/gdi32/dibdrv/primitives.c +++ b/dlls/gdi32/dibdrv/primitives.c @@ -3497,22 +3497,44 @@ static void convert_to_16(dib_info *dst, const dib_info *src, const RECT *src_re } }
-static inline BOOL color_tables_match(const dib_info *d1, const dib_info *d2) +/* + * To lookup RGB values into nearest color in the color table, Windows uses 5-bits of the RGB + * at the "center" of the RGB cube, presumably to do a similar lookup cache. The lowest 3 bits + * of the color are thus set to halfway (0x04) and then it's used in the distance calculation + * to the exact color in the color table. We exploit this as well to create a lookup cache. +*/ +struct rgb_lookup_colortable_ctx +{ + /* lookup table indexed by 5-bit-per-color RGB into a color table */ + BYTE map[32 * 32 * 32]; +}; + +static void rgb_lookup_colortable_init(const dib_info *dib, struct rgb_lookup_colortable_ctx *ctx) { - if (!d1->color_table || !d2->color_table) return (!d1->color_table && !d2->color_table); - return !memcmp(d1->color_table, d2->color_table, (1 << d1->bit_count) * sizeof(d1->color_table[0])); + unsigned r, g, b; + + for (b = 4; b < 256; b += 1 << 3) + for (g = 4; g < 256; g += 1 << 3) + for (r = 4; r < 256; r += 1 << 3) + ctx->map[r >> 3 | (g & ~7) << 2 | (b & ~7) << 7] = rgb_to_pixel_colortable(dib, r, g, b); }
-static inline DWORD rgb_lookup_colortable(const dib_info *dst, BYTE r, BYTE g, BYTE b) +static inline BYTE rgb_lookup_colortable(const struct rgb_lookup_colortable_ctx *ctx, BYTE r, BYTE g, BYTE b) { - /* Windows reduces precision to 5 bits, probably in order to build some sort of lookup cache */ - return rgb_to_pixel_colortable( dst, (r & ~7) + 4, (g & ~7) + 4, (b & ~7) + 4 ); + return ctx->map[(r >> 3) | (g & ~7) << 2 | (b & ~7) << 7]; +} + +static inline BOOL color_tables_match(const dib_info *d1, const dib_info *d2) +{ + if (!d1->color_table || !d2->color_table) return (!d1->color_table && !d2->color_table); + return !memcmp(d1->color_table, d2->color_table, (1 << d1->bit_count) * sizeof(d1->color_table[0])); }
static void convert_to_8(dib_info *dst, const dib_info *src, const RECT *src_rect, BOOL dither) { BYTE *dst_start = get_pixel_ptr_8(dst, 0, 0), *dst_pixel; INT x, y, pad_size = ((dst->width + 3) & ~3) - (src_rect->right - src_rect->left); + struct rgb_lookup_colortable_ctx lookup_ctx; DWORD src_val;
switch(src->bit_count) @@ -3521,6 +3543,7 @@ static void convert_to_8(dib_info *dst, const dib_info *src, const RECT *src_rec { DWORD *src_start = get_pixel_ptr_32(src, src_rect->left, src_rect->top), *src_pixel;
+ rgb_lookup_colortable_init(dst, &lookup_ctx); if(src->funcs == &funcs_8888) { for(y = src_rect->top; y < src_rect->bottom; y++) @@ -3530,7 +3553,7 @@ static void convert_to_8(dib_info *dst, const dib_info *src, const RECT *src_rec for(x = src_rect->left; x < src_rect->right; x++) { src_val = *src_pixel++; - *dst_pixel++ = rgb_lookup_colortable(dst, src_val >> 16, src_val >> 8, src_val ); + *dst_pixel++ = rgb_lookup_colortable(&lookup_ctx, src_val >> 16, src_val >> 8, src_val ); } if(pad_size) memset(dst_pixel, 0, pad_size); dst_start += dst->stride; @@ -3546,7 +3569,7 @@ static void convert_to_8(dib_info *dst, const dib_info *src, const RECT *src_rec for(x = src_rect->left; x < src_rect->right; x++) { src_val = *src_pixel++; - *dst_pixel++ = rgb_lookup_colortable(dst, + *dst_pixel++ = rgb_lookup_colortable(&lookup_ctx, src_val >> src->red_shift, src_val >> src->green_shift, src_val >> src->blue_shift ); @@ -3565,7 +3588,7 @@ static void convert_to_8(dib_info *dst, const dib_info *src, const RECT *src_rec for(x = src_rect->left; x < src_rect->right; x++) { src_val = *src_pixel++; - *dst_pixel++ = rgb_lookup_colortable(dst, + *dst_pixel++ = rgb_lookup_colortable(&lookup_ctx, get_field(src_val, src->red_shift, src->red_len), get_field(src_val, src->green_shift, src->green_len), get_field(src_val, src->blue_shift, src->blue_len)); @@ -3582,13 +3605,14 @@ static void convert_to_8(dib_info *dst, const dib_info *src, const RECT *src_rec { BYTE *src_start = get_pixel_ptr_24(src, src_rect->left, src_rect->top), *src_pixel;
+ rgb_lookup_colortable_init(dst, &lookup_ctx); for(y = src_rect->top; y < src_rect->bottom; y++) { dst_pixel = dst_start; src_pixel = src_start; for(x = src_rect->left; x < src_rect->right; x++, src_pixel += 3) { - *dst_pixel++ = rgb_lookup_colortable(dst, src_pixel[2], src_pixel[1], src_pixel[0] ); + *dst_pixel++ = rgb_lookup_colortable(&lookup_ctx, src_pixel[2], src_pixel[1], src_pixel[0] ); } if(pad_size) memset(dst_pixel, 0, pad_size); dst_start += dst->stride; @@ -3600,6 +3624,7 @@ static void convert_to_8(dib_info *dst, const dib_info *src, const RECT *src_rec case 16: { WORD *src_start = get_pixel_ptr_16(src, src_rect->left, src_rect->top), *src_pixel; + rgb_lookup_colortable_init(dst, &lookup_ctx); if(src->funcs == &funcs_555) { for(y = src_rect->top; y < src_rect->bottom; y++) @@ -3609,7 +3634,7 @@ static void convert_to_8(dib_info *dst, const dib_info *src, const RECT *src_rec for(x = src_rect->left; x < src_rect->right; x++) { src_val = *src_pixel++; - *dst_pixel++ = rgb_lookup_colortable(dst, + *dst_pixel++ = rgb_lookup_colortable(&lookup_ctx, ((src_val >> 7) & 0xf8) | ((src_val >> 12) & 0x07), ((src_val >> 2) & 0xf8) | ((src_val >> 7) & 0x07), ((src_val << 3) & 0xf8) | ((src_val >> 2) & 0x07) ); @@ -3628,7 +3653,7 @@ static void convert_to_8(dib_info *dst, const dib_info *src, const RECT *src_rec for(x = src_rect->left; x < src_rect->right; x++) { src_val = *src_pixel++; - *dst_pixel++ = rgb_lookup_colortable(dst, + *dst_pixel++ = rgb_lookup_colortable(&lookup_ctx, (((src_val >> src->red_shift) << 3) & 0xf8) | (((src_val >> src->red_shift) >> 2) & 0x07), (((src_val >> src->green_shift) << 3) & 0xf8) | @@ -3650,7 +3675,7 @@ static void convert_to_8(dib_info *dst, const dib_info *src, const RECT *src_rec for(x = src_rect->left; x < src_rect->right; x++) { src_val = *src_pixel++; - *dst_pixel++ = rgb_lookup_colortable(dst, + *dst_pixel++ = rgb_lookup_colortable(&lookup_ctx, (((src_val >> src->red_shift) << 3) & 0xf8) | (((src_val >> src->red_shift) >> 2) & 0x07), (((src_val >> src->green_shift) << 2) & 0xfc) | @@ -3672,7 +3697,7 @@ static void convert_to_8(dib_info *dst, const dib_info *src, const RECT *src_rec for(x = src_rect->left; x < src_rect->right; x++) { src_val = *src_pixel++; - *dst_pixel++ = rgb_lookup_colortable(dst, + *dst_pixel++ = rgb_lookup_colortable(&lookup_ctx, get_field(src_val, src->red_shift, src->red_len), get_field(src_val, src->green_shift, src->green_len), get_field(src_val, src->blue_shift, src->blue_len)); @@ -4807,8 +4832,10 @@ static void blend_rect_8(const dib_info *dst, const dib_info *src, const POINT * const struct clipped_rects *clipped_rects, BLENDFUNCTION blend) { const RGBQUAD *color_table = get_dib_color_table( dst ); + struct rgb_lookup_colortable_ctx lookup_ctx; int i, x, y;
+ rgb_lookup_colortable_init( dst, &lookup_ctx ); for (i = 0; i < clipped_rects->count; i++) { const RECT *rc = &clipped_rects->rects[i]; @@ -4821,7 +4848,7 @@ static void blend_rect_8(const dib_info *dst, const dib_info *src, const POINT * { RGBQUAD rgb = color_table[dst_ptr[x]]; DWORD val = blend_rgb( rgb.rgbRed, rgb.rgbGreen, rgb.rgbBlue, src_ptr[x], blend ); - dst_ptr[x] = rgb_lookup_colortable( dst, val >> 16, val >> 8, val ); + dst_ptr[x] = rgb_lookup_colortable( &lookup_ctx, val >> 16, val >> 8, val ); } } } @@ -4831,8 +4858,10 @@ static void blend_rect_4(const dib_info *dst, const dib_info *src, const POINT * const struct clipped_rects *clipped_rects, BLENDFUNCTION blend) { const RGBQUAD *color_table = get_dib_color_table( dst ); + struct rgb_lookup_colortable_ctx lookup_ctx; int i, j, x, y;
+ rgb_lookup_colortable_init( dst, &lookup_ctx ); for (i = 0; i < clipped_rects->count; i++) { const RECT *rc = &clipped_rects->rects[i]; @@ -4846,7 +4875,7 @@ static void blend_rect_4(const dib_info *dst, const dib_info *src, const POINT * DWORD val = ((x & 1) ? dst_ptr[x / 2] : (dst_ptr[x / 2] >> 4)) & 0x0f; RGBQUAD rgb = color_table[val]; val = blend_rgb( rgb.rgbRed, rgb.rgbGreen, rgb.rgbBlue, src_ptr[j], blend ); - val = rgb_lookup_colortable( dst, val >> 16, val >> 8, val ); + val = rgb_lookup_colortable( &lookup_ctx, val >> 16, val >> 8, val ); if (x & 1) dst_ptr[x / 2] = val | (dst_ptr[x / 2] & 0xf0); else
Usually, when there's performance issues with RGB into color table conversions, the same color table is used. So instead of generating it over and over, this caches it if it's frequently accessed (as should be the case when performance is an issue in the first place).
Wine-Bug: https://bugs.winehq.org/show_bug.cgi?id=50898 Signed-off-by: Gabriel Ivăncescu gabrielopcode@gmail.com --- dlls/gdi32/dibdrv/primitives.c | 89 +++++++++++++++++++++++++++++++++- 1 file changed, 88 insertions(+), 1 deletion(-)
diff --git a/dlls/gdi32/dibdrv/primitives.c b/dlls/gdi32/dibdrv/primitives.c index 8947c60..6071280 100644 --- a/dlls/gdi32/dibdrv/primitives.c +++ b/dlls/gdi32/dibdrv/primitives.c @@ -3511,12 +3511,99 @@ struct rgb_lookup_colortable_ctx
static void rgb_lookup_colortable_init(const dib_info *dib, struct rgb_lookup_colortable_ctx *ctx) { - unsigned r, g, b; + /* + Globally cache frequently generated maps, as most apps usually only use a few color tables often. + We keep the variables separated for better cache locality when looking them up (tables are large). + */ + static SRWLOCK global_cache_lock = SRWLOCK_INIT; + static volatile DWORD global_cache_sequence; + + /* ticks when the cached map was last accessed or generated; we discard the oldest first */ + static DWORD global_cache_ticks[4]; + + /* number of colors in the given cached color table */ + static unsigned global_cache_color_table_size[ARRAY_SIZE(global_cache_ticks)]; + + /* the actual cached color tables to lookup and see if they're cached */ + static RGBQUAD global_cache_color_table[ARRAY_SIZE(global_cache_ticks)][256]; + + /* the associated generated maps for the color tables above */ + static BYTE global_cache_map[ARRAY_SIZE(global_cache_ticks)][ARRAY_SIZE(ctx->map)]; + + + unsigned color_table_size = dib->color_table ? dib->color_table_size : 1 << dib->bit_count; + const RGBQUAD *color_table = get_dib_color_table(dib); + DWORD newest, oldest_entry, sequence; + unsigned i, r, g, b; + + /* check the global cache first */ + if (TryAcquireSRWLockShared(&global_cache_lock)) + { + newest = global_cache_ticks[0]; + for (i = 0; i < ARRAY_SIZE(global_cache_color_table_size); i++) + { + if (color_table_size == global_cache_color_table_size[i] && + !memcmp(color_table, global_cache_color_table[i], color_table_size * sizeof(color_table[0]))) + { + memcpy(ctx->map, global_cache_map[i], sizeof(ctx->map)); + + /* it's a shared lock, but the exact timestamp is unimportant, so it's OK */ + global_cache_ticks[i] = GetTickCount(); + + ReleaseSRWLockShared(&global_cache_lock); + return; + } + if ((LONG)(newest - global_cache_ticks[i]) < 0) + newest = global_cache_ticks[i]; + } + sequence = global_cache_sequence; + ReleaseSRWLockShared(&global_cache_lock); + } + else + { + newest = global_cache_ticks[0]; + sequence = global_cache_sequence - 1; /* make sure we re-check the cache */ + } +
for (b = 4; b < 256; b += 1 << 3) for (g = 4; g < 256; g += 1 << 3) for (r = 4; r < 256; r += 1 << 3) ctx->map[r >> 3 | (g & ~7) << 2 | (b & ~7) << 7] = rgb_to_pixel_colortable(dib, r, g, b); + + + /* update the global cache if needed */ + AcquireSRWLockExclusive(&global_cache_lock); + + /* if the cache was updated underneath us, check it again */ + if (sequence != global_cache_sequence) + { + for (i = 0; i < ARRAY_SIZE(global_cache_color_table_size); i++) + { + if ((LONG)(newest - global_cache_ticks[i]) <= 0 && + color_table_size == global_cache_color_table_size[i] && + !memcmp(color_table, global_cache_color_table[i], color_table_size * sizeof(color_table[0]))) + { + global_cache_ticks[i] = GetTickCount(); + ReleaseSRWLockExclusive(&global_cache_lock); + return; + } + } + } + global_cache_sequence++; + + oldest_entry = 0; + for (i = 1; i < ARRAY_SIZE(global_cache_color_table_size); i++) + if ((LONG)(global_cache_ticks[oldest_entry] - global_cache_ticks[i]) > 0) + oldest_entry = i; + + memcpy(global_cache_color_table[oldest_entry], color_table, color_table_size * sizeof(color_table[0])); + memcpy(global_cache_map[oldest_entry], ctx->map, sizeof(ctx->map)); + + global_cache_color_table_size[oldest_entry] = color_table_size; + global_cache_ticks[oldest_entry] = GetTickCount(); + + ReleaseSRWLockExclusive(&global_cache_lock); }
static inline BYTE rgb_lookup_colortable(const struct rgb_lookup_colortable_ctx *ctx, BYTE r, BYTE g, BYTE b)
On Fri, Apr 02, 2021 at 03:54:02PM +0300, Gabriel Ivăncescu wrote:
Usually, when there's performance issues with RGB into color table conversions, the same color table is used. So instead of generating it over and over, this caches it if it's frequently accessed (as should be the case when performance is an issue in the first place).
Wine-Bug: https://bugs.winehq.org/show_bug.cgi?id=50898 Signed-off-by: Gabriel Ivăncescu gabrielopcode@gmail.com
dlls/gdi32/dibdrv/primitives.c | 89 +++++++++++++++++++++++++++++++++- 1 file changed, 88 insertions(+), 1 deletion(-)
diff --git a/dlls/gdi32/dibdrv/primitives.c b/dlls/gdi32/dibdrv/primitives.c index 8947c60..6071280 100644 --- a/dlls/gdi32/dibdrv/primitives.c +++ b/dlls/gdi32/dibdrv/primitives.c @@ -3511,12 +3511,99 @@ struct rgb_lookup_colortable_ctx
static void rgb_lookup_colortable_init(const dib_info *dib, struct rgb_lookup_colortable_ctx *ctx) {
- unsigned r, g, b;
/*
Globally cache frequently generated maps, as most apps usually only use a few color tables often.
We keep the variables separated for better cache locality when looking them up (tables are large).
*/
static SRWLOCK global_cache_lock = SRWLOCK_INIT;
static volatile DWORD global_cache_sequence;
/* ticks when the cached map was last accessed or generated; we discard the oldest first */
static DWORD global_cache_ticks[4];
/* number of colors in the given cached color table */
static unsigned global_cache_color_table_size[ARRAY_SIZE(global_cache_ticks)];
/* the actual cached color tables to lookup and see if they're cached */
static RGBQUAD global_cache_color_table[ARRAY_SIZE(global_cache_ticks)][256];
/* the associated generated maps for the color tables above */
static BYTE global_cache_map[ARRAY_SIZE(global_cache_ticks)][ARRAY_SIZE(ctx->map)];
unsigned color_table_size = dib->color_table ? dib->color_table_size : 1 << dib->bit_count;
const RGBQUAD *color_table = get_dib_color_table(dib);
DWORD newest, oldest_entry, sequence;
unsigned i, r, g, b;
/* check the global cache first */
if (TryAcquireSRWLockShared(&global_cache_lock))
{
newest = global_cache_ticks[0];
for (i = 0; i < ARRAY_SIZE(global_cache_color_table_size); i++)
{
if (color_table_size == global_cache_color_table_size[i] &&
!memcmp(color_table, global_cache_color_table[i], color_table_size * sizeof(color_table[0])))
{
memcpy(ctx->map, global_cache_map[i], sizeof(ctx->map));
/* it's a shared lock, but the exact timestamp is unimportant, so it's OK */
global_cache_ticks[i] = GetTickCount();
ReleaseSRWLockShared(&global_cache_lock);
return;
}
if ((LONG)(newest - global_cache_ticks[i]) < 0)
newest = global_cache_ticks[i];
}
sequence = global_cache_sequence;
ReleaseSRWLockShared(&global_cache_lock);
}
else
{
newest = global_cache_ticks[0];
sequence = global_cache_sequence - 1; /* make sure we re-check the cache */
}
for (b = 4; b < 256; b += 1 << 3) for (g = 4; g < 256; g += 1 << 3) for (r = 4; r < 256; r += 1 << 3) ctx->map[r >> 3 | (g & ~7) << 2 | (b & ~7) << 7] = rgb_to_pixel_colortable(dib, r, g, b);
/* update the global cache if needed */
AcquireSRWLockExclusive(&global_cache_lock);
/* if the cache was updated underneath us, check it again */
if (sequence != global_cache_sequence)
{
for (i = 0; i < ARRAY_SIZE(global_cache_color_table_size); i++)
{
if ((LONG)(newest - global_cache_ticks[i]) <= 0 &&
color_table_size == global_cache_color_table_size[i] &&
!memcmp(color_table, global_cache_color_table[i], color_table_size * sizeof(color_table[0])))
{
global_cache_ticks[i] = GetTickCount();
ReleaseSRWLockExclusive(&global_cache_lock);
return;
}
}
}
global_cache_sequence++;
oldest_entry = 0;
for (i = 1; i < ARRAY_SIZE(global_cache_color_table_size); i++)
if ((LONG)(global_cache_ticks[oldest_entry] - global_cache_ticks[i]) > 0)
oldest_entry = i;
memcpy(global_cache_color_table[oldest_entry], color_table, color_table_size * sizeof(color_table[0]));
memcpy(global_cache_map[oldest_entry], ctx->map, sizeof(ctx->map));
global_cache_color_table_size[oldest_entry] = color_table_size;
global_cache_ticks[oldest_entry] = GetTickCount();
ReleaseSRWLockExclusive(&global_cache_lock);
}
Yeah, I'm not convinced, not even close. You're going to have to provide some pretty good evidence, with real world apps, that this is necessary over a local, lazy-init solution.
Huw.
On 07/04/2021 12:03, Huw Davies wrote:
On Fri, Apr 02, 2021 at 03:54:02PM +0300, Gabriel Ivăncescu wrote:
Usually, when there's performance issues with RGB into color table conversions, the same color table is used. So instead of generating it over and over, this caches it if it's frequently accessed (as should be the case when performance is an issue in the first place).
Wine-Bug: https://bugs.winehq.org/show_bug.cgi?id=50898 Signed-off-by: Gabriel Ivăncescu gabrielopcode@gmail.com
dlls/gdi32/dibdrv/primitives.c | 89 +++++++++++++++++++++++++++++++++- 1 file changed, 88 insertions(+), 1 deletion(-)
diff --git a/dlls/gdi32/dibdrv/primitives.c b/dlls/gdi32/dibdrv/primitives.c index 8947c60..6071280 100644 --- a/dlls/gdi32/dibdrv/primitives.c +++ b/dlls/gdi32/dibdrv/primitives.c @@ -3511,12 +3511,99 @@ struct rgb_lookup_colortable_ctx
static void rgb_lookup_colortable_init(const dib_info *dib, struct rgb_lookup_colortable_ctx *ctx) {
- unsigned r, g, b;
/*
Globally cache frequently generated maps, as most apps usually only use a few color tables often.
We keep the variables separated for better cache locality when looking them up (tables are large).
*/
static SRWLOCK global_cache_lock = SRWLOCK_INIT;
static volatile DWORD global_cache_sequence;
/* ticks when the cached map was last accessed or generated; we discard the oldest first */
static DWORD global_cache_ticks[4];
/* number of colors in the given cached color table */
static unsigned global_cache_color_table_size[ARRAY_SIZE(global_cache_ticks)];
/* the actual cached color tables to lookup and see if they're cached */
static RGBQUAD global_cache_color_table[ARRAY_SIZE(global_cache_ticks)][256];
/* the associated generated maps for the color tables above */
static BYTE global_cache_map[ARRAY_SIZE(global_cache_ticks)][ARRAY_SIZE(ctx->map)];
unsigned color_table_size = dib->color_table ? dib->color_table_size : 1 << dib->bit_count;
const RGBQUAD *color_table = get_dib_color_table(dib);
DWORD newest, oldest_entry, sequence;
unsigned i, r, g, b;
/* check the global cache first */
if (TryAcquireSRWLockShared(&global_cache_lock))
{
newest = global_cache_ticks[0];
for (i = 0; i < ARRAY_SIZE(global_cache_color_table_size); i++)
{
if (color_table_size == global_cache_color_table_size[i] &&
!memcmp(color_table, global_cache_color_table[i], color_table_size * sizeof(color_table[0])))
{
memcpy(ctx->map, global_cache_map[i], sizeof(ctx->map));
/* it's a shared lock, but the exact timestamp is unimportant, so it's OK */
global_cache_ticks[i] = GetTickCount();
ReleaseSRWLockShared(&global_cache_lock);
return;
}
if ((LONG)(newest - global_cache_ticks[i]) < 0)
newest = global_cache_ticks[i];
}
sequence = global_cache_sequence;
ReleaseSRWLockShared(&global_cache_lock);
}
else
{
newest = global_cache_ticks[0];
sequence = global_cache_sequence - 1; /* make sure we re-check the cache */
}
for (b = 4; b < 256; b += 1 << 3) for (g = 4; g < 256; g += 1 << 3) for (r = 4; r < 256; r += 1 << 3) ctx->map[r >> 3 | (g & ~7) << 2 | (b & ~7) << 7] = rgb_to_pixel_colortable(dib, r, g, b);
/* update the global cache if needed */
AcquireSRWLockExclusive(&global_cache_lock);
/* if the cache was updated underneath us, check it again */
if (sequence != global_cache_sequence)
{
for (i = 0; i < ARRAY_SIZE(global_cache_color_table_size); i++)
{
if ((LONG)(newest - global_cache_ticks[i]) <= 0 &&
color_table_size == global_cache_color_table_size[i] &&
!memcmp(color_table, global_cache_color_table[i], color_table_size * sizeof(color_table[0])))
{
global_cache_ticks[i] = GetTickCount();
ReleaseSRWLockExclusive(&global_cache_lock);
return;
}
}
}
global_cache_sequence++;
oldest_entry = 0;
for (i = 1; i < ARRAY_SIZE(global_cache_color_table_size); i++)
if ((LONG)(global_cache_ticks[oldest_entry] - global_cache_ticks[i]) > 0)
oldest_entry = i;
memcpy(global_cache_color_table[oldest_entry], color_table, color_table_size * sizeof(color_table[0]));
memcpy(global_cache_map[oldest_entry], ctx->map, sizeof(ctx->map));
global_cache_color_table_size[oldest_entry] = color_table_size;
global_cache_ticks[oldest_entry] = GetTickCount();
ReleaseSRWLockExclusive(&global_cache_lock); }
Yeah, I'm not convinced, not even close. You're going to have to provide some pretty good evidence, with real world apps, that this is necessary over a local, lazy-init solution.
Huw.
Well a simple real world test is in the bug report with Winamp's Credits, while it's not that important, it's easy enough to test compared to, say, games.
When I tested on Windows it does seem to cache color tables between calls because it was just way too fast, unless they have some voodoo algorithm to generate the color table extremely quickly on each call.
Of course it depends on your CPU's performance, but this patch makes it use very little CPU so definitely it's massively faster than other methods.
By the way, I revised the table generation algorithm, it's correct now and simplified, and only about ~32% slower on average with 256 colors. I attached the diff here on top of these patches so you can have a look if you prefer that.
What it does is basically it simply goes through all 6 directions at an entire offset and checks distance on all. If none were filled in a particular direction, it drops it from further searches (that's the main benefit).
It doesn't mean it will go along with these patches of course, it's just to give an idea and why it can be extended easily. If you want I can easily drop the global cache for now and just send this patch instead—which is still a good improvement.
While it does provide a massive performance benefit by itself and allows Winamp to show the Credits at full speed (just like Windows), it still uses ~80% of my CPU Core to do it, while Windows uses only ~10-20% (and the global cache does as well, that's why I suspect that of Windows).
However, I'm perfectly fine with the idea of dropping the global cache patch, if you think just this patch would be enough of an improvement. I mean, we could always extend the global cache later, maybe in better ways.
But at least it doesn't "lock us" into it like the lazy init approach would, since then the cache would become way more complicated no matter what shape it would have. For example, we'd have to store the initialized bits, copy them from the cache (so we don't keep the lock while we process the image), do the lazy init processing, then reacquire the lock and merge the results (the underlying cache could've changed as well, and we have to merge new lazy inited entries in the cache). It sounds way too complicated and not worth it to go with it, that's why I dislike the lazy init approach.
What do you think?
On Wed, Apr 07, 2021 at 04:34:15PM +0300, Gabriel Ivăncescu wrote:
On 07/04/2021 12:03, Huw Davies wrote:
Yeah, I'm not convinced, not even close. You're going to have to provide some pretty good evidence, with real world apps, that this is necessary over a local, lazy-init solution.
Well a simple real world test is in the bug report with Winamp's Credits, while it's not that important, it's easy enough to test compared to, say, games.
When I tested on Windows it does seem to cache color tables between calls because it was just way too fast, unless they have some voodoo algorithm to generate the color table extremely quickly on each call.
A possible algorithm[1] has been suggested (twice). To make progress here it would be a good idea to try that.
Huw. [1] a local lazy-init.
On 08/04/2021 10:31, Huw Davies wrote:
On Wed, Apr 07, 2021 at 04:34:15PM +0300, Gabriel Ivăncescu wrote:
On 07/04/2021 12:03, Huw Davies wrote:
Yeah, I'm not convinced, not even close. You're going to have to provide some pretty good evidence, with real world apps, that this is necessary over a local, lazy-init solution.
Well a simple real world test is in the bug report with Winamp's Credits, while it's not that important, it's easy enough to test compared to, say, games.
When I tested on Windows it does seem to cache color tables between calls because it was just way too fast, unless they have some voodoo algorithm to generate the color table extremely quickly on each call.
A possible algorithm[1] has been suggested (twice). To make progress here it would be a good idea to try that.
Huw. [1] a local lazy-init.
Well I guess my persuasion failed. I didn't do extensive tests, but the lazy init seems to still use a lot of CPU when the framerate is high (with the Credits tab). It's not particularly a problem since my other patch has similar performance without the global cache, but...
I still think this will be a "dead end" and this code will never see further improvement due to complicating the global cache even more (and otherwise we'd have to throw the lazy init away, which nullifies the point). And it's unfortunate that Windows still does it faster, and has a cache for sure.
But anyway I'll send the lazy init way then, since at least they're still better than what we have currently...