This is to prepare so we don't recalculate the lookup cache map for color tables on every clipped rect (which is expensive).
Signed-off-by: Gabriel Ivăncescu gabrielopcode@gmail.com ---
This is a no-op patch, it's needed for next patch.
dlls/gdi32/dibdrv/bitblt.c | 11 +- dlls/gdi32/dibdrv/dibdrv.h | 18 +-- dlls/gdi32/dibdrv/primitives.c | 284 +++++++++++++++++++-------------- 3 files changed, 174 insertions(+), 139 deletions(-)
diff --git a/dlls/gdi32/dibdrv/bitblt.c b/dlls/gdi32/dibdrv/bitblt.c index 8f67535..590ba08 100644 --- a/dlls/gdi32/dibdrv/bitblt.c +++ b/dlls/gdi32/dibdrv/bitblt.c @@ -584,17 +584,12 @@ static void mask_rect( dib_info *dst, const RECT *dst_rect, const dib_info *src, static DWORD blend_rect( dib_info *dst, const RECT *dst_rect, const dib_info *src, const RECT *src_rect, HRGN clip, BLENDFUNCTION blend ) { - POINT origin; struct clipped_rects clipped_rects; - int i;
if (!get_clipped_rects( dst, dst_rect, clip, &clipped_rects )) return ERROR_SUCCESS; - for (i = 0; i < clipped_rects.count; i++) - { - origin.x = src_rect->left + clipped_rects.rects[i].left - dst_rect->left; - origin.y = src_rect->top + clipped_rects.rects[i].top - dst_rect->top; - dst->funcs->blend_rect( dst, &clipped_rects.rects[i], src, &origin, blend ); - } + + dst->funcs->blend_rect( dst, src, src_rect->left - dst_rect->left, src_rect->top - dst_rect->top, &clipped_rects, blend ); + free_clipped_rects( &clipped_rects ); return ERROR_SUCCESS; } diff --git a/dlls/gdi32/dibdrv/dibdrv.h b/dlls/gdi32/dibdrv/dibdrv.h index 88b4c62..2a69625 100644 --- a/dlls/gdi32/dibdrv/dibdrv.h +++ b/dlls/gdi32/dibdrv/dibdrv.h @@ -165,6 +165,13 @@ static inline dibdrv_physdev *get_dibdrv_pdev( PHYSDEV dev ) return (dibdrv_physdev *)dev; }
+struct clipped_rects +{ + RECT *rects; + int count; + RECT buffer[32]; +}; + struct line_params { int err_start, err_add_1, err_add_2, bias; @@ -189,8 +196,8 @@ typedef struct primitive_funcs const dib_info *brush, const rop_mask_bits *bits); void (* copy_rect)(const dib_info *dst, const RECT *rc, const dib_info *src, const POINT *origin, int rop2, int overlap); - void (* blend_rect)(const dib_info *dst, const RECT *rc, const dib_info *src, - const POINT *origin, BLENDFUNCTION blend); + void (* blend_rect)(const dib_info *dst, const dib_info *src, LONG diff_x, LONG diff_y, + const struct clipped_rects *clipped_rects, BLENDFUNCTION blend); BOOL (* gradient_rect)(const dib_info *dib, const RECT *rc, const TRIVERTEX *v, int mode); void (* mask_rect)(const dib_info *dst, const RECT *rc, const dib_info *src, const POINT *origin, int rop2); @@ -240,13 +247,6 @@ typedef struct DWORD octant; } bres_params;
-struct clipped_rects -{ - RECT *rects; - int count; - RECT buffer[32]; -}; - extern void get_rop_codes(INT rop, struct rop_codes *codes) DECLSPEC_HIDDEN; extern void reset_dash_origin(dibdrv_physdev *pdev) DECLSPEC_HIDDEN; extern void init_dib_info_from_bitmapinfo(dib_info *dib, const BITMAPINFO *info, void *bits) DECLSPEC_HIDDEN; diff --git a/dlls/gdi32/dibdrv/primitives.c b/dlls/gdi32/dibdrv/primitives.c index 01a1c7c..569baae 100644 --- a/dlls/gdi32/dibdrv/primitives.c +++ b/dlls/gdi32/dibdrv/primitives.c @@ -4651,199 +4651,239 @@ static inline DWORD blend_rgb( BYTE dst_r, BYTE dst_g, BYTE dst_b, DWORD src, BL blend_color( dst_r, src >> 16, blend.SourceConstantAlpha ) << 16); }
-static void blend_rect_8888(const dib_info *dst, const RECT *rc, - const dib_info *src, const POINT *origin, BLENDFUNCTION blend) +static void blend_rect_8888(const dib_info *dst, const dib_info *src, LONG diff_x, LONG diff_y, + const struct clipped_rects *clipped_rects, BLENDFUNCTION blend) { - DWORD *src_ptr = get_pixel_ptr_32( src, origin->x, origin->y ); - DWORD *dst_ptr = get_pixel_ptr_32( dst, rc->left, rc->top ); - int x, y; + int i, x, y;
- if (blend.AlphaFormat & AC_SRC_ALPHA) + for (i = 0; i < clipped_rects->count; i++) { - if (blend.SourceConstantAlpha == 255) - for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride / 4, src_ptr += src->stride / 4) - for (x = 0; x < rc->right - rc->left; x++) - dst_ptr[x] = blend_argb( dst_ptr[x], src_ptr[x] ); + const RECT *rc = &clipped_rects->rects[i]; + DWORD *src_ptr = get_pixel_ptr_32( src, rc->left + diff_x, rc->top + diff_y ); + DWORD *dst_ptr = get_pixel_ptr_32( dst, rc->left, rc->top ); + + if (blend.AlphaFormat & AC_SRC_ALPHA) + { + if (blend.SourceConstantAlpha == 255) + for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride / 4, src_ptr += src->stride / 4) + for (x = 0; x < rc->right - rc->left; x++) + dst_ptr[x] = blend_argb( dst_ptr[x], src_ptr[x] ); + else + for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride / 4, src_ptr += src->stride / 4) + for (x = 0; x < rc->right - rc->left; x++) + dst_ptr[x] = blend_argb_alpha( dst_ptr[x], src_ptr[x], blend.SourceConstantAlpha ); + } + else if (src->compression == BI_RGB) + for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride / 4, src_ptr += src->stride / 4) + for (x = 0; x < rc->right - rc->left; x++) + dst_ptr[x] = blend_argb_constant_alpha( dst_ptr[x], src_ptr[x], blend.SourceConstantAlpha ); else - for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride / 4, src_ptr += src->stride / 4) - for (x = 0; x < rc->right - rc->left; x++) - dst_ptr[x] = blend_argb_alpha( dst_ptr[x], src_ptr[x], blend.SourceConstantAlpha ); - } - else if (src->compression == BI_RGB) - for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride / 4, src_ptr += src->stride / 4) - for (x = 0; x < rc->right - rc->left; x++) - dst_ptr[x] = blend_argb_constant_alpha( dst_ptr[x], src_ptr[x], blend.SourceConstantAlpha ); - else - for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride / 4, src_ptr += src->stride / 4) - for (x = 0; x < rc->right - rc->left; x++) - dst_ptr[x] = blend_argb_no_src_alpha( dst_ptr[x], src_ptr[x], blend.SourceConstantAlpha ); + for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride / 4, src_ptr += src->stride / 4) + for (x = 0; x < rc->right - rc->left; x++) + dst_ptr[x] = blend_argb_no_src_alpha( dst_ptr[x], src_ptr[x], blend.SourceConstantAlpha ); + } }
-static void blend_rect_32(const dib_info *dst, const RECT *rc, - const dib_info *src, const POINT *origin, BLENDFUNCTION blend) +static void blend_rect_32(const dib_info *dst, const dib_info *src, LONG diff_x, LONG diff_y, + const struct clipped_rects *clipped_rects, BLENDFUNCTION blend) { - DWORD *src_ptr = get_pixel_ptr_32( src, origin->x, origin->y ); - DWORD *dst_ptr = get_pixel_ptr_32( dst, rc->left, rc->top ); - int x, y; + int i, x, y;
- if (dst->red_len == 8 && dst->green_len == 8 && dst->blue_len == 8) + for (i = 0; i < clipped_rects->count; i++) { - for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride / 4, src_ptr += src->stride / 4) + const RECT *rc = &clipped_rects->rects[i]; + DWORD *src_ptr = get_pixel_ptr_32( src, rc->left + diff_x, rc->top + diff_y ); + DWORD *dst_ptr = get_pixel_ptr_32( dst, rc->left, rc->top ); + + if (dst->red_len == 8 && dst->green_len == 8 && dst->blue_len == 8) { - for (x = 0; x < rc->right - rc->left; x++) + for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride / 4, src_ptr += src->stride / 4) { - DWORD val = blend_rgb( dst_ptr[x] >> dst->red_shift, - dst_ptr[x] >> dst->green_shift, - dst_ptr[x] >> dst->blue_shift, - src_ptr[x], blend ); - dst_ptr[x] = ((( val & 0xff) << dst->blue_shift) | - (((val >> 8) & 0xff) << dst->green_shift) | - (((val >> 16) & 0xff) << dst->red_shift)); + for (x = 0; x < rc->right - rc->left; x++) + { + DWORD val = blend_rgb( dst_ptr[x] >> dst->red_shift, + dst_ptr[x] >> dst->green_shift, + dst_ptr[x] >> dst->blue_shift, + src_ptr[x], blend ); + dst_ptr[x] = ((( val & 0xff) << dst->blue_shift) | + (((val >> 8) & 0xff) << dst->green_shift) | + (((val >> 16) & 0xff) << dst->red_shift)); + } } } - } - else - { - for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride / 4, src_ptr += src->stride / 4) + else { - for (x = 0; x < rc->right - rc->left; x++) + for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride / 4, src_ptr += src->stride / 4) { - DWORD val = blend_rgb( get_field( dst_ptr[x], dst->red_shift, dst->red_len ), - get_field( dst_ptr[x], dst->green_shift, dst->green_len ), - get_field( dst_ptr[x], dst->blue_shift, dst->blue_len ), - src_ptr[x], blend ); - dst_ptr[x] = rgb_to_pixel_masks( dst, val >> 16, val >> 8, val ); + for (x = 0; x < rc->right - rc->left; x++) + { + DWORD val = blend_rgb( get_field( dst_ptr[x], dst->red_shift, dst->red_len ), + get_field( dst_ptr[x], dst->green_shift, dst->green_len ), + get_field( dst_ptr[x], dst->blue_shift, dst->blue_len ), + src_ptr[x], blend ); + dst_ptr[x] = rgb_to_pixel_masks( dst, val >> 16, val >> 8, val ); + } } } } }
-static void blend_rect_24(const dib_info *dst, const RECT *rc, - const dib_info *src, const POINT *origin, BLENDFUNCTION blend) +static void blend_rect_24(const dib_info *dst, const dib_info *src, LONG diff_x, LONG diff_y, + const struct clipped_rects *clipped_rects, BLENDFUNCTION blend) { - DWORD *src_ptr = get_pixel_ptr_32( src, origin->x, origin->y ); - BYTE *dst_ptr = get_pixel_ptr_24( dst, rc->left, rc->top ); - int x, y; + int i, x, y;
- for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride, src_ptr += src->stride / 4) + for (i = 0; i < clipped_rects->count; i++) { - for (x = 0; x < rc->right - rc->left; x++) + const RECT *rc = &clipped_rects->rects[i]; + DWORD *src_ptr = get_pixel_ptr_32( src, rc->left + diff_x, rc->top + diff_y ); + BYTE *dst_ptr = get_pixel_ptr_24( dst, rc->left, rc->top ); + + for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride, src_ptr += src->stride / 4) { - DWORD val = blend_rgb( dst_ptr[x * 3 + 2], dst_ptr[x * 3 + 1], dst_ptr[x * 3], - src_ptr[x], blend ); - dst_ptr[x * 3] = val; - dst_ptr[x * 3 + 1] = val >> 8; - dst_ptr[x * 3 + 2] = val >> 16; + for (x = 0; x < rc->right - rc->left; x++) + { + DWORD val = blend_rgb( dst_ptr[x * 3 + 2], dst_ptr[x * 3 + 1], dst_ptr[x * 3], + src_ptr[x], blend ); + dst_ptr[x * 3] = val; + dst_ptr[x * 3 + 1] = val >> 8; + dst_ptr[x * 3 + 2] = val >> 16; + } } } }
-static void blend_rect_555(const dib_info *dst, const RECT *rc, - const dib_info *src, const POINT *origin, BLENDFUNCTION blend) +static void blend_rect_555(const dib_info *dst, const dib_info *src, LONG diff_x, LONG diff_y, + const struct clipped_rects *clipped_rects, BLENDFUNCTION blend) { - DWORD *src_ptr = get_pixel_ptr_32( src, origin->x, origin->y ); - WORD *dst_ptr = get_pixel_ptr_16( dst, rc->left, rc->top ); - int x, y; + int i, x, y;
- for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride / 2, src_ptr += src->stride / 4) + for (i = 0; i < clipped_rects->count; i++) { - for (x = 0; x < rc->right - rc->left; x++) + const RECT *rc = &clipped_rects->rects[i]; + DWORD *src_ptr = get_pixel_ptr_32( src, rc->left + diff_x, rc->top + diff_y ); + WORD *dst_ptr = get_pixel_ptr_16( dst, rc->left, rc->top ); + + for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride / 2, src_ptr += src->stride / 4) { - DWORD val = blend_rgb( ((dst_ptr[x] >> 7) & 0xf8) | ((dst_ptr[x] >> 12) & 0x07), - ((dst_ptr[x] >> 2) & 0xf8) | ((dst_ptr[x] >> 7) & 0x07), - ((dst_ptr[x] << 3) & 0xf8) | ((dst_ptr[x] >> 2) & 0x07), - src_ptr[x], blend ); - dst_ptr[x] = ((val >> 9) & 0x7c00) | ((val >> 6) & 0x03e0) | ((val >> 3) & 0x001f); + for (x = 0; x < rc->right - rc->left; x++) + { + DWORD val = blend_rgb( ((dst_ptr[x] >> 7) & 0xf8) | ((dst_ptr[x] >> 12) & 0x07), + ((dst_ptr[x] >> 2) & 0xf8) | ((dst_ptr[x] >> 7) & 0x07), + ((dst_ptr[x] << 3) & 0xf8) | ((dst_ptr[x] >> 2) & 0x07), + src_ptr[x], blend ); + dst_ptr[x] = ((val >> 9) & 0x7c00) | ((val >> 6) & 0x03e0) | ((val >> 3) & 0x001f); + } } } }
-static void blend_rect_16(const dib_info *dst, const RECT *rc, - const dib_info *src, const POINT *origin, BLENDFUNCTION blend) +static void blend_rect_16(const dib_info *dst, const dib_info *src, LONG diff_x, LONG diff_y, + const struct clipped_rects *clipped_rects, BLENDFUNCTION blend) { - DWORD *src_ptr = get_pixel_ptr_32( src, origin->x, origin->y ); - WORD *dst_ptr = get_pixel_ptr_16( dst, rc->left, rc->top ); - int x, y; + int i, x, y;
- for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride / 2, src_ptr += src->stride / 4) + for (i = 0; i < clipped_rects->count; i++) { - for (x = 0; x < rc->right - rc->left; x++) + const RECT *rc = &clipped_rects->rects[i]; + DWORD *src_ptr = get_pixel_ptr_32( src, rc->left + diff_x, rc->top + diff_y ); + WORD *dst_ptr = get_pixel_ptr_16( dst, rc->left, rc->top ); + + for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride / 2, src_ptr += src->stride / 4) { - DWORD val = blend_rgb( get_field( dst_ptr[x], dst->red_shift, dst->red_len ), - get_field( dst_ptr[x], dst->green_shift, dst->green_len ), - get_field( dst_ptr[x], dst->blue_shift, dst->blue_len ), - src_ptr[x], blend ); - dst_ptr[x] = rgb_to_pixel_masks( dst, val >> 16, val >> 8, val ); + for (x = 0; x < rc->right - rc->left; x++) + { + DWORD val = blend_rgb( get_field( dst_ptr[x], dst->red_shift, dst->red_len ), + get_field( dst_ptr[x], dst->green_shift, dst->green_len ), + get_field( dst_ptr[x], dst->blue_shift, dst->blue_len ), + src_ptr[x], blend ); + dst_ptr[x] = rgb_to_pixel_masks( dst, val >> 16, val >> 8, val ); + } } } }
-static void blend_rect_8(const dib_info *dst, const RECT *rc, - const dib_info *src, const POINT *origin, BLENDFUNCTION blend) +static void blend_rect_8(const dib_info *dst, const dib_info *src, LONG diff_x, LONG diff_y, + const struct clipped_rects *clipped_rects, BLENDFUNCTION blend) { const RGBQUAD *color_table = get_dib_color_table( dst ); - DWORD *src_ptr = get_pixel_ptr_32( src, origin->x, origin->y ); - BYTE *dst_ptr = get_pixel_ptr_8( dst, rc->left, rc->top ); - int x, y; + int i, x, y;
- for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride, src_ptr += src->stride / 4) + for (i = 0; i < clipped_rects->count; i++) { - for (x = 0; x < rc->right - rc->left; x++) + const RECT *rc = &clipped_rects->rects[i]; + DWORD *src_ptr = get_pixel_ptr_32( src, rc->left + diff_x, rc->top + diff_y ); + BYTE *dst_ptr = get_pixel_ptr_8( dst, rc->left, rc->top ); + + for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride, src_ptr += src->stride / 4) { - RGBQUAD rgb = color_table[dst_ptr[x]]; - DWORD val = blend_rgb( rgb.rgbRed, rgb.rgbGreen, rgb.rgbBlue, src_ptr[x], blend ); - dst_ptr[x] = rgb_lookup_colortable( dst, val >> 16, val >> 8, val ); + for (x = 0; x < rc->right - rc->left; x++) + { + RGBQUAD rgb = color_table[dst_ptr[x]]; + DWORD val = blend_rgb( rgb.rgbRed, rgb.rgbGreen, rgb.rgbBlue, src_ptr[x], blend ); + dst_ptr[x] = rgb_lookup_colortable( dst, val >> 16, val >> 8, val ); + } } } }
-static void blend_rect_4(const dib_info *dst, const RECT *rc, - const dib_info *src, const POINT *origin, BLENDFUNCTION blend) +static void blend_rect_4(const dib_info *dst, const dib_info *src, LONG diff_x, LONG diff_y, + const struct clipped_rects *clipped_rects, BLENDFUNCTION blend) { const RGBQUAD *color_table = get_dib_color_table( dst ); - DWORD *src_ptr = get_pixel_ptr_32( src, origin->x, origin->y ); - BYTE *dst_ptr = get_pixel_ptr_4( dst, rc->left, rc->top ); - int i, x, y; + int i, j, x, y;
- for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride, src_ptr += src->stride / 4) + for (i = 0; i < clipped_rects->count; i++) { - for (i = 0, x = (dst->rect.left + rc->left) & 1; i < rc->right - rc->left; i++, x++) + const RECT *rc = &clipped_rects->rects[i]; + DWORD *src_ptr = get_pixel_ptr_32( src, rc->left + diff_x, rc->top + diff_y ); + BYTE *dst_ptr = get_pixel_ptr_4( dst, rc->left, rc->top ); + + for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride, src_ptr += src->stride / 4) { - DWORD val = ((x & 1) ? dst_ptr[x / 2] : (dst_ptr[x / 2] >> 4)) & 0x0f; - RGBQUAD rgb = color_table[val]; - val = blend_rgb( rgb.rgbRed, rgb.rgbGreen, rgb.rgbBlue, src_ptr[i], blend ); - val = rgb_lookup_colortable( dst, val >> 16, val >> 8, val ); - if (x & 1) - dst_ptr[x / 2] = val | (dst_ptr[x / 2] & 0xf0); - else - dst_ptr[x / 2] = (val << 4) | (dst_ptr[x / 2] & 0x0f); + for (j = 0, x = (dst->rect.left + rc->left) & 1; j < rc->right - rc->left; j++, x++) + { + DWORD val = ((x & 1) ? dst_ptr[x / 2] : (dst_ptr[x / 2] >> 4)) & 0x0f; + RGBQUAD rgb = color_table[val]; + val = blend_rgb( rgb.rgbRed, rgb.rgbGreen, rgb.rgbBlue, src_ptr[j], blend ); + val = rgb_lookup_colortable( dst, val >> 16, val >> 8, val ); + if (x & 1) + dst_ptr[x / 2] = val | (dst_ptr[x / 2] & 0xf0); + else + dst_ptr[x / 2] = (val << 4) | (dst_ptr[x / 2] & 0x0f); + } } } }
-static void blend_rect_1(const dib_info *dst, const RECT *rc, - const dib_info *src, const POINT *origin, BLENDFUNCTION blend) +static void blend_rect_1(const dib_info *dst, const dib_info *src, LONG diff_x, LONG diff_y, + const struct clipped_rects *clipped_rects, BLENDFUNCTION blend) { const RGBQUAD *color_table = get_dib_color_table( dst ); - DWORD *src_ptr = get_pixel_ptr_32( src, origin->x, origin->y ); - BYTE *dst_ptr = get_pixel_ptr_1( dst, rc->left, rc->top ); - int i, x, y; + int i, j, x, y;
- for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride, src_ptr += src->stride / 4) + for (i = 0; i < clipped_rects->count; i++) { - for (i = 0, x = (dst->rect.left + rc->left) & 7; i < rc->right - rc->left; i++, x++) + const RECT *rc = &clipped_rects->rects[i]; + DWORD *src_ptr = get_pixel_ptr_32( src, rc->left + diff_x, rc->top + diff_y ); + BYTE *dst_ptr = get_pixel_ptr_1( dst, rc->left, rc->top ); + + for (y = rc->top; y < rc->bottom; y++, dst_ptr += dst->stride, src_ptr += src->stride / 4) { - DWORD val = (dst_ptr[x / 8] & pixel_masks_1[x % 8]) ? 1 : 0; - RGBQUAD rgb = color_table[val]; - val = blend_rgb( rgb.rgbRed, rgb.rgbGreen, rgb.rgbBlue, src_ptr[i], blend ); - val = rgb_to_pixel_colortable(dst, val >> 16, val >> 8, val) ? 0xff : 0; - dst_ptr[x / 8] = (dst_ptr[x / 8] & ~pixel_masks_1[x % 8]) | (val & pixel_masks_1[x % 8]); + for (j = 0, x = (dst->rect.left + rc->left) & 7; j < rc->right - rc->left; j++, x++) + { + DWORD val = (dst_ptr[x / 8] & pixel_masks_1[x % 8]) ? 1 : 0; + RGBQUAD rgb = color_table[val]; + val = blend_rgb( rgb.rgbRed, rgb.rgbGreen, rgb.rgbBlue, src_ptr[j], blend ); + val = rgb_to_pixel_colortable(dst, val >> 16, val >> 8, val) ? 0xff : 0; + dst_ptr[x / 8] = (dst_ptr[x / 8] & ~pixel_masks_1[x % 8]) | (val & pixel_masks_1[x % 8]); + } } } }
-static void blend_rect_null(const dib_info *dst, const RECT *rc, - const dib_info *src, const POINT *origin, BLENDFUNCTION blend) +static void blend_rect_null(const dib_info *dst, const dib_info *src, LONG diff_x, LONG diff_y, + const struct clipped_rects *clipped_rects, BLENDFUNCTION blend) { }
This vastly improves the performance. The cache generation is relatively constant in terms of algorithm complexity, around O(n) with some extra overhead for the entire cache (depending on the color table's colors, because they get "blocked" soon enough), which is fixed at 32768 entries. It scales well with large amount of colors in the color table. The lookup after that should be fast.
In contrast, the current method is O(N * M) where N is the amount of pixels and M is the number of colors in the table, which is very slow for larger M (especially 256 colors).
More detailed information in the bug report.
Wine-Bug: https://bugs.winehq.org/show_bug.cgi?id=50898 Signed-off-by: Gabriel Ivăncescu gabrielopcode@gmail.com --- dlls/gdi32/dibdrv/primitives.c | 272 +++++++++++++++++++++++++++++++-- 1 file changed, 256 insertions(+), 16 deletions(-)
diff --git a/dlls/gdi32/dibdrv/primitives.c b/dlls/gdi32/dibdrv/primitives.c index 569baae..cc86ed9 100644 --- a/dlls/gdi32/dibdrv/primitives.c +++ b/dlls/gdi32/dibdrv/primitives.c @@ -3497,22 +3497,255 @@ static void convert_to_16(dib_info *dst, const dib_info *src, const RECT *src_re } }
-static inline BOOL color_tables_match(const dib_info *d1, const dib_info *d2) +/* + * To lookup RGB values into nearest color in the color table, Windows uses 5-bits of the RGB + * at the "center" of the RGB cube, presumably to do a similar lookup cache. The lowest 3 bits + * of the color are thus set to halfway (0x04) and then it's used in the distance calculation + * to the exact color in the color table. We exploit this as well to create a lookup cache. + * + * Generating the table is done by going "outwards" from the center of each color in the table. + * First, we find out the center RGB cube spot into the map for each color in the color table. + * In case of conflict, we calculate the *true* distance, since color table is not quantized. + * Each color in the color table should then have an associated center in the lookup cache map. + * + * Next, we go "outwards" from each center by increasing an offset for the main axis. This is + * always expressed as an offset from the center, and starts from 1 and goes up. Each offset + * has multiple generations. A generation defines the distance from the center, and is always + * increasing. Thus when we're at a generation, we go through *all* colors in the color table + * first, fill the entire generation for all of them, then move to the next generation. Since + * the next generation will always have a larger distance than the current one, we know that + * once we filled all the previous generations, we're done with them and don't need to check. + * + * Note that an offset spans multiple generations, due to how distances work in 3 dimensions. + * For example, an offset of 1 is simple and has 3 generations. The examples below are all + * offsets from the center cube, e.g. (0,-1,1) means +1 for first axis, -1 for the second. + * + * First generation (offset = 1): + * ( 0, 0, 1) ( 0, 0,-1) ( 0, 1, 0) ( 0,-1, 0) ( 1, 0, 0) (-1, 0, 0) + * + * In the first generation, the distance is always 1. For each color in the color table, we + * go through all of the above displacements from the center, and fill the lookup cache map. + * + * Second generation (offset = 1 still, but distance is larger now; some are dups, that's OK): + * ( 0, 1, 1) ( 0,-1, 1) ( 1, 0, 1) (-1, 0, 1) ( 0, 1,-1) ( 0,-1,-1) ( 1, 0,-1) (-1, 0,-1) + * ( 0, 1, 1) ( 0, 1,-1) ( 1, 1, 0) (-1, 1, 0) ( 0,-1, 1) ( 0,-1,-1) ( 1,-1, 0) (-1,-1, 0) + * ( 1, 0, 1) ( 1, 0,-1) ( 1, 1, 0) ( 1,-1, 0) (-1, 0, 1) (-1, 0,-1) (-1, 1, 0) (-1,-1, 0) + * + * On the 2nd line, we swapped the main axis from (x,x,1) to (x,1,x), and on 3rd to (1,x,x). + * Note that the distance is always the same (sqrt(2)), and we just permute the axis values. + * + * Third generation (offset = 1 still, but distance is even larger: sqrt(3)): + * ( 1, 1, 1) ( 1,-1, 1) (-1, 1, 1) (-1,-1, 1) ( 1, 1,-1) ( 1,-1,-1) (-1, 1,-1) (-1,-1,-1) + * ... + * + * Only after all of the coordinate absolute values are equal to the offset do we increase it. + * When the offset is increased, the main axis' absolute value is increased, and the distance. + * For other offsets, such as offset = 3, generations start with zeros for the other axis, for + * example (0,0,3). The next generation will increase one axis by 1, e.g. (0,1,3) and (1,0,3) + * which are part of the same generation (same distance). The next generation will increase the + * third axis, e.g. (1,1,3). Then the second axis is raised again, and the process repeats for + * the third axis until it's equal to the second axis, e.g. (0,2,3)->(1,2,3)->(2,2,3). + * + * As each generation is increased, the lowest value in the axis is increased, and the other + * starts from zero again. Then they are swapped and permuted. This is due to how the distance + * is calculated in 3 dimensions. For each generation we fill *each* color in the color table's + * displacement in all directions, since they have the same distance. + * + * Lastly, we keep track of each of the six directions (in 3 dimensions) and if one direction + * was not filled at all in the current offset, we mark it as "blocked" and we no longer check + * for it the next offset. When all six directions are blocked, the entry is no longer checked. + * + * The lookup cache map is completely filled when *all* color table entries are blocked. +*/ +struct rgb_lookup_colortable_ctx { - if (!d1->color_table || !d2->color_table) return (!d1->color_table && !d2->color_table); - return !memcmp(d1->color_table, d2->color_table, (1 << d1->bit_count) * sizeof(d1->color_table[0])); + /* lookup table indexed by 5-bit-per-color RGB into a color table */ + BYTE map[32 * 32 * 32]; +}; + +static inline BOOL rgb_lookup_colortable_set_pos(struct rgb_lookup_colortable_ctx *ctx, unsigned pos, + const RGBQUAD *color_table, unsigned index, BYTE *map_set_bits) +{ + int dr1, dg1, db1, dr2, dg2, db2, pos_r, pos_g, pos_b; + + if (pos >= ARRAY_SIZE(ctx->map)) return FALSE; + + if (map_set_bits[pos / 8] & (1 << pos % 8)) + { + /* because we loop from low idx to high, a color table index larger than ours means + it's part of a former generation, so it can't possibly have a larger distance */ + if (index <= ctx->map[pos]) return FALSE; + + /* check the *true* distance since the color table is not quantized to 5 bits */ + pos_r = ((pos & 0x001f) << 3) | 4; + pos_g = ((pos & 0x03e0) >> 2) | 4; + pos_b = ((pos & 0x7c00) >> 7) | 4; + dr1 = pos_r - color_table[index].rgbRed; + dg1 = pos_g - color_table[index].rgbGreen; + db1 = pos_b - color_table[index].rgbBlue; + dr2 = pos_r - color_table[ctx->map[pos]].rgbRed; + dg2 = pos_g - color_table[ctx->map[pos]].rgbGreen; + db2 = pos_b - color_table[ctx->map[pos]].rgbBlue; + + if (dr1*dr1 + dg1*dg1 + db1*db1 >= dr2*dr2 + dg2*dg2 + db2*db2) + return FALSE; + } + else + map_set_bits[pos / 8] |= 1 << pos % 8; + + ctx->map[pos] = index; + return TRUE; }
-static inline DWORD rgb_lookup_colortable(const dib_info *dst, BYTE r, BYTE g, BYTE b) +static void rgb_lookup_colortable_init(const dib_info *dib, struct rgb_lookup_colortable_ctx *ctx) { - /* Windows reduces precision to 5 bits, probably in order to build some sort of lookup cache */ - return rgb_to_pixel_colortable( dst, (r & ~7) + 4, (g & ~7) + 4, (b & ~7) + 4 ); + BYTE indices[256], available_directions[256], tmp_directions[256], map_set_bits[ARRAY_SIZE(ctx->map) / 8]; + unsigned color_table_size = dib->color_table ? dib->color_table_size : 1 << dib->bit_count; + const RGBQUAD *color_table = get_dib_color_table(dib); + unsigned idx, offset, num_entries; + int i, j; + + /* Testing shows that for low amount of colors, the overhead is larger than the + O(N*M) algorithm, presumably due to branch prediction and cache locality, but + it gets quickly out of hand (up to 5x slower) for larger amount of colors... */ + if (color_table_size <= 40) + { + unsigned r, g, b; + + for (b = 4; b < 256; b += 1 << 3) + for (g = 4; g < 256; g += 1 << 3) + for (r = 4; r < 256; r += 1 << 3) + ctx->map[r >> 3 | (g & ~7) << 2 | (b & ~7) << 7] = rgb_to_pixel_colortable(dib, r, g, b); + return; + } + + memset(map_set_bits, 0, sizeof(map_set_bits)); + memset(available_directions, 0xff, color_table_size); + memset(tmp_directions, 0, color_table_size); + + /* indirect list of valid color table indices, as we remove those fully surrounded */ + for (idx = 0; idx < color_table_size; idx++) + indices[idx] = idx; + + /* first, fill the centers (offset = 0) of each quantized table color in the map */ + for (idx = 0; idx < color_table_size; idx++) + { + int dr1, dg1, db1, dr2, dg2, db2, pos_r, pos_g, pos_b; + unsigned pos = (color_table[idx].rgbRed >> 3) | + (color_table[idx].rgbGreen & ~7) << 2 | + (color_table[idx].rgbBlue & ~7) << 7; + + if (map_set_bits[pos / 8] & (1 << pos % 8)) + { + pos_r = (color_table[idx].rgbRed & ~7) | 4; + pos_g = (color_table[idx].rgbGreen & ~7) | 4; + pos_b = (color_table[idx].rgbBlue & ~7) | 4; + dr1 = pos_r - color_table[idx].rgbRed; + dg1 = pos_g - color_table[idx].rgbGreen; + db1 = pos_b - color_table[idx].rgbBlue; + dr2 = pos_r - color_table[ctx->map[pos]].rgbRed; + dg2 = pos_g - color_table[ctx->map[pos]].rgbGreen; + db2 = pos_b - color_table[ctx->map[pos]].rgbBlue; + + if (dr1*dr1 + dg1*dg1 + db1*db1 >= dr2*dr2 + dg2*dg2 + db2*db2) + continue; + } + else + map_set_bits[pos / 8] |= 1 << pos % 8; + + ctx->map[pos] = idx; + } + + /* now do the rest */ + for (offset = 1, num_entries = color_table_size; num_entries != 0; offset++) + { + for (i = 0; i <= offset; i++) + { + for (j = 0; j <= i; j++) + { + /* we're at one generation now, go through each color in the color table */ + for (idx = 0; idx < num_entries; idx++) + { + unsigned direction_mask = 1, color_table_index = indices[idx]; + unsigned shift2 = 5, shift3 = 10; + int center, main_axis = offset; + + center = (color_table[color_table_index].rgbRed >> 3) | + (color_table[color_table_index].rgbGreen & ~7) << 2 | + (color_table[color_table_index].rgbBlue & ~7) << 7; + do + { + do + { + if (available_directions[idx] & direction_mask) + { + BOOL direction_blocked = TRUE; + do + { + unsigned shift_tmp; + do + { + do + { + unsigned pos = center + main_axis + (i << shift2) + (j << shift3); + + if (rgb_lookup_colortable_set_pos(ctx, pos, color_table, color_table_index, map_set_bits)) + direction_blocked = FALSE; + j = -j; + } while (j < 0); + i = -i; + } while (i < 0); + + /* repeat once and swap the two non-main axis */ + shift_tmp = shift2; shift2 = shift3; shift3 = shift_tmp; + } while (shift2 > shift3 && i != j); + + if (!direction_blocked) + tmp_directions[idx] |= direction_mask; + } + direction_mask <<= 1; + main_axis = -main_axis; + } while (main_axis < 0); + + /* change main to next axis */ + shift2 = 0; + shift3 = (main_axis < 32) ? 10 : 5; + main_axis <<= 5; + } while (main_axis < 32768); + } + } + } + + for (i = 0, idx = 0; idx < num_entries; idx++) + { + /* discard this color table index from further searches if it's fully surrounded */ + if (!tmp_directions[idx]) continue; + + available_directions[i] = tmp_directions[idx]; + tmp_directions[i] = 0; + + indices[i++] = indices[idx]; + } + num_entries = i; + } +} + +static inline BYTE rgb_lookup_colortable(const struct rgb_lookup_colortable_ctx *ctx, BYTE r, BYTE g, BYTE b) +{ + return ctx->map[(r >> 3) | (g & ~7) << 2 | (b & ~7) << 7]; +} + +static inline BOOL color_tables_match(const dib_info *d1, const dib_info *d2) +{ + if (!d1->color_table || !d2->color_table) return (!d1->color_table && !d2->color_table); + return !memcmp(d1->color_table, d2->color_table, (1 << d1->bit_count) * sizeof(d1->color_table[0])); }
static void convert_to_8(dib_info *dst, const dib_info *src, const RECT *src_rect, BOOL dither) { BYTE *dst_start = get_pixel_ptr_8(dst, 0, 0), *dst_pixel; INT x, y, pad_size = ((dst->width + 3) & ~3) - (src_rect->right - src_rect->left); + struct rgb_lookup_colortable_ctx lookup_ctx; DWORD src_val;
switch(src->bit_count) @@ -3521,6 +3754,7 @@ static void convert_to_8(dib_info *dst, const dib_info *src, const RECT *src_rec { DWORD *src_start = get_pixel_ptr_32(src, src_rect->left, src_rect->top), *src_pixel;
+ rgb_lookup_colortable_init(dst, &lookup_ctx); if(src->funcs == &funcs_8888) { for(y = src_rect->top; y < src_rect->bottom; y++) @@ -3530,7 +3764,7 @@ static void convert_to_8(dib_info *dst, const dib_info *src, const RECT *src_rec for(x = src_rect->left; x < src_rect->right; x++) { src_val = *src_pixel++; - *dst_pixel++ = rgb_lookup_colortable(dst, src_val >> 16, src_val >> 8, src_val ); + *dst_pixel++ = rgb_lookup_colortable(&lookup_ctx, src_val >> 16, src_val >> 8, src_val ); } if(pad_size) memset(dst_pixel, 0, pad_size); dst_start += dst->stride; @@ -3546,7 +3780,7 @@ static void convert_to_8(dib_info *dst, const dib_info *src, const RECT *src_rec for(x = src_rect->left; x < src_rect->right; x++) { src_val = *src_pixel++; - *dst_pixel++ = rgb_lookup_colortable(dst, + *dst_pixel++ = rgb_lookup_colortable(&lookup_ctx, src_val >> src->red_shift, src_val >> src->green_shift, src_val >> src->blue_shift ); @@ -3565,7 +3799,7 @@ static void convert_to_8(dib_info *dst, const dib_info *src, const RECT *src_rec for(x = src_rect->left; x < src_rect->right; x++) { src_val = *src_pixel++; - *dst_pixel++ = rgb_lookup_colortable(dst, + *dst_pixel++ = rgb_lookup_colortable(&lookup_ctx, get_field(src_val, src->red_shift, src->red_len), get_field(src_val, src->green_shift, src->green_len), get_field(src_val, src->blue_shift, src->blue_len)); @@ -3582,13 +3816,14 @@ static void convert_to_8(dib_info *dst, const dib_info *src, const RECT *src_rec { BYTE *src_start = get_pixel_ptr_24(src, src_rect->left, src_rect->top), *src_pixel;
+ rgb_lookup_colortable_init(dst, &lookup_ctx); for(y = src_rect->top; y < src_rect->bottom; y++) { dst_pixel = dst_start; src_pixel = src_start; for(x = src_rect->left; x < src_rect->right; x++, src_pixel += 3) { - *dst_pixel++ = rgb_lookup_colortable(dst, src_pixel[2], src_pixel[1], src_pixel[0] ); + *dst_pixel++ = rgb_lookup_colortable(&lookup_ctx, src_pixel[2], src_pixel[1], src_pixel[0] ); } if(pad_size) memset(dst_pixel, 0, pad_size); dst_start += dst->stride; @@ -3600,6 +3835,7 @@ static void convert_to_8(dib_info *dst, const dib_info *src, const RECT *src_rec case 16: { WORD *src_start = get_pixel_ptr_16(src, src_rect->left, src_rect->top), *src_pixel; + rgb_lookup_colortable_init(dst, &lookup_ctx); if(src->funcs == &funcs_555) { for(y = src_rect->top; y < src_rect->bottom; y++) @@ -3609,7 +3845,7 @@ static void convert_to_8(dib_info *dst, const dib_info *src, const RECT *src_rec for(x = src_rect->left; x < src_rect->right; x++) { src_val = *src_pixel++; - *dst_pixel++ = rgb_lookup_colortable(dst, + *dst_pixel++ = rgb_lookup_colortable(&lookup_ctx, ((src_val >> 7) & 0xf8) | ((src_val >> 12) & 0x07), ((src_val >> 2) & 0xf8) | ((src_val >> 7) & 0x07), ((src_val << 3) & 0xf8) | ((src_val >> 2) & 0x07) ); @@ -3628,7 +3864,7 @@ static void convert_to_8(dib_info *dst, const dib_info *src, const RECT *src_rec for(x = src_rect->left; x < src_rect->right; x++) { src_val = *src_pixel++; - *dst_pixel++ = rgb_lookup_colortable(dst, + *dst_pixel++ = rgb_lookup_colortable(&lookup_ctx, (((src_val >> src->red_shift) << 3) & 0xf8) | (((src_val >> src->red_shift) >> 2) & 0x07), (((src_val >> src->green_shift) << 3) & 0xf8) | @@ -3650,7 +3886,7 @@ static void convert_to_8(dib_info *dst, const dib_info *src, const RECT *src_rec for(x = src_rect->left; x < src_rect->right; x++) { src_val = *src_pixel++; - *dst_pixel++ = rgb_lookup_colortable(dst, + *dst_pixel++ = rgb_lookup_colortable(&lookup_ctx, (((src_val >> src->red_shift) << 3) & 0xf8) | (((src_val >> src->red_shift) >> 2) & 0x07), (((src_val >> src->green_shift) << 2) & 0xfc) | @@ -3672,7 +3908,7 @@ static void convert_to_8(dib_info *dst, const dib_info *src, const RECT *src_rec for(x = src_rect->left; x < src_rect->right; x++) { src_val = *src_pixel++; - *dst_pixel++ = rgb_lookup_colortable(dst, + *dst_pixel++ = rgb_lookup_colortable(&lookup_ctx, get_field(src_val, src->red_shift, src->red_len), get_field(src_val, src->green_shift, src->green_len), get_field(src_val, src->blue_shift, src->blue_len)); @@ -4807,8 +5043,10 @@ static void blend_rect_8(const dib_info *dst, const dib_info *src, LONG diff_x, const struct clipped_rects *clipped_rects, BLENDFUNCTION blend) { const RGBQUAD *color_table = get_dib_color_table( dst ); + struct rgb_lookup_colortable_ctx lookup_ctx; int i, x, y;
+ rgb_lookup_colortable_init( dst, &lookup_ctx ); for (i = 0; i < clipped_rects->count; i++) { const RECT *rc = &clipped_rects->rects[i]; @@ -4821,7 +5059,7 @@ static void blend_rect_8(const dib_info *dst, const dib_info *src, LONG diff_x, { RGBQUAD rgb = color_table[dst_ptr[x]]; DWORD val = blend_rgb( rgb.rgbRed, rgb.rgbGreen, rgb.rgbBlue, src_ptr[x], blend ); - dst_ptr[x] = rgb_lookup_colortable( dst, val >> 16, val >> 8, val ); + dst_ptr[x] = rgb_lookup_colortable( &lookup_ctx, val >> 16, val >> 8, val ); } } } @@ -4831,8 +5069,10 @@ static void blend_rect_4(const dib_info *dst, const dib_info *src, LONG diff_x, const struct clipped_rects *clipped_rects, BLENDFUNCTION blend) { const RGBQUAD *color_table = get_dib_color_table( dst ); + struct rgb_lookup_colortable_ctx lookup_ctx; int i, j, x, y;
+ rgb_lookup_colortable_init( dst, &lookup_ctx ); for (i = 0; i < clipped_rects->count; i++) { const RECT *rc = &clipped_rects->rects[i]; @@ -4846,7 +5086,7 @@ static void blend_rect_4(const dib_info *dst, const dib_info *src, LONG diff_x, DWORD val = ((x & 1) ? dst_ptr[x / 2] : (dst_ptr[x / 2] >> 4)) & 0x0f; RGBQUAD rgb = color_table[val]; val = blend_rgb( rgb.rgbRed, rgb.rgbGreen, rgb.rgbBlue, src_ptr[j], blend ); - val = rgb_lookup_colortable( dst, val >> 16, val >> 8, val ); + val = rgb_lookup_colortable( &lookup_ctx, val >> 16, val >> 8, val ); if (x & 1) dst_ptr[x / 2] = val | (dst_ptr[x / 2] & 0xf0); else
On Wed, Mar 31, 2021 at 03:35:58PM +0300, Gabriel Ivăncescu wrote:
This vastly improves the performance. The cache generation is relatively constant in terms of algorithm complexity, around O(n) with some extra overhead for the entire cache (depending on the color table's colors, because they get "blocked" soon enough), which is fixed at 32768 entries. It scales well with large amount of colors in the color table. The lookup after that should be fast.
In contrast, the current method is O(N * M) where N is the amount of pixels and M is the number of colors in the table, which is very slow for larger M (especially 256 colors).
More detailed information in the bug report.
Wine-Bug: https://bugs.winehq.org/show_bug.cgi?id=50898 Signed-off-by: Gabriel Ivăncescu gabrielopcode@gmail.com
dlls/gdi32/dibdrv/primitives.c | 272 +++++++++++++++++++++++++++++++-- 1 file changed, 256 insertions(+), 16 deletions(-)
diff --git a/dlls/gdi32/dibdrv/primitives.c b/dlls/gdi32/dibdrv/primitives.c index 569baae..cc86ed9 100644 --- a/dlls/gdi32/dibdrv/primitives.c +++ b/dlls/gdi32/dibdrv/primitives.c @@ -3497,22 +3497,255 @@ static void convert_to_16(dib_info *dst, const dib_info *src, const RECT *src_re } }
-static inline BOOL color_tables_match(const dib_info *d1, const dib_info *d2) +/*
- To lookup RGB values into nearest color in the color table, Windows uses 5-bits of the RGB
- at the "center" of the RGB cube, presumably to do a similar lookup cache. The lowest 3 bits
- of the color are thus set to halfway (0x04) and then it's used in the distance calculation
- to the exact color in the color table. We exploit this as well to create a lookup cache.
- Generating the table is done by going "outwards" from the center of each color in the table.
- First, we find out the center RGB cube spot into the map for each color in the color table.
- In case of conflict, we calculate the *true* distance, since color table is not quantized.
- Each color in the color table should then have an associated center in the lookup cache map.
- Next, we go "outwards" from each center by increasing an offset for the main axis. This is
- always expressed as an offset from the center, and starts from 1 and goes up. Each offset
- has multiple generations. A generation defines the distance from the center, and is always
- increasing. Thus when we're at a generation, we go through *all* colors in the color table
- first, fill the entire generation for all of them, then move to the next generation. Since
- the next generation will always have a larger distance than the current one, we know that
- once we filled all the previous generations, we're done with them and don't need to check.
- Note that an offset spans multiple generations, due to how distances work in 3 dimensions.
- For example, an offset of 1 is simple and has 3 generations. The examples below are all
- offsets from the center cube, e.g. (0,-1,1) means +1 for first axis, -1 for the second.
- First generation (offset = 1):
- ( 0, 0, 1) ( 0, 0,-1) ( 0, 1, 0) ( 0,-1, 0) ( 1, 0, 0) (-1, 0, 0)
- In the first generation, the distance is always 1. For each color in the color table, we
- go through all of the above displacements from the center, and fill the lookup cache map.
- Second generation (offset = 1 still, but distance is larger now; some are dups, that's OK):
- ( 0, 1, 1) ( 0,-1, 1) ( 1, 0, 1) (-1, 0, 1) ( 0, 1,-1) ( 0,-1,-1) ( 1, 0,-1) (-1, 0,-1)
- ( 0, 1, 1) ( 0, 1,-1) ( 1, 1, 0) (-1, 1, 0) ( 0,-1, 1) ( 0,-1,-1) ( 1,-1, 0) (-1,-1, 0)
- ( 1, 0, 1) ( 1, 0,-1) ( 1, 1, 0) ( 1,-1, 0) (-1, 0, 1) (-1, 0,-1) (-1, 1, 0) (-1,-1, 0)
- On the 2nd line, we swapped the main axis from (x,x,1) to (x,1,x), and on 3rd to (1,x,x).
- Note that the distance is always the same (sqrt(2)), and we just permute the axis values.
- Third generation (offset = 1 still, but distance is even larger: sqrt(3)):
- ( 1, 1, 1) ( 1,-1, 1) (-1, 1, 1) (-1,-1, 1) ( 1, 1,-1) ( 1,-1,-1) (-1, 1,-1) (-1,-1,-1)
- ...
- Only after all of the coordinate absolute values are equal to the offset do we increase it.
- When the offset is increased, the main axis' absolute value is increased, and the distance.
- For other offsets, such as offset = 3, generations start with zeros for the other axis, for
- example (0,0,3). The next generation will increase one axis by 1, e.g. (0,1,3) and (1,0,3)
- which are part of the same generation (same distance). The next generation will increase the
- third axis, e.g. (1,1,3). Then the second axis is raised again, and the process repeats for
- the third axis until it's equal to the second axis, e.g. (0,2,3)->(1,2,3)->(2,2,3).
- As each generation is increased, the lowest value in the axis is increased, and the other
- starts from zero again. Then they are swapped and permuted. This is due to how the distance
- is calculated in 3 dimensions. For each generation we fill *each* color in the color table's
- displacement in all directions, since they have the same distance.
- Lastly, we keep track of each of the six directions (in 3 dimensions) and if one direction
- was not filled at all in the current offset, we mark it as "blocked" and we no longer check
- for it the next offset. When all six directions are blocked, the entry is no longer checked.
- The lookup cache map is completely filled when *all* color table entries are blocked.
+*/
Perhaps I'm misunderstanding your algorithm, but doesn't this end up with (2,2,2) coming before (3,0,0)?
+static void rgb_lookup_colortable_init(const dib_info *dib, struct rgb_lookup_colortable_ctx *ctx) {
- /* Windows reduces precision to 5 bits, probably in order to build some sort of lookup cache */
- return rgb_to_pixel_colortable( dst, (r & ~7) + 4, (g & ~7) + 4, (b & ~7) + 4 );
- BYTE indices[256], available_directions[256], tmp_directions[256], map_set_bits[ARRAY_SIZE(ctx->map) / 8];
- unsigned color_table_size = dib->color_table ? dib->color_table_size : 1 << dib->bit_count;
- const RGBQUAD *color_table = get_dib_color_table(dib);
- unsigned idx, offset, num_entries;
- int i, j;
- /* Testing shows that for low amount of colors, the overhead is larger than the
O(N*M) algorithm, presumably due to branch prediction and cache locality, but
it gets quickly out of hand (up to 5x slower) for larger amount of colors... */
- if (color_table_size <= 40)
- {
As a side note, there are going to be a vanishingly small number of colour tables with sizes [17,255], so if you need to special case, special case for sizes <= 16.
unsigned r, g, b;
for (b = 4; b < 256; b += 1 << 3)
for (g = 4; g < 256; g += 1 << 3)
for (r = 4; r < 256; r += 1 << 3)
ctx->map[r >> 3 | (g & ~7) << 2 | (b & ~7) << 7] = rgb_to_pixel_colortable(dib, r, g, b);
return;
- }
- memset(map_set_bits, 0, sizeof(map_set_bits));
- memset(available_directions, 0xff, color_table_size);
- memset(tmp_directions, 0, color_table_size);
- /* indirect list of valid color table indices, as we remove those fully surrounded */
- for (idx = 0; idx < color_table_size; idx++)
indices[idx] = idx;
- /* first, fill the centers (offset = 0) of each quantized table color in the map */
- for (idx = 0; idx < color_table_size; idx++)
- {
int dr1, dg1, db1, dr2, dg2, db2, pos_r, pos_g, pos_b;
unsigned pos = (color_table[idx].rgbRed >> 3) |
(color_table[idx].rgbGreen & ~7) << 2 |
(color_table[idx].rgbBlue & ~7) << 7;
if (map_set_bits[pos / 8] & (1 << pos % 8))
{
pos_r = (color_table[idx].rgbRed & ~7) | 4;
pos_g = (color_table[idx].rgbGreen & ~7) | 4;
pos_b = (color_table[idx].rgbBlue & ~7) | 4;
dr1 = pos_r - color_table[idx].rgbRed;
dg1 = pos_g - color_table[idx].rgbGreen;
db1 = pos_b - color_table[idx].rgbBlue;
dr2 = pos_r - color_table[ctx->map[pos]].rgbRed;
dg2 = pos_g - color_table[ctx->map[pos]].rgbGreen;
db2 = pos_b - color_table[ctx->map[pos]].rgbBlue;
if (dr1*dr1 + dg1*dg1 + db1*db1 >= dr2*dr2 + dg2*dg2 + db2*db2)
continue;
}
else
map_set_bits[pos / 8] |= 1 << pos % 8;
ctx->map[pos] = idx;
- }
- /* now do the rest */
- for (offset = 1, num_entries = color_table_size; num_entries != 0; offset++)
- {
for (i = 0; i <= offset; i++)
{
for (j = 0; j <= i; j++)
{
/* we're at one generation now, go through each color in the color table */
for (idx = 0; idx < num_entries; idx++)
{
unsigned direction_mask = 1, color_table_index = indices[idx];
unsigned shift2 = 5, shift3 = 10;
int center, main_axis = offset;
center = (color_table[color_table_index].rgbRed >> 3) |
(color_table[color_table_index].rgbGreen & ~7) << 2 |
(color_table[color_table_index].rgbBlue & ~7) << 7;
do
{
do
{
if (available_directions[idx] & direction_mask)
{
BOOL direction_blocked = TRUE;
do
{
unsigned shift_tmp;
do
{
do
{
unsigned pos = center + main_axis + (i << shift2) + (j << shift3);
if (rgb_lookup_colortable_set_pos(ctx, pos, color_table, color_table_index, map_set_bits))
direction_blocked = FALSE;
j = -j;
} while (j < 0);
i = -i;
} while (i < 0);
/* repeat once and swap the two non-main axis */
shift_tmp = shift2; shift2 = shift3; shift3 = shift_tmp;
} while (shift2 > shift3 && i != j);
if (!direction_blocked)
tmp_directions[idx] |= direction_mask;
}
direction_mask <<= 1;
main_axis = -main_axis;
} while (main_axis < 0);
/* change main to next axis */
shift2 = 0;
shift3 = (main_axis < 32) ? 10 : 5;
main_axis <<= 5;
} while (main_axis < 32768);
}
}
}
I suspect things might become clearer if one were to unwind some of these loops.
Did you consider using a lazy initialization of the lookup table instead?
Huw.
Hi Huw,
Thanks for the review!
On 01/04/2021 10:52, Huw Davies wrote:
On Wed, Mar 31, 2021 at 03:35:58PM +0300, Gabriel Ivăncescu wrote:
This vastly improves the performance. The cache generation is relatively constant in terms of algorithm complexity, around O(n) with some extra overhead for the entire cache (depending on the color table's colors, because they get "blocked" soon enough), which is fixed at 32768 entries. It scales well with large amount of colors in the color table. The lookup after that should be fast.
In contrast, the current method is O(N * M) where N is the amount of pixels and M is the number of colors in the table, which is very slow for larger M (especially 256 colors).
More detailed information in the bug report.
Wine-Bug: https://bugs.winehq.org/show_bug.cgi?id=50898 Signed-off-by: Gabriel Ivăncescu gabrielopcode@gmail.com
dlls/gdi32/dibdrv/primitives.c | 272 +++++++++++++++++++++++++++++++-- 1 file changed, 256 insertions(+), 16 deletions(-)
diff --git a/dlls/gdi32/dibdrv/primitives.c b/dlls/gdi32/dibdrv/primitives.c index 569baae..cc86ed9 100644 --- a/dlls/gdi32/dibdrv/primitives.c +++ b/dlls/gdi32/dibdrv/primitives.c @@ -3497,22 +3497,255 @@ static void convert_to_16(dib_info *dst, const dib_info *src, const RECT *src_re } }
-static inline BOOL color_tables_match(const dib_info *d1, const dib_info *d2) +/*
- To lookup RGB values into nearest color in the color table, Windows uses 5-bits of the RGB
- at the "center" of the RGB cube, presumably to do a similar lookup cache. The lowest 3 bits
- of the color are thus set to halfway (0x04) and then it's used in the distance calculation
- to the exact color in the color table. We exploit this as well to create a lookup cache.
- Generating the table is done by going "outwards" from the center of each color in the table.
- First, we find out the center RGB cube spot into the map for each color in the color table.
- In case of conflict, we calculate the *true* distance, since color table is not quantized.
- Each color in the color table should then have an associated center in the lookup cache map.
- Next, we go "outwards" from each center by increasing an offset for the main axis. This is
- always expressed as an offset from the center, and starts from 1 and goes up. Each offset
- has multiple generations. A generation defines the distance from the center, and is always
- increasing. Thus when we're at a generation, we go through *all* colors in the color table
- first, fill the entire generation for all of them, then move to the next generation. Since
- the next generation will always have a larger distance than the current one, we know that
- once we filled all the previous generations, we're done with them and don't need to check.
- Note that an offset spans multiple generations, due to how distances work in 3 dimensions.
- For example, an offset of 1 is simple and has 3 generations. The examples below are all
- offsets from the center cube, e.g. (0,-1,1) means +1 for first axis, -1 for the second.
- First generation (offset = 1):
- ( 0, 0, 1) ( 0, 0,-1) ( 0, 1, 0) ( 0,-1, 0) ( 1, 0, 0) (-1, 0, 0)
- In the first generation, the distance is always 1. For each color in the color table, we
- go through all of the above displacements from the center, and fill the lookup cache map.
- Second generation (offset = 1 still, but distance is larger now; some are dups, that's OK):
- ( 0, 1, 1) ( 0,-1, 1) ( 1, 0, 1) (-1, 0, 1) ( 0, 1,-1) ( 0,-1,-1) ( 1, 0,-1) (-1, 0,-1)
- ( 0, 1, 1) ( 0, 1,-1) ( 1, 1, 0) (-1, 1, 0) ( 0,-1, 1) ( 0,-1,-1) ( 1,-1, 0) (-1,-1, 0)
- ( 1, 0, 1) ( 1, 0,-1) ( 1, 1, 0) ( 1,-1, 0) (-1, 0, 1) (-1, 0,-1) (-1, 1, 0) (-1,-1, 0)
- On the 2nd line, we swapped the main axis from (x,x,1) to (x,1,x), and on 3rd to (1,x,x).
- Note that the distance is always the same (sqrt(2)), and we just permute the axis values.
- Third generation (offset = 1 still, but distance is even larger: sqrt(3)):
- ( 1, 1, 1) ( 1,-1, 1) (-1, 1, 1) (-1,-1, 1) ( 1, 1,-1) ( 1,-1,-1) (-1, 1,-1) (-1,-1,-1)
- ...
- Only after all of the coordinate absolute values are equal to the offset do we increase it.
- When the offset is increased, the main axis' absolute value is increased, and the distance.
- For other offsets, such as offset = 3, generations start with zeros for the other axis, for
- example (0,0,3). The next generation will increase one axis by 1, e.g. (0,1,3) and (1,0,3)
- which are part of the same generation (same distance). The next generation will increase the
- third axis, e.g. (1,1,3). Then the second axis is raised again, and the process repeats for
- the third axis until it's equal to the second axis, e.g. (0,2,3)->(1,2,3)->(2,2,3).
- As each generation is increased, the lowest value in the axis is increased, and the other
- starts from zero again. Then they are swapped and permuted. This is due to how the distance
- is calculated in 3 dimensions. For each generation we fill *each* color in the color table's
- displacement in all directions, since they have the same distance.
- Lastly, we keep track of each of the six directions (in 3 dimensions) and if one direction
- was not filled at all in the current offset, we mark it as "blocked" and we no longer check
- for it the next offset. When all six directions are blocked, the entry is no longer checked.
- The lookup cache map is completely filled when *all* color table entries are blocked.
+*/
Perhaps I'm misunderstanding your algorithm, but doesn't this end up with (2,2,2) coming before (3,0,0)?
Actually yeah, you're right, I messed up somewhere when testing with the quantized cubes. I'll need to revamp this at some point, but for now I'll leave it aside. I'm thinking of going with something else easier (see below), if it's acceptable.
+static void rgb_lookup_colortable_init(const dib_info *dib, struct rgb_lookup_colortable_ctx *ctx) {
- /* Windows reduces precision to 5 bits, probably in order to build some sort of lookup cache */
- return rgb_to_pixel_colortable( dst, (r & ~7) + 4, (g & ~7) + 4, (b & ~7) + 4 );
- BYTE indices[256], available_directions[256], tmp_directions[256], map_set_bits[ARRAY_SIZE(ctx->map) / 8];
- unsigned color_table_size = dib->color_table ? dib->color_table_size : 1 << dib->bit_count;
- const RGBQUAD *color_table = get_dib_color_table(dib);
- unsigned idx, offset, num_entries;
- int i, j;
- /* Testing shows that for low amount of colors, the overhead is larger than the
O(N*M) algorithm, presumably due to branch prediction and cache locality, but
it gets quickly out of hand (up to 5x slower) for larger amount of colors... */
- if (color_table_size <= 40)
- {
As a side note, there are going to be a vanishingly small number of colour tables with sizes [17,255], so if you need to special case, special case for sizes <= 16.
unsigned r, g, b;
for (b = 4; b < 256; b += 1 << 3)
for (g = 4; g < 256; g += 1 << 3)
for (r = 4; r < 256; r += 1 << 3)
ctx->map[r >> 3 | (g & ~7) << 2 | (b & ~7) << 7] = rgb_to_pixel_colortable(dib, r, g, b);
return;
- }
- memset(map_set_bits, 0, sizeof(map_set_bits));
- memset(available_directions, 0xff, color_table_size);
- memset(tmp_directions, 0, color_table_size);
- /* indirect list of valid color table indices, as we remove those fully surrounded */
- for (idx = 0; idx < color_table_size; idx++)
indices[idx] = idx;
- /* first, fill the centers (offset = 0) of each quantized table color in the map */
- for (idx = 0; idx < color_table_size; idx++)
- {
int dr1, dg1, db1, dr2, dg2, db2, pos_r, pos_g, pos_b;
unsigned pos = (color_table[idx].rgbRed >> 3) |
(color_table[idx].rgbGreen & ~7) << 2 |
(color_table[idx].rgbBlue & ~7) << 7;
if (map_set_bits[pos / 8] & (1 << pos % 8))
{
pos_r = (color_table[idx].rgbRed & ~7) | 4;
pos_g = (color_table[idx].rgbGreen & ~7) | 4;
pos_b = (color_table[idx].rgbBlue & ~7) | 4;
dr1 = pos_r - color_table[idx].rgbRed;
dg1 = pos_g - color_table[idx].rgbGreen;
db1 = pos_b - color_table[idx].rgbBlue;
dr2 = pos_r - color_table[ctx->map[pos]].rgbRed;
dg2 = pos_g - color_table[ctx->map[pos]].rgbGreen;
db2 = pos_b - color_table[ctx->map[pos]].rgbBlue;
if (dr1*dr1 + dg1*dg1 + db1*db1 >= dr2*dr2 + dg2*dg2 + db2*db2)
continue;
}
else
map_set_bits[pos / 8] |= 1 << pos % 8;
ctx->map[pos] = idx;
- }
- /* now do the rest */
- for (offset = 1, num_entries = color_table_size; num_entries != 0; offset++)
- {
for (i = 0; i <= offset; i++)
{
for (j = 0; j <= i; j++)
{
/* we're at one generation now, go through each color in the color table */
for (idx = 0; idx < num_entries; idx++)
{
unsigned direction_mask = 1, color_table_index = indices[idx];
unsigned shift2 = 5, shift3 = 10;
int center, main_axis = offset;
center = (color_table[color_table_index].rgbRed >> 3) |
(color_table[color_table_index].rgbGreen & ~7) << 2 |
(color_table[color_table_index].rgbBlue & ~7) << 7;
do
{
do
{
if (available_directions[idx] & direction_mask)
{
BOOL direction_blocked = TRUE;
do
{
unsigned shift_tmp;
do
{
do
{
unsigned pos = center + main_axis + (i << shift2) + (j << shift3);
if (rgb_lookup_colortable_set_pos(ctx, pos, color_table, color_table_index, map_set_bits))
direction_blocked = FALSE;
j = -j;
} while (j < 0);
i = -i;
} while (i < 0);
/* repeat once and swap the two non-main axis */
shift_tmp = shift2; shift2 = shift3; shift3 = shift_tmp;
} while (shift2 > shift3 && i != j);
if (!direction_blocked)
tmp_directions[idx] |= direction_mask;
}
direction_mask <<= 1;
main_axis = -main_axis;
} while (main_axis < 0);
/* change main to next axis */
shift2 = 0;
shift3 = (main_axis < 32) ? 10 : 5;
main_axis <<= 5;
} while (main_axis < 32768);
}
}
}
I suspect things might become clearer if one were to unwind some of these loops.
Did you consider using a lazy initialization of the lookup table instead?
Huw.
Lazy initialization meaning build it as we scan the image? That would work too. I also thought of caching the color tables globally.
Now I don't know if that's acceptable, but here's my idea since it's what is good in "practice":
Hold a small fixed size global cache (4-5 entries) of color tables, with synchronization in case of multi-threading (just for correctness, I doubt any app will actually multi-thread this). If we go the lazy initialization approach, we'll also keep track which of the entries were not filled yet in the cache.
It would be a global table for all color tables, but since I expect all the performance problems to be using the same color table over and over again in a loop, it would work "in practice".
When we have to discard/replace one, we choose the oldest one that wasn't accessed (we keep some timestamp on each cache). Does that sound feasible?
Maybe if I end up using this global cache, I can discard the lazy init since it complicates things, do you think that's better?
Thanks, Gabriel
On Thu, Apr 01, 2021 at 04:07:00PM +0300, Gabriel Ivăncescu wrote:
On 01/04/2021 10:52, Huw Davies wrote:
On Wed, Mar 31, 2021 at 03:35:58PM +0300, Gabriel Ivăncescu wrote:
Did you consider using a lazy initialization of the lookup table instead?
Huw.
Lazy initialization meaning build it as we scan the image? That would work too.
Yes, build the lookup using rgb_to_pixel_colortable() as needed.
I also thought of caching the color tables globally.
Now I don't know if that's acceptable, but here's my idea since it's what is good in "practice":
Hold a small fixed size global cache (4-5 entries) of color tables, with synchronization in case of multi-threading (just for correctness, I doubt any app will actually multi-thread this). If we go the lazy initialization approach, we'll also keep track which of the entries were not filled yet in the cache.
It would be a global table for all color tables, but since I expect all the performance problems to be using the same color table over and over again in a loop, it would work "in practice".
When we have to discard/replace one, we choose the oldest one that wasn't accessed (we keep some timestamp on each cache). Does that sound feasible?
Maybe if I end up using this global cache, I can discard the lazy init since it complicates things, do you think that's better?
Hooking up the global one is going to be more complicated. I suggest you try the (local) lazy init version first.
Huw.
On 01/04/2021 22:10, Huw Davies wrote:
On Thu, Apr 01, 2021 at 04:07:00PM +0300, Gabriel Ivăncescu wrote:
On 01/04/2021 10:52, Huw Davies wrote:
On Wed, Mar 31, 2021 at 03:35:58PM +0300, Gabriel Ivăncescu wrote:
Did you consider using a lazy initialization of the lookup table instead?
Huw.
Lazy initialization meaning build it as we scan the image? That would work too.
Yes, build the lookup using rgb_to_pixel_colortable() as needed.
I also thought of caching the color tables globally.
Now I don't know if that's acceptable, but here's my idea since it's what is good in "practice":
Hold a small fixed size global cache (4-5 entries) of color tables, with synchronization in case of multi-threading (just for correctness, I doubt any app will actually multi-thread this). If we go the lazy initialization approach, we'll also keep track which of the entries were not filled yet in the cache.
It would be a global table for all color tables, but since I expect all the performance problems to be using the same color table over and over again in a loop, it would work "in practice".
When we have to discard/replace one, we choose the oldest one that wasn't accessed (we keep some timestamp on each cache). Does that sound feasible?
Maybe if I end up using this global cache, I can discard the lazy init since it complicates things, do you think that's better?
Hooking up the global one is going to be more complicated. I suggest you try the (local) lazy init version first.
Huw.
The lazy init will mostly just help with small images, and may also add some overhead for large images, since now we'll have to check each pixel whether it's been initialized or not.
But the larger concern is that it makes the global cache harder to implement. We'll have to "merge" the tables at the end etc. Actually, the cache itself is pretty simple without lazy init, and can all be contained within the rgb_lookup_colortable_init function.
I think I'll send the cached version first so you can get an idea, maybe it's simpler than you expected.
Also, this will allow later to improve the algorithm so it generates the table faster, like the one I originally had but was wrong, which won't be possible with lazy init. For example, just scan through all six directions (front/back, left/right, up/down) in squares and don't assume distances are smaller, so we still check the distance on each conflict, but when one direction is blocked we stop checking it for that color—which will be the main advantage here.
I'm also thinking of a lookup table to do the walk now, in proper order, depending on how big it becomes.
But anyway I leave that for possibly later, I'll send the cache first so you can get an idea why it's not very complicated without lazy init.
On Wed, Mar 31, 2021 at 03:35:57PM +0300, Gabriel Ivăncescu wrote:
@@ -189,8 +196,8 @@ typedef struct primitive_funcs const dib_info *brush, const rop_mask_bits *bits); void (* copy_rect)(const dib_info *dst, const RECT *rc, const dib_info *src, const POINT *origin, int rop2, int overlap);
- void (* blend_rect)(const dib_info *dst, const RECT *rc, const dib_info *src,
const POINT *origin, BLENDFUNCTION blend);
- void (* blend_rect)(const dib_info *dst, const dib_info *src, LONG diff_x, LONG diff_y,
const struct clipped_rects *clipped_rects, BLENDFUNCTION blend);
Using a POINT *offset instead of diff_[xy] would be cleaner.
Huw.