-- v7: gdiplus: improve performance of matrix multiplication by unrolling loop.
From: Bartosz Kosiorek gang65@poczta.onet.pl
Wine-Bug: https://bugs.winehq.org/show_bug.cgi?id=53947 --- dlls/gdiplus/matrix.c | 25 +++++++++++++---------- dlls/gdiplus/tests/matrix.c | 40 +++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 11 deletions(-)
diff --git a/dlls/gdiplus/matrix.c b/dlls/gdiplus/matrix.c index 40abbc93e21..13e7aa05e11 100644 --- a/dlls/gdiplus/matrix.c +++ b/dlls/gdiplus/matrix.c @@ -287,24 +287,27 @@ GpStatus WINGDIPAPI GdipRotateMatrix(GpMatrix *matrix, REAL angle, GpStatus WINGDIPAPI GdipScaleMatrix(GpMatrix *matrix, REAL scaleX, REAL scaleY, GpMatrixOrder order) { - REAL scale[6]; - TRACE("(%p, %.2f, %.2f, %d)\n", matrix, scaleX, scaleY, order);
if(!matrix) return InvalidParameter;
- scale[0] = scaleX; - scale[1] = 0.0; - scale[2] = 0.0; - scale[3] = scaleY; - scale[4] = 0.0; - scale[5] = 0.0; - if(order == MatrixOrderAppend) - matrix_multiply(matrix->matrix, scale, matrix->matrix); + { + matrix->matrix[0] *= scaleX; + matrix->matrix[1] *= scaleY; + matrix->matrix[2] *= scaleX; + matrix->matrix[3] *= scaleY; + matrix->matrix[4] *= scaleX; + matrix->matrix[5] *= scaleY; + } else if (order == MatrixOrderPrepend) - matrix_multiply(scale, matrix->matrix, matrix->matrix); + { + matrix->matrix[0] *= scaleX; + matrix->matrix[1] *= scaleX; + matrix->matrix[2] *= scaleY; + matrix->matrix[3] *= scaleY; + } else return InvalidParameter;
diff --git a/dlls/gdiplus/tests/matrix.c b/dlls/gdiplus/tests/matrix.c index 5ca0209f6a9..a2f6916fd66 100644 --- a/dlls/gdiplus/tests/matrix.c +++ b/dlls/gdiplus/tests/matrix.c @@ -110,6 +110,45 @@ static void test_transform(void) GdipDeleteMatrix(matrix); }
+static void test_scale(void) +{ + GpStatus status; + GpMatrix *matrix = NULL; + REAL elems[6]; + + static const REAL expected_elem[] = {3.0, -4.0, 90.0, 80.0, -1500.0, 1200.0}; + static const REAL expected_elem2[] = {3.0, -6.0, 60.0, 80.0, -500.0, 600.0}; + + GdipCreateMatrix2(1.0, -2.0, 30.0, 40.0, -500.0, 600.0, &matrix); + + status = GdipScaleMatrix(NULL, 3, 2, MatrixOrderAppend); + expect(InvalidParameter, status); + status = GdipScaleMatrix(matrix, 3, 2, MatrixOrderAppend); + expect(Ok, status); + + status = GdipGetMatrixElements(matrix, elems); + expect(Ok, status); + GdipDeleteMatrix(matrix); + + for(INT i = 0; i < 6; i++) + ok(expected_elem[i] == elems[i], "Expected #%d to be (%.1f) but got (%.1f)\n", i, + expected_elem[i], elems[i]); + + GdipCreateMatrix2(1.0, -2.0, 30.0, 40.0, -500.0, 600.0, &matrix); + + status = GdipScaleMatrix(matrix, 3, 2, MatrixOrderPrepend); + expect(Ok, status); + + status = GdipGetMatrixElements(matrix, elems); + expect(Ok, status); + GdipDeleteMatrix(matrix); + + for(INT i = 0; i < 6; i++) + ok(expected_elem2[i] == elems[i], "Expected #%d to be (%.1f) but got (%.1f)\n", i, + expected_elem2[i], elems[i]); + +} + static void test_isinvertible(void) { GpStatus status; @@ -390,6 +429,7 @@ START_TEST(matrix) GdiplusStartup(&gdiplusToken, &gdiplusStartupInput, NULL);
test_constructor_destructor(); + test_scale(); test_transform(); test_isinvertible(); test_invert();
From: Bartosz Kosiorek gang65@poczta.onet.pl
--- dlls/gdiplus/matrix.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-)
diff --git a/dlls/gdiplus/matrix.c b/dlls/gdiplus/matrix.c index 13e7aa05e11..4724440c3a5 100644 --- a/dlls/gdiplus/matrix.c +++ b/dlls/gdiplus/matrix.c @@ -39,17 +39,15 @@ WINE_DEFAULT_DEBUG_CHANNEL(gdiplus); * * and puts the output in out. * */ -static void matrix_multiply(GDIPCONST REAL * left, GDIPCONST REAL * right, REAL * out) +static FORCEINLINE void matrix_multiply(GDIPCONST REAL * left, GDIPCONST REAL * right, REAL * out) { REAL temp[6]; - int i, odd; - - for(i = 0; i < 6; i++){ - odd = i % 2; - temp[i] = left[i - odd] * right[odd] + left[i - odd + 1] * right[odd + 2] + - (i >= 4 ? right[odd + 4] : 0.0); - } - + temp[0] = left[0] * right[0] + left[1] * right[2]; + temp[1] = left[0] * right[1] + left[1] * right[3]; + temp[2] = left[2] * right[0] + left[3] * right[2]; + temp[3] = left[2] * right[1] + left[3] * right[3]; + temp[4] = left[4] * right[0] + left[5] * right[2] + right[4]; + temp[5] = left[4] * right[1] + left[5] * right[3] + right[5]; memcpy(out, temp, 6 * sizeof(REAL)); }
On Fri Dec 2 18:26:50 2022 +0000, Bartosz Kosiorek wrote:
changed this line in [version 7 of the diff](/wine/wine/-/merge_requests/1618/diffs?diff_id=22556&start_sha=6649da3f7f307c8db15f34d1161f28e9b11bfa54#58fa05bb30bb04422bf1e80b5b24932c74730b9b_42_42)
Hmm, it actually looks like the `inline` keyword does make it more likely for GCC to inline the function, but the decision is still based on a heuristic and inlining is not guaranteed. So `FORCEINLINE` still seems like the better option.
On Fri Dec 2 17:42:17 2022 +0000, Bartosz Kosiorek wrote:
Generally the best results I have with inlining, loop enrolling and scale optimization. Lowest values of running Times on Linux 64 bit (I run about 10 times and get lowest value): Wine gdiplus.dll **without** optimizations:
- 500 x `GdipScaleMatrix` time (seconds): 0.31s
- 700 x `GdipMultiplyMatrix` time (seconds): 0.20s
Wine gdiplus.dll **without** optimizations with **inlining** `matrix_multiply` (Scaling is also using `matrix_multiply`):
- 500 x `GdipScaleMatrix` time (seconds): 0.21s
- 700 x `GdipMultiplyMatrix` time (seconds): 0.16s
Wine gdiplus.dll **with** optimizations and **inlining** (current Merge Request):
- 500 x `GdipScaleMatrix` time (seconds): 0.07s
- 700 x `GdipMultiplyMatrix` time (seconds): 0.11s
I don't think that the loop makes things any clearer for `matrix_multiply` regardless, nor can I think of any reasonable way that the extra integer math and branches would improve performance, so I think that part is fine regardless.
For GdipScaleMatrix, it's a little less convincing, because the new version of the code is less clear. I'm curious what happens if you apply only the second patch in the series. I suspect that constant propagation combined with the second patch may make the first patch redundant.
On Fri Dec 2 19:30:10 2022 +0000, Esme Povirk wrote:
I don't think that the loop makes things any clearer for `matrix_multiply` regardless, nor can I think of any reasonable way that the extra integer math and branches would improve performance, so I think that part is fine regardless. For `GdipScaleMatrix`, it's a little less convincing, because the new version of the code is less clear. I'm curious what happens if you apply only the second patch in the series. I suspect that constant propagation combined with the second patch may make the first patch redundant.
The GdipScaleMatrix binary code (after compiling) is most efficient with proposed source code.
I could do whatever you want to get binary code in efficient form (as it is the output from this Merge Request). Could you please describe how constant propagation should looks like?
We could also add some function description, or additional comment, which will describe whole formula.
There are also matrix calculators which would help understand it. For example: https://matrixcalc.org/ Or some nice Wikipedia articles about transformation matrix: https://en.wikipedia.org/wiki/Transformation_matrix#/media/File:2D_affine_tr...
In long term, I would like to optimize all Matrix transformations (Shearing, Rotation)
I could do whatever you want to get binary code in efficient form (as it is the output from this Merge Request). Could you please describe how constant propagation should looks like?
If I understand correctly, Esme just wants you to test the performance of changing matrix_multiply without changing GdipScaleMatrix.
On Fri Dec 2 20:46:15 2022 +0000, Alex Henrie wrote:
I could do whatever you want to get binary code in efficient form (as
it is the output from this Merge Request). Could you please describe how constant propagation should looks like? If I understand correctly, Esme just wants you to test the performance of changing matrix_multiply without changing GdipScaleMatrix.
With only `gdiplus: improve performance of matrix multiplication by unrolling loop.` commit applied, I had:
Wine gdiplus.dll **with only `matrix_multiply` optimizations** and `matrix_multiply` inlining: * 500 x `GdipScaleMatrix` time (seconds): 0.21s * 700 x `GdipMultiplyMatrix` time (seconds): 0.14s
Generally I cannot get value below 0.21s.
On Fri Dec 2 21:47:10 2022 +0000, Bartosz Kosiorek wrote:
With only `gdiplus: improve performance of matrix multiplication by unrolling loop.` commit applied, I had: Wine gdiplus.dll **with only `matrix_multiply` optimizations** and `matrix_multiply` inlining:
- 500 x `GdipScaleMatrix` time (seconds): 0.21s
- 700 x `GdipMultiplyMatrix` time (seconds): 0.14s
Generally I cannot get value below 0.21s (with `GdipScaleMatrix` sometimes I get 0.06s). It seems that similar optimizations are applied in native Windows gdiplus.dll, as `GdipScaleMatrix` is faster than `GdipMultiplyMatrix` (which will not be possible with using `matrix_multiply`): Native (Windows gdiplus.dll)
- 500 x`GdipScaleMatrix` time (seconds): 0.84s
- 700 x`GdipMultiplyMatrix` time (seconds): 1.22s
I just wanted to verify that it simplifies the generated code for GdipScaleMatrix even with the second patch applied, which it does.
This merge request was approved by Esme Povirk.
The GdipTranslateMatrix is blinking fast now, especially with MatrixOrderAppend.
It is used commonly in EMF: EmfPlusRecordTypeTranslateWorldTransform, GdipTranslateWorldTransform, Pens and Brushes.