-- v4: gdiplus: improve performance of matrix multiplication by unrolling loop.
From: Bartosz Kosiorek gang65@poczta.onet.pl
Wine-Bug: https://bugs.winehq.org/show_bug.cgi?id=53947 --- dlls/gdiplus/matrix.c | 25 +++++++++++++---------- dlls/gdiplus/tests/matrix.c | 40 +++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 11 deletions(-)
diff --git a/dlls/gdiplus/matrix.c b/dlls/gdiplus/matrix.c index 40abbc93e21..13e7aa05e11 100644 --- a/dlls/gdiplus/matrix.c +++ b/dlls/gdiplus/matrix.c @@ -287,24 +287,27 @@ GpStatus WINGDIPAPI GdipRotateMatrix(GpMatrix *matrix, REAL angle, GpStatus WINGDIPAPI GdipScaleMatrix(GpMatrix *matrix, REAL scaleX, REAL scaleY, GpMatrixOrder order) { - REAL scale[6]; - TRACE("(%p, %.2f, %.2f, %d)\n", matrix, scaleX, scaleY, order);
if(!matrix) return InvalidParameter;
- scale[0] = scaleX; - scale[1] = 0.0; - scale[2] = 0.0; - scale[3] = scaleY; - scale[4] = 0.0; - scale[5] = 0.0; - if(order == MatrixOrderAppend) - matrix_multiply(matrix->matrix, scale, matrix->matrix); + { + matrix->matrix[0] *= scaleX; + matrix->matrix[1] *= scaleY; + matrix->matrix[2] *= scaleX; + matrix->matrix[3] *= scaleY; + matrix->matrix[4] *= scaleX; + matrix->matrix[5] *= scaleY; + } else if (order == MatrixOrderPrepend) - matrix_multiply(scale, matrix->matrix, matrix->matrix); + { + matrix->matrix[0] *= scaleX; + matrix->matrix[1] *= scaleX; + matrix->matrix[2] *= scaleY; + matrix->matrix[3] *= scaleY; + } else return InvalidParameter;
diff --git a/dlls/gdiplus/tests/matrix.c b/dlls/gdiplus/tests/matrix.c index 5ca0209f6a9..5f219dafff1 100644 --- a/dlls/gdiplus/tests/matrix.c +++ b/dlls/gdiplus/tests/matrix.c @@ -110,6 +110,45 @@ static void test_transform(void) GdipDeleteMatrix(matrix); }
+static void test_scale(void) +{ + GpStatus status; + GpMatrix *matrix = NULL; + REAL elems[6]; + + REAL expected_elem[] = {3.0, -4.0, 90.0, 80.0, -1500.0, 1200.0}; + REAL expected_elem2[] = {3.0, -6.0, 60.0, 80.0, -500.0, 600.0}; + + GdipCreateMatrix2(1.0, -2.0, 30.0, 40.0, -500.0, 600.0, &matrix); + + status = GdipScaleMatrix(NULL, 3, 2, MatrixOrderAppend); + expect(InvalidParameter, status); + status = GdipScaleMatrix(matrix, 3, 2, MatrixOrderAppend); + expect(Ok, status); + + status = GdipGetMatrixElements(matrix, elems); + expect(Ok, status); + GdipDeleteMatrix(matrix); + + for(INT i = 0; i < 6; i++) + ok(expected_elem[i] == elems[i], "Expected #%d to be (%.1f) but got (%.1f)\n", i, + expected_elem[i], elems[i]); + + GdipCreateMatrix2(1.0, -2.0, 30.0, 40.0, -500.0, 600.0, &matrix); + + status = GdipScaleMatrix(matrix, 3, 2, MatrixOrderPrepend); + expect(Ok, status); + + status = GdipGetMatrixElements(matrix, elems); + expect(Ok, status); + GdipDeleteMatrix(matrix); + + for(INT i = 0; i < 6; i++) + ok(expected_elem2[i] == elems[i], "Expected #%d to be (%.1f) but got (%.1f)\n", i, + expected_elem2[i], elems[i]); + +} + static void test_isinvertible(void) { GpStatus status; @@ -390,6 +429,7 @@ START_TEST(matrix) GdiplusStartup(&gdiplusToken, &gdiplusStartupInput, NULL);
test_constructor_destructor(); + test_scale(); test_transform(); test_isinvertible(); test_invert();
From: Bartosz Kosiorek gang65@poczta.onet.pl
--- dlls/gdiplus/matrix.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-)
diff --git a/dlls/gdiplus/matrix.c b/dlls/gdiplus/matrix.c index 13e7aa05e11..ab6a75d3a00 100644 --- a/dlls/gdiplus/matrix.c +++ b/dlls/gdiplus/matrix.c @@ -42,13 +42,15 @@ WINE_DEFAULT_DEBUG_CHANNEL(gdiplus); static void matrix_multiply(GDIPCONST REAL * left, GDIPCONST REAL * right, REAL * out) { REAL temp[6]; - int i, odd; - - for(i = 0; i < 6; i++){ - odd = i % 2; - temp[i] = left[i - odd] * right[odd] + left[i - odd + 1] * right[odd + 2] + - (i >= 4 ? right[odd + 4] : 0.0); - } + // Unroll loop to optimize execution speed + temp[0] = left[0] * right[0] + left[1] * right[2]; + temp[1] = left[0] * right[1] + left[1] * right[3]; + temp[2] = left[2] * right[0] + left[3] * right[2]; + temp[3] = left[2] * right[1] + left[3] * right[3]; + temp[4] = left[4] * right[0] + left[5] * right[2] + + right[4]; + temp[5] = left[4] * right[1] + left[5] * right[3] + + right[5];
memcpy(out, temp, 6 * sizeof(REAL)); }
Hi,
It looks like your patch introduced the new failures shown below. Please investigate and fix them before resubmitting your patch. If they are not new, fixing them anyway would help a lot. Otherwise please ask for the known failures list to be updated.
The tests also ran into some preexisting test failures. If you know how to fix them that would be helpful. See the TestBot job for the details:
The full results can be found at: https://testbot.winehq.org/JobDetails.pl?Key=127076
Your paranoid android.
=== debian11 (32 bit report) ===
kernel32: comm: Timeout
Did you measure the performance difference? Or checked if generated code is any different, especially for the second patch. I looks like forced inlining might have the same effect, if it's not inlined already automatically.
Alex Henrie (@alexhenrie) commented about dlls/gdiplus/tests/matrix.c:
GdipDeleteMatrix(matrix);
}
+static void test_scale(void) +{
- GpStatus status;
- GpMatrix *matrix = NULL;
- REAL elems[6];
- REAL expected_elem[] = {3.0, -4.0, 90.0, 80.0, -1500.0, 1200.0};
- REAL expected_elem2[] = {3.0, -6.0, 60.0, 80.0, -500.0, 600.0};
These arrays could be `static const`.
Alex Henrie (@alexhenrie) commented about dlls/gdiplus/matrix.c:
static void matrix_multiply(GDIPCONST REAL * left, GDIPCONST REAL * right, REAL * out) { REAL temp[6];
- int i, odd;
- for(i = 0; i < 6; i++){
odd = i % 2;
temp[i] = left[i - odd] * right[odd] + left[i - odd + 1] * right[odd + 2] +
(i >= 4 ? right[odd + 4] : 0.0);
- }
- // Unroll loop to optimize execution speed
This comment is unnecessary (and misleading, since there isn't a loop anymore). Also, the Wine project only uses `/* */` comments in C code, not `//` comments. Just delete the comment.
Alex Henrie (@alexhenrie) commented about dlls/gdiplus/matrix.c:
- int i, odd;
- for(i = 0; i < 6; i++){
odd = i % 2;
temp[i] = left[i - odd] * right[odd] + left[i - odd + 1] * right[odd + 2] +
(i >= 4 ? right[odd + 4] : 0.0);
- }
- // Unroll loop to optimize execution speed
- temp[0] = left[0] * right[0] + left[1] * right[2];
- temp[1] = left[0] * right[1] + left[1] * right[3];
- temp[2] = left[2] * right[0] + left[3] * right[2];
- temp[3] = left[2] * right[1] + left[3] * right[3];
- temp[4] = left[4] * right[0] + left[5] * right[2] +
right[4];
- temp[5] = left[4] * right[1] + left[5] * right[3] +
right[5];
There's no reason to split the last two assignments onto two lines each. The lines are nowhere near the 100-character guideline.
On Fri Dec 2 15:47:41 2022 +0000, Nikolay Sivov wrote:
Did you measure the performance difference? Or checked if generated code is any different, especially for the second patch. I looks like forced inlining might have the same effect, if it's not inlined already automatically.
The performance improvement is quite significant:
Running Times on Linux 64 bit:
Native (Windows gdiplus.dll) * 500 Matrix Scaling time (seconds): 0.84s * 700 Matrix Multipling time (seconds): 1.22s
Wine gdiplus.dll without optimizations: * 500 Matrix Scaling time (seconds): 0.28s * 700 Matrix Multipling time (seconds): 0.35s
Wine gdiplus.dll with optimizations: * 500 Matrix Scaling time (seconds): 0.13s * 700 Matrix Multipling time (seconds): 0.17s
Application which I am using for testing: [gdiplusdisplay.exe](/uploads/c5c790eaef4824c2125d56b95714b05e/gdiplusdisplay.exe)