Based on https://source.winehq.org/patches/data/238821 by Zhao Yi.
-- v2: ntdll: Correctly handle shift greater than the type width in 64-bit shift functions. ntdll: Avoid depending on compiler support for 64-bit shift functions. ntdll/tests: Add tests for runtime 64-bit shift functions. ntdll: Fix the calling convention for runtime 64-bit shift functions.
From: Zebediah Figura zfigura@codeweavers.com
Based on a patch by Zhao Yi. --- dlls/ntdll/large_int.c | 74 ++++++++++++++--------------- dlls/ntdll/ntdll.spec | 6 +-- dlls/ntoskrnl.exe/ntoskrnl.exe.spec | 6 +-- 3 files changed, 42 insertions(+), 44 deletions(-)
diff --git a/dlls/ntdll/large_int.c b/dlls/ntdll/large_int.c index b38074158c8..0faabe0a991 100644 --- a/dlls/ntdll/large_int.c +++ b/dlls/ntdll/large_int.c @@ -809,40 +809,55 @@ ULONGLONG WINAPI _aulldiv( ULONGLONG a, ULONGLONG b ) return udivmod(a, b, NULL); }
+ +LONGLONG __regs__allshl( LONGLONG a, unsigned char b ) +{ + return a << b; +} + /****************************************************************************** * _allshl (NTDLL.@) - * - * Shift a 64 bit integer to the left. - * - * PARAMS - * a [I] Initial number. - * b [I] Number to shift a by to the left. - * - * RETURNS - * The left-shifted value. */ -LONGLONG WINAPI _allshl( LONGLONG a, LONG b ) +__ASM_GLOBAL_FUNC( _allshl, + "xchgl (%esp),%ecx\n\t" + "pushl %edx\n\t" + "pushl %eax\n\t" + "pushl %ecx\n\t" + "jmp " __ASM_NAME("__regs__allshl") ) + + +LONGLONG __regs__allshr( LONGLONG a, unsigned char b ) { - return a << b; + return a >> b; }
/****************************************************************************** * _allshr (NTDLL.@) - * - * Shift a 64 bit integer to the right. - * - * PARAMS - * a [I] Initial number. - * b [I] Number to shift a by to the right. - * - * RETURNS - * The right-shifted value. */ -LONGLONG WINAPI _allshr( LONGLONG a, LONG b ) +__ASM_GLOBAL_FUNC( _allshr, + "xchgl (%esp),%ecx\n\t" + "pushl %edx\n\t" + "pushl %eax\n\t" + "pushl %ecx\n\t" + "jmp " __ASM_NAME("__regs__allshr") ) + + +ULONGLONG __regs__aullshr( ULONGLONG a, unsigned char b ) { return a >> b; }
+/****************************************************************************** + * _allshr (NTDLL.@) + */ +__ASM_GLOBAL_FUNC( _aullshr, + "xchgl (%esp),%ecx\n\t" + "pushl %edx\n\t" + "pushl %eax\n\t" + "pushl %ecx\n\t" + "jmp " __ASM_NAME("__regs__aullshr") ) + + /****************************************************************************** * _alldvrm (NTDLL.@) * @@ -899,23 +914,6 @@ ULONGLONG WINAPI _aullrem( ULONGLONG a, ULONGLONG b ) return r; }
-/****************************************************************************** - * _aullshr (NTDLL.@) - * - * Shift a 64 bit unsigned integer to the right. - * - * PARAMS - * a [I] Initial number. - * b [I] Number to shift a by to the right. - * - * RETURNS - * The right-shifted value. - */ -ULONGLONG WINAPI _aullshr( ULONGLONG a, LONG b ) -{ - return a >> b; -} - /****************************************************************************** * _aulldvrm (NTDLL.@) * diff --git a/dlls/ntdll/ntdll.spec b/dlls/ntdll/ntdll.spec index 1862358e593..89b05728951 100644 --- a/dlls/ntdll/ntdll.spec +++ b/dlls/ntdll/ntdll.spec @@ -1492,13 +1492,13 @@ @ cdecl -norelay -arch=i386 -ret64 _allmul(int64 int64) @ cdecl -arch=i386 -norelay _alloca_probe() @ cdecl -norelay -arch=i386 -ret64 _allrem(int64 int64) -@ stdcall -arch=i386 -ret64 _allshl(int64 long) -@ stdcall -arch=i386 -ret64 _allshr(int64 long) +@ cdecl -norelay -arch=i386 -ret64 _allshl(int64 long) +@ cdecl -norelay -arch=i386 -ret64 _allshr(int64 long) @ cdecl -ret64 _atoi64(str) @ cdecl -norelay -arch=i386 -ret64 _aulldiv(int64 int64) @ cdecl -arch=i386 -norelay _aulldvrm(int64 int64) @ cdecl -norelay -arch=i386 -ret64 _aullrem(int64 int64) -@ stdcall -arch=i386 -ret64 _aullshr(int64 long) +@ cdecl -norelay -arch=i386 -ret64 _aullshr(int64 long) @ cdecl -arch=i386 -norelay _chkstk() @ stub _fltused @ cdecl -arch=i386 -ret64 _ftol() diff --git a/dlls/ntoskrnl.exe/ntoskrnl.exe.spec b/dlls/ntoskrnl.exe/ntoskrnl.exe.spec index 8b0ee1c4b51..460d7d0459f 100644 --- a/dlls/ntoskrnl.exe/ntoskrnl.exe.spec +++ b/dlls/ntoskrnl.exe/ntoskrnl.exe.spec @@ -1540,12 +1540,12 @@ @ cdecl -arch=i386 -norelay -ret64 _allmul(int64 int64) @ cdecl -arch=i386 -norelay _alloca_probe() @ cdecl -arch=i386 -norelay -ret64 _allrem(int64 int64) -@ stdcall -arch=i386 -ret64 _allshl(int64 long) -@ stdcall -arch=i386 -ret64 _allshr(int64 long) +@ cdecl -arch=i386 -norelay -ret64 _allshl(int64 long) +@ cdecl -arch=i386 -norelay -ret64 _allshr(int64 long) @ cdecl -arch=i386 -norelay -ret64 _aulldiv(int64 int64) @ cdecl -arch=i386 -norelay _aulldvrm(int64 int64) @ cdecl -arch=i386 -norelay -ret64 _aullrem(int64 int64) -@ stdcall -arch=i386 -ret64 _aullshr(int64 long) +@ cdecl -arch=i386 -norelay -ret64 _aullshr(int64 long) @ cdecl -arch=i386 -norelay _chkstk() @ cdecl -arch=i386 _except_handler2(ptr ptr ptr ptr) @ cdecl -arch=i386 _except_handler3(ptr ptr ptr ptr)
Hi,
It looks like your patch introduced the new failures shown below. Please investigate and fix them before resubmitting your patch. If they are not new, fixing them anyway would help a lot. Otherwise please ask for the known failures list to be updated.
The tests also ran into some preexisting test failures. If you know how to fix them that would be helpful. See the TestBot job for the details:
The full results can be found at: https://testbot.winehq.org/JobDetails.pl?Key=118332
Your paranoid android.
=== debian11 (32 bit report) ===
ntdll: om.c:933: Test succeeded inside todo block: 0: NtCreateKey failed c000003b om.c:1056: Test succeeded inside todo block: NULL: NtCreateKey failed c0000005 om.c:1060: Test succeeded inside todo block: NULL: NtCreateKey failed c0000005
From: Zebediah Figura zfigura@codeweavers.com
--- dlls/ntdll/tests/large_int.c | 72 ++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+)
diff --git a/dlls/ntdll/tests/large_int.c b/dlls/ntdll/tests/large_int.c index 9635cac24ad..a8f628ae989 100644 --- a/dlls/ntdll/tests/large_int.c +++ b/dlls/ntdll/tests/large_int.c @@ -39,6 +39,7 @@ static LONGLONG (WINAPI *p_allrem)( LONGLONG a, LONGLONG b ); static LONGLONG (WINAPI *p_allmul)( LONGLONG a, LONGLONG b ); static ULONGLONG (WINAPI *p_aulldiv)( ULONGLONG a, ULONGLONG b ); static ULONGLONG (WINAPI *p_aullrem)( ULONGLONG a, ULONGLONG b ); +static void *p_allshl, *p_allshr, *p_aullshr;
static void InitFunctionPtrs(void) { @@ -54,8 +55,11 @@ static void InitFunctionPtrs(void) p_alldiv = (void *)GetProcAddress(hntdll, "_alldiv"); p_allrem = (void *)GetProcAddress(hntdll, "_allrem"); p_allmul = (void *)GetProcAddress(hntdll, "_allmul"); + p_allshl = (void *)GetProcAddress(hntdll, "_allshl"); + p_allshr = (void *)GetProcAddress(hntdll, "_allshr"); p_aulldiv = (void *)GetProcAddress(hntdll, "_aulldiv"); p_aullrem = (void *)GetProcAddress(hntdll, "_aullrem"); + p_aullshr = (void *)GetProcAddress(hntdll, "_aullshr"); } /* if */ }
@@ -445,9 +449,24 @@ static void test_RtlLargeIntegerToChar(void) static void test_builtins(void) { #ifdef __i386__ + void *code_mem; ULONGLONG u; LONGLONG l;
+ static const BYTE call_shift_code[] = + { + 0x31, 0xc0, /* xorl %eax,%eax */ + 0x31, 0xd2, /* xorl %edx,%edx */ + 0x31, 0xc9, /* xorl %ecx,%ecx */ + 0x87, 0x44, 0x24, 0x08, /* xchgl 8(%esp),%eax */ + 0x87, 0x54, 0x24, 0x0c, /* xchgl 12(%esp),%edx */ + 0x87, 0x4c, 0x24, 0x10, /* xchgl 16(%esp),%ecx */ + 0xff, 0x64, 0x24, 0x04, /* jmp *4(%esp) */ + }; + LONGLONG (__cdecl *call_shift_func)(void *func, LONGLONG a, LONG b); + + code_mem = VirtualAlloc(NULL, 0x1000, MEM_RESERVE | MEM_COMMIT, PAGE_EXECUTE_READWRITE); + l = p_alldiv(100, 7); ok(l == 14, "_alldiv returned %s\n", wine_dbgstr_longlong(l));
@@ -489,6 +508,59 @@ static void test_builtins(void)
l = p_allmul(0x300000001ll, 4); ok(l == 0xc00000004, "_allmul = %s\n", wine_dbgstr_longlong(l)); + + memcpy(code_mem, call_shift_code, sizeof(call_shift_code)); + call_shift_func = code_mem; + + l = call_shift_func(p_allshl, 0x0123456789abcdefll, 12); + ok(l == 0x3456789abcdef000ll, "got %#I64x\n", l); + + l = call_shift_func(p_allshl, 0x0123456789abcdefll, 44); + ok(l == 0xbcdef00000000000ll, "got %#I64x\n", l); + + l = call_shift_func(p_allshl, 0x0123456789abcdefll, 88); + todo_wine ok(!l, "got %#I64x\n", l); + + l = call_shift_func(p_allshl, 0x0123456789abcdefll, 0x88); + todo_wine ok(!l, "got %#I64x\n", l); + + l = call_shift_func(p_allshl, 0x0123456789abcdefll, 0x108); + ok(l == 0x23456789abcdef00ll, "got %#I64x\n", l); + + l = call_shift_func(p_allshr, 0x0123456789abcdefll, 12); + ok(l == 0x0123456789abcll, "got %#I64x\n", l); + + l = call_shift_func(p_allshr, 0x0123456789abcdefll, 44); + ok(l == 0x01234ll, "got %#I64x\n", l); + + l = call_shift_func(p_allshr, 0x0123456789abcdefll, 88); + todo_wine ok(!l, "got %#I64x\n", l); + + l = call_shift_func(p_allshr, 0x8123456789abcdefll, 12); + ok(l == 0xfff8123456789abcll, "got %#I64x\n", l); + + l = call_shift_func(p_allshr, 0x8123456789abcdefll, 44); + ok(l == 0xfffffffffff81234ll, "got %#I64x\n", l); + + l = call_shift_func(p_allshr, 0x8123456789abcdefll, 88); + todo_wine ok(l == -1ll, "got %#I64x\n", l); + + l = call_shift_func(p_allshr, 0x8123456789abcdefll, 0x108); + ok(l == 0xff8123456789abcdll, "got %#I64x\n", l); + + l = call_shift_func(p_aullshr, 0x8123456789abcdefll, 12); + ok(l == 0x8123456789abcll, "got %#I64x\n", l); + + l = call_shift_func(p_aullshr, 0x8123456789abcdefll, 44); + ok(l == 0x81234ll, "got %#I64x\n", l); + + l = call_shift_func(p_aullshr, 0x8123456789abcdefll, 88); + todo_wine ok(!l, "got %#I64x\n", l); + + l = call_shift_func(p_aullshr, 0x8123456789abcdefll, 0x108); + ok(l == 0x8123456789abcdll, "got %#I64x\n", l); + + VirtualFree(code_mem, 0, MEM_RELEASE); #endif /* __i386__ */ }
From: Zebediah Figura zfigura@codeweavers.com
--- dlls/ntdll/large_int.c | 45 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 42 insertions(+), 3 deletions(-)
diff --git a/dlls/ntdll/large_int.c b/dlls/ntdll/large_int.c index 0faabe0a991..d184426ffcd 100644 --- a/dlls/ntdll/large_int.c +++ b/dlls/ntdll/large_int.c @@ -812,7 +812,20 @@ ULONGLONG WINAPI _aulldiv( ULONGLONG a, ULONGLONG b )
LONGLONG __regs__allshl( LONGLONG a, unsigned char b ) { - return a << b; + const LARGE_INTEGER x = { .QuadPart = a }; + LARGE_INTEGER ret; + + if (b >= 32) + { + ret.HighPart = x.LowPart << (b & 31); + ret.LowPart = 0; + } + else + { + ret.HighPart = (x.LowPart >> (32 - b)) | (x.HighPart << b); + ret.LowPart = x.LowPart << b; + } + return ret.QuadPart; }
/****************************************************************************** @@ -828,7 +841,20 @@ __ASM_GLOBAL_FUNC( _allshl,
LONGLONG __regs__allshr( LONGLONG a, unsigned char b ) { - return a >> b; + const LARGE_INTEGER x = { .QuadPart = a }; + LARGE_INTEGER ret; + + if (b >= 32) + { + ret.HighPart = x.HighPart >> 31; + ret.LowPart = x.HighPart >> (b & 31); + } + else + { + ret.HighPart = x.HighPart >> b; + ret.LowPart = (x.HighPart << (32 - b)) | (x.LowPart >> b); + } + return ret.QuadPart; }
/****************************************************************************** @@ -844,7 +870,20 @@ __ASM_GLOBAL_FUNC( _allshr,
ULONGLONG __regs__aullshr( ULONGLONG a, unsigned char b ) { - return a >> b; + const ULARGE_INTEGER x = { .QuadPart = a }; + ULARGE_INTEGER ret; + + if (b >= 32) + { + ret.HighPart = 0; + ret.LowPart = x.HighPart >> (b & 31); + } + else + { + ret.HighPart = x.HighPart >> b; + ret.LowPart = (x.HighPart << (32 - b)) | (x.LowPart >> b); + } + return ret.QuadPart; }
/******************************************************************************
Hi,
It looks like your patch introduced the new failures shown below. Please investigate and fix them before resubmitting your patch. If they are not new, fixing them anyway would help a lot. Otherwise please ask for the known failures list to be updated.
The tests also ran into some preexisting test failures. If you know how to fix them that would be helpful. See the TestBot job for the details:
The full results can be found at: https://testbot.winehq.org/JobDetails.pl?Key=118334
Your paranoid android.
=== debian11 (32 bit report) ===
ntdll: om.c:933: Test succeeded inside todo block: 0: NtCreateKey failed c000003b om.c:1056: Test succeeded inside todo block: NULL: NtCreateKey failed c0000005 om.c:1060: Test succeeded inside todo block: NULL: NtCreateKey failed c0000005
From: Zebediah Figura zfigura@codeweavers.com
Based on a patch by Zhao Yi. --- dlls/ntdll/large_int.c | 9 +++++++++ dlls/ntdll/tests/large_int.c | 10 +++++----- 2 files changed, 14 insertions(+), 5 deletions(-)
diff --git a/dlls/ntdll/large_int.c b/dlls/ntdll/large_int.c index d184426ffcd..2914b97ca70 100644 --- a/dlls/ntdll/large_int.c +++ b/dlls/ntdll/large_int.c @@ -815,6 +815,9 @@ LONGLONG __regs__allshl( LONGLONG a, unsigned char b ) const LARGE_INTEGER x = { .QuadPart = a }; LARGE_INTEGER ret;
+ if (b >= 64) + return 0; + if (b >= 32) { ret.HighPart = x.LowPart << (b & 31); @@ -844,6 +847,9 @@ LONGLONG __regs__allshr( LONGLONG a, unsigned char b ) const LARGE_INTEGER x = { .QuadPart = a }; LARGE_INTEGER ret;
+ if (b >= 64) + return (LONGLONG)(x.HighPart >> 31); + if (b >= 32) { ret.HighPart = x.HighPart >> 31; @@ -873,6 +879,9 @@ ULONGLONG __regs__aullshr( ULONGLONG a, unsigned char b ) const ULARGE_INTEGER x = { .QuadPart = a }; ULARGE_INTEGER ret;
+ if (b >= 64) + return 0; + if (b >= 32) { ret.HighPart = 0; diff --git a/dlls/ntdll/tests/large_int.c b/dlls/ntdll/tests/large_int.c index a8f628ae989..128c83299a9 100644 --- a/dlls/ntdll/tests/large_int.c +++ b/dlls/ntdll/tests/large_int.c @@ -519,10 +519,10 @@ static void test_builtins(void) ok(l == 0xbcdef00000000000ll, "got %#I64x\n", l);
l = call_shift_func(p_allshl, 0x0123456789abcdefll, 88); - todo_wine ok(!l, "got %#I64x\n", l); + ok(!l, "got %#I64x\n", l);
l = call_shift_func(p_allshl, 0x0123456789abcdefll, 0x88); - todo_wine ok(!l, "got %#I64x\n", l); + ok(!l, "got %#I64x\n", l);
l = call_shift_func(p_allshl, 0x0123456789abcdefll, 0x108); ok(l == 0x23456789abcdef00ll, "got %#I64x\n", l); @@ -534,7 +534,7 @@ static void test_builtins(void) ok(l == 0x01234ll, "got %#I64x\n", l);
l = call_shift_func(p_allshr, 0x0123456789abcdefll, 88); - todo_wine ok(!l, "got %#I64x\n", l); + ok(!l, "got %#I64x\n", l);
l = call_shift_func(p_allshr, 0x8123456789abcdefll, 12); ok(l == 0xfff8123456789abcll, "got %#I64x\n", l); @@ -543,7 +543,7 @@ static void test_builtins(void) ok(l == 0xfffffffffff81234ll, "got %#I64x\n", l);
l = call_shift_func(p_allshr, 0x8123456789abcdefll, 88); - todo_wine ok(l == -1ll, "got %#I64x\n", l); + ok(l == -1ll, "got %#I64x\n", l);
l = call_shift_func(p_allshr, 0x8123456789abcdefll, 0x108); ok(l == 0xff8123456789abcdll, "got %#I64x\n", l); @@ -555,7 +555,7 @@ static void test_builtins(void) ok(l == 0x81234ll, "got %#I64x\n", l);
l = call_shift_func(p_aullshr, 0x8123456789abcdefll, 88); - todo_wine ok(!l, "got %#I64x\n", l); + ok(!l, "got %#I64x\n", l);
l = call_shift_func(p_aullshr, 0x8123456789abcdefll, 0x108); ok(l == 0x8123456789abcdll, "got %#I64x\n", l);
Hi,
It looks like your patch introduced the new failures shown below. Please investigate and fix them before resubmitting your patch. If they are not new, fixing them anyway would help a lot. Otherwise please ask for the known failures list to be updated.
The tests also ran into some preexisting test failures. If you know how to fix them that would be helpful. See the TestBot job for the details:
The full results can be found at: https://testbot.winehq.org/JobDetails.pl?Key=118335
Your paranoid android.
=== debian11 (32 bit report) ===
ntdll: om.c:933: Test succeeded inside todo block: 0: NtCreateKey failed c000003b om.c:1056: Test succeeded inside todo block: NULL: NtCreateKey failed c0000005 om.c:1060: Test succeeded inside todo block: NULL: NtCreateKey failed c0000005
Jinoh Kang (@iamahuman) commented about dlls/ntdll/large_int.c:
+__ASM_GLOBAL_FUNC( _allshl,
"xchgl (%esp),%ecx\n\t"
"pushl %edx\n\t"
"pushl %eax\n\t"
"pushl %ecx\n\t"
"jmp " __ASM_NAME("__regs__allshl") )
+LONGLONG __regs__allshr( LONGLONG a, unsigned char b ) {
- return a << b;
- const LARGE_INTEGER x = { .QuadPart = a };
- LARGE_INTEGER ret;
- if (b >= 64)
return (LONGLONG)(x.HighPart >> 31);
Suggest using `x.HighPart >> (b >= 64 ? 31 : b & 31)` or its equivalent, as per the same reason for `_allshl`.
Compare:
- https://godbolt.org/z/sEd8Th56M (original)
With:
- https://godbolt.org/z/9WxTnbfax (64-bit return ellided).
Jinoh Kang (@iamahuman) commented about dlls/ntdll/large_int.c:
+__ASM_GLOBAL_FUNC( _allshr,
"xchgl (%esp),%ecx\n\t"
"pushl %edx\n\t"
"pushl %eax\n\t"
"pushl %ecx\n\t"
"jmp " __ASM_NAME("__regs__allshr") )
+ULONGLONG __regs__aullshr( ULONGLONG a, unsigned char b ) {
- return a >> b;
- const ULARGE_INTEGER x = { .QuadPart = a };
- ULARGE_INTEGER ret;
- if (b >= 64)
return 0;
Suggest using `b >= 64 ? 0 : x.HighPart >> (b & 31)` or its equivalent, as per the same reason for `_allshl`.
Compare:
- https://godbolt.org/z/8PjoMnGYj
With:
Jinoh Kang (@iamahuman) commented about dlls/ntdll/large_int.c:
- Shift a 64 bit integer to the left.
- PARAMS
- a [I] Initial number.
- b [I] Number to shift a by to the left.
- RETURNS
*/
- The left-shifted value.
-LONGLONG WINAPI _allshl( LONGLONG a, LONG b ) +__ASM_GLOBAL_FUNC( _allshl,
"xchgl (%esp),%ecx\n\t"
"pushl %edx\n\t"
"pushl %eax\n\t"
"pushl %ecx\n\t"
"jmp " __ASM_NAME("__regs__allshl") )
We need to either drop the tail call or switch `__regs__allshl` to stdcall.
It might not have been caught by the test due to use of frame pointer.
(Ditto for other functions.)
On 7/6/22 10:50, Jinoh Kang (@iamahuman) wrote:
Jinoh Kang (@iamahuman) commented about dlls/ntdll/large_int.c:
- Shift a 64 bit integer to the left.
- PARAMS
- a [I] Initial number.
- b [I] Number to shift a by to the left.
- RETURNS
*/
- The left-shifted value.
-LONGLONG WINAPI _allshl( LONGLONG a, LONG b ) +__ASM_GLOBAL_FUNC( _allshl,
"xchgl (%esp),%ecx\n\t"
"pushl %edx\n\t"
"pushl %eax\n\t"
"pushl %ecx\n\t"
"jmp " __ASM_NAME("__regs__allshl") )
We need to either drop the tail call or switch `__regs__allshl` to stdcall.
It might not have been caught by the test due to use of frame pointer.
(Ditto for other functions.)
Yes, that was a mistake on my part; I second-guessed the calling convention one too many times. Thanks for catching it.
Jinoh Kang (@iamahuman) commented about dlls/ntdll/large_int.c:
return udivmod(a, b, NULL);
}
+LONGLONG __regs__allshl( LONGLONG a, unsigned char b ) +{
- const LARGE_INTEGER x = { .QuadPart = a };
- LARGE_INTEGER ret;
- if (b >= 64)
return 0;
It appears that GCC's optimizer is having a hard time dealing with mixing full (64-bit) and partial (32-bit) writes.
Compare:
- https://godbolt.org/z/KvdGfr4bY (original)
With:
- https://godbolt.org/z/vG5TGrhaY (64-bit return ellided)
I'd suggest folding the special case into the if statement below. We do already lose performance by thunking, but it's still a good idea not to slow down (or, rather, amplify the I-cache usage of) a builtin too much.
Meanwhile, clang does not appear to suffer from this problem.
On 7/6/22 10:50, Jinoh Kang (@iamahuman) wrote:
Jinoh Kang (@iamahuman) commented about dlls/ntdll/large_int.c:
return udivmod(a, b, NULL);
}
+LONGLONG __regs__allshl( LONGLONG a, unsigned char b ) +{
- const LARGE_INTEGER x = { .QuadPart = a };
- LARGE_INTEGER ret;
- if (b >= 64)
return 0;
It appears that GCC's optimizer is having a hard time dealing with mixing full (64-bit) and partial (32-bit) writes.
Compare:
- https://godbolt.org/z/KvdGfr4bY (original)
With:
- https://godbolt.org/z/vG5TGrhaY (64-bit return ellided)
I'd suggest folding the special case into the if statement below. We do already lose performance by thunking, but it's still a good idea not to slow down (or, rather, amplify the I-cache usage of) a builtin too much.
Meanwhile, clang does not appear to suffer from this problem.
It doesn't seem to be about mixing writes (I get the "bad" pattern even if I write the high and low parts independently), but rather GCC doesn't seem to be able to CSE the zero write to LowPart.
Since it's a simple enough tweak I'll submit a new version that's friendlier to gcc codegen.
Jinoh Kang (@iamahuman) commented about dlls/ntdll/tests/large_int.c:
static void test_builtins(void) { #ifdef __i386__
void *code_mem; ULONGLONG u; LONGLONG l;
static const BYTE call_shift_code[] =
{
0x31, 0xc0, /* xorl %eax,%eax */
0x31, 0xd2, /* xorl %edx,%edx */
0x31, 0xc9, /* xorl %ecx,%ecx */
0x87, 0x44, 0x24, 0x08, /* xchgl 8(%esp),%eax */
0x87, 0x54, 0x24, 0x0c, /* xchgl 12(%esp),%edx */
0x87, 0x4c, 0x24, 0x10, /* xchgl 16(%esp),%ecx */
0xff, 0x64, 0x24, 0x04, /* jmp *4(%esp) */
It might be a good idea to test for difference of `esp` before and after the call.
On 7/6/22 10:50, Jinoh Kang (@iamahuman) wrote:
Jinoh Kang (@iamahuman) commented about dlls/ntdll/tests/large_int.c:
static void test_builtins(void) { #ifdef __i386__
void *code_mem; ULONGLONG u; LONGLONG l;
static const BYTE call_shift_code[] =
{
0x31, 0xc0, /* xorl %eax,%eax */
0x31, 0xd2, /* xorl %edx,%edx */
0x31, 0xc9, /* xorl %ecx,%ecx */
0x87, 0x44, 0x24, 0x08, /* xchgl 8(%esp),%eax */
0x87, 0x54, 0x24, 0x0c, /* xchgl 12(%esp),%edx */
0x87, 0x4c, 0x24, 0x10, /* xchgl 16(%esp),%ecx */
0xff, 0x64, 0x24, 0x04, /* jmp *4(%esp) */
It might be a good idea to test for difference of `esp` before and after the call.
Yes, that strikes me as a good idea as well.
Jinoh Kang (@iamahuman) commented about dlls/ntdll/large_int.c:
+LONGLONG __regs__allshl( LONGLONG a, unsigned char b ) +{
- const LARGE_INTEGER x = { .QuadPart = a };
- LARGE_INTEGER ret;
- if (b >= 64)
return 0;
- if (b >= 32)
- {
ret.HighPart = x.LowPart << (b & 31);
ret.LowPart = 0;
- }
- else
- {
ret.HighPart = (x.LowPart >> (32 - b)) | (x.HighPart << b);
Shall we use intrinsics such as `__ll_lshift` (`shld` instruction)? Ditto for other functions.
On 7/6/22 10:50, Jinoh Kang (@iamahuman) wrote:
Jinoh Kang (@iamahuman) commented about dlls/ntdll/large_int.c:
+LONGLONG __regs__allshl( LONGLONG a, unsigned char b ) +{
- const LARGE_INTEGER x = { .QuadPart = a };
- LARGE_INTEGER ret;
- if (b >= 64)
return 0;
- if (b >= 32)
- {
ret.HighPart = x.LowPart << (b & 31);
ret.LowPart = 0;
- }
- else
- {
ret.HighPart = (x.LowPart >> (32 - b)) | (x.HighPart << b);
Shall we use intrinsics such as `__ll_lshift` (`shld` instruction)? Ditto for other functions.
I suppose we could, although ideally GCC would just recognize this pattern and generate shld.
Jinoh Kang (@iamahuman) commented about dlls/ntdll/large_int.c:
/******************************************************************************
_allshl (NTDLL.@)
- Shift a 64 bit integer to the left.
- PARAMS
- a [I] Initial number.
- b [I] Number to shift a by to the left.
- RETURNS
*/
- The left-shifted value.
-LONGLONG WINAPI _allshl( LONGLONG a, LONG b ) +__ASM_GLOBAL_FUNC( _allshl,
"xchgl (%esp),%ecx\n\t"
Maybe a nit, and this pattern also resides in other preëxisting code already, but do we _really_ want to use a full memory barrier in such cases?
On 7/6/22 10:50, Jinoh Kang (@iamahuman) wrote:
Jinoh Kang (@iamahuman) commented about dlls/ntdll/large_int.c:
- /******************************************************************************
_allshl (NTDLL.@)
- Shift a 64 bit integer to the left.
- PARAMS
- a [I] Initial number.
- b [I] Number to shift a by to the left.
- RETURNS
*/
- The left-shifted value.
-LONGLONG WINAPI _allshl( LONGLONG a, LONG b ) +__ASM_GLOBAL_FUNC( _allshl,
"xchgl (%esp),%ecx\n\t"
Maybe a nit, and this pattern also resides in other preëxisting code already, but do we _really_ want to use a full memory barrier in such cases?
I'm inclined to think it's not worth coding more assembly just to squeeze every last bit of performance out of these functions.