Signed-off-by: Jan Sikorski <jsikorski@codeweavers.com>
---
 dlls/msvcrt/string.c | 63 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 59 insertions(+), 4 deletions(-)
diff --git a/dlls/msvcrt/string.c b/dlls/msvcrt/string.c
index 3b352ac0bf2..7a753fbcd21 100644
--- a/dlls/msvcrt/string.c
+++ b/dlls/msvcrt/string.c
@@ -2675,10 +2675,7 @@ int CDECL I10_OUTPUT(MSVCRT__LDOUBLE ld80, int prec, int flag, struct _I10_OUTPU
 }
 #undef I10_OUTPUT_MAX_PREC
 
-/*********************************************************************
- *                  memcmp (MSVCRT.@)
- */
-int __cdecl memcmp(const void *ptr1, const void *ptr2, size_t n)
+static inline int memcmp_unaligned(const void *ptr1, const void *ptr2, size_t n)
 {
     const unsigned char *p1, *p2;
@@ -2690,6 +2687,64 @@ int __cdecl memcmp(const void *ptr1, const void *ptr2, size_t n)
     return 0;
 }
 
+static int memcmp_aligned(const void *p1, const void *p2, size_t size)
+{
+    const unsigned char *first = p1, *second = p2;
+    const size_t block_size = sizeof(size_t);
+
+    size_t remainder = size & (block_size - 1);
+    size_t block_count = size / block_size;
+
+    while (block_count-- > 0)
+    {
+        size_t value_1 = *(size_t *)first;
+        size_t value_2 = *(size_t *)second;
+
+        if (value_1 != value_2)
+            return memcmp_unaligned(first, second, block_size);
+
+        first += block_size;
+        second += block_size;
+    }
+
+    return memcmp_unaligned(first, second, remainder);
+}
+
+/*********************************************************************
+ *                  memcmp (MSVCRT.@)
+ */
+int __cdecl memcmp(const void *ptr1, const void *ptr2, size_t n)
+{
+    const size_t block_size = sizeof(size_t);
+    const unsigned char *p1 = ptr1, *p2 = ptr2;
+    size_t align;
+    int result;
+
+    if (n < block_size)
+        return memcmp_unaligned(p1, p2, n);
+
+    align = -(uintptr_t)p1 & (block_size - 1);
+
+    if ((result = memcmp_unaligned(p1, p2, align)))
+        return result;
+
+    p1 += align;
+    p2 += align;
+    n -= align;
+
+#if defined(__i386__) || defined(__x86_64__)
+    return memcmp_aligned(p1, p2, n);
+#else
+    if (!((uintptr_t)p2 & (block_size - 1)))
+    {
+        result = memcmp_aligned(p1, p2, n);
+        return result;
+    }
+
+    return memcmp_unaligned(p1, p2, n);
+#endif
+}
+
 #if defined(__i386__) || defined(__x86_64__)
 
 #ifdef __i386__
Hi Jan,
On 4/6/22 19:14, Jan Sikorski wrote:
> -/*********************************************************************
> - *                  memcmp (MSVCRT.@)
> - */
> -int __cdecl memcmp(const void *ptr1, const void *ptr2, size_t n)
> +static inline int memcmp_unaligned(const void *ptr1, const void *ptr2, size_t n)
>  {
>      const unsigned char *p1, *p2;
> @@ -2690,6 +2687,64 @@ int __cdecl memcmp(const void *ptr1, const void *ptr2, size_t n)
>      return 0;
>  }
I think it would be good to optimize memcmp_unaligned a little. I'm thinking about something along these lines (untested):

static int memcmp_size_t(size_t s1, size_t s2)
{
    const uint8_t *p1 = (const uint8_t*)&s1, *p2 = (const uint8_t*)&s2;

    while (*p1 == *p2)
    {
        p1++;
        p2++;
    }
    return *p1 > *p2 ? 1 : -1;
}
static int memcmp_unaligned(const char *c1, const char *c2, size_t len)
{
    int sh1 = 8 * ((size_t)c2 % sizeof(size_t));
    int sh2 = 8 * sizeof(size_t) - sh1;
    const size_t *s1 = (const size_t*)c1;
    const size_t *s2 = (const size_t*)(c2 - sh1 / 8);
    size_t x, y, m;

    x = s2[0];
    do
    {
        y = s2[1];
        m = MERGE(x, sh1, y, sh2);
        if (*s1 != m) return memcmp_size_t(*s1, m);
        s1++;
        s2++;
        len--;
        x = y;
    } while (len);
    return 0;
}
Here MERGE is already defined in the string.c file, and len is the length in sizeof(size_t) blocks instead of bytes. It may be even better to switch to uint64_t instead of size_t, like in memset. You can also take a look at glibc's platform-independent implementation (it uses the MERGE trick plus loop unrolling; according to some random benchmark it's on par with your implementation performance-wise on i386/x86_64 and much faster on ARM).
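For reference, MERGE combines two adjacent aligned words into the unaligned word that spans them. Roughly, in its little-endian form (a sketch from memory, not copied from string.c, so double-check the actual definition there):

/* Take the high bytes of w1 and the low bytes of w2 to form the word
 * starting sh1/8 bytes into w1, little-endian form. E.g. with a 4-byte
 * size_t and sh1 == 8, (w1 >> 8) yields bytes 1-3 of w1 in the low
 * positions and (w2 << 24) supplies the following byte. Note sh1 == 0
 * (c2 already aligned) needs special-casing, since then sh2 equals the
 * full word width and shifting by that much is undefined behaviour. */
#define MERGE(w1, sh1, w2, sh2) (((w1) >> (sh1)) | ((w2) << (sh2)))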
Thanks,
Piotr
On 4/13/22 15:50, Piotr Caban wrote:
> [...]
> I think it would be good to optimize memcmp_unaligned a little. I'm thinking about something along these lines (untested):
>
> static int memcmp_size_t(size_t s1, size_t s2)
> {
>     const uint8_t *p1 = (const uint8_t*)&s1, *p2 = (const uint8_t*)&s2;
>
>     while (*p1 == *p2)
>     {
>         p1++;
>         p2++;
>     }
>     return *p1 > *p2 ? 1 : -1;
> }
You can make the small case branchless (and slightly faster) by converting both values to big endian and then comparing them at once.
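Something along these lines (untested sketch, assuming GCC/Clang __builtin_bswap* builtins and a little-endian target):

#include <stdint.h>

/* Byte-swap to big endian so that the numeric comparison of the words
 * matches the lexicographic byte order; no per-byte loop needed.
 * The caller only invokes this when the words differ, so s1 != s2. */
static int memcmp_size_t(size_t s1, size_t s2)
{
#if SIZE_MAX == UINT64_MAX
    s1 = __builtin_bswap64(s1);
    s2 = __builtin_bswap64(s2);
#else
    s1 = __builtin_bswap32(s1);
    s2 = __builtin_bswap32(s2);
#endif
    return s1 > s2 ? 1 : -1;
}

A real patch would need portable fallbacks for non-GCC compilers and big-endian targets (where the swap can simply be skipped).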
> [...]
FWIW, for me and with a basic benchmark, this version is 3x slower than Jan's version on 32-bit x86, and 2x slower on 64-bit.
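The benchmark is nothing fancy; think of something along the lines of this sketch (hypothetical harness, not the exact one used, with arbitrary buffer sizes and iteration counts), built once per memcmp implementation:

#include <stdio.h>
#include <string.h>
#include <time.h>

/* Times repeated memcmp calls over equal buffers, so every call scans
 * the full length; the volatile sink keeps the calls from being
 * optimized away. */
int main(void)
{
    enum { SIZE = 4096, ITERS = 1000000 };
    static unsigned char a[SIZE], b[SIZE];
    volatile int sink = 0;
    clock_t start;
    int i;

    memset(a, 0x5a, SIZE);
    memset(b, 0x5a, SIZE);

    start = clock();
    for (i = 0; i < ITERS; i++)
        sink += memcmp(a, b, SIZE);

    printf("%.3fs (sink=%d)\n", (double)(clock() - start) / CLOCKS_PER_SEC, sink);
    return 0;
}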
On 4/19/22 10:39, Rémi Bernon wrote:
> On 4/13/22 15:50, Piotr Caban wrote:
>> [...]
>
> FWIW, for me and with a basic benchmark, this version is 3x slower than Jan's version on 32-bit x86, and 2x slower on 64-bit.
To make sure we're talking about the same thing: I meant that glibc's (platform-independent) version has similar performance. The code I attached here was only meant to show the general idea while keeping it as simple as possible.