Signed-off-by: Jan Sikorski <jsikorski@codeweavers.com> --- The SSE2 implementation is about 13x faster on my machine than the byte-by-byte version. memcmp performance is important to wined3d, where it is used to find pipelines in the cache and the keys are fairly large. --- dlls/msvcrt/string.c | 46 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+)
diff --git a/dlls/msvcrt/string.c b/dlls/msvcrt/string.c index 3b352ac0bf2..66761e7a282 100644 --- a/dlls/msvcrt/string.c +++ b/dlls/msvcrt/string.c @@ -34,6 +34,10 @@ #include "wine/asm.h" #include "wine/debug.h"
+#ifdef __x86_64__ +#include <immintrin.h> +#endif + WINE_DEFAULT_DEBUG_CHANNEL(msvcrt);
/********************************************************************* @@ -2675,11 +2679,52 @@ int CDECL I10_OUTPUT(MSVCRT__LDOUBLE ld80, int prec, int flag, struct _I10_OUTPU } #undef I10_OUTPUT_MAX_PREC
+#ifdef __x86_64__ /* sse2_memcmp: compares size bytes at p1 and p2 as unsigned chars, 16 bytes per iteration using SSE2 intrinsics, then byte by byte for the remaining (size & 0xf) bytes. Returns -1 or 1 according to the first differing byte, or 0 if no byte differs. */ +static int sse2_memcmp(const void *p1, const void *p2, size_t size) +{ + const unsigned char *first = p1, *second = p2; + size_t remainder = size & 0xf; /* bytes left over after the full 16-byte chunks */ + size_t size_16 = size / 16; /* number of full 16-byte chunks */ + uint16_t mask; + DWORD index; + + while (size_16-- > 0) + { + __m128i value_1 = _mm_loadu_si128((__m128i *)first); /* loadu is the unaligned 16-byte load */ + __m128i value_2 = _mm_loadu_si128((__m128i *)second); + __m128i compare = _mm_cmpeq_epi8(value_1, value_2); /* each byte lane becomes 0xff where the inputs match, 0x00 where they differ */ + if ((mask = ~_mm_movemask_epi8(compare))) /* movemask packs one bit per byte lane; after inverting and truncating to 16 bits, a set bit marks a differing byte */ + { + _BitScanForward(&index, mask); /* index of the lowest set bit, i.e. the first differing byte in this chunk */ + if (first[index] < second[index]) return -1; + else return 1; /* the bytes are known to differ here, so not-less means greater */ + } + + first += 16; + second += 16; + } + + while (remainder-- > 0) /* compare the trailing bytes one at a time */ + { + if (*first < *second) return -1; + if (*first > *second) return 1; + + first++; + second++; + } + + return 0; +} +#endif + /********************************************************************* * memcmp (MSVCRT.@) */ int __cdecl memcmp(const void *ptr1, const void *ptr2, size_t n) { +#ifdef __x86_64__ /* x86_64 builds delegate to the SSE2 implementation above */ + return sse2_memcmp(ptr1, ptr2, n); +#else const unsigned char *p1, *p2;
for (p1 = ptr1, p2 = ptr2; n; n--, p1++, p2++) @@ -2688,6 +2733,7 @@ int __cdecl memcmp(const void *ptr1, const void *ptr2, size_t n) if (*p1 > *p2) return 1; } return 0; +#endif }
#if defined(__i386__) || defined(__x86_64__)