Signed-off-by: Elaine Lefler <elaineclefler@gmail.com>
---
New vectorized implementation improves performance by up to 65%.
---
 dlls/ntdll/string.c | 162 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 159 insertions(+), 3 deletions(-)
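(For reference, below is a minimal sketch of the kind of harness that could be used to check the speedup on a given machine. It is not the benchmark behind the 65% figure; the buffer size, iteration count and use of clock_gettime() are arbitrary assumptions, and it exercises whatever memcpy the binary links against.)

/* Hypothetical benchmark sketch, not part of the patch. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define BUF_SIZE (16 * 1024 * 1024)
#define ITERATIONS 100

static double time_copies( void *dst, const void *src, size_t n )
{
    struct timespec t0, t1;
    int i;

    clock_gettime( CLOCK_MONOTONIC, &t0 );
    for (i = 0; i < ITERATIONS; i++) memcpy( dst, src, n );
    clock_gettime( CLOCK_MONOTONIC, &t1 );
    return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
}

int main( void )
{
    char *src = malloc( BUF_SIZE + 16 );
    char *dst = malloc( BUF_SIZE + 16 );

    memset( src, 0xab, BUF_SIZE + 16 );
    /* Compare an aligned source against a deliberately misaligned one. */
    printf( "aligned:    %.3fs\n", time_copies( dst, src, BUF_SIZE ) );
    printf( "misaligned: %.3fs\n", time_copies( dst, src + 1, BUF_SIZE ) );
    free( src );
    free( dst );
    return 0;
}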
diff --git a/dlls/ntdll/string.c b/dlls/ntdll/string.c
index 0fa83821d21..443fc98418a 100644
--- a/dlls/ntdll/string.c
+++ b/dlls/ntdll/string.c
@@ -33,6 +33,16 @@
 #include "winternl.h"
 #include "ntdll_misc.h"
+#ifdef __x86_64__
+
+#include <x86intrin.h>
+
+/* Enable vectorized memcpy implementation (all x86-64 CPUs have SSE2).
+ * TODO: This could be enabled for x86 with a cpuid check. */
+#define SSE2_MEMCPY
+
+#endif
+
 /* same as wctypes except for TAB, which doesn't have C1_BLANK for some reason... */
 static const unsigned short ctypes[257] =
@@ -96,10 +106,154 @@ int __cdecl memcmp( const void *ptr1, const void *ptr2, size_t n )
 /*********************************************************************
  *                  memcpy   (NTDLL.@)
- *
- * NOTES
- *  Behaves like memmove.
  */
+#ifdef SSE2_MEMCPY
+
+#define declare_fastcpy(n) \
+static void fastcpy_ ## n \
+( uintptr_t as, const uintptr_t as_end, uintptr_t d ) \
+{ \
+    __m128i x, y; \
+    x = *(const __m128i*)as; \
+    /* Read 32 bytes in, 16 bytes out. Shuffle variables when done so we don't
+     * re-read the first part. */ \
+    while (as < as_end) \
+    { \
+        /* Prefetch hint improves performance by minimizing cache pollution */ \
+        _mm_prefetch((const void*)(as + 16), _MM_HINT_NTA); \
+        _mm_prefetch((const void*)d, _MM_HINT_NTA); \
+        y = *(const __m128i*)(as + 16); \
+        /* (n) is the number of bytes in *as that don't go to *d. Little endian
+         * means the first bytes appear on the right, so srl to remove them */ \
+        x = _mm_srli_si128(x, (n)); \
+        /* Take same number of bytes from *(as + 16) and push them to the upper
+         * part of the register */ \
+        x = _mm_or_si128(x, _mm_slli_si128(y, 16 - (n))); \
+        *(__m128i*)d = x; \
+        d += 16; \
+        as += 16; \
+        x = y; \
+    } \
+}
+
+declare_fastcpy(1)
+declare_fastcpy(2)
+declare_fastcpy(3)
+declare_fastcpy(4)
+declare_fastcpy(5)
+declare_fastcpy(6)
+declare_fastcpy(7)
+declare_fastcpy(8)
+declare_fastcpy(9)
+declare_fastcpy(10)
+declare_fastcpy(11)
+declare_fastcpy(12)
+declare_fastcpy(13)
+declare_fastcpy(14)
+declare_fastcpy(15)
+
+typedef void (*fastcpy_ptr) ( uintptr_t, const uintptr_t, uintptr_t );
+
+static const fastcpy_ptr fastcpy_table[16] = {
+    NULL, /* special case, different code path */
+    fastcpy_1,
+    fastcpy_2,
+    fastcpy_3,
+    fastcpy_4,
+    fastcpy_5,
+    fastcpy_6,
+    fastcpy_7,
+    fastcpy_8,
+    fastcpy_9,
+    fastcpy_10,
+    fastcpy_11,
+    fastcpy_12,
+    fastcpy_13,
+    fastcpy_14,
+    fastcpy_15
+};
+
+void * __cdecl memcpy( void *dst, const void *src, size_t n )
+{
+    uintptr_t s = (uintptr_t)src;
+    uintptr_t d = (uintptr_t)dst;
+    uintptr_t as;
+
+    _mm_prefetch((const void*)s, _MM_HINT_NTA);
+    _mm_prefetch((const void*)d, _MM_HINT_NTA);
+
+    /* Ensure aligned destination */
+    while (d & 15)
+    {
+        if (n-- == 0)
+            return dst;
+        *(BYTE*)d++ = *(const BYTE*)s++;
+    }
+
+    if (n < 16)
+    {
+        /* Too small to vectorize */
+        while (n--) *(BYTE*)d++ = *(const BYTE*)s++;
+        return dst;
+    }
+
+    as = s & ~15;
+    if (as == s)
+    {
+        /* Fastest path: both pointers aligned */
+        while (n >= 16)
+        {
+            _mm_prefetch((const void*)s, _MM_HINT_NTA);
+            _mm_prefetch((const void*)d, _MM_HINT_NTA);
+            *(__m128i*)d = *(const __m128i*)s;
+
+            d += 16;
+            s += 16;
+            n -= 16;
+        }
+    }
+    else
+    {
+        /* Read from aligned s by rounding down. If as < src, we need to slow
+         * copy another 16 bytes to avoid OOB reads. */
+        ptrdiff_t shift = s - as;
+        uintptr_t as_end = ((s + n) & ~15) - 16;
+
+        if (as < (uintptr_t)src)
+        {
+            uintptr_t target_n = n - 16;
+            while (n > target_n)
+            {
+                if (n-- == 0)
+                    return dst;
+                *(BYTE*)d++ = *(const BYTE*)s++;
+            }
+
+            as += 16;
+        }
+
+        /* Copy 16-byte chunks if any are possible. Since s is misaligned, we
+         * need to read one chunk ahead of what we're writing, which means
+         * as_end must point to the _beginning_ of the last readable chunk.
+         * This also guarantees there is no overrun: the furthest byte read
+         * is as_end + 15, and as_end + 16 <= s + n. */
+        if (as_end > as)
+        {
+            ptrdiff_t delta = as_end - as;
+            fastcpy_table[shift](as, as_end, d);
+            s += delta;
+            d += delta;
+            n -= delta;
+        }
+    }
+
+    /* Slow copy anything that remains */
+    while (n--) *(BYTE*)d++ = *(const BYTE*)s++;
+    return dst;
+}
+
+#else /* defined(SSE2_MEMCPY) */
+
+/* Note: Behaves like memmove */
 void * __cdecl memcpy( void *dst, const void *src, size_t n )
 {
     volatile unsigned char *d = dst;  /* avoid gcc optimizations */
@@ -118,6 +272,8 @@ void * __cdecl memcpy( void *dst, const void *src, size_t n )
     return dst;
 }
+#endif /* !defined(SSE2_MEMCPY) */
+
 /*********************************************************************
  *                  memmove   (NTDLL.@)
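For readers unfamiliar with the shuffling inside declare_fastcpy(): _mm_srli_si128()/_mm_slli_si128() only accept compile-time shift counts, which is why the patch instantiates one function per possible misalignment. The standalone program below is an illustration of the same recombination for a fixed misalignment of 5 bytes (the SHIFT constant and variable names are mine, not the patch's), checked against a plain byte copy.

/* Illustration only: build one 16-byte store from two aligned 16-byte loads
 * when the source is misaligned by SHIFT bytes, as declare_fastcpy() does. */
#include <emmintrin.h>
#include <stdio.h>
#include <string.h>

#define SHIFT 5 /* example misalignment, must be a constant in 1..15 */

int main( void )
{
    __attribute__((aligned(16))) unsigned char src[32];
    __attribute__((aligned(16))) unsigned char dst[16];
    unsigned char expect[16];
    __m128i lo, hi, out;
    int i;

    for (i = 0; i < 32; i++) src[i] = (unsigned char)i;

    lo = _mm_load_si128( (const __m128i *)src );        /* bytes 0..15  */
    hi = _mm_load_si128( (const __m128i *)(src + 16) ); /* bytes 16..31 */

    /* Drop the first SHIFT bytes of the low block (they precede the data we
     * want), then fill the vacated upper lanes with the first SHIFT bytes of
     * the high block. The result is the 16 bytes starting at src + SHIFT. */
    out = _mm_or_si128( _mm_srli_si128( lo, SHIFT ),
                        _mm_slli_si128( hi, 16 - SHIFT ) );
    _mm_store_si128( (__m128i *)dst, out );

    memcpy( expect, src + SHIFT, 16 );
    printf( "%s\n", memcmp( dst, expect, 16 ) ? "mismatch" : "match" );
    return 0;
}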
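A note on the TODO in the first hunk: on 32-bit x86 the presence of SSE2 cannot be assumed, so enabling this path there would need a runtime check. A minimal sketch of what that check could look like with GCC's <cpuid.h> is below; the helper name and the idea of using it to gate the SSE2 path at startup are assumptions, not something this patch implements.

/* Sketch only, not part of the patch: runtime SSE2 detection for 32-bit x86. */
#include <cpuid.h>

static int sse2_supported( void )
{
    unsigned int eax, ebx, ecx, edx;

    /* CPUID leaf 1, EDX bit 26 reports SSE2 support. */
    if (!__get_cpuid( 1, &eax, &ebx, &ecx, &edx )) return 0;
    return (edx & bit_SSE2) != 0;
}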