On 3/23/22 10:33, Elaine Lefler wrote:
Signed-off-by: Elaine Lefler <elaineclefler@gmail.com>
New vectorized implementation improves memcpy performance by up to 65%.
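For anyone who wants to reproduce that kind of number, a toy harness along the lines below should show the same shape. It is only a sketch - the buffer size, iteration count, and misalignment are made up, and a careful measurement would also pin the thread to one core and keep the compiler from optimizing the copy loop away:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

/* Hypothetical micro-benchmark: copy "size" bytes "iters" times from a
 * misaligned source and report rough throughput. */
int main( void )
{
    const size_t size = 1 << 20, iters = 4096, misalign = 3;
    unsigned char *src = malloc( size + 16 ), *dst = malloc( size + 16 );
    struct timespec t0, t1;
    size_t i;

    if (!src || !dst) return 1;
    memset( src, 0x5a, size + 16 );

    clock_gettime( CLOCK_MONOTONIC, &t0 );
    for (i = 0; i < iters; i++) memcpy( dst, src + misalign, size );
    clock_gettime( CLOCK_MONOTONIC, &t1 );

    /* bytes per nanosecond is (decimal) GB/s */
    printf( "%.2f GB/s\n", (double)size * iters /
            ((t1.tv_sec - t0.tv_sec) * 1e9 + (t1.tv_nsec - t0.tv_nsec)) );
    free( src );
    free( dst );
    return 0;
}

Varying misalign from 0 to 15 exercises each of the shifted paths the patch adds.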
MSVCRT has its own optimized implementation. Maybe deduplicate?
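One note for reviewers unfamiliar with the approach: _mm_srli_si128 and _mm_slli_si128 require immediate shift counts, which is why the patch below stamps out fifteen fastcpy_N variants from a macro instead of taking the shift as a runtime argument. The core splice, shown standalone for a fixed 4-byte misalignment (illustrative sketch only, helper name made up):

#include <emmintrin.h>
#include <stdint.h>

/* Copy 16 bytes to a 16-byte-aligned "dst" from data that starts 4 bytes
 * past the 16-byte boundary at "src_aligned". Both loads are aligned; the
 * shifts splice the two chunks together. Little endian puts the first
 * bytes of memory in the low lanes, so shifting right discards the
 * leading bytes we don't want. */
static void copy16_shift4( const uint8_t *src_aligned, uint8_t *dst )
{
    __m128i lo = _mm_load_si128( (const __m128i *)src_aligned );
    __m128i hi = _mm_load_si128( (const __m128i *)(src_aligned + 16) );
    __m128i merged = _mm_or_si128( _mm_srli_si128( lo, 4 ),
                                   _mm_slli_si128( hi, 16 - 4 ) );
    _mm_store_si128( (__m128i *)dst, merged );
}

Because the second load reads one chunk ahead of the store, the loop in the patch has to stop at the beginning of the last readable chunk - that is what as_end is for.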
 dlls/ntdll/string.c | 162 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 159 insertions(+), 3 deletions(-)
diff --git a/dlls/ntdll/string.c b/dlls/ntdll/string.c
index 0fa83821d21..443fc98418a 100644
--- a/dlls/ntdll/string.c
+++ b/dlls/ntdll/string.c
@@ -33,6 +33,16 @@
 #include "winternl.h"
 #include "ntdll_misc.h"
+#ifdef __x86_64__
+#include <x86intrin.h>
+/* Enable vectorized memcpy implementation (all x86-64 CPUs have SSE2).
+ * TODO: This could be enabled for x86 with a cpuid check. */
+#define SSE2_MEMCPY
+#endif
 /* same as wctypes except for TAB, which doesn't have C1_BLANK for some reason... */
 static const unsigned short ctypes[257] =
@@ -96,10 +106,154 @@ int __cdecl memcmp( const void *ptr1, const void *ptr2, size_t n )
 /*********************************************************************
  *                  memcpy   (NTDLL.@)
- * NOTES
- *  Behaves like memmove.
  */
+#ifdef SSE2_MEMCPY
+#define declare_fastcpy(n) \
+static void fastcpy_ ## n \
+( uintptr_t as, const uintptr_t as_end, uintptr_t d ) \
+{ \
+    __m128i x, y; \
+    x = *(const __m128i*)as; \
+    /* Read 32 bytes in, 16 bytes out. Shuffle variables when done so we don't
+     * re-read the first part. */ \
+    while (as < as_end) \
+    { \
+        /* Prefetch hint improves performance by minimizing cache pollution */ \
+        _mm_prefetch((const void*)(as + 16), _MM_HINT_NTA); \
+        _mm_prefetch((const void*)d, _MM_HINT_NTA); \
+        y = *(const __m128i*)(as + 16); \
+        /* (n) is the number of bytes in *as that don't go to *d. Little endian
+         * means the first bytes appear on the right, so srl to remove them */ \
+        x = _mm_srli_si128(x, (n)); \
+        /* Take same number of bytes from *(as + 16) and push them to the upper
+         * part of the register */ \
+        x = _mm_or_si128(x, _mm_slli_si128(y, 16 - (n))); \
+        *(__m128i*)d = x; \
+        d += 16; \
+        as += 16; \
+        x = y; \
+    } \
+}
+declare_fastcpy(1)
+declare_fastcpy(2)
+declare_fastcpy(3)
+declare_fastcpy(4)
+declare_fastcpy(5)
+declare_fastcpy(6)
+declare_fastcpy(7)
+declare_fastcpy(8)
+declare_fastcpy(9)
+declare_fastcpy(10)
+declare_fastcpy(11)
+declare_fastcpy(12)
+declare_fastcpy(13)
+declare_fastcpy(14)
+declare_fastcpy(15)
+typedef void (*fastcpy_ptr) ( uintptr_t, const uintptr_t, uintptr_t );
+static const fastcpy_ptr fastcpy_table[16] = {
+    NULL, /* special case, different code path */
+    fastcpy_1,
+    fastcpy_2,
+    fastcpy_3,
+    fastcpy_4,
+    fastcpy_5,
+    fastcpy_6,
+    fastcpy_7,
+    fastcpy_8,
+    fastcpy_9,
+    fastcpy_10,
+    fastcpy_11,
+    fastcpy_12,
+    fastcpy_13,
+    fastcpy_14,
+    fastcpy_15
+};
+void * __cdecl memcpy( void *dst, const void *src, size_t n )
+{
+    uintptr_t s = (uintptr_t)src;
+    uintptr_t d = (uintptr_t)dst;
+    uintptr_t as;
+
+    _mm_prefetch((const void*)s, _MM_HINT_NTA);
+    _mm_prefetch((const void*)d, _MM_HINT_NTA);
+
+    /* Ensure aligned destination */
+    while (d & 15)
+    {
+        if (n-- == 0)
+            return dst;
+
+        *(BYTE*)d++ = *(const BYTE*)s++;
+    }
+
+    if (n < 16)
+    {
+        /* Too small to vectorize */
+        while (n--) *(BYTE*)d++ = *(const BYTE*)s++;
+        return dst;
+    }
+
+    as = s & ~15;
+    if (as == s)
+    {
+        /* Fastest path: both pointers aligned */
+        while (n >= 16)
+        {
+            _mm_prefetch((const void*)s, _MM_HINT_NTA);
+            _mm_prefetch((const void*)d, _MM_HINT_NTA);
+            *(__m128i*)d = *(const __m128i*)s;
+            d += 16;
+            s += 16;
+            n -= 16;
+        }
+    }
+    else
+    {
+        /* Read from aligned s by rounding down. If as < src, we need to slow
+         * copy another 16 bytes to avoid OOB reads. */
+        ptrdiff_t shift = s - as;
+        uintptr_t as_end = ((s + n) & ~15) - 16;
+
+        if (as < (uintptr_t)src)
+        {
+            uintptr_t target_n = n - 16;
+            while (n > target_n)
+            {
+                if (n-- == 0)
+                    return dst;
+
+                *(BYTE*)d++ = *(const BYTE*)s++;
+            }
+
+            as += 16;
+        }
+
+        /* Copy 16-byte chunks if any are possible. Since s is misaligned, we
+         * need to read one chunk ahead of what we're writing, which means
+         * as_end must point to the _beginning_ of the last readable chunk.
+         * This also guarantees there is no overrun, since delta < n - 16. */
+        if (as_end > as)
+        {
+            ptrdiff_t delta = as_end - as;
+            fastcpy_table[shift](as, as_end, d);
+            s += delta;
+            d += delta;
+            n -= delta;
+        }
+    }
+
+    /* Slow copy anything that remains */
+    while (n--) *(BYTE*)d++ = *(const BYTE*)s++;
+    return dst;
+}
+#else /* defined(SSE2_MEMCPY) */
+/* Note: Behaves like memmove */
 void * __cdecl memcpy( void *dst, const void *src, size_t n )
 {
     volatile unsigned char *d = dst;  /* avoid gcc optimizations */
@@ -118,6 +272,8 @@ void * __cdecl memcpy( void *dst, const void *src, size_t n )
     return dst;
 }
+#endif /* !defined(SSE2_MEMCPY) */
 /*********************************************************************
  *                  memmove   (NTDLL.@)
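On the TODO about enabling this for 32-bit x86: with GCC or Clang the runtime check could look roughly like this - a sketch using <cpuid.h>, helper name made up:

#include <cpuid.h>
#include <stdbool.h>

/* SSE2 support is reported by CPUID leaf 1, EDX bit 26 (bit_SSE2 in
 * <cpuid.h>). On x86-64 this always succeeds, since SSE2 is part of the
 * baseline ISA there. */
static bool have_sse2( void )
{
    unsigned int eax, ebx, ecx, edx;
    if (!__get_cpuid( 1, &eax, &ebx, &ecx, &edx )) return false;
    return (edx & bit_SSE2) != 0;
}

The 32-bit build would then need to pick the implementation at load time, e.g. through a function pointer, which is presumably why the patch keeps the selection compile-time for now.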
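A separate behavioral point visible in the diff: the removed NOTES lines and the new /* Note: Behaves like memmove */ in the #else branch mean the memmove-like tolerance for overlap now only holds for the fallback path. Something like the following would have worked by accident with the old implementation but is undefined for a strict memcpy (illustrative only):

#include <stdio.h>
#include <string.h>

/* Overlapping forward copy: well-defined with memmove semantics, undefined
 * with a strict memcpy. The old ntdll memcpy tolerated it; the SSE2 path
 * may not. */
int main( void )
{
    char buf[32] = "abcdefghijklmnopqrstuvwxyz";
    memcpy( buf + 1, buf, 25 );   /* overlapping, dst > src */
    printf( "%.26s\n", buf );
    return 0;
}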