Signed-off-by: Elaine Lefler <elaineclefler@gmail.com>
---
New vectorized implementation improves performance up to 65%.
---
 dlls/ntdll/string.c | 162 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 159 insertions(+), 3 deletions(-)
diff --git a/dlls/ntdll/string.c b/dlls/ntdll/string.c
index 0fa83821d21..443fc98418a 100644
--- a/dlls/ntdll/string.c
+++ b/dlls/ntdll/string.c
@@ -33,6 +33,16 @@
 #include "winternl.h"
 #include "ntdll_misc.h"
+#ifdef __x86_64__
+
+#include <x86intrin.h>
+
+/* Enable vectorized memcpy implementation (all x86-64 CPUs have SSE2).
+ * TODO: This could be enabled for x86 with a cpuid check. */
+#define SSE2_MEMCPY
+
+#endif
+
 /* same as wctypes except for TAB, which doesn't have C1_BLANK for some reason... */
 static const unsigned short ctypes[257] =
@@ -96,10 +106,154 @@ int __cdecl memcmp( const void *ptr1, const void *ptr2, size_t n )
 /*********************************************************************
  *                  memcpy   (NTDLL.@)
- *
- * NOTES
- *  Behaves like memmove.
  */
+#ifdef SSE2_MEMCPY
+
+#define declare_fastcpy(n) \
+static void fastcpy_ ## n \
+( uintptr_t as, const uintptr_t as_end, uintptr_t d ) \
+{ \
+    __m128i x, y; \
+    x = *(const __m128i*)as; \
+    /* Read 32 bytes in, 16 bytes out. Shuffle variables when done so we don't
+     * re-read the first part. */ \
+    while (as < as_end) \
+    { \
+        /* Prefetch hint improves performance by minimizing cache pollution */ \
+        _mm_prefetch((const void*)(as + 16), _MM_HINT_NTA); \
+        _mm_prefetch((const void*)d, _MM_HINT_NTA); \
+        y = *(const __m128i*)(as + 16); \
+        /* (n) is the number of bytes in *as that don't go to *d. Little endian
+         * means the first bytes appear on the right, so srl to remove them */ \
+        x = _mm_srli_si128(x, (n)); \
+        /* Take same number of bytes from *(as + 16) and push them to the upper
+         * part of the register */ \
+        x = _mm_or_si128(x, _mm_slli_si128(y, 16 - (n))); \
+        *(__m128i*)d = x; \
+        d += 16; \
+        as += 16; \
+        x = y; \
+    } \
+}
+
+declare_fastcpy(1)
+declare_fastcpy(2)
+declare_fastcpy(3)
+declare_fastcpy(4)
+declare_fastcpy(5)
+declare_fastcpy(6)
+declare_fastcpy(7)
+declare_fastcpy(8)
+declare_fastcpy(9)
+declare_fastcpy(10)
+declare_fastcpy(11)
+declare_fastcpy(12)
+declare_fastcpy(13)
+declare_fastcpy(14)
+declare_fastcpy(15)
+
+typedef void (*fastcpy_ptr) ( uintptr_t, const uintptr_t, uintptr_t );
+
+static const fastcpy_ptr fastcpy_table[16] = {
+    NULL, /* special case, different code path */
+    fastcpy_1,
+    fastcpy_2,
+    fastcpy_3,
+    fastcpy_4,
+    fastcpy_5,
+    fastcpy_6,
+    fastcpy_7,
+    fastcpy_8,
+    fastcpy_9,
+    fastcpy_10,
+    fastcpy_11,
+    fastcpy_12,
+    fastcpy_13,
+    fastcpy_14,
+    fastcpy_15
+};
+
+void * __cdecl memcpy( void *dst, const void *src, size_t n )
+{
+    uintptr_t s = (uintptr_t)src;
+    uintptr_t d = (uintptr_t)dst;
+    uintptr_t as;
+
+    _mm_prefetch((const void*)s, _MM_HINT_NTA);
+    _mm_prefetch((const void*)d, _MM_HINT_NTA);
+
+    /* Ensure aligned destination */
+    while (d & 15)
+    {
+        if (n-- == 0)
+            return dst;
+        *(BYTE*)d++ = *(const BYTE*)s++;
+    }
+
+    if (n < 16)
+    {
+        /* Too small to vectorize */
+        while (n--) *(BYTE*)d++ = *(const BYTE*)s++;
+        return dst;
+    }
+
+    as = s & ~15;
+    if (as == s)
+    {
+        /* Fastest path: both pointers aligned */
+        while (n >= 16)
+        {
+            _mm_prefetch((const void*)s, _MM_HINT_NTA);
+            _mm_prefetch((const void*)d, _MM_HINT_NTA);
+            *(__m128i*)d = *(const __m128i*)s;
+
+            d += 16;
+            s += 16;
+            n -= 16;
+        }
+    }
+    else
+    {
+        /* Read from aligned s by rounding down. If as < src, we need to slow
+         * copy another 16 bytes to avoid OOB reads. */
+        ptrdiff_t shift = s - as;
+        uintptr_t as_end = ((s + n) & ~15) - 16;
+
+        if (as < (uintptr_t)src)
+        {
+            uintptr_t target_n = n - 16;
+            while (n > target_n)
+            {
+                if (n-- == 0)
+                    return dst;
+                *(BYTE*)d++ = *(const BYTE*)s++;
+            }
+
+            as += 16;
+        }
+
+        /* Copy 16-byte chunks if any are possible. Since s is misaligned, we
+         * need to read one chunk ahead of what we're writing, which means
+         * as_end must point to the _beginning_ of the last readable chunk.
+         * This also guarantees there is no overrun, since delta < n - 16. */
+        if (as_end > as)
+        {
+            ptrdiff_t delta = as_end - as;
+            fastcpy_table[shift](as, as_end, d);
+            s += delta;
+            d += delta;
+            n -= delta;
+        }
+    }
+
+    /* Slow copy anything that remains */
+    while (n--) *(BYTE*)d++ = *(const BYTE*)s++;
+    return dst;
+}
+
+#else /* defined(SSE2_MEMCPY) */
+
+/* Note: Behaves like memmove */
 void * __cdecl memcpy( void *dst, const void *src, size_t n )
 {
     volatile unsigned char *d = dst;  /* avoid gcc optimizations */
@@ -118,6 +272,8 @@ void * __cdecl memcpy( void *dst, const void *src, size_t n )
     return dst;
 }
+#endif /* !defined(SSE2_MEMCPY) */
+
 /*********************************************************************
  *                  memmove   (NTDLL.@)
On 3/23/22 10:33, Elaine Lefler wrote:
> Signed-off-by: Elaine Lefler <elaineclefler@gmail.com>
> New vectorized implementation improves performance up to 65%.

MSVCRT has one. Maybe deduplicate?
On 3/30/22 17:27, Jinoh Kang wrote:
> On 3/23/22 10:33, Elaine Lefler wrote:
>> Signed-off-by: Elaine Lefler <elaineclefler@gmail.com>
>> New vectorized implementation improves performance up to 65%.
>
> MSVCRT has one. Maybe deduplicate?
IIUC upstream isn't very interested in assembly-optimized routines unless really necessary.

The msvcrt implementation was probably necessary because it's often called by apps and needs to be as optimal as possible, but I'm not sure ntdll memcpy is used so much. Maybe for realloc though, in which case it might be useful indeed.

I think an unrolled version like the one done for memset should already give good results and should work portably (though I got bitten with memset already, and I wasn't very keen on trying again with memcpy so soon).
Something like this maybe, if anyone wants to try or review:
static FORCEINLINE void memmove_unaligned_24( char *d, const char *s, size_t n )
{
    typedef uint64_t DECLSPEC_ALIGN(1) unaligned_ui64;
    typedef uint32_t DECLSPEC_ALIGN(1) unaligned_ui32;
    typedef uint16_t DECLSPEC_ALIGN(1) unaligned_ui16;
    uint64_t tmp0, tmp1, tmpn;

    if (n >= 16)
    {
        tmp0 = *(unaligned_ui64 *)s;
        tmp1 = *(unaligned_ui64 *)(s + 8);
        tmpn = *(unaligned_ui64 *)(s + n - 8);
        *(unaligned_ui64 *)d = tmp0;
        *(unaligned_ui64 *)(d + 8) = tmp1;
        *(unaligned_ui64 *)(d + n - 8) = tmpn;
    }
    else if (n >= 8)
    {
        tmp0 = *(unaligned_ui64 *)s;
        tmpn = *(unaligned_ui64 *)(s + n - 8);
        *(unaligned_ui64 *)d = tmp0;
        *(unaligned_ui64 *)(d + n - 8) = tmpn;
    }
    else if (n >= 4)
    {
        tmp0 = *(unaligned_ui32 *)s;
        tmpn = *(unaligned_ui32 *)(s + n - 4);
        *(unaligned_ui32 *)d = tmp0;
        *(unaligned_ui32 *)(d + n - 4) = tmpn;
    }
    else if (n >= 2)
    {
        tmp0 = *(unaligned_ui16 *)s;
        tmpn = *(unaligned_ui16 *)(s + n - 2);
        *(unaligned_ui16 *)d = tmp0;
        *(unaligned_ui16 *)(d + n - 2) = tmpn;
    }
    else if (n >= 1)
    {
        *(uint8_t *)d = *(uint8_t *)s;
    }
}

static FORCEINLINE void *memmove_unrolled( char *dst, const char *src, size_t n )
{
    typedef uint64_t DECLSPEC_ALIGN(1) unaligned_ui64;
    uint64_t tmp0, tmp1, tmp2;
    char *end;

    if (n <= 24) memmove_unaligned_24( dst, src, n );
    else if ((size_t)dst - (size_t)src >= n)
    {
        end = dst + n;
        src += n;
        do
        {
            tmp0 = *(unaligned_ui64 *)(src - n + 0);
            tmp1 = *(unaligned_ui64 *)(src - n + 8);
            tmp2 = *(unaligned_ui64 *)(src - n + 16);
            *(unaligned_ui64*)(end - n + 0) = tmp0;
            *(unaligned_ui64*)(end - n + 8) = tmp1;
            *(unaligned_ui64*)(end - n + 16) = tmp2;
            n -= 24;
        }
        while (n >= 24);
        memmove_unaligned_24( end - n, src - n, n );
    }
    else
    {
        do
        {
            tmp0 = *(unaligned_ui64 *)(src + n - 8);
            tmp1 = *(unaligned_ui64 *)(src + n - 16);
            tmp2 = *(unaligned_ui64 *)(src + n - 24);
            *(unaligned_ui64*)(dst + n - 8) = tmp0;
            *(unaligned_ui64*)(dst + n - 16) = tmp1;
            *(unaligned_ui64*)(dst + n - 24) = tmp2;
            n -= 24;
        }
        while (n >= 24);
        memmove_unaligned_24( dst, src, n );
    }
    return dst;
}

/*********************************************************************
 *                  memcpy   (NTDLL.@)
 *
 * NOTES
 *  Behaves like memmove.
 */
void * __cdecl memcpy( void *dst, const void *src, size_t n )
{
    return memmove_unrolled( dst, src, n );
}

/*********************************************************************
 *                  memmove   (NTDLL.@)
 */
void * __cdecl memmove( void *dst, const void *src, size_t n )
{
    return memmove_unrolled( dst, src, n );
}
On 30/03/2022 19:33, Rémi Bernon wrote:
> IIUC upstream isn't very interested in assembly-optimized routines unless really necessary.
>
> The msvcrt implementation was probably necessary because it's often called by apps and needs to be as optimal as possible, but I'm not sure ntdll memcpy is used so much. Maybe for realloc though, in which case it might be useful indeed.
>
> I think an unrolled version like the one done for memset should already give good results and should work portably (though I got bitten with memset already, and I wasn't very keen on trying again with memcpy so soon).

Why not just copy-paste it from msvcrt, since it's already done?
On 3/30/22 19:46, Gabriel Ivăncescu wrote:
> On 30/03/2022 19:33, Rémi Bernon wrote:
>> I think an unrolled version like the one done for memset should already give good results and should work portably (though I got bitten with memset already, and I wasn't very keen on trying again with memcpy so soon).
>
> Why not just copy-paste it from msvcrt, since it's already done?

Copying the C version? Sure, why not, though looking at it, it feels unnecessarily complex.
On Wed, Mar 30, 2022 at 11:46 AM Gabriel Ivăncescu <gabrielopcode@gmail.com> wrote:
> Why not just copy-paste it from msvcrt, since it's already done?
This is interesting. Mine is more efficient than the msvcrt version (at least for SSE2), because the msvcrt version relies on unaligned moves, whereas mine uses bit shifting so that both the src and dst accesses can stay aligned. Also, in my testing I found that a plain old byte copy is most efficient for the first/last parts of the function; anything else just causes branch mispredictions.
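(For illustration, the unaligned-move style being contrasted here looks roughly like the sketch below. This is not code from either patch: the helper name is made up, and it assumes dst is 16-byte aligned and n is a multiple of 16.)

/* Illustrative sketch only, not from either patch: copy with an unaligned
 * load (movdqu) and an aligned store (movdqa). Helper name is made up;
 * dst must be 16-byte aligned and n a multiple of 16. */
#include <emmintrin.h>
#include <stddef.h>

static void copy16_unaligned_src( unsigned char *dst, const unsigned char *src, size_t n )
{
    while (n >= 16)
    {
        __m128i x = _mm_loadu_si128( (const __m128i *)src );
        _mm_store_si128( (__m128i *)dst, x );
        src += 16;
        dst += 16;
        n -= 16;
    }
}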
It would also be worthwhile to write an AVX2 version of this function for systems that support it (which is most of them nowadays). AVX2 has 32-byte registers and can move data a lot faster. It needs a cpuid check, though, and it's probably not worth doing for anything other than memcpy.
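(A minimal sketch of what a runtime-dispatched AVX2 path could look like, assuming GCC/Clang's __builtin_cpu_supports and target attribute; the function names are made up and none of this is from the patch.)

/* Illustrative sketch only: 32-byte copy loop gated by a runtime CPU check.
 * The target attribute lets this compile without building everything with
 * -mavx2. */
#include <immintrin.h>
#include <stddef.h>

__attribute__((target("avx2")))
static void copy32_avx2( unsigned char *dst, const unsigned char *src, size_t n )
{
    while (n >= 32)
    {
        __m256i x = _mm256_loadu_si256( (const __m256i *)src );
        _mm256_storeu_si256( (__m256i *)dst, x );
        src += 32;
        dst += 32;
        n -= 32;
    }
    while (n--) *dst++ = *src++;   /* remaining tail */
}

static void copy_dispatch( unsigned char *dst, const unsigned char *src, size_t n )
{
    if (__builtin_cpu_supports( "avx2" )) copy32_avx2( dst, src, n );
    else while (n--) *dst++ = *src++;  /* byte copy here; a real build would fall back to the SSE2 path */
}

In practice the CPU check would be done once at load time rather than on every call.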
As for memset: while the code looks like it deals in aligned pointers, examining the output with objdump shows that GCC fails to produce any MOVDQA/MOVAPS/MOVNTDQ instructions, so it's not really doing what it claims to do. To get those instructions emitted, you need to define an aligned 16-byte structure and use it in your pointers.
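(A sketch of that idea, with made-up names: storing through a 16-byte-aligned aggregate type is what typically gets GCC to use aligned stores for the bulk of the loop. Wine code would use DECLSPEC_ALIGN(16) rather than the raw attribute.)

/* Illustrative sketch only: store through a 16-byte-aligned struct so the
 * compiler can use movdqa/movaps for the main loop. dst must already be
 * 16-byte aligned and n a multiple of 16 here. */
#include <stddef.h>
#include <stdint.h>

typedef struct { uint64_t q[2]; } __attribute__((aligned(16))) aligned_block16;

static void set16_aligned( void *dst, unsigned char c, size_t n )
{
    uint64_t v = 0x0101010101010101ull * c;  /* replicate the byte into 64 bits */
    aligned_block16 x = { { v, v } };
    aligned_block16 *p = dst;

    while (n >= 16)
    {
        *p++ = x;
        n -= 16;
    }
}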
On Wed, Mar 30, 2022 at 10:34 AM Rémi Bernon <rbernon@codeweavers.com> wrote:
> IIUC upstream isn't very interested in assembly-optimized routines unless really necessary.
I don't think hand-optimized asm is necessary here, though the use of _mm_ intrinsics is pretty close to it. I think my routine can be refactored to be more platform-independent. The main annoyance is that PSLLDQ/PSRLDQ only accept compile-time constants as shift values, meaning each of the fastcpy functions has slightly different machine code; hence the jump table. You really don't want conditional jumps inside that inner loop. For memcpy I think the annoyance is worth it, but I'd hesitate to use it elsewhere.
To be honest, I had planned to submit patches for several of these functions, including memset and strlen. Most of them can be done without special intrinsics, but the two-argument functions like strcmp get a lot uglier. It might be best to create a separate .c file for platform-specific implementations. IMO string.c is widely used enough to justify the extra complexity, but of course I'm not the project maintainer.
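(As an example of the single-argument case, here is roughly what an SSE2 strlen could look like; illustrative only, the name is made up, and it assumes GCC-style builtins. Aligned 16-byte loads never cross a page boundary, which is what makes reading slightly past the terminator safe.)

/* Illustrative sketch only: scan 16 aligned bytes at a time for the
 * terminating zero. Name is made up; __builtin_ctz is a GCC/Clang builtin. */
#include <emmintrin.h>
#include <stddef.h>
#include <stdint.h>

static size_t strlen_sse2( const char *str )
{
    const __m128i zero = _mm_setzero_si128();
    const char *p = (const char *)((uintptr_t)str & ~(uintptr_t)15);
    unsigned int mask;

    /* First aligned block: shift out the bytes that precede str */
    mask = _mm_movemask_epi8( _mm_cmpeq_epi8( *(const __m128i *)p, zero ) );
    mask >>= (uintptr_t)str & 15;
    if (mask) return __builtin_ctz( mask );

    for (;;)
    {
        p += 16;
        mask = _mm_movemask_epi8( _mm_cmpeq_epi8( *(const __m128i *)p, zero ) );
        if (mask) return (size_t)(p - str) + __builtin_ctz( mask );
    }
}

The same pattern extends to memchr; two-pointer functions like strcmp are messier because the two arguments rarely share the same alignment.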
The duplication between msvcrt and ntdll seems strange. From what I can tell (I may be wrong), most apps are probably using the msvcrt version, but it looks like Wine libraries run the ntdll version instead. Also, msvcrt seems to have some routines that are more optimized than ntdll's, but memset, and only memset, has the optimized version in both. So it looks like I'm not the first person to miss this detail. Is it possible to remove the code from ntdll and only call msvcrt (or the other way around)? Or, worst case, have them both compile the same C file so we don't have the same function in two places?
- Elaine
Looking into the code duplication some more: msvcrt links with ntdll, so it seems the simplest solution is to remove the functions from msvcrt, which will cause the ntdll versions to be called.
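(If that route were taken, the usual mechanism would be forwarding the exports in msvcrt's spec file rather than deleting them outright; something along these lines, though the exact entries would need to be checked against the current spec files.)

# Illustrative only: possible forwards in dlls/msvcrt/msvcrt.spec
@ cdecl memcpy(ptr ptr long) ntdll.memcpy
@ cdecl memmove(ptr ptr long) ntdll.memmove
@ cdecl memset(ptr long long) ntdll.memset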
It's not as clean as I'd like, though. Some of the functions are better in msvcrt, so they should be backported, and others can't be moved at all. This isn't the only source of duplication either. For instance, math.c seems to be full of duplicates, but I don't even want to touch that...
If upstream typically doesn't like this kind of patch, maybe I should resubmit after I've developed an AVX2 patch as well? That would provide a much more convincing argument for speed, compared to the current patch which is fairly marginal.
Elaine Lefler <elaineclefler@gmail.com> writes:
> If upstream typically doesn't like this kind of patch, maybe I should resubmit after I've developed an AVX2 patch as well? That would provide a much more convincing argument for speed, compared to the current patch which is fairly marginal.
Note that to make a convincing speed argument, you need to show a measurable improvement for a real-world app. That's especially true for these ntdll functions, since apps are expected to mostly use the msvcrt variants instead.
On 4/2/22 12:39, Elaine Lefler wrote:
> Looking into the code duplication some more: msvcrt links with ntdll, so it seems the simplest solution is to remove the functions from msvcrt, which will cause the ntdll versions to be called.
Have you checked with Windows ntdll.dll? Note that some apps might actually use memcpy/... from ntdll and not msvcrt. If Windows supports them, so should we.
> It's not as clean as I'd like, though. Some of the functions are better in msvcrt, so they should be backported, and others can't be moved at all. This isn't the only source of duplication either. For instance, math.c seems to be full of duplicates, but I don't even want to touch that...
Maybe use PARENTSRC?
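(For context, PARENTSRC is how the msvcrt forks already share their C sources; roughly like this in, e.g., dlls/msvcrt40/Makefile.in, quoted from memory rather than verbatim. Whether the same trick fits ntdll/msvcrt sharing is exactly the open question here.)

# Illustrative only: how the msvcrt forks pull their sources from ../msvcrt
MODULE    = msvcrt40.dll
PARENTSRC = ../msvcrt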
> If upstream typically doesn't like this kind of patch, maybe I should resubmit after I've developed an AVX2 patch as well? That would provide a much more convincing argument for speed, compared to the current patch which is fairly marginal.