On 3/23/22 10:33, Elaine Lefler wrote:
Signed-off-by: Elaine Lefler <elaineclefler@gmail.com>
New vectorized implementation improves memcpy performance by up to 65%.
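For anyone who wants to reproduce that kind of number, a toy harness along the lines below should show the same shape. It is only a sketch - the buffer size, iteration count, and misalignment are made up, and a careful measurement would also pin the thread to one core and keep the compiler from optimizing the copy loop away:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

/* Hypothetical micro-benchmark: copy "size" bytes "iters" times from a
 * misaligned source and report rough throughput. */
int main( void )
{
    const size_t size = 1 << 20, iters = 4096, misalign = 3;
    unsigned char *src = malloc( size + 16 ), *dst = malloc( size + 16 );
    struct timespec t0, t1;
    size_t i;

    if (!src || !dst) return 1;
    memset( src, 0x5a, size + 16 );

    clock_gettime( CLOCK_MONOTONIC, &t0 );
    for (i = 0; i < iters; i++) memcpy( dst, src + misalign, size );
    clock_gettime( CLOCK_MONOTONIC, &t1 );

    /* bytes per nanosecond is (decimal) GB/s */
    printf( "%.2f GB/s\n", (double)size * iters /
            ((t1.tv_sec - t0.tv_sec) * 1e9 + (t1.tv_nsec - t0.tv_nsec)) );
    free( src );
    free( dst );
    return 0;
}

Varying misalign from 0 to 15 exercises each of the shifted paths the patch adds.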
MSVCRT has its own optimized implementation. Maybe deduplicate?
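One note for reviewers unfamiliar with the approach: _mm_srli_si128 and _mm_slli_si128 require immediate shift counts, which is why the patch below stamps out fifteen fastcpy_N variants from a macro instead of taking the shift as a runtime argument. The core splice, shown standalone for a fixed 4-byte misalignment (illustrative sketch only, helper name made up):

#include <emmintrin.h>
#include <stdint.h>

/* Copy 16 bytes to a 16-byte-aligned "dst" from data that starts 4 bytes
 * past the 16-byte boundary at "src_aligned". Both loads are aligned; the
 * shifts splice the two chunks together. Little endian puts the first
 * bytes of memory in the low lanes, so shifting right discards the
 * leading bytes we don't want. */
static void copy16_shift4( const uint8_t *src_aligned, uint8_t *dst )
{
    __m128i lo = _mm_load_si128( (const __m128i *)src_aligned );
    __m128i hi = _mm_load_si128( (const __m128i *)(src_aligned + 16) );
    __m128i merged = _mm_or_si128( _mm_srli_si128( lo, 4 ),
                                   _mm_slli_si128( hi, 16 - 4 ) );
    _mm_store_si128( (__m128i *)dst, merged );
}

Because the second load reads one chunk ahead of the store, the loop in the patch has to stop at the beginning of the last readable chunk - that is what as_end is for.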
 dlls/ntdll/string.c | 162 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 159 insertions(+), 3 deletions(-)
diff --git a/dlls/ntdll/string.c b/dlls/ntdll/string.c
index 0fa83821d21..443fc98418a 100644
--- a/dlls/ntdll/string.c
+++ b/dlls/ntdll/string.c
@@ -33,6 +33,16 @@
 #include "winternl.h"
 #include "ntdll_misc.h"
+#ifdef __x86_64__
+#include <x86intrin.h>
+/* Enable vectorized memcpy implementation (all x86-64 CPUs have SSE2).
+ * TODO: This could be enabled for x86 with a cpuid check. */
+#define SSE2_MEMCPY
+#endif
 /* same as wctypes except for TAB, which doesn't have C1_BLANK for some reason... */
 static const unsigned short ctypes[257] =
@@ -96,10 +106,154 @@ int __cdecl memcmp( const void *ptr1, const void *ptr2, size_t n )
 /*********************************************************************
  *                  memcpy   (NTDLL.@)
- * NOTES
- *  Behaves like memmove.
  */
+#ifdef SSE2_MEMCPY
+#define declare_fastcpy(n) \
+static void fastcpy_ ## n \
+( uintptr_t as, const uintptr_t as_end, uintptr_t d ) \
+{ \
+    __m128i x, y; \
+    x = *(const __m128i*)as; \
+    /* Read 32 bytes in, 16 bytes out. Shuffle variables when done so we don't
+     * re-read the first part. */ \
+    while (as < as_end) \
+    { \
+        /* Prefetch hint improves performance by minimizing cache pollution */ \
+        _mm_prefetch((const void*)(as + 16), _MM_HINT_NTA); \
+        _mm_prefetch((const void*)d, _MM_HINT_NTA); \
+        y = *(const __m128i*)(as + 16); \
+        /* (n) is the number of bytes in *as that don't go to *d. Little endian
+         * means the first bytes appear on the right, so srl to remove them */ \
+        x = _mm_srli_si128(x, (n)); \
+        /* Take same number of bytes from *(as + 16) and push them to the upper
+         * part of the register */ \
+        x = _mm_or_si128(x, _mm_slli_si128(y, 16 - (n))); \
+        *(__m128i*)d = x; \
+        d += 16; \
+        as += 16; \
+        x = y; \
+    } \
+}
+declare_fastcpy(1)
+declare_fastcpy(2)
+declare_fastcpy(3)
+declare_fastcpy(4)
+declare_fastcpy(5)
+declare_fastcpy(6)
+declare_fastcpy(7)
+declare_fastcpy(8)
+declare_fastcpy(9)
+declare_fastcpy(10)
+declare_fastcpy(11)
+declare_fastcpy(12)
+declare_fastcpy(13)
+declare_fastcpy(14)
+declare_fastcpy(15)
+typedef void (*fastcpy_ptr) ( uintptr_t, const uintptr_t, uintptr_t );
+static const fastcpy_ptr fastcpy_table[16] = {
+    NULL, /* special case, different code path */
+    fastcpy_1,
+    fastcpy_2,
+    fastcpy_3,
+    fastcpy_4,
+    fastcpy_5,
+    fastcpy_6,
+    fastcpy_7,
+    fastcpy_8,
+    fastcpy_9,
+    fastcpy_10,
+    fastcpy_11,
+    fastcpy_12,
+    fastcpy_13,
+    fastcpy_14,
+    fastcpy_15
+};
+void * __cdecl memcpy( void *dst, const void *src, size_t n )
+{
+    uintptr_t s = (uintptr_t)src;
+    uintptr_t d = (uintptr_t)dst;
+    uintptr_t as;
+
+    _mm_prefetch((const void*)s, _MM_HINT_NTA);
+    _mm_prefetch((const void*)d, _MM_HINT_NTA);
+
+    /* Ensure aligned destination */
+    while (d & 15)
+    {
+        if (n-- == 0)
+            return dst;
+
+        *(BYTE*)d++ = *(const BYTE*)s++;
+    }
+
+    if (n < 16)
+    {
+        /* Too small to vectorize */
+        while (n--) *(BYTE*)d++ = *(const BYTE*)s++;
+        return dst;
+    }
+
+    as = s & ~15;
+    if (as == s)
+    {
+        /* Fastest path: both pointers aligned */
+        while (n >= 16)
+        {
+            _mm_prefetch((const void*)s, _MM_HINT_NTA);
+            _mm_prefetch((const void*)d, _MM_HINT_NTA);
+            *(__m128i*)d = *(const __m128i*)s;
+            d += 16;
+            s += 16;
+            n -= 16;
+        }
+    }
+    else
+    {
+        /* Read from aligned s by rounding down. If as < src, we need to slow
+         * copy another 16 bytes to avoid OOB reads. */
+        ptrdiff_t shift = s - as;
+        uintptr_t as_end = ((s + n) & ~15) - 16;
+
+        if (as < (uintptr_t)src)
+        {
+            uintptr_t target_n = n - 16;
+            while (n > target_n)
+            {
+                if (n-- == 0)
+                    return dst;
+
+                *(BYTE*)d++ = *(const BYTE*)s++;
+            }
+
+            as += 16;
+        }
+
+        /* Copy 16-byte chunks if any are possible. Since s is misaligned, we
+         * need to read one chunk ahead of what we're writing, which means
+         * as_end must point to the _beginning_ of the last readable chunk.
+         * This also guarantees there is no overrun, since delta < n - 16. */
+        if (as_end > as)
+        {
+            ptrdiff_t delta = as_end - as;
+            fastcpy_table[shift](as, as_end, d);
+            s += delta;
+            d += delta;
+            n -= delta;
+        }
+    }
+
+    /* Slow copy anything that remains */
+    while (n--) *(BYTE*)d++ = *(const BYTE*)s++;
+    return dst;
+}
+#else /* defined(SSE2_MEMCPY) */
+/* Note: Behaves like memmove */
 void * __cdecl memcpy( void *dst, const void *src, size_t n )
 {
     volatile unsigned char *d = dst;  /* avoid gcc optimizations */
@@ -118,6 +272,8 @@ void * __cdecl memcpy( void *dst, const void *src, size_t n )
     return dst;
 }
+#endif /* !defined(SSE2_MEMCPY) */
 /*********************************************************************
  *                  memmove   (NTDLL.@)
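On the TODO about enabling this for 32-bit x86: with GCC or Clang the runtime check could look roughly like this - a sketch using <cpuid.h>, helper name made up:

#include <cpuid.h>
#include <stdbool.h>

/* SSE2 support is reported by CPUID leaf 1, EDX bit 26 (bit_SSE2 in
 * <cpuid.h>). On x86-64 this always succeeds, since SSE2 is part of the
 * baseline ISA there. */
static bool have_sse2( void )
{
    unsigned int eax, ebx, ecx, edx;
    if (!__get_cpuid( 1, &eax, &ebx, &ecx, &edx )) return false;
    return (edx & bit_SSE2) != 0;
}

The 32-bit build would then need to pick the implementation at load time, e.g. through a function pointer, which is presumably why the patch keeps the selection compile-time for now.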
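A separate behavioral point visible in the diff: the removed NOTES lines and the new /* Note: Behaves like memmove */ in the #else branch mean the memmove-like tolerance for overlap now only holds for the fallback path. Something like the following would have worked by accident with the old implementation but is undefined for a strict memcpy (illustrative only):

#include <stdio.h>
#include <string.h>

/* Overlapping forward copy: well-defined with memmove semantics, undefined
 * with a strict memcpy. The old ntdll memcpy tolerated it; the SSE2 path
 * may not. */
int main( void )
{
    char buf[32] = "abcdefghijklmnopqrstuvwxyz";
    memcpy( buf + 1, buf, 25 );   /* overlapping, dst > src */
    printf( "%.26s\n", buf );
    return 0;
}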