On 3/30/22 17:27, Jinoh Kang wrote:
On 3/23/22 10:33, Elaine Lefler wrote:
Signed-off-by: Elaine Lefler <elaineclefler@gmail.com>
New vectorized implementation improves performance by up to 65%.
MSVCRT has one. Maybe deduplicate?
IIUC upstream isn't very interested in assembly-optimized routines unless they are really necessary.
The msvcrt implementation was probably necessary because it's often called by apps, and needs to be as optimal as possible, but I'm not sure ntdll memcpy is used so much. Maybe for realloc though, in which case it might be useful indeed.
I think an unrolled version like was done for memset should already give good results and should work portably (though I got bitten with memset already, and I wasn't very keen on trying again with memcpy so soon).
Something like this maybe, if anyone wants to try or review:
/* Copy n bytes (0 <= n <= 24) from s to d, tolerating arbitrary overlap.
 * Strategy: pick the widest load/store size that fits, copy a leading chunk
 * and a trailing chunk anchored at (n - size); for in-between sizes the two
 * chunks overlap in the middle, which is harmless because every load is
 * performed before any store.
 * The DECLSPEC_ALIGN(1) typedefs request byte-aligned (unaligned-capable)
 * integer accesses.
 * NOTE(review): casting char * to these integer pointer types is a
 * type-punning pattern; presumably the build uses -fno-strict-aliasing or a
 * compiler that honors the aligned(1) attribute for this -- confirm. */
static FORCEINLINE void memmove_unaligned_24( char *d, const char *s, size_t n )
{
    typedef uint64_t DECLSPEC_ALIGN(1) unaligned_ui64;
    typedef uint32_t DECLSPEC_ALIGN(1) unaligned_ui32;
    typedef uint16_t DECLSPEC_ALIGN(1) unaligned_ui16;
    uint64_t tmp0, tmp1, tmpn;

    if (n >= 16)
    {
        /* 16..24 bytes: two leading 8-byte chunks plus a trailing 8-byte
         * chunk; for n < 24 the trailing chunk overlaps the leading ones. */
        tmp0 = *(unaligned_ui64 *)s;
        tmp1 = *(unaligned_ui64 *)(s + 8);
        tmpn = *(unaligned_ui64 *)(s + n - 8);
        *(unaligned_ui64 *)d = tmp0;
        *(unaligned_ui64 *)(d + 8) = tmp1;
        *(unaligned_ui64 *)(d + n - 8) = tmpn;
    }
    else if (n >= 8)
    {
        /* 8..15 bytes: leading and trailing 8-byte chunks. */
        tmp0 = *(unaligned_ui64 *)s;
        tmpn = *(unaligned_ui64 *)(s + n - 8);
        *(unaligned_ui64 *)d = tmp0;
        *(unaligned_ui64 *)(d + n - 8) = tmpn;
    }
    else if (n >= 4)
    {
        /* 4..7 bytes: leading and trailing 4-byte chunks. */
        tmp0 = *(unaligned_ui32 *)s;
        tmpn = *(unaligned_ui32 *)(s + n - 4);
        *(unaligned_ui32 *)d = tmp0;
        *(unaligned_ui32 *)(d + n - 4) = tmpn;
    }
    else if (n >= 2)
    {
        /* 2..3 bytes: leading and trailing 2-byte chunks. */
        tmp0 = *(unaligned_ui16 *)s;
        tmpn = *(unaligned_ui16 *)(s + n - 2);
        *(unaligned_ui16 *)d = tmp0;
        *(unaligned_ui16 *)(d + n - 2) = tmpn;
    }
    else if (n >= 1)
    {
        /* Exactly one byte; n == 0 falls through as a no-op. */
        *(uint8_t *)d = *(uint8_t *)s;
    }
}
/* memmove core: copy n bytes from src to dst, tolerating overlap, moving
 * 24 bytes (three 8-byte words) per loop iteration.  The copy direction is
 * chosen so source bytes are never overwritten before they are read:
 *  - forward when dst precedes src or the ranges are disjoint,
 *  - backward when dst lands inside [src, src + n).
 * The unsigned subtraction (size_t)dst - (size_t)src wraps around when
 * dst < src, so the single comparison ">= n" covers both the "dst before
 * src" and the "no overlap" cases at once.
 * Returns dst, matching the memmove/memcpy contract. */
static FORCEINLINE void *memmove_unrolled( char *dst, const char *src, size_t n )
{
    typedef uint64_t DECLSPEC_ALIGN(1) unaligned_ui64;
    uint64_t tmp0, tmp1, tmp2;
    char *end;

    if (n <= 24) memmove_unaligned_24( dst, src, n );
    else if ((size_t)dst - (size_t)src >= n)
    {
        /* Forward copy.  Both pointers are advanced past the end and n
         * counts the bytes still to copy, so (end - n) / (src - n) is the
         * current write/read position; this keeps the loop to a single
         * decrementing counter. */
        end = dst + n;
        src += n;
        do
        {
            tmp0 = *(unaligned_ui64 *)(src - n + 0);
            tmp1 = *(unaligned_ui64 *)(src - n + 8);
            tmp2 = *(unaligned_ui64 *)(src - n + 16);
            *(unaligned_ui64*)(end - n + 0) = tmp0;
            *(unaligned_ui64*)(end - n + 8) = tmp1;
            *(unaligned_ui64*)(end - n + 16) = tmp2;
            n -= 24;
        } while (n >= 24);
        /* 0..23 trailing bytes. */
        memmove_unaligned_24( end - n, src - n, n );
    }
    else
    {
        /* Backward copy for overlapping regions with dst > src: copy the
         * tail first so the not-yet-copied head of src stays intact. */
        do
        {
            tmp0 = *(unaligned_ui64 *)(src + n - 8);
            tmp1 = *(unaligned_ui64 *)(src + n - 16);
            tmp2 = *(unaligned_ui64 *)(src + n - 24);
            *(unaligned_ui64*)(dst + n - 8) = tmp0;
            *(unaligned_ui64*)(dst + n - 16) = tmp1;
            *(unaligned_ui64*)(dst + n - 24) = tmp2;
            n -= 24;
        } while (n >= 24);
        /* 0..23 leading bytes. */
        memmove_unaligned_24( dst, src, n );
    }
    return dst;
}
/*********************************************************************
 *                  memcpy   (NTDLL.@)
 *
 * NOTES
 *  Behaves like memmove: overlapping regions are handled correctly,
 *  which is stricter than the ISO C memcpy contract requires.
 */
void * __cdecl memcpy( void *dst, const void *src, size_t n )
{
    return memmove_unrolled( (char *)dst, (const char *)src, n );
}
/*********************************************************************
 *                  memmove   (NTDLL.@)
 *
 * Copies n bytes from src to dst; the regions may overlap.
 */
void * __cdecl memmove( void *dst, const void *src, size_t n )
{
    return memmove_unrolled( (char *)dst, (const char *)src, n );
}