For n larger than 16 we store 16 bytes on each end of the buffer, possibly overlapping, and then 16 additional bytes on each end for n > 32.
For n > 64 we then find a 32-byte aligned range overlapping the remaining part of the destination buffer and fill it 32 bytes at a time in a loop.
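To make the first step concrete, here is a standalone sketch (not the code from the diff below; the helper name is made up, and memcpy is used instead of the casts in the patch so the sketch is alignment- and aliasing-safe on its own):

#include <stdint.h>
#include <string.h>

/* Illustration only, not the patch implementation: the two pairs of 8-byte
 * stores used for 16 <= n <= 32.  For n == 20 they cover [0,8), [8,16),
 * [4,12) and [12,20), so every byte is written without branching on the
 * exact size. */
static void fill_head_and_tail(unsigned char *d, uint64_t v, size_t n)
{
    memcpy(d,          &v, sizeof(v));
    memcpy(d + 8,      &v, sizeof(v));
    memcpy(d + n - 16, &v, sizeof(v));
    memcpy(d + n - 8,  &v, sizeof(v));
}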
Signed-off-by: Rémi Bernon <rbernon@codeweavers.com>
---
So, this is what I was thinking instead of having fully specialized assembly versions.
Overall I believe the performance should be better than SSE2 for very small sizes and very large sizes (when ERMS kicks in), but a bit worse for 128 <= n <= 1024.
I also don't think the last patch is really useful: it only helps improve performance for the intermediate sizes, and I would expect the ERMS path to cover those instead once it is good enough on most CPUs for that size range.
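For reference, a minimal sketch of what such an ERMS path could look like (not included in this patch; the function name is made up, and the CPUID feature check, ERMS is bit 9 of EBX for leaf 7, plus the size threshold are left out):

static void memset_erms(unsigned char *d, int c, size_t n)
{
    /* "rep stosb" fills RCX bytes at [RDI] with AL; on CPUs advertising
     * ERMS this is competitive with vector loops for large buffers. */
    __asm__ __volatile__ ("rep stosb"
                          : "+D" (d), "+c" (n)
                          : "a" (c)
                          : "memory");
}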
 dlls/msvcrt/string.c | 60 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 56 insertions(+), 4 deletions(-)
diff --git a/dlls/msvcrt/string.c b/dlls/msvcrt/string.c
index 4d09405094d..3a7312572ab 100644
--- a/dlls/msvcrt/string.c
+++ b/dlls/msvcrt/string.c
@@ -2855,13 +2855,65 @@ void * __cdecl memcpy(void *dst, const void *src, size_t n)
     return memmove(dst, src, n);
 }
 
+static void memset_aligned_32(unsigned char *d, uint64_t v, size_t n)
+{
+    while (n >= 32)
+    {
+        *(uint64_t*)(d + n - 32) = v;
+        *(uint64_t*)(d + n - 24) = v;
+        *(uint64_t*)(d + n - 16) = v;
+        *(uint64_t*)(d + n - 8) = v;
+        n -= 32;
+    }
+}
+
 /*********************************************************************
  *                  memset (MSVCRT.@)
  */
-void* __cdecl memset(void *dst, int c, size_t n)
-{
-    volatile unsigned char *d = dst;  /* avoid gcc optimizations */
-    while (n--) *d++ = c;
+void *__cdecl memset(void *dst, int c, size_t n)
+{
+    uint64_t v = 0x101010101010101ull * (unsigned char)c;
+    unsigned char *d = (unsigned char *)dst;
+    size_t a = 0x20 - ((uintptr_t)d & 0x1f);
+
+    if (n >= 16)
+    {
+        *(uint64_t *)(d + 0) = v;
+        *(uint64_t *)(d + 8) = v;
+        *(uint64_t *)(d + n - 16) = v;
+        *(uint64_t *)(d + n - 8) = v;
+        if (n <= 32) return dst;
+        *(uint64_t *)(d + 16) = v;
+        *(uint64_t *)(d + 24) = v;
+        *(uint64_t *)(d + n - 32) = v;
+        *(uint64_t *)(d + n - 24) = v;
+        if (n <= 64) return dst;
+        memset_aligned_32(d + a, v, (n - a) & ~0x1f);
+        return dst;
+    }
+    if (n >= 8)
+    {
+        *(uint64_t *)d = v;
+        *(uint64_t *)(d + n - 8) = v;
+        return dst;
+    }
+    if (n >= 4)
+    {
+        *(uint32_t *)d = v;
+        *(uint32_t *)(d + n - 4) = v;
+        return dst;
+    }
+    if (n >= 2)
+    {
+        *(uint16_t *)d = v;
+        *(uint16_t *)(d + n - 2) = v;
+        return dst;
+    }
+    if (n >= 1)
+    {
+        *(uint8_t *)d = v;
+        return dst;
+    }
     return dst;
 }
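Not part of the patch, but a quick standalone check of the branch boundaries that I find useful (it exercises whichever memset it ends up linked against and verifies guard bytes around the destination; the sizes, offsets and fill patterns are arbitrary):

#include <assert.h>
#include <string.h>

int main(void)
{
    static const size_t sizes[] = { 0, 1, 2, 3, 4, 7, 8, 15, 16, 17, 31, 32,
                                    33, 63, 64, 65, 127, 128, 4096 };
    static unsigned char buf[4096 + 64];
    size_t i, j, off;

    for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
    {
        for (off = 0; off < 32; off++) /* vary destination alignment */
        {
            memset(buf, 0xcc, sizeof(buf));          /* guard pattern */
            memset(buf + 32 + off, 0x5a, sizes[i]);  /* call under test */
            for (j = 0; j < sizeof(buf); j++)
            {
                int inside = j >= 32 + off && j < 32 + off + sizes[i];
                assert(buf[j] == (inside ? 0x5a : 0xcc));
            }
        }
    }
    return 0;
}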