Use SSE2 aligned stores (movdqa) in memset for intermediate sizes (n < 2048), keeping the ERMS rep-stosb path for larger buffers.
Signed-off-by: Rémi Bernon <rbernon@codeweavers.com>
---
 dlls/msvcrt/string.c | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)
diff --git a/dlls/msvcrt/string.c b/dlls/msvcrt/string.c
index d09b44fbcd6..6e9fb8d119d 100644
--- a/dlls/msvcrt/string.c
+++ b/dlls/msvcrt/string.c
@@ -2859,7 +2859,35 @@ void * __cdecl memcpy(void *dst, const void *src, size_t n)
 static void memset_aligned_32(unsigned char *d, uint64_t v, size_t n)
 {
 #if defined(__i386__) || defined(__x86_64__)
-    if (n >= 2048 && erms_supported) __stosb(d, v, n);
+#ifdef __i386__
+    if (n < 2048 && sse2_supported)
+#else
+    if (n < 2048)
+#endif
+    {
+        __asm__ __volatile__ (
+            "movd %1, %%xmm0\n\t"
+            "pshufd $0, %%xmm0, %%xmm0\n\t"
+            "test $0x20, %2\n\t"
+            "je 1f\n\t"
+            "sub $0x20, %2\n\t"
+            "movdqa %%xmm0, 0x00(%0,%2)\n\t"
+            "movdqa %%xmm0, 0x10(%0,%2)\n\t"
+            "je 2f\n\t"
+            "1:\n\t"
+            "sub $0x40, %2\n\t"
+            "movdqa %%xmm0, 0x00(%0,%2)\n\t"
+            "movdqa %%xmm0, 0x10(%0,%2)\n\t"
+            "movdqa %%xmm0, 0x20(%0,%2)\n\t"
+            "movdqa %%xmm0, 0x30(%0,%2)\n\t"
+            "ja 1b\n\t"
+            "2:\n\t"
+            :
+            : "r"(d), "r"((uint32_t)v), "c"(n)
+            : "memory"
+        );
+    }
+    else if (erms_supported) __stosb(d, v, n);
     else
 #endif
     while (n >= 32)