For n larger than 16 we store 16 bytes on each end of the buffer, possibly overlapping, and then 16 additional bytes on each end for n > 32.
Then we can find a 32-byte aligned range overlapping the remaining part of the destination buffer, which is filled 32 bytes at a time in a loop.
Signed-off-by: Rémi Bernon rbernon@codeweavers.com --- dlls/msvcrt/string.c | 59 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 56 insertions(+), 3 deletions(-)
diff --git a/dlls/msvcrt/string.c b/dlls/msvcrt/string.c index 4d09405094d..da7e9612345 100644 --- a/dlls/msvcrt/string.c +++ b/dlls/msvcrt/string.c @@ -2855,13 +2855,66 @@ void * __cdecl memcpy(void *dst, const void *src, size_t n) return memmove(dst, src, n); }
+static void *__cdecl memset_aligned_32(unsigned char *d, uint64_t v, size_t n, void *ret) +{ + while (n >= 32) + { + *(uint64_t *)(d + n - 32) = v; + *(uint64_t *)(d + n - 24) = v; + *(uint64_t *)(d + n - 16) = v; + *(uint64_t *)(d + n - 8) = v; + n -= 32; + } + return ret; +} + /********************************************************************* * memset (MSVCRT.@) */ -void* __cdecl memset(void *dst, int c, size_t n) +void *__cdecl memset(void *dst, int c, size_t n) { - volatile unsigned char *d = dst; /* avoid gcc optimizations */ - while (n--) *d++ = c; + uint64_t v = 0x101010101010101ull * (unsigned char)c; + unsigned char *d = (unsigned char *)dst; + size_t a = 0x20 - ((uintptr_t)d & 0x1f); + + if (n >= 16) + { + *(uint64_t *)(d + 0) = v; + *(uint64_t *)(d + 8) = v; + *(uint64_t *)(d + n - 16) = v; + *(uint64_t *)(d + n - 8) = v; + if (n <= 32) return dst; + *(uint64_t *)(d + 16) = v; + *(uint64_t *)(d + 24) = v; + *(uint64_t *)(d + n - 32) = v; + *(uint64_t *)(d + n - 24) = v; + if (n <= 64) return dst; + n = (n - a) & ~0x1f; + return memset_aligned_32(d + a, v, n, dst); + } + if (n >= 8) + { + *(uint64_t *)d = v; + *(uint64_t *)(d + n - 8) = v; + return dst; + } + if (n >= 4) + { + *(uint32_t *)d = v; + *(uint32_t *)(d + n - 4) = v; + return dst; + } + if (n >= 2) + { + *(uint16_t *)d = v; + *(uint16_t *)(d + n - 2) = v; + return dst; + } + if (n >= 1) + { + *(uint8_t *)d = v; + return dst; + } return dst; }
Signed-off-by: Rémi Bernon rbernon@codeweavers.com --- dlls/msvcrt/math.c | 13 +++++++++ dlls/msvcrt/msvcrt.h | 1 + dlls/msvcrt/string.c | 63 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 77 insertions(+)
diff --git a/dlls/msvcrt/math.c b/dlls/msvcrt/math.c index 7f59a4d20d4..9974e72d78f 100644 --- a/dlls/msvcrt/math.c +++ b/dlls/msvcrt/math.c @@ -43,6 +43,7 @@ #include <limits.h> #include <locale.h> #include <math.h> +#include <intrin.h>
#include "msvcrt.h" #include "winternl.h" @@ -64,11 +65,23 @@ typedef int (CDECL *MSVCRT_matherr_func)(struct _exception *);
static MSVCRT_matherr_func MSVCRT_default_matherr_func = NULL;
+BOOL erms_supported; BOOL sse2_supported; static BOOL sse2_enabled;
void msvcrt_init_math( void *module ) { +#if defined(__i386__) || defined(__x86_64__) + int regs[4]; + + __cpuid(regs, 0); + if (regs[0] >= 7) + { + __cpuidex(regs, 7, 0); + erms_supported = ((regs[1] >> 9) & 1); + } +#endif + sse2_supported = IsProcessorFeaturePresent( PF_XMMI64_INSTRUCTIONS_AVAILABLE ); #if _MSVCR_VER <=71 sse2_enabled = FALSE; diff --git a/dlls/msvcrt/msvcrt.h b/dlls/msvcrt/msvcrt.h index 60f8c2f5ef2..022eced35d9 100644 --- a/dlls/msvcrt/msvcrt.h +++ b/dlls/msvcrt/msvcrt.h @@ -33,6 +33,7 @@ #undef strncpy #undef wcsncpy
+extern BOOL erms_supported DECLSPEC_HIDDEN; extern BOOL sse2_supported DECLSPEC_HIDDEN;
#define DBL80_MAX_10_EXP 4932 diff --git a/dlls/msvcrt/string.c b/dlls/msvcrt/string.c index da7e9612345..b8a5cc89663 100644 --- a/dlls/msvcrt/string.c +++ b/dlls/msvcrt/string.c @@ -2732,6 +2732,13 @@ __ASM_GLOBAL_FUNC( sse2_memmove, MEMMOVE_CLEANUP "ret" )
+#undef MEMMOVE_INIT +#undef MEMMOVE_CLEANUP +#undef DEST_REG +#undef SRC_REG +#undef LEN_REG +#undef TMP_REG + #endif
/********************************************************************* @@ -2855,6 +2862,59 @@ void * __cdecl memcpy(void *dst, const void *src, size_t n) return memmove(dst, src, n); }
+#if defined(__i386__) || defined(__x86_64__) + +#ifdef __i386__ +#define DEST_REG "%edi" +#define LEN_REG "%ecx" +#define VAL_REG "%eax" + +#define MEMSET_INIT \ + "pushl %edi\n\t" \ + "movl 8(%esp), " DEST_REG "\n\t" \ + "movl 12(%esp), " VAL_REG "\n\t" \ + "movl 16(%esp), " LEN_REG "\n\t" + +#define MEMSET_RET \ + "movl 20(%esp), %eax\n\t" \ + "popl %edi\n\t" \ + "ret" + +#else + +#define DEST_REG "%rdi" +#define LEN_REG "%rcx" +#define VAL_REG "%eax" + +#define MEMSET_INIT \ + "pushq %rdi\n\t" \ + "movq %rcx, " DEST_REG "\n\t" \ + "movl %edx, " VAL_REG "\n\t" \ + "movq %r8, " LEN_REG "\n\t" + +#define MEMSET_RET \ + "movq %r9, %rax\n\t" \ + "popq %rdi\n\t" \ + "ret" + +#endif + +void *__cdecl erms_memset_aligned_32(unsigned char *d, unsigned int c, size_t n, void *ret); +__ASM_GLOBAL_FUNC( erms_memset_aligned_32, + MEMSET_INIT + "cld\n\t" + "rep\n\t" + "stosb\n\t" + MEMSET_RET ) + +#undef MEMSET_INIT +#undef MEMSET_RET +#undef DEST_REG +#undef LEN_REG +#undef VAL_REG + +#endif + static void *__cdecl memset_aligned_32(unsigned char *d, uint64_t v, size_t n, void *ret) { while (n >= 32) @@ -2890,6 +2950,9 @@ void *__cdecl memset(void *dst, int c, size_t n) *(uint64_t *)(d + n - 24) = v; if (n <= 64) return dst; n = (n - a) & ~0x1f; +#if defined(__i386__) || defined(__x86_64__) + if (n >= 2048 && erms_supported) return erms_memset_aligned_32(d + a, v, n, dst); +#endif return memset_aligned_32(d + a, v, n, dst); } if (n >= 8)
Use SSE2 aligned stores for intermediate sizes, when ERMS is not used.
Signed-off-by: Rémi Bernon rbernon@codeweavers.com --- dlls/msvcrt/string.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+)
diff --git a/dlls/msvcrt/string.c b/dlls/msvcrt/string.c index b8a5cc89663..6fb269e4185 100644 --- a/dlls/msvcrt/string.c +++ b/dlls/msvcrt/string.c @@ -2907,6 +2907,27 @@ __ASM_GLOBAL_FUNC( erms_memset_aligned_32, "stosb\n\t" MEMSET_RET )
+void *__cdecl sse2_memset_aligned_32(unsigned char *d, unsigned int c, size_t n, void *ret); +__ASM_GLOBAL_FUNC( sse2_memset_aligned_32, + MEMSET_INIT + "movd " VAL_REG ", %xmm0\n\t" + "pshufd $0, %xmm0, %xmm0\n\t" + "test $0x20, " LEN_REG "\n\t" + "je 1f\n\t" + "sub $0x20, " LEN_REG "\n\t" + "movdqa %xmm0, 0x00(" DEST_REG ", " LEN_REG ")\n\t" + "movdqa %xmm0, 0x10(" DEST_REG ", " LEN_REG ")\n\t" + "je 2f\n\t" + "1:\n\t" + "sub $0x40, " LEN_REG "\n\t" + "movdqa %xmm0, 0x00(" DEST_REG ", " LEN_REG ")\n\t" + "movdqa %xmm0, 0x10(" DEST_REG ", " LEN_REG ")\n\t" + "movdqa %xmm0, 0x20(" DEST_REG ", " LEN_REG ")\n\t" + "movdqa %xmm0, 0x30(" DEST_REG ", " LEN_REG ")\n\t" + "ja 1b\n\t" + "2:\n\t" + MEMSET_RET ) + #undef MEMSET_INIT #undef MEMSET_RET #undef DEST_REG @@ -2952,6 +2973,11 @@ void *__cdecl memset(void *dst, int c, size_t n) n = (n - a) & ~0x1f; #if defined(__i386__) || defined(__x86_64__) if (n >= 2048 && erms_supported) return erms_memset_aligned_32(d + a, v, n, dst); +#ifdef __i386__ + if (sse2_supported) return sse2_memset_aligned_32(d + a, v, n, dst); +#else + return sse2_memset_aligned_32(d + a, v, n, dst); +#endif #endif return memset_aligned_32(d + a, v, n, dst); }