For n larger than 16 we store 16 bytes on each end of the buffer, possibly overlapping, and then 16 additional bytes on each end for n > 32.
Then we can find a 32-byte aligned range overlapping the remaining part of the destination buffer, which is filled 32 bytes at a time in a loop.
Signed-off-by: Rémi Bernon <rbernon@codeweavers.com>
---
 dlls/msvcrt/string.c | 60 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 57 insertions(+), 3 deletions(-)
diff --git a/dlls/msvcrt/string.c b/dlls/msvcrt/string.c
index 4d09405094d..f2b1b4a5b11 100644
--- a/dlls/msvcrt/string.c
+++ b/dlls/msvcrt/string.c
@@ -2855,13 +2855,67 @@ void * __cdecl memcpy(void *dst, const void *src, size_t n)
     return memmove(dst, src, n);
 }
+static inline void memset_aligned_32(unsigned char *d, uint64_t v, size_t n)
+{
+    while (n >= 32)
+    {
+        *(uint64_t *)(d + n - 32) = v;
+        *(uint64_t *)(d + n - 24) = v;
+        *(uint64_t *)(d + n - 16) = v;
+        *(uint64_t *)(d + n - 8) = v;
+        n -= 32;
+    }
+}
+
 /*********************************************************************
  *                  memset (MSVCRT.@)
  */
-void* __cdecl memset(void *dst, int c, size_t n)
+void *__cdecl memset(void *dst, int c, size_t n)
 {
-    volatile unsigned char *d = dst;  /* avoid gcc optimizations */
-    while (n--) *d++ = c;
+    uint64_t v = 0x101010101010101ull * (unsigned char)c;
+    unsigned char *d = (unsigned char *)dst;
+    size_t a = 0x20 - ((uintptr_t)d & 0x1f);
+
+    if (n >= 16)
+    {
+        *(uint64_t *)(d + 0) = v;
+        *(uint64_t *)(d + 8) = v;
+        *(uint64_t *)(d + n - 16) = v;
+        *(uint64_t *)(d + n - 8) = v;
+        if (n <= 32) return dst;
+        *(uint64_t *)(d + 16) = v;
+        *(uint64_t *)(d + 24) = v;
+        *(uint64_t *)(d + n - 32) = v;
+        *(uint64_t *)(d + n - 24) = v;
+        if (n <= 64) return dst;
+
+        n = (n - a) & ~0x1f;
+        memset_aligned_32(d + a, v, n);
+        return dst;
+    }
+    if (n >= 8)
+    {
+        *(uint64_t *)d = v;
+        *(uint64_t *)(d + n - 8) = v;
+        return dst;
+    }
+    if (n >= 4)
+    {
+        *(uint32_t *)d = v;
+        *(uint32_t *)(d + n - 4) = v;
+        return dst;
+    }
+    if (n >= 2)
+    {
+        *(uint16_t *)d = v;
+        *(uint16_t *)(d + n - 2) = v;
+        return dst;
+    }
+    if (n >= 1)
+    {
+        *(uint8_t *)d = v;
+        return dst;
+    }
     return dst;
 }
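To make the overlapping-store scheme concrete: for a hypothetical call memset(d, c, 40) the function performs eight 8-byte stores and returns without ever entering the loop (illustration only; offsets follow the code above):

    /* memset(d, c, 40), 32 < n <= 64:
     *
     *   d+0,    d+8      cover [ 0..16)   (head)
     *   d+n-16, d+n-8    cover [24..40)   (tail)
     *   d+16,   d+24     cover [16..32)   (second head)
     *   d+n-32, d+n-24   cover [ 8..24)   (second tail, overlapping)
     *
     * Every byte of [0..40) is written at least once, and since
     * n <= 64 the function returns before the aligned loop. */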
Signed-off-by: Rémi Bernon <rbernon@codeweavers.com>
---
 dlls/msvcrt/math.c   | 13 +++++++++
 dlls/msvcrt/msvcrt.h |  1 +
 dlls/msvcrt/string.c | 64 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 78 insertions(+)
diff --git a/dlls/msvcrt/math.c b/dlls/msvcrt/math.c
index 7f59a4d20d4..9974e72d78f 100644
--- a/dlls/msvcrt/math.c
+++ b/dlls/msvcrt/math.c
@@ -43,6 +43,7 @@
 #include <limits.h>
 #include <locale.h>
 #include <math.h>
+#include <intrin.h>
#include "msvcrt.h" #include "winternl.h" @@ -64,11 +65,23 @@ typedef int (CDECL *MSVCRT_matherr_func)(struct _exception *);
static MSVCRT_matherr_func MSVCRT_default_matherr_func = NULL;
+BOOL erms_supported;
 BOOL sse2_supported;
 static BOOL sse2_enabled;
 void msvcrt_init_math( void *module )
 {
+#if defined(__i386__) || defined(__x86_64__)
+    int regs[4];
+
+    __cpuid(regs, 0);
+    if (regs[0] >= 7)
+    {
+        __cpuidex(regs, 7, 0);
+        erms_supported = ((regs[1] >> 9) & 1);
+    }
+#endif
+
     sse2_supported = IsProcessorFeaturePresent( PF_XMMI64_INSTRUCTIONS_AVAILABLE );
 #if _MSVCR_VER <=71
     sse2_enabled = FALSE;
diff --git a/dlls/msvcrt/msvcrt.h b/dlls/msvcrt/msvcrt.h
index 60f8c2f5ef2..022eced35d9 100644
--- a/dlls/msvcrt/msvcrt.h
+++ b/dlls/msvcrt/msvcrt.h
@@ -33,6 +33,7 @@
 #undef strncpy
 #undef wcsncpy
+extern BOOL erms_supported DECLSPEC_HIDDEN;
 extern BOOL sse2_supported DECLSPEC_HIDDEN;
 #define DBL80_MAX_10_EXP 4932
diff --git a/dlls/msvcrt/string.c b/dlls/msvcrt/string.c
index f2b1b4a5b11..32291f06001 100644
--- a/dlls/msvcrt/string.c
+++ b/dlls/msvcrt/string.c
@@ -2732,6 +2732,13 @@ __ASM_GLOBAL_FUNC( sse2_memmove,
         MEMMOVE_CLEANUP
         "ret" )
+#undef MEMMOVE_INIT
+#undef MEMMOVE_CLEANUP
+#undef DEST_REG
+#undef SRC_REG
+#undef LEN_REG
+#undef TMP_REG
+
 #endif
 /*********************************************************************
@@ -2855,6 +2862,56 @@ void * __cdecl memcpy(void *dst, const void *src, size_t n)
     return memmove(dst, src, n);
 }
+#if defined(__i386__) || defined(__x86_64__)
+
+#ifdef __i386__
+#define DEST_REG "%edi"
+#define LEN_REG "%ecx"
+#define VAL_REG "%eax"
+
+#define MEMSET_INIT \
+    "movl " DEST_REG ", %edx\n\t" \
+    "movl 4(%esp), " DEST_REG "\n\t" \
+    "movl 8(%esp), " VAL_REG "\n\t" \
+    "movl 12(%esp), " LEN_REG "\n\t"
+
+#define MEMSET_RET \
+    "movl %edx, " DEST_REG "\n\t" \
+    "ret"
+
+#else
+
+#define DEST_REG "%rdi"
+#define LEN_REG "%rcx"
+#define VAL_REG "%eax"
+
+#define MEMSET_INIT \
+    "movq " DEST_REG ", %r9\n\t" \
+    "movq %rcx, " DEST_REG "\n\t" \
+    "movl %edx, " VAL_REG "\n\t" \
+    "movq %r8, " LEN_REG "\n\t"
+
+#define MEMSET_RET \
+    "movq %r9, " DEST_REG "\n\t" \
+    "ret"
+
+#endif
+
+void __cdecl erms_memset_aligned_32(unsigned char *d, unsigned int c, size_t n);
+__ASM_GLOBAL_FUNC( erms_memset_aligned_32,
+        MEMSET_INIT
+        "rep\n\t"
+        "stosb\n\t"
+        MEMSET_RET )
+
+#undef MEMSET_INIT
+#undef MEMSET_RET
+#undef DEST_REG
+#undef LEN_REG
+#undef VAL_REG
+
+#endif
+
 static inline void memset_aligned_32(unsigned char *d, uint64_t v, size_t n)
 {
     while (n >= 32)
@@ -2890,6 +2947,13 @@ void *__cdecl memset(void *dst, int c, size_t n)
         if (n <= 64) return dst;
         n = (n - a) & ~0x1f;
+#if defined(__i386__) || defined(__x86_64__)
+        if (n >= 2048 && erms_supported)
+        {
+            erms_memset_aligned_32(d + a, v, n);
+            return dst;
+        }
+#endif
         memset_aligned_32(d + a, v, n);
         return dst;
     }
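For context, ERMS ("enhanced rep movsb/stosb") is the CPUID feature the __cpuidex check above reads: leaf 7, subleaf 0, EBX bit 9. rep stosb has a noticeable startup cost, which is presumably why the patch only dispatches to it for n >= 2048. As a standalone sketch (not part of the patch), the same check with GCC/Clang's cpuid.h could look like:

    #include <cpuid.h>
    #include <stdbool.h>

    /* Illustrative only: CPUID.(EAX=7,ECX=0):EBX bit 9 is the ERMS flag. */
    static bool cpu_has_erms(void)
    {
        unsigned int eax, ebx, ecx, edx;
        /* __get_cpuid_count returns 0 if leaf 7 is not supported. */
        if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) return false;
        return (ebx >> 9) & 1;
    }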
Signed-off-by: Piotr Caban <piotr@codeweavers.com>
Signed-off-by: Rémi Bernon <rbernon@codeweavers.com>
---
 dlls/msvcrt/string.c | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)
diff --git a/dlls/msvcrt/string.c b/dlls/msvcrt/string.c
index 32291f06001..4a0e778e77a 100644
--- a/dlls/msvcrt/string.c
+++ b/dlls/msvcrt/string.c
@@ -2904,6 +2904,27 @@ __ASM_GLOBAL_FUNC( erms_memset_aligned_32,
         "stosb\n\t"
         MEMSET_RET )
+void __cdecl sse2_memset_aligned_32(unsigned char *d, unsigned int c, size_t n);
+__ASM_GLOBAL_FUNC( sse2_memset_aligned_32,
+        MEMSET_INIT
+        "movd " VAL_REG ", %xmm0\n\t"
+        "pshufd $0, %xmm0, %xmm0\n\t"
+        "test $0x20, " LEN_REG "\n\t"
+        "je 1f\n\t"
+        "sub $0x20, " LEN_REG "\n\t"
+        "movdqa %xmm0, 0x00(" DEST_REG ", " LEN_REG ")\n\t"
+        "movdqa %xmm0, 0x10(" DEST_REG ", " LEN_REG ")\n\t"
+        "je 2f\n\t"
+        "1:\n\t"
+        "sub $0x40, " LEN_REG "\n\t"
+        "movdqa %xmm0, 0x00(" DEST_REG ", " LEN_REG ")\n\t"
+        "movdqa %xmm0, 0x10(" DEST_REG ", " LEN_REG ")\n\t"
+        "movdqa %xmm0, 0x20(" DEST_REG ", " LEN_REG ")\n\t"
+        "movdqa %xmm0, 0x30(" DEST_REG ", " LEN_REG ")\n\t"
+        "ja 1b\n\t"
+        "2:\n\t"
+        MEMSET_RET )
+
 #undef MEMSET_INIT
 #undef MEMSET_RET
 #undef DEST_REG
@@ -2953,9 +2974,21 @@ void *__cdecl memset(void *dst, int c, size_t n)
             erms_memset_aligned_32(d + a, v, n);
             return dst;
         }
+#ifdef __x86_64__
+        sse2_memset_aligned_32(d + a, v, n);
+        return dst;
+#else
+        if (sse2_supported)
+        {
+            sse2_memset_aligned_32(d + a, v, n);
+            return dst;
+        }
+#endif
 #endif
+#ifndef __x86_64__
         memset_aligned_32(d + a, v, n);
         return dst;
+#endif
     }
     if (n >= 8)
     {
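For readers less at home in inline assembly, here is an equivalent-effect sketch of the aligned SSE2 fill in intrinsics form (illustration only, not the committed code; the actual loop walks downward in 64-byte steps after an optional 32-byte pre-step):

    #include <emmintrin.h>
    #include <stddef.h>

    /* Caller contract, as in memset() above: d is 32-byte aligned, n is a
     * multiple of 32, and c has the fill byte replicated across its 4 bytes. */
    static void sse2_memset_aligned_32_ref(unsigned char *d, unsigned int c, size_t n)
    {
        __m128i v = _mm_set1_epi32((int)c);   /* matches movd + pshufd $0 */
        while (n >= 32)
        {
            n -= 32;                          /* fill from the top down */
            _mm_store_si128((__m128i *)(d + n), v);
            _mm_store_si128((__m128i *)(d + n + 16), v);
        }
    }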
Signed-off-by: Piotr Caban <piotr@codeweavers.com>
On Tue, 14 Sep 2021, Rémi Bernon wrote:
> For n larger than 16 we store 16 bytes on each end of the buffer, possibly
> overlapping, and then 16 additional bytes on each end for n > 32.
>
> Then we can find a 32-byte aligned range overlapping the remaining part of
> the destination buffer, which is filled 32 bytes at a time in a loop.
>
> Signed-off-by: Rémi Bernon <rbernon@codeweavers.com>
> ---
>  dlls/msvcrt/string.c | 60 +++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 57 insertions(+), 3 deletions(-)
>
> -    volatile unsigned char *d = dst;  /* avoid gcc optimizations */
> -    while (n--) *d++ = c;
> +    uint64_t v = 0x101010101010101ull * (unsigned char)c;
> +    unsigned char *d = (unsigned char *)dst;
> +    size_t a = 0x20 - ((uintptr_t)d & 0x1f);
> +
> +    if (n >= 16)
> +    {
> +        *(uint64_t *)(d + 0) = v;
> +        *(uint64_t *)(d + 8) = v;
> +        *(uint64_t *)(d + n - 16) = v;
> +        *(uint64_t *)(d + n - 8) = v;
FYI this broke memset on ARM (32 bit) due to misalignment. ARM used to be quite alignment-picky, but since ARMv7, 32 bit register loads/stores can be unaligned. For 64 bit writes there is the STRD instruction, which cannot be used unaligned; in these cases the compiler is nevertheless free to emit it.
The surprising thing about STRD is that it only requires 32 bit alignment, even if it writes 64 bit. First I tried to replace
    *(uint64_t *)(d + 0) = v;

with

    *(uint32_t *)(d + 0) = v;
    *(uint32_t *)(d + 4) = v;
hoping to use 32 bit stores (which work unaligned). However, after casting to uint32_t*, the compiler is free to assume that the resulting pointer is 32 bit aligned, and STRD only requires 32 bit alignment, so the compiler can still fuse these two stores into a single STRD.
By using

    *(volatile uint32_t *)(d + 0) = v;
    *(volatile uint32_t *)(d + 4) = v;
the compiler emits them as two separate 32 bit stores though (which work fine with any alignment).
I'll send a PoC patch that fixes things for me.
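For illustration, such a fix could wrap the 64 bit stores in a helper along these lines (naming and shape are a sketch, not the actual PoC patch):

    #include <stdint.h>

    static inline void store_64_unaligned(unsigned char *d, uint64_t v)
    {
    #ifdef __arm__
        /* volatile keeps the compiler from fusing these into one STRD,
         * which requires 32-bit alignment; two separate 32-bit stores
         * are fine at any alignment on ARMv7. */
        *(volatile uint32_t *)(d + 0) = (uint32_t)v;
        *(volatile uint32_t *)(d + 4) = (uint32_t)(v >> 32);
    #else
        *(uint64_t *)d = v;
    #endif
    }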
// Martin
On 9/15/21 9:49 PM, Martin Storsjö wrote:
> FYI this broke memset on ARM (32 bit) due to misalignment. [...]
>
> I'll send a PoC patch that fixes things for me.
>
> // Martin
Ouch, okay, sorry about that and thanks for the information.