[PATCH v2] msvcrt: Add an ermsb-based memset for x86/64.
Signed-off-by: Guillaume Charifi <guillaume.charifi(a)sfr.fr> --- dlls/msvcrt/string.c | 63 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/dlls/msvcrt/string.c b/dlls/msvcrt/string.c index 4d09405094d..6d1500cb194 100644 --- a/dlls/msvcrt/string.c +++ b/dlls/msvcrt/string.c @@ -2527,6 +2527,7 @@ int __cdecl memcmp(const void *ptr1, const void *ptr2, size_t n) __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") \ "popq " SRC_REG "\n\t" \ __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") + #endif void * __cdecl sse2_memmove(void *dst, const void *src, size_t n); @@ -2732,6 +2733,64 @@ __ASM_GLOBAL_FUNC( sse2_memmove, MEMMOVE_CLEANUP "ret" ) +#undef DEST_REG +#undef SRC_REG +#undef LEN_REG +#undef TMP_REG + +#ifdef __i386__ + +#define DEST_REG "%edi" +#define SRC_REG "%eax" +#define LEN_REG "%ecx" +#define TMP_REG + +#define MEMSET_INIT \ + "pushl " DEST_REG "\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \ + "movl 8(%esp), " DEST_REG "\n\t" \ + "movl 12(%esp), " SRC_REG "\n\t" \ + "movl 16(%esp), " LEN_REG "\n\t" + +#define MEMSET_CLEANUP \ + "movl 8(%esp), %eax\n\t" \ + "popl " DEST_REG "\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -4\n\t") + +#else + +#define DEST_REG "%rdi" +#define SRC_REG "%rax" +#define LEN_REG "%rcx" +#define TMP_REG "%r9" + +#define MEMSET_INIT \ + "pushq " DEST_REG "\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \ + "movq %rcx, " DEST_REG "\n\t" \ + "movq %rdx, " SRC_REG "\n\t" \ + "movq %r8, " LEN_REG "\n\t" \ + "movq " DEST_REG ", " TMP_REG "\n\t" + +#define MEMSET_CLEANUP \ + "movq " TMP_REG ", %rax\n\t" \ + "popq " DEST_REG "\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") + +#endif + +void * __cdecl ermsb_memset(void *dst, int c, size_t n); +__ASM_GLOBAL_FUNC( ermsb_memset, + MEMSET_INIT + "rep stosb\n\t" + MEMSET_CLEANUP + "ret" ) + +#undef DEST_REG +#undef SRC_REG +#undef LEN_REG +#undef TMP_REG + #endif /********************************************************************* @@ -2860,9 +2919,13 @@ void * __cdecl memcpy(void *dst, const void *src, size_t n) */ void* __cdecl memset(void *dst, int c, size_t n) { +#if defined(__i386__) || defined(__x86_64__) + return ermsb_memset(dst, c, n); +#else volatile unsigned char *d = dst; /* avoid gcc optimizations */ while (n--) *d++ = c; return dst; +#endif } /********************************************************************* -- 2.33.0
On 8/31/21 12:14 PM, Guillaume Charifi wrote:
Signed-off-by: Guillaume Charifi <guillaume.charifi(a)sfr.fr> --- dlls/msvcrt/string.c | 63 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+)
diff --git a/dlls/msvcrt/string.c b/dlls/msvcrt/string.c index 4d09405094d..6d1500cb194 100644 --- a/dlls/msvcrt/string.c +++ b/dlls/msvcrt/string.c @@ -2527,6 +2527,7 @@ int __cdecl memcmp(const void *ptr1, const void *ptr2, size_t n) __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") \ "popq " SRC_REG "\n\t" \ __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") + #endif
void * __cdecl sse2_memmove(void *dst, const void *src, size_t n); @@ -2732,6 +2733,64 @@ __ASM_GLOBAL_FUNC( sse2_memmove, MEMMOVE_CLEANUP "ret" )
+#undef DEST_REG +#undef SRC_REG +#undef LEN_REG +#undef TMP_REG + +#ifdef __i386__ + +#define DEST_REG "%edi" +#define SRC_REG "%eax" +#define LEN_REG "%ecx" +#define TMP_REG + +#define MEMSET_INIT \ + "pushl " DEST_REG "\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \ + "movl 8(%esp), " DEST_REG "\n\t" \ + "movl 12(%esp), " SRC_REG "\n\t" \ + "movl 16(%esp), " LEN_REG "\n\t" + +#define MEMSET_CLEANUP \ + "movl 8(%esp), %eax\n\t" \ + "popl " DEST_REG "\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -4\n\t") + +#else + +#define DEST_REG "%rdi" +#define SRC_REG "%rax" +#define LEN_REG "%rcx" +#define TMP_REG "%r9" + +#define MEMSET_INIT \ + "pushq " DEST_REG "\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \ + "movq %rcx, " DEST_REG "\n\t" \ + "movq %rdx, " SRC_REG "\n\t" \ + "movq %r8, " LEN_REG "\n\t" \ + "movq " DEST_REG ", " TMP_REG "\n\t" + +#define MEMSET_CLEANUP \ + "movq " TMP_REG ", %rax\n\t" \ + "popq " DEST_REG "\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") + +#endif + +void * __cdecl ermsb_memset(void *dst, int c, size_t n); +__ASM_GLOBAL_FUNC( ermsb_memset, + MEMSET_INIT + "rep stosb\n\t" + MEMSET_CLEANUP + "ret" ) + +#undef DEST_REG +#undef SRC_REG +#undef LEN_REG +#undef TMP_REG + #endif
/********************************************************************* @@ -2860,9 +2919,13 @@ void * __cdecl memcpy(void *dst, const void *src, size_t n) */ void* __cdecl memset(void *dst, int c, size_t n) { +#if defined(__i386__) || defined(__x86_64__) + return ermsb_memset(dst, c, n); +#else volatile unsigned char *d = dst; /* avoid gcc optimizations */ while (n--) *d++ = c; return dst; +#endif }
/*********************************************************************
In my opinion we should instead get rid of the volatile, it's what really kills the performance. GCC has the bad idea of replacing the loop with a builtin memset call, which doesn't work in Wine, but we can solve this in various ways I think. Then rep stosb/movsb aren't guaranteed to be fast on all CPUs. Even on modern CPUs, which have the "it's really fast" bit, as far as I could see it sometimes really makes a difference for large sizes (ie >= 4096), which I don't think is the common case. For a start we could add -ffreestanding / -fno-builtin compilation flags to msvcrt instead, although it won't be much faster, it will at least fix the issue with the builtin replacement. Then we can also unroll the loop a bit, still writing it in C, which makes it much faster and especially on small sizes. I actually have something like that locally (and in Proton) in ntdll, where it actually helps for zeroed heap allocations (which call ntdll memset instead of msvcrt):
static inline void memset_c_unaligned_32( char *d, uint64_t v, size_t n ) { if (n >= 24) { *(uint64_t *)d = v; *(uint64_t *)(d + 8) = v; *(uint64_t *)(d + 16) = v; *(uint64_t *)(d + n - 8) = v; } else if (n >= 16) { *(uint64_t *)d = v; *(uint64_t *)(d + 8) = v; *(uint64_t *)(d + n - 8) = v; } else if (n >= 8) { *(uint64_t *)d = v; *(uint64_t *)(d + n - 8) = v; } else if (n >= 4) { *(uint32_t *)d = v; *(uint32_t *)(d + n - 4) = v; } else if (n >= 2) { *(uint16_t *)d = v; *(uint16_t *)(d + n - 2) = v; } else if (n >= 1) { *(uint8_t *)d = v; } }
/********************************************************************* * memset (NTDLL.@) */ void *__cdecl memset(void *dst, int c, size_t n) { uint16_t tmp16 = ((uint16_t)c << 8) | c; uint32_t tmp32 = ((uint32_t)tmp16 << 16) | tmp16; uint64_t v = ((uint64_t)tmp32 << 32) | tmp32;
if (n <= 32) { memset_c_unaligned_32( dst, v, n ); return dst; }
while (n >= 48) { *(uint64_t*)((char *)dst + n - 8) = v; *(uint64_t*)((char *)dst + n - 16) = v; *(uint64_t*)((char *)dst + n - 24) = v; *(uint64_t*)((char *)dst + n - 32) = v; *(uint64_t*)((char *)dst + n - 40) = v; *(uint64_t*)((char *)dst + n - 48) = v; n -= 48; }
while (n >= 24) { *(uint64_t*)((char *)dst + n - 8) = v; *(uint64_t*)((char *)dst + n - 16) = v; *(uint64_t*)((char *)dst + n - 24) = v; n -= 24; }
memset_c_unaligned_32( dst, v, n ); return dst; }
Cheers, -- Rémi Bernon <rbernon(a)codeweavers.com>
On 8/31/21 12:52 PM, Rémi Bernon wrote:
For a start we could add -ffreestanding / -fno-builtin compilation flags to msvcrt instead, although it won't be much faster, it will at least fix the issue with the builtin replacement. As far as I understand -fno-builtin does not guarantee that there will be no call to memset introduced by compiler. Isn't it similar to the memcpy problem we have while using clang?
On 8/31/21 2:18 PM, Piotr Caban wrote:
On 8/31/21 12:52 PM, Rémi Bernon wrote:
For a start we could add -ffreestanding / -fno-builtin compilation flags to msvcrt instead, although it won't be much faster, it will at least fix the issue with the builtin replacement. As far as I understand -fno-builtin does not guarantee that there will be no call to memset introduced by compiler. Isn't it similar to the memcpy problem we have while using clang?
I don't really know, but in practice it has the desired effect. And yes, there's probably the same problem with memcpy. Note that apparently there's a flag for each builtin but -fno-builtin-memset doesn't do the trick for GCC (it does on clang). Also note that, as far as I could see, unrolling the loop a bit like in my quote is already enough to make it stop optimizing it to a builtin memset call, without having to add specific flags (although there's always the risk of it becoming even more clever). -- Rémi Bernon <rbernon(a)codeweavers.com>
On 8/31/21 2:41 PM, Rémi Bernon wrote:
On 8/31/21 2:18 PM, Piotr Caban wrote:
On 8/31/21 12:52 PM, Rémi Bernon wrote:
For a start we could add -ffreestanding / -fno-builtin compilation flags to msvcrt instead, although it won't be much faster, it will at least fix the issue with the builtin replacement. As far as I understand -fno-builtin does not guarantee that there will be no call to memset introduced by compiler. Isn't it similar to the memcpy problem we have while using clang?
I don't really know, but in practice it has the desired effect. And yes, there's probably the same problem with memcpy.
Note that apparently there's a flag for each builtin but -fno-builtin-memset doesn't do the trick for GCC (it does on clang).
Also note that, as far as I could see, unrolling the loop a bit like in my quote is already enough to make it stop optimizing it to a builtin memset call, without having to add specific flags (although there's always the risk of it becoming even more clever).
For memcpy, it's a bit longer but I have this, which works too: https://github.com/rbernon/wine/commit/6f0b5ed9abb5d8.patch -- Rémi Bernon <rbernon(a)codeweavers.com>
Signed-off-by: Guillaume Charifi <guillaume.charifi(a)sfr.fr> --- dlls/msvcrt/string.c | 63 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/dlls/msvcrt/string.c b/dlls/msvcrt/string.c index 4d09405094d..6d1500cb194 100644 --- a/dlls/msvcrt/string.c +++ b/dlls/msvcrt/string.c @@ -2527,6 +2527,7 @@ int __cdecl memcmp(const void *ptr1, const void *ptr2, size_t n) __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") \ "popq " SRC_REG "\n\t" \ __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") + #endif void * __cdecl sse2_memmove(void *dst, const void *src, size_t n); @@ -2732,6 +2733,64 @@ __ASM_GLOBAL_FUNC( sse2_memmove, MEMMOVE_CLEANUP "ret" ) +#undef DEST_REG +#undef SRC_REG +#undef LEN_REG +#undef TMP_REG + +#ifdef __i386__ + +#define DEST_REG "%edi" +#define SRC_REG "%eax" +#define LEN_REG "%ecx" +#define TMP_REG + +#define MEMSET_INIT \ + "pushl " DEST_REG "\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \ + "movl 8(%esp), " DEST_REG "\n\t" \ + "movl 12(%esp), " SRC_REG "\n\t" \ + "movl 16(%esp), " LEN_REG "\n\t" + +#define MEMSET_CLEANUP \ + "movl 8(%esp), %eax\n\t" \ + "popl " DEST_REG "\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -4\n\t") + +#else + +#define DEST_REG "%rdi" +#define SRC_REG "%rax" +#define LEN_REG "%rcx" +#define TMP_REG "%r9" + +#define MEMSET_INIT \ + "pushq " DEST_REG "\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \ + "movq %rcx, " DEST_REG "\n\t" \ + "movq %rdx, " SRC_REG "\n\t" \ + "movq %r8, " LEN_REG "\n\t" \ + "movq " DEST_REG ", " TMP_REG "\n\t" + +#define MEMSET_CLEANUP \ + "movq " TMP_REG ", %rax\n\t" \ + "popq " DEST_REG "\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") + +#endif + +void * __cdecl ermsb_memset(void *dst, int c, size_t n); +__ASM_GLOBAL_FUNC( ermsb_memset, + MEMSET_INIT + "rep stosb\n\t" + MEMSET_CLEANUP + "ret" ) + +#undef DEST_REG +#undef SRC_REG +#undef LEN_REG +#undef TMP_REG + #endif /********************************************************************* @@ -2860,9 +2919,13 @@ void * __cdecl memcpy(void *dst, const void *src, size_t n) */ void* __cdecl memset(void *dst, int c, size_t n) { +#if defined(__i386__) || defined(__x86_64__) + return ermsb_memset(dst, c, n); +#else volatile unsigned char *d = dst; /* avoid gcc optimizations */ while (n--) *d++ = c; return dst; +#endif } /*********************************************************************
Hi, While running your changed tests, I think I found new failures. Being a bot and all I'm not very good at pattern recognition, so I might be wrong, but could you please double-check? Full results can be found at: https://testbot.winehq.org/JobDetails.pl?Key=97752 Your paranoid android. === debiant2 (build log) === error: corrupt patch at line 11 Task: Patch failed to apply === debiant2 (build log) === error: corrupt patch at line 11 Task: Patch failed to apply
participants (5)
-
Guillaume Charifi -
Guillaume Charifi-Hoareau -
Marvin -
Piotr Caban -
Rémi Bernon