Signed-off-by: Guillaume Charifi [email protected] --- dlls/msvcrt/string.c | 63 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+)
diff --git a/dlls/msvcrt/string.c b/dlls/msvcrt/string.c index 4d09405094d..6d1500cb194 100644 --- a/dlls/msvcrt/string.c +++ b/dlls/msvcrt/string.c @@ -2527,6 +2527,7 @@ int __cdecl memcmp(const void *ptr1, const void *ptr2, size_t n) __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") \ "popq " SRC_REG "\n\t" \ __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") + #endif
void * __cdecl sse2_memmove(void *dst, const void *src, size_t n); @@ -2732,6 +2733,64 @@ __ASM_GLOBAL_FUNC( sse2_memmove, MEMMOVE_CLEANUP "ret" )
+#undef DEST_REG +#undef SRC_REG +#undef LEN_REG +#undef TMP_REG + +#ifdef __i386__ + +#define DEST_REG "%edi" +#define SRC_REG "%eax" +#define LEN_REG "%ecx" +#define TMP_REG + +#define MEMSET_INIT \ + "pushl " DEST_REG "\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \ + "movl 8(%esp), " DEST_REG "\n\t" \ + "movl 12(%esp), " SRC_REG "\n\t" \ + "movl 16(%esp), " LEN_REG "\n\t" + +#define MEMSET_CLEANUP \ + "movl 8(%esp), %eax\n\t" \ + "popl " DEST_REG "\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -4\n\t") + +#else + +#define DEST_REG "%rdi" +#define SRC_REG "%rax" +#define LEN_REG "%rcx" +#define TMP_REG "%r9" + +#define MEMSET_INIT \ + "pushq " DEST_REG "\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \ + "movq %rcx, " DEST_REG "\n\t" \ + "movq %rdx, " SRC_REG "\n\t" \ + "movq %r8, " LEN_REG "\n\t" \ + "movq " DEST_REG ", " TMP_REG "\n\t" + +#define MEMSET_CLEANUP \ + "movq " TMP_REG ", %rax\n\t" \ + "popq " DEST_REG "\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") + +#endif + +void * __cdecl ermsb_memset(void *dst, int c, size_t n); +__ASM_GLOBAL_FUNC( ermsb_memset, + MEMSET_INIT + "rep stosb\n\t" + MEMSET_CLEANUP + "ret" ) + +#undef DEST_REG +#undef SRC_REG +#undef LEN_REG +#undef TMP_REG + #endif
/********************************************************************* @@ -2860,9 +2919,13 @@ void * __cdecl memcpy(void *dst, const void *src, size_t n) */ void* __cdecl memset(void *dst, int c, size_t n) { +#if defined(__i386__) || defined(__x86_64__) + return ermsb_memset(dst, c, n); +#else volatile unsigned char *d = dst; /* avoid gcc optimizations */ while (n--) *d++ = c; return dst; +#endif }
/*********************************************************************
On 8/31/21 12:14 PM, Guillaume Charifi wrote:
Signed-off-by: Guillaume Charifi [email protected]
dlls/msvcrt/string.c | 63 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+)
diff --git a/dlls/msvcrt/string.c b/dlls/msvcrt/string.c index 4d09405094d..6d1500cb194 100644 --- a/dlls/msvcrt/string.c +++ b/dlls/msvcrt/string.c @@ -2527,6 +2527,7 @@ int __cdecl memcmp(const void *ptr1, const void *ptr2, size_t n) __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") \ "popq " SRC_REG "\n\t" \ __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t")
#endif
void * __cdecl sse2_memmove(void *dst, const void *src, size_t n);
@@ -2732,6 +2733,64 @@ __ASM_GLOBAL_FUNC( sse2_memmove, MEMMOVE_CLEANUP "ret" )
+#undef DEST_REG +#undef SRC_REG +#undef LEN_REG +#undef TMP_REG
+#ifdef __i386__
+#define DEST_REG "%edi" +#define SRC_REG "%eax" +#define LEN_REG "%ecx" +#define TMP_REG
+#define MEMSET_INIT \
- "pushl " DEST_REG "\n\t" \
- __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \
- "movl 8(%esp), " DEST_REG "\n\t" \
- "movl 12(%esp), " SRC_REG "\n\t" \
- "movl 16(%esp), " LEN_REG "\n\t"
+#define MEMSET_CLEANUP \
- "movl 8(%esp), %eax\n\t" \
- "popl " DEST_REG "\n\t" \
- __ASM_CFI(".cfi_adjust_cfa_offset -4\n\t")
+#else
+#define DEST_REG "%rdi" +#define SRC_REG "%rax" +#define LEN_REG "%rcx" +#define TMP_REG "%r9"
+#define MEMSET_INIT \
- "pushq " DEST_REG "\n\t" \
- __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \
- "movq %rcx, " DEST_REG "\n\t" \
- "movq %rdx, " SRC_REG "\n\t" \
- "movq %r8, " LEN_REG "\n\t" \
- "movq " DEST_REG ", " TMP_REG "\n\t"
+#define MEMSET_CLEANUP \
- "movq " TMP_REG ", %rax\n\t" \
- "popq " DEST_REG "\n\t" \
- __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t")
+#endif
+void * __cdecl ermsb_memset(void *dst, int c, size_t n); +__ASM_GLOBAL_FUNC( ermsb_memset,
MEMSET_INIT
"rep stosb\n\t"
MEMSET_CLEANUP
"ret" )
+#undef DEST_REG +#undef SRC_REG +#undef LEN_REG +#undef TMP_REG
#endif
/*********************************************************************
@@ -2860,9 +2919,13 @@ void * __cdecl memcpy(void *dst, const void *src, size_t n) */ void* __cdecl memset(void *dst, int c, size_t n) { +#if defined(__i386__) || defined(__x86_64__)
- return ermsb_memset(dst, c, n);
+#else volatile unsigned char *d = dst; /* avoid gcc optimizations */ while (n--) *d++ = c; return dst; +#endif }
/*********************************************************************
In my opinion we should instead get rid of the volatile, it's what really kills the performance. GCC has the bad idea of replacing the loop with a builtin memset call, which doesn't work in Wine, but we can solve this in various ways I think.
Then rep stosb/movsb aren't guaranteed to be fast on all CPUs. Even on modern CPUs, which have the "it's really fast" bit, as far as I could see it sometimes really makes a difference for large sizes (ie >= 4096), which I don't think is the common case.
For a start we could add -ffreestanding / -fno-builtin compilation flags to msvcrt instead, although it won't be much faster, it will at least fix the issue with the builtin replacement.
Then we can also unroll the loop a bit, still writing it in C, which makes it much faster and especially on small sizes. I actually have something like that locally (and in Proton) in ntdll, where it actually helps for zeroed heap allocations (which call ntdll memset instead of msvcrt):
static inline void memset_c_unaligned_32( char *d, uint64_t v, size_t n ) { if (n >= 24) { *(uint64_t *)d = v; *(uint64_t *)(d + 8) = v; *(uint64_t *)(d + 16) = v; *(uint64_t *)(d + n - 8) = v; } else if (n >= 16) { *(uint64_t *)d = v; *(uint64_t *)(d + 8) = v; *(uint64_t *)(d + n - 8) = v; } else if (n >= 8) { *(uint64_t *)d = v; *(uint64_t *)(d + n - 8) = v; } else if (n >= 4) { *(uint32_t *)d = v; *(uint32_t *)(d + n - 4) = v; } else if (n >= 2) { *(uint16_t *)d = v; *(uint16_t *)(d + n - 2) = v; } else if (n >= 1) { *(uint8_t *)d = v; } }
/*********************************************************************
memset (NTDLL.@)
*/ void *__cdecl memset(void *dst, int c, size_t n) { uint16_t tmp16 = ((uint16_t)c << 8) | c; uint32_t tmp32 = ((uint32_t)tmp16 << 16) | tmp16; uint64_t v = ((uint64_t)tmp32 << 32) | tmp32;
if (n <= 32) { memset_c_unaligned_32( dst, v, n ); return dst; } while (n >= 48) { *(uint64_t*)((char *)dst + n - 8) = v; *(uint64_t*)((char *)dst + n - 16) = v; *(uint64_t*)((char *)dst + n - 24) = v; *(uint64_t*)((char *)dst + n - 32) = v; *(uint64_t*)((char *)dst + n - 40) = v; *(uint64_t*)((char *)dst + n - 48) = v; n -= 48; } while (n >= 24) { *(uint64_t*)((char *)dst + n - 8) = v; *(uint64_t*)((char *)dst + n - 16) = v; *(uint64_t*)((char *)dst + n - 24) = v; n -= 24; } memset_c_unaligned_32( dst, v, n ); return dst;
}
Cheers,
On 8/31/21 12:52 PM, Rémi Bernon wrote:
For a start we could add -ffreestanding / -fno-builtin compilation flags to msvcrt instead, although it won't be much faster, it will at least fix the issue with the builtin replacement.
As far as I understand -fno-builtin does not guarantee that there will be no call to memset introduced by compiler. Isn't it similar to the memcpy problem we have while using clang?
On 8/31/21 2:18 PM, Piotr Caban wrote:
On 8/31/21 12:52 PM, Rémi Bernon wrote:
For a start we could add -ffreestanding / -fno-builtin compilation flags to msvcrt instead, although it won't be much faster, it will at least fix the issue with the builtin replacement.
As far as I understand -fno-builtin does not guarantee that there will be no call to memset introduced by compiler. Isn't it similar to the memcpy problem we have while using clang?
I don't really know, but in practice it has the desired effect. And yes, there's probably the same problem with memcpy.
Note that apparently there's a flag for each builtin but -fno-builtin-memset doesn't do the trick for GCC (it does on clang).
Also note that, as far as I could see, unrolling the loop a bit like in my quote is already enough to make it stop optimizing it to a builtin memset call, without having to add specific flags (although there's always the risk of it becoming even more clever).
On 8/31/21 2:41 PM, Rémi Bernon wrote:
On 8/31/21 2:18 PM, Piotr Caban wrote:
On 8/31/21 12:52 PM, Rémi Bernon wrote:
For a start we could add -ffreestanding / -fno-builtin compilation flags to msvcrt instead, although it won't be much faster, it will at least fix the issue with the builtin replacement.
As far as I understand -fno-builtin does not guarantee that there will be no call to memset introduced by compiler. Isn't it similar to the memcpy problem we have while using clang?
I don't really know, but in practice it has the desired effect. And yes, there's probably the same problem with memcpy.
Note that apparently there's a flag for each builtin but -fno-builtin-memset doesn't do the trick for GCC (it does on clang).
Also note that, as far as I could see, unrolling the loop a bit like in my quote is already enough to make it stop optimizing it to a builtin memset call, without having to add specific flags (although there's always the risk of it becoming even more clever).
For memcpy, it's a bit longer but I have this, which works too: https://github.com/rbernon/wine/commit/6f0b5ed9abb5d8.patch
Signed-off-by: Guillaume Charifi [email protected] --- dlls/msvcrt/string.c | 63 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+)
diff --git a/dlls/msvcrt/string.c b/dlls/msvcrt/string.c index 4d09405094d..6d1500cb194 100644 --- a/dlls/msvcrt/string.c +++ b/dlls/msvcrt/string.c @@ -2527,6 +2527,7 @@ int __cdecl memcmp(const void *ptr1, const void *ptr2, size_t n) __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") \ "popq " SRC_REG "\n\t" \ __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") + #endif
void * __cdecl sse2_memmove(void *dst, const void *src, size_t n); @@ -2732,6 +2733,64 @@ __ASM_GLOBAL_FUNC( sse2_memmove, MEMMOVE_CLEANUP "ret" )
+#undef DEST_REG +#undef SRC_REG +#undef LEN_REG +#undef TMP_REG + +#ifdef __i386__ + +#define DEST_REG "%edi" +#define SRC_REG "%eax" +#define LEN_REG "%ecx" +#define TMP_REG + +#define MEMSET_INIT \ + "pushl " DEST_REG "\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \ + "movl 8(%esp), " DEST_REG "\n\t" \ + "movl 12(%esp), " SRC_REG "\n\t" \ + "movl 16(%esp), " LEN_REG "\n\t" + +#define MEMSET_CLEANUP \ + "movl 8(%esp), %eax\n\t" \ + "popl " DEST_REG "\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -4\n\t") + +#else + +#define DEST_REG "%rdi" +#define SRC_REG "%rax" +#define LEN_REG "%rcx" +#define TMP_REG "%r9" + +#define MEMSET_INIT \ + "pushq " DEST_REG "\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \ + "movq %rcx, " DEST_REG "\n\t" \ + "movq %rdx, " SRC_REG "\n\t" \ + "movq %r8, " LEN_REG "\n\t" \ + "movq " DEST_REG ", " TMP_REG "\n\t" + +#define MEMSET_CLEANUP \ + "movq " TMP_REG ", %rax\n\t" \ + "popq " DEST_REG "\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") + +#endif + +void * __cdecl ermsb_memset(void *dst, int c, size_t n); +__ASM_GLOBAL_FUNC( ermsb_memset, + MEMSET_INIT + "rep stosb\n\t" + MEMSET_CLEANUP + "ret" ) + +#undef DEST_REG +#undef SRC_REG +#undef LEN_REG +#undef TMP_REG + #endif
/********************************************************************* @@ -2860,9 +2919,13 @@ void * __cdecl memcpy(void *dst, const void *src, size_t n) */ void* __cdecl memset(void *dst, int c, size_t n) { +#if defined(__i386__) || defined(__x86_64__) + return ermsb_memset(dst, c, n); +#else volatile unsigned char *d = dst; /* avoid gcc optimizations */ while (n--) *d++ = c; return dst; +#endif }
/*********************************************************************
Hi,
While running your changed tests, I think I found new failures. Being a bot and all I'm not very good at pattern recognition, so I might be wrong, but could you please double-check?
Full results can be found at: https://testbot.winehq.org/JobDetails.pl?Key=97752
Your paranoid android.
=== debiant2 (build log) ===
error: corrupt patch at line 11 Task: Patch failed to apply
=== debiant2 (build log) ===
error: corrupt patch at line 11 Task: Patch failed to apply