On 8/31/21 12:14 PM, Guillaume Charifi wrote:
Signed-off-by: Guillaume Charifi <guillaume.charifi@sfr.fr>
 dlls/msvcrt/string.c | 63 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)
diff --git a/dlls/msvcrt/string.c b/dlls/msvcrt/string.c
index 4d09405094d..6d1500cb194 100644
--- a/dlls/msvcrt/string.c
+++ b/dlls/msvcrt/string.c
@@ -2527,6 +2527,7 @@ int __cdecl memcmp(const void *ptr1, const void *ptr2, size_t n)
     __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") \
     "popq " SRC_REG "\n\t" \
    __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t")
#endif
void * __cdecl sse2_memmove(void *dst, const void *src, size_t n);
@@ -2732,6 +2733,64 @@ __ASM_GLOBAL_FUNC( sse2_memmove,
        MEMMOVE_CLEANUP
        "ret" )
+#undef DEST_REG
+#undef SRC_REG
+#undef LEN_REG
+#undef TMP_REG
+#ifdef __i386__
+#define DEST_REG "%edi"
+#define SRC_REG "%eax"
+#define LEN_REG "%ecx"
+#define TMP_REG
+#define MEMSET_INIT \
- "pushl " DEST_REG "\n\t" \
- __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \
- "movl 8(%esp), " DEST_REG "\n\t" \
- "movl 12(%esp), " SRC_REG "\n\t" \
- "movl 16(%esp), " LEN_REG "\n\t"
+#define MEMSET_CLEANUP \
- "movl 8(%esp), %eax\n\t" \
- "popl " DEST_REG "\n\t" \
- __ASM_CFI(".cfi_adjust_cfa_offset -4\n\t")
+#else
+#define DEST_REG "%rdi"
+#define SRC_REG "%rax"
+#define LEN_REG "%rcx"
+#define TMP_REG "%r9"
+#define MEMSET_INIT \
- "pushq " DEST_REG "\n\t" \
- __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \
- "movq %rcx, " DEST_REG "\n\t" \
- "movq %rdx, " SRC_REG "\n\t" \
- "movq %r8, " LEN_REG "\n\t" \
- "movq " DEST_REG ", " TMP_REG "\n\t"
+#define MEMSET_CLEANUP \
- "movq " TMP_REG ", %rax\n\t" \
- "popq " DEST_REG "\n\t" \
- __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t")
+#endif
+void * __cdecl ermsb_memset(void *dst, int c, size_t n);
+__ASM_GLOBAL_FUNC( ermsb_memset,
+        MEMSET_INIT
+        "rep stosb\n\t"
+        MEMSET_CLEANUP
+        "ret" )
+#undef DEST_REG
+#undef SRC_REG
+#undef LEN_REG
+#undef TMP_REG
#endif
/*********************************************************************
@@ -2860,9 +2919,13 @@ void * __cdecl memcpy(void *dst, const void *src, size_t n)
 */
void* __cdecl memset(void *dst, int c, size_t n)
{
+#if defined(__i386__) || defined(__x86_64__)
+    return ermsb_memset(dst, c, n);
+#else
    volatile unsigned char *d = dst;  /* avoid gcc optimizations */
    while (n--) *d++ = c;
    return dst;
+#endif
}
/*********************************************************************
In my opinion we should instead get rid of the volatile; that is what really kills the performance. GCC has the bad idea of replacing the plain loop with a builtin memset call, which doesn't work in Wine (the call would just recurse into the memset we are defining), but I think we can solve this in various ways.
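For illustration, one way to drop the volatile and still keep GCC from pattern-matching the loop would be an empty asm statement acting as an optimization barrier; this is only a sketch of the idea (GNU C extension, untested), not necessarily what I'd propose upstream:

void *__cdecl memset(void *dst, int c, size_t n)
{
    unsigned char *d = dst;  /* no volatile */
    while (n--)
    {
        *d++ = c;
        /* opaque to the optimizer, so the loop cannot be rewritten
         * into a __builtin_memset() call (i.e. into memset itself) */
        __asm__ __volatile__("" : "+r" (d));
    }
    return dst;
}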
Then, rep stosb/movsb isn't guaranteed to be fast on all CPUs. Even on modern CPUs, which have the "it's really fast" bit (the ERMSB feature flag), as far as I could see it only really makes a difference for large sizes (i.e. >= 4096 bytes), which I don't think is the common case.
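For reference, that bit is CPUID.(EAX=07H,ECX=0):EBX bit 9 (ERMS). If we ever gate a rep stosb path on it, the check would look roughly like this (sketch using GCC's <cpuid.h>; the helper name is made up):

#include <cpuid.h>
#include <stdbool.h>

/* hypothetical helper: does the CPU advertise fast rep stosb/movsb? */
static bool have_erms(void)
{
    unsigned int eax, ebx, ecx, edx;
    if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) return false;
    return (ebx >> 9) & 1;  /* CPUID.(EAX=7,ECX=0):EBX bit 9 = ERMS */
}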
For a start we could instead add the -ffreestanding / -fno-builtin compilation flags to msvcrt; it won't make things much faster, but it will at least fix the issue with the builtin replacement.
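Just to illustrate what those flags change, outside of any Wine code (toy example, not the actual msvcrt source):

#include <stddef.h>

void zero_bytes(unsigned char *d, size_t n)
{
    /* with plain -O2, GCC will typically rewrite this loop as a call to
     * memset; with -O2 -fno-builtin (or -ffreestanding) the loop is kept,
     * which is what we need inside memset itself */
    while (n--) *d++ = 0;
}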
Then we can also unroll the loop a bit, still writing it in C, which makes it much faster, especially on small sizes. I actually have something like that locally (and in Proton) in ntdll, where it helps for zeroed heap allocations (which call the ntdll memset instead of the msvcrt one):
static inline void memset_c_unaligned_32( char *d, uint64_t v, size_t n )
{
    if (n >= 24)
    {
        *(uint64_t *)d = v;
        *(uint64_t *)(d + 8) = v;
        *(uint64_t *)(d + 16) = v;
        *(uint64_t *)(d + n - 8) = v;
    }
    else if (n >= 16)
    {
        *(uint64_t *)d = v;
        *(uint64_t *)(d + 8) = v;
        *(uint64_t *)(d + n - 8) = v;
    }
    else if (n >= 8)
    {
        *(uint64_t *)d = v;
        *(uint64_t *)(d + n - 8) = v;
    }
    else if (n >= 4)
    {
        *(uint32_t *)d = v;
        *(uint32_t *)(d + n - 4) = v;
    }
    else if (n >= 2)
    {
        *(uint16_t *)d = v;
        *(uint16_t *)(d + n - 2) = v;
    }
    else if (n >= 1)
    {
        *(uint8_t *)d = v;
    }
}
/*********************************************************************
 *              memset   (NTDLL.@)
 */
void *__cdecl memset(void *dst, int c, size_t n)
{
    uint16_t tmp16 = ((uint16_t)c << 8) | c;
    uint32_t tmp32 = ((uint32_t)tmp16 << 16) | tmp16;
    uint64_t v = ((uint64_t)tmp32 << 32) | tmp32;

    if (n <= 32)
    {
        memset_c_unaligned_32( dst, v, n );
        return dst;
    }
    while (n >= 48)
    {
        *(uint64_t *)((char *)dst + n - 8) = v;
        *(uint64_t *)((char *)dst + n - 16) = v;
        *(uint64_t *)((char *)dst + n - 24) = v;
        *(uint64_t *)((char *)dst + n - 32) = v;
        *(uint64_t *)((char *)dst + n - 40) = v;
        *(uint64_t *)((char *)dst + n - 48) = v;
        n -= 48;
    }
    while (n >= 24)
    {
        *(uint64_t *)((char *)dst + n - 8) = v;
        *(uint64_t *)((char *)dst + n - 16) = v;
        *(uint64_t *)((char *)dst + n - 24) = v;
        n -= 24;
    }
    memset_c_unaligned_32( dst, v, n );
    return dst;
}
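In case anyone wants to play with it, here is the kind of quick standalone check I use; the code above is assumed to be pasted in with memset renamed to unrolled_memset (and __cdecl dropped) so it doesn't clash with the libc one (test sketch only):

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* paste memset_c_unaligned_32() and the unrolled memset from above here,
 * renaming the latter to unrolled_memset */

int main(void)
{
    unsigned char buf[256], ref[256];
    size_t n;

    for (n = 0; n <= 200; n++)
    {
        memset(buf, 0x11, sizeof(buf));
        memset(ref, 0x11, sizeof(ref));
        unrolled_memset(buf + 7, 0xab, n);  /* deliberately misaligned start */
        memset(ref + 7, 0xab, n);
        assert(!memcmp(buf, ref, sizeof(buf)));
    }
    return 0;
}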
Cheers,