Write memory forward in memset_aligned_32 instead of going backward, because writing backward breaks the Linux kernel's transparent huge page allocation assumptions.
This can be reproduced by calling memset on large, newly allocated memory regions.
Signed-off-by: Rémi Bernon rbernon@codeweavers.com --- dlls/msvcrt/string.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-)
diff --git a/dlls/msvcrt/string.c b/dlls/msvcrt/string.c index 5655fbfe68a..48d44d3b72e 100644 --- a/dlls/msvcrt/string.c +++ b/dlls/msvcrt/string.c @@ -2857,13 +2857,14 @@ void * __cdecl memcpy(void *dst, const void *src, size_t n)
static inline void memset_aligned_32(unsigned char *d, uint64_t v, size_t n) { - while (n >= 32) - { - *(uint64_t *)(d + n - 32) = v; - *(uint64_t *)(d + n - 24) = v; - *(uint64_t *)(d + n - 16) = v; - *(uint64_t *)(d + n - 8) = v; - n -= 32; + unsigned char *end = d + n; + while (d < end) + { + *(uint64_t *)(d + 0) = v; + *(uint64_t *)(d + 8) = v; + *(uint64_t *)(d + 16) = v; + *(uint64_t *)(d + 24) = v; + d += 32; } }
Although the ntdll memset is less frequently used than the msvcrt one, heap allocations are zeroed out with it, and they don't benefit from the msvcrt memset optimisation. Implement the same optimisation in ntdll.
Signed-off-by: Rémi Bernon rbernon@codeweavers.com --- dlls/ntdll/string.c | 66 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 63 insertions(+), 3 deletions(-)
diff --git a/dlls/ntdll/string.c b/dlls/ntdll/string.c index f1cea6caa83..cd4f3c6a8a2 100644 --- a/dlls/ntdll/string.c +++ b/dlls/ntdll/string.c @@ -25,6 +25,7 @@ #include <stdlib.h> #include <stdio.h> #include <string.h> +#include <stdint.h>
#include "windef.h" #include "winbase.h" @@ -140,13 +141,72 @@ void * __cdecl memmove( void *dst, const void *src, size_t n ) }
+static inline void memset_aligned_32( unsigned char *d, uint64_t v, size_t n ) +{ + unsigned char *end = d + n; + while (d < end) + { + *(uint64_t *)(d + 0) = v; + *(uint64_t *)(d + 8) = v; + *(uint64_t *)(d + 16) = v; + *(uint64_t *)(d + 24) = v; + d += 32; + } +} + /********************************************************************* * memset (NTDLL.@) */ -void * __cdecl memset( void *dst, int c, size_t n ) +void *__cdecl memset( void *dst, int c, size_t n ) { - volatile unsigned char *d = dst; /* avoid gcc optimizations */ - while (n--) *d++ = c; + typedef uint64_t DECLSPEC_ALIGN(1) unaligned_ui64; + typedef uint32_t DECLSPEC_ALIGN(1) unaligned_ui32; + typedef uint16_t DECLSPEC_ALIGN(1) unaligned_ui16; + + uint64_t v = 0x101010101010101ull * (unsigned char)c; + unsigned char *d = (unsigned char *)dst; + size_t a = 0x20 - ((uintptr_t)d & 0x1f); + + if (n >= 16) + { + *(unaligned_ui64 *)(d + 0) = v; + *(unaligned_ui64 *)(d + 8) = v; + *(unaligned_ui64 *)(d + n - 16) = v; + *(unaligned_ui64 *)(d + n - 8) = v; + if (n <= 32) return dst; + *(unaligned_ui64 *)(d + 16) = v; + *(unaligned_ui64 *)(d + 24) = v; + *(unaligned_ui64 *)(d + n - 32) = v; + *(unaligned_ui64 *)(d + n - 24) = v; + if (n <= 64) return dst; + + n = (n - a) & ~0x1f; + memset_aligned_32( d + a, v, n ); + return dst; + } + if (n >= 8) + { + *(unaligned_ui64 *)d = v; + *(unaligned_ui64 *)(d + n - 8) = v; + return dst; + } + if (n >= 4) + { + *(unaligned_ui32 *)d = v; + *(unaligned_ui32 *)(d + n - 4) = v; + return dst; + } + if (n >= 2) + { + *(unaligned_ui16 *)d = v; + *(unaligned_ui16 *)(d + n - 2) = v; + return dst; + } + if (n >= 1) + { + *(uint8_t *)d = v; + return dst; + } return dst; }