From: Martin Storsjö <martin@martin.st>
This fixes a regression in memset on ARM that was introduced by commit 7b17d7081512db52ef852705445762ac4016c29f.
ARM can do 64-bit writes with the STRD instruction, but that instruction requires a 32-bit aligned address, whereas these stores may be unaligned.
The compiler can also fuse two consecutive stores through uint32_t* pointers into a single STRD, since a uint32_t* is supposed to be properly aligned. Therefore, perform these stores through volatile uint32_t* pointers to prevent them from being fused.
Signed-off-by: Martin Storsjö <martin@martin.st>
---
 dlls/msvcrt/string.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
diff --git a/dlls/msvcrt/string.c b/dlls/msvcrt/string.c index f2b1b4a5b11..bf491a91f40 100644 --- a/dlls/msvcrt/string.c +++ b/dlls/msvcrt/string.c @@ -2878,15 +2878,37 @@ void *__cdecl memset(void *dst, int c, size_t n)
if (n >= 16) { +#ifdef __arm__ + *(volatile uint32_t *)(d + 0) = v; + *(volatile uint32_t *)(d + 4) = v; + *(volatile uint32_t *)(d + 8) = v; + *(volatile uint32_t *)(d + 12) = v; + *(volatile uint32_t *)(d + n - 16) = v; + *(volatile uint32_t *)(d + n - 12) = v; + *(volatile uint32_t *)(d + n - 8) = v; + *(volatile uint32_t *)(d + n - 4) = v; +#else *(uint64_t *)(d + 0) = v; *(uint64_t *)(d + 8) = v; *(uint64_t *)(d + n - 16) = v; *(uint64_t *)(d + n - 8) = v; +#endif if (n <= 32) return dst; +#ifdef __arm__ + *(volatile uint32_t *)(d + 16) = v; + *(volatile uint32_t *)(d + 20) = v; + *(volatile uint32_t *)(d + 24) = v; + *(volatile uint32_t *)(d + 28) = v; + *(volatile uint32_t *)(d + n - 32) = v; + *(volatile uint32_t *)(d + n - 28) = v; + *(volatile uint32_t *)(d + n - 24) = v; + *(volatile uint32_t *)(d + n - 20) = v; +#else *(uint64_t *)(d + 16) = v; *(uint64_t *)(d + 24) = v; *(uint64_t *)(d + n - 32) = v; *(uint64_t *)(d + n - 24) = v; +#endif if (n <= 64) return dst;
n = (n - a) & ~0x1f; @@ -2895,8 +2917,15 @@ void *__cdecl memset(void *dst, int c, size_t n) } if (n >= 8) { +#ifdef __arm__ + *(volatile uint32_t *)d = v; + *(volatile uint32_t *)(d + 4) = v; + *(volatile uint32_t *)(d + n - 4) = v; + *(volatile uint32_t *)(d + n - 8) = v; +#else *(uint64_t *)d = v; *(uint64_t *)(d + n - 8) = v; +#endif return dst; } if (n >= 4)