From: Martin Storsjö <martin@martin.st>

This fixes a regression in memset on ARM since
7b17d7081512db52ef852705445762ac4016c29f.

ARM can do 64 bit writes with the STRD instruction, but that instruction
requires a 32 bit aligned address - while these stores are unaligned.

Two consecutive stores to uint32_t* pointers can also be fused into one
single STRD, as a uint32_t* is supposed to be properly aligned - therefore,
do these stores as stores to volatile uint32_t* to avoid fusing them.

Signed-off-by: Martin Storsjö <martin@martin.st>
---
 dlls/msvcrt/string.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
diff --git a/dlls/msvcrt/string.c b/dlls/msvcrt/string.c
index f2b1b4a5b11..bf491a91f40 100644
--- a/dlls/msvcrt/string.c
+++ b/dlls/msvcrt/string.c
@@ -2878,15 +2878,37 @@ void *__cdecl memset(void *dst, int c, size_t n)
 
     if (n >= 16)
     {
+#ifdef __arm__
+        *(volatile uint32_t *)(d + 0) = v;
+        *(volatile uint32_t *)(d + 4) = v;
+        *(volatile uint32_t *)(d + 8) = v;
+        *(volatile uint32_t *)(d + 12) = v;
+        *(volatile uint32_t *)(d + n - 16) = v;
+        *(volatile uint32_t *)(d + n - 12) = v;
+        *(volatile uint32_t *)(d + n - 8) = v;
+        *(volatile uint32_t *)(d + n - 4) = v;
+#else
         *(uint64_t *)(d + 0) = v;
         *(uint64_t *)(d + 8) = v;
         *(uint64_t *)(d + n - 16) = v;
         *(uint64_t *)(d + n - 8) = v;
+#endif
         if (n <= 32) return dst;
+#ifdef __arm__
+        *(volatile uint32_t *)(d + 16) = v;
+        *(volatile uint32_t *)(d + 20) = v;
+        *(volatile uint32_t *)(d + 24) = v;
+        *(volatile uint32_t *)(d + 28) = v;
+        *(volatile uint32_t *)(d + n - 32) = v;
+        *(volatile uint32_t *)(d + n - 28) = v;
+        *(volatile uint32_t *)(d + n - 24) = v;
+        *(volatile uint32_t *)(d + n - 20) = v;
+#else
         *(uint64_t *)(d + 16) = v;
         *(uint64_t *)(d + 24) = v;
         *(uint64_t *)(d + n - 32) = v;
         *(uint64_t *)(d + n - 24) = v;
+#endif
         if (n <= 64) return dst;
 
         n = (n - a) & ~0x1f;
@@ -2895,8 +2917,15 @@ void *__cdecl memset(void *dst, int c, size_t n)
     }
     if (n >= 8)
     {
+#ifdef __arm__
+        *(volatile uint32_t *)d = v;
+        *(volatile uint32_t *)(d + 4) = v;
+        *(volatile uint32_t *)(d + n - 4) = v;
+        *(volatile uint32_t *)(d + n - 8) = v;
+#else
         *(uint64_t *)d = v;
         *(uint64_t *)(d + n - 8) = v;
+#endif
         return dst;
     }
     if (n >= 4)
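
For what it's worth, the paths touched here are easy to smoke-test from a
standalone program (my own sketch, not part of the patch): walk every small
start offset and every length around the 16/32/64 byte thresholds, and check
that exactly the requested bytes were written:

#include <assert.h>
#include <string.h>

int main(void)
{
    static unsigned char buf[8 + 128 + 8];
    unsigned int off;
    size_t i, n;

    for (off = 0; off < 8; off++)
    {
        for (n = 0; n <= 128; n++)
        {
            memset(buf, 0, sizeof(buf));
            memset(buf + 8 + off, 0xab, n);
            for (i = 0; i < n; i++)
                assert(buf[8 + off + i] == 0xab); /* region filled */
            assert(buf[8 + off - 1] == 0);        /* no underrun */
            assert(buf[8 + off + n] == 0);        /* no overrun */
        }
    }
    return 0;
}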
On 9/15/21 10:27 PM, Martin Storsjo wrote:
> From: Martin Storsjö <martin@martin.st>
>
> This fixes a regression in memset on ARM since
> 7b17d7081512db52ef852705445762ac4016c29f.
>
> ARM can do 64 bit writes with the STRD instruction, but that instruction
> requires a 32 bit aligned address - while these stores are unaligned.
>
> Two consecutive stores to uint32_t* pointers can also be fused into one
> single STRD, as a uint32_t* is supposed to be properly aligned - therefore,
> do these stores as stores to volatile uint32_t* to avoid fusing them.
>
> Signed-off-by: Martin Storsjö <martin@martin.st>
> ---
>  dlls/msvcrt/string.c | 29 +++++++++++++++++++++++++++++
>  1 file changed, 29 insertions(+)
> [...]

I'm confused that it causes trouble when I thought it could benefit more than just Intel architectures...
Maybe it could be made not too ugly with some macro to wrap the 64bit stores, defined differently for arm?
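
Something along these lines, perhaps (untested sketch, store_ui64 is just a
made-up name) - one helper that does the 64bit store as two volatile 32bit
stores on ARM, so the compiler cannot fuse them back into an STRD, and as a
plain 64bit store everywhere else:

#include <stdint.h>

static inline void store_ui64(unsigned char *p, uint64_t v)
{
#ifdef __arm__
    /* volatile keeps the two 32 bit stores from being merged into one
     * alignment-sensitive STRD; low half first (little-endian layout,
     * which is all that matters for memset's replicated byte pattern) */
    *(volatile uint32_t *)(p + 0) = (uint32_t)v;
    *(volatile uint32_t *)(p + 4) = (uint32_t)(v >> 32);
#else
    *(uint64_t *)p = v;
#endif
}

The memset body could then just call store_ui64(d + n - 16, v) and so on,
without any #ifdef blocks.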
On Wed, 15 Sep 2021, Rémi Bernon wrote:

> On 9/15/21 10:27 PM, Martin Storsjo wrote:
>> [...]
>
> I'm confused that it causes trouble when I thought it could benefit more
> than just Intel architectures...

It's kinda arch dependent - some architectures (sparc, mips iirc?) flat out
reject that kind of unaligned load/store. ARM used to be much more strict
about it too; newer architecture versions are fairly lenient, but some
instructions still require proper alignment. And when casting unaligned
pointers to a larger data type like that, the compiler is free to assume
the pointer is aligned to that size.
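
To make that last point concrete (a sketch, bad_store/ok_store are made-up
names): the cast below promises 8 byte alignment, so the compiler may pick
STRD even when the address is odd, while a fixed-size memcpy makes no such
promise and still compiles to a single unaligned store on targets that
allow it.

#include <stdint.h>
#include <string.h>

void bad_store(unsigned char *d, uint64_t v)
{
    /* undefined behaviour if d + 1 is not 8 byte aligned; on ARM the
     * compiler is entitled to emit STRD, which needs word alignment */
    *(uint64_t *)(d + 1) = v;
}

void ok_store(unsigned char *d, uint64_t v)
{
    /* no alignment assumption; compilers lower a fixed-size memcpy
     * into whatever unaligned store the target supports */
    memcpy(d + 1, &v, sizeof(v));
}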
> Maybe it could be made not too ugly with some macro to wrap the 64bit
> stores, defined differently for arm?

Thanks, that makes it much more tolerable.
// Martin
Hi Martin,

On 9/15/21 10:27 PM, Martin Storsjo wrote:
> ARM can do 64 bit writes with the STRD instruction, but that instruction
> requires a 32 bit aligned address - while these stores are unaligned.
>
> Two consecutive stores to uint32_t* pointers can also be fused into one
> single STRD, as a uint32_t* is supposed to be properly aligned - therefore,
> do these stores as stores to volatile uint32_t* to avoid fusing them.

How about letting the compiler know that the pointers are unaligned instead?
Is the attached patch working for you?
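
The rough idea is the following (just a sketch for the list; the attached
patch is the real version and may differ in details): on a typedef, GCC and
Clang allow the aligned attribute to lower the assumed alignment, so stores
through such a type must use instruction sequences that tolerate any
address.

#include <stdint.h>

typedef uint64_t unaligned_ui64 __attribute__((aligned(1)));

void store64(unsigned char *d, uint64_t v)
{
    /* no 8 byte alignment is assumed here; the compiler has to pick
     * an unaligned-safe way to do the store */
    *(unaligned_ui64 *)(d + 1) = v;
}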
Thanks, Piotr
On Thu, 16 Sep 2021, Piotr Caban wrote:
> Hi Martin,
>
> On 9/15/21 10:27 PM, Martin Storsjo wrote:
>> [...]
>
> How about letting the compiler know that the pointers are unaligned
> instead? Is the attached patch working for you?

Thanks, that's even better!
This way the compiler has more freedom to reason about it and can choose to use another instruction with less alignment requirements (both GCC and Clang seem to compile it to use a 16 byte VST, an unaligned SIMD store instead) which probably is much better than forcing the compiler to do a long sequence of 32 bit stores.
Clang doesn't seem to know/exploit that the regular 32 bit store instructions work unaligned though, so the smaller stores get exploded into a long series of single byte writes. But I guess that's just a missed optimization opportunity in Clang, I'll see if I can report it.
// Martin
On Thu, 16 Sep 2021, Martin Storsjö wrote:
> On Thu, 16 Sep 2021, Piotr Caban wrote:
>> [...]
>
> Clang doesn't seem to know/exploit that the regular 32 bit store
> instructions work unaligned though, so the smaller stores get exploded
> into a long series of single byte writes. But I guess that's just a
> missed optimization opportunity in Clang, I'll see if I can report it.

FWIW this seems to be a target specific issue; Clang does optimize it correctly for an armv7-linux-gnueabihf target, but not for armv7-windows. I'll see about getting that fixed.
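
(If anyone wants to reproduce it, I'm just diffing the assembly of a small
unaligned-store test - e.g. the memcpy-based store sketched earlier in the
thread - between the two targets; this assumes a Clang build with the ARM
backend enabled:)

$ clang --target=armv7-linux-gnueabihf -O2 -S -o - test.c
$ clang --target=armv7-windows -O2 -S -o - test.c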
// Martin
On Thu, 16 Sep 2021, Martin Storsjö wrote:
> On Thu, 16 Sep 2021, Martin Storsjö wrote:
>> Clang doesn't seem to know/exploit that the regular 32 bit store
>> instructions work unaligned though, so the smaller stores get exploded
>> into a long series of single byte writes. But I guess that's just a
>> missed optimization opportunity in Clang, I'll see if I can report it.
>
> FWIW this seems to be a target specific issue; Clang does optimize it
> correctly for an armv7-linux-gnueabihf target, but not for armv7-windows.
> I'll see about getting that fixed.

For the record, this should have been fixed in Clang now: https://reviews.llvm.org/D109960
// Martin