For n of 16 or more we store 16 bytes at each end of the buffer, possibly overlapping, then 16 additional bytes at each end for n > 32.
A 32-byte-aligned range covering the remaining middle of the destination buffer is then filled 32 bytes at a time in a loop, as illustrated below.
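As an illustration of the alignment arithmetic (this standalone snippet is mine, not part of the patch):

/* Sketch of the alignment math: for a destination pointer p and length n
 * (n > 64 on this path), a is the distance from p to the next 32-byte
 * boundary, so p + a is 32-byte aligned.  Note a is never 0 (it is 32 for
 * an already-aligned p), which is fine because the head stores already
 * covered the first 32 bytes.  (n - a) & ~0x1f rounds the remaining length
 * down to a multiple of 32; the tail stores cover what the loop skips. */
#include <stdint.h>
#include <stdio.h>

static void show_aligned_range(uintptr_t p, size_t n)
{
    size_t a = 0x20 - (p & 0x1f);
    size_t len = (n - a) & ~(size_t)0x1f;
    printf("dst=%#llx n=%llu -> aligned loop fills %#llx..%#llx (%llu bytes)\n",
           (unsigned long long)p, (unsigned long long)n,
           (unsigned long long)(p + a), (unsigned long long)(p + a + len),
           (unsigned long long)len);
}

int main(void)
{
    show_aligned_range(0x1008, 100); /* -> aligned range 0x1020..0x1060, 64 bytes */
    return 0;
}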
Signed-off-by: Rémi Bernon <rbernon@codeweavers.com>
---
 dlls/msvcrt/string.c | 60 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 56 insertions(+), 4 deletions(-)
diff --git a/dlls/msvcrt/string.c b/dlls/msvcrt/string.c
index 4d09405094d..b470a875607 100644
--- a/dlls/msvcrt/string.c
+++ b/dlls/msvcrt/string.c
@@ -2855,13 +2855,65 @@ void * __cdecl memcpy(void *dst, const void *src, size_t n)
     return memmove(dst, src, n);
 }
+static void memset_aligned_32(unsigned char *d, uint64_t v, size_t n)
+{
+    while (n >= 32)
+    {
+        *(uint64_t *)(d + n - 32) = v;
+        *(uint64_t *)(d + n - 24) = v;
+        *(uint64_t *)(d + n - 16) = v;
+        *(uint64_t *)(d + n - 8) = v;
+        n -= 32;
+    }
+}
+
 /*********************************************************************
  *                  memset (MSVCRT.@)
  */
-void* __cdecl memset(void *dst, int c, size_t n)
-{
-    volatile unsigned char *d = dst;  /* avoid gcc optimizations */
-    while (n--) *d++ = c;
+void *__cdecl memset(void *dst, int c, size_t n)
+{
+    uint64_t v = 0x101010101010101ull * (unsigned char)c;
+    unsigned char *d = (unsigned char *)dst;
+    size_t a = 0x20 - ((uintptr_t)d & 0x1f);
+
+    if (n >= 16)
+    {
+        *(uint64_t *)(d + 0) = v;
+        *(uint64_t *)(d + 8) = v;
+        *(uint64_t *)(d + n - 16) = v;
+        *(uint64_t *)(d + n - 8) = v;
+        if (n <= 32) return dst;
+        *(uint64_t *)(d + 16) = v;
+        *(uint64_t *)(d + 24) = v;
+        *(uint64_t *)(d + n - 32) = v;
+        *(uint64_t *)(d + n - 24) = v;
+        if (n <= 64) return dst;
+        memset_aligned_32(d + a, v, (n - a) & ~0x1f);
+        return dst;
+    }
+    if (n >= 8)
+    {
+        *(uint64_t *)d = v;
+        *(uint64_t *)(d + n - 8) = v;
+        return dst;
+    }
+    if (n >= 4)
+    {
+        *(uint32_t *)d = v;
+        *(uint32_t *)(d + n - 4) = v;
+        return dst;
+    }
+    if (n >= 2)
+    {
+        *(uint16_t *)d = v;
+        *(uint16_t *)(d + n - 2) = v;
+        return dst;
+    }
+    if (n >= 1)
+    {
+        *(uint8_t *)d = v;
+        return dst;
+    }
     return dst;
 }
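A quick sanity check for this kind of change (hypothetical harness, not part of the patch): exercise every small length at every offset and verify that no byte outside the target range is touched. This covers the 1/2/4/8/16/32/64-byte boundaries and the overlapping-store cases.

#include <assert.h>
#include <string.h>

/* Fill a guard pattern, memset a sub-range, then check that exactly the
 * bytes in [off, off + n) changed and everything else kept the guard. */
static void check_memset(void)
{
    unsigned char buf[256];
    size_t off, n, i;

    for (off = 0; off < 32; off++)
        for (n = 0; n + off <= 128; n++)
        {
            memset(buf, 0xaa, sizeof(buf));
            memset(buf + off, 0x55, n);
            for (i = 0; i < sizeof(buf); i++)
                assert(buf[i] == ((i >= off && i < off + n) ? 0x55 : 0xaa));
        }
}

int main(void)
{
    check_memset();
    return 0;
}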
Signed-off-by: Rémi Bernon <rbernon@codeweavers.com>
---
 dlls/msvcrt/math.c   | 13 +++++++++++++
 dlls/msvcrt/msvcrt.h |  1 +
 dlls/msvcrt/string.c |  8 ++++++++
 3 files changed, 22 insertions(+)
diff --git a/dlls/msvcrt/math.c b/dlls/msvcrt/math.c
index 7f59a4d20d4..9974e72d78f 100644
--- a/dlls/msvcrt/math.c
+++ b/dlls/msvcrt/math.c
@@ -43,6 +43,7 @@
 #include <limits.h>
 #include <locale.h>
 #include <math.h>
+#include <intrin.h>
#include "msvcrt.h" #include "winternl.h" @@ -64,11 +65,23 @@ typedef int (CDECL *MSVCRT_matherr_func)(struct _exception *);
 static MSVCRT_matherr_func MSVCRT_default_matherr_func = NULL;
+BOOL erms_supported;
 BOOL sse2_supported;
 static BOOL sse2_enabled;
 void msvcrt_init_math( void *module )
 {
+#if defined(__i386__) || defined(__x86_64__)
+    int regs[4];
+
+    __cpuid(regs, 0);
+    if (regs[0] >= 7)
+    {
+        __cpuidex(regs, 7, 0);
+        erms_supported = ((regs[1] >> 9) & 1);
+    }
+#endif
+
     sse2_supported = IsProcessorFeaturePresent( PF_XMMI64_INSTRUCTIONS_AVAILABLE );
 #if _MSVCR_VER <=71
     sse2_enabled = FALSE;
diff --git a/dlls/msvcrt/msvcrt.h b/dlls/msvcrt/msvcrt.h
index 60f8c2f5ef2..022eced35d9 100644
--- a/dlls/msvcrt/msvcrt.h
+++ b/dlls/msvcrt/msvcrt.h
@@ -33,6 +33,7 @@
 #undef strncpy
 #undef wcsncpy
+extern BOOL erms_supported DECLSPEC_HIDDEN;
 extern BOOL sse2_supported DECLSPEC_HIDDEN;
 #define DBL80_MAX_10_EXP 4932

diff --git a/dlls/msvcrt/string.c b/dlls/msvcrt/string.c
index b470a875607..26bb9cd8ba4 100644
--- a/dlls/msvcrt/string.c
+++ b/dlls/msvcrt/string.c
@@ -2857,6 +2857,14 @@ void * __cdecl memcpy(void *dst, const void *src, size_t n)
 static void memset_aligned_32(unsigned char *d, uint64_t v, size_t n)
 {
+#if defined(__i386__) || defined(__x86_64__)
+    if (n >= 2048 && erms_supported)
+    {
+        unsigned char c = v;
+        __asm__ __volatile__ ("cld; rep; stosb" : "+D"(d), "+c"(n) : "a"(c) : "memory", "cc");
+        return;
+    }
+#endif
     while (n >= 32)
     {
         *(uint64_t *)(d + n - 32) = v;
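For what it's worth, a cutoff like the 2048 above can be sanity-checked with a crude throughput probe along these lines (illustrative only, not from the patch; it times whatever memset the build links in, and the numbers are machine-dependent):

#include <stdio.h>
#include <string.h>
#include <time.h>

int main(void)
{
    static unsigned char buf[1 << 20];
    size_t n;

    for (n = 256; n <= sizeof(buf); n *= 2)
    {
        size_t i, iters = ((size_t)1 << 28) / n; /* ~256MB written per size */
        clock_t t = clock();
        double secs;

        for (i = 0; i < iters; i++) memset(buf, (int)i, n);
        secs = (double)(clock() - t) / CLOCKS_PER_SEC;
        if (secs > 0)
            printf("n=%7lu: %6.2f GB/s\n", (unsigned long)n,
                   (double)n * iters / secs / 1e9);
    }
    return buf[0]; /* keep the stores observable */
}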
For intermediate sizes, use SSE2 stores when available.
Signed-off-by: Rémi Bernon <rbernon@codeweavers.com>
---
 dlls/msvcrt/string.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)
diff --git a/dlls/msvcrt/string.c b/dlls/msvcrt/string.c
index 26bb9cd8ba4..8a6095dda57 100644
--- a/dlls/msvcrt/string.c
+++ b/dlls/msvcrt/string.c
@@ -2864,6 +2864,32 @@ static void memset_aligned_32(unsigned char *d, uint64_t v, size_t n)
         __asm__ __volatile__ ("cld; rep; stosb" : "+D"(d), "+c"(n) : "a"(c) : "memory", "cc");
         return;
     }
+#ifdef __i386__
+    if (sse2_supported)
+#endif
+    {
+        unsigned int c = v;
+        __asm__ __volatile__ (
+            "movd %2, %%xmm0\n\t"
+            "pshufd $0, %%xmm0, %%xmm0\n\t"
+            "test $0x20, %0\n\t"
+            "je 1f\n\t"
+            "sub $0x20, %0\n\t"
+            "movdqa %%xmm0, 0x00(%1,%0)\n\t"
+            "movdqa %%xmm0, 0x10(%1,%0)\n\t"
+            "je 2f\n\t"
+            "1:\n\t"
+            "sub $0x40, %0\n\t"
+            "movdqa %%xmm0, 0x00(%1,%0)\n\t"
+            "movdqa %%xmm0, 0x10(%1,%0)\n\t"
+            "movdqa %%xmm0, 0x20(%1,%0)\n\t"
+            "movdqa %%xmm0, 0x30(%1,%0)\n\t"
+            "ja 1b\n\t"
+            "2:\n\t"
+            : "+r"(n) : "r"(d), "r"(c) : "xmm0", "memory", "cc"
+        );
+        return;
+    }
 #endif
     while (n >= 32)
     {
Hi,
While running your changed tests, I think I found new failures. Being a bot and all I'm not very good at pattern recognition, so I might be wrong, but could you please double-check?
Full results can be found at: https://testbot.winehq.org/JobDetails.pl?Key=97908
Your paranoid android.
=== debiant2 (build log) ===
../wine/dlls/msvcrt/string.c:2872:9: error: the register ‘xmm0’ cannot be clobbered in ‘asm’ for the current target
Task: The win32 Wine build failed
=== debiant2 (build log) ===
../wine/dlls/msvcrt/string.c:2872:9: error: the register ‘xmm0’ cannot be clobbered in ‘asm’ for the current target
Task: The wow32 Wine build failed
On 9/14/21 12:15 PM, Marvin wrote:
> Hi,
>
> While running your changed tests, I think I found new failures. Being a bot and all I'm not very good at pattern recognition, so I might be wrong, but could you please double-check?
>
> Full results can be found at: https://testbot.winehq.org/JobDetails.pl?Key=97908
>
> Your paranoid android.
>
> === debiant2 (build log) ===
> ../wine/dlls/msvcrt/string.c:2872:9: error: the register ‘xmm0’ cannot be clobbered in ‘asm’ for the current target
> Task: The win32 Wine build failed
>
> === debiant2 (build log) ===
> ../wine/dlls/msvcrt/string.c:2872:9: error: the register ‘xmm0’ cannot be clobbered in ‘asm’ for the current target
> Task: The wow32 Wine build failed
Interesting...
I guess we can either:
* guard it with #ifdef __SSE2__, and in that case now that we import x86intrin.h we could use intel intrinsics instead of inline assembly,
* or make it an external assembly function.
On 9/14/21 12:55 PM, Rémi Bernon wrote:
> On 9/14/21 12:15 PM, Marvin wrote:
> - guard it with #ifdef __SSE2__, and in that case now that we import
> x86intrin.h we could use intel intrinsics instead of inline assembly,
I didn't check that __SSE2__ is not defined when i686-w64-mingw is used. I think that the SSE enabled version should be available in this case.
> - or make it an external assembly function.
We can also consider moving the ERMS version to a separate function to avoid using inline assembly.
On 9/14/21 1:10 PM, Piotr Caban wrote:
> On 9/14/21 12:55 PM, Rémi Bernon wrote:
>> On 9/14/21 12:15 PM, Marvin wrote:
>> - guard it with #ifdef __SSE2__, and in that case now that we import
>> x86intrin.h we could use intel intrinsics instead of inline assembly,
> I didn't check that __SSE2__ is not defined when i686-w64-mingw is used. I think that the SSE enabled version should be available in this case.
It may be defined, and it is for me when I build locally, but not on the testbot for some reason. You can have the same result with -mno-sse.
>> - or make it an external assembly function.
> We can also consider moving the ERMS version to a separate function to avoid using inline assembly.
That too. The call has an overhead, though for large sizes that may not matter much.
On 9/14/21 12:49 PM, Piotr Caban wrote:
> On 9/14/21 11:05 AM, Rémi Bernon wrote:
>> +#ifdef __i386__
> #if defined(__i386__) && defined(__SSE2__)
>> +    if (sse2_supported)
>> +#endif
>> +    {
>> +        unsigned int c = v;
>> +        __asm__ __volatile__ (
> Thanks, Piotr
It doesn't seem to be enough: the inline assembly statement also needs to be guarded when SSE is disabled.
And what about using intel intrinsics? Like for instance:
#ifdef __SSE2__
#ifdef __i386__
    if (sse2_supported)
#endif
    {
        __m128i x = _mm_set1_epi64x(v);
        while (n >= 64)
        {
            _mm_store_si128((__m128i *)(d + n - 64), x);
            _mm_store_si128((__m128i *)(d + n - 48), x);
            _mm_store_si128((__m128i *)(d + n - 32), x);
            _mm_store_si128((__m128i *)(d + n - 16), x);
            n -= 64;
        }
        if (n >= 32)
        {
            _mm_store_si128((__m128i *)(d + n - 32), x);
            _mm_store_si128((__m128i *)(d + n - 16), x);
        }
        return;
    }
#endif
In all cases, if SSE is disabled at compile time, the SSE2 path cannot be used at runtime even when the CPU supports it, which was possible with the assembly function.
Is this something we would like to have?
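A rough sketch of one more option (untested; the helper name is illustrative, not from the patches): GCC and clang accept a per-function target attribute, which would keep the SSE2 helper compiled even under -mno-sse, so the sse2_supported runtime check could still select it.

#include <stddef.h>
#include <stdint.h>
#include <emmintrin.h>

#if defined(__GNUC__) && !defined(__SSE2__)
/* Compile just this helper with SSE2 enabled, regardless of the global
 * flags; callers must only reach it behind the sse2_supported check. */
static void __attribute__((target("sse2")))
memset_sse2(unsigned char *d, uint64_t v, size_t n)
{
    __m128i x = _mm_set1_epi64x(v);
    while (n >= 32)
    {
        _mm_store_si128((__m128i *)(d + n - 32), x);
        _mm_store_si128((__m128i *)(d + n - 16), x);
        n -= 32;
    }
}
#endif

Whether mingw's intrinsic headers cooperate with this under -mno-sse would need checking.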
On 9/14/21 1:01 PM, Rémi Bernon wrote:
> And what about using intel intrinsics? Like for instance:
>
> #ifdef __SSE2__
> #ifdef __i386__
>     if (sse2_supported)
> #endif
>     {
>         __m128i x = _mm_set1_epi64x(v);
>         while (n >= 64)
>         {
>             _mm_store_si128((__m128i *)(d + n - 64), x);
>             _mm_store_si128((__m128i *)(d + n - 48), x);
>             _mm_store_si128((__m128i *)(d + n - 32), x);
>             _mm_store_si128((__m128i *)(d + n - 16), x);
>             n -= 64;
>         }
>         if (n >= 32)
>         {
>             _mm_store_si128((__m128i *)(d + n - 32), x);
>             _mm_store_si128((__m128i *)(d + n - 16), x);
>         }
>         return;
>     }
> #endif
>
> In all cases, if SSE is disabled at compile time, the SSE2 path cannot be used at runtime even when the CPU supports it, which was possible with the assembly function.
>
> Is this something we would like to have?
I don't think this is portable. A quick test shows that it doesn't compile with x86_64-w64-mingw on my machine.
Thanks, Piotr