This is a preparation for supporting more xstate features in the contexts.
The remaining parts are: - support arbitrary xstate in usr1_handler(); - support arbitrary xstate in the server contexts (probably transferring only the present part); - support arbitrary xstate in context manipulation functions; - enabling more xstates in system.c and in user shared data.
-- v2: ntdll: Don't hardcode xstate size in exception stack layout. ntdll: Don't hardcode xstate size in syscall frame. ntdll: Don't hardcode xstate feature mask.
From: Paul Gofman pgofman@codeweavers.com
--- dlls/ntdll/unix/signal_i386.c | 61 +++++++++++++++++---------------- dlls/ntdll/unix/signal_x86_64.c | 49 ++++++++++++++------------ dlls/ntdll/unix/system.c | 4 +++ dlls/ntdll/unix/unix_private.h | 9 +++++ 4 files changed, 72 insertions(+), 51 deletions(-)
diff --git a/dlls/ntdll/unix/signal_i386.c b/dlls/ntdll/unix/signal_i386.c index cd6417b57b4..d634e46a80f 100644 --- a/dlls/ntdll/unix/signal_i386.c +++ b/dlls/ntdll/unix/signal_i386.c @@ -525,6 +525,7 @@ struct x86_thread_data UINT dr7; /* 1f0 */ SYSTEM_SERVICE_TABLE *syscall_table; /* 1f4 syscall table */ struct syscall_frame *syscall_frame; /* 1f8 frame pointer on syscall entry */ + UINT64 xstate_features_mask; /* 1fc */ };
C_ASSERT( sizeof(struct x86_thread_data) <= sizeof(((struct ntdll_thread_data *)0)->cpu_data) ); @@ -609,8 +610,6 @@ struct xcontext ULONG64 host_compaction_mask; };
-extern BOOL xstate_compaction_enabled; - static inline XSTATE *xstate_from_context( const CONTEXT *context ) { CONTEXT_EX *xctx = (CONTEXT_EX *)(context + 1); @@ -832,7 +831,7 @@ static inline void save_context( struct xcontext *xcontext, const ucontext_t *si context->ContextFlags |= CONTEXT_FLOATING_POINT | CONTEXT_EXTENDED_REGISTERS; memcpy( context->ExtendedRegisters, fpux, sizeof(*fpux) ); if (!fpu) fpux_to_fpu( &context->FloatSave, fpux ); - if ((cpu_info.ProcessorFeatureBits & CPU_FEATURE_AVX) && (xs = XState_sig(fpux))) + if (xstate_extended_features() && (xs = XState_sig(fpux))) { context_init_xstate( context, xs ); xcontext->host_compaction_mask = xs->CompactionMask; @@ -936,7 +935,7 @@ NTSTATUS WINAPI NtSetContextThread( HANDLE handle, const CONTEXT *context ) DWORD flags = context->ContextFlags & ~CONTEXT_i386; BOOL self = (handle == GetCurrentThread());
- if ((flags & CONTEXT_XSTATE) && (cpu_info.ProcessorFeatureBits & CPU_FEATURE_AVX)) + if ((flags & CONTEXT_XSTATE) && xstate_extended_features()) { CONTEXT_EX *context_ex = (CONTEXT_EX *)(context + 1); XSTATE *xs = (XSTATE *)((char *)context_ex + context_ex->XState.Offset); @@ -944,7 +943,7 @@ NTSTATUS WINAPI NtSetContextThread( HANDLE handle, const CONTEXT *context ) if (context_ex->XState.Length < offsetof(XSTATE, YmmContext) || context_ex->XState.Length > sizeof(XSTATE)) return STATUS_INVALID_PARAMETER; - if ((xs->Mask & XSTATE_MASK_GSSE) && (context_ex->XState.Length < sizeof(XSTATE))) + if ((xs->Mask & xstate_extended_features()) && (context_ex->XState.Length < sizeof(XSTATE))) return STATUS_BUFFER_OVERFLOW; } else flags &= ~CONTEXT_XSTATE; @@ -1138,7 +1137,7 @@ NTSTATUS WINAPI NtGetContextThread( HANDLE handle, CONTEXT *context )
context->ContextFlags |= CONTEXT_EXTENDED_REGISTERS; } - if ((needed_flags & CONTEXT_XSTATE) && (cpu_info.ProcessorFeatureBits & CPU_FEATURE_AVX)) + if ((needed_flags & CONTEXT_XSTATE) && xstate_extended_features()) { CONTEXT_EX *context_ex = (CONTEXT_EX *)(context + 1); XSTATE *xstate = (XSTATE *)((char *)context_ex + context_ex->XState.Offset); @@ -1148,7 +1147,7 @@ NTSTATUS WINAPI NtGetContextThread( HANDLE handle, CONTEXT *context ) || context_ex->XState.Length > sizeof(XSTATE)) return STATUS_INVALID_PARAMETER;
- mask = (xstate_compaction_enabled ? xstate->CompactionMask : xstate->Mask) & XSTATE_MASK_GSSE; + mask = (xstate_compaction_enabled ? xstate->CompactionMask : xstate->Mask) & xstate_extended_features(); xstate->Mask = frame->xstate.Mask & mask; xstate->CompactionMask = xstate_compaction_enabled ? (0x8000000000000000 | mask) : 0; memset( xstate->Reserved, 0, sizeof(xstate->Reserved) ); @@ -1485,7 +1484,7 @@ static void setup_raise_exception( ucontext_t *sigcontext, void *stack_ptr,
context_init_xstate( &stack->context, dst_xs ); memset( dst_xs, 0, offsetof(XSTATE, YmmContext) ); - dst_xs->CompactionMask = xstate_compaction_enabled ? 0x8000000000000004 : 0; + dst_xs->CompactionMask = xstate_compaction_enabled ? 0x8000000000000000 | xstate_extended_features() : 0; if (src_xs->Mask & 4) { dst_xs->Mask = 4; @@ -1587,7 +1586,7 @@ NTSTATUS call_user_exception_dispatcher( EXCEPTION_RECORD *rec, CONTEXT *context
context_init_xstate( &stack->context, dst_xs ); memset( dst_xs, 0, offsetof(XSTATE, YmmContext) ); - dst_xs->CompactionMask = xstate_compaction_enabled ? 0x8000000000000004 : 0; + dst_xs->CompactionMask = xstate_compaction_enabled ? 0x8000000000000000 | xstate_extended_features() : 0; if (src_xs->Mask & 4) { dst_xs->Mask = 4; @@ -2481,6 +2480,7 @@ void call_init_thunk( LPTHREAD_START_ROUTINE entry, void *arg, BOOL suspend, TEB ldt_set_fs( thread_data->fs, teb ); thread_data->gs = get_gs(); thread_data->syscall_table = KeServiceDescriptorTable; + thread_data->xstate_features_mask = xstate_supported_features_mask;
context.SegCs = get_cs(); context.SegDs = get_ds(); @@ -2504,6 +2504,8 @@ void call_init_thunk( LPTHREAD_START_ROUTINE entry, void *arg, BOOL suspend, TEB *ctx = context; ctx->ContextFlags = CONTEXT_FULL | CONTEXT_FLOATING_POINT | CONTEXT_EXTENDED_REGISTERS; memset( frame, 0, sizeof(*frame) ); + if (xstate_compaction_enabled) + frame->xstate.CompactionMask = 0x8000000000000000 | xstate_supported_features_mask; NtSetContextThread( GetCurrentThread(), ctx );
stack = (DWORD *)ctx; @@ -2605,26 +2607,27 @@ __ASM_GLOBAL_FUNC( __wine_syscall_dispatcher, "addl %fs:0x1f4,%ebx\n\t" /* x86_thread_data()->syscall_table */ "testl $3,(%ecx)\n\t" /* frame->syscall_flags & (SYSCALL_HAVE_XSAVE | SYSCALL_HAVE_XSAVEC) */ "jz 2f\n\t" - "movl $7,%eax\n\t" - "xorl %edx,%edx\n\t" - "movl %edx,0x240(%ecx)\n\t" - "movl %edx,0x244(%ecx)\n\t" - "movl %edx,0x248(%ecx)\n\t" - "movl %edx,0x24c(%ecx)\n\t" - "movl %edx,0x250(%ecx)\n\t" - "movl %edx,0x254(%ecx)\n\t" + "movl %fs:0x1fc,%eax\n\t" /* x86_thread_data()->xstate_features_mask */ + "movl %fs:0x200,%edx\n\t" /* x86_thread_data()->xstate_features_mask high dword */ + "xorl %edi,%edi\n\t" + "movl %edi,0x240(%ecx)\n\t" + "movl %edi,0x244(%ecx)\n\t" + "movl %edi,0x248(%ecx)\n\t" + "movl %edi,0x24c(%ecx)\n\t" + "movl %edi,0x250(%ecx)\n\t" + "movl %edi,0x254(%ecx)\n\t" "testl $2,(%ecx)\n\t" /* frame->syscall_flags & SYSCALL_HAVE_XSAVEC */ "jz 1f\n\t" - "movl %edx,0x258(%ecx)\n\t" - "movl %edx,0x25c(%ecx)\n\t" - "movl %edx,0x260(%ecx)\n\t" - "movl %edx,0x264(%ecx)\n\t" - "movl %edx,0x268(%ecx)\n\t" - "movl %edx,0x26c(%ecx)\n\t" - "movl %edx,0x270(%ecx)\n\t" - "movl %edx,0x274(%ecx)\n\t" - "movl %edx,0x278(%ecx)\n\t" - "movl %edx,0x27c(%ecx)\n\t" + "movl %edi,0x258(%ecx)\n\t" + "movl %edi,0x25c(%ecx)\n\t" + "movl %edi,0x260(%ecx)\n\t" + "movl %edi,0x264(%ecx)\n\t" + "movl %edi,0x268(%ecx)\n\t" + "movl %edi,0x26c(%ecx)\n\t" + "movl %edi,0x270(%ecx)\n\t" + "movl %edi,0x274(%ecx)\n\t" + "movl %edi,0x278(%ecx)\n\t" + "movl %edi,0x27c(%ecx)\n\t" /* The xsavec instruction is not supported by * binutils < 2.25. */ ".byte 0x0f, 0xc7, 0x61, 0x40\n\t" /* xsavec 0x40(%ecx) */ @@ -2669,8 +2672,8 @@ __ASM_GLOBAL_FUNC( __wine_syscall_dispatcher, "testl $3,%ecx\n\t" /* SYSCALL_HAVE_XSAVE | SYSCALL_HAVE_XSAVEC */ "jz 1f\n\t" "movl %eax,%esi\n\t" - "movl $7,%eax\n\t" - "xorl %edx,%edx\n\t" + "movl %fs:0x1fc,%eax\n\t" /* x86_thread_data()->xstate_features_mask */ + "movl %fs:0x200,%edx\n\t" /* x86_thread_data()->xstate_features_mask high dword */ "xrstor 0x40(%esp)\n\t" "movl %esi,%eax\n\t" "jmp 3f\n" diff --git a/dlls/ntdll/unix/signal_x86_64.c b/dlls/ntdll/unix/signal_x86_64.c index 405ac49e5a3..dff37a9ce7e 100644 --- a/dlls/ntdll/unix/signal_x86_64.c +++ b/dlls/ntdll/unix/signal_x86_64.c @@ -441,6 +441,8 @@ struct amd64_thread_data struct syscall_frame *syscall_frame; /* 0328 syscall frame pointer */ SYSTEM_SERVICE_TABLE *syscall_table; /* 0330 syscall table */ DWORD fs; /* 0338 WOW TEB selector */ + DWORD align; + UINT64 xstate_features_mask; /* 0340 */ };
C_ASSERT( sizeof(struct amd64_thread_data) <= sizeof(((struct ntdll_thread_data *)0)->cpu_data) ); @@ -448,6 +450,7 @@ C_ASSERT( offsetof( TEB, GdiTebBatch ) + offsetof( struct amd64_thread_data, pth C_ASSERT( offsetof( TEB, GdiTebBatch ) + offsetof( struct amd64_thread_data, syscall_frame ) == 0x328 ); C_ASSERT( offsetof( TEB, GdiTebBatch ) + offsetof( struct amd64_thread_data, syscall_table ) == 0x330 ); C_ASSERT( offsetof( TEB, GdiTebBatch ) + offsetof( struct amd64_thread_data, fs ) == 0x338 ); +C_ASSERT( offsetof( TEB, GdiTebBatch ) + offsetof( struct amd64_thread_data, xstate_features_mask ) == 0x340 );
static inline struct amd64_thread_data *amd64_thread_data(void) { @@ -477,8 +480,6 @@ struct xcontext ULONG64 host_compaction_mask; };
-extern BOOL xstate_compaction_enabled; - static inline XSTATE *xstate_from_context( const CONTEXT *context ) { CONTEXT_EX *xctx = (CONTEXT_EX *)(context + 1); @@ -898,7 +899,7 @@ static void save_context( struct xcontext *xcontext, const ucontext_t *sigcontex context->ContextFlags |= CONTEXT_FLOATING_POINT; context->FltSave = *FPU_sig(sigcontext); context->MxCsr = context->FltSave.MxCsr; - if ((cpu_info.ProcessorFeatureBits & CPU_FEATURE_AVX) && (xs = XState_sig(FPU_sig(sigcontext)))) + if (xstate_extended_features() && (xs = XState_sig(FPU_sig(sigcontext)))) { /* xcontext and sigcontext are both on the signal stack, so we can * just reference sigcontext without overflowing 32 bit XState.Offset */ @@ -928,7 +929,7 @@ static void restore_context( const struct xcontext *xcontext, ucontext_t *sigcon amd64_thread_data()->dr7 = context->Dr7; set_sigcontext( context, sigcontext ); if (FPU_sig(sigcontext)) *FPU_sig(sigcontext) = context->FltSave; - if ((cpu_info.ProcessorFeatureBits & CPU_FEATURE_AVX) && (xs = XState_sig(FPU_sig(sigcontext)))) + if (xstate_extended_features() && (xs = XState_sig(FPU_sig(sigcontext)))) xs->CompactionMask = xcontext->host_compaction_mask; leave_handler( sigcontext ); } @@ -977,7 +978,7 @@ NTSTATUS WINAPI NtSetContextThread( HANDLE handle, const CONTEXT *context ) BOOL self = (handle == GetCurrentThread()); struct syscall_frame *frame = amd64_thread_data()->syscall_frame;
- if ((flags & CONTEXT_XSTATE) && (cpu_info.ProcessorFeatureBits & CPU_FEATURE_AVX)) + if ((flags & CONTEXT_XSTATE) && xstate_extended_features()) { CONTEXT_EX *context_ex = (CONTEXT_EX *)(context + 1); XSTATE *xs = (XSTATE *)((char *)context_ex + context_ex->XState.Offset); @@ -985,7 +986,7 @@ NTSTATUS WINAPI NtSetContextThread( HANDLE handle, const CONTEXT *context ) if (context_ex->XState.Length < offsetof(XSTATE, YmmContext) || context_ex->XState.Length > sizeof(XSTATE)) return STATUS_INVALID_PARAMETER; - if ((xs->Mask & XSTATE_MASK_GSSE) && (context_ex->XState.Length < sizeof(XSTATE))) + if ((xs->Mask & xstate_extended_features()) && (context_ex->XState.Length < sizeof(XSTATE))) return STATUS_BUFFER_OVERFLOW; } else flags &= ~CONTEXT_XSTATE; @@ -1155,7 +1156,7 @@ NTSTATUS WINAPI NtGetContextThread( HANDLE handle, CONTEXT *context ) context->MxCsr = context->FltSave.MxCsr; context->ContextFlags |= CONTEXT_FLOATING_POINT; } - if ((needed_flags & CONTEXT_XSTATE) && (cpu_info.ProcessorFeatureBits & CPU_FEATURE_AVX)) + if ((needed_flags & CONTEXT_XSTATE) && xstate_extended_features()) { CONTEXT_EX *context_ex = (CONTEXT_EX *)(context + 1); XSTATE *xstate = (XSTATE *)((char *)context_ex + context_ex->XState.Offset); @@ -1165,7 +1166,7 @@ NTSTATUS WINAPI NtGetContextThread( HANDLE handle, CONTEXT *context ) || context_ex->XState.Length > sizeof(XSTATE)) return STATUS_INVALID_PARAMETER;
- mask = (xstate_compaction_enabled ? xstate->CompactionMask : xstate->Mask) & XSTATE_MASK_GSSE; + mask = (xstate_compaction_enabled ? xstate->CompactionMask : xstate->Mask) & xstate_extended_features(); xstate->Mask = frame->xstate.Mask & mask; xstate->CompactionMask = xstate_compaction_enabled ? (0x8000000000000000 | mask) : 0; memset( xstate->Reserved, 0, sizeof(xstate->Reserved) ); @@ -1377,7 +1378,7 @@ NTSTATUS get_thread_wow64_context( HANDLE handle, void *ctx, ULONG size ) context_ex->XState.Length > sizeof(XSTATE)) return STATUS_INVALID_PARAMETER;
- mask = (xstate_compaction_enabled ? xstate->CompactionMask : xstate->Mask) & XSTATE_MASK_GSSE; + mask = (xstate_compaction_enabled ? xstate->CompactionMask : xstate->Mask) & xstate_extended_features(); xstate->Mask = frame->xstate.Mask & mask; xstate->CompactionMask = xstate_compaction_enabled ? (0x8000000000000000 | mask) : 0; memset( xstate->Reserved, 0, sizeof(xstate->Reserved) ); @@ -1442,7 +1443,7 @@ static void setup_raise_exception( ucontext_t *sigcontext, EXCEPTION_RECORD *rec assert( !((ULONG_PTR)&stack->xstate & 63) ); context_init_xstate( &stack->context, &stack->xstate ); memset( &stack->xstate, 0, offsetof(XSTATE, YmmContext) ); - stack->xstate.CompactionMask = xstate_compaction_enabled ? 0x8000000000000004 : 0; + stack->xstate.CompactionMask = xstate_compaction_enabled ? 0x8000000000000000 | xstate_extended_features() : 0; if (src_xs->Mask & 4) { stack->xstate.Mask = 4; @@ -2479,6 +2480,7 @@ void call_init_thunk( LPTHREAD_START_ROUTINE entry, void *arg, BOOL suspend, TEB I386_CONTEXT *wow_context;
thread_data->syscall_table = KeServiceDescriptorTable; + thread_data->xstate_features_mask = xstate_supported_features_mask;
#if defined __linux__ arch_prctl( ARCH_SET_GS, teb ); @@ -2539,6 +2541,8 @@ void call_init_thunk( LPTHREAD_START_ROUTINE entry, void *arg, BOOL suspend, TEB *ctx = context; ctx->ContextFlags = CONTEXT_FULL; memset( frame, 0, sizeof(*frame) ); + if (xstate_compaction_enabled) + frame->xstate.CompactionMask = 0x8000000000000000 | xstate_supported_features_mask; NtSetContextThread( GetCurrentThread(), ctx );
frame->cs = cs64_sel; @@ -2636,18 +2640,19 @@ __ASM_GLOBAL_FUNC( __wine_syscall_dispatcher, "movl 0xb0(%rcx),%r14d\n\t" /* frame->syscall_flags */ "testl $3,%r14d\n\t" /* SYSCALL_HAVE_XSAVE | SYSCALL_HAVE_XSAVEC */ "jz 2f\n\t" - "movl $7,%eax\n\t" - "xorl %edx,%edx\n\t" - "movq %rdx,0x2c0(%rcx)\n\t" - "movq %rdx,0x2c8(%rcx)\n\t" - "movq %rdx,0x2d0(%rcx)\n\t" + "movl %gs:0x340,%eax\n\t" /* amd64_thread_data()->xstate_features_mask */ + "movl %gs:0x344,%edx\n\t" /* amd64_thread_data()->xstate_features_mask high dword */ + "xorq %rbp,%rbp\n\t" + "movq %rbp,0x2c0(%rcx)\n\t" + "movq %rbp,0x2c8(%rcx)\n\t" + "movq %rbp,0x2d0(%rcx)\n\t" "testl $2,%r14d\n\t" /* SYSCALL_HAVE_XSAVEC */ "jz 1f\n\t" - "movq %rdx,0x2d8(%rcx)\n\t" - "movq %rdx,0x2e0(%rcx)\n\t" - "movq %rdx,0x2e8(%rcx)\n\t" - "movq %rdx,0x2f0(%rcx)\n\t" - "movq %rdx,0x2f8(%rcx)\n\t" + "movq %rbp,0x2d8(%rcx)\n\t" + "movq %rbp,0x2e0(%rcx)\n\t" + "movq %rbp,0x2e8(%rcx)\n\t" + "movq %rbp,0x2f0(%rcx)\n\t" + "movq %rbp,0x2f8(%rcx)\n\t" /* The xsavec instruction is not supported by * binutils < 2.25. */ ".byte 0x48, 0x0f, 0xc7, 0xa1, 0xc0, 0x00, 0x00, 0x00\n\t" /* xsavec64 0xc0(%rcx) */ @@ -2749,8 +2754,8 @@ __ASM_GLOBAL_FUNC( __wine_syscall_dispatcher, "2:\ttestl $3,%r14d\n\t" /* SYSCALL_HAVE_XSAVE | SYSCALL_HAVE_XSAVEC */ "jz 3f\n\t" "movq %rax,%r11\n\t" - "movl $7,%eax\n\t" - "xorl %edx,%edx\n\t" + "movl %gs:0x340,%eax\n\t" /* amd64_thread_data()->xstate_features_mask */ + "movl %gs:0x344,%edx\n\t" /* amd64_thread_data()->xstate_features_mask high dword */ "xrstor64 0xc0(%rcx)\n\t" "movq %r11,%rax\n\t" "movl 0xb4(%rcx),%edx\n\t" /* frame->restore_flags */ diff --git a/dlls/ntdll/unix/system.c b/dlls/ntdll/unix/system.c index d2df1dd50c2..c3ffdc6921e 100644 --- a/dlls/ntdll/unix/system.c +++ b/dlls/ntdll/unix/system.c @@ -247,6 +247,7 @@ static pthread_mutex_t timezone_mutex = PTHREAD_MUTEX_INITIALIZER; #if defined(__i386__) || defined(__x86_64__)
BOOL xstate_compaction_enabled = FALSE; +UINT64 xstate_supported_features_mask;
#define AUTH 0x68747541 /* "Auth" */ #define ENTI 0x69746e65 /* "enti" */ @@ -396,6 +397,9 @@ static void get_cpuinfo( SYSTEM_CPU_INFORMATION *info ) { do_cpuid( 0x0000000d, 1, regs3 ); /* get XSAVE details */ if (regs3[0] & 2) xstate_compaction_enabled = TRUE; + xstate_supported_features_mask = 3; + if (features & CPU_FEATURE_AVX) + xstate_supported_features_mask |= (UINT64)1 << XSTATE_AVX; }
if (regs[1] == AUTH && regs[3] == ENTI && regs[2] == CAMD) diff --git a/dlls/ntdll/unix/unix_private.h b/dlls/ntdll/unix/unix_private.h index 07f2724eac7..75c81b21f84 100644 --- a/dlls/ntdll/unix/unix_private.h +++ b/dlls/ntdll/unix/unix_private.h @@ -212,6 +212,15 @@ extern int server_pipe( int fd[2] );
extern void fpux_to_fpu( I386_FLOATING_SAVE_AREA *fpu, const XSAVE_FORMAT *fpux ); extern void fpu_to_fpux( XSAVE_FORMAT *fpux, const I386_FLOATING_SAVE_AREA *fpu ); + +extern BOOL xstate_compaction_enabled; +extern UINT64 xstate_supported_features_mask; + +static inline UINT64 xstate_extended_features(void) +{ + return xstate_supported_features_mask & ~(UINT64)3; +} + extern void *get_cpu_area( USHORT machine ); extern void set_thread_id( TEB *teb, DWORD pid, DWORD tid ); extern NTSTATUS init_thread_stack( TEB *teb, ULONG_PTR limit, SIZE_T reserve_size, SIZE_T commit_size );
From: Paul Gofman pgofman@codeweavers.com
--- dlls/ntdll/unix/signal_i386.c | 50 +++++++++---------- dlls/ntdll/unix/signal_x86_64.c | 85 +++++++++++++++++---------------- dlls/ntdll/unix/system.c | 19 ++++++++ dlls/ntdll/unix/unix_private.h | 3 ++ include/winnt.h | 8 ++++ 5 files changed, 99 insertions(+), 66 deletions(-)
diff --git a/dlls/ntdll/unix/signal_i386.c b/dlls/ntdll/unix/signal_i386.c index d634e46a80f..515cfe5a212 100644 --- a/dlls/ntdll/unix/signal_i386.c +++ b/dlls/ntdll/unix/signal_i386.c @@ -508,10 +508,10 @@ struct syscall_frame /* Leave space for the whole set of YMM registers. They're not used in * 32-bit mode, but some processors fault if they're not in writable memory. */ - DECLSPEC_ALIGN(64) XSTATE xstate; /* 240 */ + DECLSPEC_ALIGN(64) XSAVE_AREA_HEADER xstate; /* 240 */ };
-C_ASSERT( sizeof(struct syscall_frame) == 0x380 ); +C_ASSERT( sizeof(struct syscall_frame) == 0x280 );
struct x86_thread_data { @@ -526,12 +526,14 @@ struct x86_thread_data SYSTEM_SERVICE_TABLE *syscall_table; /* 1f4 syscall table */ struct syscall_frame *syscall_frame; /* 1f8 frame pointer on syscall entry */ UINT64 xstate_features_mask; /* 1fc */ + UINT xstate_features_size; /* 204 */ };
C_ASSERT( sizeof(struct x86_thread_data) <= sizeof(((struct ntdll_thread_data *)0)->cpu_data) ); C_ASSERT( offsetof( TEB, GdiTebBatch ) + offsetof( struct x86_thread_data, gs ) == 0x1d8 ); C_ASSERT( offsetof( TEB, GdiTebBatch ) + offsetof( struct x86_thread_data, syscall_table ) == 0x1f4 ); C_ASSERT( offsetof( TEB, GdiTebBatch ) + offsetof( struct x86_thread_data, syscall_frame ) == 0x1f8 ); +C_ASSERT( offsetof( TEB, GdiTebBatch ) + offsetof( struct x86_thread_data, xstate_features_size ) == 0x204 );
/* flags to control the behavior of the syscall dispatcher */ #define SYSCALL_HAVE_XSAVE 1 @@ -938,12 +940,12 @@ NTSTATUS WINAPI NtSetContextThread( HANDLE handle, const CONTEXT *context ) if ((flags & CONTEXT_XSTATE) && xstate_extended_features()) { CONTEXT_EX *context_ex = (CONTEXT_EX *)(context + 1); - XSTATE *xs = (XSTATE *)((char *)context_ex + context_ex->XState.Offset); + XSAVE_AREA_HEADER *xs = (XSAVE_AREA_HEADER *)((char *)context_ex + context_ex->XState.Offset);
- if (context_ex->XState.Length < offsetof(XSTATE, YmmContext) || - context_ex->XState.Length > sizeof(XSTATE)) + if (context_ex->XState.Length < sizeof(XSAVE_AREA_HEADER) || + context_ex->XState.Length > sizeof(XSAVE_AREA_HEADER) + xstate_features_size) return STATUS_INVALID_PARAMETER; - if ((xs->Mask & xstate_extended_features()) && (context_ex->XState.Length < sizeof(XSTATE))) + if ((xs->Mask & xstate_extended_features()) && (context_ex->XState.Length < xstate_get_size( xs, xs->Mask ))) return STATUS_BUFFER_OVERFLOW; } else flags &= ~CONTEXT_XSTATE; @@ -1019,14 +1021,9 @@ NTSTATUS WINAPI NtSetContextThread( HANDLE handle, const CONTEXT *context ) if (flags & CONTEXT_XSTATE) { CONTEXT_EX *context_ex = (CONTEXT_EX *)(context + 1); - XSTATE *xs = (XSTATE *)((char *)context_ex + context_ex->XState.Offset); + XSAVE_AREA_HEADER *xs = (XSAVE_AREA_HEADER *)((char *)context_ex + context_ex->XState.Offset);
- if (xs->Mask & XSTATE_MASK_GSSE) - { - frame->xstate.Mask |= XSTATE_MASK_GSSE; - frame->xstate.YmmContext = xs->YmmContext; - } - else frame->xstate.Mask &= ~XSTATE_MASK_GSSE; + copy_xstate( &frame->xstate, xs, xs->Mask ); }
frame->restore_flags |= flags & ~CONTEXT_INTEGER; @@ -1140,21 +1137,21 @@ NTSTATUS WINAPI NtGetContextThread( HANDLE handle, CONTEXT *context ) if ((needed_flags & CONTEXT_XSTATE) && xstate_extended_features()) { CONTEXT_EX *context_ex = (CONTEXT_EX *)(context + 1); - XSTATE *xstate = (XSTATE *)((char *)context_ex + context_ex->XState.Offset); - unsigned int mask; + XSAVE_AREA_HEADER *xstate = (XSAVE_AREA_HEADER *)((char *)context_ex + context_ex->XState.Offset); + UINT64 mask;
- if (context_ex->XState.Length < offsetof(XSTATE, YmmContext) - || context_ex->XState.Length > sizeof(XSTATE)) + if (context_ex->XState.Length < sizeof(XSAVE_AREA_HEADER) || + context_ex->XState.Length > sizeof(XSAVE_AREA_HEADER) + xstate_features_size) return STATUS_INVALID_PARAMETER;
mask = (xstate_compaction_enabled ? xstate->CompactionMask : xstate->Mask) & xstate_extended_features(); xstate->Mask = frame->xstate.Mask & mask; xstate->CompactionMask = xstate_compaction_enabled ? (0x8000000000000000 | mask) : 0; - memset( xstate->Reserved, 0, sizeof(xstate->Reserved) ); + memset( xstate->Reserved2, 0, sizeof(xstate->Reserved2) ); if (xstate->Mask) { - if (context_ex->XState.Length < sizeof(XSTATE)) return STATUS_BUFFER_OVERFLOW; - xstate->YmmContext = frame->xstate.YmmContext; + if (context_ex->XState.Length < xstate_get_size( xstate, xstate->Mask )) return STATUS_BUFFER_OVERFLOW; + copy_xstate( xstate, &frame->xstate, xstate->Mask ); } } /* update the cached version of the debug registers */ @@ -1483,7 +1480,7 @@ static void setup_raise_exception( ucontext_t *sigcontext, void *stack_ptr, XSTATE *dst_xs = (XSTATE *)(((ULONG_PTR)stack->xstate + 63) & ~63);
context_init_xstate( &stack->context, dst_xs ); - memset( dst_xs, 0, offsetof(XSTATE, YmmContext) ); + memset( dst_xs, 0, sizeof(XSAVE_AREA_HEADER) ); dst_xs->CompactionMask = xstate_compaction_enabled ? 0x8000000000000000 | xstate_extended_features() : 0; if (src_xs->Mask & 4) { @@ -1623,7 +1620,8 @@ __ASM_GLOBAL_FUNC( call_user_mode_callback, __ASM_CFI(".cfi_rel_offset %edi,-12\n\t") "movl 0x18(%ebp),%edx\n\t" /* teb */ "pushl 0(%edx)\n\t" /* teb->Tib.ExceptionList */ - "subl $0x380,%esp\n\t" /* sizeof(struct syscall_frame) */ + "subl $0x280,%esp\n\t" /* sizeof(struct syscall_frame) */ + "subl %fs:0x204,%esp\n\t" /* x86_thread_data()->xstate_features_size */ "andl $~63,%esp\n\t" "leal 8(%ebp),%eax\n\t" "movl %eax,0x38(%esp)\n\t" /* frame->syscall_cfa */ @@ -2403,6 +2401,7 @@ NTSTATUS signal_alloc_thread( TEB *teb ) else thread_data->fs = gdt_fs_sel;
teb->WOW32Reserved = __wine_syscall_dispatcher; + thread_data->xstate_features_size = xstate_features_size; return STATUS_SUCCESS; }
@@ -2431,7 +2430,8 @@ void signal_init_process(void) struct sigaction sig_act; void *kernel_stack = (char *)ntdll_get_thread_data()->kernel_stack + kernel_stack_size;
- x86_thread_data()->syscall_frame = (struct syscall_frame *)kernel_stack - 1; + x86_thread_data()->syscall_frame = (struct syscall_frame *)((ULONG_PTR)((char *)kernel_stack - sizeof(struct syscall_frame) - xstate_features_size) & ~(ULONG_PTR)63); + x86_thread_data()->xstate_features_size = xstate_features_size;
if (cpu_info.ProcessorFeatureBits & CPU_FEATURE_FXSR) syscall_flags |= SYSCALL_HAVE_FXSAVE; if (cpu_info.ProcessorFeatureBits & CPU_FEATURE_XSAVE) syscall_flags |= SYSCALL_HAVE_XSAVE; @@ -2481,6 +2481,7 @@ void call_init_thunk( LPTHREAD_START_ROUTINE entry, void *arg, BOOL suspend, TEB thread_data->gs = get_gs(); thread_data->syscall_table = KeServiceDescriptorTable; thread_data->xstate_features_mask = xstate_supported_features_mask; + assert( thread_data->xstate_features_size == xstate_features_size );
context.SegCs = get_cs(); context.SegDs = get_ds(); @@ -2547,7 +2548,8 @@ __ASM_GLOBAL_FUNC( signal_start_thread, "movl 0x1f8(%ecx),%eax\n\t" /* x86_thread_data()->syscall_frame */ "orl %eax,%eax\n\t" "jnz 1f\n\t" - "leal -0x380(%esp),%eax\n\t" /* sizeof(struct syscall_frame) */ + "leal -0x280(%esp),%eax\n\t" /* sizeof(struct syscall_frame) */ + "subl %fs:0x204,%eax\n\t" /* x86_thread_data()->xstate_features_size */ "andl $~63,%eax\n\t" "movl %eax,0x1f8(%ecx)\n" /* x86_thread_data()->syscall_frame */ /* switch to kernel stack */ diff --git a/dlls/ntdll/unix/signal_x86_64.c b/dlls/ntdll/unix/signal_x86_64.c index dff37a9ce7e..a092e5dadaf 100644 --- a/dlls/ntdll/unix/signal_x86_64.c +++ b/dlls/ntdll/unix/signal_x86_64.c @@ -424,10 +424,12 @@ struct syscall_frame DWORD restore_flags; /* 00b4 */ DWORD align[2]; /* 00b8 */ XMM_SAVE_AREA32 xsave; /* 00c0 */ - DECLSPEC_ALIGN(64) XSTATE xstate; /* 02c0 */ + DECLSPEC_ALIGN(64) XSAVE_AREA_HEADER xstate; /* 02c0 */ };
-C_ASSERT( sizeof( struct syscall_frame ) == 0x400); +C_ASSERT( offsetof( struct syscall_frame, xsave ) == 0xc0 ); +C_ASSERT( offsetof( struct syscall_frame, xstate ) == 0x2c0 ); +C_ASSERT( sizeof( struct syscall_frame ) == 0x300);
struct amd64_thread_data { @@ -441,7 +443,7 @@ struct amd64_thread_data struct syscall_frame *syscall_frame; /* 0328 syscall frame pointer */ SYSTEM_SERVICE_TABLE *syscall_table; /* 0330 syscall table */ DWORD fs; /* 0338 WOW TEB selector */ - DWORD align; + DWORD xstate_features_size; /* 033c */ UINT64 xstate_features_mask; /* 0340 */ };
@@ -450,6 +452,7 @@ C_ASSERT( offsetof( TEB, GdiTebBatch ) + offsetof( struct amd64_thread_data, pth C_ASSERT( offsetof( TEB, GdiTebBatch ) + offsetof( struct amd64_thread_data, syscall_frame ) == 0x328 ); C_ASSERT( offsetof( TEB, GdiTebBatch ) + offsetof( struct amd64_thread_data, syscall_table ) == 0x330 ); C_ASSERT( offsetof( TEB, GdiTebBatch ) + offsetof( struct amd64_thread_data, fs ) == 0x338 ); +C_ASSERT( offsetof( TEB, GdiTebBatch ) + offsetof( struct amd64_thread_data, xstate_features_size ) == 0x33c ); C_ASSERT( offsetof( TEB, GdiTebBatch ) + offsetof( struct amd64_thread_data, xstate_features_mask ) == 0x340 );
static inline struct amd64_thread_data *amd64_thread_data(void) @@ -981,12 +984,12 @@ NTSTATUS WINAPI NtSetContextThread( HANDLE handle, const CONTEXT *context ) if ((flags & CONTEXT_XSTATE) && xstate_extended_features()) { CONTEXT_EX *context_ex = (CONTEXT_EX *)(context + 1); - XSTATE *xs = (XSTATE *)((char *)context_ex + context_ex->XState.Offset); + XSAVE_AREA_HEADER *xs = (XSAVE_AREA_HEADER *)((char *)context_ex + context_ex->XState.Offset);
- if (context_ex->XState.Length < offsetof(XSTATE, YmmContext) || - context_ex->XState.Length > sizeof(XSTATE)) + if (context_ex->XState.Length < sizeof(XSAVE_AREA_HEADER) || + context_ex->XState.Length > sizeof(XSAVE_AREA_HEADER) + xstate_features_size) return STATUS_INVALID_PARAMETER; - if ((xs->Mask & xstate_extended_features()) && (context_ex->XState.Length < sizeof(XSTATE))) + if ((xs->Mask & xstate_extended_features()) && (context_ex->XState.Length < xstate_get_size( xs, xs->Mask ))) return STATUS_BUFFER_OVERFLOW; } else flags &= ~CONTEXT_XSTATE; @@ -1051,14 +1054,9 @@ NTSTATUS WINAPI NtSetContextThread( HANDLE handle, const CONTEXT *context ) if (flags & CONTEXT_XSTATE) { CONTEXT_EX *context_ex = (CONTEXT_EX *)(context + 1); - XSTATE *xs = (XSTATE *)((char *)context_ex + context_ex->XState.Offset); + XSAVE_AREA_HEADER *xs = (XSAVE_AREA_HEADER *)((char *)context_ex + context_ex->XState.Offset);
- if (xs->Mask & XSTATE_MASK_GSSE) - { - frame->xstate.Mask |= XSTATE_MASK_GSSE; - memcpy( &frame->xstate.YmmContext, &xs->YmmContext, sizeof(xs->YmmContext) ); - } - else frame->xstate.Mask &= ~XSTATE_MASK_GSSE; + copy_xstate( &frame->xstate, xs, xs->Mask ); }
frame->restore_flags |= flags & ~CONTEXT_INTEGER; @@ -1159,21 +1157,21 @@ NTSTATUS WINAPI NtGetContextThread( HANDLE handle, CONTEXT *context ) if ((needed_flags & CONTEXT_XSTATE) && xstate_extended_features()) { CONTEXT_EX *context_ex = (CONTEXT_EX *)(context + 1); - XSTATE *xstate = (XSTATE *)((char *)context_ex + context_ex->XState.Offset); - unsigned int mask; + XSAVE_AREA_HEADER *xstate = (XSAVE_AREA_HEADER *)((char *)context_ex + context_ex->XState.Offset); + UINT64 mask;
- if (context_ex->XState.Length < offsetof(XSTATE, YmmContext) - || context_ex->XState.Length > sizeof(XSTATE)) + if (context_ex->XState.Length < sizeof(XSAVE_AREA_HEADER) || + context_ex->XState.Length > sizeof(XSAVE_AREA_HEADER) + xstate_features_size) return STATUS_INVALID_PARAMETER;
mask = (xstate_compaction_enabled ? xstate->CompactionMask : xstate->Mask) & xstate_extended_features(); xstate->Mask = frame->xstate.Mask & mask; xstate->CompactionMask = xstate_compaction_enabled ? (0x8000000000000000 | mask) : 0; - memset( xstate->Reserved, 0, sizeof(xstate->Reserved) ); + memset( xstate->Reserved2, 0, sizeof(xstate->Reserved2) ); if (xstate->Mask) { - if (context_ex->XState.Length < sizeof(XSTATE)) return STATUS_BUFFER_OVERFLOW; - memcpy( &xstate->YmmContext, &frame->xstate.YmmContext, sizeof(xstate->YmmContext) ); + if (context_ex->XState.Length < xstate_get_size( xstate, xstate->Mask )) return STATUS_BUFFER_OVERFLOW; + copy_xstate( xstate, &frame->xstate, xstate->Mask ); } } /* update the cached version of the debug registers */ @@ -1280,14 +1278,9 @@ NTSTATUS set_thread_wow64_context( HANDLE handle, const void *ctx, ULONG size ) if (flags & CONTEXT_I386_XSTATE) { CONTEXT_EX *context_ex = (CONTEXT_EX *)(context + 1); - XSTATE *xs = (XSTATE *)((char *)context_ex + context_ex->XState.Offset); + XSAVE_AREA_HEADER *xs = (XSAVE_AREA_HEADER *)((char *)context_ex + context_ex->XState.Offset);
- if (xs->Mask & XSTATE_MASK_GSSE) - { - frame->xstate.Mask |= XSTATE_MASK_GSSE; - memcpy( &frame->xstate.YmmContext, &xs->YmmContext, sizeof(xs->YmmContext) ); - } - else frame->xstate.Mask &= ~XSTATE_MASK_GSSE; + copy_xstate( &frame->xstate, xs, xs->Mask ); frame->restore_flags |= CONTEXT_XSTATE; } return STATUS_SUCCESS; @@ -1368,24 +1361,24 @@ NTSTATUS get_thread_wow64_context( HANDLE handle, void *ctx, ULONG size ) fpux_to_fpu( &context->FloatSave, &frame->xsave ); context->ContextFlags |= CONTEXT_I386_FLOATING_POINT; } - if ((needed_flags & CONTEXT_I386_XSTATE) && (cpu_info.ProcessorFeatureBits & CPU_FEATURE_AVX)) + if ((needed_flags & CONTEXT_I386_XSTATE) && xstate_extended_features()) { CONTEXT_EX *context_ex = (CONTEXT_EX *)(context + 1); - XSTATE *xstate = (XSTATE *)((char *)context_ex + context_ex->XState.Offset); - unsigned int mask; + XSAVE_AREA_HEADER *xstate = (XSAVE_AREA_HEADER *)((char *)context_ex + context_ex->XState.Offset); + UINT64 mask;
- if (context_ex->XState.Length < offsetof(XSTATE, YmmContext) || - context_ex->XState.Length > sizeof(XSTATE)) + if (context_ex->XState.Length < sizeof(XSAVE_AREA_HEADER) || + context_ex->XState.Length > sizeof(XSAVE_AREA_HEADER) + xstate_features_size) return STATUS_INVALID_PARAMETER;
mask = (xstate_compaction_enabled ? xstate->CompactionMask : xstate->Mask) & xstate_extended_features(); xstate->Mask = frame->xstate.Mask & mask; xstate->CompactionMask = xstate_compaction_enabled ? (0x8000000000000000 | mask) : 0; - memset( xstate->Reserved, 0, sizeof(xstate->Reserved) ); + memset( xstate->Reserved2, 0, sizeof(xstate->Reserved2) ); if (xstate->Mask) { - if (context_ex->XState.Length < sizeof(XSTATE)) return STATUS_BUFFER_OVERFLOW; - memcpy( &xstate->YmmContext, &frame->xstate.YmmContext, sizeof(xstate->YmmContext) ); + if (context_ex->XState.Length < xstate_get_size( xstate, xstate->Mask )) return STATUS_BUFFER_OVERFLOW; + copy_xstate( xstate, &frame->xstate, xstate->Mask ); } } return STATUS_SUCCESS; @@ -1544,7 +1537,7 @@ NTSTATUS call_user_exception_dispatcher( EXCEPTION_RECORD *rec, CONTEXT *context { assert( !((ULONG_PTR)&stack->xstate & 63) ); context_init_xstate( &stack->context, &stack->xstate ); - memcpy( &stack->xstate, &frame->xstate, sizeof(frame->xstate) ); + memcpy( &stack->xstate, &frame->xstate, sizeof(XSAVE_AREA_HEADER) + xstate_features_size ); } else context_init_xstate( &stack->context, NULL );
@@ -1586,13 +1579,15 @@ __ASM_GLOBAL_FUNC( call_user_mode_callback, "fnstcw -0x2c(%rbp)\n\t" "movq %rsi,-0x38(%rbp)\n\t" /* ret_ptr */ "movq %rdx,-0x40(%rbp)\n\t" /* ret_len */ - "subq $0x408,%rsp\n\t" /* sizeof(struct syscall_frame) + exception */ + "subq $0x308,%rsp\n\t" /* sizeof(struct syscall_frame) + exception */ + "movl %gs:0x33c,%esi\n\t" /* amd64_thread_data()->xstate_features_size */ + "subq %rsi,%rsp\n\t" "andq $~63,%rsp\n\t" "leaq 0x10(%rbp),%rax\n\t" "movq %rax,0xa8(%rsp)\n\t" /* frame->syscall_cfa */ "movq 0x328(%r8),%r10\n\t" /* amd64_thread_data()->syscall_frame */ "movq (%r8),%rax\n\t" /* NtCurrentTeb()->Tib.ExceptionList */ - "movq %rax,0x400(%rsp)\n\t" + "movq %rax,0x300(%rsp,%rsi)\n\t" "movl 0xb0(%r10),%r14d\n\t" /* prev_frame->syscall_flags */ "movl %r14d,0xb0(%rsp)\n\t" /* frame->syscall_flags */ "movq %r10,0xa0(%rsp)\n\t" /* frame->prev_frame */ @@ -1625,7 +1620,8 @@ __ASM_GLOBAL_FUNC( user_mode_callback_return, __ASM_CFI(".cfi_rel_offset %r13,-0x18\n\t") __ASM_CFI(".cfi_rel_offset %r14,-0x20\n\t") __ASM_CFI(".cfi_rel_offset %r15,-0x28\n\t") - "movq 0x400(%r10),%rax\n\t" /* exception list */ + "movl %gs:0x33c,%eax\n\t" /* amd64_thread_data()->xstate_features_size */ + "movq 0x300(%r10,%rax),%rax\n\t" /* exception list */ "movq %rax,0(%rcx)\n\t" /* teb->Tib.ExceptionList */ "movq -0x38(%rbp),%r10\n\t" /* ret_ptr */ "movq -0x40(%rbp),%r11\n\t" /* ret_len */ @@ -2303,6 +2299,7 @@ NTSTATUS signal_alloc_thread( TEB *teb ) } else thread_data->fs = fs32_sel; } + thread_data->xstate_features_size = xstate_features_size; return STATUS_SUCCESS; }
@@ -2389,7 +2386,8 @@ void signal_init_process(void) WOW_TEB *wow_teb = get_wow_teb( NtCurrentTeb() ); void *ptr, *kernel_stack = (char *)ntdll_get_thread_data()->kernel_stack + kernel_stack_size;
- amd64_thread_data()->syscall_frame = (struct syscall_frame *)kernel_stack - 1; + amd64_thread_data()->syscall_frame = (struct syscall_frame *)((ULONG_PTR)((char *)kernel_stack - sizeof(struct syscall_frame) - xstate_features_size) & ~(ULONG_PTR)63); + amd64_thread_data()->xstate_features_size = xstate_features_size;
/* sneak in a syscall dispatcher pointer at a fixed address (7ffe1000) */ ptr = (char *)user_shared_data + page_size; @@ -2481,6 +2479,7 @@ void call_init_thunk( LPTHREAD_START_ROUTINE entry, void *arg, BOOL suspend, TEB
thread_data->syscall_table = KeServiceDescriptorTable; thread_data->xstate_features_mask = xstate_supported_features_mask; + assert( thread_data->xstate_features_size == xstate_features_size );
#if defined __linux__ arch_prctl( ARCH_SET_GS, teb ); @@ -2585,7 +2584,9 @@ __ASM_GLOBAL_FUNC( signal_start_thread, "movq 0x328(%rcx),%r8\n\t" /* amd64_thread_data()->syscall_frame */ "orq %r8,%r8\n\t" "jnz 1f\n\t" - "leaq -0x400(%rsp),%r8\n\t" /* sizeof(struct syscall_frame) */ + "leaq -0x300(%rsp),%r8\n\t" /* sizeof(struct syscall_frame) */ + "movl %gs:0x33c,%eax\n\t" /* amd64_thread_data()->xstate_features_size */ + "subq %rax,%r8\n\t" "andq $~63,%r8\n\t" "movq %r8,0x328(%rcx)\n" /* amd64_thread_data()->syscall_frame */ /* switch to kernel stack */ diff --git a/dlls/ntdll/unix/system.c b/dlls/ntdll/unix/system.c index c3ffdc6921e..6bed29104d6 100644 --- a/dlls/ntdll/unix/system.c +++ b/dlls/ntdll/unix/system.c @@ -248,6 +248,23 @@ static pthread_mutex_t timezone_mutex = PTHREAD_MUTEX_INITIALIZER;
BOOL xstate_compaction_enabled = FALSE; UINT64 xstate_supported_features_mask; +UINT64 xstate_features_size; + +unsigned int xstate_get_size( XSAVE_AREA_HEADER *xsave, UINT64 mask ) +{ + if (!(mask & ((UINT64)1 << XSTATE_AVX))) return sizeof(XSAVE_AREA_HEADER); + return sizeof(XSAVE_AREA_HEADER) + sizeof(YMMCONTEXT); +} + +void copy_xstate( XSAVE_AREA_HEADER *dst, XSAVE_AREA_HEADER *src, UINT64 mask ) +{ + mask &= xstate_extended_features() & src->Mask; + if (src->CompactionMask) mask &= src->CompactionMask; + if (dst->CompactionMask) mask &= dst->CompactionMask; + dst->Mask = (dst->Mask & ~xstate_extended_features()) | mask; + if (mask & ((UINT64)1 << XSTATE_AVX)) + *(YMMCONTEXT *)(dst + 1) = *(YMMCONTEXT *)(src + 1); +}
#define AUTH 0x68747541 /* "Auth" */ #define ENTI 0x69746e65 /* "enti" */ @@ -400,6 +417,8 @@ static void get_cpuinfo( SYSTEM_CPU_INFORMATION *info ) xstate_supported_features_mask = 3; if (features & CPU_FEATURE_AVX) xstate_supported_features_mask |= (UINT64)1 << XSTATE_AVX; + xstate_features_size = xstate_get_size( NULL, xstate_supported_features_mask ) - sizeof(XSAVE_AREA_HEADER); + xstate_features_size = (xstate_features_size + 15) & ~15; }
if (regs[1] == AUTH && regs[3] == ENTI && regs[2] == CAMD) diff --git a/dlls/ntdll/unix/unix_private.h b/dlls/ntdll/unix/unix_private.h index 75c81b21f84..5e7509d6d49 100644 --- a/dlls/ntdll/unix/unix_private.h +++ b/dlls/ntdll/unix/unix_private.h @@ -215,6 +215,9 @@ extern void fpu_to_fpux( XSAVE_FORMAT *fpux, const I386_FLOATING_SAVE_AREA *fpu
extern BOOL xstate_compaction_enabled; extern UINT64 xstate_supported_features_mask; +extern UINT64 xstate_features_size; +extern unsigned int xstate_get_size( XSAVE_AREA_HEADER *xsave, UINT64 mask ); +extern void copy_xstate( XSAVE_AREA_HEADER *dst, XSAVE_AREA_HEADER *src, UINT64 mask );
static inline UINT64 xstate_extended_features(void) { diff --git a/include/winnt.h b/include/winnt.h index 0e0d1cb5170..4b0ed1bd62f 100644 --- a/include/winnt.h +++ b/include/winnt.h @@ -1470,6 +1470,14 @@ typedef struct _XSTATE_CONFIGURATION ULONG64 EnabledUserVisibleSupervisorFeatures; } XSTATE_CONFIGURATION, *PXSTATE_CONFIGURATION;
+typedef struct _XSAVE_AREA_HEADER +{ + DWORD64 Mask; + DWORD64 CompactionMask; + DWORD64 Reserved2[6]; +} +XSAVE_AREA_HEADER, *PXSAVE_AREA_HEADER; + typedef struct _YMMCONTEXT { M128A Ymm0;
From: Paul Gofman pgofman@codeweavers.com
--- dlls/ntdll/unix/signal_i386.c | 63 +++++++++++++++------------------ dlls/ntdll/unix/signal_x86_64.c | 47 ++++++++++++------------ 2 files changed, 53 insertions(+), 57 deletions(-)
diff --git a/dlls/ntdll/unix/signal_i386.c b/dlls/ntdll/unix/signal_i386.c index 515cfe5a212..5768f77b469 100644 --- a/dlls/ntdll/unix/signal_i386.c +++ b/dlls/ntdll/unix/signal_i386.c @@ -151,7 +151,7 @@ typedef struct ucontext
#define FPU_sig(context) ((FLOATING_SAVE_AREA*)((context)->uc_mcontext.fpregs)) #define FPUX_sig(context) (FPU_sig(context) && !((context)->uc_mcontext.fpregs->status >> 16) ? (XSAVE_FORMAT *)(FPU_sig(context) + 1) : NULL) -#define XState_sig(fpu) (((unsigned int *)fpu->Reserved4)[12] == FP_XSTATE_MAGIC1 ? (XSTATE *)(fpu + 1) : NULL) +#define XState_sig(fpu) (((unsigned int *)fpu->Reserved4)[12] == FP_XSTATE_MAGIC1 ? (XSAVE_AREA_HEADER *)(fpu + 1) : NULL)
#ifdef __ANDROID__ /* custom signal restorer since we may have unmapped the one in vdso, and bionic doesn't check for that */ @@ -443,12 +443,10 @@ struct exc_stack_layout EXCEPTION_RECORD rec; /* 008 */ CONTEXT context; /* 058 */ CONTEXT_EX context_ex; /* 324 */ - BYTE xstate[sizeof(XSTATE)+64]; /* 33c extra space to allow for 64-byte alignment */ - DWORD align; /* 4bc */ + DWORD align; /* 33c */ }; C_ASSERT( offsetof(struct exc_stack_layout, context) == 0x58 ); -C_ASSERT( offsetof(struct exc_stack_layout, xstate) == 0x33c ); -C_ASSERT( sizeof(struct exc_stack_layout) == 0x4c0 ); +C_ASSERT( sizeof(struct exc_stack_layout) == 0x340 );
/* stack layout when calling KiUserApcDispatcher */ struct apc_stack_layout @@ -612,12 +610,12 @@ struct xcontext ULONG64 host_compaction_mask; };
-static inline XSTATE *xstate_from_context( const CONTEXT *context ) +static inline XSAVE_AREA_HEADER *xstate_from_context( const CONTEXT *context ) { CONTEXT_EX *xctx = (CONTEXT_EX *)(context + 1);
if ((context->ContextFlags & CONTEXT_XSTATE) != CONTEXT_XSTATE) return NULL; - return (XSTATE *)((char *)xctx + xctx->XState.Offset); + return (XSAVE_AREA_HEADER *)((char *)xctx + xctx->XState.Offset); }
static inline void context_init_xstate( CONTEXT *context, void *xstate_buffer ) @@ -630,7 +628,7 @@ static inline void context_init_xstate( CONTEXT *context, void *xstate_buffer )
if (xstate_buffer) { - xctx->XState.Length = sizeof(XSTATE); + xctx->XState.Length = sizeof(XSAVE_AREA_HEADER) + xstate_features_size; xctx->XState.Offset = (BYTE *)xstate_buffer - (BYTE *)xctx; context->ContextFlags |= CONTEXT_XSTATE;
@@ -828,7 +826,7 @@ static inline void save_context( struct xcontext *xcontext, const ucontext_t *si } if (fpux) { - XSTATE *xs; + XSAVE_AREA_HEADER *xs;
context->ContextFlags |= CONTEXT_FLOATING_POINT | CONTEXT_EXTENDED_REGISTERS; memcpy( context->ExtendedRegisters, fpux, sizeof(*fpux) ); @@ -880,16 +878,12 @@ static inline void restore_context( const struct xcontext *xcontext, ucontext_t if (fpu) *fpu = context->FloatSave; if (fpux) { - XSTATE *src_xs, *dst_xs; + XSAVE_AREA_HEADER *xs;
memcpy( fpux, context->ExtendedRegisters, sizeof(*fpux) );
- if ((dst_xs = XState_sig(fpux)) && (src_xs = xstate_from_context( context ))) - { - memcpy( &dst_xs->YmmContext, &src_xs->YmmContext, sizeof(dst_xs->YmmContext) ); - dst_xs->Mask |= src_xs->Mask; - dst_xs->CompactionMask = xcontext->host_compaction_mask; - } + if (xstate_extended_features() && (xs = XState_sig(fpux))) + xs->CompactionMask = xcontext->host_compaction_mask; } if (!fpu && !fpux) restore_fpu( context ); } @@ -1456,8 +1450,10 @@ static void setup_raise_exception( ucontext_t *sigcontext, void *stack_ptr, EXCEPTION_RECORD *rec, struct xcontext *xcontext ) { CONTEXT *context = &xcontext->c; - XSTATE *src_xs; + XSAVE_AREA_HEADER *src_xs; struct exc_stack_layout *stack; + size_t stack_size; + unsigned int xstate_size; NTSTATUS status = send_debug_event( rec, context, TRUE );
if (status == DBG_CONTINUE || status == DBG_EXCEPTION_HANDLED) @@ -1469,7 +1465,9 @@ static void setup_raise_exception( ucontext_t *sigcontext, void *stack_ptr, /* fix up instruction pointer in context for EXCEPTION_BREAKPOINT */ if (rec->ExceptionCode == EXCEPTION_BREAKPOINT) context->Eip--;
- stack = virtual_setup_exception( stack_ptr, sizeof(*stack), rec ); + xstate_size = sizeof(XSAVE_AREA_HEADER) + xstate_features_size; + stack_size = (ULONG_PTR)stack_ptr - (((ULONG_PTR)stack_ptr - sizeof(*stack) - xstate_size) & ~(ULONG_PTR)63); + stack = virtual_setup_exception( stack_ptr, stack_size, rec ); stack->rec_ptr = &stack->rec; stack->context_ptr = &stack->context; stack->rec = *rec; @@ -1477,16 +1475,13 @@ static void setup_raise_exception( ucontext_t *sigcontext, void *stack_ptr,
if ((src_xs = xstate_from_context( context ))) { - XSTATE *dst_xs = (XSTATE *)(((ULONG_PTR)stack->xstate + 63) & ~63); + XSAVE_AREA_HEADER *dst_xs = (XSAVE_AREA_HEADER *)(stack + 1);
+ assert( !((ULONG_PTR)dst_xs & 63) ); context_init_xstate( &stack->context, dst_xs ); - memset( dst_xs, 0, sizeof(XSAVE_AREA_HEADER) ); + memset( dst_xs, 0, sizeof(*dst_xs) ); dst_xs->CompactionMask = xstate_compaction_enabled ? 0x8000000000000000 | xstate_extended_features() : 0; - if (src_xs->Mask & 4) - { - dst_xs->Mask = 4; - memcpy( &dst_xs->YmmContext, &src_xs->YmmContext, sizeof(dst_xs->YmmContext) ); - } + copy_xstate( dst_xs, src_xs, src_xs->Mask ); } else { @@ -1567,11 +1562,14 @@ NTSTATUS call_user_exception_dispatcher( EXCEPTION_RECORD *rec, CONTEXT *context { struct syscall_frame *frame = x86_thread_data()->syscall_frame; ULONG esp = (frame->esp - sizeof(struct exc_stack_layout)) & ~3; - struct exc_stack_layout *stack = (struct exc_stack_layout *)esp; - XSTATE *src_xs; + struct exc_stack_layout *stack; + XSAVE_AREA_HEADER *src_xs; + unsigned int xstate_size;
if (rec->ExceptionCode == EXCEPTION_BREAKPOINT) context->Eip--;
+ xstate_size = sizeof(XSAVE_AREA_HEADER) + xstate_features_size; + stack = (struct exc_stack_layout *)((esp - sizeof(*stack) - xstate_size) & ~(ULONG_PTR)63); stack->rec_ptr = &stack->rec; stack->context_ptr = &stack->context; stack->rec = *rec; @@ -1579,16 +1577,13 @@ NTSTATUS call_user_exception_dispatcher( EXCEPTION_RECORD *rec, CONTEXT *context
if ((src_xs = xstate_from_context( context ))) { - XSTATE *dst_xs = (XSTATE *)(((ULONG_PTR)stack->xstate + 63) & ~63); + XSAVE_AREA_HEADER *dst_xs = (XSAVE_AREA_HEADER *)(stack + 1);
context_init_xstate( &stack->context, dst_xs ); - memset( dst_xs, 0, offsetof(XSTATE, YmmContext) ); + assert( !((ULONG_PTR)dst_xs & 63) ); + memset( dst_xs, 0, sizeof(*dst_xs) ); dst_xs->CompactionMask = xstate_compaction_enabled ? 0x8000000000000000 | xstate_extended_features() : 0; - if (src_xs->Mask & 4) - { - dst_xs->Mask = 4; - memcpy( &dst_xs->YmmContext, &src_xs->YmmContext, sizeof(dst_xs->YmmContext) ); - } + copy_xstate( dst_xs, src_xs, src_xs->Mask ); } else { diff --git a/dlls/ntdll/unix/signal_x86_64.c b/dlls/ntdll/unix/signal_x86_64.c index a092e5dadaf..ae2615a9062 100644 --- a/dlls/ntdll/unix/signal_x86_64.c +++ b/dlls/ntdll/unix/signal_x86_64.c @@ -140,7 +140,7 @@ __ASM_GLOBAL_FUNC( alloc_fs_sel, #define TRAP_sig(context) ((context)->uc_mcontext.gregs[REG_TRAPNO]) #define ERROR_sig(context) ((context)->uc_mcontext.gregs[REG_ERR]) #define FPU_sig(context) ((XMM_SAVE_AREA32 *)((context)->uc_mcontext.fpregs)) -#define XState_sig(fpu) (((unsigned int *)fpu->Reserved4)[12] == FP_XSTATE_MAGIC1 ? (XSTATE *)(fpu + 1) : NULL) +#define XState_sig(fpu) (((unsigned int *)fpu->Reserved4)[12] == FP_XSTATE_MAGIC1 ? (XSAVE_AREA_HEADER *)(fpu + 1) : NULL)
#elif defined(__FreeBSD__) || defined (__FreeBSD_kernel__)
@@ -359,11 +359,10 @@ struct exc_stack_layout ULONG64 align; /* 588 */ struct machine_frame machine_frame; /* 590 */ ULONG64 align2; /* 5b8 */ - XSTATE xstate; /* 5c0 */ }; C_ASSERT( offsetof(struct exc_stack_layout, rec) == 0x4f0 ); C_ASSERT( offsetof(struct exc_stack_layout, machine_frame) == 0x590 ); -C_ASSERT( sizeof(struct exc_stack_layout) == 0x700 ); +C_ASSERT( sizeof(struct exc_stack_layout) == 0x5c0 );
/* stack layout when calling KiUserApcDispatcher */ struct apc_stack_layout @@ -483,12 +482,12 @@ struct xcontext ULONG64 host_compaction_mask; };
-static inline XSTATE *xstate_from_context( const CONTEXT *context ) +static inline XSAVE_AREA_HEADER *xstate_from_context( const CONTEXT *context ) { CONTEXT_EX *xctx = (CONTEXT_EX *)(context + 1);
if ((context->ContextFlags & CONTEXT_XSTATE) != CONTEXT_XSTATE) return NULL; - return (XSTATE *)((char *)xctx + xctx->XState.Offset); + return (XSAVE_AREA_HEADER *)((char *)xctx + xctx->XState.Offset); }
static inline void context_init_xstate( CONTEXT *context, void *xstate_buffer ) @@ -501,7 +500,7 @@ static inline void context_init_xstate( CONTEXT *context, void *xstate_buffer )
if (xstate_buffer) { - xctx->XState.Length = sizeof(XSTATE); + xctx->XState.Length = sizeof(XSAVE_AREA_HEADER) + xstate_features_size; xctx->XState.Offset = (BYTE *)xstate_buffer - (BYTE *)xctx; context->ContextFlags |= CONTEXT_XSTATE;
@@ -897,7 +896,7 @@ static void save_context( struct xcontext *xcontext, const ucontext_t *sigcontex context->Dr7 = amd64_thread_data()->dr7; if (FPU_sig(sigcontext)) { - XSTATE *xs; + XSAVE_AREA_HEADER *xs;
context->ContextFlags |= CONTEXT_FLOATING_POINT; context->FltSave = *FPU_sig(sigcontext); @@ -922,7 +921,7 @@ static void save_context( struct xcontext *xcontext, const ucontext_t *sigcontex static void restore_context( const struct xcontext *xcontext, ucontext_t *sigcontext ) { const CONTEXT *context = &xcontext->c; - XSTATE *xs; + XSAVE_AREA_HEADER *xs;
amd64_thread_data()->dr0 = context->Dr0; amd64_thread_data()->dr1 = context->Dr1; @@ -1395,7 +1394,8 @@ static void setup_raise_exception( ucontext_t *sigcontext, EXCEPTION_RECORD *rec struct exc_stack_layout *stack; size_t stack_size; NTSTATUS status; - XSTATE *src_xs; + XSAVE_AREA_HEADER *src_xs; + unsigned int xstate_size;
if (rec->ExceptionCode == EXCEPTION_SINGLE_STEP) { @@ -1424,7 +1424,8 @@ static void setup_raise_exception( ucontext_t *sigcontext, EXCEPTION_RECORD *rec /* fix up instruction pointer in context for EXCEPTION_BREAKPOINT */ if (rec->ExceptionCode == EXCEPTION_BREAKPOINT) context->Rip--;
- stack_size = (ULONG_PTR)stack_ptr - (((ULONG_PTR)stack_ptr - sizeof(*stack)) & ~(ULONG_PTR)63); + xstate_size = sizeof(XSAVE_AREA_HEADER) + xstate_features_size; + stack_size = (ULONG_PTR)stack_ptr - (((ULONG_PTR)stack_ptr - sizeof(*stack) - xstate_size) & ~(ULONG_PTR)63); stack = virtual_setup_exception( stack_ptr, stack_size, rec ); stack->rec = *rec; stack->context = *context; @@ -1433,15 +1434,12 @@ static void setup_raise_exception( ucontext_t *sigcontext, EXCEPTION_RECORD *rec
if ((src_xs = xstate_from_context( context ))) { - assert( !((ULONG_PTR)&stack->xstate & 63) ); - context_init_xstate( &stack->context, &stack->xstate ); - memset( &stack->xstate, 0, offsetof(XSTATE, YmmContext) ); - stack->xstate.CompactionMask = xstate_compaction_enabled ? 0x8000000000000000 | xstate_extended_features() : 0; - if (src_xs->Mask & 4) - { - stack->xstate.Mask = 4; - memcpy( &stack->xstate.YmmContext, &src_xs->YmmContext, sizeof(stack->xstate.YmmContext) ); - } + XSAVE_AREA_HEADER *dst_xs = (XSAVE_AREA_HEADER *)(stack + 1); + assert( !((ULONG_PTR)dst_xs & 63) ); + context_init_xstate( &stack->context, dst_xs ); + memset( dst_xs, 0, sizeof(*dst_xs) ); + dst_xs->CompactionMask = xstate_compaction_enabled ? 0x8000000000000000 | xstate_extended_features() : 0; + copy_xstate( dst_xs, src_xs, src_xs->Mask ); } else { @@ -1528,16 +1526,19 @@ NTSTATUS call_user_exception_dispatcher( EXCEPTION_RECORD *rec, CONTEXT *context struct syscall_frame *frame = amd64_thread_data()->syscall_frame; struct exc_stack_layout *stack; NTSTATUS status = NtSetContextThread( GetCurrentThread(), context ); + unsigned int xstate_size;
if (status) return status; - stack = (struct exc_stack_layout *)((context->Rsp - sizeof(*stack)) & ~(ULONG_PTR)63); + xstate_size = sizeof(XSAVE_AREA_HEADER) + xstate_features_size; + stack = (struct exc_stack_layout *)((context->Rsp - sizeof(*stack) - xstate_size) & ~(ULONG_PTR)63); memmove( &stack->context, context, sizeof(*context) );
if ((context->ContextFlags & CONTEXT_XSTATE) == CONTEXT_XSTATE) { - assert( !((ULONG_PTR)&stack->xstate & 63) ); - context_init_xstate( &stack->context, &stack->xstate ); - memcpy( &stack->xstate, &frame->xstate, sizeof(XSAVE_AREA_HEADER) + xstate_features_size ); + XSAVE_AREA_HEADER *dst_xs = (XSAVE_AREA_HEADER *)(stack + 1); + assert( !((ULONG_PTR)dst_xs & 63) ); + context_init_xstate( &stack->context, dst_xs ); + memcpy( dst_xs, &frame->xstate, sizeof(XSAVE_AREA_HEADER) + xstate_features_size ); } else context_init_xstate( &stack->context, NULL );
v2: - use xstate_features_size, xstate_features_mask from thread_data in asm functions also on x64; - make sure thread_data()->xstate_features_size is always initialized before signal_start_thread(); - don't rename xstate to xstate_hdr in syscall frame.
On Wed Feb 7 15:27:09 2024 +0000, Paul Gofman wrote:
Here is my current full branch: https://gitlab.winehq.org/gofman/wine/-/commits/xstate_avx512/?ref_type=head... In the (current) final state the function minds generic feature mask to get the size which corresponds to it, and also xsave->CompactionMask:
unsigned int xstate_get_size( XSAVE_AREA_HEADER *xsave, UINT64 mask ) { UINT64 compaction_mask; unsigned int i; int off; if (xsave) compaction_mask = xsave->CompactionMask; else if (xstate_compaction_enabled) compaction_mask = ((UINT64)1 << 63) | mask; else compaction_mask = 0; mask >>= 2; off = sizeof(XSAVE_AREA_HEADER); i = 2; while (mask) { if (mask == 1) return off + xstate_feature_size[i]; off = next_xstate_offset( off, compaction_mask, i ); mask >>= 1; ++i; } return off; }
While looking at xsave->CompactionMask is not needed when dealing with Win context (which mask should always match system compaction to be valid), our "internal" xstate from signals always has CompactionMask of zero (Linux always copies xstate in uncompacted format for user signals). So I wanted xstate_get_size` / copy_xstate to deal with it transparently. Do you think I can leave the handle as is in attempt to reduce the amount of diff in the later patches, or should I introduce that only when really needed? Also, maybe if you have some general omments on the generic direction of those patches...
I'd suggest to pass the compaction mask explicitly, the xsave pointer shouldn't be necessary.
On Mon Feb 12 21:21:38 2024 +0000, Alexandre Julliard wrote:
I'd suggest to pass the compaction mask explicitly, the xsave pointer shouldn't be necessary.
Yes, sure, that will work too.