I've been working on a gcc patch targeting 64-bit Wine to reduce the impact of the implicit register clobbers when a Microsoft x64 ABI function calls a System V ABI function, and I appear to finally have a working implementation. However, I cannot get recent Wine tests to complete even with an unpatched compiler (I get 30 failures, 4 of which are crashes, and I even get some dmesgs from drivers). While the same Wine tests pass both with and without the new optimization, I could use some external testing to make sure there are no flaws that I've missed (especially a full test run with no failures).
The patch set is not actually complete: my unit tests didn't catch many of the flaws (now fixed) that I caught when building real Wine, so I'm reworking them. My current version is up on GitHub: https://github.com/daniel-santos/gcc/tree/gcc-5_4_0-outline-msabi-xlogues.
Background

For those unfamiliar, the differences between the two ABIs require an ms_abi function to save and restore 12 registers: RSI, RDI and XMM6-15. The cost of pushing and popping RSI and RDI is insignificant, but saving and restoring the 10 SSE registers takes between 94 and 174 bytes per function (depending upon the stack pointer offset), thereby bloating .text size quite a bit.
Details

This patch uses out-of-line stubs for register saves and restores to reduce .text size. The *theory* (not yet proven) is that the reduction in instruction cache misses will offset the extra few instructions required to facilitate this. And while we're going this far, the optimization also saves and restores an additional 6 non-volatile ms_abi registers if they are clobbered in the function: RBX, RBP and R12-15. This extra step actually saves very little .text on its own, but the stubs implement these saves and restores using MOVs, which the CPU is better able to parallelize than PUSHes and POPs. While MOV instructions are larger (4 bytes vs. 1 or 2), they only appear in the stubs, so in theory we gain the performance benefit of the MOVs without paying for their extra size in every function.
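To make the parallelization point concrete, here is a stand-alone sketch (my own illustration, not code from the patch; the function names and the buffer are made up). Every PUSH and POP implicitly adjusts %rsp, so the instructions form a serial dependency chain, whereas MOVs to distinct offsets from one fixed base have no dependencies on one another:

/* Illustration only -- not from the patch. Both functions preserve
 * RBX and R12; the names and the local buffer are made up. */

void save_with_push (void)
{
    /* Each PUSH/POP both moves data and adjusts %rsp, so every
     * instruction depends on the %rsp produced by the previous one.
     * (Only safe here because this function keeps nothing in the
     * red zone.) */
    __asm__ __volatile__ (
        "push %%rbx\n\t"
        "push %%r12\n\t"
        "pop  %%r12\n\t"
        "pop  %%rbx"
        ::: "memory");
}

void save_with_mov (void)
{
    char buf[16];

    /* These stores hit distinct offsets from one fixed base register,
     * so the CPU is free to execute them in parallel. */
    __asm__ __volatile__ (
        "mov %%rbx, 0(%0)\n\t"
        "mov %%r12, 8(%0)\n\t"
        "mov 0(%0), %%rbx\n\t"
        "mov 8(%0), %%r12"
        :: "r" (buf)
        : "memory");
}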
This optimization works both with and without forced stack realignment (https://bugs.winehq.org/show_bug.cgi?id=27680), although more can be done to produce better code in the realigned case. With forced realignment, this optimization reduces the .text size of 64-bit Wine by 19%; in the normal (aligned stack) case, it reduces it by 22%. As an example, here is a test function (with its stubs) built with -foutline-msabi-xlogues (but no realignment):
__attribute__ ((noinline)) long sysv_fn0 ()
{
    return 42;
}

__attribute__ ((ms_abi, noinline)) long msabifn0_6 ()
{
    __asm__ __volatile__ ("" ::: "rbx", "rbp", "r12", "r13", "r14", "r15");
    return sysv_fn0 ();
}
0000000000401000 <msabifn0_6>:
  401000:  48 8d 44 24 88           lea    -0x78(%rsp),%rax
  401005:  48 81 ec f8 00 00 00     sub    $0xf8,%rsp
  40100c:  e8 09 e8 01 00           callq  41f81a <__msabi_save_18>
  401011:  31 c0                    xor    %eax,%eax
  401013:  e8 48 fc ff ff           callq  400c60 <sysv_fn0>
  401018:  48 8d b4 24 80 00 00 00  lea    0x80(%rsp),%rsi
  401020:  4c 8d 56 78              lea    0x78(%rsi),%r10
  401024:  e9 41 e8 01 00           jmpq   41f86a <__msabi_restore_ret_18>

000000000041f81a <__msabi_save_18>:
  41f81a:  4c 89 78 90     mov    %r15,-0x70(%rax)
000000000041f81e <__msabi_save_17>:
  41f81e:  4c 89 70 98     mov    %r14,-0x68(%rax)
000000000041f822 <__msabi_save_16>:
  41f822:  4c 89 68 a0     mov    %r13,-0x60(%rax)
000000000041f826 <__msabi_save_15>:
  41f826:  4c 89 60 a8     mov    %r12,-0x58(%rax)
000000000041f82a <__msabi_save_14>:
  41f82a:  48 89 68 b0     mov    %rbp,-0x50(%rax)
000000000041f82e <__msabi_save_13>:
  41f82e:  48 89 58 b8     mov    %rbx,-0x48(%rax)
000000000041f832 <__msabi_save_12>:
  41f832:  48 89 78 c0     mov    %rdi,-0x40(%rax)
  41f836:  48 89 70 c8     mov    %rsi,-0x38(%rax)
  41f83a:  44 0f 29 78 d0  movaps %xmm15,-0x30(%rax)
  41f83f:  44 0f 29 70 e0  movaps %xmm14,-0x20(%rax)
  41f844:  44 0f 29 68 f0  movaps %xmm13,-0x10(%rax)
  41f849:  44 0f 29 20     movaps %xmm12,(%rax)
  41f84d:  44 0f 29 58 10  movaps %xmm11,0x10(%rax)
  41f852:  44 0f 29 50 20  movaps %xmm10,0x20(%rax)
  41f857:  44 0f 29 48 30  movaps %xmm9,0x30(%rax)
  41f85c:  44 0f 29 40 40  movaps %xmm8,0x40(%rax)
  41f861:  0f 29 78 50     movaps %xmm7,0x50(%rax)
  41f865:  0f 29 70 60     movaps %xmm6,0x60(%rax)
  41f869:  c3              retq

000000000041f86a <__msabi_restore_ret_18>:
  41f86a:  4c 8b 7e 90     mov    -0x70(%rsi),%r15
000000000041f86e <__msabi_restore_ret_17>:
  41f86e:  4c 8b 76 98     mov    -0x68(%rsi),%r14
000000000041f872 <__msabi_restore_ret_16>:
  41f872:  4c 8b 6e a0     mov    -0x60(%rsi),%r13
000000000041f876 <__msabi_restore_ret_15>:
  41f876:  4c 8b 66 a8     mov    -0x58(%rsi),%r12
000000000041f87a <__msabi_restore_ret_14>:
  41f87a:  48 8b 6e b0     mov    -0x50(%rsi),%rbp
000000000041f87e <__msabi_restore_ret_13>:
  41f87e:  48 8b 5e b8     mov    -0x48(%rsi),%rbx
000000000041f882 <__msabi_restore_ret_12>:
  41f882:  48 8b 7e c0     mov    -0x40(%rsi),%rdi
  41f886:  44 0f 28 7e d0  movaps -0x30(%rsi),%xmm15
  41f88b:  44 0f 28 76 e0  movaps -0x20(%rsi),%xmm14
  41f890:  44 0f 28 6e f0  movaps -0x10(%rsi),%xmm13
  41f895:  44 0f 28 26     movaps (%rsi),%xmm12
  41f899:  44 0f 28 5e 10  movaps 0x10(%rsi),%xmm11
  41f89e:  44 0f 28 56 20  movaps 0x20(%rsi),%xmm10
  41f8a3:  44 0f 28 4e 30  movaps 0x30(%rsi),%xmm9
  41f8a8:  44 0f 28 46 40  movaps 0x40(%rsi),%xmm8
  41f8ad:  0f 28 7e 50     movaps 0x50(%rsi),%xmm7
  41f8b1:  0f 28 76 60     movaps 0x60(%rsi),%xmm6
  41f8b5:  48 8b 76 c8     mov    -0x38(%rsi),%rsi
  41f8b9:  4c 89 d4        mov    %r10,%rsp
  41f8bc:  c3              retq
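If anyone wants to reproduce listings like the above, the test builds stand-alone along these lines (the function bodies are the ones from above; the main, the -O2 and the objdump invocation are just my assumptions about how to drive it, so adjust as needed):

/* test.c -- build with the patched gcc, e.g.:
 *
 *   gcc -O2 -foutline-msabi-xlogues -o test test.c
 *   objdump -d test
 */

__attribute__ ((noinline)) long sysv_fn0 ()
{
    return 42;
}

__attribute__ ((ms_abi, noinline)) long msabifn0_6 ()
{
    __asm__ __volatile__ ("" ::: "rbx", "rbp", "r12", "r13", "r14", "r15");
    return sysv_fn0 ();
}

int main ()
{
    return msabifn0_6 () == 42 ? 0 : 1;
}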
Here is a similar function with forced realignment. This can be improved with an additional "have frame pointer" set of stubs.
__attribute__ ((ms_abi, noinline, force_align_arg_pointer)) long msabifn0_5 ()
{
    __asm__ __volatile__ ("" ::: "rbx", "r12", "r13", "r14", "r15");
    return sysv_fn0 ();
}

0000000000400fc0 <msabifn0_5>:
  400fc0:  55                    push   %rbp
  400fc1:  48 8d 44 24 90        lea    -0x70(%rsp),%rax
  400fc6:  48 89 e5              mov    %rsp,%rbp
  400fc9:  48 83 e0 f0           and    $0xfffffffffffffff0,%rax
  400fcd:  48 8d 60 b0           lea    -0x50(%rax),%rsp
  400fd1:  e8 78 e8 01 00        callq  41f84e <__msabi_save_13>
  400fd6:  41 57                 push   %r15
  400fd8:  41 56                 push   %r14
  400fda:  41 55                 push   %r13
  400fdc:  41 54                 push   %r12
  400fde:  31 c0                 xor    %eax,%eax
  400fe0:  e8 7b fc ff ff        callq  400c60 <sysv_fn0>
  400fe5:  48 8d a5 20 ff ff ff  lea    -0xe0(%rbp),%rsp
  400fec:  41 5c                 pop    %r12
  400fee:  41 5d                 pop    %r13
  400ff0:  41 5e                 pop    %r14
  400ff2:  41 5f                 pop    %r15
  400ff4:  48 8d 74 24 50        lea    0x50(%rsp),%rsi
  400ff9:  e8 a0 e8 01 00        callq  41f89e <__msabi_restore_13>
  400ffe:  48 81 c4 c0 00 00 00  add    $0xc0,%rsp
  401005:  5d                    pop    %rbp
  401006:  c3                    retq
Daniel
I've found a flaw in the realignment case, so never mind about testing that, but the normally aligned case should still be fine.
On 10/23/2016 02:34 PM, Daniel Santos wrote:
> I've found a flaw in the realignment case, so never mind about testing
> that, but the normally aligned case should still be fine.
Actually, I was wrong about the flaw (it was fine), but the forced-realignment case is now more efficient and ready for testing. Also, I accidentally got the numbers wrong on how much the .text shrank, inflating them slightly. The current deltas are 19.56% for the aligned case and 18.21% for the realigned case.
repo:   https://github.com/daniel-santos/gcc
branch: outline-msabi-xlogues-5_4_0
Thanks,
Daniel