LDRD is an ARMv7 instruction that loads two consecutive 32-bit words in one operation. A single LDRD saves one clock cycle compared to two LDRs.
The optimization introduced by this commit may not significantly affect performance, since the final LDM instruction may have to wait for the previous load instruction(s) to free up the load pipeline. However, future modifications that reduce the load pressure from the succeeding instructions can benefit from this optimization. Also, shortened code leads to less time spent on instruction fetch and more efficient I-cache utilization.
Running llvm-mca --timeline on the old code reports:

                    01234567
Index     0123456789

[0,0]     DeER .    .    . .   tst.w r12, #2
[0,1]     D--R .    .    . .   it ne
[0,2]     .DeeeeER  .    . .   ldmne.w r8, {r0, r1, r2, r3}
[0,3]     . D===eeeeER   . .   ldr.w lr, [r8, #60]
[0,4]     . D====eeeeER  . .   ldr.w sp, [r8, #56]
[0,5]     .  DeE------R  . .   add.w r8, r8, #16
[0,6]     .   D===eeeeeeeeER   ldm.w r8, {r4, r5, r6, r7, r8, r9, r10, r11, r12, pc}
Running llvm-mca --timeline on the new code reports:

                    01234567
Index     0123456789

[0,0]     DeER .    .    . .   tst.w r12, #2
[0,1]     D--R .    .    . .   it ne
[0,2]     .DeeeeER  .    . .   ldmne.w r8, {r0, r1, r2, r3}
[0,3]     . D===eeeeER   . .   ldrd sp, lr, [r8, #56]
[0,4]     .  DeE-----R   . .   add.w r8, r8, #16
[0,5]     .   D===eeeeeeeeER   ldm.w r8, {r4, r5, r6, r7, r8, r9, r10, r11, r12, pc}
Signed-off-by: Jinoh Kang <jinoh.kang.kr@gmail.com>
---
 dlls/ntdll/unix/signal_arm.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/dlls/ntdll/unix/signal_arm.c b/dlls/ntdll/unix/signal_arm.c index 8c8750153df..c63f669668c 100644 --- a/dlls/ntdll/unix/signal_arm.c +++ b/dlls/ntdll/unix/signal_arm.c @@ -1208,8 +1208,7 @@ __ASM_GLOBAL_FUNC( __wine_syscall_dispatcher, "tst ip, #2\n\t" /* CONTEXT_INTEGER */ "it ne\n\t" "ldmne r8, {r0-r3}\n\t" - "ldr lr, [r8, #0x3c]\n\t" - "ldr sp, [r8, #0x38]\n\t" + "ldrd sp, lr, [r8, #0x38]\n\t" "add r8, r8, #0x10\n\t" "ldm r8, {r4-r12,pc}\n" "5:\tmovw r0, #0x000d\n\t" /* STATUS_INVALID_PARAMETER */