Wine-Bug: https://bugs.winehq.org/show_bug.cgi?id=50453 Signed-off-by: Rémi Bernon rbernon@codeweavers.com --- dlls/ntdll/tests/exception.c | 27 ++++++++++++++------------- server/debugger.c | 1 + 2 files changed, 15 insertions(+), 13 deletions(-)
diff --git a/dlls/ntdll/tests/exception.c b/dlls/ntdll/tests/exception.c index ca338775351..202a939b76f 100644 --- a/dlls/ntdll/tests/exception.c +++ b/dlls/ntdll/tests/exception.c @@ -1053,7 +1053,7 @@ static void test_exceptions(void) ok( res == STATUS_SUCCESS, "NtSetContextThread failed with %x\n", res ); }
-static void test_debugger(void) +static void test_debugger(DWORD cont_status) { char cmdline[MAX_PATH]; PROCESS_INFORMATION pi; @@ -1081,7 +1081,7 @@ static void test_debugger(void)
do { - continuestatus = DBG_CONTINUE; + continuestatus = cont_status; ok(WaitForDebugEvent(&de, INFINITE), "reading debug event\n");
ret = ContinueDebugEvent(de.dwProcessId, de.dwThreadId, 0xdeadbeef); @@ -1091,7 +1091,7 @@ static void test_debugger(void) if (de.dwThreadId != pi.dwThreadId) { trace("event %d not coming from main thread, ignoring\n", de.dwDebugEventCode); - ContinueDebugEvent(de.dwProcessId, de.dwThreadId, DBG_CONTINUE); + ContinueDebugEvent(de.dwProcessId, de.dwThreadId, cont_status); continue; }
@@ -3288,7 +3288,7 @@ static void test_rtlraiseexception(void) run_rtlraiseexception_test(EXCEPTION_INVALID_HANDLE); }
-static void test_debugger(void) +static void test_debugger(DWORD cont_status) { char cmdline[MAX_PATH]; PROCESS_INFORMATION pi; @@ -3316,7 +3316,7 @@ static void test_debugger(void)
do { - continuestatus = DBG_CONTINUE; + continuestatus = cont_status; ok(WaitForDebugEvent(&de, INFINITE), "reading debug event\n");
ret = ContinueDebugEvent(de.dwProcessId, de.dwThreadId, 0xdeadbeef); @@ -3326,7 +3326,7 @@ static void test_debugger(void) if (de.dwThreadId != pi.dwThreadId) { trace("event %d not coming from main thread, ignoring\n", de.dwDebugEventCode); - ContinueDebugEvent(de.dwProcessId, de.dwThreadId, DBG_CONTINUE); + ContinueDebugEvent(de.dwProcessId, de.dwThreadId, cont_status); continue; }
@@ -4252,7 +4252,7 @@ static void test_thread_context(void) #undef COMPARE }
-static void test_debugger(void) +static void test_debugger(DWORD cont_status) { char cmdline[MAX_PATH]; PROCESS_INFORMATION pi; @@ -4280,7 +4280,7 @@ static void test_debugger(void)
do { - continuestatus = DBG_CONTINUE; + continuestatus = cont_status; ok(WaitForDebugEvent(&de, INFINITE), "reading debug event\n");
ret = ContinueDebugEvent(de.dwProcessId, de.dwThreadId, 0xdeadbeef); @@ -4290,7 +4290,7 @@ static void test_debugger(void) if (de.dwThreadId != pi.dwThreadId) { trace("event %d not coming from main thread, ignoring\n", de.dwDebugEventCode); - ContinueDebugEvent(de.dwProcessId, de.dwThreadId, DBG_CONTINUE); + ContinueDebugEvent(de.dwProcessId, de.dwThreadId, cont_status); continue; }
@@ -5500,7 +5500,7 @@ static void test_thread_context(void) #undef COMPARE }
-static void test_debugger(void) +static void test_debugger(DWORD cont_status) { char cmdline[MAX_PATH]; PROCESS_INFORMATION pi; @@ -5528,7 +5528,7 @@ static void test_debugger(void)
do { - continuestatus = DBG_CONTINUE; + continuestatus = cont_status; ok(WaitForDebugEvent(&de, INFINITE), "reading debug event\n");
ret = ContinueDebugEvent(de.dwProcessId, de.dwThreadId, 0xdeadbeef); @@ -5538,7 +5538,7 @@ static void test_debugger(void) if (de.dwThreadId != pi.dwThreadId) { trace("event %d not coming from main thread, ignoring\n", de.dwDebugEventCode); - ContinueDebugEvent(de.dwProcessId, de.dwThreadId, DBG_CONTINUE); + ContinueDebugEvent(de.dwProcessId, de.dwThreadId, cont_status); continue; }
@@ -8346,7 +8346,8 @@ START_TEST(exception)
#endif
- test_debugger(); + test_debugger(DBG_EXCEPTION_HANDLED); + test_debugger(DBG_CONTINUE); test_thread_context(); test_outputdebugstring(1, FALSE); test_ripevent(1); diff --git a/server/debugger.c b/server/debugger.c index e4a6c1e43a8..ee95d4ee326 100644 --- a/server/debugger.c +++ b/server/debugger.c @@ -677,6 +677,7 @@ DECL_HANDLER(continue_debug_event) struct process *process;
if (req->status != DBG_EXCEPTION_NOT_HANDLED && + req->status != DBG_EXCEPTION_HANDLED && req->status != DBG_CONTINUE && req->status != DBG_REPLY_LATER) {
Hi,
While running your changed tests, I think I found new failures. Being a bot and all I'm not very good at pattern recognition, so I might be wrong, but could you please double-check?
Full results can be found at: https://testbot.winehq.org/JobDetails.pl?Key=83809
Your paranoid android.
=== w10pro64 (testbot log) ===
An error occurred while waiting for the test to complete: the 1052 process does not exist or is not a child process
=== w10pro64 (testbot log) ===
An error occurred while waiting for the test to complete: the 880 process does not exist or is not a child process
=== w10pro64_ar (testbot log) ===
An error occurred while waiting for the test to complete: the 7716 process does not exist or is not a child process
=== w10pro64_he (testbot log) ===
An error occurred while waiting for the test to complete: the 7752 process does not exist or is not a child process
=== w10pro64_ja (testbot log) ===
An error occurred while waiting for the test to complete: the 7996 process does not exist or is not a child process
=== w10pro64_zh_CN (testbot log) ===
An error occurred while waiting for the test to complete: the 7900 process does not exist or is not a child process
On 1/6/21 3:55 PM, Marvin wrote:
Hi,
While running your changed tests, I think I found new failures. Being a bot and all I'm not very good at pattern recognition, so I might be wrong, but could you please double-check?
Full results can be found at: https://testbot.winehq.org/JobDetails.pl?Key=83809
Your paranoid android.
=== w10pro64 (testbot log) ===
An error occurred while waiting for the test to complete: the 1052 process does not exist or is not a child process
=== w10pro64 (testbot log) ===
An error occurred while waiting for the test to complete: the 880 process does not exist or is not a child process
=== w10pro64_ar (testbot log) ===
An error occurred while waiting for the test to complete: the 7716 process does not exist or is not a child process
=== w10pro64_he (testbot log) ===
An error occurred while waiting for the test to complete: the 7752 process does not exist or is not a child process
=== w10pro64_ja (testbot log) ===
An error occurred while waiting for the test to complete: the 7996 process does not exist or is not a child process
=== w10pro64_zh_CN (testbot log) ===
An error occurred while waiting for the test to complete: the 7900 process does not exist or is not a child process
I think w10pro64 has some issues, or something is already wrong with this specific test on this specific windows version. For instance, the same test with a no-op patch gives the same kind of failures here:
https://testbot.winehq.org/JobDetails.pl?Key=83808#k305
On Wed, 6 Jan 2021, Rémi Bernon wrote:
On 1/6/21 3:55 PM, Marvin wrote:
Hi,
While running your changed tests, I think I found new failures. Being a bot and all I'm not very good at pattern recognition, so I might be wrong, but could you please double-check?
Full results can be found at: https://testbot.winehq.org/JobDetails.pl?Key=83809
[...]
I think w10pro64 has some issues, or something is already wrong with this specific test on this specific windows version. For instance, the same test with a no-op patch gives the same kind of failures here:
Both indicate a Windows crash. The telltale signs:
* The "TestAgentd.exe was restarted. Did Windows reboot?" dialog in the final screenshot. [1]
* The "The test VM has crashed, rebooted or lost connectivity" message in the testbot log.
This reminds me of the previous issue caused by the 4.19.0-8 kernel [2]. It's the same KERNEL_SECURITY_CHECK_FAILURE BSOD. However vm4 is running the 4.19.0-13 kernel which should be exempt from this issue.
The patch I applied to avoid the crashes before [3] does not prevent crashes this time around but changes the BSOD to SYSTEM_SERVICE_EXCEPTION (which does not really help me).
I've tried to identify a commit or event that could explain why this issue is back and built the following timeline:
(Run day; Commit day; Commit id; Result/Subject; Job id) 06 06 4ac05a : WineTest crash in ntdll:exception (83814) 05 05 c0ca69 : WineTest crash in ntdll:exception (83791) 04 04 221fdb : WineTest crash in ntdll:exception (83774) 02 02 6.0-rc5: WineTest crash in ntdll:exception (83753) 30 30 784cb20: WineTest crash in ntdll:exception (83733) 29 29 c20d49 : WineTest crash in ntdll:exception (83724) 28 28 ff09f1 : WineTest crash in ntdll:exception (83701) 28 c414f2 : ntdll: Accept DBG_EXCEPTION_HANDLED parameter... 27 93c485 : ntdll: Clear AC flag in align check test. 28 26 6.0-rc4: WineTest crash in ntdll:exception (83681) 28 : TestBot Engine rescued from the 99 errors of hell 27 : Network is back (18:00)
24 : Network outage (18:00) 23 23 9d7a71 : WineTest no crash (83664) 23 : Power outage (16:00) 22 22 e27a95 : WineTest no crash (83648) 22 175a36 : ntdll: Always copy context... 21 21 3d2b42 : WineTest truncated at comctl32:datetime (83627) 18 18 6.0-rc3: WineTest no crash (83589) 17 71a433 : ntdll: Save unwind information... 17 17 0aa6f8 : WineTest no crash (83554) 16 16 ef876f : WineTest no crash (83529) 15 15 79e267 : WineTest no crash (83495)
15 : Reboot vm4 to the 4.19.0-13 kernel 14 14 04ddab : WineTest crash in ntdll:exception (83424) 14 : Rebuilt vm4 from backup (4.19.0-8, oops!), restored VMs
11 11 6.0-rc2: WineTest no crash (83364) 10 10 be4592 : WineTest no crash (83328) 09 09 76c9db : WineTest no crash (83306) 08 08 310019 : WineTest no crash (83240) 07 183a8c : ntdll: Restore non-volatile registers...
So it looks like things started going south again after the Christmas power / network outage. Yet I did not change anything in the VM configuration then.
So in case the power outage caused some corruption in the w10pro64 qcow2 [4] I restored the VM from backup but it still crashes the same way. Then I rebooted the host and it's still bad:
https://testbot.winehq.org/JobDetails.pl?Key=83847 https://testbot.winehq.org/JobDetails.pl?Key=83852
So I don't know what's up.
[1] The non-English locale screenshots are missing the "Windows rebooted" dialog which points to a bug in the LibvirtTool code creating these live snapshots.
[2] https://www.winehq.org/pipermail/wine-devel/2020-December/178570.html https://www.winehq.org/pipermail/wine-devel/2020-December/178431.html
[3] https://www.winehq.org/pipermail/wine-devel/2020-December/178398.html
[4] According to the timeline there was still one good run after the power outage but with the way Munin and the TestBot handle timezones I may have gotten the order wrong.
On Thu, 7 Jan 2021, Francois Gouget wrote: [...]
So I don't know what's up.
I tested a few more things:
* I ran the binaries from the successful 2020-12-09 and 23 runs. They now crash too. So that rules out a Wine-side change.
https://testbot.winehq.org/JobDetails.pl?Key=83854 https://testbot.winehq.org/JobDetails.pl?Key=83853
* I checked the host's unattended upgrade logs and there is no package upgrade between the 19th (python-lxml) and the 30th (python-apt-common python3-apt).
* debsums --changed does not report any modified file so I think that rules out disk corruption on the host.
* And in the guest I redid the checks mentioned in: https://www.winehq.org/pipermail/wine-devel/2020-December/178362.html
- The first sfc /scannow run found some file ownerships it did not like in the c:\windows drivers directories and in c:\ProgramData...\Start Menu and fixed them. Further runs did not report any error but Windows still crashes.
- chkdsk /f c: and the GUI filesystem checker report no error.
- The Windows Memory Diagnostics Tool reports no error.
Can we rule out some uninitialized memory or bad pointer issue in ntdll:exception?
Aaaand now ntdll:exception is crashing all 64-bit Windows versions, starting from Windows 8.
So on the 7th we had results for w864, w1064v1507, w1064v1607, w1064v1709, w1064v1809 and w1064 (2009):
https://testbot.winehq.org/JobDetails.pl?Key=83814 https://testbot.winehq.org/JobDetails.pl?Key=83817
And today they are all gone :-(
https://testbot.winehq.org/JobDetails.pl?Key=83862 https://testbot.winehq.org/JobDetails.pl?Key=83865
What survived is the 32-bit w8 VM, and the cw-gtx560 and cw-rx460 non-VM machines (Windows 8 to 10 1809).
Somehow my Windows 10 VM (fgtb-w10pro64) survived too (kernel 4.19.0-11 (instead of 4.19.0-13) and qemu 5.0-14).
I'll see if upgrading QEmu helps.
On Fri, 8 Jan 2021, Francois Gouget wrote: [...]
So on the 7th we had results for w864, w1064v1507, w1064v1607, w1064v1709, w1064v1809 and w1064 (2009):
[...]
What survived is the 32-bit w8 VM, and the cw-gtx560 and cw-rx460 non-VM machines (Windows 8 to 10 1809).
Somehow my Windows 10 VM (fgtb-w10pro64) survived too (kernel 4.19.0-11 (instead of 4.19.0-13) and qemu 5.0-14).
I found a combination that works: kernel 4.19.152-1 (4.19.0-12) + qemu 5.0-14.
Given that the combinations below don't work, I don't think this means that the bug is fixed so much as that we're avoiding the error by sheer luck.
In the past 24 hours I confirmed that with these kernel + qemu combinations, ntdll:exception causes Windows to crash:
kernel 4.19.146-1 (4.19.0-11) + qemu 5.0-14~bpo10+1 kernel 4.19.152-1 (4.19.0-12) + qemu 3.1+dfsg-8+deb10u8 kernel 4.19.160-2 (4.19.0-13) + qemu 3.1+dfsg-8+deb10u8 kernel 4.19.160-2 (4.19.0-13) + qemu 5.0-14~bpo10+1 kernel 5.9.6-1~bpo10+1 (5.9.0-0.bpo.2) + qemu 5.0-14~bpo10+1
So I updated vm3 and vm4 to this configuration but I left vm1 and vm2 alone for now since they don't have any impacted VM.
I figured it out... I think.
This issue happened before: https://bugs.winehq.org/show_bug.cgi?id=40240 https://bugs.launchpad.net/qemu/+bug/1658141
The Windows crash message was a bit different but the host had similar MSR_* errors on the console.
The workaround then was to run:
echo 1 >/sys/module/kvm/parameters/ignore_msrs
But I set up an init.d script to get it to survive across reboots and unfortunately that script's dependencies were incorrect so, at least recently, it got run before /sys was mounted. But not always. This explains the sometimes puzzling and somewhat inconsistent results I got. It also explains why a simple reboot on the 27th broke ntdll:exception.
Had I noticed the "/kvm/.../parameters/" part of that path, I might have realized that the right way to make this workaround permanent, at least on Debian, is to do:
echo options kvm ignore_msrs=1 >/etc/modprobe.d/local-qemu.conf
But, unlike before, that's not entirely sufficient: it does not work with QEmu 3.1; one has to upgrade QEmu to 5.0-14. I have done that on vm3 and vm4, restored the kernel to the latest 4.19.0-13 (it works with 5.9.0 too), and I have rerun WineTest on w864, w1064* and w10pro64.