Module: tools
Branch: master
Commit: 3f26a883efbff95dde8c25fd6bef67f35bd85e3e
URL: https://source.winehq.org/git/tools.git/?a=commit;h=3f26a883efbff95dde8c25fd6bef67f35bd85e3e
Author: Francois Gouget <fgouget(a)codeweavers.com>
Date: Fri Sep 20 15:04:53 2019 +0200
testbot/Engine: Put a VM offline if it repeatedly times out during revert.
Putting it in maintenance mode makes it unusable until an administrator
looks at it, which could take a long time, even though quite often the
timeout is just caused by high load on the host.
Now that LibvirtTool 'rate-limits' how often it puts VMs that keep
having errors back online, we can simply put such VMs offline and let
LibvirtTool deal with them.
Signed-off-by: Francois Gouget <fgouget(a)codeweavers.com>
Signed-off-by: Alexandre Julliard <julliard(a)winehq.org>
---
testbot/lib/WineTestBot/Engine/Scheduler.pm | 27 ++++++++++++---------------
1 file changed, 12 insertions(+), 15 deletions(-)
diff --git a/testbot/lib/WineTestBot/Engine/Scheduler.pm b/testbot/lib/WineTestBot/Engine/Scheduler.pm
index 569dff7..e9a3ff9 100644
--- a/testbot/lib/WineTestBot/Engine/Scheduler.pm
+++ b/testbot/lib/WineTestBot/Engine/Scheduler.pm
@@ -270,31 +270,28 @@ sub _CheckAndClassifyVMs()
{
# The child process got stuck!
$FoundVMErrors = 1;
- my $NewStatus = "dirty";
if ($VM->Status eq "reverting" or $VM->Status eq "sleeping")
{
my $Errors = ($VM->Errors || 0) + 1;
$VM->Errors($Errors);
- if ($Errors >= $MaxVMErrors)
- {
- $NewStatus = "maintenance";
- NotifyAdministrator("Putting the $VMKey VM in maintenance mode",
- "The last $Errors revert operations timed out.\n\n".
- "No further operation will be attempted until an administrator has put\n".
- "the VM back online.");
- $Sched->{busyvms}->{$VMKey} = 1;
- }
+ $VM->Status("offline");
+ NotifyAdministrator("Putting the $VMKey VM offline",
+ "The last $Errors revert operations timed out.\n\n".
+ "This may be because of some transient load on the VM host but if not\n".
+ "an administrator should look into it. In any case the TestBot will try\n".
+ "to regain access to the VM.");
+ $Sched->{busyvms}->{$VMKey} = 1;
}
- $VM->Status($NewStatus);
- $VM->KillChild();
- $VM->Save();
- $VM->RecordResult($Sched->{records}, "boterror stuck process");
- if ($NewStatus eq "dirty")
+ else
{
+ $VM->Status("dirty");
$Sched->{lambvms}->{$VMKey} = 1;
$Host->{dirty}++;
$Host->{active}++;
}
+ $VM->KillChild();
+ $VM->Save();
+ $VM->RecordResult($Sched->{records}, "boterror stuck process");
}
elsif ($VM->Status =~ /^(?:dirty|running|reverting)$/)
{
Module: tools
Branch: master
Commit: 654623afdefa049d729ea47e8d852623afd0e3c3
URL: https://source.winehq.org/git/tools.git/?a=commit;h=654623afdefa049d729ea47e8d852623afd0e3c3
Author: Francois Gouget <fgouget(a)codeweavers.com>
Date: Fri Sep 20 15:04:37 2019 +0200
testbot/LibvirtTool: Don't put VMs in maintenance even after repeated errors.
The VM may have gone offline simply because of repeated revert
timeouts. This can happen if there is too much load on the host, which
is a transient issue. But the administrator may not be available right
away to investigate and put the VM back online.
So warn the administrator, keep monitoring the offline VM but space out
the attempts to use the VM so the host can try running tasks on other
VMs.
Signed-off-by: Francois Gouget <fgouget(a)codeweavers.com>
Signed-off-by: Alexandre Julliard <julliard(a)winehq.org>
---
testbot/bin/LibvirtTool.pl | 43 ++++++++++++++++++++++++++++---------------
1 file changed, 28 insertions(+), 15 deletions(-)
diff --git a/testbot/bin/LibvirtTool.pl b/testbot/bin/LibvirtTool.pl
index 60e868e..0801c99 100755
--- a/testbot/bin/LibvirtTool.pl
+++ b/testbot/bin/LibvirtTool.pl
@@ -169,10 +169,6 @@ sub FatalError($)
# Get the up-to-date VM status
$VM = CreateVMs()->GetItem($VMKey);
- # Put the VM offline or mark it for maintenance
- my $Errors = ($VM->Errors || 0) + 1;
- my $NewStatus = $Errors < $MaxVMErrors ? "offline" : "maintenance";
-
if ($VM->Status eq "maintenance")
{
# Still proceed with changing the non-Status fields and notifying the
@@ -185,10 +181,11 @@ sub FatalError($)
}
else
{
- $VM->Status($NewStatus);
+ $VM->Status("offline");
}
$VM->ChildDeadline(undef);
$VM->ChildPid(undef);
+ my $Errors = ($VM->Errors || 0) + 1;
$VM->Errors($Errors);
my ($ErrProperty, $SaveErrMessage) = $VM->Save();
@@ -196,19 +193,19 @@ sub FatalError($)
{
LogMsg "Could not put the $VMKey VM offline: $SaveErrMessage ($ErrProperty)\n";
}
- elsif ($NewStatus eq "offline")
+ elsif ($Errors >= $MaxVMErrors)
{
- NotifyAdministrator("Putting the $VMKey VM offline",
- "Could not perform the $Action operation on the $VMKey VM:\n".
+ NotifyAdministrator("The $VMKey VM needs maintenance",
+ "Got $Errors consecutive errors working on the $VMKey VM:\n".
"\n$ErrMessage\n".
- "The VM has been put offline and the TestBot will try to regain access to it.");
+ "It probably needs fixing to get back online.");
}
- elsif ($NewStatus eq "maintenance")
+ else
{
- NotifyAdministrator("The $VMKey VM needs maintenance",
- "Got $Errors consecutive errors working on the $VMKey VM:\n".
+ NotifyAdministrator("Putting the $VMKey VM offline",
+ "Could not perform the $Action operation on the $VMKey VM:\n".
"\n$ErrMessage\n".
- "An administrator needs to look at it and to put it back online.");
+ "The VM has been put offline and the TestBot will try to regain access to it.");
}
exit 1;
}
@@ -259,6 +256,14 @@ sub ChangeStatus($$;$)
sub Monitor()
{
+ # Still try recovering the VM in case of repeated errors, but space out
+ # attempts to not keep the host busy with a broken VM. Note that after
+ # 1 hour the monitor process gets killed and replaced (to deal with stuck
+ # monitor processes) but even so the VM will be checked once per hour.
+ my $Interval = ($VM->Errors || 0) >= $MaxVMErrors ? 1860 : 60;
+ my $NextTry = time() + $Interval;
+ Debug(Elapsed($Start), " Checking $VMKey in ${Interval}s\n");
+
$CurrentStatus = "offline";
while (1)
{
@@ -281,6 +286,14 @@ sub Monitor()
"to ". $VM->Status ." after ". PrettyElapsed($Start) .".");
return 0;
}
+ my $Sleep = $NextTry - time();
+ if ($Sleep > 0)
+ {
+ # Check that the VM still needs monitoring at least once per minute.
+ $Sleep = 60 if ($Sleep > 60);
+ sleep($Sleep);
+ next;
+ }
my $IsReady = $VM->GetDomain()->IsReady();
if ($IsReady and $VM->GetDomain()->IsPoweredOn())
@@ -301,8 +314,8 @@ sub Monitor()
return 0;
}
- Debug(Elapsed($Start), " $VMKey is still unreachable\n");
- sleep(60);
+ Debug(Elapsed($Start), " $VMKey is still busy / unreachable, trying again in ${Interval}s\n");
+ $NextTry = time() + $Interval;
}
}