VM revert loops typically happen when a VM is misconfigured such that
the TestBot fails to access the TestAgent daemon after reverting it.
This results in the VM being put offline until it is accessible again
through Libvirt which is the case so that it is immediately put back
online and reverted again leading to a new error.
With this patch the VM is put in maintenance mode for an administrator
to look at if it has too many consecutive errors.
Signed-off-by: Francois Gouget <fgouget(a)codeweavers.com>
---
This requires a database upgrade.
testbot/bin/LibvirtTool.pl | 30 +++++++++++++++++++++++++-----
testbot/ddl/update33.sql | 5 +++++
testbot/ddl/winetestbot.sql | 1 +
testbot/doc/winetestbot-schema.dia | 23 +++++++++++++++++++++++
testbot/lib/WineTestBot/Config.pm | 7 +++++--
testbot/lib/WineTestBot/VMs.pm | 1 +
testbot/web/admin/VMDetails.pl | 4 +++-
7 files changed, 63 insertions(+), 8 deletions(-)
create mode 100644 testbot/ddl/update33.sql
diff --git a/testbot/bin/LibvirtTool.pl b/testbot/bin/LibvirtTool.pl
index f9a43bb77..91fe05358 100755
--- a/testbot/bin/LibvirtTool.pl
+++ b/testbot/bin/LibvirtTool.pl
@@ -193,23 +193,42 @@ sub FatalError($)
my ($ErrMessage) = @_;
Error $ErrMessage;
- # Put the VM offline if nobody else modified its status before us
+ # Put the VM offline or mark it for maintenance
+ my $Errors = ($VM->Errors || 0) + 1;
+ my $NewStatus = $Errors < $MaxVMErrors ? "offline" : "maintenance";
+
$VM = CreateVMs()->GetItem($VMKey);
- $VM->Status("offline") if ($VM->Status eq $CurrentStatus);
- $VM->ChildDeadline(undef);
- $VM->ChildPid(undef);
+ # Put the VM offline if nobody else modified its status before us
+ if ($VM->Status eq $CurrentStatus)
+ {
+ $VM->Status($NewStatus);
+ $VM->ChildDeadline(undef);
+ $VM->ChildPid(undef);
+ $VM->Errors($Errors);
+ }
+ else
+ {
+ $NewStatus = "";
+ }
my ($ErrProperty, $SaveErrMessage) = $VM->Save();
if (defined $SaveErrMessage)
{
LogMsg "Could not put the $VMKey VM offline: $SaveErrMessage ($ErrProperty)\n";
}
- elsif ($VM->Status eq "offline")
+ elsif ($NewStatus eq "offline")
{
NotifyAdministrator("Putting the $VMKey VM offline",
"Could not perform the $Action operation on the $VMKey VM:\n".
"\n$ErrMessage\n".
"The VM has been put offline and the TestBot will try to regain access to it.");
}
+ elsif ($NewStatus eq "maintenance")
+ {
+ NotifyAdministrator("The $VMKey VM needs maintenance",
+ "Got ". $VM->Errors ." consecutive errors working on the $VMKey VM:\n".
+ "\n$ErrMessage\n".
+ "An administrator needs to look at it and to put it back online.");
+ }
exit 1;
}
@@ -244,6 +263,7 @@ sub ChangeStatus($$;$)
$VM->Status($To);
if ($Done)
{
+ $VM->Errors(undef) if ($To eq "idle");
$VM->ChildDeadline(undef);
$VM->ChildPid(undef);
}
diff --git a/testbot/ddl/update33.sql b/testbot/ddl/update33.sql
new file mode 100644
index 000000000..7239f1dba
--- /dev/null
+++ b/testbot/ddl/update33.sql
@@ -0,0 +1,5 @@
+USE winetestbot;
+
+ALTER TABLE VMs
+ ADD Errors INT(2) NULL
+ AFTER Status;
diff --git a/testbot/ddl/winetestbot.sql b/testbot/ddl/winetestbot.sql
index 89810ddc0..32eab4e3e 100644
--- a/testbot/ddl/winetestbot.sql
+++ b/testbot/ddl/winetestbot.sql
@@ -49,6 +49,7 @@ CREATE TABLE VMs
Type ENUM('win32', 'win64', 'build') NOT NULL,
Role ENUM('extra', 'base', 'winetest', 'retired', 'deleted') NOT NULL,
Status ENUM('dirty', 'reverting', 'sleeping', 'idle', 'running', 'off', 'offline', 'maintenance') NOT NULL,
+ Errors INT(2) NULL,
ChildPid INT(5) NULL,
ChildDeadline DATETIME NULL,
VirtURI VARCHAR(64) NOT NULL,
diff --git a/testbot/doc/winetestbot-schema.dia b/testbot/doc/winetestbot-schema.dia
index 61a722a21..2c720f2d2 100644
--- a/testbot/doc/winetestbot-schema.dia
+++ b/testbot/doc/winetestbot-schema.dia
@@ -2452,6 +2452,29 @@
<dia:string>##</dia:string>
</dia:attribute>
</dia:composite>
+ <dia:composite type="table_attribute">
+ <dia:attribute name="name">
+ <dia:string>#Errors#</dia:string>
+ </dia:attribute>
+ <dia:attribute name="type">
+ <dia:string>#INT(2)#</dia:string>
+ </dia:attribute>
+ <dia:attribute name="comment">
+ <dia:string>##</dia:string>
+ </dia:attribute>
+ <dia:attribute name="primary_key">
+ <dia:boolean val="false"/>
+ </dia:attribute>
+ <dia:attribute name="nullable">
+ <dia:boolean val="false"/>
+ </dia:attribute>
+ <dia:attribute name="unique">
+ <dia:boolean val="false"/>
+ </dia:attribute>
+ <dia:attribute name="default_value">
+ <dia:string>##</dia:string>
+ </dia:attribute>
+ </dia:composite>
<dia:composite type="table_attribute">
<dia:attribute name="name">
<dia:string>#ChildPid#</dia:string>
diff --git a/testbot/lib/WineTestBot/Config.pm b/testbot/lib/WineTestBot/Config.pm
index 2eb4f45a8..4f4c8d82a 100644
--- a/testbot/lib/WineTestBot/Config.pm
+++ b/testbot/lib/WineTestBot/Config.pm
@@ -29,7 +29,7 @@ use vars qw (@ISA @EXPORT @EXPORT_OK $UseSSL $LogDir $DataDir $BinDir
$DbDataSource $DbUsername $DbPassword $MaxRevertingVMs
$MaxRevertsWhileRunningVMs $MaxActiveVMs $MaxRunningVMs
$MaxVMsWhenIdle $SleepAfterRevert $WaitForToolsInVM
- $VMToolTimeout $MaxTaskTries $AdminEMail $RobotEMail
+ $VMToolTimeout $MaxVMErrors $MaxTaskTries $AdminEMail $RobotEMail
$WinePatchToOverride $WinePatchCc $SuiteTimeout $SingleTimeout
$BuildTimeout $ReconfigTimeout $TimeoutMargin $TagPrefix
$MaxUnitSize $ProjectName $PatchesMailingList $LDAPServer
@@ -43,7 +43,7 @@ require Exporter;
@EXPORT = qw($UseSSL $LogDir $DataDir $BinDir
$MaxRevertingVMs $MaxRevertsWhileRunningVMs $MaxActiveVMs
$MaxRunningVMs $MaxVMsWhenIdle $SleepAfterRevert $WaitForToolsInVM
- $VMToolTimeout $MaxTaskTries $AdminEMail
+ $VMToolTimeout $MaxVMErrors $MaxTaskTries $AdminEMail
$RobotEMail $WinePatchToOverride $WinePatchCc $SuiteTimeout
$SingleTimeout $BuildTimeout $ReconfigTimeout $TimeoutMargin
$TagPrefix $MaxUnitSize $ProjectName $PatchesMailingList
@@ -82,6 +82,9 @@ $SleepAfterRevert = 0;
# Take into account $WaitForToolsInVM and $SleepAfterRevert
$VMToolTimeout = 6 * 60;
+# After three consecutive failures to revert a VM, put it in maintenance mode.
+$MaxVMErrors = 3;
+
# How many times to run a test that fails before giving up.
$MaxTaskTries = 3;
diff --git a/testbot/lib/WineTestBot/VMs.pm b/testbot/lib/WineTestBot/VMs.pm
index 9f5a2c78d..a29eebbba 100644
--- a/testbot/lib/WineTestBot/VMs.pm
+++ b/testbot/lib/WineTestBot/VMs.pm
@@ -667,6 +667,7 @@ my @PropertyDescriptors = (
CreateEnumPropertyDescriptor("Type", "Type of VM", !1, 1, ['win32', 'win64', 'build']),
CreateEnumPropertyDescriptor("Role", "VM Role", !1, 1, ['extra', 'base', 'winetest', 'retired', 'deleted']),
CreateEnumPropertyDescriptor("Status", "Current status", !1, 1, ['dirty', 'reverting', 'sleeping', 'idle', 'running', 'off', 'offline', 'maintenance']),
+ CreateBasicPropertyDescriptor("Errors", "Errors", !1, !1, "N", 2),
CreateBasicPropertyDescriptor("ChildPid", "Child process id", !1, !1, "N", 5),
CreateBasicPropertyDescriptor("ChildDeadline", "Child Deadline", !1, !1, "DT", 19),
CreateBasicPropertyDescriptor("VirtURI", "LibVirt URI of the VM", !1, 1, "A", 64),
diff --git a/testbot/web/admin/VMDetails.pl b/testbot/web/admin/VMDetails.pl
index 93e92763a..057147157 100644
--- a/testbot/web/admin/VMDetails.pl
+++ b/testbot/web/admin/VMDetails.pl
@@ -40,7 +40,7 @@ sub DisplayProperty($$)
my ($self, $PropertyDescriptor) = @_;
my $PropertyName = $PropertyDescriptor->GetName();
- return "" if ($PropertyName =~ /^(?:ChildPid|ChildDeadline)$/);
+ return "" if ($PropertyName =~ /^(?:ChildPid|ChildDeadline|Errors)$/);
return $self->SUPER::DisplayProperty($PropertyDescriptor);
}
@@ -53,6 +53,8 @@ sub Save($)
if ($OldStatus ne $self->{Item}->Status)
{
+ # The administrator action resets the consecutive error count
+ $self->{Item}->Errors(undef);
my ($ErrProperty, $ErrMessage) = $self->{Item}->Validate();
if (!defined $ErrMessage)
{
--
2.17.0