The Errors field definition only allows two digits which sounds like more than enough for a count of consecutive errors but can overflow in a few days if the VM hosts are knocked out, which then causes the scripts to die. It would be easy to increase the maximum number of digits but that would only postpone the issue so add VM::AddError() to make sure the field does not overflow.
Signed-off-by: Francois Gouget fgouget@codeweavers.com --- testbot/bin/LibvirtTool.pl | 3 +-- testbot/bin/WineRunBuild.pl | 3 +-- testbot/bin/WineRunReconfig.pl | 3 +-- testbot/bin/WineRunTask.pl | 3 +-- testbot/bin/WineRunWineTest.pl | 3 +-- testbot/lib/WineTestBot/Engine/Scheduler.pm | 6 ++---- testbot/lib/WineTestBot/VMs.pm | 9 +++++++++ 7 files changed, 16 insertions(+), 14 deletions(-)
diff --git a/testbot/bin/LibvirtTool.pl b/testbot/bin/LibvirtTool.pl index 47839b611d..a025085d44 100755 --- a/testbot/bin/LibvirtTool.pl +++ b/testbot/bin/LibvirtTool.pl @@ -189,8 +189,7 @@ sub FatalError($) } $VM->ChildDeadline(undef); $VM->ChildPid(undef); - my $Errors = ($VM->Errors || 0) + 1; - $VM->Errors($Errors); + my $Errors = $VM->AddError();
my ($ErrProperty, $SaveErrMessage) = $VM->Save(); if (defined $SaveErrMessage) diff --git a/testbot/bin/WineRunBuild.pl b/testbot/bin/WineRunBuild.pl index 8585882b7d..50696b7091 100755 --- a/testbot/bin/WineRunBuild.pl +++ b/testbot/bin/WineRunBuild.pl @@ -241,8 +241,7 @@ sub WrapUpAndExit($;$$$$) $VM->Status($NewVMStatus); if ($NewVMStatus eq 'offline') { - my $Errors = ($VM->Errors || 0) + 1; - $VM->Errors($Errors); + $VM->AddError(); } else { diff --git a/testbot/bin/WineRunReconfig.pl b/testbot/bin/WineRunReconfig.pl index 373e657dc4..b4035cb021 100755 --- a/testbot/bin/WineRunReconfig.pl +++ b/testbot/bin/WineRunReconfig.pl @@ -243,8 +243,7 @@ sub WrapUpAndExit($;$$$$) $VM->Status($NewVMStatus); if ($NewVMStatus eq 'offline') { - my $Errors = ($VM->Errors || 0) + 1; - $VM->Errors($Errors); + $VM->AddError(); } else { diff --git a/testbot/bin/WineRunTask.pl b/testbot/bin/WineRunTask.pl index 58e055ca8d..511bde0424 100755 --- a/testbot/bin/WineRunTask.pl +++ b/testbot/bin/WineRunTask.pl @@ -275,8 +275,7 @@ sub WrapUpAndExit($;$$$$) $VM->Status($NewVMStatus); if ($NewVMStatus eq 'offline') { - my $Errors = ($VM->Errors || 0) + 1; - $VM->Errors($Errors); + $VM->AddError(); } else { diff --git a/testbot/bin/WineRunWineTest.pl b/testbot/bin/WineRunWineTest.pl index c9a57bb86c..c1a37aec5f 100755 --- a/testbot/bin/WineRunWineTest.pl +++ b/testbot/bin/WineRunWineTest.pl @@ -272,8 +272,7 @@ sub WrapUpAndExit($;$$$$) $VM->Status($NewVMStatus); if ($NewVMStatus eq 'offline') { - my $Errors = ($VM->Errors || 0) + 1; - $VM->Errors($Errors); + $VM->AddError(); } else { diff --git a/testbot/lib/WineTestBot/Engine/Scheduler.pm b/testbot/lib/WineTestBot/Engine/Scheduler.pm index 800d136835..f606c0d07b 100644 --- a/testbot/lib/WineTestBot/Engine/Scheduler.pm +++ b/testbot/lib/WineTestBot/Engine/Scheduler.pm @@ -272,8 +272,7 @@ sub _CheckAndClassifyVMs() $FoundVMErrors = 1; if ($VM->Status eq "reverting" or $VM->Status eq "sleeping") { - my $Errors = ($VM->Errors || 0) + 1; - $VM->Errors($Errors); + my $Errors = $VM->AddError(); $VM->Status("offline"); NotifyAdministrator("Putting the $VMKey VM offline", "The last $Errors revert operations timed out.\n\n". @@ -360,8 +359,7 @@ sub _CheckAndClassifyVMs() $FoundVMErrors = 1; $VM->ChildDeadline(undef); $VM->ChildPid(undef); - my $Errors = ($VM->Errors || 0) + 1; - $VM->Errors($Errors); + my $Errors = $VM->AddError(); if ($Errors >= $MaxVMErrors) { $VM->Status("offline"); diff --git a/testbot/lib/WineTestBot/VMs.pm b/testbot/lib/WineTestBot/VMs.pm index 596499acd0..60a52b86c5 100644 --- a/testbot/lib/WineTestBot/VMs.pm +++ b/testbot/lib/WineTestBot/VMs.pm @@ -264,6 +264,15 @@ sub Status($;$) return $NewStatus; }
+sub AddError($) +{ + my ($self) = @_; + + my $Errors = ($self->Errors || 0) + 1; + $self->Errors($Errors) if ($Errors <= 99); + return $Errors; +} + =pod =over 12
Tweak the notification to reflect that. This also means the LibvirtTool monitor should not announce that the VM is working again since it may very well still be broken, for instance if the required snapshot is missing. Instead announce the VM works again only when a revert was successful.
So now the administrator gets: * $MaxVMErrors-1 messages that the VM is offline * 1 message that the VM needs maintenance * 1 message when the VM works again
Signed-off-by: Francois Gouget fgouget@codeweavers.com --- testbot/bin/LibvirtTool.pl | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-)
diff --git a/testbot/bin/LibvirtTool.pl b/testbot/bin/LibvirtTool.pl index a025085d44..c027d6bd68 100755 --- a/testbot/bin/LibvirtTool.pl +++ b/testbot/bin/LibvirtTool.pl @@ -196,19 +196,22 @@ sub FatalError($) { LogMsg "Could not put the $VMKey VM offline: $SaveErrMessage ($ErrProperty)\n"; } - elsif ($Errors >= $MaxVMErrors) + elsif ($Errors < $MaxVMErrors) { - NotifyAdministrator("The $VMKey VM needs maintenance", - "Got $Errors consecutive errors working on the $VMKey VM:\n". + NotifyAdministrator("Putting the $VMKey VM offline ($Errors)", + "Could not perform the $Action operation on the $VMKey VM:\n". "\n$ErrMessage\n". - "It probably needs fixing to get back online."); + "Got $Errors consecutive errors.\n". + "The VM has been put offline and the TestBot will try to regain access to it."); } - else + elsif ($Errors == $MaxVMErrors) { - NotifyAdministrator("Putting the $VMKey VM offline", - "Could not perform the $Action operation on the $VMKey VM:\n". + NotifyAdministrator("The $VMKey VM needs maintenance", + "Got $Errors consecutive errors working on the $VMKey VM:\n". "\n$ErrMessage\n". - "The VM has been put offline and the TestBot will try to regain access to it."); + "It probably needs fixing to get back online.\n". + "* Spacing out attempts to revive it.\n". + "* No further error notification will be sent until it starts working again.\n"); } exit 1; } @@ -405,10 +408,11 @@ sub Monitor() } if ($IsReady) { + # The VM may well still be broken, for instance if the required snapshot + # does not exist. But only revert can tell. For this reason, let revert + # announce when the VM works again. return 1 if (ChangeStatus("offline", "off", "done")); - NotifyAdministrator("The $VMKey VM is working again", - "The $VMKey VM started working again after ". - PrettyElapsed($Start) ."."); + LogMsg("Switching $VMKey from offline to off after ". PrettyElapsed($Start) ."."); return 0; }
@@ -821,9 +825,15 @@ sub Revert() Debug(Elapsed($Start), " Sleeping ${Sleep}s\n"); LogMsg "Letting $VMKey settle down for ${Sleep}s\n"; sleep($Sleep); - CheckBootCount($TA); - return ChangeStatus($CurrentStatus, "idle", "done"); + + return 1 if (ChangeStatus($CurrentStatus, "idle", "done")); + if ($VM->Errors) + { + NotifyAdministrator("The $VMKey VM is working again", + "After ". $VM->Errors ." consecutive errors $VMKey was successfully reverted to $Config->{snapshot} in ". PrettyElapsed($Start) ." and is now idle."); + } + return 0; }