The tasks themselves have a timeout which the corresponding scripts
enforce. However the scripts themselves may get stuck, typically due
to network problems. When that happens they can end up blocking the
whole TestBot. So make sure the TestBot engine itself can detect stuck
scripts and take corrective action.
Note that the detection is not very timely but will happen at the
latest in the SafetyNet() function. This means there will be at most a
10 minutes delay.
Signed-off-by: Francois Gouget <fgouget(a)codeweavers.com>
---
testbot/bin/LibvirtTool.pl | 7 +++++-
testbot/bin/WineRunBuild.pl | 1 +
testbot/bin/WineRunReconfig.pl | 1 +
testbot/bin/WineRunTask.pl | 1 +
testbot/ddl/update32.sql | 5 +++++
testbot/ddl/winetestbot.sql | 25 +++++++++++----------
testbot/doc/winetestbot-schema.dia | 23 +++++++++++++++++++
testbot/lib/WineTestBot/Config.pm | 17 ++++++++------
testbot/lib/WineTestBot/Engine/Scheduler.pm | 16 ++++++++++++-
testbot/lib/WineTestBot/Tasks.pm | 7 ++++--
testbot/lib/WineTestBot/VMs.pm | 11 ++++++---
testbot/web/admin/VMDetails.pl | 3 ++-
12 files changed, 90 insertions(+), 27 deletions(-)
create mode 100644 testbot/ddl/update32.sql
diff --git a/testbot/bin/LibvirtTool.pl b/testbot/bin/LibvirtTool.pl
index fa07d2d20..5d60dde1b 100755
--- a/testbot/bin/LibvirtTool.pl
+++ b/testbot/bin/LibvirtTool.pl
@@ -196,6 +196,7 @@ sub FatalError($)
# Put the VM offline if nobody else modified its status before us
$VM = CreateVMs()->GetItem($VMKey);
$VM->Status("offline") if ($VM->Status eq $CurrentStatus);
+ $VM->ChildDeadline(undef);
$VM->ChildPid(undef);
my ($ErrProperty, $SaveErrMessage) = $VM->Save();
if (defined $SaveErrMessage)
@@ -241,7 +242,11 @@ sub ChangeStatus($$;$)
}
$VM->Status($To);
- $VM->ChildPid(undef) if ($Done);
+ if ($Done)
+ {
+ $VM->ChildDeadline(undef);
+ $VM->ChildPid(undef);
+ }
my ($ErrProperty, $ErrMessage) = $VM->Save();
if (defined $ErrMessage)
{
diff --git a/testbot/bin/WineRunBuild.pl b/testbot/bin/WineRunBuild.pl
index 3a52dbd04..27b8be0e0 100755
--- a/testbot/bin/WineRunBuild.pl
+++ b/testbot/bin/WineRunBuild.pl
@@ -250,6 +250,7 @@ sub WrapUpAndExit($;$$)
if ($VM->Status eq 'running')
{
$VM->Status($NewVMStatus);
+ $VM->ChildDeadline(undef);
$VM->ChildPid(undef);
$VM->Save();
}
diff --git a/testbot/bin/WineRunReconfig.pl b/testbot/bin/WineRunReconfig.pl
index 77994a3a3..713712cec 100755
--- a/testbot/bin/WineRunReconfig.pl
+++ b/testbot/bin/WineRunReconfig.pl
@@ -251,6 +251,7 @@ sub WrapUpAndExit($;$$)
if ($VM->Status eq 'running')
{
$VM->Status($NewVMStatus);
+ $VM->ChildDeadline(undef);
$VM->ChildPid(undef);
$VM->Save();
}
diff --git a/testbot/bin/WineRunTask.pl b/testbot/bin/WineRunTask.pl
index 4f69a1e4e..231f34e0c 100755
--- a/testbot/bin/WineRunTask.pl
+++ b/testbot/bin/WineRunTask.pl
@@ -279,6 +279,7 @@ sub WrapUpAndExit($;$$$)
if ($VM->Status eq 'running')
{
$VM->Status($NewVMStatus);
+ $VM->ChildDeadline(undef);
$VM->ChildPid(undef);
$VM->Save();
}
diff --git a/testbot/ddl/update32.sql b/testbot/ddl/update32.sql
new file mode 100644
index 000000000..5864d63e0
--- /dev/null
+++ b/testbot/ddl/update32.sql
@@ -0,0 +1,5 @@
+USE winetestbot;
+
+ALTER TABLE VMs
+ ADD ChildDeadline DATETIME NULL
+ AFTER ChildPid;
diff --git a/testbot/ddl/winetestbot.sql b/testbot/ddl/winetestbot.sql
index 41740159b..89810ddc0 100644
--- a/testbot/ddl/winetestbot.sql
+++ b/testbot/ddl/winetestbot.sql
@@ -44,18 +44,19 @@ ENGINE=InnoDB DEFAULT CHARSET=utf8;
CREATE TABLE VMs
(
- Name VARCHAR(20) NOT NULL,
- SortOrder INT(3) NOT NULL,
- Type ENUM('win32', 'win64', 'build') NOT NULL,
- Role ENUM('extra', 'base', 'winetest', 'retired', 'deleted') NOT NULL,
- Status ENUM('dirty', 'reverting', 'sleeping', 'idle', 'running', 'off', 'offline', 'maintenance') NOT NULL,
- ChildPid INT(5) NULL,
- VirtURI VARCHAR(64) NOT NULL,
- VirtDomain VARCHAR(32) NOT NULL,
- IdleSnapshot VARCHAR(32) NOT NULL,
- Hostname VARCHAR(64) NOT NULL,
- Description VARCHAR(40) NULL,
- Details VARCHAR(512) NULL,
+ Name VARCHAR(20) NOT NULL,
+ SortOrder INT(3) NOT NULL,
+ Type ENUM('win32', 'win64', 'build') NOT NULL,
+ Role ENUM('extra', 'base', 'winetest', 'retired', 'deleted') NOT NULL,
+ Status ENUM('dirty', 'reverting', 'sleeping', 'idle', 'running', 'off', 'offline', 'maintenance') NOT NULL,
+ ChildPid INT(5) NULL,
+ ChildDeadline DATETIME NULL,
+ VirtURI VARCHAR(64) NOT NULL,
+ VirtDomain VARCHAR(32) NOT NULL,
+ IdleSnapshot VARCHAR(32) NOT NULL,
+ Hostname VARCHAR(64) NOT NULL,
+ Description VARCHAR(40) NULL,
+ Details VARCHAR(512) NULL,
PRIMARY KEY (Name)
)
ENGINE=InnoDB DEFAULT CHARSET=utf8;
diff --git a/testbot/doc/winetestbot-schema.dia b/testbot/doc/winetestbot-schema.dia
index 7d18b7271..61a722a21 100644
--- a/testbot/doc/winetestbot-schema.dia
+++ b/testbot/doc/winetestbot-schema.dia
@@ -2475,6 +2475,29 @@
<dia:string>##</dia:string>
</dia:attribute>
</dia:composite>
+ <dia:composite type="table_attribute">
+ <dia:attribute name="name">
+ <dia:string>#ChildDeadline#</dia:string>
+ </dia:attribute>
+ <dia:attribute name="type">
+ <dia:string>#DATETIME#</dia:string>
+ </dia:attribute>
+ <dia:attribute name="comment">
+ <dia:string>##</dia:string>
+ </dia:attribute>
+ <dia:attribute name="primary_key">
+ <dia:boolean val="false"/>
+ </dia:attribute>
+ <dia:attribute name="nullable">
+ <dia:boolean val="false"/>
+ </dia:attribute>
+ <dia:attribute name="unique">
+ <dia:boolean val="false"/>
+ </dia:attribute>
+ <dia:attribute name="default_value">
+ <dia:string>##</dia:string>
+ </dia:attribute>
+ </dia:composite>
<dia:composite type="table_attribute">
<dia:attribute name="name">
<dia:string>#VirtURI#</dia:string>
diff --git a/testbot/lib/WineTestBot/Config.pm b/testbot/lib/WineTestBot/Config.pm
index 9052fc4d0..2eb4f45a8 100644
--- a/testbot/lib/WineTestBot/Config.pm
+++ b/testbot/lib/WineTestBot/Config.pm
@@ -28,12 +28,11 @@ WineTestBot::Config - Site-independent configuration settings
use vars qw (@ISA @EXPORT @EXPORT_OK $UseSSL $LogDir $DataDir $BinDir
$DbDataSource $DbUsername $DbPassword $MaxRevertingVMs
$MaxRevertsWhileRunningVMs $MaxActiveVMs $MaxRunningVMs
- $MaxVMsWhenIdle
- $SleepAfterRevert $WaitForToolsInVM $MaxTaskTries $AdminEMail
- $RobotEMail
+ $MaxVMsWhenIdle $SleepAfterRevert $WaitForToolsInVM
+ $VMToolTimeout $MaxTaskTries $AdminEMail $RobotEMail
$WinePatchToOverride $WinePatchCc $SuiteTimeout $SingleTimeout
- $BuildTimeout $ReconfigTimeout $TagPrefix $MaxUnitSize
- $ProjectName $PatchesMailingList $LDAPServer
+ $BuildTimeout $ReconfigTimeout $TimeoutMargin $TagPrefix
+ $MaxUnitSize $ProjectName $PatchesMailingList $LDAPServer
$LDAPBindDN $LDAPSearchBase $LDAPSearchFilter
$LDAPRealNameAttribute $LDAPEMailAttribute $AgentPort $Tunnel
$TunnelDefaults $PrettyHostNames $JobPurgeDays $JobArchiveDays
@@ -44,9 +43,9 @@ require Exporter;
@EXPORT = qw($UseSSL $LogDir $DataDir $BinDir
$MaxRevertingVMs $MaxRevertsWhileRunningVMs $MaxActiveVMs
$MaxRunningVMs $MaxVMsWhenIdle $SleepAfterRevert $WaitForToolsInVM
- $MaxTaskTries $AdminEMail
+ $VMToolTimeout $MaxTaskTries $AdminEMail
$RobotEMail $WinePatchToOverride $WinePatchCc $SuiteTimeout
- $SingleTimeout $BuildTimeout $ReconfigTimeout
+ $SingleTimeout $BuildTimeout $ReconfigTimeout $TimeoutMargin
$TagPrefix $MaxUnitSize $ProjectName $PatchesMailingList
$LDAPServer $LDAPBindDN $LDAPSearchBase $LDAPSearchFilter
$LDAPRealNameAttribute $LDAPEMailAttribute $AgentPort $Tunnel
@@ -80,6 +79,8 @@ $WaitForToolsInVM = 30;
# How long to let the VM settle down after the revert before starting a task on
# it (in seconds).
$SleepAfterRevert = 0;
+# Take into account $WaitForToolsInVM and $SleepAfterRevert
+$VMToolTimeout = 6 * 60;
# How many times to run a test that fails before giving up.
$MaxTaskTries = 3;
@@ -96,6 +97,8 @@ $BuildTimeout = 5 * 60;
# How long to let a full recompilation run before forcibly shutting it down
# (in seconds).
$ReconfigTimeout = 45 * 60;
+# How much to add to the task timeout to account for file transfers, etc.
+$TimeoutMargin = 2 * 60;
# Maximum amount of traces for a test unit.
$MaxUnitSize = 32 * 1024;
diff --git a/testbot/lib/WineTestBot/Engine/Scheduler.pm b/testbot/lib/WineTestBot/Engine/Scheduler.pm
index 4eef3c95d..52a6cedef 100644
--- a/testbot/lib/WineTestBot/Engine/Scheduler.pm
+++ b/testbot/lib/WineTestBot/Engine/Scheduler.pm
@@ -244,6 +244,7 @@ sub _CheckAndClassifyVMs()
# created by the scripts called from the scheduler.
$Sched->{recordgroups}->Save();
+ my $Now = time();
my $FoundVMErrors;
# Count the VMs that are 'active', that is, that use resources on the host,
# and those that are reverting. Also build a prioritized list of those that
@@ -261,7 +262,19 @@ sub _CheckAndClassifyVMs()
my $Host = _GetSchedHost($Sched, $VM);
if ($VM->HasRunningChild())
{
- if ($VM->Status =~ /^(?:dirty|running|reverting)$/)
+ if (defined $VM->ChildDeadline and $VM->ChildDeadline < $Now)
+ {
+ # The child process got stuck!
+ $FoundVMErrors = 1;
+ $VM->KillChild();
+ $VM->Status("dirty");
+ $VM->Save();
+ $VM->RecordResult($Sched->{records}, "boterror stuck process");
+ $Sched->{lambvms}->{$VMKey} = 1;
+ $Host->{dirty}++;
+ $Host->{active}++;
+ }
+ elsif ($VM->Status =~ /^(?:dirty|running|reverting)$/)
{
$Sched->{busyvms}->{$VMKey} = 1;
$Host->{$VM->Status}++;
@@ -321,6 +334,7 @@ sub _CheckAndClassifyVMs()
# The VM is missing its child process or it died unexpectedly. Mark
# the VM dirty so a revert or shutdown brings it back to a known state.
$FoundVMErrors = 1;
+ $VM->ChildDeadline(undef);
$VM->ChildPid(undef);
$VM->Status("dirty");
$VM->Save();
diff --git a/testbot/lib/WineTestBot/Tasks.pm b/testbot/lib/WineTestBot/Tasks.pm
index 5121abb19..72dd3ff23 100644
--- a/testbot/lib/WineTestBot/Tasks.pm
+++ b/testbot/lib/WineTestBot/Tasks.pm
@@ -127,7 +127,9 @@ sub Run($$)
my $Args = ["$BinDir/${ProjectName}Run$Script.pl", "--log-only",
$JobId, $StepNo, $TaskNo];
- my $ErrMessage = $self->VM->Run("running", $Args, \&_SetupTask, $self);
+ my $ErrMessage = $self->VM->Run("running", $Args,
+ $self->Timeout + $TimeoutMargin,
+ \&_SetupTask, $self);
if (!$ErrMessage)
{
$self->Status("running");
@@ -159,7 +161,7 @@ sub UpdateStatus($$)
my $TaskDir = $self->CreateDir();
if (open TASKLOG, ">>$TaskDir/err")
{
- print TASKLOG "TestBot process died unexpectedly\n";
+ print TASKLOG "TestBot process got stuck or died unexpectedly\n";
close TASKLOG;
}
umask($OldUMask);
@@ -173,6 +175,7 @@ sub UpdateStatus($$)
if ($VM->Status eq "running")
{
$VM->Status('dirty');
+ $VM->ChildDeadline(undef);
$VM->ChildPid(undef);
$VM->Save();
$VM->RecordResult(undef, "boterror process died");
diff --git a/testbot/lib/WineTestBot/VMs.pm b/testbot/lib/WineTestBot/VMs.pm
index 839986be9..9f5a2c78d 100644
--- a/testbot/lib/WineTestBot/VMs.pm
+++ b/testbot/lib/WineTestBot/VMs.pm
@@ -298,6 +298,7 @@ sub KillChild($)
WineTestBot::Log::LogMsg("Killing child ". $self->ChildPid ."\n");
kill("TERM", $self->ChildPid);
}
+ $self->ChildDeadline(undef);
$self->ChildPid(undef);
}
@@ -329,9 +330,9 @@ sub OnSaved($)
}
}
-sub Run($$$$$)
+sub Run($$$$$$)
{
- my ($self, $NewStatus, $Args, $ChildSetup, $SetupData) = @_;
+ my ($self, $NewStatus, $Args, $ChildTimeout, $ChildSetup, $SetupData) = @_;
my $Tool = basename($Args->[0]);
@@ -368,6 +369,7 @@ sub Run($$$$$)
# Set the Status and ChildPid
$self->Status($NewStatus);
$self->ChildPid($Pid);
+ $self->ChildDeadline(time() + $ChildTimeout);
my ($ErrProperty, $ErrMessage) = $self->Save();
if ($ErrMessage)
{
@@ -410,6 +412,7 @@ sub Run($$$$$)
# Reset the Status and ChildPid since the exec failed
$self->Status("offline");
+ $self->ChildDeadline(undef);
$self->ChildPid(undef);
my ($ErrProperty, $ErrMessage) = $self->Save();
if ($ErrMessage)
@@ -433,7 +436,8 @@ sub _RunVMTool($$$)
my $Tool = "LibvirtTool.pl";
unshift @$Args, "$BinDir/$Tool";
- return $self->Run($NewStatus, $Args, \&_SetupChild, undef);
+ my $Timeout = $NewStatus eq "offline" ? 3600 : $VMToolTimeout;
+ return $self->Run($NewStatus, $Args, $Timeout, \&_SetupChild, undef);
}
=pod
@@ -664,6 +668,7 @@ my @PropertyDescriptors = (
CreateEnumPropertyDescriptor("Role", "VM Role", !1, 1, ['extra', 'base', 'winetest', 'retired', 'deleted']),
CreateEnumPropertyDescriptor("Status", "Current status", !1, 1, ['dirty', 'reverting', 'sleeping', 'idle', 'running', 'off', 'offline', 'maintenance']),
CreateBasicPropertyDescriptor("ChildPid", "Child process id", !1, !1, "N", 5),
+ CreateBasicPropertyDescriptor("ChildDeadline", "Child Deadline", !1, !1, "DT", 19),
CreateBasicPropertyDescriptor("VirtURI", "LibVirt URI of the VM", !1, 1, "A", 64),
CreateBasicPropertyDescriptor("VirtDomain", "LibVirt Domain for the VM", !1, 1, "A", 32),
CreateBasicPropertyDescriptor("IdleSnapshot", "Name of idle snapshot", !1, 1, "A", 32),
diff --git a/testbot/web/admin/VMDetails.pl b/testbot/web/admin/VMDetails.pl
index f883884c7..93e92763a 100644
--- a/testbot/web/admin/VMDetails.pl
+++ b/testbot/web/admin/VMDetails.pl
@@ -39,7 +39,8 @@ sub DisplayProperty($$)
{
my ($self, $PropertyDescriptor) = @_;
- return "" if ($PropertyDescriptor->GetName() eq "ChildPid");
+ my $PropertyName = $PropertyDescriptor->GetName();
+ return "" if ($PropertyName =~ /^(?:ChildPid|ChildDeadline)$/);
return $self->SUPER::DisplayProperty($PropertyDescriptor);
}
--
2.17.0