The tasks themselves have a timeout which the corresponding scripts
enforce. However the scripts themselves may get stuck, typically due
to network problems. When that happens they can end up blocking the
whole TestBot. So make sure the TestBot engine itself can detect stuck
scripts and take corrective action.
Note that the detection is not very timely but will happen at the
latest in the SafetyNet() function. This means there will be at most a
10 minutes delay.
Signed-off-by: Francois Gouget <fgouget(a)codeweavers.com>
---
 testbot/bin/LibvirtTool.pl                  |  7 +++++-
 testbot/bin/WineRunBuild.pl                 |  1 +
 testbot/bin/WineRunReconfig.pl              |  1 +
 testbot/bin/WineRunTask.pl                  |  1 +
 testbot/ddl/update32.sql                    |  5 +++++
 testbot/ddl/winetestbot.sql                 | 25 +++++++++++----------
 testbot/doc/winetestbot-schema.dia          | 23 +++++++++++++++++++
 testbot/lib/WineTestBot/Config.pm           | 17 ++++++++------
 testbot/lib/WineTestBot/Engine/Scheduler.pm | 16 ++++++++++++-
 testbot/lib/WineTestBot/Tasks.pm            |  7 ++++--
 testbot/lib/WineTestBot/VMs.pm              | 11 ++++++---
 testbot/web/admin/VMDetails.pl              |  3 ++-
 12 files changed, 90 insertions(+), 27 deletions(-)
 create mode 100644 testbot/ddl/update32.sql
diff --git a/testbot/bin/LibvirtTool.pl b/testbot/bin/LibvirtTool.pl
index fa07d2d20..5d60dde1b 100755
--- a/testbot/bin/LibvirtTool.pl
+++ b/testbot/bin/LibvirtTool.pl
@@ -196,6 +196,7 @@ sub FatalError($)
   # Put the VM offline if nobody else modified its status before us
   $VM = CreateVMs()->GetItem($VMKey);
   $VM->Status("offline") if ($VM->Status eq $CurrentStatus);
+  $VM->ChildDeadline(undef);
   $VM->ChildPid(undef);
   my ($ErrProperty, $SaveErrMessage) = $VM->Save();
   if (defined $SaveErrMessage)
@@ -241,7 +242,11 @@ sub ChangeStatus($$;$)
   }
 
   $VM->Status($To);
-  $VM->ChildPid(undef) if ($Done);
+  if ($Done)
+  {
+    $VM->ChildDeadline(undef);
+    $VM->ChildPid(undef);
+  }
   my ($ErrProperty, $ErrMessage) = $VM->Save();
   if (defined $ErrMessage)
   {
diff --git a/testbot/bin/WineRunBuild.pl b/testbot/bin/WineRunBuild.pl
index 3a52dbd04..27b8be0e0 100755
--- a/testbot/bin/WineRunBuild.pl
+++ b/testbot/bin/WineRunBuild.pl
@@ -250,6 +250,7 @@ sub WrapUpAndExit($;$$)
   if ($VM->Status eq 'running')
   {
     $VM->Status($NewVMStatus);
+    $VM->ChildDeadline(undef);
     $VM->ChildPid(undef);
     $VM->Save();
   }
diff --git a/testbot/bin/WineRunReconfig.pl b/testbot/bin/WineRunReconfig.pl
index 77994a3a3..713712cec 100755
--- a/testbot/bin/WineRunReconfig.pl
+++ b/testbot/bin/WineRunReconfig.pl
@@ -251,6 +251,7 @@ sub WrapUpAndExit($;$$)
   if ($VM->Status eq 'running')
   {
     $VM->Status($NewVMStatus);
+    $VM->ChildDeadline(undef);
     $VM->ChildPid(undef);
     $VM->Save();
   }
diff --git a/testbot/bin/WineRunTask.pl b/testbot/bin/WineRunTask.pl
index 4f69a1e4e..231f34e0c 100755
--- a/testbot/bin/WineRunTask.pl
+++ b/testbot/bin/WineRunTask.pl
@@ -279,6 +279,7 @@ sub WrapUpAndExit($;$$$)
   if ($VM->Status eq 'running')
   {
     $VM->Status($NewVMStatus);
+    $VM->ChildDeadline(undef);
     $VM->ChildPid(undef);
     $VM->Save();
   }
diff --git a/testbot/ddl/update32.sql b/testbot/ddl/update32.sql
new file mode 100644
index 000000000..5864d63e0
--- /dev/null
+++ b/testbot/ddl/update32.sql
@@ -0,0 +1,5 @@
+USE winetestbot;
+
+ALTER TABLE VMs
+  ADD ChildDeadline DATETIME NULL
+      AFTER ChildPid;
diff --git a/testbot/ddl/winetestbot.sql b/testbot/ddl/winetestbot.sql
index 41740159b..89810ddc0 100644
--- a/testbot/ddl/winetestbot.sql
+++ b/testbot/ddl/winetestbot.sql
@@ -44,18 +44,19 @@ ENGINE=InnoDB DEFAULT CHARSET=utf8;
 
 CREATE TABLE VMs
 (
-  Name         VARCHAR(20)      NOT NULL,
-  SortOrder    INT(3)           NOT NULL,
-  Type         ENUM('win32', 'win64', 'build') NOT NULL,
-  Role         ENUM('extra', 'base', 'winetest', 'retired', 'deleted') NOT NULL,
-  Status       ENUM('dirty', 'reverting', 'sleeping', 'idle', 'running', 'off', 'offline', 'maintenance') NOT NULL,
-  ChildPid     INT(5)           NULL,
-  VirtURI      VARCHAR(64)      NOT NULL,
-  VirtDomain   VARCHAR(32)      NOT NULL,
-  IdleSnapshot VARCHAR(32)      NOT NULL,
-  Hostname     VARCHAR(64)      NOT NULL,
-  Description  VARCHAR(40)      NULL,
-  Details      VARCHAR(512)     NULL,
+  Name          VARCHAR(20)      NOT NULL,
+  SortOrder     INT(3)           NOT NULL,
+  Type          ENUM('win32', 'win64', 'build') NOT NULL,
+  Role          ENUM('extra', 'base', 'winetest', 'retired', 'deleted') NOT NULL,
+  Status        ENUM('dirty', 'reverting', 'sleeping', 'idle', 'running', 'off', 'offline', 'maintenance') NOT NULL,
+  ChildPid      INT(5)           NULL,
+  ChildDeadline DATETIME         NULL,
+  VirtURI       VARCHAR(64)      NOT NULL,
+  VirtDomain    VARCHAR(32)      NOT NULL,
+  IdleSnapshot  VARCHAR(32)      NOT NULL,
+  Hostname      VARCHAR(64)      NOT NULL,
+  Description   VARCHAR(40)      NULL,
+  Details       VARCHAR(512)     NULL,
   PRIMARY KEY (Name)
 )
 ENGINE=InnoDB DEFAULT CHARSET=utf8;
diff --git a/testbot/doc/winetestbot-schema.dia b/testbot/doc/winetestbot-schema.dia
index 7d18b7271..61a722a21 100644
--- a/testbot/doc/winetestbot-schema.dia
+++ b/testbot/doc/winetestbot-schema.dia
@@ -2475,6 +2475,29 @@
             <dia:string>##</dia:string>
           </dia:attribute>
         </dia:composite>
+        <dia:composite type="table_attribute">
+          <dia:attribute name="name">
+            <dia:string>#ChildDeadline#</dia:string>
+          </dia:attribute>
+          <dia:attribute name="type">
+            <dia:string>#DATETIME#</dia:string>
+          </dia:attribute>
+          <dia:attribute name="comment">
+            <dia:string>##</dia:string>
+          </dia:attribute>
+          <dia:attribute name="primary_key">
+            <dia:boolean val="false"/>
+          </dia:attribute>
+          <dia:attribute name="nullable">
+            <dia:boolean val="false"/>
+          </dia:attribute>
+          <dia:attribute name="unique">
+            <dia:boolean val="false"/>
+          </dia:attribute>
+          <dia:attribute name="default_value">
+            <dia:string>##</dia:string>
+          </dia:attribute>
+        </dia:composite>
         <dia:composite type="table_attribute">
           <dia:attribute name="name">
             <dia:string>#VirtURI#</dia:string>
diff --git a/testbot/lib/WineTestBot/Config.pm b/testbot/lib/WineTestBot/Config.pm
index 9052fc4d0..2eb4f45a8 100644
--- a/testbot/lib/WineTestBot/Config.pm
+++ b/testbot/lib/WineTestBot/Config.pm
@@ -28,12 +28,11 @@ WineTestBot::Config - Site-independent configuration settings
 use vars qw (@ISA @EXPORT @EXPORT_OK $UseSSL $LogDir $DataDir $BinDir
              $DbDataSource $DbUsername $DbPassword $MaxRevertingVMs
              $MaxRevertsWhileRunningVMs $MaxActiveVMs $MaxRunningVMs
-             $MaxVMsWhenIdle
-             $SleepAfterRevert $WaitForToolsInVM $MaxTaskTries $AdminEMail
-             $RobotEMail
+             $MaxVMsWhenIdle $SleepAfterRevert $WaitForToolsInVM
+             $VMToolTimeout $MaxTaskTries $AdminEMail $RobotEMail
              $WinePatchToOverride $WinePatchCc $SuiteTimeout $SingleTimeout
-             $BuildTimeout $ReconfigTimeout $TagPrefix $MaxUnitSize
-             $ProjectName $PatchesMailingList $LDAPServer
+             $BuildTimeout $ReconfigTimeout $TimeoutMargin $TagPrefix
+             $MaxUnitSize $ProjectName $PatchesMailingList $LDAPServer
              $LDAPBindDN $LDAPSearchBase $LDAPSearchFilter
              $LDAPRealNameAttribute $LDAPEMailAttribute $AgentPort $Tunnel
              $TunnelDefaults $PrettyHostNames $JobPurgeDays $JobArchiveDays
@@ -44,9 +43,9 @@ require Exporter;
 @EXPORT = qw($UseSSL $LogDir $DataDir $BinDir
              $MaxRevertingVMs $MaxRevertsWhileRunningVMs $MaxActiveVMs
              $MaxRunningVMs $MaxVMsWhenIdle $SleepAfterRevert $WaitForToolsInVM
-             $MaxTaskTries $AdminEMail
+             $VMToolTimeout $MaxTaskTries $AdminEMail
              $RobotEMail $WinePatchToOverride $WinePatchCc $SuiteTimeout
-             $SingleTimeout $BuildTimeout $ReconfigTimeout
+             $SingleTimeout $BuildTimeout $ReconfigTimeout $TimeoutMargin
              $TagPrefix $MaxUnitSize $ProjectName $PatchesMailingList
              $LDAPServer $LDAPBindDN $LDAPSearchBase $LDAPSearchFilter
              $LDAPRealNameAttribute $LDAPEMailAttribute $AgentPort $Tunnel
@@ -80,6 +79,8 @@ $WaitForToolsInVM = 30;
 # How long to let the VM settle down after the revert before starting a task on
 # it (in seconds).
 $SleepAfterRevert = 0;
+# Take into account $WaitForToolsInVM and $SleepAfterRevert
+$VMToolTimeout = 6 * 60;
 
 # How many times to run a test that fails before giving up.
 $MaxTaskTries = 3;
@@ -96,6 +97,8 @@ $BuildTimeout = 5 * 60;
 # How long to let a full recompilation run before forcibly shutting it down
 # (in seconds).
 $ReconfigTimeout = 45 * 60;
+# How much to add to the task timeout to account for file transfers, etc.
+$TimeoutMargin = 2 * 60;
 # Maximum amount of traces for a test unit.
 $MaxUnitSize = 32 * 1024;
 
diff --git a/testbot/lib/WineTestBot/Engine/Scheduler.pm b/testbot/lib/WineTestBot/Engine/Scheduler.pm
index 4eef3c95d..52a6cedef 100644
--- a/testbot/lib/WineTestBot/Engine/Scheduler.pm
+++ b/testbot/lib/WineTestBot/Engine/Scheduler.pm
@@ -244,6 +244,7 @@ sub _CheckAndClassifyVMs()
   # created by the scripts called from the scheduler.
   $Sched->{recordgroups}->Save();
 
+  my $Now = time();
   my $FoundVMErrors;
   # Count the VMs that are 'active', that is, that use resources on the host,
   # and those that are reverting. Also build a prioritized list of those that
@@ -261,7 +262,19 @@ sub _CheckAndClassifyVMs()
     my $Host = _GetSchedHost($Sched, $VM);
     if ($VM->HasRunningChild())
     {
-      if ($VM->Status =~ /^(?:dirty|running|reverting)$/)
+      if (defined $VM->ChildDeadline and $VM->ChildDeadline < $Now)
+      {
+        # The child process got stuck!
+        $FoundVMErrors = 1;
+        $VM->KillChild();
+        $VM->Status("dirty");
+        $VM->Save();
+        $VM->RecordResult($Sched->{records}, "boterror stuck process");
+        $Sched->{lambvms}->{$VMKey} = 1;
+        $Host->{dirty}++;
+        $Host->{active}++;
+      }
+      elsif ($VM->Status =~ /^(?:dirty|running|reverting)$/)
       {
         $Sched->{busyvms}->{$VMKey} = 1;
         $Host->{$VM->Status}++;
@@ -321,6 +334,7 @@ sub _CheckAndClassifyVMs()
         # The VM is missing its child process or it died unexpectedly. Mark
         # the VM dirty so a revert or shutdown brings it back to a known state.
         $FoundVMErrors = 1;
+        $VM->ChildDeadline(undef);
         $VM->ChildPid(undef);
         $VM->Status("dirty");
         $VM->Save();
diff --git a/testbot/lib/WineTestBot/Tasks.pm b/testbot/lib/WineTestBot/Tasks.pm
index 5121abb19..72dd3ff23 100644
--- a/testbot/lib/WineTestBot/Tasks.pm
+++ b/testbot/lib/WineTestBot/Tasks.pm
@@ -127,7 +127,9 @@ sub Run($$)
   my $Args = ["$BinDir/${ProjectName}Run$Script.pl", "--log-only",
               $JobId, $StepNo, $TaskNo];
 
-  my $ErrMessage = $self->VM->Run("running", $Args, \&_SetupTask, $self);
+  my $ErrMessage = $self->VM->Run("running", $Args,
+                                  $self->Timeout + $TimeoutMargin,
+                                  \&_SetupTask, $self);
   if (!$ErrMessage)
   {
     $self->Status("running");
@@ -159,7 +161,7 @@ sub UpdateStatus($$)
     my $TaskDir = $self->CreateDir();
     if (open TASKLOG, ">>$TaskDir/err")
     {
-      print TASKLOG "TestBot process died unexpectedly\n";
+      print TASKLOG "TestBot process got stuck or died unexpectedly\n";
       close TASKLOG;
     }
     umask($OldUMask);
@@ -173,6 +175,7 @@ sub UpdateStatus($$)
     if ($VM->Status eq "running")
     {
       $VM->Status('dirty');
+      $VM->ChildDeadline(undef);
       $VM->ChildPid(undef);
       $VM->Save();
       $VM->RecordResult(undef, "boterror process died");
diff --git a/testbot/lib/WineTestBot/VMs.pm b/testbot/lib/WineTestBot/VMs.pm
index 839986be9..9f5a2c78d 100644
--- a/testbot/lib/WineTestBot/VMs.pm
+++ b/testbot/lib/WineTestBot/VMs.pm
@@ -298,6 +298,7 @@ sub KillChild($)
     WineTestBot::Log::LogMsg("Killing child ". $self->ChildPid ."\n");
     kill("TERM", $self->ChildPid);
   }
+  $self->ChildDeadline(undef);
   $self->ChildPid(undef);
 }
 
@@ -329,9 +330,9 @@ sub OnSaved($)
   }
 }
 
-sub Run($$$$$)
+sub Run($$$$$$)
 {
-  my ($self, $NewStatus, $Args, $ChildSetup, $SetupData) = @_;
+  my ($self, $NewStatus, $Args, $ChildTimeout, $ChildSetup, $SetupData) = @_;
 
   my $Tool = basename($Args->[0]);
 
@@ -368,6 +369,7 @@ sub Run($$$$$)
     # Set the Status and ChildPid
     $self->Status($NewStatus);
     $self->ChildPid($Pid);
+    $self->ChildDeadline(time() + $ChildTimeout);
     my ($ErrProperty, $ErrMessage) = $self->Save();
     if ($ErrMessage)
     {
@@ -410,6 +412,7 @@ sub Run($$$$$)
 
   # Reset the Status and ChildPid since the exec failed
   $self->Status("offline");
+  $self->ChildDeadline(undef);
   $self->ChildPid(undef);
   my ($ErrProperty, $ErrMessage) = $self->Save();
   if ($ErrMessage)
@@ -433,7 +436,8 @@ sub _RunVMTool($$$)
 
   my $Tool = "LibvirtTool.pl";
   unshift @$Args, "$BinDir/$Tool";
-  return $self->Run($NewStatus, $Args, \&_SetupChild, undef);
+  my $Timeout = $NewStatus eq "offline" ? 3600 : $VMToolTimeout;
+  return $self->Run($NewStatus, $Args, $Timeout, \&_SetupChild, undef);
 }
 
 =pod
@@ -664,6 +668,7 @@ my @PropertyDescriptors = (
   CreateEnumPropertyDescriptor("Role", "VM Role", !1, 1, ['extra', 'base', 'winetest', 'retired', 'deleted']),
   CreateEnumPropertyDescriptor("Status", "Current status", !1, 1, ['dirty', 'reverting', 'sleeping', 'idle', 'running', 'off', 'offline', 'maintenance']),
   CreateBasicPropertyDescriptor("ChildPid", "Child process id", !1, !1, "N", 5),
+  CreateBasicPropertyDescriptor("ChildDeadline", "Child Deadline", !1, !1, "DT", 19),
   CreateBasicPropertyDescriptor("VirtURI", "LibVirt URI of the VM", !1, 1, "A", 64),
   CreateBasicPropertyDescriptor("VirtDomain", "LibVirt Domain for the VM", !1, 1, "A", 32),
   CreateBasicPropertyDescriptor("IdleSnapshot", "Name of idle snapshot", !1, 1, "A", 32),
diff --git a/testbot/web/admin/VMDetails.pl b/testbot/web/admin/VMDetails.pl
index f883884c7..93e92763a 100644
--- a/testbot/web/admin/VMDetails.pl
+++ b/testbot/web/admin/VMDetails.pl
@@ -39,7 +39,8 @@ sub DisplayProperty($$)
 {
   my ($self, $PropertyDescriptor) = @_;
 
-  return "" if ($PropertyDescriptor->GetName() eq "ChildPid");
+  my $PropertyName = $PropertyDescriptor->GetName();
+  return "" if ($PropertyName =~ /^(?:ChildPid|ChildDeadline)$/);
   return $self->SUPER::DisplayProperty($PropertyDescriptor);
 }
 
-- 
2.17.0