[PATCH] testbot/TestAgent: Modify and document the connection timeout handling. - wine-devel

18 Sep 2019

The new model better handles the case where we need to wait for the VM
to boot before we can connect to the testagentd server.
Signed-off-by: Francois Gouget fgouget@codeweavers.com
---
 testbot/bin/LibvirtTool.pl           |   9 +-
 testbot/lib/WineTestBot/Config.pm    |  15 ++-
 testbot/lib/WineTestBot/TestAgent.pm | 169 ++++++++++++++++++++++++---
 testbot/lib/WineTestBot/VMs.pm       |   8 +-
 testbot/scripts/SetWinLocale         |   2 +-
 testbot/scripts/TestAgent            |  30 +++--
 6 files changed, 181 insertions(+), 52 deletions(-)

diff --git a/testbot/bin/LibvirtTool.pl b/testbot/bin/LibvirtTool.pl
index 5c3906503..f15bab70b 100755
--- a/testbot/bin/LibvirtTool.pl
+++ b/testbot/bin/LibvirtTool.pl
@@ -408,12 +408,11 @@ sub Revert()
   # The VM is now sleeping which may allow some tasks to run
   return 1 if (ChangeStatus("reverting", "sleeping"));
-  # Check the TestAgent connection. Note that this may take some time
-  # if the VM needs to boot first.
-  Debug(Elapsed($Start), " Trying the TestAgent connection\n");
-  LogMsg "Waiting for ". $VM->Name ." (up to ${WaitForToolsInVM}s per attempt)\n";
+  # Verify that the TestAgent server accepts connections
+  Debug(Elapsed($Start), " Verifying the TestAgent server\n");
+  LogMsg "Verifying the $VMKey TestAgent server\n";
   my $TA = $VM->GetAgent();
-  $TA->SetConnectTimeout($WaitForToolsInVM, undef, $WaitForToolsInVM);
+  $TA->SetConnectTimeout(undef, undef, $WaitForBoot);
   my $Success = $TA->Ping();
   $TA->Disconnect();
   if (!$Success)
diff --git a/testbot/lib/WineTestBot/Config.pm b/testbot/lib/WineTestBot/Config.pm
index 3d9214bae..3496c310e 100644
--- a/testbot/lib/WineTestBot/Config.pm
+++ b/testbot/lib/WineTestBot/Config.pm
@@ -28,7 +28,7 @@ WineTestBot::Config - Site-independent configuration settings
 use vars qw (@ISA @EXPORT @EXPORT_OK $UseSSL $LogDir $DataDir $BinDir
              %RepoURLs $DbDataSource $DbUsername $DbPassword $MaxRevertingVMs
              $MaxRevertsWhileRunningVMs $MaxActiveVMs $MaxRunningVMs
-             $MaxVMsWhenIdle $SleepAfterRevert $WaitForToolsInVM
+             $MaxVMsWhenIdle $SleepAfterRevert $WaitForBoot
              $VMToolTimeout $MaxVMErrors $MaxTaskTries $AdminEMail $RobotEMail
              $WinePatchToOverride $WinePatchCc
              $ExeBuildNativeTimeout $ExeBuildTestTimeout $ExeModuleTimeout
@@ -44,7 +44,7 @@ require Exporter;
 @ISA = qw(Exporter);
 @EXPORT = qw($UseSSL $LogDir $DataDir $BinDir %RepoURLs
              $MaxRevertingVMs $MaxRevertsWhileRunningVMs $MaxActiveVMs
-             $MaxRunningVMs $MaxVMsWhenIdle $SleepAfterRevert $WaitForToolsInVM
+             $MaxRunningVMs $MaxVMsWhenIdle $SleepAfterRevert $WaitForBoot
              $VMToolTimeout $MaxVMErrors $MaxTaskTries $AdminEMail
              $RobotEMail $WinePatchToOverride $WinePatchCc $SuiteTimeout
              $ExeBuildNativeTimeout $ExeBuildTestTimeout $ExeModuleTimeout
@@ -90,14 +90,13 @@ $MaxActiveVMs = 2;
 $MaxRunningVMs = 1;
 $MaxVMsWhenIdle = undef;
-# How long to wait for each of the 3 connection attempts to the VM's TestAgent
-# server after a revert (in seconds). If there are powered off snapshots this
-# must be long enough for the VM to boot up first.
-$WaitForToolsInVM = 30;
-# How long to let the VM settle down after the revert before starting a task on
+# How long to attempt to connect to the TestAgent while the VM is booting.
+$WaitForBoot = 90;
+# How long to let the VM settle down after a revert before starting a task on
 # it (in seconds).
 $SleepAfterRevert = 0;
-# Take into account $WaitForToolsInVM and $SleepAfterRevert
+# How long to wait before considering a VM operation is stuck. In particular
+# for reverts this should take into account the time it may need to boot.
 $VMToolTimeout = 6 * 60;
# After three consecutive failures to revert a VM, put it in maintenance mode.
diff --git a/testbot/lib/WineTestBot/TestAgent.pm b/testbot/lib/WineTestBot/TestAgent.pm
index d3d7137a5..8d7f456fd 100644
--- a/testbot/lib/WineTestBot/TestAgent.pm
+++ b/testbot/lib/WineTestBot/TestAgent.pm
@@ -101,9 +101,9 @@ sub new($$$;$)
     agenthost  => $Hostname,
     agentport  => $Port,
     connection => "$Hostname:$Port",
-    ctimeout   => 20,
-    cattempts  => 3,
-    cinterval  => 0,
+    conetimeout  => 20,
+    cminattempts => 2,
+    cmintimeout  => 10,
     timeout    => 0,
     fd         => undef,
     deadline   => undef,
@@ -140,28 +140,103 @@ sub Disconnect($)
   $self->{agentversion} = undef;
 }
+=pod
+=over 12
+
+=item C<SetConnectTimeout()>
+
+Configures how many times and for how long to attempt to connect to the server.
+
+=item OneTimeout
+
+OneTimeout specifies the timeout for one connection attempt in seconds.
+Zero means there will be no timeout. Undef means the value is not changed.
+
+=item MinAttempts
+
+MinAttempts specifies the minimum number of connection attempts. It must be a
+non-zero positive integer. Undef means the value is not changed.
+
+=item MinTimeout
+
+MinTimeout specifies the minimum period during which connection attempts should
+be made in seconds. Zero means there will be no minimum timeout. Undef means
+the value is not changed.
+
+This means connection attempts will be made until either one succeeds or
+MinTimeout seconds have elapsed, even if each attempt fails quickly such that
+more than $MinAttempts attempts are performed. Conversely, if each attempt
+takes a long time such that more MinTimeout is reached before MinAttempt
+connection attempts have been made, then attempts will continue until the
+minimum number of attempts is reached.
+
+Thus the worst case connection timeout is:
+
+   max($MinAttempts * $OneTimeout, $MinTimeout + $OneTimeout)
+
+=item Return value
+
+For each modified value (i.e. not undef), the old value is returned.
+
+=item Usage
+
+To perform a single connection attempt with a 20 second timeout:
+
+    $TA->SetConnectionTimeout(20, 1, 0);
+
+For a bit more robustness in case one expects short network disruptions:
+
+    $TA->SetConnectionTimeout(20, 2, 10);
+
+But if the server needs time to boot before accepting connections, then
+MinTimeout should be set to the amount of time this is expected to take. It
+would make sense to reset MinTimeout after the first RPC has completed since
+then a reconnection would not have to wait for the server to boot again:
+
+    my $OldMinTimeout = $TA->SetConnectionTimeout(undef, undef, 90);
+
+    ... first RPC ...
+
+    $TA->SetConnectionTimeout(undef, undef, $OldMinTimeout);
+
+=back
+=cut
+
 sub SetConnectTimeout($$;$$)
 {
-  my ($self, $Timeout, $Attempts, $Interval) = @_;
+  my ($self, $OneTimeout, $MinAttempts, $MinTimeout) = @_;
   my @Ret;
-  if (defined $Timeout)
+  if (defined $OneTimeout)
   {
-    push @Ret, $self->{ctimeout};
-    $self->{ctimeout} = $Timeout;
+    push @Ret, $self->{conetimeout};
+    $self->{conetimeout} = $OneTimeout;
   }
-  if (defined $Attempts)
+  if (defined $MinAttempts)
   {
-    push @Ret, $self->{cattempts};
-    $self->{cattempts} = $Attempts;
+    push @Ret, $self->{cminattempts};
+    $self->{cminattempts} = $MinAttempts;
   }
-  if (defined $Interval)
+  if (defined $MinTimeout)
   {
-    push @Ret, $self->{cinterval};
-    $self->{cinterval} = $Interval;
+    push @Ret, $self->{cmintimeout};
+    $self->{cmintimeout} = $MinTimeout;
   }
   return @Ret;
 }
+=pod
+=over 12
+
+=item C<SetTimeout()>
+
+Configures how long an individual RPC can take to complete (in seconds).
+
+Note that some operations like Wait() and Wait2() involve multiple RPCs. These
+have their own timeouts.
+
+=back
+=cut
+
 sub SetTimeout($$)
 {
   my ($self, $Timeout) = @_;
@@ -857,14 +932,16 @@ sub _Connect($)
   my $OldRPC = $self->{rpc};
   $self->{rpc} = ($self->{rpc} ? "$self->{rpc}/" : "") ."connect";
-  my $Start = time();
-  foreach my $Attempt (1..$self->{cattempts})
+  my $Attempt = 1;
+  my $MinDeadline = $self->{cmintimeout} ? time() + $self->{cmintimeout} : 0;
+  while (1)
   {
     my $Step = "initializing";
     eval
     {
       local $SIG{ALRM} = sub { die "timeout" };
-      $self->{deadline} = $self->{ctimeout} ? time() + $self->{ctimeout} : undef;
+      my $OneDeadline = $self->{conetimeout} ? time() + $self->{conetimeout} : 0;
+      $self->{deadline} = ($OneDeadline < $MinDeadline) ? $MinDeadline : $OneDeadline;
       $self->_SetAlarm();
if ($self->{tunnel})
@@ -945,8 +1022,13 @@ sub _Connect($)
       last;
     }
-    my $Remaining = $Attempt * $self->{cinterval} - (time() - $Start);
-    sleep($Remaining) if ($Remaining > 0);
+    $Attempt++;
+    last if ($Attempt > $self->{cminattempts} and $MinDeadline <= time() + 1);
+
+    # Wait at least 1 second between attempts so that if the error happens on
+    #the client-side (e.g. in case of an invalid ssh argument) we don't busy
+    # loop for $MinTimeout seconds.
+    sleep(1);
   }
   $self->{rpc} = $OldRPC;
   return undef;
@@ -1023,6 +1105,19 @@ sub _SendStringOrFile($$$$$$)
          $self->_RecvList('');
 }
+=pod
+=over 12
+
+=item C<SendFile()>
+
+Sends the $LocalPathName file and saves it as the $ServerPathName file on the
+server. If $Flags is set to $SENDFILE_EXE the file will be made executable.
+
+Note that the transfer must complete within the SetTimeout() limit.
+
+=back
+=cut
+
 sub SendFile($$$;$)
 {
   my ($self, $LocalPathName, $ServerPathName, $Flags) = @_;
@@ -1039,6 +1134,19 @@ sub SendFile($$$;$)
   return undef;
 }
+=pod
+=over 12
+
+=item C<SendFileFromString()>
+
+Sends the $Data string and saves it as the $ServerPathName file on the
+server. If $Flags is set to $SENDFILE_EXE the file will be made executable.
+
+Note that the transfer must complete within the SetTimeout() limit.
+
+=back
+=cut
+
 sub SendFileFromString($$$;$)
 {
   my ($self, $Data, $ServerPathName, $Flags) = @_;
@@ -1059,6 +1167,19 @@ sub _GetStringOrFile($$$)
                 $self->_RecvString('String', 'd'));
 }
+=pod
+=over 12
+
+=item C<GetFile()>
+
+Retrieves the $ServerPathName file from the server and saves it as
+$LocalPathName.
+
+Note that the transfer must complete within the SetTimeout() limit.
+
+=back
+=cut
+
 sub GetFile($$$)
 {
   my ($self, $ServerPathName, $LocalPathName) = @_;
@@ -1075,6 +1196,18 @@ sub GetFile($$$)
   return undef;
 }
+=pod
+=over 12
+
+=item C<GetFileToString()>
+
+Retrieves the $ServerPathName file from the server returns it as a string.
+
+Note that the transfer must complete within the SetTimeout() limit.
+
+=back
+=cut
+
 sub GetFileToString($$)
 {
   my ($self, $ServerPathName) = @_;
diff --git a/testbot/lib/WineTestBot/VMs.pm b/testbot/lib/WineTestBot/VMs.pm
index 2bf350435..d85df1d08 100644
--- a/testbot/lib/WineTestBot/VMs.pm
+++ b/testbot/lib/WineTestBot/VMs.pm
@@ -584,10 +584,10 @@ sub RunPowerOff($)
Reverts the VM so that it is ready to run jobs.
-Note that in addition to the hypervisor revert operation this implies checking
-that it responds to our commands ($WaitForToolsInVM) and possibly letting the
-VM settle down ($SleepAfterRevert). If this operation fails the administrator
-is notified and the VM is marked as offline.
+Note that in addition to the hypervisor revert operation this may imply waiting
+for the VM to boot, checking that the TestAgent server is accessible, and
+possibly letting the VM settle down ($SleepAfterRevert). If this operation
+fails the administrator is notified and the VM is put offline.
This operation can take a long time so it is performed in a separate process.
diff --git a/testbot/scripts/SetWinLocale b/testbot/scripts/SetWinLocale
index 2690dbb1f..ca750b792 100755
--- a/testbot/scripts/SetWinLocale
+++ b/testbot/scripts/SetWinLocale
@@ -843,7 +843,7 @@ if ($OptRefresh)
# Allow up to 10 minutes for the shutdown plus reboot.
   Debug(Elapsed($Start), " Waiting for Windows to boot\n");
-  $TA->SetConnectTimeout(20, 30);
+  $TA->SetConnectTimeout(undef, undef, $WaitForBoot);
   if (!$DryRun and !$TA->Ping())
   {
     FatalError("could not reconnect to $OptHostName after the reboot: ", $TA->GetLastError(), "\n");
diff --git a/testbot/scripts/TestAgent b/testbot/scripts/TestAgent
index 25fc0d426..b4ec582be 100755
--- a/testbot/scripts/TestAgent
+++ b/testbot/scripts/TestAgent
@@ -56,7 +56,7 @@ my ($Cmd, $Hostname, $LocalFilename, $ServerFilename, $PropName, $PropValue, @Rm
 my (@Run, $RunIn, $RunOut, $RunErr, $WaitPid);
 my $SendFlags = 0;
 my $RunFlags = 0;
-my ($Port, $ConnectTimeout, $ConnectAttempts, $ConnectInterval, $Timeout);
+my ($Port, $ConnectOneTimeout, $ConnectMinAttempts, $ConnectMinTimeout, $Timeout);
 my ($Keepalive, $TunnelOpt);
 my $Usage;
@@ -100,17 +100,17 @@ while (@ARGV)
     {
         $Port = check_opt_val($arg, $Port);
     }
-    elsif ($arg eq "--connect-timeout")
+    elsif ($arg eq "--connect-one-timeout")
     {
-        $ConnectTimeout = check_opt_val($arg, $ConnectTimeout);
+        $ConnectOneTimeout = check_opt_val($arg, $ConnectOneTimeout);
     }
-    elsif ($arg eq "--connect-attempts")
+    elsif ($arg eq "--connect-min-attempts")
     {
-        $ConnectAttempts = check_opt_val($arg, $ConnectAttempts);
+        $ConnectMinAttempts = check_opt_val($arg, $ConnectMinAttempts);
     }
-    elsif ($arg eq "--connect-interval")
+    elsif ($arg eq "--connect-min-timeout")
     {
-        $ConnectInterval = check_opt_val($arg, $ConnectInterval);
+        $ConnectMinTimeout = check_opt_val($arg, $ConnectMinTimeout);
     }
     elsif ($arg eq "--timeout")
     {
@@ -361,11 +361,12 @@ if (defined $Usage)
     print "                restarts it.\n";
     print "  <hostname>    Is the hostname of the server.\n";
     print "  --port <port> Use the specified port number instead of the default one.\n";
-    print "  --connect-timeout <timeout> Use the specified timeout (in seconds) when\n";
-    print "                connecting instead of the default one.\n";
-    print "  --connect-attempts <count> How many connection attempts to perform.\n";
-    print "  --connect-interval <time> The minimum interval (in seconds) between failed\n";
-    print "                connection attempts.\n";
+    print "  --connect-one-timeout <time> Specifies the timeout for one connection\n";
+    print "                attempt (in seconds).\n";
+    print "  --connect-min-attempts <count> Specifies the minimum number of connection\n";
+    print "                attempts.\n";
+    print "  --connect-min-timeout <time> The minimum period (in seconds) during which\n";
+    print "                connection attempts should be made.\n";
     print "  --timeout <timeout> Use the specified timeout (in seconds) instead of the\n";
     print "                default one for the operation.\n";
     print "  --keepalive <keepalive> How often (in seconds) the run and wait operations\n";
@@ -389,10 +390,7 @@ if ($TunnelOpt and $TunnelOpt =~ /^ssh:/)
 }
my $TA = TestAgent->new($Hostname, $AgentPort, $TunnelInfo);
-if (defined $ConnectTimeout or defined $ConnectAttempts or defined $ConnectInterval)
-{
-    $TA->SetConnectTimeout($ConnectTimeout, $ConnectAttempts, $ConnectInterval);
-}
+$TA->SetConnectTimeout($ConnectOneTimeout, $ConnectMinAttempts, $ConnectMinTimeout);
 $TA->SetTimeout($Timeout) if (defined $Timeout);
my $RC = 0;
-- 
2.20.1