[PATCH] testbot/TestAgent: Modify and document the connection timeout handling.
The new model better handles the case where we need to wait for the VM to boot before we can connect to the testagentd server. Signed-off-by: Francois Gouget <fgouget(a)codeweavers.com> --- testbot/bin/LibvirtTool.pl | 9 +- testbot/lib/WineTestBot/Config.pm | 15 ++- testbot/lib/WineTestBot/TestAgent.pm | 169 ++++++++++++++++++++++++--- testbot/lib/WineTestBot/VMs.pm | 8 +- testbot/scripts/SetWinLocale | 2 +- testbot/scripts/TestAgent | 30 +++-- 6 files changed, 181 insertions(+), 52 deletions(-) diff --git a/testbot/bin/LibvirtTool.pl b/testbot/bin/LibvirtTool.pl index 5c3906503..f15bab70b 100755 --- a/testbot/bin/LibvirtTool.pl +++ b/testbot/bin/LibvirtTool.pl @@ -408,12 +408,11 @@ sub Revert() # The VM is now sleeping which may allow some tasks to run return 1 if (ChangeStatus("reverting", "sleeping")); - # Check the TestAgent connection. Note that this may take some time - # if the VM needs to boot first. - Debug(Elapsed($Start), " Trying the TestAgent connection\n"); - LogMsg "Waiting for ". $VM->Name ." (up to ${WaitForToolsInVM}s per attempt)\n"; + # Verify that the TestAgent server accepts connections + Debug(Elapsed($Start), " Verifying the TestAgent server\n"); + LogMsg "Verifying the $VMKey TestAgent server\n"; my $TA = $VM->GetAgent(); - $TA->SetConnectTimeout($WaitForToolsInVM, undef, $WaitForToolsInVM); + $TA->SetConnectTimeout(undef, undef, $WaitForBoot); my $Success = $TA->Ping(); $TA->Disconnect(); if (!$Success) diff --git a/testbot/lib/WineTestBot/Config.pm b/testbot/lib/WineTestBot/Config.pm index 3d9214bae..3496c310e 100644 --- a/testbot/lib/WineTestBot/Config.pm +++ b/testbot/lib/WineTestBot/Config.pm @@ -28,7 +28,7 @@ WineTestBot::Config - Site-independent configuration settings use vars qw (@ISA @EXPORT @EXPORT_OK $UseSSL $LogDir $DataDir $BinDir %RepoURLs $DbDataSource $DbUsername $DbPassword $MaxRevertingVMs $MaxRevertsWhileRunningVMs $MaxActiveVMs $MaxRunningVMs - $MaxVMsWhenIdle $SleepAfterRevert $WaitForToolsInVM + $MaxVMsWhenIdle $SleepAfterRevert $WaitForBoot $VMToolTimeout $MaxVMErrors $MaxTaskTries $AdminEMail $RobotEMail $WinePatchToOverride $WinePatchCc $ExeBuildNativeTimeout $ExeBuildTestTimeout $ExeModuleTimeout @@ -44,7 +44,7 @@ require Exporter; @ISA = qw(Exporter); @EXPORT = qw($UseSSL $LogDir $DataDir $BinDir %RepoURLs $MaxRevertingVMs $MaxRevertsWhileRunningVMs $MaxActiveVMs - $MaxRunningVMs $MaxVMsWhenIdle $SleepAfterRevert $WaitForToolsInVM + $MaxRunningVMs $MaxVMsWhenIdle $SleepAfterRevert $WaitForBoot $VMToolTimeout $MaxVMErrors $MaxTaskTries $AdminEMail $RobotEMail $WinePatchToOverride $WinePatchCc $SuiteTimeout $ExeBuildNativeTimeout $ExeBuildTestTimeout $ExeModuleTimeout @@ -90,14 +90,13 @@ $MaxActiveVMs = 2; $MaxRunningVMs = 1; $MaxVMsWhenIdle = undef; -# How long to wait for each of the 3 connection attempts to the VM's TestAgent -# server after a revert (in seconds). If there are powered off snapshots this -# must be long enough for the VM to boot up first. -$WaitForToolsInVM = 30; -# How long to let the VM settle down after the revert before starting a task on +# How long to attempt to connect to the TestAgent while the VM is booting. +$WaitForBoot = 90; +# How long to let the VM settle down after a revert before starting a task on # it (in seconds). $SleepAfterRevert = 0; -# Take into account $WaitForToolsInVM and $SleepAfterRevert +# How long to wait before considering a VM operation is stuck. In particular +# for reverts this should take into account the time it may need to boot. $VMToolTimeout = 6 * 60; # After three consecutive failures to revert a VM, put it in maintenance mode. diff --git a/testbot/lib/WineTestBot/TestAgent.pm b/testbot/lib/WineTestBot/TestAgent.pm index d3d7137a5..8d7f456fd 100644 --- a/testbot/lib/WineTestBot/TestAgent.pm +++ b/testbot/lib/WineTestBot/TestAgent.pm @@ -101,9 +101,9 @@ sub new($$$;$) agenthost => $Hostname, agentport => $Port, connection => "$Hostname:$Port", - ctimeout => 20, - cattempts => 3, - cinterval => 0, + conetimeout => 20, + cminattempts => 2, + cmintimeout => 10, timeout => 0, fd => undef, deadline => undef, @@ -140,28 +140,103 @@ sub Disconnect($) $self->{agentversion} = undef; } +=pod +=over 12 + +=item C<SetConnectTimeout()> + +Configures how many times and for how long to attempt to connect to the server. + +=item OneTimeout + +OneTimeout specifies the timeout for one connection attempt in seconds. +Zero means there will be no timeout. Undef means the value is not changed. + +=item MinAttempts + +MinAttempts specifies the minimum number of connection attempts. It must be a +non-zero positive integer. Undef means the value is not changed. + +=item MinTimeout + +MinTimeout specifies the minimum period during which connection attempts should +be made in seconds. Zero means there will be no minimum timeout. Undef means +the value is not changed. + +This means connection attempts will be made until either one succeeds or +MinTimeout seconds have elapsed, even if each attempt fails quickly such that +more than $MinAttempts attempts are performed. Conversely, if each attempt +takes a long time such that more MinTimeout is reached before MinAttempt +connection attempts have been made, then attempts will continue until the +minimum number of attempts is reached. + +Thus the worst case connection timeout is: + + max($MinAttempts * $OneTimeout, $MinTimeout + $OneTimeout) + +=item Return value + +For each modified value (i.e. not undef), the old value is returned. + +=item Usage + +To perform a single connection attempt with a 20 second timeout: + + $TA->SetConnectionTimeout(20, 1, 0); + +For a bit more robustness in case one expects short network disruptions: + + $TA->SetConnectionTimeout(20, 2, 10); + +But if the server needs time to boot before accepting connections, then +MinTimeout should be set to the amount of time this is expected to take. It +would make sense to reset MinTimeout after the first RPC has completed since +then a reconnection would not have to wait for the server to boot again: + + my $OldMinTimeout = $TA->SetConnectionTimeout(undef, undef, 90); + + ... first RPC ... + + $TA->SetConnectionTimeout(undef, undef, $OldMinTimeout); + +=back +=cut + sub SetConnectTimeout($$;$$) { - my ($self, $Timeout, $Attempts, $Interval) = @_; + my ($self, $OneTimeout, $MinAttempts, $MinTimeout) = @_; my @Ret; - if (defined $Timeout) + if (defined $OneTimeout) { - push @Ret, $self->{ctimeout}; - $self->{ctimeout} = $Timeout; + push @Ret, $self->{conetimeout}; + $self->{conetimeout} = $OneTimeout; } - if (defined $Attempts) + if (defined $MinAttempts) { - push @Ret, $self->{cattempts}; - $self->{cattempts} = $Attempts; + push @Ret, $self->{cminattempts}; + $self->{cminattempts} = $MinAttempts; } - if (defined $Interval) + if (defined $MinTimeout) { - push @Ret, $self->{cinterval}; - $self->{cinterval} = $Interval; + push @Ret, $self->{cmintimeout}; + $self->{cmintimeout} = $MinTimeout; } return @Ret; } +=pod +=over 12 + +=item C<SetTimeout()> + +Configures how long an individual RPC can take to complete (in seconds). + +Note that some operations like Wait() and Wait2() involve multiple RPCs. These +have their own timeouts. + +=back +=cut + sub SetTimeout($$) { my ($self, $Timeout) = @_; @@ -857,14 +932,16 @@ sub _Connect($) my $OldRPC = $self->{rpc}; $self->{rpc} = ($self->{rpc} ? "$self->{rpc}/" : "") ."connect"; - my $Start = time(); - foreach my $Attempt (1..$self->{cattempts}) + my $Attempt = 1; + my $MinDeadline = $self->{cmintimeout} ? time() + $self->{cmintimeout} : 0; + while (1) { my $Step = "initializing"; eval { local $SIG{ALRM} = sub { die "timeout" }; - $self->{deadline} = $self->{ctimeout} ? time() + $self->{ctimeout} : undef; + my $OneDeadline = $self->{conetimeout} ? time() + $self->{conetimeout} : 0; + $self->{deadline} = ($OneDeadline < $MinDeadline) ? $MinDeadline : $OneDeadline; $self->_SetAlarm(); if ($self->{tunnel}) @@ -945,8 +1022,13 @@ sub _Connect($) last; } - my $Remaining = $Attempt * $self->{cinterval} - (time() - $Start); - sleep($Remaining) if ($Remaining > 0); + $Attempt++; + last if ($Attempt > $self->{cminattempts} and $MinDeadline <= time() + 1); + + # Wait at least 1 second between attempts so that if the error happens on + #the client-side (e.g. in case of an invalid ssh argument) we don't busy + # loop for $MinTimeout seconds. + sleep(1); } $self->{rpc} = $OldRPC; return undef; @@ -1023,6 +1105,19 @@ sub _SendStringOrFile($$$$$$) $self->_RecvList(''); } +=pod +=over 12 + +=item C<SendFile()> + +Sends the $LocalPathName file and saves it as the $ServerPathName file on the +server. If $Flags is set to $SENDFILE_EXE the file will be made executable. + +Note that the transfer must complete within the SetTimeout() limit. + +=back +=cut + sub SendFile($$$;$) { my ($self, $LocalPathName, $ServerPathName, $Flags) = @_; @@ -1039,6 +1134,19 @@ sub SendFile($$$;$) return undef; } +=pod +=over 12 + +=item C<SendFileFromString()> + +Sends the $Data string and saves it as the $ServerPathName file on the +server. If $Flags is set to $SENDFILE_EXE the file will be made executable. + +Note that the transfer must complete within the SetTimeout() limit. + +=back +=cut + sub SendFileFromString($$$;$) { my ($self, $Data, $ServerPathName, $Flags) = @_; @@ -1059,6 +1167,19 @@ sub _GetStringOrFile($$$) $self->_RecvString('String', 'd')); } +=pod +=over 12 + +=item C<GetFile()> + +Retrieves the $ServerPathName file from the server and saves it as +$LocalPathName. + +Note that the transfer must complete within the SetTimeout() limit. + +=back +=cut + sub GetFile($$$) { my ($self, $ServerPathName, $LocalPathName) = @_; @@ -1075,6 +1196,18 @@ sub GetFile($$$) return undef; } +=pod +=over 12 + +=item C<GetFileToString()> + +Retrieves the $ServerPathName file from the server returns it as a string. + +Note that the transfer must complete within the SetTimeout() limit. + +=back +=cut + sub GetFileToString($$) { my ($self, $ServerPathName) = @_; diff --git a/testbot/lib/WineTestBot/VMs.pm b/testbot/lib/WineTestBot/VMs.pm index 2bf350435..d85df1d08 100644 --- a/testbot/lib/WineTestBot/VMs.pm +++ b/testbot/lib/WineTestBot/VMs.pm @@ -584,10 +584,10 @@ sub RunPowerOff($) Reverts the VM so that it is ready to run jobs. -Note that in addition to the hypervisor revert operation this implies checking -that it responds to our commands ($WaitForToolsInVM) and possibly letting the -VM settle down ($SleepAfterRevert). If this operation fails the administrator -is notified and the VM is marked as offline. +Note that in addition to the hypervisor revert operation this may imply waiting +for the VM to boot, checking that the TestAgent server is accessible, and +possibly letting the VM settle down ($SleepAfterRevert). If this operation +fails the administrator is notified and the VM is put offline. This operation can take a long time so it is performed in a separate process. diff --git a/testbot/scripts/SetWinLocale b/testbot/scripts/SetWinLocale index 2690dbb1f..ca750b792 100755 --- a/testbot/scripts/SetWinLocale +++ b/testbot/scripts/SetWinLocale @@ -843,7 +843,7 @@ if ($OptRefresh) # Allow up to 10 minutes for the shutdown plus reboot. Debug(Elapsed($Start), " Waiting for Windows to boot\n"); - $TA->SetConnectTimeout(20, 30); + $TA->SetConnectTimeout(undef, undef, $WaitForBoot); if (!$DryRun and !$TA->Ping()) { FatalError("could not reconnect to $OptHostName after the reboot: ", $TA->GetLastError(), "\n"); diff --git a/testbot/scripts/TestAgent b/testbot/scripts/TestAgent index 25fc0d426..b4ec582be 100755 --- a/testbot/scripts/TestAgent +++ b/testbot/scripts/TestAgent @@ -56,7 +56,7 @@ my ($Cmd, $Hostname, $LocalFilename, $ServerFilename, $PropName, $PropValue, @Rm my (@Run, $RunIn, $RunOut, $RunErr, $WaitPid); my $SendFlags = 0; my $RunFlags = 0; -my ($Port, $ConnectTimeout, $ConnectAttempts, $ConnectInterval, $Timeout); +my ($Port, $ConnectOneTimeout, $ConnectMinAttempts, $ConnectMinTimeout, $Timeout); my ($Keepalive, $TunnelOpt); my $Usage; @@ -100,17 +100,17 @@ while (@ARGV) { $Port = check_opt_val($arg, $Port); } - elsif ($arg eq "--connect-timeout") + elsif ($arg eq "--connect-one-timeout") { - $ConnectTimeout = check_opt_val($arg, $ConnectTimeout); + $ConnectOneTimeout = check_opt_val($arg, $ConnectOneTimeout); } - elsif ($arg eq "--connect-attempts") + elsif ($arg eq "--connect-min-attempts") { - $ConnectAttempts = check_opt_val($arg, $ConnectAttempts); + $ConnectMinAttempts = check_opt_val($arg, $ConnectMinAttempts); } - elsif ($arg eq "--connect-interval") + elsif ($arg eq "--connect-min-timeout") { - $ConnectInterval = check_opt_val($arg, $ConnectInterval); + $ConnectMinTimeout = check_opt_val($arg, $ConnectMinTimeout); } elsif ($arg eq "--timeout") { @@ -361,11 +361,12 @@ if (defined $Usage) print " restarts it.\n"; print " <hostname> Is the hostname of the server.\n"; print " --port <port> Use the specified port number instead of the default one.\n"; - print " --connect-timeout <timeout> Use the specified timeout (in seconds) when\n"; - print " connecting instead of the default one.\n"; - print " --connect-attempts <count> How many connection attempts to perform.\n"; - print " --connect-interval <time> The minimum interval (in seconds) between failed\n"; - print " connection attempts.\n"; + print " --connect-one-timeout <time> Specifies the timeout for one connection\n"; + print " attempt (in seconds).\n"; + print " --connect-min-attempts <count> Specifies the minimum number of connection\n"; + print " attempts.\n"; + print " --connect-min-timeout <time> The minimum period (in seconds) during which\n"; + print " connection attempts should be made.\n"; print " --timeout <timeout> Use the specified timeout (in seconds) instead of the\n"; print " default one for the operation.\n"; print " --keepalive <keepalive> How often (in seconds) the run and wait operations\n"; @@ -389,10 +390,7 @@ if ($TunnelOpt and $TunnelOpt =~ /^ssh:/) } my $TA = TestAgent->new($Hostname, $AgentPort, $TunnelInfo); -if (defined $ConnectTimeout or defined $ConnectAttempts or defined $ConnectInterval) -{ - $TA->SetConnectTimeout($ConnectTimeout, $ConnectAttempts, $ConnectInterval); -} +$TA->SetConnectTimeout($ConnectOneTimeout, $ConnectMinAttempts, $ConnectMinTimeout); $TA->SetTimeout($Timeout) if (defined $Timeout); my $RC = 0; -- 2.20.1
participants (1)
-
Francois Gouget