The new model better handles the case where we need to wait for the VM to boot before we can connect to the testagentd server.
Signed-off-by: Francois Gouget fgouget@codeweavers.com --- testbot/bin/LibvirtTool.pl | 9 +- testbot/lib/WineTestBot/Config.pm | 15 ++- testbot/lib/WineTestBot/TestAgent.pm | 169 ++++++++++++++++++++++++--- testbot/lib/WineTestBot/VMs.pm | 8 +- testbot/scripts/SetWinLocale | 2 +- testbot/scripts/TestAgent | 30 +++-- 6 files changed, 181 insertions(+), 52 deletions(-)
diff --git a/testbot/bin/LibvirtTool.pl b/testbot/bin/LibvirtTool.pl index 5c3906503..f15bab70b 100755 --- a/testbot/bin/LibvirtTool.pl +++ b/testbot/bin/LibvirtTool.pl @@ -408,12 +408,11 @@ sub Revert() # The VM is now sleeping which may allow some tasks to run return 1 if (ChangeStatus("reverting", "sleeping"));
- # Check the TestAgent connection. Note that this may take some time - # if the VM needs to boot first. - Debug(Elapsed($Start), " Trying the TestAgent connection\n"); - LogMsg "Waiting for ". $VM->Name ." (up to ${WaitForToolsInVM}s per attempt)\n"; + # Verify that the TestAgent server accepts connections + Debug(Elapsed($Start), " Verifying the TestAgent server\n"); + LogMsg "Verifying the $VMKey TestAgent server\n"; my $TA = $VM->GetAgent(); - $TA->SetConnectTimeout($WaitForToolsInVM, undef, $WaitForToolsInVM); + $TA->SetConnectTimeout(undef, undef, $WaitForBoot); my $Success = $TA->Ping(); $TA->Disconnect(); if (!$Success) diff --git a/testbot/lib/WineTestBot/Config.pm b/testbot/lib/WineTestBot/Config.pm index 3d9214bae..3496c310e 100644 --- a/testbot/lib/WineTestBot/Config.pm +++ b/testbot/lib/WineTestBot/Config.pm @@ -28,7 +28,7 @@ WineTestBot::Config - Site-independent configuration settings use vars qw (@ISA @EXPORT @EXPORT_OK $UseSSL $LogDir $DataDir $BinDir %RepoURLs $DbDataSource $DbUsername $DbPassword $MaxRevertingVMs $MaxRevertsWhileRunningVMs $MaxActiveVMs $MaxRunningVMs - $MaxVMsWhenIdle $SleepAfterRevert $WaitForToolsInVM + $MaxVMsWhenIdle $SleepAfterRevert $WaitForBoot $VMToolTimeout $MaxVMErrors $MaxTaskTries $AdminEMail $RobotEMail $WinePatchToOverride $WinePatchCc $ExeBuildNativeTimeout $ExeBuildTestTimeout $ExeModuleTimeout @@ -44,7 +44,7 @@ require Exporter; @ISA = qw(Exporter); @EXPORT = qw($UseSSL $LogDir $DataDir $BinDir %RepoURLs $MaxRevertingVMs $MaxRevertsWhileRunningVMs $MaxActiveVMs - $MaxRunningVMs $MaxVMsWhenIdle $SleepAfterRevert $WaitForToolsInVM + $MaxRunningVMs $MaxVMsWhenIdle $SleepAfterRevert $WaitForBoot $VMToolTimeout $MaxVMErrors $MaxTaskTries $AdminEMail $RobotEMail $WinePatchToOverride $WinePatchCc $SuiteTimeout $ExeBuildNativeTimeout $ExeBuildTestTimeout $ExeModuleTimeout @@ -90,14 +90,13 @@ $MaxActiveVMs = 2; $MaxRunningVMs = 1; $MaxVMsWhenIdle = undef;
-# How long to wait for each of the 3 connection attempts to the VM's TestAgent -# server after a revert (in seconds). If there are powered off snapshots this -# must be long enough for the VM to boot up first. -$WaitForToolsInVM = 30; -# How long to let the VM settle down after the revert before starting a task on +# How long to attempt to connect to the TestAgent while the VM is booting. +$WaitForBoot = 90; +# How long to let the VM settle down after a revert before starting a task on # it (in seconds). $SleepAfterRevert = 0; -# Take into account $WaitForToolsInVM and $SleepAfterRevert +# How long to wait before considering a VM operation is stuck. In particular +# for reverts this should take into account the time it may need to boot. $VMToolTimeout = 6 * 60;
# After three consecutive failures to revert a VM, put it in maintenance mode. diff --git a/testbot/lib/WineTestBot/TestAgent.pm b/testbot/lib/WineTestBot/TestAgent.pm index d3d7137a5..8d7f456fd 100644 --- a/testbot/lib/WineTestBot/TestAgent.pm +++ b/testbot/lib/WineTestBot/TestAgent.pm @@ -101,9 +101,9 @@ sub new($$$;$) agenthost => $Hostname, agentport => $Port, connection => "$Hostname:$Port", - ctimeout => 20, - cattempts => 3, - cinterval => 0, + conetimeout => 20, + cminattempts => 2, + cmintimeout => 10, timeout => 0, fd => undef, deadline => undef, @@ -140,28 +140,103 @@ sub Disconnect($) $self->{agentversion} = undef; }
+=pod +=over 12 + +=item C<SetConnectTimeout()> + +Configures how many times and for how long to attempt to connect to the server. + +=item OneTimeout + +OneTimeout specifies the timeout for one connection attempt in seconds. +Zero means there will be no timeout. Undef means the value is not changed. + +=item MinAttempts + +MinAttempts specifies the minimum number of connection attempts. It must be a +non-zero positive integer. Undef means the value is not changed. + +=item MinTimeout + +MinTimeout specifies the minimum period during which connection attempts should +be made in seconds. Zero means there will be no minimum timeout. Undef means +the value is not changed. + +This means connection attempts will be made until either one succeeds or +MinTimeout seconds have elapsed, even if each attempt fails quickly such that +more than $MinAttempts attempts are performed. Conversely, if each attempt +takes a long time such that more MinTimeout is reached before MinAttempt +connection attempts have been made, then attempts will continue until the +minimum number of attempts is reached. + +Thus the worst case connection timeout is: + + max($MinAttempts * $OneTimeout, $MinTimeout + $OneTimeout) + +=item Return value + +For each modified value (i.e. not undef), the old value is returned. + +=item Usage + +To perform a single connection attempt with a 20 second timeout: + + $TA->SetConnectionTimeout(20, 1, 0); + +For a bit more robustness in case one expects short network disruptions: + + $TA->SetConnectionTimeout(20, 2, 10); + +But if the server needs time to boot before accepting connections, then +MinTimeout should be set to the amount of time this is expected to take. It +would make sense to reset MinTimeout after the first RPC has completed since +then a reconnection would not have to wait for the server to boot again: + + my $OldMinTimeout = $TA->SetConnectionTimeout(undef, undef, 90); + + ... first RPC ... + + $TA->SetConnectionTimeout(undef, undef, $OldMinTimeout); + +=back +=cut + sub SetConnectTimeout($$;$$) { - my ($self, $Timeout, $Attempts, $Interval) = @_; + my ($self, $OneTimeout, $MinAttempts, $MinTimeout) = @_; my @Ret; - if (defined $Timeout) + if (defined $OneTimeout) { - push @Ret, $self->{ctimeout}; - $self->{ctimeout} = $Timeout; + push @Ret, $self->{conetimeout}; + $self->{conetimeout} = $OneTimeout; } - if (defined $Attempts) + if (defined $MinAttempts) { - push @Ret, $self->{cattempts}; - $self->{cattempts} = $Attempts; + push @Ret, $self->{cminattempts}; + $self->{cminattempts} = $MinAttempts; } - if (defined $Interval) + if (defined $MinTimeout) { - push @Ret, $self->{cinterval}; - $self->{cinterval} = $Interval; + push @Ret, $self->{cmintimeout}; + $self->{cmintimeout} = $MinTimeout; } return @Ret; }
+=pod +=over 12 + +=item C<SetTimeout()> + +Configures how long an individual RPC can take to complete (in seconds). + +Note that some operations like Wait() and Wait2() involve multiple RPCs. These +have their own timeouts. + +=back +=cut + sub SetTimeout($$) { my ($self, $Timeout) = @_; @@ -857,14 +932,16 @@ sub _Connect($) my $OldRPC = $self->{rpc}; $self->{rpc} = ($self->{rpc} ? "$self->{rpc}/" : "") ."connect";
- my $Start = time(); - foreach my $Attempt (1..$self->{cattempts}) + my $Attempt = 1; + my $MinDeadline = $self->{cmintimeout} ? time() + $self->{cmintimeout} : 0; + while (1) { my $Step = "initializing"; eval { local $SIG{ALRM} = sub { die "timeout" }; - $self->{deadline} = $self->{ctimeout} ? time() + $self->{ctimeout} : undef; + my $OneDeadline = $self->{conetimeout} ? time() + $self->{conetimeout} : 0; + $self->{deadline} = ($OneDeadline < $MinDeadline) ? $MinDeadline : $OneDeadline; $self->_SetAlarm();
if ($self->{tunnel}) @@ -945,8 +1022,13 @@ sub _Connect($) last; }
- my $Remaining = $Attempt * $self->{cinterval} - (time() - $Start); - sleep($Remaining) if ($Remaining > 0); + $Attempt++; + last if ($Attempt > $self->{cminattempts} and $MinDeadline <= time() + 1); + + # Wait at least 1 second between attempts so that if the error happens on + #the client-side (e.g. in case of an invalid ssh argument) we don't busy + # loop for $MinTimeout seconds. + sleep(1); } $self->{rpc} = $OldRPC; return undef; @@ -1023,6 +1105,19 @@ sub _SendStringOrFile($$$$$$) $self->_RecvList(''); }
+=pod +=over 12 + +=item C<SendFile()> + +Sends the $LocalPathName file and saves it as the $ServerPathName file on the +server. If $Flags is set to $SENDFILE_EXE the file will be made executable. + +Note that the transfer must complete within the SetTimeout() limit. + +=back +=cut + sub SendFile($$$;$) { my ($self, $LocalPathName, $ServerPathName, $Flags) = @_; @@ -1039,6 +1134,19 @@ sub SendFile($$$;$) return undef; }
+=pod +=over 12 + +=item C<SendFileFromString()> + +Sends the $Data string and saves it as the $ServerPathName file on the +server. If $Flags is set to $SENDFILE_EXE the file will be made executable. + +Note that the transfer must complete within the SetTimeout() limit. + +=back +=cut + sub SendFileFromString($$$;$) { my ($self, $Data, $ServerPathName, $Flags) = @_; @@ -1059,6 +1167,19 @@ sub _GetStringOrFile($$$) $self->_RecvString('String', 'd')); }
+=pod +=over 12 + +=item C<GetFile()> + +Retrieves the $ServerPathName file from the server and saves it as +$LocalPathName. + +Note that the transfer must complete within the SetTimeout() limit. + +=back +=cut + sub GetFile($$$) { my ($self, $ServerPathName, $LocalPathName) = @_; @@ -1075,6 +1196,18 @@ sub GetFile($$$) return undef; }
+=pod +=over 12 + +=item C<GetFileToString()> + +Retrieves the $ServerPathName file from the server returns it as a string. + +Note that the transfer must complete within the SetTimeout() limit. + +=back +=cut + sub GetFileToString($$) { my ($self, $ServerPathName) = @_; diff --git a/testbot/lib/WineTestBot/VMs.pm b/testbot/lib/WineTestBot/VMs.pm index 2bf350435..d85df1d08 100644 --- a/testbot/lib/WineTestBot/VMs.pm +++ b/testbot/lib/WineTestBot/VMs.pm @@ -584,10 +584,10 @@ sub RunPowerOff($)
Reverts the VM so that it is ready to run jobs.
-Note that in addition to the hypervisor revert operation this implies checking -that it responds to our commands ($WaitForToolsInVM) and possibly letting the -VM settle down ($SleepAfterRevert). If this operation fails the administrator -is notified and the VM is marked as offline. +Note that in addition to the hypervisor revert operation this may imply waiting +for the VM to boot, checking that the TestAgent server is accessible, and +possibly letting the VM settle down ($SleepAfterRevert). If this operation +fails the administrator is notified and the VM is put offline.
This operation can take a long time so it is performed in a separate process.
diff --git a/testbot/scripts/SetWinLocale b/testbot/scripts/SetWinLocale index 2690dbb1f..ca750b792 100755 --- a/testbot/scripts/SetWinLocale +++ b/testbot/scripts/SetWinLocale @@ -843,7 +843,7 @@ if ($OptRefresh)
# Allow up to 10 minutes for the shutdown plus reboot. Debug(Elapsed($Start), " Waiting for Windows to boot\n"); - $TA->SetConnectTimeout(20, 30); + $TA->SetConnectTimeout(undef, undef, $WaitForBoot); if (!$DryRun and !$TA->Ping()) { FatalError("could not reconnect to $OptHostName after the reboot: ", $TA->GetLastError(), "\n"); diff --git a/testbot/scripts/TestAgent b/testbot/scripts/TestAgent index 25fc0d426..b4ec582be 100755 --- a/testbot/scripts/TestAgent +++ b/testbot/scripts/TestAgent @@ -56,7 +56,7 @@ my ($Cmd, $Hostname, $LocalFilename, $ServerFilename, $PropName, $PropValue, @Rm my (@Run, $RunIn, $RunOut, $RunErr, $WaitPid); my $SendFlags = 0; my $RunFlags = 0; -my ($Port, $ConnectTimeout, $ConnectAttempts, $ConnectInterval, $Timeout); +my ($Port, $ConnectOneTimeout, $ConnectMinAttempts, $ConnectMinTimeout, $Timeout); my ($Keepalive, $TunnelOpt); my $Usage;
@@ -100,17 +100,17 @@ while (@ARGV) { $Port = check_opt_val($arg, $Port); } - elsif ($arg eq "--connect-timeout") + elsif ($arg eq "--connect-one-timeout") { - $ConnectTimeout = check_opt_val($arg, $ConnectTimeout); + $ConnectOneTimeout = check_opt_val($arg, $ConnectOneTimeout); } - elsif ($arg eq "--connect-attempts") + elsif ($arg eq "--connect-min-attempts") { - $ConnectAttempts = check_opt_val($arg, $ConnectAttempts); + $ConnectMinAttempts = check_opt_val($arg, $ConnectMinAttempts); } - elsif ($arg eq "--connect-interval") + elsif ($arg eq "--connect-min-timeout") { - $ConnectInterval = check_opt_val($arg, $ConnectInterval); + $ConnectMinTimeout = check_opt_val($arg, $ConnectMinTimeout); } elsif ($arg eq "--timeout") { @@ -361,11 +361,12 @@ if (defined $Usage) print " restarts it.\n"; print " <hostname> Is the hostname of the server.\n"; print " --port <port> Use the specified port number instead of the default one.\n"; - print " --connect-timeout <timeout> Use the specified timeout (in seconds) when\n"; - print " connecting instead of the default one.\n"; - print " --connect-attempts <count> How many connection attempts to perform.\n"; - print " --connect-interval <time> The minimum interval (in seconds) between failed\n"; - print " connection attempts.\n"; + print " --connect-one-timeout <time> Specifies the timeout for one connection\n"; + print " attempt (in seconds).\n"; + print " --connect-min-attempts <count> Specifies the minimum number of connection\n"; + print " attempts.\n"; + print " --connect-min-timeout <time> The minimum period (in seconds) during which\n"; + print " connection attempts should be made.\n"; print " --timeout <timeout> Use the specified timeout (in seconds) instead of the\n"; print " default one for the operation.\n"; print " --keepalive <keepalive> How often (in seconds) the run and wait operations\n"; @@ -389,10 +390,7 @@ if ($TunnelOpt and $TunnelOpt =~ /^ssh:/) }
my $TA = TestAgent->new($Hostname, $AgentPort, $TunnelInfo); -if (defined $ConnectTimeout or defined $ConnectAttempts or defined $ConnectInterval) -{ - $TA->SetConnectTimeout($ConnectTimeout, $ConnectAttempts, $ConnectInterval); -} +$TA->SetConnectTimeout($ConnectOneTimeout, $ConnectMinAttempts, $ConnectMinTimeout); $TA->SetTimeout($Timeout) if (defined $Timeout);
my $RC = 0;