Introduce backoff to RetryByNumberOfTimes
authorHelga Velroyen <helgav@google.com>
Mon, 11 Jan 2016 14:32:47 +0000 (15:32 +0100)
committerHelga Velroyen <helgav@google.com>
Mon, 11 Jan 2016 16:02:34 +0000 (17:02 +0100)
This patch adds a backing-off mechanism to the function
RetryByNumberOfTimes. This is useful for example when SSH
connections fail in a flaky network. The original version of
RetryByNumberOfTimes immediately retried failed SSH calls,
but that might not be enough to recover from a network
problem.

The patch adds an additional parameter 'backoff' which
specifies the base number of seconds of the backoff. That
means after the first failed try, a delay of 'backoff'
seconds is inserted before retrying. With each additional
failed try, the delay is doubled until the maximum
number of retries is hit.

Note that the backoff parameter is not a keyword argument,
which might have been more convenient. That's because
otherwise RetryByNumberOfTimes would no longer be able
to propagate *args and **kwargs to the function to be
called with retries.

Also note that there is a function "Retry" in the same
package, which already provides somewhat complicated
timeout capabilities. However, we did not merge these
two functions, because Retry also does not propagate
*args and **kwargs properly which is something we
depend on in backend.py.

This patch also updates the unit tests and mocks the
sleep function in the backend.py's unit tests to not
slow down the tests.

This fixes issue 1078.

Signed-off-by: Helga Velroyen <helgav@google.com>
Reviewed-by: Klaus Aehlig <aehlig@google.com>

lib/backend.py
lib/utils/retry.py
test/py/ganeti.backend_unittest.py
test/py/ganeti.utils.retry_unittest.py

index 67e82a0..346b900 100644 (file)
@@ -1559,8 +1559,9 @@ def AddNodeSshKeyBulk(node_list,
         (constants.SSHS_OVERRIDE, all_keys)
 
       try:
+        backoff = 5  # seconds
         utils.RetryByNumberOfTimes(
-            constants.SSHS_MAX_RETRIES,
+            constants.SSHS_MAX_RETRIES, backoff,
             errors.SshUpdateError,
             run_cmd_fn, cluster_name, node_info.name, pathutils.SSH_UPDATE,
             ssh_port_map.get(node_info.name), node_data,
@@ -1605,9 +1606,9 @@ def AddNodeSshKeyBulk(node_list,
     if node in potential_master_candidates:
       logging.debug("Updating SSH key files of node '%s'.", node)
       try:
+        backoff = 5  # seconds
         utils.RetryByNumberOfTimes(
-            constants.SSHS_MAX_RETRIES,
-            errors.SshUpdateError,
+            constants.SSHS_MAX_RETRIES, backoff, errors.SshUpdateError,
             run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
             ssh_port_map.get(node), pot_mc_data,
             debug=ssh_update_debug, verbose=ssh_update_verbose,
@@ -1875,9 +1876,9 @@ def RemoveNodeSshKeyBulk(node_list,
           logging.debug("Updating key setup of potential master candidate node"
                         " %s.", node)
           try:
+            backoff = 5  # seconds
             utils.RetryByNumberOfTimes(
-                constants.SSHS_MAX_RETRIES,
-                errors.SshUpdateError,
+                constants.SSHS_MAX_RETRIES, backoff, errors.SshUpdateError,
                 run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
                 ssh_port, pot_mc_data,
                 debug=ssh_update_debug, verbose=ssh_update_verbose,
@@ -1892,9 +1893,9 @@ def RemoveNodeSshKeyBulk(node_list,
           if from_authorized_keys:
             logging.debug("Updating key setup of normal node %s.", node)
             try:
+              backoff = 5  # seconds
               utils.RetryByNumberOfTimes(
-                  constants.SSHS_MAX_RETRIES,
-                  errors.SshUpdateError,
+                  constants.SSHS_MAX_RETRIES, backoff, errors.SshUpdateError,
                   run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
                   ssh_port, base_data,
                   debug=ssh_update_debug, verbose=ssh_update_verbose,
@@ -1946,8 +1947,9 @@ def RemoveNodeSshKeyBulk(node_list,
       logging.debug("Updating SSH key setup of target node '%s'.",
                     node_info.name)
       try:
+        backoff = 5  # seconds
         utils.RetryByNumberOfTimes(
-            constants.SSHS_MAX_RETRIES,
+            constants.SSHS_MAX_RETRIES, backoff,
             errors.SshUpdateError,
             run_cmd_fn, cluster_name, node_info.name, pathutils.SSH_UPDATE,
             ssh_port, data,
index 8079303..895cc0e 100644 (file)
@@ -253,7 +253,8 @@ def CountRetry(expected, fn, count, args=None):
                      wait_fn=inc_tries, _time_fn=get_tries)
 
 
-def RetryByNumberOfTimes(max_retries, exception_class, fn, *args, **kwargs):
+def RetryByNumberOfTimes(max_retries, backoff, exception_class, fn, *args,
+                         **kwargs):
   """Retries calling a function up to the specified number of times.
 
   @type max_retries: integer
@@ -264,9 +265,23 @@ def RetryByNumberOfTimes(max_retries, exception_class, fn, *args, **kwargs):
   @type fn: callable
   @param fn: Function to be called (up to the specified maximum number of
              retries.
+  @type backoff: int
+  @param backoff: this enables and configures the back off behavior after
+     failed tries. If value is '0', there will be no delay between failed
+     tries. If the value is a positive integer, it is interpreted as the
+     base length of the back off delay (in seconds). That means there will be a
+     delay between failed tries of the length specified in this parameter. With
+     each next retry, the delay is increased by a factor of two. For example,
+     if the value is '2', the first delay is 2 seconds, the second 4 seconds,
+     the third 8 seconds, and so on, until 'max_retries' is hit or the
+     function call succeeds.
 
   """
+  if backoff < 0:
+    raise exception_class("Backoff must be a non-negative integer.")
+
   last_exception = None
+  delay = backoff
   for i in range(max_retries):
     try:
       fn(*args, **kwargs)
@@ -274,6 +289,8 @@ def RetryByNumberOfTimes(max_retries, exception_class, fn, *args, **kwargs):
     except errors.OpExecError as e:
       logging.error("Error after retry no. %s: %s.", i, e)
       last_exception = e
+      time.sleep(delay)
+      delay *= 2
   else:
     if last_exception:
       raise exception_class("Error after %s retries. Last exception: %s."
index 17e7dc7..2b1d71e 100755 (executable)
@@ -32,6 +32,7 @@
 
 import collections
 import copy
+import time
 import mock
 import os
 import shutil
@@ -986,6 +987,11 @@ class TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase):
     self._ssh_replace_name_by_uuid_mock.side_effect = \
       self._ssh_file_manager.ReplaceNameByUuid
 
+    self._time_sleep_patcher = testutils \
+        .patch_object(time, "sleep")
+    self._time_sleep_mock = \
+        self._time_sleep_patcher.start()
+
     self.noded_cert_file = testutils.TestDataFilename("cert1.pem")
 
     self._SetupTestData()
@@ -998,6 +1004,7 @@ class TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase):
     self._ssh_remove_public_key_patcher.stop()
     self._ssh_query_pub_key_file_patcher.stop()
     self._ssh_replace_name_by_uuid_patcher.stop()
+    self._time_sleep_patcher.stop()
     self._TearDownTestData()
 
   def _SetupTestData(self, number_of_nodes=15, number_of_pot_mcs=5,
index e796f84..93638cd 100755 (executable)
@@ -31,6 +31,7 @@
 """Script for testing ganeti.utils.retry"""
 
 import mock
+import time
 import unittest
 
 from ganeti import constants
@@ -213,12 +214,12 @@ class TestRetryByNumberOfTimes(testutils.GanetiTestCase):
 
   def testSuccessOnFirst(self):
     test_fn = mock.Mock()
-    utils.RetryByNumberOfTimes(5, Exception, test_fn)
+    utils.RetryByNumberOfTimes(5, 0, Exception, test_fn)
     test_fn.assert_called_once()
 
   def testSuccessOnFirstWithArgs(self):
     test_fn = mock.Mock()
-    utils.RetryByNumberOfTimes(5, Exception, test_fn,
+    utils.RetryByNumberOfTimes(5, 0, Exception, test_fn,
         "arg1", "arg2", kwarg1_key="kwarg1_value", kwarg2_key="kwarg2_value")
     test_fn.assert_called_with(
         "arg1", "arg2", kwarg1_key="kwarg1_value", kwarg2_key="kwarg2_value")
@@ -235,7 +236,7 @@ class TestRetryByNumberOfTimes(testutils.GanetiTestCase):
       else:
         return "I succeed."
 
-    utils.RetryByNumberOfTimes(self.max_tries, Exception, test_fn)
+    utils.RetryByNumberOfTimes(self.max_tries, 0, Exception, test_fn)
 
   def testFailAllTries(self):
     self.max_tries = 5
@@ -244,7 +245,35 @@ class TestRetryByNumberOfTimes(testutils.GanetiTestCase):
       raise errors.OpExecError("I fail!")
 
     self.assertRaises(Exception, utils.RetryByNumberOfTimes, self.max_tries,
-                      Exception, test_fn)
+                      0, Exception, test_fn)
+
+  @testutils.patch_object(time, "sleep")
+  def testBackoffZero(self, mock_sleep):
+    self.max_tries = 5
+
+    def test_fn():
+      raise errors.OpExecError("I fail!")
+
+    self.assertRaises(Exception, utils.RetryByNumberOfTimes, self.max_tries,
+                      backoff=0, exception_class=Exception, fn=test_fn)
+    for call in mock_sleep.mock_calls:
+      self.assertEqual(mock.call(0), call)
+
+  @testutils.patch_object(time, "sleep")
+  def testBackoffPositive(self, mock_sleep):
+    self.max_tries = 5
+
+    def test_fn():
+      raise errors.OpExecError("I fail!")
+
+    backoff = 3
+    self.assertRaises(Exception, utils.RetryByNumberOfTimes, self.max_tries,
+                      backoff=backoff, exception_class=Exception, fn=test_fn)
+
+    expected_calls = [3, 6, 12, 24, 48]
+    for call_idx in range(len(mock_sleep.mock_calls)):
+      self.assertEqual(mock.call(expected_calls[call_idx]),
+                       mock_sleep.mock_calls[call_idx])
 
 
 if __name__ == "__main__":