Make LURenewCrypto handle unreachable nodes properly
author: Helga Velroyen <helgav@google.com>
Wed, 4 Mar 2015 21:01:23 +0000 (22:01 +0100)
committer: Helga Velroyen <helgav@google.com>
Wed, 18 Mar 2015 10:20:34 +0000 (11:20 +0100)
Currently, a single unreachable node can make LURenewCrypto
fail completely. This patch adds a unit test for that case
and improves the error handling for unreachable nodes, so
that the remaining nodes are still handled properly and the
failures are reported to the user.

Signed-off-by: Helga Velroyen <helgav@google.com>
Reviewed-by: Petr Pudlak <pudlak@google.com>

lib/cmdlib/cluster.py
test/py/cmdlib/cluster_unittest.py

index 4372fc5..10deda6 100644 (file)
@@ -152,17 +152,30 @@ class LUClusterRenewCrypto(NoHooksLU):
       except IOError:
         pass
 
+    node_errors = {}
     nodes = self.cfg.GetAllNodesInfo()
     for (node_uuid, node_info) in nodes.items():
       if node_info.offline:
         feedback_fn("* Skipping offline node %s" % node_info.name)
         continue
       if node_uuid != master_uuid:
-        new_digest = CreateNewClientCert(self, node_uuid)
-        if node_info.master_candidate:
-          utils.AddNodeToCandidateCerts(node_uuid,
-                                        new_digest,
-                                        cluster.candidate_certs)
+        try:
+          new_digest = CreateNewClientCert(self, node_uuid)
+          if node_info.master_candidate:
+            utils.AddNodeToCandidateCerts(node_uuid,
+                                          new_digest,
+                                          cluster.candidate_certs)
+        except errors.OpExecError as e:
+          node_errors[node_uuid] = e
+
+    if node_errors:
+      msg = ("Some nodes' SSL client certificates could not be renewed."
+             " Please make sure those nodes are reachable and rerun"
+             " the operation. The affected nodes and their errors are:\n")
+      for uuid, e in node_errors.items():
+        msg += "Node %s: %s\n" % (uuid, e)
+      feedback_fn(msg)
+
     utils.RemoveNodeFromCandidateCerts("%s-SERVER" % master_uuid,
                                        cluster.candidate_certs)
     utils.RemoveNodeFromCandidateCerts("%s-OLDMASTER" % master_uuid,
index 4da6ea0..b5c3eab 100644 (file)
@@ -2351,6 +2351,56 @@ class TestLUClusterRenewCrypto(CmdlibTestCase):
     cluster = self.cfg.GetClusterInfo()
     self.assertFalse(cluster.candidate_certs)
 
+  def _partiallyFailingRpc(self, node_uuid, _):
+    if node_uuid == self._failed_node:
+      return self.RpcResultsBuilder() \
+        .CreateFailedNodeResult(node_uuid)
+    else:
+      return self.RpcResultsBuilder() \
+        .CreateSuccessfulNodeResult(node_uuid,
+          [(constants.CRYPTO_TYPE_SSL_DIGEST, self._GetFakeDigest(node_uuid))])
+
+  @patchPathutils("cluster")
+  def testNonMasterFails(self, pathutils):
+
+    # patch pathutils to point to temporary files
+    pathutils.NODED_CERT_FILE = self._node_cert
+    pathutils.NODED_CLIENT_CERT_FILE = self._client_node_cert
+    pathutils.NODED_CLIENT_CERT_FILE_TMP = \
+        self._client_node_cert_tmp
+
+    # create a few non-master, online nodes
+    num_nodes = 3
+    for _ in range(num_nodes):
+      self.cfg.AddNewNode()
+    nodes = self.cfg.GetAllNodesInfo()
+
+    # pick one node as the failing one
+    master_uuid = self.cfg.GetMasterNode()
+    self._failed_node = [node_uuid for node_uuid in nodes
+                         if node_uuid != master_uuid][1]
+    self.rpc.call_node_crypto_tokens = self._partiallyFailingRpc
+
+    op = opcodes.OpClusterRenewCrypto()
+    self.ExecOpCode(op)
+
+    # Check if the correct certificates exist and don't exist on the master
+    self.assertTrue(os.path.exists(pathutils.NODED_CERT_FILE))
+    self.assertTrue(os.path.exists(pathutils.NODED_CLIENT_CERT_FILE))
+    self.assertFalse(os.path.exists(pathutils.NODED_CLIENT_CERT_FILE_TMP))
+
+    # Check if we have the correct digests in the configuration
+    cluster = self.cfg.GetClusterInfo()
+    # There should be one digest missing.
+    self.assertEqual(num_nodes, len(cluster.candidate_certs))
+    nodes = self.cfg.GetAllNodesInfo()
+    for (node_uuid, _) in nodes.items():
+      if node_uuid == self._failed_node:
+        self.assertTrue(node_uuid not in cluster.candidate_certs)
+      else:
+        expected_digest = self._GetFakeDigest(node_uuid)
+        self.assertEqual(expected_digest, cluster.candidate_certs[node_uuid])
+
 
 if __name__ == "__main__":
   testutils.GanetiTestProgram()