Merge branch 'stable-2.16' into stable-2.17
[ganeti-github.git] / lib / cmdlib / cluster / verify.py
1 #
2 #
3
4 # Copyright (C) 2014 Google Inc.
5 # All rights reserved.
6 #
7 # Redistribution and use in source and binary forms, with or without
8 # modification, are permitted provided that the following conditions are
9 # met:
10 #
11 # 1. Redistributions of source code must retain the above copyright notice,
12 # this list of conditions and the following disclaimer.
13 #
14 # 2. Redistributions in binary form must reproduce the above copyright
15 # notice, this list of conditions and the following disclaimer in the
16 # documentation and/or other materials provided with the distribution.
17 #
18 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
19 # IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
20 # TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
21 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
22 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
25 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
26 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
27 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30 """Logical units for cluster verification."""
31
32 import itertools
33 import logging
34 import operator
35 import re
36 import time
37 import ganeti.masterd.instance
38 import ganeti.rpc.node as rpc
39
40 from ganeti import compat
41 from ganeti import constants
42 from ganeti import errors
43 from ganeti import locking
44 from ganeti import pathutils
45 from ganeti import utils
46 from ganeti import vcluster
47 from ganeti import hypervisor
48 from ganeti import opcodes
49
50 from ganeti.cmdlib.base import LogicalUnit, NoHooksLU, ResultWithJobs
51 from ganeti.cmdlib.common import ShareAll, ComputeAncillaryFiles, \
52 CheckNodePVs, ComputeIPolicyInstanceViolation, AnnotateDiskParams, \
53 SupportsOob
54
55
56 def _GetAllHypervisorParameters(cluster, instances):
57 """Compute the set of all hypervisor parameters.
58
59 @type cluster: L{objects.Cluster}
60 @param cluster: the cluster object
61 @param instances: list of L{objects.Instance}
62 @param instances: additional instances from which to obtain parameters
63 @rtype: list of (origin, hypervisor, parameters)
64 @return: a list with all parameters found, indicating the hypervisor they
65 apply to, and the origin (can be "cluster", "os X", or "instance Y")
66
67 """
68 hvp_data = []
69
70 for hv_name in cluster.enabled_hypervisors:
71 hvp_data.append(("cluster", hv_name, cluster.GetHVDefaults(hv_name)))
72
73 for os_name, os_hvp in cluster.os_hvp.items():
74 for hv_name, hv_params in os_hvp.items():
75 if hv_params:
76 full_params = cluster.GetHVDefaults(hv_name, os_name=os_name)
77 hvp_data.append(("os %s" % os_name, hv_name, full_params))
78
79 # TODO: collapse identical parameter values in a single one
80 for instance in instances:
81 if instance.hvparams:
82 hvp_data.append(("instance %s" % instance.name, instance.hypervisor,
83 cluster.FillHV(instance)))
84
85 return hvp_data
86
87
class _VerifyErrors(object):
  """Mix-in for cluster/group verify LUs.

  It provides _Error and _ErrorIf, and updates the self.bad boolean. (Expects
  self.op and self._feedback_fn to be available.)

  """

  ETYPE_ERROR = constants.CV_ERROR
  ETYPE_WARNING = constants.CV_WARNING

  def _ErrorMsgList(self, error_descriptor, object_name, message_list,
                    log_type=ETYPE_ERROR):
    """Format multiple error messages.

    Based on the opcode's error_codes parameter, either format a
    parseable error code, or a simpler error string.

    This must be called only from Exec and functions called from Exec.

    @type error_descriptor: tuple (string, string, string)
    @param error_descriptor: triplet describing the error (object_type,
        code, description)
    @type object_name: string
    @param object_name: name of object (instance, node ..) the error relates to
    @type message_list: list of strings
    @param message_list: body of error messages
    @type log_type: string
    @param log_type: log message type (WARNING, ERROR ..)

    """
    # Called with empty list - nothing to do
    if not message_list:
      return

    object_type, error_code, _ = error_descriptor
    # If the error code is in the list of ignored errors, demote the error to a
    # warning
    if error_code in self.op.ignore_errors: # pylint: disable=E1101
      log_type = self.ETYPE_WARNING

    prefixed_list = []
    if self.op.error_codes: # This is a mix-in. pylint: disable=E1101
      for msg in message_list:
        prefixed_list.append(" - %s:%s:%s:%s:%s" % (
            log_type, error_code, object_type, object_name, msg))
    else:
      if not object_name:
        object_name = ""
      for msg in message_list:
        prefixed_list.append(" - %s: %s %s: %s" % (
            log_type, object_type, object_name, msg))

    # Report messages via the feedback_fn
    # pylint: disable=E1101
    self._feedback_fn(constants.ELOG_MESSAGE_LIST, prefixed_list)

    # do not mark the operation as failed for WARN cases only
    if log_type == self.ETYPE_ERROR:
      self.bad = True

  def _ErrorMsg(self, error_descriptor, object_name, message,
                log_type=ETYPE_ERROR):
    """Log a single error message.

    """
    self._ErrorMsgList(error_descriptor, object_name, [message], log_type)

  # TODO: Replace this method with a cleaner interface, get rid of the if
  # condition as it only rarely saves lines, but makes things less readable.
  def _ErrorIf(self, cond, *args, **kwargs):
    """Log an error message if the passed condition is True.

    Note: with debug_simulate_errors set in the opcode, the message is
    emitted even for a false condition.

    """
    if (bool(cond)
        or self.op.debug_simulate_errors): # pylint: disable=E1101
      self._Error(*args, **kwargs)

  # TODO: Replace this method with a cleaner interface
  def _Error(self, ecode, item, message, *args, **kwargs):
    """Log an error message unconditionally.

    If positional arguments are given, C{message} is %-expanded with them.

    """
    #TODO: Remove 'code' argument in favour of using log_type
    log_type = kwargs.get('code', self.ETYPE_ERROR)
    if args:
      message = message % args
    self._ErrorMsgList(ecode, item, [message], log_type=log_type)
176
177
class LUClusterVerify(NoHooksLU):
  """Submits all jobs necessary to verify the cluster.

  """
  REQ_BGL = False

  def ExpandNames(self):
    self.needed_locks = {}

  def Exec(self, feedback_fn):
    """Build and submit the verification jobs.

    One job verifies the global configuration; one additional job per node
    group (or only the requested group) runs the group-level checks, each
    depending on the configuration job.

    """
    jobs = []

    if self.op.group_name:
      groups = [self.op.group_name]
    else:
      groups = self.cfg.GetNodeGroupList()

    # Verify global configuration
    jobs.append([
      opcodes.OpClusterVerifyConfig(ignore_errors=self.op.ignore_errors),
      ])

    # Always depend on global verification
    depends_fn = lambda: [(-len(jobs), [])]

    jobs.extend(
      [opcodes.OpClusterVerifyGroup(group_name=group,
                                    ignore_errors=self.op.ignore_errors,
                                    depends=depends_fn(),
                                    verify_clutter=self.op.verify_clutter)]
      for group in groups)

    # Fix up all parameters
    for op in itertools.chain(*jobs):
      op.debug_simulate_errors = self.op.debug_simulate_errors
      op.verbose = self.op.verbose
      op.error_codes = self.op.error_codes
      try:
        op.skip_checks = self.op.skip_checks
      except AttributeError:
        assert not isinstance(op, opcodes.OpClusterVerifyGroup)

    return ResultWithJobs(jobs)
222
223
class LUClusterVerifyDisks(NoHooksLU):
  """Verifies the cluster disks status.

  """
  REQ_BGL = False

  def ExpandNames(self):
    """Acquire shared node group locks, for one group or for all of them.

    """
    self.share_locks = ShareAll()
    if self.op.group_name:
      group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
      self.needed_locks = {
        locking.LEVEL_NODEGROUP: [group_uuid]
        }
    else:
      self.needed_locks = {
        locking.LEVEL_NODEGROUP: locking.ALL_SET,
        }

  def Exec(self, feedback_fn):
    """Submit one disk-verification job per locked node group.

    """
    group_names = self.owned_locks(locking.LEVEL_NODEGROUP)

    all_ext = compat.all(
      self.cfg.GetInstanceDiskTemplate(inst_uuid) == constants.DT_EXT
      for inst_uuid in self.cfg.GetInstanceList())

    # We skip current NodeGroup verification if there are only external storage
    # devices. Currently we provide an interface for external storage provider
    # for disk verification implementations, however current ExtStorageDevice
    # does not provide an API for this yet.
    #
    # This check needs to be revisited if ES_ACTION_VERIFY on ExtStorageDevice
    # is implemented.
    if all_ext:
      logging.info("All instances have ext storage, skipping verify disks.")
      return ResultWithJobs([])

    # Submit one instance of L{opcodes.OpGroupVerifyDisks} per node group
    per_group_jobs = [[opcodes.OpGroupVerifyDisks(group_name=group,
                                                  is_strict=self.op.is_strict)]
                      for group in group_names]
    return ResultWithJobs(per_group_jobs)
265
266
class LUClusterVerifyConfig(NoHooksLU, _VerifyErrors):
  """Verifies the cluster config.

  """
  REQ_BGL = False

  def _VerifyHVP(self, hvp_data):
    """Verifies locally the syntax of the hypervisor parameters.

    @type hvp_data: list of (origin, hypervisor, parameters)
    @param hvp_data: parameters to check, as returned by
        L{_GetAllHypervisorParameters}

    """
    for item, hv_name, hv_params in hvp_data:
      msg = ("hypervisor %s parameters syntax check (source %s): %%s" %
             (item, hv_name))
      try:
        hv_class = hypervisor.GetHypervisorClass(hv_name)
        utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
        hv_class.CheckParameterSyntax(hv_params)
      # "except E as err" works on Python 2.6+ and 3.x, unlike "except E, err"
      except errors.GenericError as err:
        self._ErrorIf(True, constants.CV_ECLUSTERCFG, None, msg % str(err))

  def ExpandNames(self):
    self.needed_locks = dict.fromkeys(locking.LEVELS, locking.ALL_SET)
    self.share_locks = ShareAll()

  def CheckPrereq(self):
    """Check prerequisites.

    """
    # Retrieve all information
    self.all_group_info = self.cfg.GetAllNodeGroupsInfo()
    self.all_node_info = self.cfg.GetAllNodesInfo()
    self.all_inst_info = self.cfg.GetAllInstancesInfo()

  def Exec(self, feedback_fn):
    """Verify integrity of cluster, performing various test on nodes.

    @rtype: boolean
    @return: True if no errors (warnings excluded) were reported

    """
    self.bad = False
    self._feedback_fn = feedback_fn

    feedback_fn("* Verifying cluster config")

    msg_list = self.cfg.VerifyConfig()
    self._ErrorMsgList(constants.CV_ECLUSTERCFG, None, msg_list)

    feedback_fn("* Verifying cluster certificate files")

    for cert_filename in pathutils.ALL_CERT_FILES:
      (errcode, msg) = utils.VerifyCertificate(cert_filename)
      self._ErrorIf(errcode, constants.CV_ECLUSTERCERT, None, msg, code=errcode)

    self._ErrorIf(not utils.CanRead(constants.LUXID_USER,
                                    pathutils.NODED_CERT_FILE),
                  constants.CV_ECLUSTERCERT,
                  None,
                  pathutils.NODED_CERT_FILE + " must be accessible by the " +
                  constants.LUXID_USER + " user")

    feedback_fn("* Verifying hypervisor parameters")

    self._VerifyHVP(_GetAllHypervisorParameters(self.cfg.GetClusterInfo(),
                                                self.all_inst_info.values()))

    feedback_fn("* Verifying all nodes belong to an existing group")

    # We do this verification here because, should this bogus circumstance
    # occur, it would never be caught by VerifyGroup, which only acts on
    # nodes/instances reachable from existing node groups.

    dangling_nodes = set(node for node in self.all_node_info.values()
                         if node.group not in self.all_group_info)

    dangling_instances = {}
    no_node_instances = []

    # Precompute the uuid set once instead of rebuilding a list per instance
    dangling_node_uuids = set(node.uuid for node in dangling_nodes)

    for inst in self.all_inst_info.values():
      if inst.primary_node in dangling_node_uuids:
        dangling_instances.setdefault(inst.primary_node, []).append(inst)
      elif inst.primary_node not in self.all_node_info:
        no_node_instances.append(inst)

    pretty_dangling = [
      "%s (%s)" %
      (node.name,
       utils.CommaJoin(inst.name for
                       inst in dangling_instances.get(node.uuid, [])))
      for node in dangling_nodes]

    self._ErrorIf(bool(dangling_nodes), constants.CV_ECLUSTERDANGLINGNODES,
                  None,
                  "the following nodes (and their instances) belong to a non"
                  " existing group: %s", utils.CommaJoin(pretty_dangling))

    self._ErrorIf(bool(no_node_instances), constants.CV_ECLUSTERDANGLINGINST,
                  None,
                  "the following instances have a non-existing primary-node:"
                  " %s", utils.CommaJoin(inst.name for
                                         inst in no_node_instances))

    return not self.bad
367
368
369 class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
370 """Verifies the status of a node group.
371
372 """
373 HPATH = "cluster-verify"
374 HTYPE = constants.HTYPE_CLUSTER
375 REQ_BGL = False
376
377 _HOOKS_INDENT_RE = re.compile("^", re.M)
378
  class NodeImage(object):
    """A class representing the logical and physical status of a node.

    @type uuid: string
    @ivar uuid: the node UUID to which this object refers
    @ivar volumes: a structure as returned from
        L{ganeti.backend.GetVolumeList} (runtime)
    @ivar instances: a list of running instances (runtime)
    @ivar pinst: list of configured primary instances (config)
    @ivar sinst: list of configured secondary instances (config)
    @ivar sbp: dictionary of {primary-node: list of instances} for all
        instances for which this node is secondary (config)
    @ivar mfree: free memory, as reported by hypervisor (runtime)
    @ivar mtotal: total memory, as reported by hypervisor (runtime)
    @ivar mdom0: domain0 memory, as reported by hypervisor (runtime)
    @ivar dfree: free disk, as reported by the node (runtime)
    @ivar offline: the offline status (config)
    @type rpc_fail: boolean
    @ivar rpc_fail: whether the RPC verify call was successfull (overall,
        not whether the individual keys were correct) (runtime)
    @type lvm_fail: boolean
    @ivar lvm_fail: whether the RPC call didn't return valid LVM data
    @type hyp_fail: boolean
    @ivar hyp_fail: whether the RPC call didn't return the instance list
    @type ghost: boolean
    @ivar ghost: whether this is a known node or not (config)
    @type os_fail: boolean
    @ivar os_fail: whether the RPC call didn't return valid OS data
    @type oslist: list
    @ivar oslist: list of OSes as diagnosed by DiagnoseOS
    @type vm_capable: boolean
    @ivar vm_capable: whether the node can host instances
    @type pv_min: float
    @ivar pv_min: size in MiB of the smallest PVs
    @type pv_max: float
    @ivar pv_max: size in MiB of the biggest PVs

    """
    def __init__(self, offline=False, uuid=None, vm_capable=True):
      """Initialize all fields to their "no data gathered yet" defaults.

      @type offline: boolean
      @param offline: the offline status of the node (config)
      @type uuid: string
      @param uuid: the UUID of the node this image describes
      @type vm_capable: boolean
      @param vm_capable: whether the node can host instances

      """
      self.uuid = uuid
      # Runtime data, filled in later from the node's verify RPC results
      self.volumes = {}
      self.instances = []
      # Configuration data about instances related to this node
      self.pinst = []
      self.sinst = []
      self.sbp = {}
      # Memory/disk figures stay 0 until the RPC reply is processed
      self.mfree = 0
      self.mtotal = 0
      self.mdom0 = 0
      self.dfree = 0
      self.offline = offline
      self.vm_capable = vm_capable
      # Failure flags, set when the corresponding part of the verify RPC
      # reply is missing or malformed
      self.rpc_fail = False
      self.lvm_fail = False
      self.hyp_fail = False
      # "Ghost" nodes are referenced but not known to the configuration
      self.ghost = False
      self.os_fail = False
      self.oslist = {}
      # PV size extremes, filled in by _UpdateVerifyNodeLVM (used by the
      # exclusive-storage group checks); None means no data collected
      self.pv_min = None
      self.pv_max = None
438
439 def ExpandNames(self):
440 # This raises errors.OpPrereqError on its own:
441 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
442
443 # Get instances in node group; this is unsafe and needs verification later
444 inst_uuids = \
445 self.cfg.GetNodeGroupInstances(self.group_uuid, primary_only=True)
446
447 self.needed_locks = {
448 locking.LEVEL_INSTANCE: self.cfg.GetInstanceNames(inst_uuids),
449 locking.LEVEL_NODEGROUP: [self.group_uuid],
450 locking.LEVEL_NODE: [],
451 }
452
453 self.share_locks = ShareAll()
454
455 def DeclareLocks(self, level):
456 if level == locking.LEVEL_NODE:
457 # Get members of node group; this is unsafe and needs verification later
458 nodes = set(self.cfg.GetNodeGroup(self.group_uuid).members)
459
460 # In Exec(), we warn about mirrored instances that have primary and
461 # secondary living in separate node groups. To fully verify that
462 # volumes for these instances are healthy, we will need to do an
463 # extra call to their secondaries. We ensure here those nodes will
464 # be locked.
465 for inst_name in self.owned_locks(locking.LEVEL_INSTANCE):
466 # Important: access only the instances whose lock is owned
467 instance = self.cfg.GetInstanceInfoByName(inst_name)
468 disks = self.cfg.GetInstanceDisks(instance.uuid)
469 if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR):
470 nodes.update(self.cfg.GetInstanceSecondaryNodes(instance.uuid))
471
472 self.needed_locks[locking.LEVEL_NODE] = nodes
473
474 def CheckPrereq(self):
475 assert self.group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
476 self.group_info = self.cfg.GetNodeGroup(self.group_uuid)
477
478 group_node_uuids = set(self.group_info.members)
479 group_inst_uuids = \
480 self.cfg.GetNodeGroupInstances(self.group_uuid, primary_only=True)
481
482 unlocked_node_uuids = \
483 group_node_uuids.difference(self.owned_locks(locking.LEVEL_NODE))
484
485 unlocked_inst_uuids = \
486 group_inst_uuids.difference(
487 [self.cfg.GetInstanceInfoByName(name).uuid
488 for name in self.owned_locks(locking.LEVEL_INSTANCE)])
489
490 if unlocked_node_uuids:
491 raise errors.OpPrereqError(
492 "Missing lock for nodes: %s" %
493 utils.CommaJoin(self.cfg.GetNodeNames(unlocked_node_uuids)),
494 errors.ECODE_STATE)
495
496 if unlocked_inst_uuids:
497 raise errors.OpPrereqError(
498 "Missing lock for instances: %s" %
499 utils.CommaJoin(self.cfg.GetInstanceNames(unlocked_inst_uuids)),
500 errors.ECODE_STATE)
501
502 self.all_node_info = self.cfg.GetAllNodesInfo()
503 self.all_inst_info = self.cfg.GetAllInstancesInfo()
504 self.all_disks_info = self.cfg.GetAllDisksInfo()
505
506 self.my_node_uuids = group_node_uuids
507 self.my_node_info = dict((node_uuid, self.all_node_info[node_uuid])
508 for node_uuid in group_node_uuids)
509
510 self.my_inst_uuids = group_inst_uuids
511 self.my_inst_info = dict((inst_uuid, self.all_inst_info[inst_uuid])
512 for inst_uuid in group_inst_uuids)
513
514 # We detect here the nodes that will need the extra RPC calls for verifying
515 # split LV volumes; they should be locked.
516 extra_lv_nodes = {}
517
518 for inst in self.my_inst_info.values():
519 disks = self.cfg.GetInstanceDisks(inst.uuid)
520 if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR):
521 inst_nodes = self.cfg.GetInstanceNodes(inst.uuid)
522 for nuuid in inst_nodes:
523 if self.all_node_info[nuuid].group != self.group_uuid:
524 if nuuid in extra_lv_nodes:
525 extra_lv_nodes[nuuid].append(inst.name)
526 else:
527 extra_lv_nodes[nuuid] = [inst.name]
528
529 extra_lv_nodes_set = set(extra_lv_nodes.iterkeys())
530 unlocked_lv_nodes = \
531 extra_lv_nodes_set.difference(self.owned_locks(locking.LEVEL_NODE))
532
533 if unlocked_lv_nodes:
534 node_strings = ['%s: [%s]' % (
535 self.cfg.GetNodeName(node), utils.CommaJoin(extra_lv_nodes[node]))
536 for node in unlocked_lv_nodes]
537 raise errors.OpPrereqError("Missing node locks for LV check: %s" %
538 utils.CommaJoin(node_strings),
539 errors.ECODE_STATE)
540 self.extra_lv_nodes = list(extra_lv_nodes_set)
541
542 def _VerifyNode(self, ninfo, nresult):
543 """Perform some basic validation on data returned from a node.
544
545 - check the result data structure is well formed and has all the
546 mandatory fields
547 - check ganeti version
548
549 @type ninfo: L{objects.Node}
550 @param ninfo: the node to check
551 @param nresult: the results from the node
552 @rtype: boolean
553 @return: whether overall this call was successful (and we can expect
554 reasonable values in the respose)
555
556 """
557 # main result, nresult should be a non-empty dict
558 test = not nresult or not isinstance(nresult, dict)
559 self._ErrorIf(test, constants.CV_ENODERPC, ninfo.name,
560 "unable to verify node: no data returned")
561 if test:
562 return False
563
564 # compares ganeti version
565 local_version = constants.PROTOCOL_VERSION
566 remote_version = nresult.get("version", None)
567 test = not (remote_version and
568 isinstance(remote_version, (list, tuple)) and
569 len(remote_version) == 2)
570 self._ErrorIf(test, constants.CV_ENODERPC, ninfo.name,
571 "connection to node returned invalid data")
572 if test:
573 return False
574
575 test = local_version != remote_version[0]
576 self._ErrorIf(test, constants.CV_ENODEVERSION, ninfo.name,
577 "incompatible protocol versions: master %s,"
578 " node %s", local_version, remote_version[0])
579 if test:
580 return False
581
582 # node seems compatible, we can actually try to look into its results
583
584 # full package version
585 self._ErrorIf(constants.RELEASE_VERSION != remote_version[1],
586 constants.CV_ENODEVERSION, ninfo.name,
587 "software version mismatch: master %s, node %s",
588 constants.RELEASE_VERSION, remote_version[1],
589 code=self.ETYPE_WARNING)
590
591 hyp_result = nresult.get(constants.NV_HYPERVISOR, None)
592 if ninfo.vm_capable and isinstance(hyp_result, dict):
593 for hv_name, hv_result in hyp_result.iteritems():
594 test = hv_result is not None
595 self._ErrorIf(test, constants.CV_ENODEHV, ninfo.name,
596 "hypervisor %s verify failure: '%s'", hv_name, hv_result)
597
598 hvp_result = nresult.get(constants.NV_HVPARAMS, None)
599 if ninfo.vm_capable and isinstance(hvp_result, list):
600 for item, hv_name, hv_result in hvp_result:
601 self._ErrorIf(True, constants.CV_ENODEHV, ninfo.name,
602 "hypervisor %s parameter verify failure (source %s): %s",
603 hv_name, item, hv_result)
604
605 test = nresult.get(constants.NV_NODESETUP,
606 ["Missing NODESETUP results"])
607 self._ErrorIf(test, constants.CV_ENODESETUP, ninfo.name,
608 "node setup error: %s", "; ".join(test))
609
610 return True
611
612 def _VerifyNodeTime(self, ninfo, nresult,
613 nvinfo_starttime, nvinfo_endtime):
614 """Check the node time.
615
616 @type ninfo: L{objects.Node}
617 @param ninfo: the node to check
618 @param nresult: the remote results for the node
619 @param nvinfo_starttime: the start time of the RPC call
620 @param nvinfo_endtime: the end time of the RPC call
621
622 """
623 ntime = nresult.get(constants.NV_TIME, None)
624 try:
625 ntime_merged = utils.MergeTime(ntime)
626 except (ValueError, TypeError):
627 self._ErrorIf(True, constants.CV_ENODETIME, ninfo.name,
628 "Node returned invalid time")
629 return
630
631 if ntime_merged < (nvinfo_starttime - constants.NODE_MAX_CLOCK_SKEW):
632 ntime_diff = "%.01fs" % abs(nvinfo_starttime - ntime_merged)
633 elif ntime_merged > (nvinfo_endtime + constants.NODE_MAX_CLOCK_SKEW):
634 ntime_diff = "%.01fs" % abs(ntime_merged - nvinfo_endtime)
635 else:
636 ntime_diff = None
637
638 self._ErrorIf(ntime_diff is not None, constants.CV_ENODETIME, ninfo.name,
639 "Node time diverges by at least %s from master node time",
640 ntime_diff)
641
642 def _UpdateVerifyNodeLVM(self, ninfo, nresult, vg_name, nimg):
643 """Check the node LVM results and update info for cross-node checks.
644
645 @type ninfo: L{objects.Node}
646 @param ninfo: the node to check
647 @param nresult: the remote results for the node
648 @param vg_name: the configured VG name
649 @type nimg: L{NodeImage}
650 @param nimg: node image
651
652 """
653 if vg_name is None:
654 return
655
656 # checks vg existence and size > 20G
657 vglist = nresult.get(constants.NV_VGLIST, None)
658 test = not vglist
659 self._ErrorIf(test, constants.CV_ENODELVM, ninfo.name,
660 "unable to check volume groups")
661 if not test:
662 vgstatus = utils.CheckVolumeGroupSize(vglist, vg_name,
663 constants.MIN_VG_SIZE)
664 self._ErrorIf(vgstatus, constants.CV_ENODELVM, ninfo.name, vgstatus)
665
666 # Check PVs
667 (errmsgs, pvminmax) = CheckNodePVs(nresult, self._exclusive_storage)
668 for em in errmsgs:
669 self._Error(constants.CV_ENODELVM, ninfo.name, em)
670 if pvminmax is not None:
671 (nimg.pv_min, nimg.pv_max) = pvminmax
672
673 def _VerifyGroupDRBDVersion(self, node_verify_infos):
674 """Check cross-node DRBD version consistency.
675
676 @type node_verify_infos: dict
677 @param node_verify_infos: infos about nodes as returned from the
678 node_verify call.
679
680 """
681 node_versions = {}
682 for node_uuid, ndata in node_verify_infos.items():
683 nresult = ndata.payload
684 if nresult:
685 version = nresult.get(constants.NV_DRBDVERSION, None)
686 if version:
687 node_versions[node_uuid] = version
688
689 if len(set(node_versions.values())) > 1:
690 for node_uuid, version in sorted(node_versions.items()):
691 msg = "DRBD version mismatch: %s" % version
692 self._Error(constants.CV_ENODEDRBDHELPER, node_uuid, msg,
693 code=self.ETYPE_WARNING)
694
695 def _VerifyGroupLVM(self, node_image, vg_name):
696 """Check cross-node consistency in LVM.
697
698 @type node_image: dict
699 @param node_image: info about nodes, mapping from node to names to
700 L{NodeImage} objects
701 @param vg_name: the configured VG name
702
703 """
704 if vg_name is None:
705 return
706
707 # Only exclusive storage needs this kind of checks
708 if not self._exclusive_storage:
709 return
710
711 # exclusive_storage wants all PVs to have the same size (approximately),
712 # if the smallest and the biggest ones are okay, everything is fine.
713 # pv_min is None iff pv_max is None
714 vals = [ni for ni in node_image.values() if ni.pv_min is not None]
715 if not vals:
716 return
717 (pvmin, minnode_uuid) = min((ni.pv_min, ni.uuid) for ni in vals)
718 (pvmax, maxnode_uuid) = max((ni.pv_max, ni.uuid) for ni in vals)
719 bad = utils.LvmExclusiveTestBadPvSizes(pvmin, pvmax)
720 self._ErrorIf(bad, constants.CV_EGROUPDIFFERENTPVSIZE, self.group_info.name,
721 "PV sizes differ too much in the group; smallest (%s MB) is"
722 " on %s, biggest (%s MB) is on %s",
723 pvmin, self.cfg.GetNodeName(minnode_uuid),
724 pvmax, self.cfg.GetNodeName(maxnode_uuid))
725
726 def _VerifyNodeBridges(self, ninfo, nresult, bridges):
727 """Check the node bridges.
728
729 @type ninfo: L{objects.Node}
730 @param ninfo: the node to check
731 @param nresult: the remote results for the node
732 @param bridges: the expected list of bridges
733
734 """
735 if not bridges:
736 return
737
738 missing = nresult.get(constants.NV_BRIDGES, None)
739 test = not isinstance(missing, list)
740 self._ErrorIf(test, constants.CV_ENODENET, ninfo.name,
741 "did not return valid bridge information")
742 if not test:
743 self._ErrorIf(bool(missing), constants.CV_ENODENET, ninfo.name,
744 "missing bridges: %s" % utils.CommaJoin(sorted(missing)))
745
746 def _VerifyNodeUserScripts(self, ninfo, nresult):
747 """Check the results of user scripts presence and executability on the node
748
749 @type ninfo: L{objects.Node}
750 @param ninfo: the node to check
751 @param nresult: the remote results for the node
752
753 """
754 test = not constants.NV_USERSCRIPTS in nresult
755 self._ErrorIf(test, constants.CV_ENODEUSERSCRIPTS, ninfo.name,
756 "did not return user scripts information")
757
758 broken_scripts = nresult.get(constants.NV_USERSCRIPTS, None)
759 if not test:
760 self._ErrorIf(broken_scripts, constants.CV_ENODEUSERSCRIPTS, ninfo.name,
761 "user scripts not present or not executable: %s" %
762 utils.CommaJoin(sorted(broken_scripts)))
763
  def _VerifyNodeNetwork(self, ninfo, nresult):
    """Check the node network connectivity results.

    Three independent checks: SSH connectivity to peer nodes, TCP
    connectivity to peer nodes, and reachability of the master IP.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the remote results for the node

    """
    # SSH results: a missing key means the node did not run the check; a
    # non-empty dict maps peer node name to the error message
    test = constants.NV_NODELIST not in nresult
    self._ErrorIf(test, constants.CV_ENODESSH, ninfo.name,
                  "node hasn't returned node ssh connectivity data")
    if not test:
      if nresult[constants.NV_NODELIST]:
        for a_node, a_msg in nresult[constants.NV_NODELIST].items():
          self._ErrorIf(True, constants.CV_ENODESSH, ninfo.name,
                        "ssh communication with node '%s': %s", a_node, a_msg)

    # TCP results: NV_NODENETTEST maps each failing peer to its error message
    if constants.NV_NODENETTEST not in nresult:
      self._ErrorMsg(constants.CV_ENODENET, ninfo.name,
                     "node hasn't returned node tcp connectivity data")
    elif nresult[constants.NV_NODENETTEST]:
      nlist = utils.NiceSort(nresult[constants.NV_NODENETTEST].keys())
      msglist = []
      for node in nlist:
        msglist.append("tcp communication with node '%s': %s" %
                       (node, nresult[constants.NV_NODENETTEST][node]))
      self._ErrorMsgList(constants.CV_ENODENET, ninfo.name, msglist)

    # Master IP reachability
    if constants.NV_MASTERIP not in nresult:
      self._ErrorMsg(constants.CV_ENODENET, ninfo.name,
                     "node hasn't returned node master IP reachability data")
    elif nresult[constants.NV_MASTERIP] is False:  # be explicit, could be None
      if ninfo.uuid == self.master_node:
        # On the master itself this usually means the IP is not configured
        msg = "the master node cannot reach the master IP (not configured?)"
      else:
        msg = "cannot reach the master IP"
      self._ErrorMsg(constants.CV_ENODENET, ninfo.name, msg)
801
802 def _VerifyInstance(self, instance, node_image, diskstatus):
803 """Verify an instance.
804
805 This function checks to see if the required block devices are
806 available on the instance's node, and that the nodes are in the correct
807 state.
808
809 """
810 pnode_uuid = instance.primary_node
811 pnode_img = node_image[pnode_uuid]
812 groupinfo = self.cfg.GetAllNodeGroupsInfo()
813
814 node_vol_should = {}
815 self.cfg.GetInstanceLVsByNode(instance.uuid, lvmap=node_vol_should)
816
817 cluster = self.cfg.GetClusterInfo()
818 ipolicy = ganeti.masterd.instance.CalculateGroupIPolicy(cluster,
819 self.group_info)
820 err = ComputeIPolicyInstanceViolation(ipolicy, instance, self.cfg)
821 self._ErrorIf(err, constants.CV_EINSTANCEPOLICY, instance.name,
822 utils.CommaJoin(err), code=self.ETYPE_WARNING)
823
824 for node_uuid in node_vol_should:
825 n_img = node_image[node_uuid]
826 if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
827 # ignore missing volumes on offline or broken nodes
828 continue
829 for volume in node_vol_should[node_uuid]:
830 test = volume not in n_img.volumes
831 self._ErrorIf(test, constants.CV_EINSTANCEMISSINGDISK, instance.name,
832 "volume %s missing on node %s", volume,
833 self.cfg.GetNodeName(node_uuid))
834
835 if instance.admin_state == constants.ADMINST_UP:
836 test = instance.uuid not in pnode_img.instances and not pnode_img.offline
837 self._ErrorIf(test, constants.CV_EINSTANCEDOWN, instance.name,
838 "instance not running on its primary node %s",
839 self.cfg.GetNodeName(pnode_uuid))
840 self._ErrorIf(pnode_img.offline, constants.CV_EINSTANCEBADNODE,
841 instance.name, "instance is marked as running and lives on"
842 " offline node %s", self.cfg.GetNodeName(pnode_uuid))
843
844 diskdata = [(nname, success, status, idx)
845 for (nname, disks) in diskstatus.items()
846 for idx, (success, status) in enumerate(disks)]
847
848 for nname, success, bdev_status, idx in diskdata:
849 # the 'ghost node' construction in Exec() ensures that we have a
850 # node here
851 snode = node_image[nname]
852 bad_snode = snode.ghost or snode.offline
853 self._ErrorIf(instance.disks_active and
854 not success and not bad_snode,
855 constants.CV_EINSTANCEFAULTYDISK, instance.name,
856 "couldn't retrieve status for disk/%s on %s: %s",
857 idx, self.cfg.GetNodeName(nname), bdev_status)
858
859 if instance.disks_active and success and bdev_status.is_degraded:
860 msg = "disk/%s on %s is degraded" % (idx, self.cfg.GetNodeName(nname))
861
862 code = self.ETYPE_ERROR
863 accepted_lds = [constants.LDS_OKAY, constants.LDS_SYNC]
864
865 if bdev_status.ldisk_status in accepted_lds:
866 code = self.ETYPE_WARNING
867
868 msg += "; local disk state is '%s'" % \
869 constants.LDS_NAMES[bdev_status.ldisk_status]
870
871 self._Error(constants.CV_EINSTANCEFAULTYDISK, instance.name, msg,
872 code=code)
873
874 self._ErrorIf(pnode_img.rpc_fail and not pnode_img.offline,
875 constants.CV_ENODERPC, self.cfg.GetNodeName(pnode_uuid),
876 "instance %s, connection to primary node failed",
877 instance.name)
878
879 secondary_nodes = self.cfg.GetInstanceSecondaryNodes(instance.uuid)
880 self._ErrorIf(len(secondary_nodes) > 1,
881 constants.CV_EINSTANCELAYOUT, instance.name,
882 "instance has multiple secondary nodes: %s",
883 utils.CommaJoin(secondary_nodes),
884 code=self.ETYPE_WARNING)
885
886 inst_nodes = self.cfg.GetInstanceNodes(instance.uuid)
887 es_flags = rpc.GetExclusiveStorageForNodes(self.cfg, inst_nodes)
888 disks = self.cfg.GetInstanceDisks(instance.uuid)
889 if any(es_flags.values()):
890 if not utils.AllDiskOfType(disks, constants.DTS_EXCL_STORAGE):
891 # Disk template not compatible with exclusive_storage: no instance
892 # node should have the flag set
893 es_nodes = [n
894 for (n, es) in es_flags.items()
895 if es]
896 unsupported = [d.dev_type for d in disks
897 if d.dev_type not in constants.DTS_EXCL_STORAGE]
898 self._Error(constants.CV_EINSTANCEUNSUITABLENODE, instance.name,
899 "instance uses disk types %s, which are not supported on"
900 " nodes that have exclusive storage set: %s",
901 utils.CommaJoin(unsupported),
902 utils.CommaJoin(self.cfg.GetNodeNames(es_nodes)))
903 for (idx, disk) in enumerate(disks):
904 self._ErrorIf(disk.spindles is None,
905 constants.CV_EINSTANCEMISSINGCFGPARAMETER, instance.name,
906 "number of spindles not configured for disk %s while"
907 " exclusive storage is enabled, try running"
908 " gnt-cluster repair-disk-sizes", idx)
909
910 if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR):
911 instance_nodes = utils.NiceSort(inst_nodes)
912 instance_groups = {}
913
914 for node_uuid in instance_nodes:
915 instance_groups.setdefault(self.all_node_info[node_uuid].group,
916 []).append(node_uuid)
917
918 pretty_list = [
919 "%s (group %s)" % (utils.CommaJoin(self.cfg.GetNodeNames(nodes)),
920 groupinfo[group].name)
921 # Sort so that we always list the primary node first.
922 for group, nodes in sorted(instance_groups.items(),
923 key=lambda (_, nodes): pnode_uuid in nodes,
924 reverse=True)]
925
926 self._ErrorIf(len(instance_groups) > 1,
927 constants.CV_EINSTANCESPLITGROUPS,
928 instance.name, "instance has primary and secondary nodes in"
929 " different groups: %s", utils.CommaJoin(pretty_list),
930 code=self.ETYPE_WARNING)
931
932 inst_nodes_offline = []
933 for snode in secondary_nodes:
934 s_img = node_image[snode]
935 self._ErrorIf(s_img.rpc_fail and not s_img.offline, constants.CV_ENODERPC,
936 self.cfg.GetNodeName(snode),
937 "instance %s, connection to secondary node failed",
938 instance.name)
939
940 if s_img.offline:
941 inst_nodes_offline.append(snode)
942
943 # warn that the instance lives on offline nodes
944 self._ErrorIf(inst_nodes_offline, constants.CV_EINSTANCEBADNODE,
945 instance.name, "instance has offline secondary node(s) %s",
946 utils.CommaJoin(self.cfg.GetNodeNames(inst_nodes_offline)))
947 # ... or ghost/non-vm_capable nodes
948 for node_uuid in inst_nodes:
949 self._ErrorIf(node_image[node_uuid].ghost, constants.CV_EINSTANCEBADNODE,
950 instance.name, "instance lives on ghost node %s",
951 self.cfg.GetNodeName(node_uuid))
952 self._ErrorIf(not node_image[node_uuid].vm_capable,
953 constants.CV_EINSTANCEBADNODE, instance.name,
954 "instance lives on non-vm_capable node %s",
955 self.cfg.GetNodeName(node_uuid))
956
957 def _VerifyOrphanVolumes(self, vg_name, node_vol_should, node_image,
958 reserved):
959 """Verify if there are any unknown volumes in the cluster.
960
961 The .os, .swap and backup volumes are ignored. All other volumes are
962 reported as unknown.
963
964 @type vg_name: string
965 @param vg_name: the name of the Ganeti-administered volume group
966 @type node_vol_should: dict
967 @param node_vol_should: mapping of node UUIDs to expected LVs on each node
968 @type node_image: dict
969 @param node_image: mapping of node UUIDs to L{NodeImage} objects
970 @type reserved: L{ganeti.utils.FieldSet}
971 @param reserved: a FieldSet of reserved volume names
972
973 """
974 for node_uuid, n_img in node_image.items():
975 if (n_img.offline or n_img.rpc_fail or n_img.lvm_fail or
976 self.all_node_info[node_uuid].group != self.group_uuid):
977 # skip non-healthy nodes
978 continue
979 for volume in n_img.volumes:
980 # skip volumes not belonging to the ganeti-administered volume group
981 if volume.split('/')[0] != vg_name:
982 continue
983
984 test = ((node_uuid not in node_vol_should or
985 volume not in node_vol_should[node_uuid]) and
986 not reserved.Matches(volume))
987 self._ErrorIf(test, constants.CV_ENODEORPHANLV,
988 self.cfg.GetNodeName(node_uuid),
989 "volume %s is unknown", volume,
990 code=_VerifyErrors.ETYPE_WARNING)
991
  def _VerifyNPlusOneMemory(self, node_image, all_insts):
    """Verify N+1 Memory Resilience.

    Check that if one single node dies we can still start all the
    instances it was primary for.

    @type node_image: dict
    @param node_image: mapping of node UUIDs to L{NodeImage} objects
    @type all_insts: dict
    @param all_insts: mapping of instance UUIDs to instance objects

    """
    cluster_info = self.cfg.GetClusterInfo()
    ipolicy = ganeti.masterd.instance.CalculateGroupIPolicy(cluster_info,
                                                           self.group_info)
    # memory over-commitment ratio taken from the group's instance policy
    memory_ratio = ipolicy[constants.IPOLICY_MEMORY_RATIO]

    for node_uuid, n_img in node_image.items():
      # This code checks that every node which is now listed as
      # secondary has enough memory to host all instances it is
      # supposed to should a single other node in the cluster fail.
      # FIXME: not ready for failover to an arbitrary node
      # FIXME: does not support file-backed instances
      # WARNING: we currently take into account down instances as well
      # as up ones, considering that even if they're down someone
      # might want to start them even in the event of a node failure.
      node_cfg = self.all_node_info[node_uuid]
      if n_img.offline or \
         node_cfg.group != self.group_uuid:
        # we're skipping nodes marked offline and nodes in other groups from
        # the N+1 warning, since most likely we don't have good memory
        # information from them; we already list instances living on such
        # nodes, and that's enough warning
        continue
      #TODO(dynmem): also consider ballooning out other instances
      # n_img.sbp appears to map a primary node UUID to the instances this
      # node is secondary for -- TODO confirm against NodeImage
      for prinode, inst_uuids in n_img.sbp.items():
        needed_mem = 0
        for inst_uuid in inst_uuids:
          bep = cluster_info.FillBE(all_insts[inst_uuid])
          # only auto-balanced instances are counted for N+1 purposes
          if bep[constants.BE_AUTO_BALANCE]:
            needed_mem += bep[constants.BE_MINMEM]
        mnode = n_img.mdom0
        # NOTE(review): dict.items()[0] is Python 2 only and picks the first
        # (effectively arbitrary) hypervisor's hv_state entry -- verify that
        # exactly one entry is expected here
        (hv, hv_state) = self.cfg.GetFilledHvStateParams(node_cfg).items()[0]
        if hv != constants.HT_XEN_PVM and hv != constants.HT_XEN_HVM:
          mnode = hv_state["mem_node"]
        # minimum allowed free memory (it's negative due to over-commitment)
        mem_treshold = (n_img.mtotal - mnode) * (memory_ratio - 1)
        test = n_img.mfree - needed_mem < mem_treshold
        self._ErrorIf(test, constants.CV_ENODEN1,
                      self.cfg.GetNodeName(node_uuid),
                      "not enough memory to accomodate instance failovers"
                      " should node %s fail (%dMiB needed, %dMiB available)",
                      self.cfg.GetNodeName(prinode), needed_mem, n_img.mfree)
1040
  def _CertError(self, *args):
    """Helper function for _VerifyClientCertificates.

    Reports an error with the fixed L{constants.CV_ECLUSTERCLIENTCERT}
    error code and remembers that at least one certificate problem was
    found, so a summary hint can be emitted at the end.

    @param args: message format string and its arguments, forwarded
      verbatim to L{_Error}

    """
    self._Error(constants.CV_ECLUSTERCLIENTCERT, None, *args)
    # remembered so the caller can append a "how to fix" hint once
    self._cert_error_found = True
1045
1046 def _VerifyClientCertificates(self, nodes, all_nvinfo):
1047 """Verifies the consistency of the client certificates.
1048
1049 This includes several aspects:
1050 - the individual validation of all nodes' certificates
1051 - the consistency of the master candidate certificate map
1052 - the consistency of the master candidate certificate map with the
1053 certificates that the master candidates are actually using.
1054
1055 @param nodes: the list of nodes to consider in this verification
1056 @param all_nvinfo: the map of results of the verify_node call to
1057 all nodes
1058
1059 """
1060
1061 rebuild_certs_msg = (
1062 "To rebuild node certificates, please run"
1063 " 'gnt-cluster renew-crypto --new-node-certificates'.")
1064
1065 self._cert_error_found = False
1066
1067 candidate_certs = self.cfg.GetClusterInfo().candidate_certs
1068 if not candidate_certs:
1069 self._CertError(
1070 "The cluster's list of master candidate certificates is empty."
1071 " This may be because you just updated the cluster. " +
1072 rebuild_certs_msg)
1073 return
1074
1075 if len(candidate_certs) != len(set(candidate_certs.values())):
1076 self._CertError(
1077 "There are at least two master candidates configured to use the same"
1078 " certificate.")
1079
1080 # collect the client certificate
1081 for node in nodes:
1082 if node.offline:
1083 continue
1084
1085 nresult = all_nvinfo[node.uuid]
1086 if nresult.fail_msg or not nresult.payload:
1087 continue
1088
1089 (errcode, msg) = nresult.payload.get(constants.NV_CLIENT_CERT, None)
1090
1091 if errcode is not None:
1092 self._CertError(
1093 "Client certificate of node '%s' failed validation: %s (code '%s')",
1094 node.uuid, msg, errcode)
1095 if not errcode:
1096 digest = msg
1097 if node.master_candidate:
1098 if node.uuid in candidate_certs:
1099 if digest != candidate_certs[node.uuid]:
1100 self._CertError(
1101 "Client certificate digest of master candidate '%s' does not"
1102 " match its entry in the cluster's map of master candidate"
1103 " certificates. Expected: %s Got: %s", node.uuid,
1104 digest, candidate_certs[node.uuid])
1105 else:
1106 self._CertError(
1107 "The master candidate '%s' does not have an entry in the"
1108 " map of candidate certificates.", node.uuid)
1109 if digest in candidate_certs.values():
1110 self._CertError(
1111 "Master candidate '%s' is using a certificate of another node.",
1112 node.uuid)
1113 else:
1114 if node.uuid in candidate_certs:
1115 self._CertError(
1116 "Node '%s' is not a master candidate, but still listed in the"
1117 " map of master candidate certificates.", node.uuid)
1118 if (node.uuid not in candidate_certs and
1119 digest in candidate_certs.values()):
1120 self._CertError(
1121 "Node '%s' is not a master candidate and is incorrectly using a"
1122 " certificate of another node which is master candidate.",
1123 node.uuid)
1124
1125 if self._cert_error_found:
1126 self._CertError(rebuild_certs_msg)
1127
1128 def _VerifySshSetup(self, nodes, all_nvinfo):
1129 """Evaluates the verification results of the SSH setup and clutter test.
1130
1131 @param nodes: List of L{objects.Node} objects
1132 @param all_nvinfo: RPC results
1133
1134 """
1135 for node in nodes:
1136 if not node.offline:
1137 nresult = all_nvinfo[node.uuid]
1138 if nresult.fail_msg or not nresult.payload:
1139 self._ErrorIf(True, constants.CV_ENODESSH, node.name,
1140 "Could not verify the SSH setup of this node.")
1141 return
1142 for ssh_test in [constants.NV_SSH_SETUP, constants.NV_SSH_CLUTTER]:
1143 result = nresult.payload.get(ssh_test, None)
1144 error_msg = ""
1145 if isinstance(result, list):
1146 error_msg = " ".join(result)
1147 self._ErrorIf(result,
1148 constants.CV_ENODESSH, None, error_msg)
1149
  def _VerifyFiles(self, nodes, master_node_uuid, all_nvinfo,
                   (files_all, files_opt, files_mc, files_vm)):
    """Verifies file checksums collected from all nodes.

    Note: the fourth parameter uses Python 2 only tuple-unpacking syntax.

    @param nodes: List of L{objects.Node} objects
    @param master_node_uuid: UUID of master node
    @param all_nvinfo: RPC results
    @param files_all: set of files that must exist on all nodes
    @param files_opt: set of files that may exist on all or no nodes
    @param files_mc: set of files for master candidates (and the master)
    @param files_vm: set of files for vm_capable nodes

    """
    # Define functions determining which nodes to consider for a file
    files2nodefn = [
      (files_all, None),
      (files_mc, lambda node: (node.master_candidate or
                               node.uuid == master_node_uuid)),
      (files_vm, lambda node: node.vm_capable),
      ]

    # Build mapping from filename to list of nodes which should have the file
    nodefiles = {}
    for (files, fn) in files2nodefn:
      if fn is None:
        filenodes = nodes
      else:
        filenodes = filter(fn, nodes)
      # the inner 'fn' deliberately shadows the filter function; the
      # generator expression has its own scope, so this is safe
      nodefiles.update((filename, frozenset(fn.uuid for fn in filenodes))
                       for filename in files)

    assert set(nodefiles) == (files_all | files_mc | files_vm)

    # filename -> {checksum -> set of node uuids having that checksum}
    fileinfo = dict((filename, {}) for filename in nodefiles)
    # nodes whose data is unusable; excluded from all consistency checks
    ignore_nodes = set()

    for node in nodes:
      if node.offline:
        ignore_nodes.add(node.uuid)
        continue

      nresult = all_nvinfo[node.uuid]

      if nresult.fail_msg or not nresult.payload:
        node_files = None
      else:
        fingerprints = nresult.payload.get(constants.NV_FILELIST, {})
        # paths come back in the virtual-cluster namespace
        node_files = dict((vcluster.LocalizeVirtualPath(key), value)
                          for (key, value) in fingerprints.items())
        del fingerprints

      test = not (node_files and isinstance(node_files, dict))
      self._ErrorIf(test, constants.CV_ENODEFILECHECK, node.name,
                    "Node did not return file checksum data")
      if test:
        ignore_nodes.add(node.uuid)
        continue

      # Build per-checksum mapping from filename to nodes having it
      for (filename, checksum) in node_files.items():
        assert filename in nodefiles
        fileinfo[filename].setdefault(checksum, set()).add(node.uuid)

    for (filename, checksums) in fileinfo.items():
      assert compat.all(len(i) > 10 for i in checksums), "Invalid checksum"

      # Nodes having the file
      with_file = frozenset(node_uuid
                            for node_uuids in fileinfo[filename].values()
                            for node_uuid in node_uuids) - ignore_nodes

      expected_nodes = nodefiles[filename] - ignore_nodes

      # Nodes missing file
      missing_file = expected_nodes - with_file

      if filename in files_opt:
        # All or no nodes
        self._ErrorIf(missing_file and missing_file != expected_nodes,
                      constants.CV_ECLUSTERFILECHECK, None,
                      "File %s is optional, but it must exist on all or no"
                      " nodes (not found on %s)",
                      filename,
                      utils.CommaJoin(utils.NiceSort(
                        self.cfg.GetNodeName(n) for n in missing_file)))
      else:
        self._ErrorIf(missing_file, constants.CV_ECLUSTERFILECHECK, None,
                      "File %s is missing from node(s) %s", filename,
                      utils.CommaJoin(utils.NiceSort(
                        self.cfg.GetNodeName(n) for n in missing_file)))

      # Warn if a node has a file it shouldn't
      unexpected = with_file - expected_nodes
      self._ErrorIf(unexpected,
                    constants.CV_ECLUSTERFILECHECK, None,
                    "File %s should not exist on node(s) %s",
                    filename,
                    utils.CommaJoin(utils.NiceSort(
                      self.cfg.GetNodeName(n) for n in unexpected)))

      # See if there are multiple versions of the file
      test = len(checksums) > 1
      if test:
        variants = ["variant %s on %s" %
                    (idx + 1,
                     utils.CommaJoin(utils.NiceSort(
                       self.cfg.GetNodeName(n) for n in node_uuids)))
                    for (idx, (checksum, node_uuids)) in
                    enumerate(sorted(checksums.items()))]
      else:
        variants = []

      self._ErrorIf(test, constants.CV_ECLUSTERFILECHECK, None,
                    "File %s found with %s different checksums (%s)",
                    filename, len(checksums), "; ".join(variants))
1261
1262 def _VerifyNodeDrbdHelper(self, ninfo, nresult, drbd_helper):
1263 """Verify the drbd helper.
1264
1265 """
1266 if drbd_helper:
1267 helper_result = nresult.get(constants.NV_DRBDHELPER, None)
1268 test = (helper_result is None)
1269 self._ErrorIf(test, constants.CV_ENODEDRBDHELPER, ninfo.name,
1270 "no drbd usermode helper returned")
1271 if helper_result:
1272 status, payload = helper_result
1273 test = not status
1274 self._ErrorIf(test, constants.CV_ENODEDRBDHELPER, ninfo.name,
1275 "drbd usermode helper check unsuccessful: %s", payload)
1276 test = status and (payload != drbd_helper)
1277 self._ErrorIf(test, constants.CV_ENODEDRBDHELPER, ninfo.name,
1278 "wrong drbd usermode helper: %s", payload)
1279
1280 @staticmethod
1281 def _ComputeDrbdMinors(ninfo, instanceinfo, disks_info, drbd_map, error_if):
1282 """Gives the DRBD information in a map for a node.
1283
1284 @type ninfo: L{objects.Node}
1285 @param ninfo: the node to check
1286 @param instanceinfo: the dict of instances
1287 @param disks_info: the dict of disks
1288 @param drbd_map: the DRBD map as returned by
1289 L{ganeti.config.ConfigWriter.ComputeDRBDMap}
1290 @type error_if: callable like L{_ErrorIf}
1291 @param error_if: The error reporting function
1292 @return: dict from minor number to (disk_uuid, instance_uuid, active)
1293
1294 """
1295 node_drbd = {}
1296 for minor, disk_uuid in drbd_map[ninfo.uuid].items():
1297 test = disk_uuid not in disks_info
1298 error_if(test, constants.CV_ECLUSTERCFG, None,
1299 "ghost disk '%s' in temporary DRBD map", disk_uuid)
1300 # ghost disk should not be active, but otherwise we
1301 # don't give double warnings (both ghost disk and
1302 # unallocated minor in use)
1303 if test:
1304 node_drbd[minor] = (disk_uuid, None, False)
1305 else:
1306 disk_active = False
1307 disk_instance = None
1308 for (inst_uuid, inst) in instanceinfo.items():
1309 if disk_uuid in inst.disks:
1310 disk_active = inst.disks_active
1311 disk_instance = inst_uuid
1312 break
1313 node_drbd[minor] = (disk_uuid, disk_instance, disk_active)
1314 return node_drbd
1315
1316 def _VerifyNodeDrbd(self, ninfo, nresult, instanceinfo, disks_info,
1317 drbd_helper, drbd_map):
1318 """Verifies and the node DRBD status.
1319
1320 @type ninfo: L{objects.Node}
1321 @param ninfo: the node to check
1322 @param nresult: the remote results for the node
1323 @param instanceinfo: the dict of instances
1324 @param disks_info: the dict of disks
1325 @param drbd_helper: the configured DRBD usermode helper
1326 @param drbd_map: the DRBD map as returned by
1327 L{ganeti.config.ConfigWriter.ComputeDRBDMap}
1328
1329 """
1330 self._VerifyNodeDrbdHelper(ninfo, nresult, drbd_helper)
1331
1332 # compute the DRBD minors
1333 node_drbd = self._ComputeDrbdMinors(ninfo, instanceinfo, disks_info,
1334 drbd_map, self._ErrorIf)
1335
1336 # and now check them
1337 used_minors = nresult.get(constants.NV_DRBDLIST, [])
1338 test = not isinstance(used_minors, (tuple, list))
1339 self._ErrorIf(test, constants.CV_ENODEDRBD, ninfo.name,
1340 "cannot parse drbd status file: %s", str(used_minors))
1341 if test:
1342 # we cannot check drbd status
1343 return
1344
1345 for minor, (disk_uuid, inst_uuid, must_exist) in node_drbd.items():
1346 test = minor not in used_minors and must_exist
1347 if inst_uuid is not None:
1348 attached = "(attached in instance '%s')" % \
1349 self.cfg.GetInstanceName(inst_uuid)
1350 else:
1351 attached = "(detached)"
1352 self._ErrorIf(test, constants.CV_ENODEDRBD, ninfo.name,
1353 "drbd minor %d of disk %s %s is not active",
1354 minor, disk_uuid, attached)
1355 for minor in used_minors:
1356 test = minor not in node_drbd
1357 self._ErrorIf(test, constants.CV_ENODEDRBD, ninfo.name,
1358 "unallocated drbd minor %d is in use", minor)
1359
1360 def _UpdateNodeOS(self, ninfo, nresult, nimg):
1361 """Builds the node OS structures.
1362
1363 @type ninfo: L{objects.Node}
1364 @param ninfo: the node to check
1365 @param nresult: the remote results for the node
1366 @param nimg: the node image object
1367
1368 """
1369 remote_os = nresult.get(constants.NV_OSLIST, None)
1370 test = (not isinstance(remote_os, list) or
1371 not compat.all(isinstance(v, list) and len(v) == 8
1372 for v in remote_os))
1373
1374 self._ErrorIf(test, constants.CV_ENODEOS, ninfo.name,
1375 "node hasn't returned valid OS data")
1376
1377 nimg.os_fail = test
1378
1379 if test:
1380 return
1381
1382 os_dict = {}
1383
1384 for (name, os_path, status, diagnose,
1385 variants, parameters, api_ver,
1386 trusted) in nresult[constants.NV_OSLIST]:
1387
1388 if name not in os_dict:
1389 os_dict[name] = []
1390
1391 # parameters is a list of lists instead of list of tuples due to
1392 # JSON lacking a real tuple type, fix it:
1393 parameters = [tuple(v) for v in parameters]
1394 os_dict[name].append((os_path, status, diagnose,
1395 set(variants), set(parameters), set(api_ver),
1396 trusted))
1397
1398 nimg.oslist = os_dict
1399
1400 def _VerifyNodeOS(self, ninfo, nimg, base):
1401 """Verifies the node OS list.
1402
1403 @type ninfo: L{objects.Node}
1404 @param ninfo: the node to check
1405 @param nimg: the node image object
1406 @param base: the 'template' node we match against (e.g. from the master)
1407
1408 """
1409 assert not nimg.os_fail, "Entered _VerifyNodeOS with failed OS rpc?"
1410
1411 beautify_params = lambda l: ["%s: %s" % (k, v) for (k, v) in l]
1412 for os_name, os_data in nimg.oslist.items():
1413 assert os_data, "Empty OS status for OS %s?!" % os_name
1414 f_path, f_status, f_diag, f_var, f_param, f_api, f_trusted = os_data[0]
1415 self._ErrorIf(not f_status, constants.CV_ENODEOS, ninfo.name,
1416 "Invalid OS %s (located at %s): %s",
1417 os_name, f_path, f_diag)
1418 self._ErrorIf(len(os_data) > 1, constants.CV_ENODEOS, ninfo.name,
1419 "OS '%s' has multiple entries"
1420 " (first one shadows the rest): %s",
1421 os_name, utils.CommaJoin([v[0] for v in os_data]))
1422 # comparisons with the 'base' image
1423 test = os_name not in base.oslist
1424 self._ErrorIf(test, constants.CV_ENODEOS, ninfo.name,
1425 "Extra OS %s not present on reference node (%s)",
1426 os_name, self.cfg.GetNodeName(base.uuid))
1427 if test:
1428 continue
1429 assert base.oslist[os_name], "Base node has empty OS status?"
1430 _, b_status, _, b_var, b_param, b_api, b_trusted = base.oslist[os_name][0]
1431 if not b_status:
1432 # base OS is invalid, skipping
1433 continue
1434 for kind, a, b in [("API version", f_api, b_api),
1435 ("variants list", f_var, b_var),
1436 ("parameters", beautify_params(f_param),
1437 beautify_params(b_param))]:
1438 self._ErrorIf(a != b, constants.CV_ENODEOS, ninfo.name,
1439 "OS %s for %s differs from reference node %s:"
1440 " [%s] vs. [%s]", kind, os_name,
1441 self.cfg.GetNodeName(base.uuid),
1442 utils.CommaJoin(sorted(a)), utils.CommaJoin(sorted(b)))
1443 for kind, a, b in [("trusted", f_trusted, b_trusted)]:
1444 self._ErrorIf(a != b, constants.CV_ENODEOS, ninfo.name,
1445 "OS %s for %s differs from reference node %s:"
1446 " %s vs. %s", kind, os_name,
1447 self.cfg.GetNodeName(base.uuid), a, b)
1448
1449 # check any missing OSes
1450 missing = set(base.oslist.keys()).difference(nimg.oslist.keys())
1451 self._ErrorIf(missing, constants.CV_ENODEOS, ninfo.name,
1452 "OSes present on reference node %s"
1453 " but missing on this node: %s",
1454 self.cfg.GetNodeName(base.uuid), utils.CommaJoin(missing))
1455
1456 def _VerifyAcceptedFileStoragePaths(self, ninfo, nresult, is_master):
1457 """Verifies paths in L{pathutils.FILE_STORAGE_PATHS_FILE}.
1458
1459 @type ninfo: L{objects.Node}
1460 @param ninfo: the node to check
1461 @param nresult: the remote results for the node
1462 @type is_master: bool
1463 @param is_master: Whether node is the master node
1464
1465 """
1466 cluster = self.cfg.GetClusterInfo()
1467 if (is_master and
1468 (cluster.IsFileStorageEnabled() or
1469 cluster.IsSharedFileStorageEnabled())):
1470 try:
1471 fspaths = nresult[constants.NV_ACCEPTED_STORAGE_PATHS]
1472 except KeyError:
1473 # This should never happen
1474 self._ErrorIf(True, constants.CV_ENODEFILESTORAGEPATHS, ninfo.name,
1475 "Node did not return forbidden file storage paths")
1476 else:
1477 self._ErrorIf(fspaths, constants.CV_ENODEFILESTORAGEPATHS, ninfo.name,
1478 "Found forbidden file storage paths: %s",
1479 utils.CommaJoin(fspaths))
1480 else:
1481 self._ErrorIf(constants.NV_ACCEPTED_STORAGE_PATHS in nresult,
1482 constants.CV_ENODEFILESTORAGEPATHS, ninfo.name,
1483 "Node should not have returned forbidden file storage"
1484 " paths")
1485
1486 def _VerifyStoragePaths(self, ninfo, nresult, file_disk_template,
1487 verify_key, error_key):
1488 """Verifies (file) storage paths.
1489
1490 @type ninfo: L{objects.Node}
1491 @param ninfo: the node to check
1492 @param nresult: the remote results for the node
1493 @type file_disk_template: string
1494 @param file_disk_template: file-based disk template, whose directory
1495 is supposed to be verified
1496 @type verify_key: string
1497 @param verify_key: key for the verification map of this file
1498 verification step
1499 @param error_key: error key to be added to the verification results
1500 in case something goes wrong in this verification step
1501
1502 """
1503 assert (file_disk_template in utils.storage.GetDiskTemplatesOfStorageTypes(
1504 constants.ST_FILE, constants.ST_SHARED_FILE, constants.ST_GLUSTER
1505 ))
1506
1507 cluster = self.cfg.GetClusterInfo()
1508 if cluster.IsDiskTemplateEnabled(file_disk_template):
1509 self._ErrorIf(
1510 verify_key in nresult,
1511 error_key, ninfo.name,
1512 "The configured %s storage path is unusable: %s" %
1513 (file_disk_template, nresult.get(verify_key)))
1514
1515 def _VerifyFileStoragePaths(self, ninfo, nresult):
1516 """Verifies (file) storage paths.
1517
1518 @see: C{_VerifyStoragePaths}
1519
1520 """
1521 self._VerifyStoragePaths(
1522 ninfo, nresult, constants.DT_FILE,
1523 constants.NV_FILE_STORAGE_PATH,
1524 constants.CV_ENODEFILESTORAGEPATHUNUSABLE)
1525
1526 def _VerifySharedFileStoragePaths(self, ninfo, nresult):
1527 """Verifies (file) storage paths.
1528
1529 @see: C{_VerifyStoragePaths}
1530
1531 """
1532 self._VerifyStoragePaths(
1533 ninfo, nresult, constants.DT_SHARED_FILE,
1534 constants.NV_SHARED_FILE_STORAGE_PATH,
1535 constants.CV_ENODESHAREDFILESTORAGEPATHUNUSABLE)
1536
1537 def _VerifyGlusterStoragePaths(self, ninfo, nresult):
1538 """Verifies (file) storage paths.
1539
1540 @see: C{_VerifyStoragePaths}
1541
1542 """
1543 self._VerifyStoragePaths(
1544 ninfo, nresult, constants.DT_GLUSTER,
1545 constants.NV_GLUSTER_STORAGE_PATH,
1546 constants.CV_ENODEGLUSTERSTORAGEPATHUNUSABLE)
1547
1548 def _VerifyOob(self, ninfo, nresult):
1549 """Verifies out of band functionality of a node.
1550
1551 @type ninfo: L{objects.Node}
1552 @param ninfo: the node to check
1553 @param nresult: the remote results for the node
1554
1555 """
1556 # We just have to verify the paths on master and/or master candidates
1557 # as the oob helper is invoked on the master
1558 if ((ninfo.master_candidate or ninfo.master_capable) and
1559 constants.NV_OOB_PATHS in nresult):
1560 for path_result in nresult[constants.NV_OOB_PATHS]:
1561 self._ErrorIf(path_result, constants.CV_ENODEOOBPATH,
1562 ninfo.name, path_result)
1563
1564 def _UpdateNodeVolumes(self, ninfo, nresult, nimg, vg_name):
1565 """Verifies and updates the node volume data.
1566
1567 This function will update a L{NodeImage}'s internal structures
1568 with data from the remote call.
1569
1570 @type ninfo: L{objects.Node}
1571 @param ninfo: the node to check
1572 @param nresult: the remote results for the node
1573 @param nimg: the node image object
1574 @param vg_name: the configured VG name
1575
1576 """
1577 nimg.lvm_fail = True
1578 lvdata = nresult.get(constants.NV_LVLIST, "Missing LV data")
1579 if vg_name is None:
1580 pass
1581 elif isinstance(lvdata, basestring):
1582 self._ErrorIf(True, constants.CV_ENODELVM, ninfo.name,
1583 "LVM problem on node: %s", utils.SafeEncode(lvdata))
1584 elif not isinstance(lvdata, dict):
1585 self._ErrorIf(True, constants.CV_ENODELVM, ninfo.name,
1586 "rpc call to node failed (lvlist)")
1587 else:
1588 nimg.volumes = lvdata
1589 nimg.lvm_fail = False
1590
1591 def _UpdateNodeInstances(self, ninfo, nresult, nimg):
1592 """Verifies and updates the node instance list.
1593
1594 If the listing was successful, then updates this node's instance
1595 list. Otherwise, it marks the RPC call as failed for the instance
1596 list key.
1597
1598 @type ninfo: L{objects.Node}
1599 @param ninfo: the node to check
1600 @param nresult: the remote results for the node
1601 @param nimg: the node image object
1602
1603 """
1604 idata = nresult.get(constants.NV_INSTANCELIST, None)
1605 test = not isinstance(idata, list)
1606 self._ErrorIf(test, constants.CV_ENODEHV, ninfo.name,
1607 "rpc call to node failed (instancelist): %s",
1608 utils.SafeEncode(str(idata)))
1609 if test:
1610 nimg.hyp_fail = True
1611 else:
1612 nimg.instances = [uuid for (uuid, _) in
1613 self.cfg.GetMultiInstanceInfoByName(idata)]
1614
1615 def _UpdateNodeInfo(self, ninfo, nresult, nimg, vg_name):
1616 """Verifies and computes a node information map
1617
1618 @type ninfo: L{objects.Node}
1619 @param ninfo: the node to check
1620 @param nresult: the remote results for the node
1621 @param nimg: the node image object
1622 @param vg_name: the configured VG name
1623
1624 """
1625 # try to read free memory (from the hypervisor)
1626 hv_info = nresult.get(constants.NV_HVINFO, None)
1627 test = not isinstance(hv_info, dict) or "memory_free" not in hv_info \
1628 or "memory_total" not in hv_info \
1629 or "memory_dom0" not in hv_info
1630 self._ErrorIf(test, constants.CV_ENODEHV, ninfo.name,
1631 "rpc call to node failed (hvinfo)")
1632 if not test:
1633 try:
1634 nimg.mfree = int(hv_info["memory_free"])
1635 nimg.mtotal = int(hv_info["memory_total"])
1636 nimg.mdom0 = int(hv_info["memory_dom0"])
1637 except (ValueError, TypeError):
1638 self._ErrorIf(True, constants.CV_ENODERPC, ninfo.name,
1639 "node returned invalid nodeinfo, check hypervisor")
1640
1641 # FIXME: devise a free space model for file based instances as well
1642 if vg_name is not None:
1643 test = (constants.NV_VGLIST not in nresult or
1644 vg_name not in nresult[constants.NV_VGLIST])
1645 self._ErrorIf(test, constants.CV_ENODELVM, ninfo.name,
1646 "node didn't return data for the volume group '%s'"
1647 " - it is either missing or broken", vg_name)
1648 if not test:
1649 try:
1650 nimg.dfree = int(nresult[constants.NV_VGLIST][vg_name])
1651 except (ValueError, TypeError):
1652 self._ErrorIf(True, constants.CV_ENODERPC, ninfo.name,
1653 "node returned invalid LVM info, check LVM status")
1654
  def _CollectDiskInfo(self, node_uuids, node_image, instanceinfo):
    """Gets per-disk status information for all instances.

    @type node_uuids: list of strings
    @param node_uuids: Node UUIDs
    @type node_image: dict of (UUID, L{objects.Node})
    @param node_image: Node objects
    @type instanceinfo: dict of (UUID, L{objects.Instance})
    @param instanceinfo: Instance objects
    @rtype: {instance: {node: [(success, payload)]}}
    @return: a dictionary of per-instance dictionaries with nodes as
      keys and disk information as values; the disk information is a
      list of tuples (success, payload)

    """
    # per node: list of (instance_uuid, disk) pairs to query
    node_disks = {}
    # per node: the same disks, annotated and paired with their instance,
    # in the shape the RPC call expects
    node_disks_dev_inst_only = {}
    diskless_instances = set()
    nodisk_instances = set()

    for nuuid in node_uuids:
      # pinst/sinst are the instances having this node as primary resp.
      # secondary -- per their use elsewhere in this module
      node_inst_uuids = list(itertools.chain(node_image[nuuid].pinst,
                                             node_image[nuuid].sinst))
      diskless_instances.update(uuid for uuid in node_inst_uuids
                                if not instanceinfo[uuid].disks)
      disks = [(inst_uuid, disk)
               for inst_uuid in node_inst_uuids
               for disk in self.cfg.GetInstanceDisks(inst_uuid)]

      if not disks:
        nodisk_instances.update(uuid for uuid in node_inst_uuids
                                if instanceinfo[uuid].disks)
        # No need to collect data
        continue

      node_disks[nuuid] = disks

      # _AnnotateDiskParams makes already copies of the disks
      dev_inst_only = []
      for (inst_uuid, dev) in disks:
        (anno_disk,) = AnnotateDiskParams(instanceinfo[inst_uuid], [dev],
                                          self.cfg)
        dev_inst_only.append((anno_disk, instanceinfo[inst_uuid]))

      node_disks_dev_inst_only[nuuid] = dev_inst_only

    assert len(node_disks) == len(node_disks_dev_inst_only)

    # Collect data from all nodes with disks
    result = self.rpc.call_blockdev_getmirrorstatus_multi(
      node_disks.keys(), node_disks_dev_inst_only)

    assert len(result) == len(node_disks)

    instdisk = {}

    for (nuuid, nres) in result.items():
      node = self.cfg.GetNodeInfo(nuuid)
      disks = node_disks[node.uuid]

      if nres.offline:
        # No data from this node
        data = len(disks) * [(False, "node offline")]
      else:
        msg = nres.fail_msg
        self._ErrorIf(msg, constants.CV_ENODERPC, node.name,
                      "while getting disk information: %s", msg)
        if msg:
          # No data from this node
          data = len(disks) * [(False, msg)]
        else:
          data = []
          # sanitize each payload entry: only well-formed 2-element
          # sequences are accepted, everything else is logged and replaced
          for idx, i in enumerate(nres.payload):
            if isinstance(i, (tuple, list)) and len(i) == 2:
              data.append(i)
            else:
              logging.warning("Invalid result from node %s, entry %d: %s",
                              node.name, idx, i)
              data.append((False, "Invalid result from the remote node"))

      # payload entries come back in the same order as the queried disks
      for ((inst_uuid, _), status) in zip(disks, data):
        instdisk.setdefault(inst_uuid, {}).setdefault(node.uuid, []) \
          .append(status)

    # Add empty entries for diskless instances.
    for inst_uuid in diskless_instances:
      assert inst_uuid not in instdisk
      instdisk[inst_uuid] = {}
    # ...and disk-full instances that happen to have no disks
    for inst_uuid in nodisk_instances:
      assert inst_uuid not in instdisk
      instdisk[inst_uuid] = {}

    assert compat.all(len(statuses) == len(instanceinfo[inst].disks) and
                      len(nuuids) <= len(
                        self.cfg.GetInstanceNodes(instanceinfo[inst].uuid)) and
                      compat.all(isinstance(s, (tuple, list)) and
                                 len(s) == 2 for s in statuses)
                      for inst, nuuids in instdisk.items()
                      for nuuid, statuses in nuuids.items())
    if __debug__:
      instdisk_keys = set(instdisk)
      instanceinfo_keys = set(instanceinfo)
      assert instdisk_keys == instanceinfo_keys, \
        ("instdisk keys (%s) do not match instanceinfo keys (%s)" %
         (instdisk_keys, instanceinfo_keys))

    return instdisk
1763
1764 @staticmethod
1765 def _SshNodeSelector(group_uuid, all_nodes):
1766 """Create endless iterators for all potential SSH check hosts.
1767
1768 """
1769 nodes = [node for node in all_nodes
1770 if (node.group != group_uuid and
1771 not node.offline)]
1772 keyfunc = operator.attrgetter("group")
1773
1774 return map(itertools.cycle,
1775 [sorted(n.name for n in names)
1776 for _, names in itertools.groupby(sorted(nodes, key=keyfunc),
1777 keyfunc)])
1778
1779 @classmethod
1780 def _SelectSshCheckNodes(cls, group_nodes, group_uuid, all_nodes):
1781 """Choose which nodes should talk to which other nodes.
1782
1783 We will make nodes contact all nodes in their group, and one node from
1784 every other group.
1785
1786 @rtype: tuple of (string, dict of strings to list of strings, string)
1787 @return: a tuple containing the list of all online nodes, a dictionary
1788 mapping node names to additional nodes of other node groups to which
1789 connectivity should be tested, and a list of all online master
1790 candidates
1791
1792 @warning: This algorithm has a known issue if one node group is much
1793 smaller than others (e.g. just one node). In such a case all other
1794 nodes will talk to the single node.
1795
1796 """
1797 online_nodes = sorted(node.name for node in group_nodes if not node.offline)
1798 online_mcs = sorted(node.name for node in group_nodes
1799 if (node.master_candidate and not node.offline))
1800 sel = cls._SshNodeSelector(group_uuid, all_nodes)
1801
1802 return (online_nodes,
1803 dict((name, sorted([i.next() for i in sel]))
1804 for name in online_nodes),
1805 online_mcs)
1806
1807 def _PrepareSshSetupCheck(self):
1808 """Prepare the input data for the SSH setup verification.
1809
1810 """
1811 all_nodes_info = self.cfg.GetAllNodesInfo()
1812 potential_master_candidates = self.cfg.GetPotentialMasterCandidates()
1813 node_status = [
1814 (uuid, node_info.name, node_info.master_candidate,
1815 node_info.name in potential_master_candidates, not node_info.offline)
1816 for (uuid, node_info) in all_nodes_info.items()]
1817 return node_status
1818
1819 def BuildHooksEnv(self):
1820 """Build hooks env.
1821
1822 Cluster-Verify hooks just ran in the post phase and their failure makes
1823 the output be logged in the verify output and the verification to fail.
1824
1825 """
1826 env = {
1827 "CLUSTER_TAGS": " ".join(self.cfg.GetClusterInfo().GetTags()),
1828 }
1829
1830 env.update(("NODE_TAGS_%s" % node.name, " ".join(node.GetTags()))
1831 for node in self.my_node_info.values())
1832
1833 return env
1834
1835 def BuildHooksNodes(self):
1836 """Build hooks nodes.
1837
1838 """
1839 return ([], list(self.my_node_info.keys()))
1840
1841 @staticmethod
1842 def _VerifyOtherNotes(feedback_fn, i_non_redundant, i_non_a_balanced,
1843 i_offline, n_offline, n_drained):
1844 feedback_fn("* Other Notes")
1845 if i_non_redundant:
1846 feedback_fn(" - NOTICE: %d non-redundant instance(s) found."
1847 % len(i_non_redundant))
1848
1849 if i_non_a_balanced:
1850 feedback_fn(" - NOTICE: %d non-auto-balanced instance(s) found."
1851 % len(i_non_a_balanced))
1852
1853 if i_offline:
1854 feedback_fn(" - NOTICE: %d offline instance(s) found." % i_offline)
1855
1856 if n_offline:
1857 feedback_fn(" - NOTICE: %d offline node(s) found." % n_offline)
1858
1859 if n_drained:
1860 feedback_fn(" - NOTICE: %d drained node(s) found." % n_drained)
1861
1862 def _VerifyExclusionTags(self, nodename, pinst, ctags):
1863 """Verify that all instances have different exclusion tags.
1864
1865 @type nodename: string
1866 @param nodename: the name of the node for which the check is done
1867 @type pinst: list of string
1868 @param pinst: list of UUIDs of those instances having the given node
1869 as primary node
1870 @type ctags: list of string
1871 @param ctags: tags of the cluster
1872
1873 """
1874 exclusion_prefixes = utils.GetExclusionPrefixes(ctags)
1875 tags_seen = set([])
1876 conflicting_tags = set([])
1877 for iuuid in pinst:
1878 allitags = self.my_inst_info[iuuid].tags
1879 if allitags is None:
1880 allitags = []
1881 itags = set([tag for tag in allitags
1882 if utils.IsGoodTag(exclusion_prefixes, tag)])
1883 conflicts = itags.intersection(tags_seen)
1884 if len(conflicts) > 0:
1885 conflicting_tags = conflicting_tags.union(conflicts)
1886 tags_seen = tags_seen.union(itags)
1887
1888 self._ErrorIf(len(conflicting_tags) > 0, constants.CV_EEXTAGS, nodename,
1889 "Tags where there is more than one instance: %s",
1890 list(conflicting_tags), code=constants.CV_WARNING)
1891
1892 def Exec(self, feedback_fn): # pylint: disable=R0915
1893 """Verify integrity of the node group, performing various test on nodes.
1894
1895 """
1896 # This method has too many local variables. pylint: disable=R0914
1897 feedback_fn("* Verifying group '%s'" % self.group_info.name)
1898
1899 if not self.my_node_uuids:
1900 # empty node group
1901 feedback_fn("* Empty node group, skipping verification")
1902 return True
1903
1904 self.bad = False
1905 verbose = self.op.verbose
1906 self._feedback_fn = feedback_fn
1907
1908 vg_name = self.cfg.GetVGName()
1909 drbd_helper = self.cfg.GetDRBDHelper()
1910 cluster = self.cfg.GetClusterInfo()
1911 hypervisors = cluster.enabled_hypervisors
1912 node_data_list = self.my_node_info.values()
1913
1914 i_non_redundant = [] # Non redundant instances
1915 i_non_a_balanced = [] # Non auto-balanced instances
1916 i_offline = 0 # Count of offline instances
1917 n_offline = 0 # Count of offline nodes
1918 n_drained = 0 # Count of nodes being drained
1919 node_vol_should = {}
1920
1921 # FIXME: verify OS list
1922
1923 # File verification
1924 filemap = ComputeAncillaryFiles(cluster, False)
1925
1926 # do local checksums
1927 master_node_uuid = self.master_node = self.cfg.GetMasterNode()
1928 master_ip = self.cfg.GetMasterIP()
1929
1930 online_master_candidates = sorted(
1931 node.name for node in node_data_list
1932 if (node.master_candidate and not node.offline))
1933
1934 feedback_fn("* Gathering data (%d nodes)" % len(self.my_node_uuids))
1935
1936 user_scripts = []
1937 if self.cfg.GetUseExternalMipScript():
1938 user_scripts.append(pathutils.EXTERNAL_MASTER_SETUP_SCRIPT)
1939
1940 online_nodes = [(node.name, node.primary_ip, node.secondary_ip)
1941 for node in node_data_list if not node.offline]
1942 node_nettest_params = (online_nodes, online_master_candidates)
1943
1944 node_verify_param = {
1945 constants.NV_FILELIST:
1946 [vcluster.MakeVirtualPath(f)
1947 for f in utils.UniqueSequence(filename
1948 for files in filemap
1949 for filename in files)],
1950 constants.NV_NODELIST:
1951 self._SelectSshCheckNodes(node_data_list, self.group_uuid,
1952 self.all_node_info.values()),
1953 constants.NV_HYPERVISOR: hypervisors,
1954 constants.NV_HVPARAMS:
1955 _GetAllHypervisorParameters(cluster, self.all_inst_info.values()),
1956 constants.NV_NODENETTEST: node_nettest_params,
1957 constants.NV_INSTANCELIST: hypervisors,
1958 constants.NV_VERSION: None,
1959 constants.NV_HVINFO: self.cfg.GetHypervisorType(),
1960 constants.NV_NODESETUP: None,
1961 constants.NV_TIME: None,
1962 constants.NV_MASTERIP: (self.cfg.GetMasterNodeName(), master_ip,
1963 online_master_candidates),
1964 constants.NV_OSLIST: None,
1965 constants.NV_NONVMNODES: self.cfg.GetNonVmCapableNodeNameList(),
1966 constants.NV_USERSCRIPTS: user_scripts,
1967 constants.NV_CLIENT_CERT: None,
1968 }
1969
1970 if self.cfg.GetClusterInfo().modify_ssh_setup:
1971 node_verify_param[constants.NV_SSH_SETUP] = \
1972 (self._PrepareSshSetupCheck(), self.cfg.GetClusterInfo().ssh_key_type)
1973 if self.op.verify_clutter:
1974 node_verify_param[constants.NV_SSH_CLUTTER] = True
1975
1976 if vg_name is not None:
1977 node_verify_param[constants.NV_VGLIST] = None
1978 node_verify_param[constants.NV_LVLIST] = vg_name
1979 node_verify_param[constants.NV_PVLIST] = [vg_name]
1980
1981 if cluster.IsDiskTemplateEnabled(constants.DT_DRBD8):
1982 if drbd_helper:
1983 node_verify_param[constants.NV_DRBDVERSION] = None
1984 node_verify_param[constants.NV_DRBDLIST] = None
1985 node_verify_param[constants.NV_DRBDHELPER] = drbd_helper
1986
1987 if cluster.IsFileStorageEnabled() or \
1988 cluster.IsSharedFileStorageEnabled():
1989 # Load file storage paths only from master node
1990 node_verify_param[constants.NV_ACCEPTED_STORAGE_PATHS] = \
1991 self.cfg.GetMasterNodeName()
1992 if cluster.IsFileStorageEnabled():
1993 node_verify_param[constants.NV_FILE_STORAGE_PATH] = \
1994 cluster.file_storage_dir
1995 if cluster.IsSharedFileStorageEnabled():
1996 node_verify_param[constants.NV_SHARED_FILE_STORAGE_PATH] = \
1997 cluster.shared_file_storage_dir
1998
1999 # bridge checks
2000 # FIXME: this needs to be changed per node-group, not cluster-wide
2001 bridges = set()
2002 default_nicpp = cluster.nicparams[constants.PP_DEFAULT]
2003 if default_nicpp[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
2004 bridges.add(default_nicpp[constants.NIC_LINK])
2005 for inst_uuid in self.my_inst_info.values():
2006 for nic in inst_uuid.nics:
2007 full_nic = cluster.SimpleFillNIC(nic.nicparams)
2008 if full_nic[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
2009 bridges.add(full_nic[constants.NIC_LINK])
2010
2011 if bridges:
2012 node_verify_param[constants.NV_BRIDGES] = list(bridges)
2013
2014 # Build our expected cluster state
2015 node_image = dict((node.uuid, self.NodeImage(offline=node.offline,
2016 uuid=node.uuid,
2017 vm_capable=node.vm_capable))
2018 for node in node_data_list)
2019
2020 # Gather OOB paths
2021 oob_paths = []
2022 for node in self.all_node_info.values():
2023 path = SupportsOob(self.cfg, node)
2024 if path and path not in oob_paths:
2025 oob_paths.append(path)
2026
2027 if oob_paths:
2028 node_verify_param[constants.NV_OOB_PATHS] = oob_paths
2029
2030 for inst_uuid in self.my_inst_uuids:
2031 instance = self.my_inst_info[inst_uuid]
2032 if instance.admin_state == constants.ADMINST_OFFLINE:
2033 i_offline += 1
2034
2035 inst_nodes = self.cfg.GetInstanceNodes(instance.uuid)
2036 for nuuid in inst_nodes:
2037 if nuuid not in node_image:
2038 gnode = self.NodeImage(uuid=nuuid)
2039 gnode.ghost = (nuuid not in self.all_node_info)
2040 node_image[nuuid] = gnode
2041
2042 self.cfg.GetInstanceLVsByNode(instance.uuid, lvmap=node_vol_should)
2043
2044 pnode = instance.primary_node
2045 node_image[pnode].pinst.append(instance.uuid)
2046
2047 for snode in self.cfg.GetInstanceSecondaryNodes(instance.uuid):
2048 nimg = node_image[snode]
2049 nimg.sinst.append(instance.uuid)
2050 if pnode not in nimg.sbp:
2051 nimg.sbp[pnode] = []
2052 nimg.sbp[pnode].append(instance.uuid)
2053
2054 es_flags = rpc.GetExclusiveStorageForNodes(self.cfg,
2055 self.my_node_info.keys())
2056 # The value of exclusive_storage should be the same across the group, so if
2057 # it's True for at least a node, we act as if it were set for all the nodes
2058 self._exclusive_storage = compat.any(es_flags.values())
2059 if self._exclusive_storage:
2060 node_verify_param[constants.NV_EXCLUSIVEPVS] = True
2061
2062 # At this point, we have the in-memory data structures complete,
2063 # except for the runtime information, which we'll gather next
2064
2065 # NOTE: Here we lock the configuration for the duration of RPC calls,
2066 # which means that the cluster configuration changes are blocked during
2067 # this period.
2068 # This is something that should be done only exceptionally and only for
2069 # justified cases!
2070 # In this case, we need the lock as we can only verify the integrity of
2071 # configuration files on MCs only if we know nobody else is modifying it.
2072 # FIXME: The check for integrity of config.data should be moved to
2073 # WConfD, which is the only one who can otherwise ensure nobody
2074 # will modify the configuration during the check.
2075 with self.cfg.GetConfigManager(shared=True, forcelock=True):
2076 feedback_fn("* Gathering information about nodes (%s nodes)" %
2077 len(self.my_node_uuids))
2078 # Force the configuration to be fully distributed before doing any tests
2079 self.cfg.FlushConfigGroup(self.group_uuid)
2080 # Due to the way our RPC system works, exact response times cannot be
2081 # guaranteed (e.g. a broken node could run into a timeout). By keeping
2082 # the time before and after executing the request, we can at least have
2083 # a time window.
2084 nvinfo_starttime = time.time()
2085 # Get lock on the configuration so that nobody modifies it concurrently.
2086 # Otherwise it can be modified by other jobs, failing the consistency
2087 # test.
2088 # NOTE: This is an exceptional situation, we should otherwise avoid
2089 # locking the configuration for something but very fast, pure operations.
2090 cluster_name = self.cfg.GetClusterName()
2091 hvparams = self.cfg.GetClusterInfo().hvparams
2092
2093 all_nvinfo = self.rpc.call_node_verify(self.my_node_uuids,
2094 node_verify_param,
2095 cluster_name,
2096 hvparams)
2097 nvinfo_endtime = time.time()
2098
2099 if self.extra_lv_nodes and vg_name is not None:
2100 feedback_fn("* Gathering information about extra nodes (%s nodes)" %
2101 len(self.extra_lv_nodes))
2102 extra_lv_nvinfo = \
2103 self.rpc.call_node_verify(self.extra_lv_nodes,
2104 {constants.NV_LVLIST: vg_name},
2105 self.cfg.GetClusterName(),
2106 self.cfg.GetClusterInfo().hvparams)
2107 else:
2108 extra_lv_nvinfo = {}
2109
2110 # If not all nodes are being checked, we need to make sure the master
2111 # node and a non-checked vm_capable node are in the list.
2112 absent_node_uuids = set(self.all_node_info).difference(self.my_node_info)
2113 if absent_node_uuids:
2114 vf_nvinfo = all_nvinfo.copy()
2115 vf_node_info = list(self.my_node_info.values())
2116 additional_node_uuids = []
2117 if master_node_uuid not in self.my_node_info:
2118 additional_node_uuids.append(master_node_uuid)
2119 vf_node_info.append(self.all_node_info[master_node_uuid])
2120 # Add the first vm_capable node we find which is not included,
2121 # excluding the master node (which we already have)
2122 for node_uuid in absent_node_uuids:
2123 nodeinfo = self.all_node_info[node_uuid]
2124 if (nodeinfo.vm_capable and not nodeinfo.offline and
2125 node_uuid != master_node_uuid):
2126 additional_node_uuids.append(node_uuid)
2127 vf_node_info.append(self.all_node_info[node_uuid])
2128 break
2129 key = constants.NV_FILELIST
2130
2131 feedback_fn("* Gathering information about the master node")
2132 vf_nvinfo.update(self.rpc.call_node_verify(
2133 additional_node_uuids, {key: node_verify_param[key]},
2134 self.cfg.GetClusterName(), self.cfg.GetClusterInfo().hvparams))
2135 else:
2136 vf_nvinfo = all_nvinfo
2137 vf_node_info = self.my_node_info.values()
2138
2139 all_drbd_map = self.cfg.ComputeDRBDMap()
2140
2141 feedback_fn("* Gathering disk information (%s nodes)" %
2142 len(self.my_node_uuids))
2143 instdisk = self._CollectDiskInfo(self.my_node_info.keys(), node_image,
2144 self.my_inst_info)
2145
2146 feedback_fn("* Verifying configuration file consistency")
2147
2148 self._VerifyClientCertificates(self.my_node_info.values(), all_nvinfo)
2149 if self.cfg.GetClusterInfo().modify_ssh_setup:
2150 self._VerifySshSetup(self.my_node_info.values(), all_nvinfo)
2151 self._VerifyFiles(vf_node_info, master_node_uuid, vf_nvinfo, filemap)
2152
2153 feedback_fn("* Verifying node status")
2154
2155 refos_img = None
2156
2157 for node_i in node_data_list:
2158 nimg = node_image[node_i.uuid]
2159
2160 if node_i.offline:
2161 if verbose:
2162 feedback_fn("* Skipping offline node %s" % (node_i.name,))
2163 n_offline += 1
2164 continue
2165
2166 if node_i.uuid == master_node_uuid:
2167 ntype = "master"
2168 elif node_i.master_candidate:
2169 ntype = "master candidate"
2170 elif node_i.drained:
2171 ntype = "drained"
2172 n_drained += 1
2173 else:
2174 ntype = "regular"
2175 if verbose:
2176 feedback_fn("* Verifying node %s (%s)" % (node_i.name, ntype))
2177
2178 msg = all_nvinfo[node_i.uuid].fail_msg
2179 self._ErrorIf(msg, constants.CV_ENODERPC, node_i.name,
2180 "while contacting node: %s", msg)
2181 if msg:
2182 nimg.rpc_fail = True
2183 continue
2184
2185 nresult = all_nvinfo[node_i.uuid].payload
2186
2187 nimg.call_ok = self._VerifyNode(node_i, nresult)
2188 self._VerifyNodeTime(node_i, nresult, nvinfo_starttime, nvinfo_endtime)
2189 self._VerifyNodeNetwork(node_i, nresult)
2190 self._VerifyNodeUserScripts(node_i, nresult)
2191 self._VerifyOob(node_i, nresult)
2192 self._VerifyAcceptedFileStoragePaths(node_i, nresult,
2193 node_i.uuid == master_node_uuid)
2194 self._VerifyFileStoragePaths(node_i, nresult)
2195 self._VerifySharedFileStoragePaths(node_i, nresult)
2196 self._VerifyGlusterStoragePaths(node_i, nresult)
2197
2198 if nimg.vm_capable:
2199 self._UpdateVerifyNodeLVM(node_i, nresult, vg_name, nimg)
2200 if constants.DT_DRBD8 in cluster.enabled_disk_templates:
2201 self._VerifyNodeDrbd(node_i, nresult, self.all_inst_info,
2202 self.all_disks_info, drbd_helper, all_drbd_map)
2203
2204 if (constants.DT_PLAIN in cluster.enabled_disk_templates) or \
2205 (constants.DT_DRBD8 in cluster.enabled_disk_templates):
2206 self._UpdateNodeVolumes(node_i, nresult, nimg, vg_name)
2207 self._UpdateNodeInstances(node_i, nresult, nimg)
2208 self._UpdateNodeInfo(node_i, nresult, nimg, vg_name)
2209 self._UpdateNodeOS(node_i, nresult, nimg)
2210
2211 if not nimg.os_fail:
2212 if refos_img is None:
2213 refos_img = nimg
2214 self._VerifyNodeOS(node_i, nimg, refos_img)
2215 self._VerifyNodeBridges(node_i, nresult, bridges)
2216
2217 # Check whether all running instances are primary for the node. (This
2218 # can no longer be done from _VerifyInstance below, since some of the
2219 # wrong instances could be from other node groups.)
2220 non_primary_inst_uuids = set(nimg.instances).difference(nimg.pinst)
2221
2222 for inst_uuid in non_primary_inst_uuids:
2223 test = inst_uuid in self.all_inst_info
2224 self._ErrorIf(test, constants.CV_EINSTANCEWRONGNODE,
2225 self.cfg.GetInstanceName(inst_uuid),
2226 "instance should not run on node %s", node_i.name)
2227 self._ErrorIf(not test, constants.CV_ENODEORPHANINSTANCE, node_i.name,
2228 "node is running unknown instance %s", inst_uuid)
2229
2230 self._VerifyExclusionTags(node_i.name, nimg.pinst, cluster.tags)
2231
2232 self._VerifyGroupDRBDVersion(all_nvinfo)
2233 self._VerifyGroupLVM(node_image, vg_name)
2234
2235 for node_uuid, result in extra_lv_nvinfo.items():
2236 self._UpdateNodeVolumes(self.all_node_info[node_uuid], result.payload,
2237 node_image[node_uuid], vg_name)
2238
2239 feedback_fn("* Verifying instance status")
2240 for inst_uuid in self.my_inst_uuids:
2241 instance = self.my_inst_info[inst_uuid]
2242 if verbose:
2243 feedback_fn("* Verifying instance %s" % instance.name)
2244 self._VerifyInstance(instance, node_image, instdisk[inst_uuid])
2245
2246 # If the instance is not fully redundant we cannot survive losing its
2247 # primary node, so we are not N+1 compliant.
2248 inst_disks = self.cfg.GetInstanceDisks(instance.uuid)
2249 if not utils.AllDiskOfType(inst_disks, constants.DTS_MIRRORED):
2250 i_non_redundant.append(instance)
2251
2252 if not cluster.FillBE(instance)[constants.BE_AUTO_BALANCE]:
2253 i_non_a_balanced.append(instance)
2254
2255 feedback_fn("* Verifying orphan volumes")
2256 reserved = utils.FieldSet(*cluster.reserved_lvs)
2257
2258 # We will get spurious "unknown volume" warnings if any node of this group
2259 # is secondary for an instance whose primary is in another group. To avoid
2260 # them, we find these instances and add their volumes to node_vol_should.
2261 for instance in self.all_inst_info.values():
2262 for secondary in self.cfg.GetInstanceSecondaryNodes(instance.uuid):
2263 if (secondary in self.my_node_info
2264 and instance.uuid not in self.my_inst_info):
2265 self.cfg.GetInstanceLVsByNode(instance.uuid, lvmap=node_vol_should)
2266 break
2267
2268 self._VerifyOrphanVolumes(vg_name, node_vol_should, node_image, reserved)
2269
2270 if constants.VERIFY_NPLUSONE_MEM not in self.op.skip_checks:
2271 feedback_fn("* Verifying N+1 Memory redundancy")
2272 self._VerifyNPlusOneMemory(node_image, self.my_inst_info)
2273
2274 self._VerifyOtherNotes(feedback_fn, i_non_redundant, i_non_a_balanced,
2275 i_offline, n_offline, n_drained)
2276
2277 return not self.bad
2278
2279 def HooksCallBack(self, phase, hooks_results, feedback_fn, lu_result):
2280 """Analyze the post-hooks' result
2281
2282 This method analyses the hook result, handles it, and sends some
2283 nicely-formatted feedback back to the user.
2284
2285 @param phase: one of L{constants.HOOKS_PHASE_POST} or
2286 L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase
2287 @param hooks_results: the results of the multi-node hooks rpc call
2288 @param feedback_fn: function used send feedback back to the caller
2289 @param lu_result: previous Exec result
2290 @return: the new Exec result, based on the previous result
2291 and hook results
2292
2293 """
2294 # We only really run POST phase hooks, only for non-empty groups,
2295 # and are only interested in their results
2296 if not self.my_node_uuids:
2297 # empty node group
2298 pass
2299 elif phase == constants.HOOKS_PHASE_POST:
2300 # Used to change hooks' output to proper indentation
2301 feedback_fn("* Hooks Results")
2302 assert hooks_results, "invalid result from hooks"
2303
2304 for node_name in hooks_results:
2305 res = hooks_results[node_name]
2306 msg = res.fail_msg
2307 test = msg and not res.offline
2308 self._ErrorIf(test, constants.CV_ENODEHOOKS, node_name,
2309 "Communication failure in hooks execution: %s", msg)
2310 if test:
2311 lu_result = False
2312 continue
2313 if res.offline:
2314 # No need to investigate payload if node is offline
2315 continue
2316 for script, hkr, output in res.payload:
2317 test = hkr == constants.HKR_FAIL
2318 self._ErrorIf(test, constants.CV_ENODEHOOKS, node_name,
2319 "Script %s failed, output:", script)
2320 if test:
2321 output = self._HOOKS_INDENT_RE.sub(" ", output)
2322 feedback_fn("%s" % output)
2323 lu_result = False
2324
2325 return lu_result