02ad841c056c12f97ad047dd070cd39c74ddacc4
[ganeti-github.git] / lib / cmdlib / cluster / verify.py
1 #
2 #
3
4 # Copyright (C) 2014 Google Inc.
5 # All rights reserved.
6 #
7 # Redistribution and use in source and binary forms, with or without
8 # modification, are permitted provided that the following conditions are
9 # met:
10 #
11 # 1. Redistributions of source code must retain the above copyright notice,
12 # this list of conditions and the following disclaimer.
13 #
14 # 2. Redistributions in binary form must reproduce the above copyright
15 # notice, this list of conditions and the following disclaimer in the
16 # documentation and/or other materials provided with the distribution.
17 #
18 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
19 # IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
20 # TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
21 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
22 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
25 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
26 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
27 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30 """Logical units for cluster verification."""
31
32 import itertools
33 import logging
34 import operator
35 import re
36 import time
37 import ganeti.masterd.instance
38 import ganeti.rpc.node as rpc
39
40 from ganeti import compat
41 from ganeti import constants
42 from ganeti import errors
43 from ganeti import locking
44 from ganeti import pathutils
45 from ganeti import utils
46 from ganeti import vcluster
47 from ganeti import hypervisor
48 from ganeti import opcodes
49
50 from ganeti.cmdlib.base import LogicalUnit, NoHooksLU, ResultWithJobs
51 from ganeti.cmdlib.common import ShareAll, ComputeAncillaryFiles, \
52 CheckNodePVs, ComputeIPolicyInstanceViolation, AnnotateDiskParams, \
53 SupportsOob
54
55
56 def _GetAllHypervisorParameters(cluster, instances):
57 """Compute the set of all hypervisor parameters.
58
59 @type cluster: L{objects.Cluster}
60 @param cluster: the cluster object
61 @param instances: list of L{objects.Instance}
62 @param instances: additional instances from which to obtain parameters
63 @rtype: list of (origin, hypervisor, parameters)
64 @return: a list with all parameters found, indicating the hypervisor they
65 apply to, and the origin (can be "cluster", "os X", or "instance Y")
66
67 """
68 hvp_data = []
69
70 for hv_name in cluster.enabled_hypervisors:
71 hvp_data.append(("cluster", hv_name, cluster.GetHVDefaults(hv_name)))
72
73 for os_name, os_hvp in cluster.os_hvp.items():
74 for hv_name, hv_params in os_hvp.items():
75 if hv_params:
76 full_params = cluster.GetHVDefaults(hv_name, os_name=os_name)
77 hvp_data.append(("os %s" % os_name, hv_name, full_params))
78
79 # TODO: collapse identical parameter values in a single one
80 for instance in instances:
81 if instance.hvparams:
82 hvp_data.append(("instance %s" % instance.name, instance.hypervisor,
83 cluster.FillHV(instance)))
84
85 return hvp_data
86
87
class _VerifyErrors(object):
  """Mix-in for cluster/group verify LUs.

  It provides _Error and _ErrorIf, and updates the self.bad boolean. (Expects
  self.op and self._feedback_fn to be available.)

  """

  ETYPE_ERROR = constants.CV_ERROR
  ETYPE_WARNING = constants.CV_WARNING

  def _ErrorMsgList(self, error_descriptor, object_name, message_list,
                    log_type=ETYPE_ERROR):
    """Format multiple error messages.

    Based on the opcode's error_codes parameter, either format a
    parseable error code, or a simpler error string.

    This must be called only from Exec and functions called from Exec.


    @type error_descriptor: tuple (string, string, string)
    @param error_descriptor: triplet describing the error (object_type,
        code, description)
    @type object_name: string
    @param object_name: name of object (instance, node ..) the error relates to
    @type message_list: list of strings
    @param message_list: body of error messages
    @type log_type: string
    @param log_type: log message type (WARNING, ERROR ..)
    """
    # Called with empty list - nothing to do
    if not message_list:
      return

    object_type, error_code, _ = error_descriptor
    # If the error code is in the list of ignored errors, demote the error to a
    # warning
    if error_code in self.op.ignore_errors: # pylint: disable=E1101
      log_type = self.ETYPE_WARNING

    prefixed_list = []
    if self.op.error_codes: # This is a mix-in. pylint: disable=E1101
      # Machine-parseable format: TYPE:ECODE:OBJTYPE:NAME:MESSAGE; consumers
      # rely on the exact field order, do not reorder
      for msg in message_list:
        prefixed_list.append("  - %s:%s:%s:%s:%s" % (
            log_type, error_code, object_type, object_name, msg))
    else:
      # Human-readable format; an absent object name collapses to an empty
      # string rather than "None"
      if not object_name:
        object_name = ""
      for msg in message_list:
        prefixed_list.append("  - %s: %s %s: %s" % (
            log_type, object_type, object_name, msg))

    # Report messages via the feedback_fn
    # pylint: disable=E1101
    self._feedback_fn(constants.ELOG_MESSAGE_LIST, prefixed_list)

    # do not mark the operation as failed for WARN cases only
    if log_type == self.ETYPE_ERROR:
      self.bad = True

  def _ErrorMsg(self, error_descriptor, object_name, message,
                log_type=ETYPE_ERROR):
    """Log a single error message.

    Convenience wrapper around L{_ErrorMsgList} for one message.

    """
    self._ErrorMsgList(error_descriptor, object_name, [message], log_type)

  # TODO: Replace this method with a cleaner interface, get rid of the if
  # condition as it only rarely saves lines, but makes things less readable.
  def _ErrorIf(self, cond, *args, **kwargs):
    """Log an error message if the passed condition is True.

    Note: with debug_simulate_errors set, the message is emitted even when
    the condition is False.

    """
    if (bool(cond)
        or self.op.debug_simulate_errors): # pylint: disable=E1101
      self._Error(*args, **kwargs)

  # TODO: Replace this method with a cleaner interface
  def _Error(self, ecode, item, message, *args, **kwargs):
    """Log an error message unconditionally.

    @param ecode: error descriptor triplet (see L{_ErrorMsgList})
    @param item: name of the object the error relates to
    @param message: error message, treated as a format string if *args given

    """
    #TODO: Remove 'code' argument in favour of using log_type
    log_type = kwargs.get('code', self.ETYPE_ERROR)
    if args:
      message = message % args
    self._ErrorMsgList(ecode, item, [message], log_type=log_type)
176
177
class LUClusterVerify(NoHooksLU):
  """Submits all jobs necessary to verify the cluster.

  """
  REQ_BGL = False

  def ExpandNames(self):
    # No locks needed here; the submitted jobs acquire their own
    self.needed_locks = {}

  def Exec(self, feedback_fn):
    """Build and submit the verification jobs.

    @return: a L{ResultWithJobs} with one job per node group, plus (when
        verifying the whole cluster) a leading config-verification job

    """
    jobs = []

    if self.op.group_name:
      # Single-group verification: no cluster-wide config check, hence
      # nothing for the group job to depend on
      groups = [self.op.group_name]
      depends_fn = lambda: None
    else:
      groups = self.cfg.GetNodeGroupList()

      # Verify global configuration
      jobs.append([
        opcodes.OpClusterVerifyConfig(ignore_errors=self.op.ignore_errors),
        ])

      # Always depend on global verification
      depends_fn = lambda: [(-len(jobs), [])]

    jobs.extend(
      [opcodes.OpClusterVerifyGroup(group_name=group,
                                    ignore_errors=self.op.ignore_errors,
                                    depends=depends_fn(),
                                    verify_clutter=self.op.verify_clutter)]
      for group in groups)

    # Fix up all parameters
    for op in itertools.chain(*jobs):
      op.debug_simulate_errors = self.op.debug_simulate_errors
      op.verbose = self.op.verbose
      op.error_codes = self.op.error_codes
      try:
        op.skip_checks = self.op.skip_checks
      except AttributeError:
        # Only OpClusterVerifyGroup supports skip_checks
        assert not isinstance(op, opcodes.OpClusterVerifyGroup)

    return ResultWithJobs(jobs)
222
223
class LUClusterVerifyDisks(NoHooksLU):
  """Verifies the cluster disks status.

  """
  REQ_BGL = False

  def ExpandNames(self):
    """Acquire (shared) locks on the requested node group(s)."""
    self.share_locks = ShareAll()
    if self.op.group_name:
      group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
      self.needed_locks = {
        locking.LEVEL_NODEGROUP: [group_uuid],
        }
    else:
      self.needed_locks = {
        locking.LEVEL_NODEGROUP: locking.ALL_SET,
        }

  def Exec(self, feedback_fn):
    """Submit per-group disk verification jobs.

    We skip current NodeGroup verification if there are only external storage
    devices. Currently we provide an interface for external storage provider
    for disk verification implementations, however current ExtStorageDevice
    does not provide an API for this yet.

    This check needs to be revisited if ES_ACTION_VERIFY on ExtStorageDevice
    is implemented.

    """
    group_names = self.owned_locks(locking.LEVEL_NODEGROUP)

    for inst_uuid in self.cfg.GetInstanceList():
      if self.cfg.GetInstanceDiskTemplate(inst_uuid) != constants.DT_EXT:
        break
    else:
      # Loop fell through: every instance (possibly none) uses ext storage
      logging.info("All instances have ext storage, skipping verify disks.")
      return ResultWithJobs([])

    # Submit one instance of L{opcodes.OpGroupVerifyDisks} per node group
    jobs = [[opcodes.OpGroupVerifyDisks(group_name=name)]
            for name in group_names]
    return ResultWithJobs(jobs)
263
264
class LUClusterVerifyConfig(NoHooksLU, _VerifyErrors):
  """Verifies the cluster config.

  """
  REQ_BGL = False

  def _VerifyHVP(self, hvp_data):
    """Verifies locally the syntax of the hypervisor parameters.

    @param hvp_data: list of (origin, hypervisor, parameters) triplets, as
        returned by L{_GetAllHypervisorParameters}

    """
    for item, hv_name, hv_params in hvp_data:
      # Fixed: the hypervisor name and the parameter origin ("cluster",
      # "os X", "instance Y") were swapped in the message text
      msg = ("hypervisor %s parameters syntax check (source %s): %%s" %
             (hv_name, item))
      try:
        hv_class = hypervisor.GetHypervisorClass(hv_name)
        utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
        hv_class.CheckParameterSyntax(hv_params)
      except errors.GenericError as err:
        self._ErrorIf(True, constants.CV_ECLUSTERCFG, None, msg % str(err))

  def ExpandNames(self):
    # Shared locks on everything: this LU only reads the configuration
    self.needed_locks = dict.fromkeys(locking.LEVELS, locking.ALL_SET)
    self.share_locks = ShareAll()

  def CheckPrereq(self):
    """Check prerequisites.

    """
    # Retrieve all information
    self.all_group_info = self.cfg.GetAllNodeGroupsInfo()
    self.all_node_info = self.cfg.GetAllNodesInfo()
    self.all_inst_info = self.cfg.GetAllInstancesInfo()

  def Exec(self, feedback_fn):
    """Verify integrity of cluster, performing various test on nodes.

    @rtype: boolean
    @return: True if no (non-ignored) error was reported

    """
    self.bad = False
    self._feedback_fn = feedback_fn

    feedback_fn("* Verifying cluster config")

    msg_list = self.cfg.VerifyConfig()
    self._ErrorMsgList(constants.CV_ECLUSTERCFG, None, msg_list)

    feedback_fn("* Verifying cluster certificate files")

    for cert_filename in pathutils.ALL_CERT_FILES:
      (errcode, msg) = utils.VerifyCertificate(cert_filename)
      self._ErrorIf(errcode, constants.CV_ECLUSTERCERT, None, msg, code=errcode)

    # The luxi daemon needs read access to the node certificate
    self._ErrorIf(not utils.CanRead(constants.LUXID_USER,
                                    pathutils.NODED_CERT_FILE),
                  constants.CV_ECLUSTERCERT,
                  None,
                  pathutils.NODED_CERT_FILE + " must be accessible by the " +
                  constants.LUXID_USER + " user")

    feedback_fn("* Verifying hypervisor parameters")

    self._VerifyHVP(_GetAllHypervisorParameters(self.cfg.GetClusterInfo(),
                                                self.all_inst_info.values()))

    feedback_fn("* Verifying all nodes belong to an existing group")

    # We do this verification here because, should this bogus circumstance
    # occur, it would never be caught by VerifyGroup, which only acts on
    # nodes/instances reachable from existing node groups.

    dangling_nodes = set(node for node in self.all_node_info.values()
                         if node.group not in self.all_group_info)

    dangling_instances = {}
    no_node_instances = []

    # Build the membership set once instead of rebuilding a list of UUIDs
    # for every instance (was accidentally quadratic)
    dangling_node_uuids = set(node.uuid for node in dangling_nodes)

    for inst in self.all_inst_info.values():
      if inst.primary_node in dangling_node_uuids:
        dangling_instances.setdefault(inst.primary_node, []).append(inst)
      elif inst.primary_node not in self.all_node_info:
        no_node_instances.append(inst)

    pretty_dangling = [
        "%s (%s)" %
        (node.name,
         utils.CommaJoin(inst.name for
                         inst in dangling_instances.get(node.uuid, [])))
        for node in dangling_nodes]

    self._ErrorIf(bool(dangling_nodes), constants.CV_ECLUSTERDANGLINGNODES,
                  None,
                  "the following nodes (and their instances) belong to a non"
                  " existing group: %s", utils.CommaJoin(pretty_dangling))

    self._ErrorIf(bool(no_node_instances), constants.CV_ECLUSTERDANGLINGINST,
                  None,
                  "the following instances have a non-existing primary-node:"
                  " %s", utils.CommaJoin(inst.name for
                                         inst in no_node_instances))

    return not self.bad
365
366
367 class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
368 """Verifies the status of a node group.
369
370 """
371 HPATH = "cluster-verify"
372 HTYPE = constants.HTYPE_CLUSTER
373 REQ_BGL = False
374
375 _HOOKS_INDENT_RE = re.compile("^", re.M)
376
  class NodeImage(object):
    """A class representing the logical and physical status of a node.

    @type uuid: string
    @ivar uuid: the node UUID to which this object refers
    @ivar volumes: a structure as returned from
        L{ganeti.backend.GetVolumeList} (runtime)
    @ivar instances: a list of running instances (runtime)
    @ivar pinst: list of configured primary instances (config)
    @ivar sinst: list of configured secondary instances (config)
    @ivar sbp: dictionary of {primary-node: list of instances} for all
        instances for which this node is secondary (config)
    @ivar mfree: free memory, as reported by hypervisor (runtime)
    @ivar dfree: free disk, as reported by the node (runtime)
    @ivar offline: the offline status (config)
    @type rpc_fail: boolean
    @ivar rpc_fail: whether the RPC verify call was successfull (overall,
        not whether the individual keys were correct) (runtime)
    @type lvm_fail: boolean
    @ivar lvm_fail: whether the RPC call didn't return valid LVM data
    @type hyp_fail: boolean
    @ivar hyp_fail: whether the RPC call didn't return the instance list
    @type ghost: boolean
    @ivar ghost: whether this is a known node or not (config)
    @type os_fail: boolean
    @ivar os_fail: whether the RPC call didn't return valid OS data
    @type oslist: list
    @ivar oslist: list of OSes as diagnosed by DiagnoseOS
    @type vm_capable: boolean
    @ivar vm_capable: whether the node can host instances
    @type pv_min: float
    @ivar pv_min: size in MiB of the smallest PVs
    @type pv_max: float
    @ivar pv_max: size in MiB of the biggest PVs

    """
    def __init__(self, offline=False, uuid=None, vm_capable=True):
      # Identity and configured state
      self.uuid = uuid
      # Per-node data collected at runtime (volumes, running instances)
      self.volumes = {}
      self.instances = []
      # Configured primary/secondary instance lists
      self.pinst = []
      self.sinst = []
      self.sbp = {}
      # Runtime resource reports
      self.mfree = 0
      self.dfree = 0
      self.offline = offline
      self.vm_capable = vm_capable
      # Failure flags for the various sections of the node-verify RPC result
      self.rpc_fail = False
      self.lvm_fail = False
      self.hyp_fail = False
      self.ghost = False
      self.os_fail = False
      self.oslist = {}
      # PV size extremes, used by the exclusive-storage group checks
      self.pv_min = None
      self.pv_max = None
432
433 def ExpandNames(self):
434 # This raises errors.OpPrereqError on its own:
435 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
436
437 # Get instances in node group; this is unsafe and needs verification later
438 inst_uuids = \
439 self.cfg.GetNodeGroupInstances(self.group_uuid, primary_only=True)
440
441 self.needed_locks = {
442 locking.LEVEL_INSTANCE: self.cfg.GetInstanceNames(inst_uuids),
443 locking.LEVEL_NODEGROUP: [self.group_uuid],
444 locking.LEVEL_NODE: [],
445 }
446
447 self.share_locks = ShareAll()
448
449 def DeclareLocks(self, level):
450 if level == locking.LEVEL_NODE:
451 # Get members of node group; this is unsafe and needs verification later
452 nodes = set(self.cfg.GetNodeGroup(self.group_uuid).members)
453
454 # In Exec(), we warn about mirrored instances that have primary and
455 # secondary living in separate node groups. To fully verify that
456 # volumes for these instances are healthy, we will need to do an
457 # extra call to their secondaries. We ensure here those nodes will
458 # be locked.
459 for inst_name in self.owned_locks(locking.LEVEL_INSTANCE):
460 # Important: access only the instances whose lock is owned
461 instance = self.cfg.GetInstanceInfoByName(inst_name)
462 disks = self.cfg.GetInstanceDisks(instance.uuid)
463 if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR):
464 nodes.update(self.cfg.GetInstanceSecondaryNodes(instance.uuid))
465
466 self.needed_locks[locking.LEVEL_NODE] = nodes
467
  def CheckPrereq(self):
    """Verify that all declared locks were actually acquired.

    Because the node/instance lists used in ExpandNames/DeclareLocks were
    read without locks held, re-read them here (with locks owned) and fail
    if anything relevant is not covered by an owned lock.

    """
    assert self.group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
    self.group_info = self.cfg.GetNodeGroup(self.group_uuid)

    group_node_uuids = set(self.group_info.members)
    group_inst_uuids = \
      self.cfg.GetNodeGroupInstances(self.group_uuid, primary_only=True)

    # Nodes that joined the group after ExpandNames ran are not locked
    unlocked_node_uuids = \
        group_node_uuids.difference(self.owned_locks(locking.LEVEL_NODE))

    # Instance locks are held by name; map owned names back to UUIDs
    unlocked_inst_uuids = \
        group_inst_uuids.difference(
            [self.cfg.GetInstanceInfoByName(name).uuid
             for name in self.owned_locks(locking.LEVEL_INSTANCE)])

    if unlocked_node_uuids:
      raise errors.OpPrereqError(
        "Missing lock for nodes: %s" %
        utils.CommaJoin(self.cfg.GetNodeNames(unlocked_node_uuids)),
        errors.ECODE_STATE)

    if unlocked_inst_uuids:
      raise errors.OpPrereqError(
        "Missing lock for instances: %s" %
        utils.CommaJoin(self.cfg.GetInstanceNames(unlocked_inst_uuids)),
        errors.ECODE_STATE)

    self.all_node_info = self.cfg.GetAllNodesInfo()
    self.all_inst_info = self.cfg.GetAllInstancesInfo()
    self.all_disks_info = self.cfg.GetAllDisksInfo()

    self.my_node_uuids = group_node_uuids
    self.my_node_info = dict((node_uuid, self.all_node_info[node_uuid])
                             for node_uuid in group_node_uuids)

    self.my_inst_uuids = group_inst_uuids
    self.my_inst_info = dict((inst_uuid, self.all_inst_info[inst_uuid])
                             for inst_uuid in group_inst_uuids)

    # We detect here the nodes that will need the extra RPC calls for verifying
    # split LV volumes; they should be locked.
    extra_lv_nodes = {}

    for inst in self.my_inst_info.values():
      disks = self.cfg.GetInstanceDisks(inst.uuid)
      if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR):
        inst_nodes = self.cfg.GetInstanceNodes(inst.uuid)
        for nuuid in inst_nodes:
          # Only nodes outside this group need the extra LV call
          if self.all_node_info[nuuid].group != self.group_uuid:
            if nuuid in extra_lv_nodes:
              extra_lv_nodes[nuuid].append(inst.name)
            else:
              extra_lv_nodes[nuuid] = [inst.name]

    extra_lv_nodes_set = set(extra_lv_nodes.iterkeys())
    unlocked_lv_nodes = \
      extra_lv_nodes_set.difference(self.owned_locks(locking.LEVEL_NODE))

    if unlocked_lv_nodes:
      node_strings = ['%s: [%s]' % (
          self.cfg.GetNodeName(node), utils.CommaJoin(extra_lv_nodes[node]))
            for node in unlocked_lv_nodes]
      raise errors.OpPrereqError("Missing node locks for LV check: %s" %
                                 utils.CommaJoin(node_strings),
                                 errors.ECODE_STATE)
    self.extra_lv_nodes = list(extra_lv_nodes_set)
535
  def _VerifyNode(self, ninfo, nresult):
    """Perform some basic validation on data returned from a node.

      - check the result data structure is well formed and has all the
        mandatory fields
      - check ganeti version

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the results from the node
    @rtype: boolean
    @return: whether overall this call was successful (and we can expect
         reasonable values in the response)

    """
    # main result, nresult should be a non-empty dict
    test = not nresult or not isinstance(nresult, dict)
    self._ErrorIf(test, constants.CV_ENODERPC, ninfo.name,
                  "unable to verify node: no data returned")
    if test:
      return False

    # compares ganeti version
    local_version = constants.PROTOCOL_VERSION
    remote_version = nresult.get("version", None)
    # the reply must carry a (protocol, release) pair
    test = not (remote_version and
                isinstance(remote_version, (list, tuple)) and
                len(remote_version) == 2)
    self._ErrorIf(test, constants.CV_ENODERPC, ninfo.name,
                  "connection to node returned invalid data")
    if test:
      return False

    # protocol mismatch makes the rest of the reply unusable
    test = local_version != remote_version[0]
    self._ErrorIf(test, constants.CV_ENODEVERSION, ninfo.name,
                  "incompatible protocol versions: master %s,"
                  " node %s", local_version, remote_version[0])
    if test:
      return False

    # node seems compatible, we can actually try to look into its results

    # full package version; only a warning, since mixed releases can talk
    # the same protocol
    self._ErrorIf(constants.RELEASE_VERSION != remote_version[1],
                  constants.CV_ENODEVERSION, ninfo.name,
                  "software version mismatch: master %s, node %s",
                  constants.RELEASE_VERSION, remote_version[1],
                  code=self.ETYPE_WARNING)

    hyp_result = nresult.get(constants.NV_HYPERVISOR, None)
    if ninfo.vm_capable and isinstance(hyp_result, dict):
      for hv_name, hv_result in hyp_result.iteritems():
        # a non-None entry is the failure message for that hypervisor
        test = hv_result is not None
        self._ErrorIf(test, constants.CV_ENODEHV, ninfo.name,
                      "hypervisor %s verify failure: '%s'", hv_name, hv_result)

    hvp_result = nresult.get(constants.NV_HVPARAMS, None)
    if ninfo.vm_capable and isinstance(hvp_result, list):
      # the list contains only failed parameter checks, hence cond=True
      for item, hv_name, hv_result in hvp_result:
        self._ErrorIf(True, constants.CV_ENODEHV, ninfo.name,
                      "hypervisor %s parameter verify failure (source %s): %s",
                      hv_name, item, hv_result)

    # an empty list means a clean node setup; a missing key is itself an error
    test = nresult.get(constants.NV_NODESETUP,
                       ["Missing NODESETUP results"])
    self._ErrorIf(test, constants.CV_ENODESETUP, ninfo.name,
                  "node setup error: %s", "; ".join(test))

    return True
605
606 def _VerifyNodeTime(self, ninfo, nresult,
607 nvinfo_starttime, nvinfo_endtime):
608 """Check the node time.
609
610 @type ninfo: L{objects.Node}
611 @param ninfo: the node to check
612 @param nresult: the remote results for the node
613 @param nvinfo_starttime: the start time of the RPC call
614 @param nvinfo_endtime: the end time of the RPC call
615
616 """
617 ntime = nresult.get(constants.NV_TIME, None)
618 try:
619 ntime_merged = utils.MergeTime(ntime)
620 except (ValueError, TypeError):
621 self._ErrorIf(True, constants.CV_ENODETIME, ninfo.name,
622 "Node returned invalid time")
623 return
624
625 if ntime_merged < (nvinfo_starttime - constants.NODE_MAX_CLOCK_SKEW):
626 ntime_diff = "%.01fs" % abs(nvinfo_starttime - ntime_merged)
627 elif ntime_merged > (nvinfo_endtime + constants.NODE_MAX_CLOCK_SKEW):
628 ntime_diff = "%.01fs" % abs(ntime_merged - nvinfo_endtime)
629 else:
630 ntime_diff = None
631
632 self._ErrorIf(ntime_diff is not None, constants.CV_ENODETIME, ninfo.name,
633 "Node time diverges by at least %s from master node time",
634 ntime_diff)
635
636 def _UpdateVerifyNodeLVM(self, ninfo, nresult, vg_name, nimg):
637 """Check the node LVM results and update info for cross-node checks.
638
639 @type ninfo: L{objects.Node}
640 @param ninfo: the node to check
641 @param nresult: the remote results for the node
642 @param vg_name: the configured VG name
643 @type nimg: L{NodeImage}
644 @param nimg: node image
645
646 """
647 if vg_name is None:
648 return
649
650 # checks vg existence and size > 20G
651 vglist = nresult.get(constants.NV_VGLIST, None)
652 test = not vglist
653 self._ErrorIf(test, constants.CV_ENODELVM, ninfo.name,
654 "unable to check volume groups")
655 if not test:
656 vgstatus = utils.CheckVolumeGroupSize(vglist, vg_name,
657 constants.MIN_VG_SIZE)
658 self._ErrorIf(vgstatus, constants.CV_ENODELVM, ninfo.name, vgstatus)
659
660 # Check PVs
661 (errmsgs, pvminmax) = CheckNodePVs(nresult, self._exclusive_storage)
662 for em in errmsgs:
663 self._Error(constants.CV_ENODELVM, ninfo.name, em)
664 if pvminmax is not None:
665 (nimg.pv_min, nimg.pv_max) = pvminmax
666
667 def _VerifyGroupDRBDVersion(self, node_verify_infos):
668 """Check cross-node DRBD version consistency.
669
670 @type node_verify_infos: dict
671 @param node_verify_infos: infos about nodes as returned from the
672 node_verify call.
673
674 """
675 node_versions = {}
676 for node_uuid, ndata in node_verify_infos.items():
677 nresult = ndata.payload
678 if nresult:
679 version = nresult.get(constants.NV_DRBDVERSION, None)
680 if version:
681 node_versions[node_uuid] = version
682
683 if len(set(node_versions.values())) > 1:
684 for node_uuid, version in sorted(node_versions.items()):
685 msg = "DRBD version mismatch: %s" % version
686 self._Error(constants.CV_ENODEDRBDHELPER, node_uuid, msg,
687 code=self.ETYPE_WARNING)
688
689 def _VerifyGroupLVM(self, node_image, vg_name):
690 """Check cross-node consistency in LVM.
691
692 @type node_image: dict
693 @param node_image: info about nodes, mapping from node to names to
694 L{NodeImage} objects
695 @param vg_name: the configured VG name
696
697 """
698 if vg_name is None:
699 return
700
701 # Only exclusive storage needs this kind of checks
702 if not self._exclusive_storage:
703 return
704
705 # exclusive_storage wants all PVs to have the same size (approximately),
706 # if the smallest and the biggest ones are okay, everything is fine.
707 # pv_min is None iff pv_max is None
708 vals = [ni for ni in node_image.values() if ni.pv_min is not None]
709 if not vals:
710 return
711 (pvmin, minnode_uuid) = min((ni.pv_min, ni.uuid) for ni in vals)
712 (pvmax, maxnode_uuid) = max((ni.pv_max, ni.uuid) for ni in vals)
713 bad = utils.LvmExclusiveTestBadPvSizes(pvmin, pvmax)
714 self._ErrorIf(bad, constants.CV_EGROUPDIFFERENTPVSIZE, self.group_info.name,
715 "PV sizes differ too much in the group; smallest (%s MB) is"
716 " on %s, biggest (%s MB) is on %s",
717 pvmin, self.cfg.GetNodeName(minnode_uuid),
718 pvmax, self.cfg.GetNodeName(maxnode_uuid))
719
720 def _VerifyNodeBridges(self, ninfo, nresult, bridges):
721 """Check the node bridges.
722
723 @type ninfo: L{objects.Node}
724 @param ninfo: the node to check
725 @param nresult: the remote results for the node
726 @param bridges: the expected list of bridges
727
728 """
729 if not bridges:
730 return
731
732 missing = nresult.get(constants.NV_BRIDGES, None)
733 test = not isinstance(missing, list)
734 self._ErrorIf(test, constants.CV_ENODENET, ninfo.name,
735 "did not return valid bridge information")
736 if not test:
737 self._ErrorIf(bool(missing), constants.CV_ENODENET, ninfo.name,
738 "missing bridges: %s" % utils.CommaJoin(sorted(missing)))
739
740 def _VerifyNodeUserScripts(self, ninfo, nresult):
741 """Check the results of user scripts presence and executability on the node
742
743 @type ninfo: L{objects.Node}
744 @param ninfo: the node to check
745 @param nresult: the remote results for the node
746
747 """
748 test = not constants.NV_USERSCRIPTS in nresult
749 self._ErrorIf(test, constants.CV_ENODEUSERSCRIPTS, ninfo.name,
750 "did not return user scripts information")
751
752 broken_scripts = nresult.get(constants.NV_USERSCRIPTS, None)
753 if not test:
754 self._ErrorIf(broken_scripts, constants.CV_ENODEUSERSCRIPTS, ninfo.name,
755 "user scripts not present or not executable: %s" %
756 utils.CommaJoin(sorted(broken_scripts)))
757
  def _VerifyNodeNetwork(self, ninfo, nresult):
    """Check the node network connectivity results.

    Covers three independent sections of the node-verify reply: SSH
    reachability of other nodes, TCP reachability of other nodes, and
    reachability of the cluster master IP.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the remote results for the node

    """
    test = constants.NV_NODELIST not in nresult
    self._ErrorIf(test, constants.CV_ENODESSH, ninfo.name,
                  "node hasn't returned node ssh connectivity data")
    if not test:
      # NV_NODELIST maps failed peers to failure messages; empty means OK
      if nresult[constants.NV_NODELIST]:
        for a_node, a_msg in nresult[constants.NV_NODELIST].items():
          self._ErrorIf(True, constants.CV_ENODESSH, ninfo.name,
                        "ssh communication with node '%s': %s", a_node, a_msg)

    if constants.NV_NODENETTEST not in nresult:
      self._ErrorMsg(constants.CV_ENODENET, ninfo.name,
                     "node hasn't returned node tcp connectivity data")
    elif nresult[constants.NV_NODENETTEST]:
      # Report all failing peers in one sorted batch
      nlist = utils.NiceSort(nresult[constants.NV_NODENETTEST].keys())
      msglist = []
      for node in nlist:
        msglist.append("tcp communication with node '%s': %s" %
                       (node, nresult[constants.NV_NODENETTEST][node]))
      self._ErrorMsgList(constants.CV_ENODENET, ninfo.name, msglist)

    if constants.NV_MASTERIP not in nresult:
      self._ErrorMsg(constants.CV_ENODENET, ninfo.name,
                     "node hasn't returned node master IP reachability data")
    elif nresult[constants.NV_MASTERIP] is False:  # be explicit, could be None
      # The master node failing to reach its own IP usually means the IP is
      # not configured at all
      if ninfo.uuid == self.master_node:
        msg = "the master node cannot reach the master IP (not configured?)"
      else:
        msg = "cannot reach the master IP"
      self._ErrorMsg(constants.CV_ENODENET, ninfo.name, msg)
795
796 def _VerifyInstance(self, instance, node_image, diskstatus):
797 """Verify an instance.
798
799 This function checks to see if the required block devices are
800 available on the instance's node, and that the nodes are in the correct
801 state.
802
803 """
804 pnode_uuid = instance.primary_node
805 pnode_img = node_image[pnode_uuid]
806 groupinfo = self.cfg.GetAllNodeGroupsInfo()
807
808 node_vol_should = {}
809 self.cfg.GetInstanceLVsByNode(instance.uuid, lvmap=node_vol_should)
810
811 cluster = self.cfg.GetClusterInfo()
812 ipolicy = ganeti.masterd.instance.CalculateGroupIPolicy(cluster,
813 self.group_info)
814 err = ComputeIPolicyInstanceViolation(ipolicy, instance, self.cfg)
815 self._ErrorIf(err, constants.CV_EINSTANCEPOLICY, instance.name,
816 utils.CommaJoin(err), code=self.ETYPE_WARNING)
817
818 for node_uuid in node_vol_should:
819 n_img = node_image[node_uuid]
820 if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
821 # ignore missing volumes on offline or broken nodes
822 continue
823 for volume in node_vol_should[node_uuid]:
824 test = volume not in n_img.volumes
825 self._ErrorIf(test, constants.CV_EINSTANCEMISSINGDISK, instance.name,
826 "volume %s missing on node %s", volume,
827 self.cfg.GetNodeName(node_uuid))
828
829 if instance.admin_state == constants.ADMINST_UP:
830 test = instance.uuid not in pnode_img.instances and not pnode_img.offline
831 self._ErrorIf(test, constants.CV_EINSTANCEDOWN, instance.name,
832 "instance not running on its primary node %s",
833 self.cfg.GetNodeName(pnode_uuid))
834 self._ErrorIf(pnode_img.offline, constants.CV_EINSTANCEBADNODE,
835 instance.name, "instance is marked as running and lives on"
836 " offline node %s", self.cfg.GetNodeName(pnode_uuid))
837
838 diskdata = [(nname, success, status, idx)
839 for (nname, disks) in diskstatus.items()
840 for idx, (success, status) in enumerate(disks)]
841
842 for nname, success, bdev_status, idx in diskdata:
843 # the 'ghost node' construction in Exec() ensures that we have a
844 # node here
845 snode = node_image[nname]
846 bad_snode = snode.ghost or snode.offline
847 self._ErrorIf(instance.disks_active and
848 not success and not bad_snode,
849 constants.CV_EINSTANCEFAULTYDISK, instance.name,
850 "couldn't retrieve status for disk/%s on %s: %s",
851 idx, self.cfg.GetNodeName(nname), bdev_status)
852
853 if instance.disks_active and success and bdev_status.is_degraded:
854 msg = "disk/%s on %s is degraded" % (idx, self.cfg.GetNodeName(nname))
855
856 code = self.ETYPE_ERROR
857 accepted_lds = [constants.LDS_OKAY, constants.LDS_SYNC]
858
859 if bdev_status.ldisk_status in accepted_lds:
860 code = self.ETYPE_WARNING
861
862 msg += "; local disk state is '%s'" % \
863 constants.LDS_NAMES[bdev_status.ldisk_status]
864
865 self._Error(constants.CV_EINSTANCEFAULTYDISK, instance.name, msg,
866 code=code)
867
868 self._ErrorIf(pnode_img.rpc_fail and not pnode_img.offline,
869 constants.CV_ENODERPC, self.cfg.GetNodeName(pnode_uuid),
870 "instance %s, connection to primary node failed",
871 instance.name)
872
873 secondary_nodes = self.cfg.GetInstanceSecondaryNodes(instance.uuid)
874 self._ErrorIf(len(secondary_nodes) > 1,
875 constants.CV_EINSTANCELAYOUT, instance.name,
876 "instance has multiple secondary nodes: %s",
877 utils.CommaJoin(secondary_nodes),
878 code=self.ETYPE_WARNING)
879
880 inst_nodes = self.cfg.GetInstanceNodes(instance.uuid)
881 es_flags = rpc.GetExclusiveStorageForNodes(self.cfg, inst_nodes)
882 disks = self.cfg.GetInstanceDisks(instance.uuid)
883 if any(es_flags.values()):
884 if not utils.AllDiskOfType(disks, constants.DTS_EXCL_STORAGE):
885 # Disk template not compatible with exclusive_storage: no instance
886 # node should have the flag set
887 es_nodes = [n
888 for (n, es) in es_flags.items()
889 if es]
890 unsupported = [d.dev_type for d in disks
891 if d.dev_type not in constants.DTS_EXCL_STORAGE]
892 self._Error(constants.CV_EINSTANCEUNSUITABLENODE, instance.name,
893 "instance uses disk types %s, which are not supported on"
894 " nodes that have exclusive storage set: %s",
895 utils.CommaJoin(unsupported),
896 utils.CommaJoin(self.cfg.GetNodeNames(es_nodes)))
897 for (idx, disk) in enumerate(disks):
898 self._ErrorIf(disk.spindles is None,
899 constants.CV_EINSTANCEMISSINGCFGPARAMETER, instance.name,
900 "number of spindles not configured for disk %s while"
901 " exclusive storage is enabled, try running"
902 " gnt-cluster repair-disk-sizes", idx)
903
904 if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR):
905 instance_nodes = utils.NiceSort(inst_nodes)
906 instance_groups = {}
907
908 for node_uuid in instance_nodes:
909 instance_groups.setdefault(self.all_node_info[node_uuid].group,
910 []).append(node_uuid)
911
912 pretty_list = [
913 "%s (group %s)" % (utils.CommaJoin(self.cfg.GetNodeNames(nodes)),
914 groupinfo[group].name)
915 # Sort so that we always list the primary node first.
916 for group, nodes in sorted(instance_groups.items(),
917 key=lambda (_, nodes): pnode_uuid in nodes,
918 reverse=True)]
919
920 self._ErrorIf(len(instance_groups) > 1,
921 constants.CV_EINSTANCESPLITGROUPS,
922 instance.name, "instance has primary and secondary nodes in"
923 " different groups: %s", utils.CommaJoin(pretty_list),
924 code=self.ETYPE_WARNING)
925
926 inst_nodes_offline = []
927 for snode in secondary_nodes:
928 s_img = node_image[snode]
929 self._ErrorIf(s_img.rpc_fail and not s_img.offline, constants.CV_ENODERPC,
930 self.cfg.GetNodeName(snode),
931 "instance %s, connection to secondary node failed",
932 instance.name)
933
934 if s_img.offline:
935 inst_nodes_offline.append(snode)
936
937 # warn that the instance lives on offline nodes
938 self._ErrorIf(inst_nodes_offline, constants.CV_EINSTANCEBADNODE,
939 instance.name, "instance has offline secondary node(s) %s",
940 utils.CommaJoin(self.cfg.GetNodeNames(inst_nodes_offline)))
941 # ... or ghost/non-vm_capable nodes
942 for node_uuid in inst_nodes:
943 self._ErrorIf(node_image[node_uuid].ghost, constants.CV_EINSTANCEBADNODE,
944 instance.name, "instance lives on ghost node %s",
945 self.cfg.GetNodeName(node_uuid))
946 self._ErrorIf(not node_image[node_uuid].vm_capable,
947 constants.CV_EINSTANCEBADNODE, instance.name,
948 "instance lives on non-vm_capable node %s",
949 self.cfg.GetNodeName(node_uuid))
950
951 def _VerifyOrphanVolumes(self, vg_name, node_vol_should, node_image,
952 reserved):
953 """Verify if there are any unknown volumes in the cluster.
954
955 The .os, .swap and backup volumes are ignored. All other volumes are
956 reported as unknown.
957
958 @type vg_name: string
959 @param vg_name: the name of the Ganeti-administered volume group
960 @type node_vol_should: dict
961 @param node_vol_should: mapping of node UUIDs to expected LVs on each node
962 @type node_image: dict
963 @param node_image: mapping of node UUIDs to L{NodeImage} objects
964 @type reserved: L{ganeti.utils.FieldSet}
965 @param reserved: a FieldSet of reserved volume names
966
967 """
968 for node_uuid, n_img in node_image.items():
969 if (n_img.offline or n_img.rpc_fail or n_img.lvm_fail or
970 self.all_node_info[node_uuid].group != self.group_uuid):
971 # skip non-healthy nodes
972 continue
973 for volume in n_img.volumes:
974 # skip volumes not belonging to the ganeti-administered volume group
975 if volume.split('/')[0] != vg_name:
976 continue
977
978 test = ((node_uuid not in node_vol_should or
979 volume not in node_vol_should[node_uuid]) and
980 not reserved.Matches(volume))
981 self._ErrorIf(test, constants.CV_ENODEORPHANLV,
982 self.cfg.GetNodeName(node_uuid),
983 "volume %s is unknown", volume,
984 code=_VerifyErrors.ETYPE_WARNING)
985
986 def _VerifyNPlusOneMemory(self, node_image, all_insts):
987 """Verify N+1 Memory Resilience.
988
989 Check that if one single node dies we can still start all the
990 instances it was primary for.
991
992 """
993 cluster_info = self.cfg.GetClusterInfo()
994 for node_uuid, n_img in node_image.items():
995 # This code checks that every node which is now listed as
996 # secondary has enough memory to host all instances it is
997 # supposed to should a single other node in the cluster fail.
998 # FIXME: not ready for failover to an arbitrary node
999 # FIXME: does not support file-backed instances
1000 # WARNING: we currently take into account down instances as well
1001 # as up ones, considering that even if they're down someone
1002 # might want to start them even in the event of a node failure.
1003 if n_img.offline or \
1004 self.all_node_info[node_uuid].group != self.group_uuid:
1005 # we're skipping nodes marked offline and nodes in other groups from
1006 # the N+1 warning, since most likely we don't have good memory
1007 # information from them; we already list instances living on such
1008 # nodes, and that's enough warning
1009 continue
1010 #TODO(dynmem): also consider ballooning out other instances
1011 for prinode, inst_uuids in n_img.sbp.items():
1012 needed_mem = 0
1013 for inst_uuid in inst_uuids:
1014 bep = cluster_info.FillBE(all_insts[inst_uuid])
1015 if bep[constants.BE_AUTO_BALANCE]:
1016 needed_mem += bep[constants.BE_MINMEM]
1017 test = n_img.mfree < needed_mem
1018 self._ErrorIf(test, constants.CV_ENODEN1,
1019 self.cfg.GetNodeName(node_uuid),
1020 "not enough memory to accomodate instance failovers"
1021 " should node %s fail (%dMiB needed, %dMiB available)",
1022 self.cfg.GetNodeName(prinode), needed_mem, n_img.mfree)
1023
  def _CertError(self, *args):
    """Reports a client certificate verification error.

    Forwards C{args} (a message format string plus its arguments) to
    L{_Error} under the cluster client certificate error code, and
    records that at least one certificate error was seen so that a
    general repair hint can be emitted at the end of the check.

    """
    self._Error(constants.CV_ECLUSTERCLIENTCERT, None, *args)
    # remembered by _VerifyClientCertificates to print a final repair hint
    self._cert_error_found = True
1028
1029 def _VerifyClientCertificates(self, nodes, all_nvinfo):
1030 """Verifies the consistency of the client certificates.
1031
1032 This includes several aspects:
1033 - the individual validation of all nodes' certificates
1034 - the consistency of the master candidate certificate map
1035 - the consistency of the master candidate certificate map with the
1036 certificates that the master candidates are actually using.
1037
1038 @param nodes: the list of nodes to consider in this verification
1039 @param all_nvinfo: the map of results of the verify_node call to
1040 all nodes
1041
1042 """
1043
1044 rebuild_certs_msg = (
1045 "To rebuild node certificates, please run"
1046 " 'gnt-cluster renew-crypto --new-node-certificates'.")
1047
1048 self._cert_error_found = False
1049
1050 candidate_certs = self.cfg.GetClusterInfo().candidate_certs
1051 if not candidate_certs:
1052 self._CertError(
1053 "The cluster's list of master candidate certificates is empty."
1054 " This may be because you just updated the cluster. " +
1055 rebuild_certs_msg)
1056 return
1057
1058 if len(candidate_certs) != len(set(candidate_certs.values())):
1059 self._CertError(
1060 "There are at least two master candidates configured to use the same"
1061 " certificate.")
1062
1063 # collect the client certificate
1064 for node in nodes:
1065 if node.offline:
1066 continue
1067
1068 nresult = all_nvinfo[node.uuid]
1069 if nresult.fail_msg or not nresult.payload:
1070 continue
1071
1072 (errcode, msg) = nresult.payload.get(constants.NV_CLIENT_CERT, None)
1073
1074 if errcode is not None:
1075 self._CertError(
1076 "Client certificate of node '%s' failed validation: %s (code '%s')",
1077 node.uuid, msg, errcode)
1078 if not errcode:
1079 digest = msg
1080 if node.master_candidate:
1081 if node.uuid in candidate_certs:
1082 if digest != candidate_certs[node.uuid]:
1083 self._CertError(
1084 "Client certificate digest of master candidate '%s' does not"
1085 " match its entry in the cluster's map of master candidate"
1086 " certificates. Expected: %s Got: %s", node.uuid,
1087 digest, candidate_certs[node.uuid])
1088 else:
1089 self._CertError(
1090 "The master candidate '%s' does not have an entry in the"
1091 " map of candidate certificates.", node.uuid)
1092 if digest in candidate_certs.values():
1093 self._CertError(
1094 "Master candidate '%s' is using a certificate of another node.",
1095 node.uuid)
1096 else:
1097 if node.uuid in candidate_certs:
1098 self._CertError(
1099 "Node '%s' is not a master candidate, but still listed in the"
1100 " map of master candidate certificates.", node.uuid)
1101 if (node.uuid not in candidate_certs and
1102 digest in candidate_certs.values()):
1103 self._CertError(
1104 "Node '%s' is not a master candidate and is incorrectly using a"
1105 " certificate of another node which is master candidate.",
1106 node.uuid)
1107
1108 if self._cert_error_found:
1109 self._CertError(rebuild_certs_msg)
1110
1111 def _VerifySshSetup(self, nodes, all_nvinfo):
1112 """Evaluates the verification results of the SSH setup and clutter test.
1113
1114 @param nodes: List of L{objects.Node} objects
1115 @param all_nvinfo: RPC results
1116
1117 """
1118 for node in nodes:
1119 if not node.offline:
1120 nresult = all_nvinfo[node.uuid]
1121 if nresult.fail_msg or not nresult.payload:
1122 self._ErrorIf(True, constants.CV_ENODESSH, node.name,
1123 "Could not verify the SSH setup of this node.")
1124 return
1125 for ssh_test in [constants.NV_SSH_SETUP, constants.NV_SSH_CLUTTER]:
1126 result = nresult.payload.get(ssh_test, None)
1127 error_msg = ""
1128 if isinstance(result, list):
1129 error_msg = " ".join(result)
1130 self._ErrorIf(result,
1131 constants.CV_ENODESSH, None, error_msg)
1132
  def _VerifyFiles(self, nodes, master_node_uuid, all_nvinfo,
                   (files_all, files_opt, files_mc, files_vm)):
    """Verifies file checksums collected from all nodes.

    @param nodes: List of L{objects.Node} objects
    @param master_node_uuid: UUID of master node
    @param all_nvinfo: RPC results

    """
    # Define functions determining which nodes to consider for a file;
    # a None function means "all nodes"
    files2nodefn = [
      (files_all, None),
      (files_mc, lambda node: (node.master_candidate or
                               node.uuid == master_node_uuid)),
      (files_vm, lambda node: node.vm_capable),
      ]

    # Build mapping from filename to list of nodes which should have the file
    nodefiles = {}
    for (files, fn) in files2nodefn:
      if fn is None:
        filenodes = nodes
      else:
        filenodes = filter(fn, nodes)
      nodefiles.update((filename, frozenset(fn.uuid for fn in filenodes))
                       for filename in files)

    # files_opt is a subset of the other categories, so it needs no entry
    assert set(nodefiles) == (files_all | files_mc | files_vm)

    # per-file mapping of checksum to the set of nodes reporting it
    fileinfo = dict((filename, {}) for filename in nodefiles)
    ignore_nodes = set()

    for node in nodes:
      if node.offline:
        # offline nodes are excluded from all file checks below
        ignore_nodes.add(node.uuid)
        continue

      nresult = all_nvinfo[node.uuid]

      if nresult.fail_msg or not nresult.payload:
        node_files = None
      else:
        fingerprints = nresult.payload.get(constants.NV_FILELIST, {})
        # paths come back in "virtual cluster" form and must be localized
        node_files = dict((vcluster.LocalizeVirtualPath(key), value)
                          for (key, value) in fingerprints.items())
        del fingerprints

      test = not (node_files and isinstance(node_files, dict))
      self._ErrorIf(test, constants.CV_ENODEFILECHECK, node.name,
                    "Node did not return file checksum data")
      if test:
        ignore_nodes.add(node.uuid)
        continue

      # Build per-checksum mapping from filename to nodes having it
      for (filename, checksum) in node_files.items():
        assert filename in nodefiles
        fileinfo[filename].setdefault(checksum, set()).add(node.uuid)

    for (filename, checksums) in fileinfo.items():
      assert compat.all(len(i) > 10 for i in checksums), "Invalid checksum"

      # Nodes having the file
      with_file = frozenset(node_uuid
                            for node_uuids in fileinfo[filename].values()
                            for node_uuid in node_uuids) - ignore_nodes

      expected_nodes = nodefiles[filename] - ignore_nodes

      # Nodes missing file
      missing_file = expected_nodes - with_file

      if filename in files_opt:
        # All or no nodes
        self._ErrorIf(missing_file and missing_file != expected_nodes,
                      constants.CV_ECLUSTERFILECHECK, None,
                      "File %s is optional, but it must exist on all or no"
                      " nodes (not found on %s)",
                      filename,
                      utils.CommaJoin(utils.NiceSort(
                        self.cfg.GetNodeName(n) for n in missing_file)))
      else:
        self._ErrorIf(missing_file, constants.CV_ECLUSTERFILECHECK, None,
                      "File %s is missing from node(s) %s", filename,
                      utils.CommaJoin(utils.NiceSort(
                        self.cfg.GetNodeName(n) for n in missing_file)))

        # Warn if a node has a file it shouldn't
        unexpected = with_file - expected_nodes
        self._ErrorIf(unexpected,
                      constants.CV_ECLUSTERFILECHECK, None,
                      "File %s should not exist on node(s) %s",
                      filename,
                      utils.CommaJoin(utils.NiceSort(
                        self.cfg.GetNodeName(n) for n in unexpected)))

      # See if there are multiple versions of the file
      test = len(checksums) > 1
      if test:
        variants = ["variant %s on %s" %
                    (idx + 1,
                     utils.CommaJoin(utils.NiceSort(
                       self.cfg.GetNodeName(n) for n in node_uuids)))
                    for (idx, (checksum, node_uuids)) in
                      enumerate(sorted(checksums.items()))]
      else:
        variants = []

      self._ErrorIf(test, constants.CV_ECLUSTERFILECHECK, None,
                    "File %s found with %s different checksums (%s)",
                    filename, len(checksums), "; ".join(variants))
1244
1245 def _VerifyNodeDrbdHelper(self, ninfo, nresult, drbd_helper):
1246 """Verify the drbd helper.
1247
1248 """
1249 if drbd_helper:
1250 helper_result = nresult.get(constants.NV_DRBDHELPER, None)
1251 test = (helper_result is None)
1252 self._ErrorIf(test, constants.CV_ENODEDRBDHELPER, ninfo.name,
1253 "no drbd usermode helper returned")
1254 if helper_result:
1255 status, payload = helper_result
1256 test = not status
1257 self._ErrorIf(test, constants.CV_ENODEDRBDHELPER, ninfo.name,
1258 "drbd usermode helper check unsuccessful: %s", payload)
1259 test = status and (payload != drbd_helper)
1260 self._ErrorIf(test, constants.CV_ENODEDRBDHELPER, ninfo.name,
1261 "wrong drbd usermode helper: %s", payload)
1262
1263 @staticmethod
1264 def _ComputeDrbdMinors(ninfo, instanceinfo, disks_info, drbd_map, error_if):
1265 """Gives the DRBD information in a map for a node.
1266
1267 @type ninfo: L{objects.Node}
1268 @param ninfo: the node to check
1269 @param instanceinfo: the dict of instances
1270 @param disks_info: the dict of disks
1271 @param drbd_map: the DRBD map as returned by
1272 L{ganeti.config.ConfigWriter.ComputeDRBDMap}
1273 @type error_if: callable like L{_ErrorIf}
1274 @param error_if: The error reporting function
1275 @return: dict from minor number to (disk_uuid, instance_uuid, active)
1276
1277 """
1278 node_drbd = {}
1279 for minor, disk_uuid in drbd_map[ninfo.uuid].items():
1280 test = disk_uuid not in disks_info
1281 error_if(test, constants.CV_ECLUSTERCFG, None,
1282 "ghost disk '%s' in temporary DRBD map", disk_uuid)
1283 # ghost disk should not be active, but otherwise we
1284 # don't give double warnings (both ghost disk and
1285 # unallocated minor in use)
1286 if test:
1287 node_drbd[minor] = (disk_uuid, None, False)
1288 else:
1289 disk_active = False
1290 disk_instance = None
1291 for (inst_uuid, inst) in instanceinfo.items():
1292 if disk_uuid in inst.disks:
1293 disk_active = inst.disks_active
1294 disk_instance = inst_uuid
1295 break
1296 node_drbd[minor] = (disk_uuid, disk_instance, disk_active)
1297 return node_drbd
1298
1299 def _VerifyNodeDrbd(self, ninfo, nresult, instanceinfo, disks_info,
1300 drbd_helper, drbd_map):
1301 """Verifies and the node DRBD status.
1302
1303 @type ninfo: L{objects.Node}
1304 @param ninfo: the node to check
1305 @param nresult: the remote results for the node
1306 @param instanceinfo: the dict of instances
1307 @param disks_info: the dict of disks
1308 @param drbd_helper: the configured DRBD usermode helper
1309 @param drbd_map: the DRBD map as returned by
1310 L{ganeti.config.ConfigWriter.ComputeDRBDMap}
1311
1312 """
1313 self._VerifyNodeDrbdHelper(ninfo, nresult, drbd_helper)
1314
1315 # compute the DRBD minors
1316 node_drbd = self._ComputeDrbdMinors(ninfo, instanceinfo, disks_info,
1317 drbd_map, self._ErrorIf)
1318
1319 # and now check them
1320 used_minors = nresult.get(constants.NV_DRBDLIST, [])
1321 test = not isinstance(used_minors, (tuple, list))
1322 self._ErrorIf(test, constants.CV_ENODEDRBD, ninfo.name,
1323 "cannot parse drbd status file: %s", str(used_minors))
1324 if test:
1325 # we cannot check drbd status
1326 return
1327
1328 for minor, (disk_uuid, inst_uuid, must_exist) in node_drbd.items():
1329 test = minor not in used_minors and must_exist
1330 if inst_uuid is not None:
1331 attached = "(attached in instance '%s')" % \
1332 self.cfg.GetInstanceName(inst_uuid)
1333 else:
1334 attached = "(detached)"
1335 self._ErrorIf(test, constants.CV_ENODEDRBD, ninfo.name,
1336 "drbd minor %d of disk %s %s is not active",
1337 minor, disk_uuid, attached)
1338 for minor in used_minors:
1339 test = minor not in node_drbd
1340 self._ErrorIf(test, constants.CV_ENODEDRBD, ninfo.name,
1341 "unallocated drbd minor %d is in use", minor)
1342
1343 def _UpdateNodeOS(self, ninfo, nresult, nimg):
1344 """Builds the node OS structures.
1345
1346 @type ninfo: L{objects.Node}
1347 @param ninfo: the node to check
1348 @param nresult: the remote results for the node
1349 @param nimg: the node image object
1350
1351 """
1352 remote_os = nresult.get(constants.NV_OSLIST, None)
1353 test = (not isinstance(remote_os, list) or
1354 not compat.all(isinstance(v, list) and len(v) == 8
1355 for v in remote_os))
1356
1357 self._ErrorIf(test, constants.CV_ENODEOS, ninfo.name,
1358 "node hasn't returned valid OS data")
1359
1360 nimg.os_fail = test
1361
1362 if test:
1363 return
1364
1365 os_dict = {}
1366
1367 for (name, os_path, status, diagnose,
1368 variants, parameters, api_ver,
1369 trusted) in nresult[constants.NV_OSLIST]:
1370
1371 if name not in os_dict:
1372 os_dict[name] = []
1373
1374 # parameters is a list of lists instead of list of tuples due to
1375 # JSON lacking a real tuple type, fix it:
1376 parameters = [tuple(v) for v in parameters]
1377 os_dict[name].append((os_path, status, diagnose,
1378 set(variants), set(parameters), set(api_ver),
1379 trusted))
1380
1381 nimg.oslist = os_dict
1382
  def _VerifyNodeOS(self, ninfo, nimg, base):
    """Verifies the node OS list.

    Checks each OS reported by the node for validity and for consistency
    with the same OS on the reference ('base') node.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nimg: the node image object
    @param base: the 'template' node we match against (e.g. from the master)

    """
    assert not nimg.os_fail, "Entered _VerifyNodeOS with failed OS rpc?"

    # render a parameter set as "name: value" strings for error messages
    beautify_params = lambda l: ["%s: %s" % (k, v) for (k, v) in l]
    for os_name, os_data in nimg.oslist.items():
      assert os_data, "Empty OS status for OS %s?!" % os_name
      # only the first entry counts; later ones are shadowed duplicates
      f_path, f_status, f_diag, f_var, f_param, f_api, f_trusted = os_data[0]
      self._ErrorIf(not f_status, constants.CV_ENODEOS, ninfo.name,
                    "Invalid OS %s (located at %s): %s",
                    os_name, f_path, f_diag)
      self._ErrorIf(len(os_data) > 1, constants.CV_ENODEOS, ninfo.name,
                    "OS '%s' has multiple entries"
                    " (first one shadows the rest): %s",
                    os_name, utils.CommaJoin([v[0] for v in os_data]))
      # comparisons with the 'base' image
      test = os_name not in base.oslist
      self._ErrorIf(test, constants.CV_ENODEOS, ninfo.name,
                    "Extra OS %s not present on reference node (%s)",
                    os_name, self.cfg.GetNodeName(base.uuid))
      if test:
        continue
      assert base.oslist[os_name], "Base node has empty OS status?"
      _, b_status, _, b_var, b_param, b_api, b_trusted = base.oslist[os_name][0]
      if not b_status:
        # base OS is invalid, skipping
        continue
      # compare API versions, variants and parameters field by field
      for kind, a, b in [("API version", f_api, b_api),
                         ("variants list", f_var, b_var),
                         ("parameters", beautify_params(f_param),
                          beautify_params(b_param))]:
        self._ErrorIf(a != b, constants.CV_ENODEOS, ninfo.name,
                      "OS %s for %s differs from reference node %s:"
                      " [%s] vs. [%s]", kind, os_name,
                      self.cfg.GetNodeName(base.uuid),
                      utils.CommaJoin(sorted(a)), utils.CommaJoin(sorted(b)))
      # the trusted flag is not a collection, so it gets its own format
      for kind, a, b in [("trusted", f_trusted, b_trusted)]:
        self._ErrorIf(a != b, constants.CV_ENODEOS, ninfo.name,
                      "OS %s for %s differs from reference node %s:"
                      " %s vs. %s", kind, os_name,
                      self.cfg.GetNodeName(base.uuid), a, b)

    # check any missing OSes
    missing = set(base.oslist.keys()).difference(nimg.oslist.keys())
    self._ErrorIf(missing, constants.CV_ENODEOS, ninfo.name,
                  "OSes present on reference node %s"
                  " but missing on this node: %s",
                  self.cfg.GetNodeName(base.uuid), utils.CommaJoin(missing))
1438
1439 def _VerifyAcceptedFileStoragePaths(self, ninfo, nresult, is_master):
1440 """Verifies paths in L{pathutils.FILE_STORAGE_PATHS_FILE}.
1441
1442 @type ninfo: L{objects.Node}
1443 @param ninfo: the node to check
1444 @param nresult: the remote results for the node
1445 @type is_master: bool
1446 @param is_master: Whether node is the master node
1447
1448 """
1449 cluster = self.cfg.GetClusterInfo()
1450 if (is_master and
1451 (cluster.IsFileStorageEnabled() or
1452 cluster.IsSharedFileStorageEnabled())):
1453 try:
1454 fspaths = nresult[constants.NV_ACCEPTED_STORAGE_PATHS]
1455 except KeyError:
1456 # This should never happen
1457 self._ErrorIf(True, constants.CV_ENODEFILESTORAGEPATHS, ninfo.name,
1458 "Node did not return forbidden file storage paths")
1459 else:
1460 self._ErrorIf(fspaths, constants.CV_ENODEFILESTORAGEPATHS, ninfo.name,
1461 "Found forbidden file storage paths: %s",
1462 utils.CommaJoin(fspaths))
1463 else:
1464 self._ErrorIf(constants.NV_ACCEPTED_STORAGE_PATHS in nresult,
1465 constants.CV_ENODEFILESTORAGEPATHS, ninfo.name,
1466 "Node should not have returned forbidden file storage"
1467 " paths")
1468
1469 def _VerifyStoragePaths(self, ninfo, nresult, file_disk_template,
1470 verify_key, error_key):
1471 """Verifies (file) storage paths.
1472
1473 @type ninfo: L{objects.Node}
1474 @param ninfo: the node to check
1475 @param nresult: the remote results for the node
1476 @type file_disk_template: string
1477 @param file_disk_template: file-based disk template, whose directory
1478 is supposed to be verified
1479 @type verify_key: string
1480 @param verify_key: key for the verification map of this file
1481 verification step
1482 @param error_key: error key to be added to the verification results
1483 in case something goes wrong in this verification step
1484
1485 """
1486 assert (file_disk_template in utils.storage.GetDiskTemplatesOfStorageTypes(
1487 constants.ST_FILE, constants.ST_SHARED_FILE, constants.ST_GLUSTER
1488 ))
1489
1490 cluster = self.cfg.GetClusterInfo()
1491 if cluster.IsDiskTemplateEnabled(file_disk_template):
1492 self._ErrorIf(
1493 verify_key in nresult,
1494 error_key, ninfo.name,
1495 "The configured %s storage path is unusable: %s" %
1496 (file_disk_template, nresult.get(verify_key)))
1497
1498 def _VerifyFileStoragePaths(self, ninfo, nresult):
1499 """Verifies (file) storage paths.
1500
1501 @see: C{_VerifyStoragePaths}
1502
1503 """
1504 self._VerifyStoragePaths(
1505 ninfo, nresult, constants.DT_FILE,
1506 constants.NV_FILE_STORAGE_PATH,
1507 constants.CV_ENODEFILESTORAGEPATHUNUSABLE)
1508
1509 def _VerifySharedFileStoragePaths(self, ninfo, nresult):
1510 """Verifies (file) storage paths.
1511
1512 @see: C{_VerifyStoragePaths}
1513
1514 """
1515 self._VerifyStoragePaths(
1516 ninfo, nresult, constants.DT_SHARED_FILE,
1517 constants.NV_SHARED_FILE_STORAGE_PATH,
1518 constants.CV_ENODESHAREDFILESTORAGEPATHUNUSABLE)
1519
1520 def _VerifyGlusterStoragePaths(self, ninfo, nresult):
1521 """Verifies (file) storage paths.
1522
1523 @see: C{_VerifyStoragePaths}
1524
1525 """
1526 self._VerifyStoragePaths(
1527 ninfo, nresult, constants.DT_GLUSTER,
1528 constants.NV_GLUSTER_STORAGE_PATH,
1529 constants.CV_ENODEGLUSTERSTORAGEPATHUNUSABLE)
1530
1531 def _VerifyOob(self, ninfo, nresult):
1532 """Verifies out of band functionality of a node.
1533
1534 @type ninfo: L{objects.Node}
1535 @param ninfo: the node to check
1536 @param nresult: the remote results for the node
1537
1538 """
1539 # We just have to verify the paths on master and/or master candidates
1540 # as the oob helper is invoked on the master
1541 if ((ninfo.master_candidate or ninfo.master_capable) and
1542 constants.NV_OOB_PATHS in nresult):
1543 for path_result in nresult[constants.NV_OOB_PATHS]:
1544 self._ErrorIf(path_result, constants.CV_ENODEOOBPATH,
1545 ninfo.name, path_result)
1546
1547 def _UpdateNodeVolumes(self, ninfo, nresult, nimg, vg_name):
1548 """Verifies and updates the node volume data.
1549
1550 This function will update a L{NodeImage}'s internal structures
1551 with data from the remote call.
1552
1553 @type ninfo: L{objects.Node}
1554 @param ninfo: the node to check
1555 @param nresult: the remote results for the node
1556 @param nimg: the node image object
1557 @param vg_name: the configured VG name
1558
1559 """
1560 nimg.lvm_fail = True
1561 lvdata = nresult.get(constants.NV_LVLIST, "Missing LV data")
1562 if vg_name is None:
1563 pass
1564 elif isinstance(lvdata, basestring):
1565 self._ErrorIf(True, constants.CV_ENODELVM, ninfo.name,
1566 "LVM problem on node: %s", utils.SafeEncode(lvdata))
1567 elif not isinstance(lvdata, dict):
1568 self._ErrorIf(True, constants.CV_ENODELVM, ninfo.name,
1569 "rpc call to node failed (lvlist)")
1570 else:
1571 nimg.volumes = lvdata
1572 nimg.lvm_fail = False
1573
1574 def _UpdateNodeInstances(self, ninfo, nresult, nimg):
1575 """Verifies and updates the node instance list.
1576
1577 If the listing was successful, then updates this node's instance
1578 list. Otherwise, it marks the RPC call as failed for the instance
1579 list key.
1580
1581 @type ninfo: L{objects.Node}
1582 @param ninfo: the node to check
1583 @param nresult: the remote results for the node
1584 @param nimg: the node image object
1585
1586 """
1587 idata = nresult.get(constants.NV_INSTANCELIST, None)
1588 test = not isinstance(idata, list)
1589 self._ErrorIf(test, constants.CV_ENODEHV, ninfo.name,
1590 "rpc call to node failed (instancelist): %s",
1591 utils.SafeEncode(str(idata)))
1592 if test:
1593 nimg.hyp_fail = True
1594 else:
1595 nimg.instances = [uuid for (uuid, _) in
1596 self.cfg.GetMultiInstanceInfoByName(idata)]
1597
1598 def _UpdateNodeInfo(self, ninfo, nresult, nimg, vg_name):
1599 """Verifies and computes a node information map
1600
1601 @type ninfo: L{objects.Node}
1602 @param ninfo: the node to check
1603 @param nresult: the remote results for the node
1604 @param nimg: the node image object
1605 @param vg_name: the configured VG name
1606
1607 """
1608 # try to read free memory (from the hypervisor)
1609 hv_info = nresult.get(constants.NV_HVINFO, None)
1610 test = not isinstance(hv_info, dict) or "memory_free" not in hv_info
1611 self._ErrorIf(test, constants.CV_ENODEHV, ninfo.name,
1612 "rpc call to node failed (hvinfo)")
1613 if not test:
1614 try:
1615 nimg.mfree = int(hv_info["memory_free"])
1616 except (ValueError, TypeError):
1617 self._ErrorIf(True, constants.CV_ENODERPC, ninfo.name,
1618 "node returned invalid nodeinfo, check hypervisor")
1619
1620 # FIXME: devise a free space model for file based instances as well
1621 if vg_name is not None:
1622 test = (constants.NV_VGLIST not in nresult or
1623 vg_name not in nresult[constants.NV_VGLIST])
1624 self._ErrorIf(test, constants.CV_ENODELVM, ninfo.name,
1625 "node didn't return data for the volume group '%s'"
1626 " - it is either missing or broken", vg_name)
1627 if not test:
1628 try:
1629 nimg.dfree = int(nresult[constants.NV_VGLIST][vg_name])
1630 except (ValueError, TypeError):
1631 self._ErrorIf(True, constants.CV_ENODERPC, ninfo.name,
1632 "node returned invalid LVM info, check LVM status")
1633
  def _CollectDiskInfo(self, node_uuids, node_image, instanceinfo):
    """Gets per-disk status information for all instances.

    @type node_uuids: list of strings
    @param node_uuids: Node UUIDs
    @type node_image: dict of (UUID, L{objects.Node})
    @param node_image: Node objects
    @type instanceinfo: dict of (UUID, L{objects.Instance})
    @param instanceinfo: Instance objects
    @rtype: {instance: {node: [(success, payload)]}}
    @return: a dictionary of per-instance dictionaries with nodes as
        keys and disk information as values; the disk information is a
        list of tuples (success, payload)

    """
    node_disks = {}
    node_disks_dev_inst_only = {}
    diskless_instances = set()
    nodisk_instances = set()

    # build the per-node list of (instance, disk) pairs to query
    for nuuid in node_uuids:
      node_inst_uuids = list(itertools.chain(node_image[nuuid].pinst,
                                             node_image[nuuid].sinst))
      diskless_instances.update(uuid for uuid in node_inst_uuids
                                if not instanceinfo[uuid].disks)
      disks = [(inst_uuid, disk)
               for inst_uuid in node_inst_uuids
               for disk in self.cfg.GetInstanceDisks(inst_uuid)]

      if not disks:
        nodisk_instances.update(uuid for uuid in node_inst_uuids
                                if instanceinfo[uuid].disks)
        # No need to collect data
        continue

      node_disks[nuuid] = disks

      # _AnnotateDiskParams makes already copies of the disks
      dev_inst_only = []
      for (inst_uuid, dev) in disks:
        (anno_disk,) = AnnotateDiskParams(instanceinfo[inst_uuid], [dev],
                                          self.cfg)
        dev_inst_only.append((anno_disk, instanceinfo[inst_uuid]))

      node_disks_dev_inst_only[nuuid] = dev_inst_only

    assert len(node_disks) == len(node_disks_dev_inst_only)

    # Collect data from all nodes with disks
    result = self.rpc.call_blockdev_getmirrorstatus_multi(
               node_disks.keys(), node_disks_dev_inst_only)

    assert len(result) == len(node_disks)

    instdisk = {}

    for (nuuid, nres) in result.items():
      node = self.cfg.GetNodeInfo(nuuid)
      disks = node_disks[node.uuid]

      if nres.offline:
        # No data from this node
        data = len(disks) * [(False, "node offline")]
      else:
        msg = nres.fail_msg
        self._ErrorIf(msg, constants.CV_ENODERPC, node.name,
                      "while getting disk information: %s", msg)
        if msg:
          # No data from this node
          data = len(disks) * [(False, msg)]
        else:
          data = []
          # every payload entry must be a (success, payload) pair;
          # anything else is replaced by a synthetic failure entry
          for idx, i in enumerate(nres.payload):
            if isinstance(i, (tuple, list)) and len(i) == 2:
              data.append(i)
            else:
              logging.warning("Invalid result from node %s, entry %d: %s",
                              node.name, idx, i)
              data.append((False, "Invalid result from the remote node"))

      # zip the queried disks back together with their status entries
      for ((inst_uuid, _), status) in zip(disks, data):
        instdisk.setdefault(inst_uuid, {}).setdefault(node.uuid, []) \
          .append(status)

    # Add empty entries for diskless instances.
    for inst_uuid in diskless_instances:
      assert inst_uuid not in instdisk
      instdisk[inst_uuid] = {}
    # ...and disk-full instances that happen to have no disks
    for inst_uuid in nodisk_instances:
      assert inst_uuid not in instdisk
      instdisk[inst_uuid] = {}

    assert compat.all(len(statuses) == len(instanceinfo[inst].disks) and
                      len(nuuids) <= len(
                        self.cfg.GetInstanceNodes(instanceinfo[inst].uuid)) and
                      compat.all(isinstance(s, (tuple, list)) and
                                 len(s) == 2 for s in statuses)
                      for inst, nuuids in instdisk.items()
                      for nuuid, statuses in nuuids.items())
    if __debug__:
      instdisk_keys = set(instdisk)
      instanceinfo_keys = set(instanceinfo)
      assert instdisk_keys == instanceinfo_keys, \
        ("instdisk keys (%s) do not match instanceinfo keys (%s)" %
         (instdisk_keys, instanceinfo_keys))

    return instdisk
1742
1743 @staticmethod
1744 def _SshNodeSelector(group_uuid, all_nodes):
1745 """Create endless iterators for all potential SSH check hosts.
1746
1747 """
1748 nodes = [node for node in all_nodes
1749 if (node.group != group_uuid and
1750 not node.offline)]
1751 keyfunc = operator.attrgetter("group")
1752
1753 return map(itertools.cycle,
1754 [sorted(n.name for n in names)
1755 for _, names in itertools.groupby(sorted(nodes, key=keyfunc),
1756 keyfunc)])
1757
1758 @classmethod
1759 def _SelectSshCheckNodes(cls, group_nodes, group_uuid, all_nodes):
1760 """Choose which nodes should talk to which other nodes.
1761
1762 We will make nodes contact all nodes in their group, and one node from
1763 every other group.
1764
1765 @rtype: tuple of (string, dict of strings to list of strings, string)
1766 @return: a tuple containing the list of all online nodes, a dictionary
1767 mapping node names to additional nodes of other node groups to which
1768 connectivity should be tested, and a list of all online master
1769 candidates
1770
1771 @warning: This algorithm has a known issue if one node group is much
1772 smaller than others (e.g. just one node). In such a case all other
1773 nodes will talk to the single node.
1774
1775 """
1776 online_nodes = sorted(node.name for node in group_nodes if not node.offline)
1777 online_mcs = sorted(node.name for node in group_nodes
1778 if (node.master_candidate and not node.offline))
1779 sel = cls._SshNodeSelector(group_uuid, all_nodes)
1780
1781 return (online_nodes,
1782 dict((name, sorted([i.next() for i in sel]))
1783 for name in online_nodes),
1784 online_mcs)
1785
1786 def _PrepareSshSetupCheck(self):
1787 """Prepare the input data for the SSH setup verification.
1788
1789 """
1790 all_nodes_info = self.cfg.GetAllNodesInfo()
1791 potential_master_candidates = self.cfg.GetPotentialMasterCandidates()
1792 node_status = [
1793 (uuid, node_info.name, node_info.master_candidate,
1794 node_info.name in potential_master_candidates, not node_info.offline)
1795 for (uuid, node_info) in all_nodes_info.items()]
1796 return node_status
1797
1798 def BuildHooksEnv(self):
1799 """Build hooks env.
1800
1801 Cluster-Verify hooks just ran in the post phase and their failure makes
1802 the output be logged in the verify output and the verification to fail.
1803
1804 """
1805 env = {
1806 "CLUSTER_TAGS": " ".join(self.cfg.GetClusterInfo().GetTags()),
1807 }
1808
1809 env.update(("NODE_TAGS_%s" % node.name, " ".join(node.GetTags()))
1810 for node in self.my_node_info.values())
1811
1812 return env
1813
1814 def BuildHooksNodes(self):
1815 """Build hooks nodes.
1816
1817 """
1818 return ([], list(self.my_node_info.keys()))
1819
1820 @staticmethod
1821 def _VerifyOtherNotes(feedback_fn, i_non_redundant, i_non_a_balanced,
1822 i_offline, n_offline, n_drained):
1823 feedback_fn("* Other Notes")
1824 if i_non_redundant:
1825 feedback_fn(" - NOTICE: %d non-redundant instance(s) found."
1826 % len(i_non_redundant))
1827
1828 if i_non_a_balanced:
1829 feedback_fn(" - NOTICE: %d non-auto-balanced instance(s) found."
1830 % len(i_non_a_balanced))
1831
1832 if i_offline:
1833 feedback_fn(" - NOTICE: %d offline instance(s) found." % i_offline)
1834
1835 if n_offline:
1836 feedback_fn(" - NOTICE: %d offline node(s) found." % n_offline)
1837
1838 if n_drained:
1839 feedback_fn(" - NOTICE: %d drained node(s) found." % n_drained)
1840
1841 def _VerifyExclusionTags(self, nodename, pinst, ctags):
1842 """Verify that all instances have different exclusion tags.
1843
1844 @type nodename: string
1845 @param nodename: the name of the node for which the check is done
1846 @type pinst: list of string
1847 @param pinst: list of UUIDs of those instances having the given node
1848 as primary node
1849 @type ctags: list of string
1850 @param ctags: tags of the cluster
1851
1852 """
1853 exclusion_prefixes = utils.GetExclusionPrefixes(ctags)
1854 tags_seen = set([])
1855 conflicting_tags = set([])
1856 for iuuid in pinst:
1857 allitags = self.my_inst_info[iuuid].tags
1858 if allitags is None:
1859 allitags = []
1860 itags = set([tag for tag in allitags
1861 if utils.IsGoodTag(exclusion_prefixes, tag)])
1862 conflicts = itags.intersection(tags_seen)
1863 if len(conflicts) > 0:
1864 conflicting_tags = conflicting_tags.union(conflicts)
1865 tags_seen = tags_seen.union(itags)
1866
1867 self._ErrorIf(len(conflicting_tags) > 0, constants.CV_EEXTAGS, nodename,
1868 "Tags where there is more than one instance: %s",
1869 list(conflicting_tags), code=constants.CV_WARNING)
1870
1871 def Exec(self, feedback_fn): # pylint: disable=R0915
1872 """Verify integrity of the node group, performing various test on nodes.
1873
1874 """
1875 # This method has too many local variables. pylint: disable=R0914
1876 feedback_fn("* Verifying group '%s'" % self.group_info.name)
1877
1878 if not self.my_node_uuids:
1879 # empty node group
1880 feedback_fn("* Empty node group, skipping verification")
1881 return True
1882
1883 self.bad = False
1884 verbose = self.op.verbose
1885 self._feedback_fn = feedback_fn
1886
1887 vg_name = self.cfg.GetVGName()
1888 drbd_helper = self.cfg.GetDRBDHelper()
1889 cluster = self.cfg.GetClusterInfo()
1890 hypervisors = cluster.enabled_hypervisors
1891 node_data_list = self.my_node_info.values()
1892
1893 i_non_redundant = [] # Non redundant instances
1894 i_non_a_balanced = [] # Non auto-balanced instances
1895 i_offline = 0 # Count of offline instances
1896 n_offline = 0 # Count of offline nodes
1897 n_drained = 0 # Count of nodes being drained
1898 node_vol_should = {}
1899
1900 # FIXME: verify OS list
1901
1902 # File verification
1903 filemap = ComputeAncillaryFiles(cluster, False)
1904
1905 # do local checksums
1906 master_node_uuid = self.master_node = self.cfg.GetMasterNode()
1907 master_ip = self.cfg.GetMasterIP()
1908
1909 online_master_candidates = sorted(
1910 node.name for node in node_data_list
1911 if (node.master_candidate and not node.offline))
1912
1913 feedback_fn("* Gathering data (%d nodes)" % len(self.my_node_uuids))
1914
1915 user_scripts = []
1916 if self.cfg.GetUseExternalMipScript():
1917 user_scripts.append(pathutils.EXTERNAL_MASTER_SETUP_SCRIPT)
1918
1919 online_nodes = [(node.name, node.primary_ip, node.secondary_ip)
1920 for node in node_data_list if not node.offline]
1921 node_nettest_params = (online_nodes, online_master_candidates)
1922
1923 node_verify_param = {
1924 constants.NV_FILELIST:
1925 [vcluster.MakeVirtualPath(f)
1926 for f in utils.UniqueSequence(filename
1927 for files in filemap
1928 for filename in files)],
1929 constants.NV_NODELIST:
1930 self._SelectSshCheckNodes(node_data_list, self.group_uuid,
1931 self.all_node_info.values()),
1932 constants.NV_HYPERVISOR: hypervisors,
1933 constants.NV_HVPARAMS:
1934 _GetAllHypervisorParameters(cluster, self.all_inst_info.values()),
1935 constants.NV_NODENETTEST: node_nettest_params,
1936 constants.NV_INSTANCELIST: hypervisors,
1937 constants.NV_VERSION: None,
1938 constants.NV_HVINFO: self.cfg.GetHypervisorType(),
1939 constants.NV_NODESETUP: None,
1940 constants.NV_TIME: None,
1941 constants.NV_MASTERIP: (self.cfg.GetMasterNodeName(), master_ip,
1942 online_master_candidates),
1943 constants.NV_OSLIST: None,
1944 constants.NV_NONVMNODES: self.cfg.GetNonVmCapableNodeNameList(),
1945 constants.NV_USERSCRIPTS: user_scripts,
1946 constants.NV_CLIENT_CERT: None,
1947 }
1948
1949 if self.cfg.GetClusterInfo().modify_ssh_setup:
1950 node_verify_param[constants.NV_SSH_SETUP] = \
1951 (self._PrepareSshSetupCheck(), self.cfg.GetClusterInfo().ssh_key_type)
1952 if self.op.verify_clutter:
1953 node_verify_param[constants.NV_SSH_CLUTTER] = True
1954
1955 if vg_name is not None:
1956 node_verify_param[constants.NV_VGLIST] = None
1957 node_verify_param[constants.NV_LVLIST] = vg_name
1958 node_verify_param[constants.NV_PVLIST] = [vg_name]
1959
1960 if cluster.IsDiskTemplateEnabled(constants.DT_DRBD8):
1961 if drbd_helper:
1962 node_verify_param[constants.NV_DRBDVERSION] = None
1963 node_verify_param[constants.NV_DRBDLIST] = None
1964 node_verify_param[constants.NV_DRBDHELPER] = drbd_helper
1965
1966 if cluster.IsFileStorageEnabled() or \
1967 cluster.IsSharedFileStorageEnabled():
1968 # Load file storage paths only from master node
1969 node_verify_param[constants.NV_ACCEPTED_STORAGE_PATHS] = \
1970 self.cfg.GetMasterNodeName()
1971 if cluster.IsFileStorageEnabled():
1972 node_verify_param[constants.NV_FILE_STORAGE_PATH] = \
1973 cluster.file_storage_dir
1974 if cluster.IsSharedFileStorageEnabled():
1975 node_verify_param[constants.NV_SHARED_FILE_STORAGE_PATH] = \
1976 cluster.shared_file_storage_dir
1977
1978 # bridge checks
1979 # FIXME: this needs to be changed per node-group, not cluster-wide
1980 bridges = set()
1981 default_nicpp = cluster.nicparams[constants.PP_DEFAULT]
1982 if default_nicpp[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
1983 bridges.add(default_nicpp[constants.NIC_LINK])
1984 for inst_uuid in self.my_inst_info.values():
1985 for nic in inst_uuid.nics:
1986 full_nic = cluster.SimpleFillNIC(nic.nicparams)
1987 if full_nic[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
1988 bridges.add(full_nic[constants.NIC_LINK])
1989
1990 if bridges:
1991 node_verify_param[constants.NV_BRIDGES] = list(bridges)
1992
1993 # Build our expected cluster state
1994 node_image = dict((node.uuid, self.NodeImage(offline=node.offline,
1995 uuid=node.uuid,
1996 vm_capable=node.vm_capable))
1997 for node in node_data_list)
1998
1999 # Gather OOB paths
2000 oob_paths = []
2001 for node in self.all_node_info.values():
2002 path = SupportsOob(self.cfg, node)
2003 if path and path not in oob_paths:
2004 oob_paths.append(path)
2005
2006 if oob_paths:
2007 node_verify_param[constants.NV_OOB_PATHS] = oob_paths
2008
2009 for inst_uuid in self.my_inst_uuids:
2010 instance = self.my_inst_info[inst_uuid]
2011 if instance.admin_state == constants.ADMINST_OFFLINE:
2012 i_offline += 1
2013
2014 inst_nodes = self.cfg.GetInstanceNodes(instance.uuid)
2015 for nuuid in inst_nodes:
2016 if nuuid not in node_image:
2017 gnode = self.NodeImage(uuid=nuuid)
2018 gnode.ghost = (nuuid not in self.all_node_info)
2019 node_image[nuuid] = gnode
2020
2021 self.cfg.GetInstanceLVsByNode(instance.uuid, lvmap=node_vol_should)
2022
2023 pnode = instance.primary_node
2024 node_image[pnode].pinst.append(instance.uuid)
2025
2026 for snode in self.cfg.GetInstanceSecondaryNodes(instance.uuid):
2027 nimg = node_image[snode]
2028 nimg.sinst.append(instance.uuid)
2029 if pnode not in nimg.sbp:
2030 nimg.sbp[pnode] = []
2031 nimg.sbp[pnode].append(instance.uuid)
2032
2033 es_flags = rpc.GetExclusiveStorageForNodes(self.cfg,
2034 self.my_node_info.keys())
2035 # The value of exclusive_storage should be the same across the group, so if
2036 # it's True for at least a node, we act as if it were set for all the nodes
2037 self._exclusive_storage = compat.any(es_flags.values())
2038 if self._exclusive_storage:
2039 node_verify_param[constants.NV_EXCLUSIVEPVS] = True
2040
2041 # At this point, we have the in-memory data structures complete,
2042 # except for the runtime information, which we'll gather next
2043
2044 # NOTE: Here we lock the configuration for the duration of RPC calls,
2045 # which means that the cluster configuration changes are blocked during
2046 # this period.
2047 # This is something that should be done only exceptionally and only for
2048 # justified cases!
2049 # In this case, we need the lock as we can only verify the integrity of
2050 # configuration files on MCs only if we know nobody else is modifying it.
2051 # FIXME: The check for integrity of config.data should be moved to
2052 # WConfD, which is the only one who can otherwise ensure nobody
2053 # will modify the configuration during the check.
2054 with self.cfg.GetConfigManager(shared=True, forcelock=True):
2055 feedback_fn("* Gathering information about nodes (%s nodes)" %
2056 len(self.my_node_uuids))
2057 # Force the configuration to be fully distributed before doing any tests
2058 self.cfg.FlushConfigGroup(self.group_uuid)
2059 # Due to the way our RPC system works, exact response times cannot be
2060 # guaranteed (e.g. a broken node could run into a timeout). By keeping
2061 # the time before and after executing the request, we can at least have
2062 # a time window.
2063 nvinfo_starttime = time.time()
2064 # Get lock on the configuration so that nobody modifies it concurrently.
2065 # Otherwise it can be modified by other jobs, failing the consistency
2066 # test.
2067 # NOTE: This is an exceptional situation, we should otherwise avoid
2068 # locking the configuration for something but very fast, pure operations.
2069 cluster_name = self.cfg.GetClusterName()
2070 hvparams = self.cfg.GetClusterInfo().hvparams
2071
2072 all_nvinfo = self.rpc.call_node_verify(self.my_node_uuids,
2073 node_verify_param,
2074 cluster_name,
2075 hvparams)
2076 nvinfo_endtime = time.time()
2077
2078 if self.extra_lv_nodes and vg_name is not None:
2079 feedback_fn("* Gathering information about extra nodes (%s nodes)" %
2080 len(self.extra_lv_nodes))
2081 extra_lv_nvinfo = \
2082 self.rpc.call_node_verify(self.extra_lv_nodes,
2083 {constants.NV_LVLIST: vg_name},
2084 self.cfg.GetClusterName(),
2085 self.cfg.GetClusterInfo().hvparams)
2086 else:
2087 extra_lv_nvinfo = {}
2088
2089 # If not all nodes are being checked, we need to make sure the master
2090 # node and a non-checked vm_capable node are in the list.
2091 absent_node_uuids = set(self.all_node_info).difference(self.my_node_info)
2092 if absent_node_uuids:
2093 vf_nvinfo = all_nvinfo.copy()
2094 vf_node_info = list(self.my_node_info.values())
2095 additional_node_uuids = []
2096 if master_node_uuid not in self.my_node_info:
2097 additional_node_uuids.append(master_node_uuid)
2098 vf_node_info.append(self.all_node_info[master_node_uuid])
2099 # Add the first vm_capable node we find which is not included,
2100 # excluding the master node (which we already have)
2101 for node_uuid in absent_node_uuids:
2102 nodeinfo = self.all_node_info[node_uuid]
2103 if (nodeinfo.vm_capable and not nodeinfo.offline and
2104 node_uuid != master_node_uuid):
2105 additional_node_uuids.append(node_uuid)
2106 vf_node_info.append(self.all_node_info[node_uuid])
2107 break
2108 key = constants.NV_FILELIST
2109
2110 feedback_fn("* Gathering information about the master node")
2111 vf_nvinfo.update(self.rpc.call_node_verify(
2112 additional_node_uuids, {key: node_verify_param[key]},
2113 self.cfg.GetClusterName(), self.cfg.GetClusterInfo().hvparams))
2114 else:
2115 vf_nvinfo = all_nvinfo
2116 vf_node_info = self.my_node_info.values()
2117
2118 all_drbd_map = self.cfg.ComputeDRBDMap()
2119
2120 feedback_fn("* Gathering disk information (%s nodes)" %
2121 len(self.my_node_uuids))
2122 instdisk = self._CollectDiskInfo(self.my_node_info.keys(), node_image,
2123 self.my_inst_info)
2124
2125 feedback_fn("* Verifying configuration file consistency")
2126
2127 self._VerifyClientCertificates(self.my_node_info.values(), all_nvinfo)
2128 if self.cfg.GetClusterInfo().modify_ssh_setup:
2129 self._VerifySshSetup(self.my_node_info.values(), all_nvinfo)
2130 self._VerifyFiles(vf_node_info, master_node_uuid, vf_nvinfo, filemap)
2131
2132 feedback_fn("* Verifying node status")
2133
2134 refos_img = None
2135
2136 for node_i in node_data_list:
2137 nimg = node_image[node_i.uuid]
2138
2139 if node_i.offline:
2140 if verbose:
2141 feedback_fn("* Skipping offline node %s" % (node_i.name,))
2142 n_offline += 1
2143 continue
2144
2145 if node_i.uuid == master_node_uuid:
2146 ntype = "master"
2147 elif node_i.master_candidate:
2148 ntype = "master candidate"
2149 elif node_i.drained:
2150 ntype = "drained"
2151 n_drained += 1
2152 else:
2153 ntype = "regular"
2154 if verbose:
2155 feedback_fn("* Verifying node %s (%s)" % (node_i.name, ntype))
2156
2157 msg = all_nvinfo[node_i.uuid].fail_msg
2158 self._ErrorIf(msg, constants.CV_ENODERPC, node_i.name,
2159 "while contacting node: %s", msg)
2160 if msg:
2161 nimg.rpc_fail = True
2162 continue
2163
2164 nresult = all_nvinfo[node_i.uuid].payload
2165
2166 nimg.call_ok = self._VerifyNode(node_i, nresult)
2167 self._VerifyNodeTime(node_i, nresult, nvinfo_starttime, nvinfo_endtime)
2168 self._VerifyNodeNetwork(node_i, nresult)
2169 self._VerifyNodeUserScripts(node_i, nresult)
2170 self._VerifyOob(node_i, nresult)
2171 self._VerifyAcceptedFileStoragePaths(node_i, nresult,
2172 node_i.uuid == master_node_uuid)
2173 self._VerifyFileStoragePaths(node_i, nresult)
2174 self._VerifySharedFileStoragePaths(node_i, nresult)
2175 self._VerifyGlusterStoragePaths(node_i, nresult)
2176
2177 if nimg.vm_capable:
2178 self._UpdateVerifyNodeLVM(node_i, nresult, vg_name, nimg)
2179 if constants.DT_DRBD8 in cluster.enabled_disk_templates:
2180 self._VerifyNodeDrbd(node_i, nresult, self.all_inst_info,
2181 self.all_disks_info, drbd_helper, all_drbd_map)
2182
2183 if (constants.DT_PLAIN in cluster.enabled_disk_templates) or \
2184 (constants.DT_DRBD8 in cluster.enabled_disk_templates):
2185 self._UpdateNodeVolumes(node_i, nresult, nimg, vg_name)
2186 self._UpdateNodeInstances(node_i, nresult, nimg)
2187 self._UpdateNodeInfo(node_i, nresult, nimg, vg_name)
2188 self._UpdateNodeOS(node_i, nresult, nimg)
2189
2190 if not nimg.os_fail:
2191 if refos_img is None:
2192 refos_img = nimg
2193 self._VerifyNodeOS(node_i, nimg, refos_img)
2194 self._VerifyNodeBridges(node_i, nresult, bridges)
2195
2196 # Check whether all running instances are primary for the node. (This
2197 # can no longer be done from _VerifyInstance below, since some of the
2198 # wrong instances could be from other node groups.)
2199 non_primary_inst_uuids = set(nimg.instances).difference(nimg.pinst)
2200
2201 for inst_uuid in non_primary_inst_uuids:
2202 test = inst_uuid in self.all_inst_info
2203 self._ErrorIf(test, constants.CV_EINSTANCEWRONGNODE,
2204 self.cfg.GetInstanceName(inst_uuid),
2205 "instance should not run on node %s", node_i.name)
2206 self._ErrorIf(not test, constants.CV_ENODEORPHANINSTANCE, node_i.name,
2207 "node is running unknown instance %s", inst_uuid)
2208
2209 self._VerifyExclusionTags(node_i.name, nimg.pinst, cluster.tags)
2210
2211 self._VerifyGroupDRBDVersion(all_nvinfo)
2212 self._VerifyGroupLVM(node_image, vg_name)
2213
2214 for node_uuid, result in extra_lv_nvinfo.items():
2215 self._UpdateNodeVolumes(self.all_node_info[node_uuid], result.payload,
2216 node_image[node_uuid], vg_name)
2217
2218 feedback_fn("* Verifying instance status")
2219 for inst_uuid in self.my_inst_uuids:
2220 instance = self.my_inst_info[inst_uuid]
2221 if verbose:
2222 feedback_fn("* Verifying instance %s" % instance.name)
2223 self._VerifyInstance(instance, node_image, instdisk[inst_uuid])
2224
2225 # If the instance is not fully redundant we cannot survive losing its
2226 # primary node, so we are not N+1 compliant.
2227 inst_disks = self.cfg.GetInstanceDisks(instance.uuid)
2228 if not utils.AllDiskOfType(inst_disks, constants.DTS_MIRRORED):
2229 i_non_redundant.append(instance)
2230
2231 if not cluster.FillBE(instance)[constants.BE_AUTO_BALANCE]:
2232 i_non_a_balanced.append(instance)
2233
2234 feedback_fn("* Verifying orphan volumes")
2235 reserved = utils.FieldSet(*cluster.reserved_lvs)
2236
2237 # We will get spurious "unknown volume" warnings if any node of this group
2238 # is secondary for an instance whose primary is in another group. To avoid
2239 # them, we find these instances and add their volumes to node_vol_should.
2240 for instance in self.all_inst_info.values():
2241 for secondary in self.cfg.GetInstanceSecondaryNodes(instance.uuid):
2242 if (secondary in self.my_node_info
2243 and instance.uuid not in self.my_inst_info):
2244 self.cfg.GetInstanceLVsByNode(instance.uuid, lvmap=node_vol_should)
2245 break
2246
2247 self._VerifyOrphanVolumes(vg_name, node_vol_should, node_image, reserved)
2248
2249 if constants.VERIFY_NPLUSONE_MEM not in self.op.skip_checks:
2250 feedback_fn("* Verifying N+1 Memory redundancy")
2251 self._VerifyNPlusOneMemory(node_image, self.my_inst_info)
2252
2253 self._VerifyOtherNotes(feedback_fn, i_non_redundant, i_non_a_balanced,
2254 i_offline, n_offline, n_drained)
2255
2256 return not self.bad
2257
2258 def HooksCallBack(self, phase, hooks_results, feedback_fn, lu_result):
2259 """Analyze the post-hooks' result
2260
2261 This method analyses the hook result, handles it, and sends some
2262 nicely-formatted feedback back to the user.
2263
2264 @param phase: one of L{constants.HOOKS_PHASE_POST} or
2265 L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase
2266 @param hooks_results: the results of the multi-node hooks rpc call
2267 @param feedback_fn: function used send feedback back to the caller
2268 @param lu_result: previous Exec result
2269 @return: the new Exec result, based on the previous result
2270 and hook results
2271
2272 """
2273 # We only really run POST phase hooks, only for non-empty groups,
2274 # and are only interested in their results
2275 if not self.my_node_uuids:
2276 # empty node group
2277 pass
2278 elif phase == constants.HOOKS_PHASE_POST:
2279 # Used to change hooks' output to proper indentation
2280 feedback_fn("* Hooks Results")
2281 assert hooks_results, "invalid result from hooks"
2282
2283 for node_name in hooks_results:
2284 res = hooks_results[node_name]
2285 msg = res.fail_msg
2286 test = msg and not res.offline
2287 self._ErrorIf(test, constants.CV_ENODEHOOKS, node_name,
2288 "Communication failure in hooks execution: %s", msg)
2289 if test:
2290 lu_result = False
2291 continue
2292 if res.offline:
2293 # No need to investigate payload if node is offline
2294 continue
2295 for script, hkr, output in res.payload:
2296 test = hkr == constants.HKR_FAIL
2297 self._ErrorIf(test, constants.CV_ENODEHOOKS, node_name,
2298 "Script %s failed, output:", script)
2299 if test:
2300 output = self._HOOKS_INDENT_RE.sub(" ", output)
2301 feedback_fn("%s" % output)
2302 lu_result = False
2303
2304 return lu_result