Merge branch 'stable-2.15' into stable-2.16
[ganeti-github.git] / lib / cmdlib / cluster / verify.py
1 #
2 #
3
4 # Copyright (C) 2014 Google Inc.
5 # All rights reserved.
6 #
7 # Redistribution and use in source and binary forms, with or without
8 # modification, are permitted provided that the following conditions are
9 # met:
10 #
11 # 1. Redistributions of source code must retain the above copyright notice,
12 # this list of conditions and the following disclaimer.
13 #
14 # 2. Redistributions in binary form must reproduce the above copyright
15 # notice, this list of conditions and the following disclaimer in the
16 # documentation and/or other materials provided with the distribution.
17 #
18 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
19 # IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
20 # TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
21 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
22 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
25 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
26 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
27 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30 """Logical units for cluster verification."""
31
32 import itertools
33 import logging
34 import operator
35 import re
36 import time
37 import ganeti.masterd.instance
38 import ganeti.rpc.node as rpc
39
40 from ganeti import compat
41 from ganeti import constants
42 from ganeti import errors
43 from ganeti import locking
44 from ganeti import pathutils
45 from ganeti import utils
46 from ganeti import vcluster
47 from ganeti import hypervisor
48 from ganeti import opcodes
49
50 from ganeti.cmdlib.base import LogicalUnit, NoHooksLU, ResultWithJobs
51 from ganeti.cmdlib.common import ShareAll, ComputeAncillaryFiles, \
52 CheckNodePVs, ComputeIPolicyInstanceViolation, AnnotateDiskParams, \
53 SupportsOob
54
55
56 def _GetAllHypervisorParameters(cluster, instances):
57 """Compute the set of all hypervisor parameters.
58
59 @type cluster: L{objects.Cluster}
60 @param cluster: the cluster object
61 @param instances: list of L{objects.Instance}
62 @param instances: additional instances from which to obtain parameters
63 @rtype: list of (origin, hypervisor, parameters)
64 @return: a list with all parameters found, indicating the hypervisor they
65 apply to, and the origin (can be "cluster", "os X", or "instance Y")
66
67 """
68 hvp_data = []
69
70 for hv_name in cluster.enabled_hypervisors:
71 hvp_data.append(("cluster", hv_name, cluster.GetHVDefaults(hv_name)))
72
73 for os_name, os_hvp in cluster.os_hvp.items():
74 for hv_name, hv_params in os_hvp.items():
75 if hv_params:
76 full_params = cluster.GetHVDefaults(hv_name, os_name=os_name)
77 hvp_data.append(("os %s" % os_name, hv_name, full_params))
78
79 # TODO: collapse identical parameter values in a single one
80 for instance in instances:
81 if instance.hvparams:
82 hvp_data.append(("instance %s" % instance.name, instance.hypervisor,
83 cluster.FillHV(instance)))
84
85 return hvp_data
86
87
class _VerifyErrors(object):
  """Mix-in with error reporting helpers for cluster/group verify LUs.

  It provides L{_Error} and L{_ErrorIf}, both of which keep the C{self.bad}
  boolean up to date. The class it is mixed into is expected to make
  C{self.op} and C{self._feedback_fn} available before either is called.

  """

  # Name of the keyword argument callers use to override the severity
  ETYPE_FIELD = "code"
  ETYPE_ERROR = constants.CV_ERROR
  ETYPE_WARNING = constants.CV_WARNING

  def _Error(self, ecode, item, msg, *args, **kwargs):
    """Format an error message and report it.

    Depending on the opcode's C{error_codes} parameter the message is either
    rendered as a parseable, colon-separated record or as a simpler
    human-readable string.

    This must be called only from Exec and functions called from Exec.

    @param ecode: error code triple; the first element is used as the item
        type and the second as the error text, the third is ignored
    @param item: the affected item, or a false value if there is none
    @param msg: the message, a %-format string if C{args} are given
    @param args: values interpolated into C{msg}
    @keyword code: severity to report, defaults to L{ETYPE_ERROR}

    """
    (itype, etxt, _) = ecode

    # Errors listed in ignore_errors are always demoted to warnings, no
    # matter which severity the caller asked for
    if etxt in self.op.ignore_errors: # pylint: disable=E1101
      severity = self.ETYPE_WARNING
    else:
      severity = kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR)

    # Interpolation only happens when arguments were passed, so messages
    # without arguments may contain literal percent signs
    if args:
      msg = msg % args

    if self.op.error_codes: # This is a mix-in. pylint: disable=E1101
      report = "%s:%s:%s:%s:%s" % (severity, etxt, itype, item, msg)
    else:
      item_text = (" " + item) if item else ""
      report = "%s: %s%s: %s" % (severity, itype, item_text, msg)

    self._feedback_fn(" - %s" % report) # Mix-in. pylint: disable=E1101

    # Warnings alone do not mark the operation as failed
    if severity == self.ETYPE_ERROR:
      self.bad = True

  def _ErrorIf(self, cond, *args, **kwargs):
    """Report an error if the passed condition is true.

    The report is also issued, regardless of the condition, when the opcode
    has C{debug_simulate_errors} set. All remaining arguments are handed to
    L{_Error} unchanged.

    """
    if not cond and not self.op.debug_simulate_errors: # pylint: disable=E1101
      return
    self._Error(*args, **kwargs)
140
141
class LUClusterVerify(NoHooksLU):
  """Submits all jobs necessary to verify the cluster.

  """
  REQ_BGL = False

  def ExpandNames(self):
    """Declare that this LU needs no locks."""
    self.needed_locks = {}

  def Exec(self, feedback_fn):
    """Build the verification jobs and hand them back for submission.

    When a group name was given, a single group verification job is created.
    Otherwise one configuration verification job is created, followed by one
    group verification job per node group, each depending on the
    configuration job.

    @return: a L{ResultWithJobs} wrapping the list of jobs

    """
    jobs = []

    if self.op.group_name:
      groups = [self.op.group_name]
      depend_on_config = False
    else:
      groups = self.cfg.GetNodeGroupList()

      # Verify global configuration
      jobs.append([
        opcodes.OpClusterVerifyConfig(ignore_errors=self.op.ignore_errors),
        ])
      depend_on_config = True

    for group in groups:
      if depend_on_config:
        # Always depend on global verification. The dependency is the
        # negated number of jobs queued so far, i.e. it grows by one with
        # every group job added and thus keeps pointing at the first job.
        depends = [(-len(jobs), [])]
      else:
        depends = None

      jobs.append([
        opcodes.OpClusterVerifyGroup(group_name=group,
                                     ignore_errors=self.op.ignore_errors,
                                     depends=depends,
                                     verify_clutter=self.op.verify_clutter),
        ])

    # Fix up all parameters
    for job in jobs:
      for op in job:
        op.debug_simulate_errors = self.op.debug_simulate_errors
        op.verbose = self.op.verbose
        op.error_codes = self.op.error_codes
        try:
          op.skip_checks = self.op.skip_checks
        except AttributeError:
          # Only group verification opcodes are required to accept it
          assert not isinstance(op, opcodes.OpClusterVerifyGroup)

    return ResultWithJobs(jobs)
186
187
class LUClusterVerifyDisks(NoHooksLU):
  """Verifies the cluster disks status.

  """
  REQ_BGL = False

  def ExpandNames(self):
    """Request shared node group locks.

    Only the group named in the opcode is locked if one was given, otherwise
    all node groups are.

    """
    self.share_locks = ShareAll()

    if self.op.group_name:
      group_locks = [self.cfg.LookupNodeGroup(self.op.group_name)]
    else:
      group_locks = locking.ALL_SET

    self.needed_locks = {
      locking.LEVEL_NODEGROUP: group_locks,
      }

  def Exec(self, feedback_fn):
    """Create one disk verification job per owned node group lock.

    @return: a L{ResultWithJobs} wrapping the list of jobs

    """
    # Submit one instance of L{opcodes.OpGroupVerifyDisks} per node group
    jobs = []
    for group in self.owned_locks(locking.LEVEL_NODEGROUP):
      jobs.append([opcodes.OpGroupVerifyDisks(group_name=group)])

    return ResultWithJobs(jobs)
211
212
class LUClusterVerifyConfig(NoHooksLU, _VerifyErrors):
  """Verifies the cluster config.

  """
  REQ_BGL = False

  def _VerifyHVP(self, hvp_data):
    """Verifies locally the syntax of the hypervisor parameters.

    @type hvp_data: list of (origin, hypervisor, parameters)
    @param hvp_data: the parameter sets to check, in the format returned by
        L{_GetAllHypervisorParameters}

    """
    for (source, hv_name, hv_params) in hvp_data:
      try:
        hv_class = hypervisor.GetHypervisorClass(hv_name)
        utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
        hv_class.CheckParameterSyntax(hv_params)
      except errors.GenericError as err:
        # The hypervisor name goes first and the origin second; all values
        # are interpolated in one step so that a percent sign in any of them
        # cannot be mistaken for a format directive
        self._ErrorIf(True, constants.CV_ECLUSTERCFG, None,
                      "hypervisor %s parameters syntax check (source %s): %s",
                      hv_name, source, str(err))

  def ExpandNames(self):
    """Request all locks on all levels, in shared mode."""
    self.needed_locks = dict.fromkeys(locking.LEVELS, locking.ALL_SET)
    self.share_locks = ShareAll()

  def CheckPrereq(self):
    """Check prerequisites.

    Caches the node group, node and instance information from the
    configuration for use in L{Exec}.

    """
    # Retrieve all information
    self.all_group_info = self.cfg.GetAllNodeGroupsInfo()
    self.all_node_info = self.cfg.GetAllNodesInfo()
    self.all_inst_info = self.cfg.GetAllInstancesInfo()

  def Exec(self, feedback_fn):
    """Verify integrity of cluster, performing various test on nodes.

    @rtype: boolean
    @return: True if no error (as opposed to warning) was reported

    """
    self.bad = False
    self._feedback_fn = feedback_fn

    feedback_fn("* Verifying cluster config")

    for msg in self.cfg.VerifyConfig():
      self._ErrorIf(True, constants.CV_ECLUSTERCFG, None, msg)

    feedback_fn("* Verifying cluster certificate files")

    for cert_filename in pathutils.ALL_CERT_FILES:
      # The code returned by the check doubles as the severity of the report
      (errcode, msg) = utils.VerifyCertificate(cert_filename)
      self._ErrorIf(errcode, constants.CV_ECLUSTERCERT, None, msg, code=errcode)

    self._ErrorIf(not utils.CanRead(constants.LUXID_USER,
                                    pathutils.NODED_CERT_FILE),
                  constants.CV_ECLUSTERCERT,
                  None,
                  pathutils.NODED_CERT_FILE + " must be accessible by the " +
                    constants.LUXID_USER + " user")

    feedback_fn("* Verifying hypervisor parameters")

    self._VerifyHVP(_GetAllHypervisorParameters(self.cfg.GetClusterInfo(),
                                                self.all_inst_info.values()))

    feedback_fn("* Verifying all nodes belong to an existing group")

    # We do this verification here because, should this bogus circumstance
    # occur, it would never be caught by VerifyGroup, which only acts on
    # nodes/instances reachable from existing node groups.

    dangling_nodes = set(node for node in self.all_node_info.values()
                         if node.group not in self.all_group_info)

    # Computed once, so that the per-instance membership test below does not
    # rebuild the collection for every instance
    dangling_node_uuids = frozenset(node.uuid for node in dangling_nodes)

    dangling_instances = {}
    no_node_instances = []

    for inst in self.all_inst_info.values():
      if inst.primary_node in dangling_node_uuids:
        dangling_instances.setdefault(inst.primary_node, []).append(inst)
      elif inst.primary_node not in self.all_node_info:
        no_node_instances.append(inst)

    pretty_dangling = [
        "%s (%s)" %
        (node.name,
         utils.CommaJoin(inst.name for
                         inst in dangling_instances.get(node.uuid, [])))
        for node in dangling_nodes]

    self._ErrorIf(bool(dangling_nodes), constants.CV_ECLUSTERDANGLINGNODES,
                  None,
                  "the following nodes (and their instances) belong to a non"
                  " existing group: %s", utils.CommaJoin(pretty_dangling))

    self._ErrorIf(bool(no_node_instances), constants.CV_ECLUSTERDANGLINGINST,
                  None,
                  "the following instances have a non-existing primary-node:"
                  " %s", utils.CommaJoin(inst.name for
                                         inst in no_node_instances))

    return not self.bad
313
314
315 class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
316 """Verifies the status of a node group.
317
318 """
319 HPATH = "cluster-verify"
320 HTYPE = constants.HTYPE_CLUSTER
321 REQ_BGL = False
322
323 _HOOKS_INDENT_RE = re.compile("^", re.M)
324
class NodeImage(object):
  """The logical and physical status of a single node.

  Attributes marked (config) mirror the configuration, those marked
  (runtime) are meant to hold data gathered from the node itself.

  @type uuid: string
  @ivar uuid: the node UUID to which this object refers
  @ivar volumes: a structure as returned from
      L{ganeti.backend.GetVolumeList} (runtime)
  @ivar instances: a list of running instances (runtime)
  @ivar pinst: list of configured primary instances (config)
  @ivar sinst: list of configured secondary instances (config)
  @ivar sbp: dictionary of {primary-node: list of instances} for all
      instances for which this node is secondary (config)
  @ivar mfree: free memory, as reported by hypervisor (runtime)
  @ivar dfree: free disk, as reported by the node (runtime)
  @ivar offline: the offline status (config)
  @type rpc_fail: boolean
  @ivar rpc_fail: whether the RPC verify call failed (overall, not whether
      the individual keys were correct) (runtime)
  @type lvm_fail: boolean
  @ivar lvm_fail: whether the RPC call didn't return valid LVM data
  @type hyp_fail: boolean
  @ivar hyp_fail: whether the RPC call didn't return the instance list
  @type ghost: boolean
  @ivar ghost: whether this is a known node or not (config)
  @type os_fail: boolean
  @ivar os_fail: whether the RPC call didn't return valid OS data
  @type oslist: list
  @ivar oslist: list of OSes as diagnosed by DiagnoseOS
  @type vm_capable: boolean
  @ivar vm_capable: whether the node can host instances
  @type pv_min: float
  @ivar pv_min: size in MiB of the smallest PVs
  @type pv_max: float
  @ivar pv_max: size in MiB of the biggest PVs

  """
  def __init__(self, offline=False, uuid=None, vm_capable=True):
    """Create an image with empty runtime data and all failure flags unset.

    """
    # Identity and the values supplied by the caller
    self.uuid = uuid
    self.offline = offline
    self.vm_capable = vm_capable
    self.ghost = False

    # Instance placement
    self.pinst = []
    self.sinst = []
    self.sbp = {}

    # Data reported by the node; each image gets containers of its own
    self.instances = []
    self.volumes = {}
    self.oslist = {}
    self.mfree = 0
    self.dfree = 0
    self.pv_min = None
    self.pv_max = None

    # Failure flags
    self.rpc_fail = False
    self.lvm_fail = False
    self.hyp_fail = False
    self.os_fail = False
380
def ExpandNames(self):
  """Resolve the node group and declare the locks needed to verify it.

  The group itself and its primary instances are requested right away; the
  node level starts out empty and is expected to be filled in later. All
  locks are requested in shared mode.

  """
  # This raises errors.OpPrereqError on its own:
  self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)

  # Get instances in node group; this is unsafe and needs verification later
  inst_uuids = self.cfg.GetNodeGroupInstances(self.group_uuid,
                                              primary_only=True)
  inst_names = self.cfg.GetInstanceNames(inst_uuids)

  self.share_locks = ShareAll()

  self.needed_locks = {
    locking.LEVEL_NODEGROUP: [self.group_uuid],
    locking.LEVEL_INSTANCE: inst_names,
    locking.LEVEL_NODE: [],
    }
396
def DeclareLocks(self, level):
  """Compute the node locks once the instance locks are held.

  Nothing is done for levels other than the node level.

  @param level: the locking level locks are being declared for

  """
  if level != locking.LEVEL_NODE:
    return

  # Get members of node group; this is unsafe and needs verification later
  wanted_nodes = set(self.cfg.GetNodeGroup(self.group_uuid).members)

  # In Exec(), we warn about mirrored instances that have primary and
  # secondary living in separate node groups. To fully verify that
  # volumes for these instances are healthy, we will need to do an
  # extra call to their secondaries. We ensure here those nodes will
  # be locked.
  for inst_name in self.owned_locks(locking.LEVEL_INSTANCE):
    # Important: access only the instances whose lock is owned
    instance = self.cfg.GetInstanceInfoByName(inst_name)
    inst_disks = self.cfg.GetInstanceDisks(instance.uuid)
    if not utils.AnyDiskOfType(inst_disks, constants.DTS_INT_MIRROR):
      continue
    wanted_nodes.update(self.cfg.GetInstanceSecondaryNodes(instance.uuid))

  self.needed_locks[locking.LEVEL_NODE] = wanted_nodes
415
def CheckPrereq(self):
  """Check that all required locks are held and cache configuration data.

  The group membership is read again and compared with the locks actually
  owned, since the data used to request the locks was read before they were
  acquired. Afterwards node, instance and disk information is cached on the
  LU, together with the nodes outside of this group that hold disks of
  internally mirrored instances (C{self.extra_lv_nodes}).

  @raise errors.OpPrereqError: if a node or instance of the group, or a
      node needed for the LV check, is not locked

  """
  assert self.group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
  self.group_info = self.cfg.GetNodeGroup(self.group_uuid)

  group_node_uuids = set(self.group_info.members)
  group_inst_uuids = \
    self.cfg.GetNodeGroupInstances(self.group_uuid, primary_only=True)

  unlocked_node_uuids = \
    group_node_uuids.difference(self.owned_locks(locking.LEVEL_NODE))

  # Instance locks are owned by name, the group membership is by UUID
  unlocked_inst_uuids = \
    group_inst_uuids.difference(
      [self.cfg.GetInstanceInfoByName(name).uuid
       for name in self.owned_locks(locking.LEVEL_INSTANCE)])

  if unlocked_node_uuids:
    raise errors.OpPrereqError(
      "Missing lock for nodes: %s" %
      utils.CommaJoin(self.cfg.GetNodeNames(unlocked_node_uuids)),
      errors.ECODE_STATE)

  if unlocked_inst_uuids:
    raise errors.OpPrereqError(
      "Missing lock for instances: %s" %
      utils.CommaJoin(self.cfg.GetInstanceNames(unlocked_inst_uuids)),
      errors.ECODE_STATE)

  self.all_node_info = self.cfg.GetAllNodesInfo()
  self.all_inst_info = self.cfg.GetAllInstancesInfo()
  self.all_disks_info = self.cfg.GetAllDisksInfo()

  self.my_node_uuids = group_node_uuids
  self.my_node_info = dict((node_uuid, self.all_node_info[node_uuid])
                           for node_uuid in group_node_uuids)

  self.my_inst_uuids = group_inst_uuids
  self.my_inst_info = dict((inst_uuid, self.all_inst_info[inst_uuid])
                           for inst_uuid in group_inst_uuids)

  # We detect here the nodes that will need the extra RPC calls for verifying
  # split LV volumes; they should be locked.
  extra_lv_nodes = set()

  for inst in self.my_inst_info.values():
    disks = self.cfg.GetInstanceDisks(inst.uuid)
    if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR):
      inst_nodes = self.cfg.GetInstanceNodes(inst.uuid)
      for nuuid in inst_nodes:
        if self.all_node_info[nuuid].group != self.group_uuid:
          extra_lv_nodes.add(nuuid)

  unlocked_lv_nodes = \
      extra_lv_nodes.difference(self.owned_locks(locking.LEVEL_NODE))

  if unlocked_lv_nodes:
    # Report node names rather than UUIDs, like the messages above
    raise errors.OpPrereqError(
      "Missing node locks for LV check: %s" %
      utils.CommaJoin(self.cfg.GetNodeNames(unlocked_lv_nodes)),
      errors.ECODE_STATE)
  self.extra_lv_nodes = list(extra_lv_nodes)
476
def _VerifyNode(self, ninfo, nresult):
  """Perform some basic validation on data returned from a node.

    - check the result data structure is well formed and has all the
      mandatory fields
    - check ganeti version

  Hypervisor, hypervisor parameter and node setup results are only looked at
  once the protocol version has been found to match.

  @type ninfo: L{objects.Node}
  @param ninfo: the node to check
  @param nresult: the results from the node
  @rtype: boolean
  @return: whether overall this call was successful (and we can expect
       reasonable values in the response)

  """
  node_name = ninfo.name

  # main result, nresult should be a non-empty dict
  no_data = not (nresult and isinstance(nresult, dict))
  self._ErrorIf(no_data, constants.CV_ENODERPC, node_name,
                "unable to verify node: no data returned")
  if no_data:
    return False

  # compares ganeti version
  local_version = constants.PROTOCOL_VERSION
  remote_version = nresult.get("version", None)
  # a usable version is a non-empty list or tuple with exactly two elements
  malformed = (not remote_version or
               not isinstance(remote_version, (list, tuple)) or
               len(remote_version) != 2)
  self._ErrorIf(malformed, constants.CV_ENODERPC, node_name,
                "connection to node returned invalid data")
  if malformed:
    return False

  (remote_protocol, remote_release) = (remote_version[0], remote_version[1])

  incompatible = local_version != remote_protocol
  self._ErrorIf(incompatible, constants.CV_ENODEVERSION, node_name,
                "incompatible protocol versions: master %s,"
                " node %s", local_version, remote_protocol)
  if incompatible:
    return False

  # node seems compatible, we can actually try to look into its results

  # full package version; a mismatch here is only worth a warning
  self._ErrorIf(constants.RELEASE_VERSION != remote_release,
                constants.CV_ENODEVERSION, node_name,
                "software version mismatch: master %s, node %s",
                constants.RELEASE_VERSION, remote_release,
                code=self.ETYPE_WARNING)

  # per-hypervisor verification; any result other than None is a failure
  hyp_result = nresult.get(constants.NV_HYPERVISOR, None)
  if ninfo.vm_capable and isinstance(hyp_result, dict):
    for (hv_name, hv_result) in hyp_result.items():
      self._ErrorIf(hv_result is not None, constants.CV_ENODEHV, node_name,
                    "hypervisor %s verify failure: '%s'", hv_name, hv_result)

  # every entry of the hypervisor parameter result is a failure
  hvp_result = nresult.get(constants.NV_HVPARAMS, None)
  if ninfo.vm_capable and isinstance(hvp_result, list):
    for (item, hv_name, hv_result) in hvp_result:
      self._ErrorIf(True, constants.CV_ENODEHV, node_name,
                    "hypervisor %s parameter verify failure (source %s): %s",
                    hv_name, item, hv_result)

  # a missing node setup result is reported as an error in itself
  setup_errors = nresult.get(constants.NV_NODESETUP,
                             ["Missing NODESETUP results"])
  self._ErrorIf(setup_errors, constants.CV_ENODESETUP, node_name,
                "node setup error: %s", "; ".join(setup_errors))

  return True
546
def _VerifyNodeTime(self, ninfo, nresult,
                    nvinfo_starttime, nvinfo_endtime):
  """Check the node time.

  The time reported by the node must lie within the window spanned by the
  RPC call, widened on both sides by C{constants.NODE_MAX_CLOCK_SKEW}.

  @type ninfo: L{objects.Node}
  @param ninfo: the node to check
  @param nresult: the remote results for the node
  @param nvinfo_starttime: the start time of the RPC call
  @param nvinfo_endtime: the end time of the RPC call

  """
  reported = nresult.get(constants.NV_TIME, None)
  try:
    node_time = utils.MergeTime(reported)
  except (ValueError, TypeError):
    self._ErrorIf(True, constants.CV_ENODETIME, ninfo.name,
                  "Node returned invalid time")
    return

  earliest_ok = nvinfo_starttime - constants.NODE_MAX_CLOCK_SKEW
  latest_ok = nvinfo_endtime + constants.NODE_MAX_CLOCK_SKEW

  # The divergence is measured against the nearer end of the RPC call
  if node_time < earliest_ok:
    divergence = "%.01fs" % abs(nvinfo_starttime - node_time)
  elif node_time > latest_ok:
    divergence = "%.01fs" % abs(node_time - nvinfo_endtime)
  else:
    divergence = None

  self._ErrorIf(divergence is not None, constants.CV_ENODETIME, ninfo.name,
                "Node time diverges by at least %s from master node time",
                divergence)
576
def _UpdateVerifyNodeLVM(self, ninfo, nresult, vg_name, nimg):
  """Check the node LVM results and update info for cross-node checks.

  Nothing is checked if no volume group is configured.

  @type ninfo: L{objects.Node}
  @param ninfo: the node to check
  @param nresult: the remote results for the node
  @param vg_name: the configured VG name
  @type nimg: L{NodeImage}
  @param nimg: node image; its C{pv_min} and C{pv_max} are updated if the
      PV check returned sizes

  """
  if vg_name is None:
    return

  # check that the volume group exists and satisfies constants.MIN_VG_SIZE
  vglist = nresult.get(constants.NV_VGLIST, None)
  have_vglist = bool(vglist)
  self._ErrorIf(not have_vglist, constants.CV_ENODELVM, ninfo.name,
                "unable to check volume groups")
  if have_vglist:
    # the status returned by the size check doubles as the error message
    vgstatus = utils.CheckVolumeGroupSize(vglist, vg_name,
                                          constants.MIN_VG_SIZE)
    self._ErrorIf(vgstatus, constants.CV_ENODELVM, ninfo.name, vgstatus)

  # Check PVs
  (pv_errors, pv_sizes) = CheckNodePVs(nresult, self._exclusive_storage)
  for pv_error in pv_errors:
    self._Error(constants.CV_ENODELVM, ninfo.name, pv_error)
  if pv_sizes is not None:
    (nimg.pv_min, nimg.pv_max) = pv_sizes
607
def _VerifyGroupDRBDVersion(self, node_verify_infos):
  """Check cross-node DRBD version consistency.

  Nodes without a payload or without a DRBD version in it are left out of
  the comparison. If more than one distinct version remains, a warning is
  issued for every node that reported a version.

  @type node_verify_infos: dict
  @param node_verify_infos: infos about nodes as returned from the
    node_verify call, indexed by node UUID

  """
  node_versions = {}
  for (node_uuid, ndata) in node_verify_infos.items():
    nresult = ndata.payload
    if not nresult:
      continue
    version = nresult.get(constants.NV_DRBDVERSION, None)
    if version:
      node_versions[node_uuid] = version

  if len(set(node_versions.values())) <= 1:
    return

  for (node_uuid, version) in sorted(node_versions.items()):
    # Identify the node by name, as the other node-level reports do; the
    # UUID is only used for looking the name up
    self._Error(constants.CV_ENODEDRBDHELPER,
                self.cfg.GetNodeName(node_uuid),
                "DRBD version mismatch: %s", version,
                code=self.ETYPE_WARNING)
629
def _VerifyGroupLVM(self, node_image, vg_name):
  """Check cross-node consistency in LVM.

  The check is only carried out if a volume group is configured and
  exclusive storage is in use.

  @type node_image: dict
  @param node_image: info about nodes, mapping from node identifiers to
    L{NodeImage} objects
  @param vg_name: the configured VG name

  """
  # Only exclusive storage needs this kind of checks
  if vg_name is None or not self._exclusive_storage:
    return

  # exclusive_storage wants all PVs to have the same size (approximately),
  # if the smallest and the biggest ones are okay, everything is fine.
  # pv_min is None iff pv_max is None
  with_pvs = [img for img in node_image.values() if img.pv_min is not None]
  if not with_pvs:
    return

  # Pairing each size with the node UUID yields the owning node as well
  (pvmin, minnode_uuid) = min((img.pv_min, img.uuid) for img in with_pvs)
  (pvmax, maxnode_uuid) = max((img.pv_max, img.uuid) for img in with_pvs)

  sizes_differ = utils.LvmExclusiveTestBadPvSizes(pvmin, pvmax)
  self._ErrorIf(sizes_differ, constants.CV_EGROUPDIFFERENTPVSIZE,
                self.group_info.name,
                "PV sizes differ too much in the group; smallest (%s MB) is"
                " on %s, biggest (%s MB) is on %s",
                pvmin, self.cfg.GetNodeName(minnode_uuid),
                pvmax, self.cfg.GetNodeName(maxnode_uuid))
660
def _VerifyNodeBridges(self, ninfo, nresult, bridges):
  """Check the node bridges.

  Nothing is checked if no bridges are expected. Otherwise the node must
  have returned a list, whose entries are reported as missing bridges.

  @type ninfo: L{objects.Node}
  @param ninfo: the node to check
  @param nresult: the remote results for the node
  @param bridges: the expected list of bridges

  """
  if not bridges:
    return

  missing = nresult.get(constants.NV_BRIDGES, None)
  is_valid = isinstance(missing, list)
  self._ErrorIf(not is_valid, constants.CV_ENODENET, ninfo.name,
                "did not return valid bridge information")
  if is_valid:
    self._ErrorIf(bool(missing), constants.CV_ENODENET, ninfo.name,
                  "missing bridges: %s" % utils.CommaJoin(sorted(missing)))
680
def _VerifyNodeUserScripts(self, ninfo, nresult):
  """Check the results of user scripts presence and executability on the node

  A result without user scripts information is an error in itself; otherwise
  every script listed in it is reported.

  @type ninfo: L{objects.Node}
  @param ninfo: the node to check
  @param nresult: the remote results for the node

  """
  has_info = constants.NV_USERSCRIPTS in nresult
  self._ErrorIf(not has_info, constants.CV_ENODEUSERSCRIPTS, ninfo.name,
                "did not return user scripts information")
  if not has_info:
    return

  broken_scripts = nresult[constants.NV_USERSCRIPTS]
  self._ErrorIf(broken_scripts, constants.CV_ENODEUSERSCRIPTS, ninfo.name,
                "user scripts not present or not executable: %s" %
                utils.CommaJoin(sorted(broken_scripts)))
698
def _VerifyNodeNetwork(self, ninfo, nresult):
  """Check the node network connectivity results.

  Three independent results are looked at: ssh connectivity, tcp
  connectivity and reachability of the master IP. For each of them a missing
  result is an error in itself.

  @type ninfo: L{objects.Node}
  @param ninfo: the node to check
  @param nresult: the remote results for the node

  """
  node_name = ninfo.name

  # ssh connectivity: every entry of the result describes a failure
  has_ssh = constants.NV_NODELIST in nresult
  self._ErrorIf(not has_ssh, constants.CV_ENODESSH, node_name,
                "node hasn't returned node ssh connectivity data")
  if has_ssh and nresult[constants.NV_NODELIST]:
    for (a_node, a_msg) in nresult[constants.NV_NODELIST].items():
      self._ErrorIf(True, constants.CV_ENODESSH, node_name,
                    "ssh communication with node '%s': %s", a_node, a_msg)

  # tcp connectivity: as above, reported in sorted order of the peers
  has_tcp = constants.NV_NODENETTEST not in nresult
  has_tcp = not has_tcp
  self._ErrorIf(not has_tcp, constants.CV_ENODENET, node_name,
                "node hasn't returned node tcp connectivity data")
  if has_tcp and nresult[constants.NV_NODENETTEST]:
    tcp_failures = nresult[constants.NV_NODENETTEST]
    for anode in utils.NiceSort(tcp_failures.keys()):
      self._ErrorIf(True, constants.CV_ENODENET, node_name,
                    "tcp communication with node '%s': %s",
                    anode, nresult[constants.NV_NODENETTEST][anode])

  # master IP: a false value means the IP could not be reached
  has_master_ip = constants.NV_MASTERIP in nresult
  self._ErrorIf(not has_master_ip, constants.CV_ENODENET, node_name,
                "node hasn't returned node master IP reachability data")
  if has_master_ip and not nresult[constants.NV_MASTERIP]:
    if ninfo.uuid == self.master_node:
      msg = "the master node cannot reach the master IP (not configured?)"
    else:
      msg = "cannot reach the master IP"
    self._ErrorIf(True, constants.CV_ENODENET, node_name, msg)
737
def _VerifyInstance(self, instance, node_image, diskstatus):
  """Verify an instance.

  This function checks to see if the required block devices are
  available on the instance's node, and that the nodes are in the correct
  state.

  @param instance: the instance to verify
  @param node_image: mapping from node UUID to the L{NodeImage} of the node
  @param diskstatus: mapping from node UUID to a sequence of
      C{(success, status)} pairs, one per disk, in disk index order

  """
  pnode_uuid = instance.primary_node
  pnode_img = node_image[pnode_uuid]
  groupinfo = self.cfg.GetAllNodeGroupsInfo()

  node_vol_should = {}
  self.cfg.GetInstanceLVsByNode(instance.uuid, lvmap=node_vol_should)

  # Instance policy violations are only worth a warning
  cluster = self.cfg.GetClusterInfo()
  ipolicy = ganeti.masterd.instance.CalculateGroupIPolicy(cluster,
                                                          self.group_info)
  err = ComputeIPolicyInstanceViolation(ipolicy, instance, self.cfg)
  self._ErrorIf(err, constants.CV_EINSTANCEPOLICY, instance.name,
                utils.CommaJoin(err), code=self.ETYPE_WARNING)

  for node_uuid in node_vol_should:
    n_img = node_image[node_uuid]
    if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
      # ignore missing volumes on offline or broken nodes
      continue
    for volume in node_vol_should[node_uuid]:
      test = volume not in n_img.volumes
      self._ErrorIf(test, constants.CV_EINSTANCEMISSINGDISK, instance.name,
                    "volume %s missing on node %s", volume,
                    self.cfg.GetNodeName(node_uuid))

  if instance.admin_state == constants.ADMINST_UP:
    test = instance.uuid not in pnode_img.instances and not pnode_img.offline
    self._ErrorIf(test, constants.CV_EINSTANCEDOWN, instance.name,
                  "instance not running on its primary node %s",
                  self.cfg.GetNodeName(pnode_uuid))
    self._ErrorIf(pnode_img.offline, constants.CV_EINSTANCEBADNODE,
                  instance.name, "instance is marked as running and lives on"
                  " offline node %s", self.cfg.GetNodeName(pnode_uuid))

  # Flatten the per-node disk lists into one entry per (node, disk)
  diskdata = [(nname, success, status, idx)
              for (nname, disks) in diskstatus.items()
              for idx, (success, status) in enumerate(disks)]

  for nname, success, bdev_status, idx in diskdata:
    # the 'ghost node' construction in Exec() ensures that we have a
    # node here
    snode = node_image[nname]
    bad_snode = snode.ghost or snode.offline
    self._ErrorIf(instance.disks_active and
                  not success and not bad_snode,
                  constants.CV_EINSTANCEFAULTYDISK, instance.name,
                  "couldn't retrieve status for disk/%s on %s: %s",
                  idx, self.cfg.GetNodeName(nname), bdev_status)

    if instance.disks_active and success and bdev_status.is_degraded:
      msg = "disk/%s on %s is degraded" % (idx, self.cfg.GetNodeName(nname))

      # A degraded disk whose local state is one of the accepted ones only
      # rates a warning
      code = self.ETYPE_ERROR
      accepted_lds = [constants.LDS_OKAY, constants.LDS_SYNC]

      if bdev_status.ldisk_status in accepted_lds:
        code = self.ETYPE_WARNING

      msg += "; local disk state is '%s'" % \
             constants.LDS_NAMES[bdev_status.ldisk_status]

      self._Error(constants.CV_EINSTANCEFAULTYDISK, instance.name, msg,
                  code=code)

  self._ErrorIf(pnode_img.rpc_fail and not pnode_img.offline,
                constants.CV_ENODERPC, self.cfg.GetNodeName(pnode_uuid),
                "instance %s, connection to primary node failed",
                instance.name)

  # Secondary nodes are reported by name, like all other nodes in the
  # messages of this function
  secondary_nodes = self.cfg.GetInstanceSecondaryNodes(instance.uuid)
  self._ErrorIf(len(secondary_nodes) > 1,
                constants.CV_EINSTANCELAYOUT, instance.name,
                "instance has multiple secondary nodes: %s",
                utils.CommaJoin(self.cfg.GetNodeNames(secondary_nodes)),
                code=self.ETYPE_WARNING)

  inst_nodes = self.cfg.GetInstanceNodes(instance.uuid)
  es_flags = rpc.GetExclusiveStorageForNodes(self.cfg, inst_nodes)
  disks = self.cfg.GetInstanceDisks(instance.uuid)
  if any(es_flags.values()):
    if not utils.AllDiskOfType(disks, constants.DTS_EXCL_STORAGE):
      # Disk template not compatible with exclusive_storage: no instance
      # node should have the flag set
      es_nodes = [n
                  for (n, es) in es_flags.items()
                  if es]
      unsupported = [d.dev_type for d in disks
                     if d.dev_type not in constants.DTS_EXCL_STORAGE]
      self._Error(constants.CV_EINSTANCEUNSUITABLENODE, instance.name,
                  "instance uses disk types %s, which are not supported on"
                  " nodes that have exclusive storage set: %s",
                  utils.CommaJoin(unsupported),
                  utils.CommaJoin(self.cfg.GetNodeNames(es_nodes)))
    for (idx, disk) in enumerate(disks):
      self._ErrorIf(disk.spindles is None,
                    constants.CV_EINSTANCEMISSINGCFGPARAMETER, instance.name,
                    "number of spindles not configured for disk %s while"
                    " exclusive storage is enabled, try running"
                    " gnt-cluster repair-disk-sizes", idx)

  if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR):
    instance_nodes = utils.NiceSort(inst_nodes)
    instance_groups = {}

    for node_uuid in instance_nodes:
      instance_groups.setdefault(self.all_node_info[node_uuid].group,
                                 []).append(node_uuid)

    pretty_list = [
      "%s (group %s)" % (utils.CommaJoin(self.cfg.GetNodeNames(nodes)),
                         groupinfo[group].name)
      # Sort so that we always list the primary node first.
      for group, nodes in sorted(instance_groups.items(),
                                 key=lambda item: pnode_uuid in item[1],
                                 reverse=True)]

    self._ErrorIf(len(instance_groups) > 1,
                  constants.CV_EINSTANCESPLITGROUPS,
                  instance.name, "instance has primary and secondary nodes in"
                  " different groups: %s", utils.CommaJoin(pretty_list),
                  code=self.ETYPE_WARNING)

  inst_nodes_offline = []
  for snode in secondary_nodes:
    s_img = node_image[snode]
    self._ErrorIf(s_img.rpc_fail and not s_img.offline, constants.CV_ENODERPC,
                  self.cfg.GetNodeName(snode),
                  "instance %s, connection to secondary node failed",
                  instance.name)

    if s_img.offline:
      inst_nodes_offline.append(snode)

  # warn that the instance lives on offline nodes
  self._ErrorIf(inst_nodes_offline, constants.CV_EINSTANCEBADNODE,
                instance.name, "instance has offline secondary node(s) %s",
                utils.CommaJoin(self.cfg.GetNodeNames(inst_nodes_offline)))
  # ... or ghost/non-vm_capable nodes
  for node_uuid in inst_nodes:
    self._ErrorIf(node_image[node_uuid].ghost, constants.CV_EINSTANCEBADNODE,
                  instance.name, "instance lives on ghost node %s",
                  self.cfg.GetNodeName(node_uuid))
    self._ErrorIf(not node_image[node_uuid].vm_capable,
                  constants.CV_EINSTANCEBADNODE, instance.name,
                  "instance lives on non-vm_capable node %s",
                  self.cfg.GetNodeName(node_uuid))
892
def _VerifyOrphanVolumes(self, vg_name, node_vol_should, node_image,
                         reserved):
  """Report volumes found on nodes that are neither expected nor reserved.

  Only nodes of this LU's node group that are online and had no RPC or LVM
  failure are examined, and only volumes whose name component before the
  first C{/} equals C{vg_name} are considered. Findings are reported as
  warnings.

  @type vg_name: string
  @param vg_name: the name of the Ganeti-administered volume group
  @param node_vol_should: mapping of node UUID to the volumes expected on
    that node
  @param node_image: mapping of node UUID to node image object
  @type reserved: L{ganeti.utils.FieldSet}
  @param reserved: a FieldSet of reserved volume names

  """
  for (node_uuid, n_img) in node_image.items():
    unhealthy = n_img.offline or n_img.rpc_fail or n_img.lvm_fail
    if unhealthy or self.all_node_info[node_uuid].group != self.group_uuid:
      # Nothing reliable can be said about such nodes
      continue

    for volume in n_img.volumes:
      # Volumes of other volume groups are none of our business
      if volume.split("/")[0] == vg_name:
        expected = (node_uuid in node_vol_should and
                    volume in node_vol_should[node_uuid])
        is_orphan = not expected and not reserved.Matches(volume)
        self._ErrorIf(is_orphan, constants.CV_ENODEORPHANLV,
                      self.cfg.GetNodeName(node_uuid),
                      "volume %s is unknown", volume,
                      code=_VerifyErrors.ETYPE_WARNING)
923
def _VerifyNPlusOneMemory(self, node_image, all_insts):
  """Verify N+1 memory resilience.

  For every considered node, and for each primary node it acts as secondary
  for, the minimum memory of the auto-balanced instances that would fail
  over is summed up and compared with the free memory of the node.

  @param node_image: mapping of node UUID to node image object
  @param all_insts: mapping of instance UUID to instance object

  """
  cluster_info = self.cfg.GetClusterInfo()
  for (node_uuid, n_img) in node_image.items():
    # Offline nodes and nodes of other groups are left out of the N+1
    # warning: most likely there is no good memory information for them,
    # and instances living on such nodes are already listed elsewhere,
    # which is warning enough.
    if n_img.offline:
      continue
    if self.all_node_info[node_uuid].group != self.group_uuid:
      continue

    # Every node currently listed as secondary must have enough memory to
    # host all instances it is supposed to, should a single other node fail.
    # FIXME: not ready for failover to an arbitrary node
    # FIXME: does not support file-backed instances
    # WARNING: down instances are counted as well as up ones, since someone
    # might want to start them even in the event of a node failure.
    # TODO(dynmem): also consider ballooning out other instances
    for (prinode, inst_uuids) in n_img.sbp.items():
      be_params = (cluster_info.FillBE(all_insts[inst_uuid])
                   for inst_uuid in inst_uuids)
      needed_mem = sum(bep[constants.BE_MINMEM] for bep in be_params
                       if bep[constants.BE_AUTO_BALANCE])
      self._ErrorIf(n_img.mfree < needed_mem, constants.CV_ENODEN1,
                    self.cfg.GetNodeName(node_uuid),
                    "not enough memory to accomodate instance failovers"
                    " should node %s fail (%dMiB needed, %dMiB available)",
                    self.cfg.GetNodeName(prinode), needed_mem, n_img.mfree)
961
def _VerifyClientCertificates(self, nodes, all_nvinfo):
  """Verifies the consistency of the client certificates.

  This includes several aspects:
    - the individual validation of all nodes' certificates
    - the consistency of the master candidate certificate map
    - the consistency of the master candidate certificate map with the
      certificates that the master candidates are actually using.

  Offline nodes and nodes whose verify result failed or carries no payload
  are skipped. A node whose payload lacks a well-formed client certificate
  entry is reported as an error instead of aborting the verification.

  @param nodes: the list of nodes to consider in this verification
  @param all_nvinfo: the map of results of the verify_node call to
    all nodes

  """
  candidate_certs = self.cfg.GetClusterInfo().candidate_certs
  if not candidate_certs:
    self._ErrorIf(
      True, constants.CV_ECLUSTERCLIENTCERT, None,
      "The cluster's list of master candidate certificates is empty."
      " If you just updated the cluster, please run"
      " 'gnt-cluster renew-crypto --new-node-certificates'.")
    return

  self._ErrorIf(
    len(candidate_certs) != len(set(candidate_certs.values())),
    constants.CV_ECLUSTERCLIENTCERT, None,
    "There are at least two master candidates configured to use the same"
    " certificate.")

  # collect the client certificate
  for node in nodes:
    if node.offline:
      continue

    nresult = all_nvinfo[node.uuid]
    if nresult.fail_msg or not nresult.payload:
      continue

    # The entry is expected to be an (errcode, msg) pair. Unpacking a
    # missing or malformed entry directly would raise and abort the whole
    # verification, so validate its shape first.
    cert_result = nresult.payload.get(constants.NV_CLIENT_CERT, None)
    if not (isinstance(cert_result, (tuple, list)) and
            len(cert_result) == 2):
      self._ErrorIf(
        True, constants.CV_ECLUSTERCLIENTCERT, None,
        "Node '%s' did not return valid client certificate verification"
        " data", node.uuid)
      continue

    (errcode, msg) = cert_result

    self._ErrorIf(
      errcode is not None, constants.CV_ECLUSTERCLIENTCERT, None,
      "Client certificate of node '%s' failed validation: %s (code '%s')",
      node.uuid, msg, errcode)

    if errcode:
      continue

    # On success the message slot carries the certificate digest
    digest = msg
    if node.master_candidate:
      if node.uuid in candidate_certs:
        # "Expected" is the configured map entry, "Got" is what the node
        # actually reported.
        self._ErrorIf(
          digest != candidate_certs[node.uuid],
          constants.CV_ECLUSTERCLIENTCERT, None,
          "Client certificate digest of master candidate '%s' does not"
          " match its entry in the cluster's map of master candidate"
          " certificates. Expected: %s Got: %s", node.uuid,
          candidate_certs[node.uuid], digest)
      else:
        self._ErrorIf(
          True, constants.CV_ECLUSTERCLIENTCERT, None,
          "The master candidate '%s' does not have an entry in the"
          " map of candidate certificates.", node.uuid)
        self._ErrorIf(
          digest in candidate_certs.values(),
          constants.CV_ECLUSTERCLIENTCERT, None,
          "Master candidate '%s' is using a certificate of another node.",
          node.uuid)
    else:
      self._ErrorIf(
        node.uuid in candidate_certs,
        constants.CV_ECLUSTERCLIENTCERT, None,
        "Node '%s' is not a master candidate, but still listed in the"
        " map of master candidate certificates.", node.uuid)
      self._ErrorIf(
        (node.uuid not in candidate_certs) and
        (digest in candidate_certs.values()),
        constants.CV_ECLUSTERCLIENTCERT, None,
        "Node '%s' is not a master candidate and is incorrectly using a"
        " certificate of another node which is master candidate.",
        node.uuid)
1041
def _VerifySshSetup(self, nodes, all_nvinfo):
  """Evaluates the verification results of the SSH setup and clutter test.

  Offline nodes are skipped. A node whose verify result failed or carries
  no payload is reported and the evaluation continues with the next node.

  @param nodes: List of L{objects.Node} objects
  @param all_nvinfo: RPC results

  """
  for node in nodes:
    if node.offline:
      continue

    nresult = all_nvinfo[node.uuid]
    if nresult.fail_msg or not nresult.payload:
      self._ErrorIf(True, constants.CV_ENODESSH, node.name,
                    "Could not verify the SSH setup of this node.")
      # Move on to the next node: returning here would silently drop the
      # SSH results of every node that happens to come later.
      continue

    for ssh_test in [constants.NV_SSH_SETUP, constants.NV_SSH_CLUTTER]:
      result = nresult.payload.get(ssh_test, None)
      if isinstance(result, list):
        error_msg = " ".join(result)
      elif result:
        # Unexpected non-list result: still show it rather than reporting
        # an error with an empty text
        error_msg = str(result)
      else:
        error_msg = ""
      self._ErrorIf(result,
                    constants.CV_ENODESSH, None, error_msg)
1063
def _VerifyFiles(self, nodes, master_node_uuid, all_nvinfo, filemap):
  """Verifies file checksums collected from all nodes.

  @param nodes: List of L{objects.Node} objects
  @param master_node_uuid: UUID of master node
  @param all_nvinfo: RPC results
  @type filemap: tuple
  @param filemap: the four file sets C{(files_all, files_opt, files_mc,
    files_vm)}: files checked on all nodes, files that are optional (they
    must exist on all or no nodes), files checked on master candidates and
    the master node, and files checked on vm_capable nodes

  """
  # Unpacked here rather than in the signature: tuple parameters are
  # Python 2-only syntax (PEP 3113). Callers passing the tuple positionally
  # are not affected.
  (files_all, files_opt, files_mc, files_vm) = filemap

  def _NodeNames(node_uuids):
    """Returns the sorted, comma-joined names of the given nodes."""
    return utils.CommaJoin(
      utils.NiceSort([self.cfg.GetNodeName(uuid) for uuid in node_uuids]))

  # Define functions determining which nodes to consider for a file
  files2nodefn = [
    (files_all, None),
    (files_mc, lambda node: (node.master_candidate or
                             node.uuid == master_node_uuid)),
    (files_vm, lambda node: node.vm_capable),
    ]

  # Build mapping from filename to the UUIDs of nodes which should have it
  nodefiles = {}
  for (files, fn) in files2nodefn:
    if fn is None:
      filenodes = nodes
    else:
      filenodes = [node for node in nodes if fn(node)]
    expected_uuids = frozenset(node.uuid for node in filenodes)
    nodefiles.update((filename, expected_uuids) for filename in files)

  assert set(nodefiles) == (files_all | files_mc | files_vm)

  # Per file: mapping from checksum to the set of nodes reporting it
  fileinfo = dict((filename, {}) for filename in nodefiles)
  ignore_nodes = set()

  for node in nodes:
    if node.offline:
      ignore_nodes.add(node.uuid)
      continue

    nresult = all_nvinfo[node.uuid]

    if nresult.fail_msg or not nresult.payload:
      node_files = None
    else:
      fingerprints = nresult.payload.get(constants.NV_FILELIST, {})
      node_files = dict((vcluster.LocalizeVirtualPath(key), value)
                        for (key, value) in fingerprints.items())

    test = not (node_files and isinstance(node_files, dict))
    self._ErrorIf(test, constants.CV_ENODEFILECHECK, node.name,
                  "Node did not return file checksum data")
    if test:
      ignore_nodes.add(node.uuid)
      continue

    # Build per-checksum mapping from filename to nodes having it
    for (filename, checksum) in node_files.items():
      assert filename in nodefiles
      fileinfo[filename].setdefault(checksum, set()).add(node.uuid)

  for (filename, checksums) in fileinfo.items():
    assert compat.all(len(i) > 10 for i in checksums), "Invalid checksum"

    # Nodes having the file
    with_file = frozenset(node_uuid
                          for node_uuids in checksums.values()
                          for node_uuid in node_uuids) - ignore_nodes

    expected_nodes = nodefiles[filename] - ignore_nodes

    # Nodes missing file
    missing_file = expected_nodes - with_file

    if filename in files_opt:
      # All or no nodes
      self._ErrorIf(missing_file and missing_file != expected_nodes,
                    constants.CV_ECLUSTERFILECHECK, None,
                    "File %s is optional, but it must exist on all or no"
                    " nodes (not found on %s)",
                    filename, _NodeNames(missing_file))
    else:
      self._ErrorIf(missing_file, constants.CV_ECLUSTERFILECHECK, None,
                    "File %s is missing from node(s) %s", filename,
                    _NodeNames(missing_file))

      # Warn if a node has a file it shouldn't
      unexpected = with_file - expected_nodes
      self._ErrorIf(unexpected,
                    constants.CV_ECLUSTERFILECHECK, None,
                    "File %s should not exist on node(s) %s",
                    filename, _NodeNames(unexpected))

    # See if there are multiple versions of the file
    test = len(checksums) > 1
    if test:
      variants = ["variant %s on %s" % (idx + 1, _NodeNames(node_uuids))
                  for (idx, (checksum, node_uuids)) in
                    enumerate(sorted(checksums.items()))]
    else:
      variants = []

    self._ErrorIf(test, constants.CV_ECLUSTERFILECHECK, None,
                  "File %s found with %s different checksums (%s)",
                  filename, len(checksums), "; ".join(variants))
1177
def _VerifyNodeDrbdHelper(self, ninfo, nresult, drbd_helper):
  """Verify the DRBD usermode helper reported by a node.

  Nothing is checked when no helper is configured.

  @type ninfo: L{objects.Node}
  @param ninfo: the node to check
  @param nresult: the remote results for the node
  @param drbd_helper: the configured DRBD usermode helper

  """
  if not drbd_helper:
    return

  helper_result = nresult.get(constants.NV_DRBDHELPER, None)
  self._ErrorIf(helper_result is None, constants.CV_ENODEDRBDHELPER,
                ninfo.name, "no drbd usermode helper returned")
  if not helper_result:
    return

  (status, payload) = helper_result
  self._ErrorIf(not status, constants.CV_ENODEDRBDHELPER, ninfo.name,
                "drbd usermode helper check unsuccessful: %s", payload)
  # Only a successful check tells us which helper the node actually uses
  self._ErrorIf(status and (payload != drbd_helper),
                constants.CV_ENODEDRBDHELPER, ninfo.name,
                "wrong drbd usermode helper: %s", payload)
1195
@staticmethod
def _ComputeDrbdMinors(ninfo, instanceinfo, disks_info, drbd_map, error_if):
  """Gives the DRBD information in a map for a node.

  @type ninfo: L{objects.Node}
  @param ninfo: the node to check
  @param instanceinfo: the dict of instances
  @param disks_info: the dict of disks
  @param drbd_map: the DRBD map as returned by
    L{ganeti.config.ConfigWriter.ComputeDRBDMap}
  @type error_if: callable like L{_ErrorIf}
  @param error_if: The error reporting function
  @return: dict from minor number to (disk_uuid, instance_uuid, active)

  """
  node_drbd = {}
  for (minor, disk_uuid) in drbd_map[ninfo.uuid].items():
    is_ghost = disk_uuid not in disks_info
    error_if(is_ghost, constants.CV_ECLUSTERCFG, None,
             "ghost disk '%s' in temporary DRBD map", disk_uuid)
    if is_ghost:
      # A ghost disk should not be active; recording it nevertheless avoids
      # a second warning (unallocated minor in use) for the same minor
      node_drbd[minor] = (disk_uuid, None, False)
      continue

    # Find the first instance the disk is attached to, if any
    owner_uuid = None
    active = False
    for (inst_uuid, inst) in instanceinfo.items():
      if disk_uuid in inst.disks:
        owner_uuid = inst_uuid
        active = inst.disks_active
        break
    node_drbd[minor] = (disk_uuid, owner_uuid, active)

  return node_drbd
1231
def _VerifyNodeDrbd(self, ninfo, nresult, instanceinfo, disks_info,
                    drbd_helper, drbd_map):
  """Verifies the node DRBD status.

  Checks the usermode helper, then compares the minors the configuration
  assigns to the node with the minors the node reports as in use.

  @type ninfo: L{objects.Node}
  @param ninfo: the node to check
  @param nresult: the remote results for the node
  @param instanceinfo: the dict of instances
  @param disks_info: the dict of disks
  @param drbd_helper: the configured DRBD usermode helper
  @param drbd_map: the DRBD map as returned by
    L{ganeti.config.ConfigWriter.ComputeDRBDMap}

  """
  self._VerifyNodeDrbdHelper(ninfo, nresult, drbd_helper)

  # Minors this node should have according to the configuration
  node_drbd = self._ComputeDrbdMinors(ninfo, instanceinfo, disks_info,
                                      drbd_map, self._ErrorIf)

  # Minors the node says are in use
  used_minors = nresult.get(constants.NV_DRBDLIST, [])
  unparsable = not isinstance(used_minors, (tuple, list))
  self._ErrorIf(unparsable, constants.CV_ENODEDRBD, ninfo.name,
                "cannot parse drbd status file: %s", str(used_minors))
  if unparsable:
    # Without a usable list there is nothing to compare against
    return

  for (minor, (disk_uuid, inst_uuid, must_exist)) in node_drbd.items():
    if inst_uuid is None:
      attached = "(detached)"
    else:
      attached = ("(attached in instance '%s')" %
                  self.cfg.GetInstanceName(inst_uuid))
    inactive = minor not in used_minors and must_exist
    self._ErrorIf(inactive, constants.CV_ENODEDRBD, ninfo.name,
                  "drbd minor %d of disk %s %s is not active",
                  minor, disk_uuid, attached)

  for minor in used_minors:
    self._ErrorIf(minor not in node_drbd, constants.CV_ENODEDRBD,
                  ninfo.name, "unallocated drbd minor %d is in use", minor)
1275
def _UpdateNodeOS(self, ninfo, nresult, nimg):
  """Builds the node OS structures.

  Stores the validation outcome in C{nimg.os_fail} and, for valid data,
  a dict from OS name to the list of its entries in C{nimg.oslist}.

  @type ninfo: L{objects.Node}
  @param ninfo: the node to check
  @param nresult: the remote results for the node
  @param nimg: the node image object

  """
  remote_os = nresult.get(constants.NV_OSLIST, None)
  # Valid data is a list whose entries are all lists of eight fields
  invalid = not (isinstance(remote_os, list) and
                 compat.all(isinstance(v, list) and len(v) == 8
                            for v in remote_os))

  self._ErrorIf(invalid, constants.CV_ENODEOS, ninfo.name,
                "node hasn't returned valid OS data")

  nimg.os_fail = invalid
  if invalid:
    return

  os_dict = {}
  for entry in remote_os:
    (name, os_path, status, diagnose,
     variants, parameters, api_ver, trusted) = entry

    # JSON lacks a real tuple type, so the parameters arrive as a list of
    # lists; turn the inner lists into tuples to make them hashable
    param_tuples = [tuple(v) for v in parameters]
    os_dict.setdefault(name, []).append(
      (os_path, status, diagnose,
       set(variants), set(param_tuples), set(api_ver),
       trusted))

  nimg.oslist = os_dict
1315
def _VerifyNodeOS(self, ninfo, nimg, base):
  """Verifies the node OS list.

  Each OS of the node is checked for validity and duplicate entries and
  then compared with the entry of the reference node; finally OSes that
  only the reference node has are reported.

  @type ninfo: L{objects.Node}
  @param ninfo: the node to check
  @param nimg: the node image object
  @param base: the 'template' node we match against (e.g. from the master)

  """
  assert not nimg.os_fail, "Entered _VerifyNodeOS with failed OS rpc?"

  def _BeautifyParams(params):
    """Formats (key, value) pairs as "key: value" strings."""
    return ["%s: %s" % (key, value) for (key, value) in params]

  for (os_name, os_data) in nimg.oslist.items():
    assert os_data, "Empty OS status for OS %s?!" % os_name
    # Only the first entry is looked at; it shadows any further ones
    (f_path, f_status, f_diag, f_var, f_param, f_api, f_trusted) = os_data[0]
    self._ErrorIf(not f_status, constants.CV_ENODEOS, ninfo.name,
                  "Invalid OS %s (located at %s): %s",
                  os_name, f_path, f_diag)
    self._ErrorIf(len(os_data) > 1, constants.CV_ENODEOS, ninfo.name,
                  "OS '%s' has multiple entries"
                  " (first one shadows the rest): %s",
                  os_name, utils.CommaJoin([v[0] for v in os_data]))

    # comparisons with the 'base' image
    is_extra = os_name not in base.oslist
    self._ErrorIf(is_extra, constants.CV_ENODEOS, ninfo.name,
                  "Extra OS %s not present on reference node (%s)",
                  os_name, self.cfg.GetNodeName(base.uuid))
    if is_extra:
      continue

    base_data = base.oslist[os_name]
    assert base_data, "Base node has empty OS status?"
    (_, b_status, _, b_var, b_param, b_api, b_trusted) = base_data[0]
    if not b_status:
      # base OS is invalid, skipping
      continue

    collection_checks = [
      ("API version", f_api, b_api),
      ("variants list", f_var, b_var),
      ("parameters", _BeautifyParams(f_param), _BeautifyParams(b_param)),
      ]
    for (kind, ours, theirs) in collection_checks:
      self._ErrorIf(ours != theirs, constants.CV_ENODEOS, ninfo.name,
                    "OS %s for %s differs from reference node %s:"
                    " [%s] vs. [%s]", kind, os_name,
                    self.cfg.GetNodeName(base.uuid),
                    utils.CommaJoin(sorted(ours)),
                    utils.CommaJoin(sorted(theirs)))

    # The trusted flag is a plain value, so it is shown without sorting
    self._ErrorIf(f_trusted != b_trusted, constants.CV_ENODEOS, ninfo.name,
                  "OS %s for %s differs from reference node %s:"
                  " %s vs. %s", "trusted", os_name,
                  self.cfg.GetNodeName(base.uuid), f_trusted, b_trusted)

  # check any missing OSes
  missing = set(base.oslist) - set(nimg.oslist)
  self._ErrorIf(missing, constants.CV_ENODEOS, ninfo.name,
                "OSes present on reference node %s"
                " but missing on this node: %s",
                self.cfg.GetNodeName(base.uuid), utils.CommaJoin(missing))
1371
def _VerifyAcceptedFileStoragePaths(self, ninfo, nresult, is_master):
  """Verifies paths in L{pathutils.FILE_STORAGE_PATHS_FILE}.

  The master node is expected to return the list of forbidden paths when
  file or shared file storage is enabled; in every other case the result
  must not contain that entry at all.

  @type ninfo: L{objects.Node}
  @param ninfo: the node to check
  @param nresult: the remote results for the node
  @type is_master: bool
  @param is_master: Whether node is the master node

  """
  cluster = self.cfg.GetClusterInfo()
  paths_expected = (is_master and
                    (cluster.IsFileStorageEnabled() or
                     cluster.IsSharedFileStorageEnabled()))

  if not paths_expected:
    self._ErrorIf(constants.NV_ACCEPTED_STORAGE_PATHS in nresult,
                  constants.CV_ENODEFILESTORAGEPATHS, ninfo.name,
                  "Node should not have returned forbidden file storage"
                  " paths")
    return

  if constants.NV_ACCEPTED_STORAGE_PATHS not in nresult:
    # This should never happen
    self._ErrorIf(True, constants.CV_ENODEFILESTORAGEPATHS, ninfo.name,
                  "Node did not return forbidden file storage paths")
    return

  fspaths = nresult[constants.NV_ACCEPTED_STORAGE_PATHS]
  self._ErrorIf(fspaths, constants.CV_ENODEFILESTORAGEPATHS, ninfo.name,
                "Found forbidden file storage paths: %s",
                utils.CommaJoin(fspaths))
1401
def _VerifyStoragePaths(self, ninfo, nresult, file_disk_template,
                        verify_key, error_key):
  """Verifies (file) storage paths.

  An error is reported when the disk template is enabled and the node
  result contains an entry for C{verify_key}.

  @type ninfo: L{objects.Node}
  @param ninfo: the node to check
  @param nresult: the remote results for the node
  @type file_disk_template: string
  @param file_disk_template: file-based disk template, whose directory
    is supposed to be verified
  @type verify_key: string
  @param verify_key: key for the verification map of this file
    verification step
  @param error_key: error key to be added to the verification results
    in case something goes wrong in this verification step

  """
  assert (file_disk_template in
          utils.storage.GetDiskTemplatesOfStorageTypes(
            constants.ST_FILE, constants.ST_SHARED_FILE,
            constants.ST_GLUSTER))

  if not self.cfg.GetClusterInfo().IsDiskTemplateEnabled(file_disk_template):
    return

  # The message is formatted up front, so no format arguments are passed on
  unusable_msg = ("The configured %s storage path is unusable: %s" %
                  (file_disk_template, nresult.get(verify_key)))
  self._ErrorIf(verify_key in nresult, error_key, ninfo.name, unusable_msg)
1430
def _VerifyFileStoragePaths(self, ninfo, nresult):
  """Verifies the storage path of the file disk template.

  @see: C{_VerifyStoragePaths}

  """
  template = constants.DT_FILE
  verify_key = constants.NV_FILE_STORAGE_PATH
  error_key = constants.CV_ENODEFILESTORAGEPATHUNUSABLE
  self._VerifyStoragePaths(ninfo, nresult, template, verify_key, error_key)
1441
def _VerifySharedFileStoragePaths(self, ninfo, nresult):
  """Verifies the storage path of the shared file disk template.

  @see: C{_VerifyStoragePaths}

  """
  template = constants.DT_SHARED_FILE
  verify_key = constants.NV_SHARED_FILE_STORAGE_PATH
  error_key = constants.CV_ENODESHAREDFILESTORAGEPATHUNUSABLE
  self._VerifyStoragePaths(ninfo, nresult, template, verify_key, error_key)
1452
def _VerifyGlusterStoragePaths(self, ninfo, nresult):
  """Verifies the storage path of the Gluster disk template.

  @see: C{_VerifyStoragePaths}

  """
  template = constants.DT_GLUSTER
  verify_key = constants.NV_GLUSTER_STORAGE_PATH
  error_key = constants.CV_ENODEGLUSTERSTORAGEPATHUNUSABLE
  self._VerifyStoragePaths(ninfo, nresult, template, verify_key, error_key)
1463
def _VerifyOob(self, ninfo, nresult):
  """Verifies out of band functionality of a node.

  Every non-empty entry of the returned OOB path results is reported as
  an error, using the entry itself as the message.

  @type ninfo: L{objects.Node}
  @param ninfo: the node to check
  @param nresult: the remote results for the node

  """
  # The oob helper is invoked on the master, so only nodes that are master
  # candidates or master capable are looked at
  if not (ninfo.master_candidate or ninfo.master_capable):
    return
  if constants.NV_OOB_PATHS not in nresult:
    return

  for path_result in nresult[constants.NV_OOB_PATHS]:
    self._ErrorIf(path_result, constants.CV_ENODEOOBPATH,
                  ninfo.name, path_result)
1479
def _UpdateNodeVolumes(self, ninfo, nresult, nimg, vg_name):
  """Verifies and updates the node volume data.

  This function will update a L{NodeImage}'s internal structures
  with data from the remote call. C{nimg.lvm_fail} is cleared only when
  a volume group is configured and the node returned a dict of volumes.

  @type ninfo: L{objects.Node}
  @param ninfo: the node to check
  @param nresult: the remote results for the node
  @param nimg: the node image object
  @param vg_name: the configured VG name

  """
  # Assume failure until valid volume data has been stored
  nimg.lvm_fail = True
  lvdata = nresult.get(constants.NV_LVLIST, "Missing LV data")

  if vg_name is None:
    return

  if isinstance(lvdata, basestring):
    # A string in place of the volume data is an error message
    self._ErrorIf(True, constants.CV_ENODELVM, ninfo.name,
                  "LVM problem on node: %s", utils.SafeEncode(lvdata))
    return

  if not isinstance(lvdata, dict):
    self._ErrorIf(True, constants.CV_ENODELVM, ninfo.name,
                  "rpc call to node failed (lvlist)")
    return

  nimg.volumes = lvdata
  nimg.lvm_fail = False
1506
def _UpdateNodeInstances(self, ninfo, nresult, nimg):
  """Verifies and updates the node instance list.

  If the listing was successful, the UUIDs of the listed instances are
  stored in C{nimg.instances}. Otherwise C{nimg.hyp_fail} is set.

  @type ninfo: L{objects.Node}
  @param ninfo: the node to check
  @param nresult: the remote results for the node
  @param nimg: the node image object

  """
  idata = nresult.get(constants.NV_INSTANCELIST, None)
  failed = not isinstance(idata, list)
  self._ErrorIf(failed, constants.CV_ENODEHV, ninfo.name,
                "rpc call to node failed (instancelist): %s",
                utils.SafeEncode(str(idata)))
  if failed:
    nimg.hyp_fail = True
    return

  # The node reports instance names; the image keeps their UUIDs
  nimg.instances = [inst_uuid for (inst_uuid, _) in
                    self.cfg.GetMultiInstanceInfoByName(idata)]
1530
def _UpdateNodeInfo(self, ninfo, nresult, nimg, vg_name):
  """Verifies and computes a node information map.

  Stores the free memory in C{nimg.mfree} and, when a volume group is
  configured, the value reported for it in C{nimg.dfree}.

  @type ninfo: L{objects.Node}
  @param ninfo: the node to check
  @param nresult: the remote results for the node
  @param nimg: the node image object
  @param vg_name: the configured VG name

  """
  # try to read free memory (from the hypervisor)
  hv_info = nresult.get(constants.NV_HVINFO, None)
  hv_ok = isinstance(hv_info, dict) and "memory_free" in hv_info
  self._ErrorIf(not hv_ok, constants.CV_ENODEHV, ninfo.name,
                "rpc call to node failed (hvinfo)")
  if hv_ok:
    try:
      nimg.mfree = int(hv_info["memory_free"])
    except (ValueError, TypeError):
      self._ErrorIf(True, constants.CV_ENODERPC, ninfo.name,
                    "node returned invalid nodeinfo, check hypervisor")

  # FIXME: devise a free space model for file based instances as well
  if vg_name is None:
    return

  vg_ok = (constants.NV_VGLIST in nresult and
           vg_name in nresult[constants.NV_VGLIST])
  self._ErrorIf(not vg_ok, constants.CV_ENODELVM, ninfo.name,
                "node didn't return data for the volume group '%s'"
                " - it is either missing or broken", vg_name)
  if vg_ok:
    try:
      nimg.dfree = int(nresult[constants.NV_VGLIST][vg_name])
    except (ValueError, TypeError):
      self._ErrorIf(True, constants.CV_ENODERPC, ninfo.name,
                    "node returned invalid LVM info, check LVM status")
1566
def _CollectDiskInfo(self, node_uuids, node_image, instanceinfo):
  """Gets per-disk status information for all instances.

  @type node_uuids: list of strings
  @param node_uuids: Node UUIDs
  @param node_image: mapping of node UUID to node image object; the
    C{pinst} and C{sinst} attributes of the images are read
  @type instanceinfo: dict of (UUID, L{objects.Instance})
  @param instanceinfo: Instance objects
  @rtype: {instance: {node: [(succes, payload)]}}
  @return: a dictionary of per-instance dictionaries with nodes as
    keys and disk information as values; the disk information is a
    list of tuples (success, payload)

  """
  disks_by_node = {}
  rpc_args_by_node = {}
  diskless_instances = set()
  nodisk_instances = set()

  for nuuid in node_uuids:
    image = node_image[nuuid]
    node_inst_uuids = list(itertools.chain(image.pinst, image.sinst))
    diskless_instances.update(uuid for uuid in node_inst_uuids
                              if not instanceinfo[uuid].disks)

    disks = []
    for inst_uuid in node_inst_uuids:
      for disk in self.cfg.GetInstanceDisks(inst_uuid):
        disks.append((inst_uuid, disk))

    if not disks:
      nodisk_instances.update(uuid for uuid in node_inst_uuids
                              if instanceinfo[uuid].disks)
      # No need to collect data
      continue

    disks_by_node[nuuid] = disks

    # AnnotateDiskParams already makes copies of the disks
    annotated = []
    for (inst_uuid, dev) in disks:
      instance = instanceinfo[inst_uuid]
      (anno_disk,) = AnnotateDiskParams(instance, [dev], self.cfg)
      annotated.append((anno_disk, instance))

    rpc_args_by_node[nuuid] = annotated

  assert len(disks_by_node) == len(rpc_args_by_node)

  # Collect data from all nodes with disks
  result = self.rpc.call_blockdev_getmirrorstatus_multi(
    list(disks_by_node), rpc_args_by_node)

  assert len(result) == len(disks_by_node)

  instdisk = {}

  for (nuuid, nres) in result.items():
    node = self.cfg.GetNodeInfo(nuuid)
    disks = disks_by_node[node.uuid]

    if nres.offline:
      # No data from this node
      statuses = [(False, "node offline")] * len(disks)
    else:
      fail_msg = nres.fail_msg
      self._ErrorIf(fail_msg, constants.CV_ENODERPC, node.name,
                    "while getting disk information: %s", fail_msg)
      if fail_msg:
        # No data from this node
        statuses = [(False, fail_msg)] * len(disks)
      else:
        statuses = []
        for (idx, entry) in enumerate(nres.payload):
          if not (isinstance(entry, (tuple, list)) and len(entry) == 2):
            logging.warning("Invalid result from node %s, entry %d: %s",
                            node.name, idx, entry)
            entry = (False, "Invalid result from the remote node")
          statuses.append(entry)

    # Statuses are matched to the disks by position
    for ((inst_uuid, _), status) in zip(disks, statuses):
      per_node = instdisk.setdefault(inst_uuid, {})
      per_node.setdefault(node.uuid, []).append(status)

  # Add empty entries for diskless instances.
  for inst_uuid in diskless_instances:
    assert inst_uuid not in instdisk
    instdisk[inst_uuid] = {}
  # ...and disk-full instances that happen to have no disks
  for inst_uuid in nodisk_instances:
    assert inst_uuid not in instdisk
    instdisk[inst_uuid] = {}

  assert compat.all(len(statuses) == len(instanceinfo[inst].disks) and
                    len(nuuids) <= len(
                      self.cfg.GetInstanceNodes(instanceinfo[inst].uuid)) and
                    compat.all(isinstance(s, (tuple, list)) and
                               len(s) == 2 for s in statuses)
                    for inst, nuuids in instdisk.items()
                    for nuuid, statuses in nuuids.items())
  if __debug__:
    instdisk_keys = set(instdisk)
    instanceinfo_keys = set(instanceinfo)
    assert instdisk_keys == instanceinfo_keys, \
      ("instdisk keys (%s) do not match instanceinfo keys (%s)" %
       (instdisk_keys, instanceinfo_keys))

  return instdisk
1675
1676 @staticmethod
1677 def _SshNodeSelector(group_uuid, all_nodes):
1678 """Create endless iterators for all potential SSH check hosts.
1679
1680 """
1681 nodes = [node for node in all_nodes
1682 if (node.group != group_uuid and
1683 not node.offline)]
1684 keyfunc = operator.attrgetter("group")
1685
1686 return map(itertools.cycle,
1687 [sorted(map(operator.attrgetter("name"), names))
1688 for _, names in itertools.groupby(sorted(nodes, key=keyfunc),
1689 keyfunc)])
1690
@classmethod
def _SelectSshCheckNodes(cls, group_nodes, group_uuid, all_nodes):
  """Choose which nodes should talk to which other nodes.

  We will make nodes contact all nodes in their group, and one node from
  every other group.

  @param group_nodes: node objects of the group being verified
  @param group_uuid: UUID of the group being verified
  @param all_nodes: node objects to pick nodes of other groups from
  @rtype: tuple of (list of strings, dict of strings to list of strings,
    list of strings)
  @return: a tuple containing the sorted names of all online nodes of the
    group, a dictionary mapping each of these names to additional nodes of
    other node groups to which connectivity should be tested, and the
    sorted names of all online master candidates of the group

  @warning: This algorithm has a known issue if one node group is much
    smaller than others (e.g. just one node). In such a case all other
    nodes will talk to the single node.

  """
  online_nodes = sorted(node.name for node in group_nodes
                        if not node.offline)
  online_mcs = sorted(node.name for node in group_nodes
                      if node.master_candidate and not node.offline)

  # One endless name iterator per other group; each online node draws the
  # next name from every one of them
  sel = cls._SshNodeSelector(group_uuid, all_nodes)

  extra_targets = {}
  for name in online_nodes:
    extra_targets[name] = sorted([next(node_iter) for node_iter in sel])

  return (online_nodes, extra_targets, online_mcs)
1718
1719 def _PrepareSshSetupCheck(self):
1720 """Prepare the input data for the SSH setup verification.
1721
1722 """
1723 all_nodes_info = self.cfg.GetAllNodesInfo()
1724 potential_master_candidates = self.cfg.GetPotentialMasterCandidates()
1725 node_status = [
1726 (uuid, node_info.name, node_info.master_candidate,
1727 node_info.name in potential_master_candidates, not node_info.offline)
1728 for (uuid, node_info) in all_nodes_info.items()]
1729 return node_status
1730
def BuildHooksEnv(self):
  """Build hooks env.

  Cluster-Verify hooks are only run in the post phase; their failure makes
  the output be logged in the verify output and the verification fail.

  @rtype: dict
  @return: the space-joined cluster tags under C{CLUSTER_TAGS} and, per
    node of C{self.my_node_info}, its space-joined tags under
    C{NODE_TAGS_<node name>}

  """
  cluster_tags = self.cfg.GetClusterInfo().GetTags()
  env = {"CLUSTER_TAGS": " ".join(cluster_tags)}

  for node in self.my_node_info.values():
    env["NODE_TAGS_%s" % node.name] = " ".join(node.GetTags())

  return env
1746
def BuildHooksNodes(self):
  """Build hooks nodes.

  @rtype: tuple of (list, list)
  @return: an empty list and the list of keys of C{self.my_node_info}

  """
  node_keys = list(self.my_node_info)
  return ([], node_keys)
1752
@staticmethod
def _VerifyOtherNotes(feedback_fn, i_non_redundant, i_non_a_balanced,
                      i_offline, n_offline, n_drained):
  """Emit the "Other Notes" summary section of the verification output.

  A notice line is only emitted for values that are truthy (non-empty
  lists, non-zero counts).

  @param feedback_fn: function used to send feedback back to the caller
  @param i_non_redundant: collection of non-redundant instances
  @param i_non_a_balanced: collection of non-auto-balanced instances
  @param i_offline: number of offline instances
  @param n_offline: number of offline nodes
  @param n_drained: number of drained nodes

  """
  feedback_fn("* Other Notes")

  # Each entry: (value tested for truthiness, function turning the value
  # into the reported count, message template). The count function is only
  # applied once the value is known to be truthy.
  as_is = lambda value: value
  notices = [
    (i_non_redundant, len,
     " - NOTICE: %d non-redundant instance(s) found."),
    (i_non_a_balanced, len,
     " - NOTICE: %d non-auto-balanced instance(s) found."),
    (i_offline, as_is, " - NOTICE: %d offline instance(s) found."),
    (n_offline, as_is, " - NOTICE: %d offline node(s) found."),
    (n_drained, as_is, " - NOTICE: %d drained node(s) found."),
    ]

  for (value, to_count, template) in notices:
    if value:
      feedback_fn(template % to_count(value))
1773
def _VerifyExclusionTags(self, nodename, pinst, ctags):
  """Verify that all instances have different exclusion tags.

  Collects, per primary instance, the tags accepted by
  L{utils.IsGoodTag} for the exclusion prefixes derived from the cluster
  tags, and reports a warning listing every such tag carried by more than
  one of the instances.

  @type nodename: string
  @param nodename: the name of the node for which the check is done
  @type pinst: list of string
  @param pinst: list of UUIDs of those instances having the given node
    as primary node
  @type ctags: list of string
  @param ctags: tags of the cluster

  """
  prefixes = utils.GetExclusionPrefixes(ctags)
  seen = set([])
  duplicated = set([])

  for inst_uuid in pinst:
    raw_tags = self.my_inst_info[inst_uuid].tags
    if raw_tags is None:
      raw_tags = []
    excl_tags = set([tag for tag in raw_tags
                     if utils.IsGoodTag(prefixes, tag)])
    # Tags already carried by an earlier instance on this node
    clashes = excl_tags & seen
    if clashes:
      duplicated = duplicated | clashes
    seen = seen | excl_tags

  has_conflicts = len(duplicated) > 0
  self._ErrorIf(has_conflicts, constants.CV_EEXTAGS, nodename,
                "Tags where there is more than one instance: %s",
                list(duplicated), code=constants.CV_WARNING)
1803
def Exec(self, feedback_fn): # pylint: disable=R0915
  """Verify integrity of the node group, performing various test on nodes.

  The method builds the parameters for the node-verify RPC, runs it on the
  nodes of this group (and, for the file list only, on selected nodes
  outside the group), and then runs the per-node, per-instance, orphan
  volume and N+1 memory checks on the collected results.

  @param feedback_fn: function used to send feedback back to the caller
  @rtype: boolean
  @return: C{True} for an empty node group; otherwise C{not self.bad},
    where C{self.bad} is reset to C{False} at the start of the run
    (presumably set by the error-reporting helpers; confirm in _ErrorIf)

  """
  # This method has too many local variables. pylint: disable=R0914
  feedback_fn("* Verifying group '%s'" % self.group_info.name)

  if not self.my_node_uuids:
    # empty node group
    feedback_fn("* Empty node group, skipping verification")
    return True

  self.bad = False
  verbose = self.op.verbose
  self._feedback_fn = feedback_fn

  vg_name = self.cfg.GetVGName()
  drbd_helper = self.cfg.GetDRBDHelper()
  cluster = self.cfg.GetClusterInfo()
  hypervisors = cluster.enabled_hypervisors
  node_data_list = self.my_node_info.values()

  i_non_redundant = [] # Non redundant instances
  i_non_a_balanced = [] # Non auto-balanced instances
  i_offline = 0 # Count of offline instances
  n_offline = 0 # Count of offline nodes
  n_drained = 0 # Count of nodes being drained
  node_vol_should = {}

  # FIXME: verify OS list

  # File verification
  filemap = ComputeAncillaryFiles(cluster, False)

  # do local checksums
  master_node_uuid = self.master_node = self.cfg.GetMasterNode()
  master_ip = self.cfg.GetMasterIP()

  feedback_fn("* Gathering data (%d nodes)" % len(self.my_node_uuids))

  user_scripts = []
  if self.cfg.GetUseExternalMipScript():
    user_scripts.append(pathutils.EXTERNAL_MASTER_SETUP_SCRIPT)

  node_verify_param = {
    constants.NV_FILELIST:
      map(vcluster.MakeVirtualPath,
          utils.UniqueSequence(filename
                               for files in filemap
                               for filename in files)),
    constants.NV_NODELIST:
      self._SelectSshCheckNodes(node_data_list, self.group_uuid,
                                self.all_node_info.values()),
    constants.NV_HYPERVISOR: hypervisors,
    constants.NV_HVPARAMS:
      _GetAllHypervisorParameters(cluster, self.all_inst_info.values()),
    constants.NV_NODENETTEST: [(node.name, node.primary_ip, node.secondary_ip)
                               for node in node_data_list
                               if not node.offline],
    constants.NV_INSTANCELIST: hypervisors,
    constants.NV_VERSION: None,
    constants.NV_HVINFO: self.cfg.GetHypervisorType(),
    constants.NV_NODESETUP: None,
    constants.NV_TIME: None,
    constants.NV_MASTERIP: (self.cfg.GetMasterNodeName(), master_ip),
    constants.NV_OSLIST: None,
    constants.NV_NONVMNODES: self.cfg.GetNonVmCapableNodeNameList(),
    constants.NV_USERSCRIPTS: user_scripts,
    constants.NV_CLIENT_CERT: None,
    }

  if self.cfg.GetClusterInfo().modify_ssh_setup:
    node_verify_param[constants.NV_SSH_SETUP] = \
      (self._PrepareSshSetupCheck(), self.cfg.GetClusterInfo().ssh_key_type)
    if self.op.verify_clutter:
      node_verify_param[constants.NV_SSH_CLUTTER] = True

  if vg_name is not None:
    node_verify_param[constants.NV_VGLIST] = None
    node_verify_param[constants.NV_LVLIST] = vg_name
    node_verify_param[constants.NV_PVLIST] = [vg_name]

  if cluster.IsDiskTemplateEnabled(constants.DT_DRBD8):
    if drbd_helper:
      node_verify_param[constants.NV_DRBDVERSION] = None
      node_verify_param[constants.NV_DRBDLIST] = None
      node_verify_param[constants.NV_DRBDHELPER] = drbd_helper

  if cluster.IsFileStorageEnabled() or \
      cluster.IsSharedFileStorageEnabled():
    # Load file storage paths only from master node
    node_verify_param[constants.NV_ACCEPTED_STORAGE_PATHS] = \
      self.cfg.GetMasterNodeName()
    if cluster.IsFileStorageEnabled():
      node_verify_param[constants.NV_FILE_STORAGE_PATH] = \
        cluster.file_storage_dir
    if cluster.IsSharedFileStorageEnabled():
      node_verify_param[constants.NV_SHARED_FILE_STORAGE_PATH] = \
        cluster.shared_file_storage_dir

  # bridge checks
  # FIXME: this needs to be changed per node-group, not cluster-wide
  bridges = set()
  default_nicpp = cluster.nicparams[constants.PP_DEFAULT]
  if default_nicpp[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
    bridges.add(default_nicpp[constants.NIC_LINK])
  # The values of my_inst_info are instance objects (not UUIDs)
  for inst in self.my_inst_info.values():
    for nic in inst.nics:
      full_nic = cluster.SimpleFillNIC(nic.nicparams)
      if full_nic[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
        bridges.add(full_nic[constants.NIC_LINK])

  if bridges:
    node_verify_param[constants.NV_BRIDGES] = list(bridges)

  # Build our expected cluster state
  node_image = dict((node.uuid, self.NodeImage(offline=node.offline,
                                               uuid=node.uuid,
                                               vm_capable=node.vm_capable))
                    for node in node_data_list)

  # Gather OOB paths
  oob_paths = []
  for node in self.all_node_info.values():
    path = SupportsOob(self.cfg, node)
    if path and path not in oob_paths:
      oob_paths.append(path)

  if oob_paths:
    node_verify_param[constants.NV_OOB_PATHS] = oob_paths

  for inst_uuid in self.my_inst_uuids:
    instance = self.my_inst_info[inst_uuid]
    if instance.admin_state == constants.ADMINST_OFFLINE:
      i_offline += 1

    # Nodes used by the instance but not part of this group get a
    # placeholder image; nodes unknown to the configuration are ghosts
    inst_nodes = self.cfg.GetInstanceNodes(instance.uuid)
    for nuuid in inst_nodes:
      if nuuid not in node_image:
        gnode = self.NodeImage(uuid=nuuid)
        gnode.ghost = (nuuid not in self.all_node_info)
        node_image[nuuid] = gnode

    self.cfg.GetInstanceLVsByNode(instance.uuid, lvmap=node_vol_should)

    pnode = instance.primary_node
    node_image[pnode].pinst.append(instance.uuid)

    for snode in self.cfg.GetInstanceSecondaryNodes(instance.uuid):
      nimg = node_image[snode]
      nimg.sinst.append(instance.uuid)
      if pnode not in nimg.sbp:
        nimg.sbp[pnode] = []
      nimg.sbp[pnode].append(instance.uuid)

  es_flags = rpc.GetExclusiveStorageForNodes(self.cfg,
                                             self.my_node_info.keys())
  # The value of exclusive_storage should be the same across the group, so if
  # it's True for at least a node, we act as if it were set for all the nodes
  self._exclusive_storage = compat.any(es_flags.values())
  if self._exclusive_storage:
    node_verify_param[constants.NV_EXCLUSIVEPVS] = True

  # At this point, we have the in-memory data structures complete,
  # except for the runtime information, which we'll gather next

  # NOTE: Here we lock the configuration for the duration of RPC calls,
  # which means that the cluster configuration changes are blocked during
  # this period.
  # This is something that should be done only exceptionally and only for
  # justified cases!
  # In this case, we need the lock as we can only verify the integrity of
  # configuration files on MCs only if we know nobody else is modifying it.
  # FIXME: The check for integrity of config.data should be moved to
  # WConfD, which is the only one who can otherwise ensure nobody
  # will modify the configuration during the check.
  with self.cfg.GetConfigManager(shared=True, forcelock=True):
    feedback_fn("* Gathering information about nodes (%s nodes)" %
                len(self.my_node_uuids))
    # Force the configuration to be fully distributed before doing any tests
    self.cfg.FlushConfig()
    # Due to the way our RPC system works, exact response times cannot be
    # guaranteed (e.g. a broken node could run into a timeout). By keeping
    # the time before and after executing the request, we can at least have
    # a time window.
    nvinfo_starttime = time.time()
    # Get lock on the configuration so that nobody modifies it concurrently.
    # Otherwise it can be modified by other jobs, failing the consistency
    # test.
    # NOTE: This is an exceptional situation, we should otherwise avoid
    # locking the configuration for something but very fast, pure operations.
    cluster_name = self.cfg.GetClusterName()
    hvparams = self.cfg.GetClusterInfo().hvparams
    all_nvinfo = self.rpc.call_node_verify(self.my_node_uuids,
                                           node_verify_param,
                                           cluster_name,
                                           hvparams)
    nvinfo_endtime = time.time()

    if self.extra_lv_nodes and vg_name is not None:
      feedback_fn("* Gathering information about extra nodes (%s nodes)" %
                  len(self.extra_lv_nodes))
      extra_lv_nvinfo = \
          self.rpc.call_node_verify(self.extra_lv_nodes,
                                    {constants.NV_LVLIST: vg_name},
                                    self.cfg.GetClusterName(),
                                    self.cfg.GetClusterInfo().hvparams)
    else:
      extra_lv_nvinfo = {}

    # If not all nodes are being checked, we need to make sure the master
    # node and a non-checked vm_capable node are in the list.
    absent_node_uuids = set(self.all_node_info).difference(self.my_node_info)
    if absent_node_uuids:
      vf_nvinfo = all_nvinfo.copy()
      vf_node_info = list(self.my_node_info.values())
      additional_node_uuids = []
      if master_node_uuid not in self.my_node_info:
        additional_node_uuids.append(master_node_uuid)
        vf_node_info.append(self.all_node_info[master_node_uuid])
      # Add the first vm_capable node we find which is not included,
      # excluding the master node (which we already have)
      for node_uuid in absent_node_uuids:
        nodeinfo = self.all_node_info[node_uuid]
        if (nodeinfo.vm_capable and not nodeinfo.offline and
            node_uuid != master_node_uuid):
          additional_node_uuids.append(node_uuid)
          vf_node_info.append(self.all_node_info[node_uuid])
          break
      # Only the file list is requested from the additional nodes
      key = constants.NV_FILELIST

      feedback_fn("* Gathering information about the master node")
      vf_nvinfo.update(self.rpc.call_node_verify(
         additional_node_uuids, {key: node_verify_param[key]},
         self.cfg.GetClusterName(), self.cfg.GetClusterInfo().hvparams))
    else:
      vf_nvinfo = all_nvinfo
      vf_node_info = self.my_node_info.values()

  all_drbd_map = self.cfg.ComputeDRBDMap()

  feedback_fn("* Gathering disk information (%s nodes)" %
              len(self.my_node_uuids))
  instdisk = self._CollectDiskInfo(self.my_node_info.keys(), node_image,
                                   self.my_inst_info)

  feedback_fn("* Verifying configuration file consistency")

  self._VerifyClientCertificates(self.my_node_info.values(), all_nvinfo)
  if self.cfg.GetClusterInfo().modify_ssh_setup:
    self._VerifySshSetup(self.my_node_info.values(), all_nvinfo)
  self._VerifyFiles(vf_node_info, master_node_uuid, vf_nvinfo, filemap)

  feedback_fn("* Verifying node status")

  # Image of the first node with usable OS data; the other nodes' OS data
  # is verified against it
  refos_img = None

  for node_i in node_data_list:
    nimg = node_image[node_i.uuid]

    if node_i.offline:
      if verbose:
        feedback_fn("* Skipping offline node %s" % (node_i.name,))
      n_offline += 1
      continue

    if node_i.uuid == master_node_uuid:
      ntype = "master"
    elif node_i.master_candidate:
      ntype = "master candidate"
    elif node_i.drained:
      ntype = "drained"
      n_drained += 1
    else:
      ntype = "regular"
    if verbose:
      feedback_fn("* Verifying node %s (%s)" % (node_i.name, ntype))

    msg = all_nvinfo[node_i.uuid].fail_msg
    self._ErrorIf(msg, constants.CV_ENODERPC, node_i.name,
                  "while contacting node: %s", msg)
    if msg:
      # No payload to inspect for this node
      nimg.rpc_fail = True
      continue

    nresult = all_nvinfo[node_i.uuid].payload

    nimg.call_ok = self._VerifyNode(node_i, nresult)
    self._VerifyNodeTime(node_i, nresult, nvinfo_starttime, nvinfo_endtime)
    self._VerifyNodeNetwork(node_i, nresult)
    self._VerifyNodeUserScripts(node_i, nresult)
    self._VerifyOob(node_i, nresult)
    self._VerifyAcceptedFileStoragePaths(node_i, nresult,
                                         node_i.uuid == master_node_uuid)
    self._VerifyFileStoragePaths(node_i, nresult)
    self._VerifySharedFileStoragePaths(node_i, nresult)
    self._VerifyGlusterStoragePaths(node_i, nresult)

    if nimg.vm_capable:
      self._UpdateVerifyNodeLVM(node_i, nresult, vg_name, nimg)
      if constants.DT_DRBD8 in cluster.enabled_disk_templates:
        self._VerifyNodeDrbd(node_i, nresult, self.all_inst_info,
                             self.all_disks_info, drbd_helper, all_drbd_map)

      if (constants.DT_PLAIN in cluster.enabled_disk_templates) or \
          (constants.DT_DRBD8 in cluster.enabled_disk_templates):
        self._UpdateNodeVolumes(node_i, nresult, nimg, vg_name)
      self._UpdateNodeInstances(node_i, nresult, nimg)
      self._UpdateNodeInfo(node_i, nresult, nimg, vg_name)
      self._UpdateNodeOS(node_i, nresult, nimg)

      if not nimg.os_fail:
        if refos_img is None:
          refos_img = nimg
        self._VerifyNodeOS(node_i, nimg, refos_img)
      self._VerifyNodeBridges(node_i, nresult, bridges)

      # Check whether all running instances are primary for the node. (This
      # can no longer be done from _VerifyInstance below, since some of the
      # wrong instances could be from other node groups.)
      non_primary_inst_uuids = set(nimg.instances).difference(nimg.pinst)

      for inst_uuid in non_primary_inst_uuids:
        test = inst_uuid in self.all_inst_info
        self._ErrorIf(test, constants.CV_EINSTANCEWRONGNODE,
                      self.cfg.GetInstanceName(inst_uuid),
                      "instance should not run on node %s", node_i.name)
        self._ErrorIf(not test, constants.CV_ENODEORPHANINSTANCE, node_i.name,
                      "node is running unknown instance %s", inst_uuid)

    self._VerifyExclusionTags(node_i.name, nimg.pinst, cluster.tags)

  self._VerifyGroupDRBDVersion(all_nvinfo)
  self._VerifyGroupLVM(node_image, vg_name)

  for node_uuid, result in extra_lv_nvinfo.items():
    self._UpdateNodeVolumes(self.all_node_info[node_uuid], result.payload,
                            node_image[node_uuid], vg_name)

  feedback_fn("* Verifying instance status")
  for inst_uuid in self.my_inst_uuids:
    instance = self.my_inst_info[inst_uuid]
    if verbose:
      feedback_fn("* Verifying instance %s" % instance.name)
    self._VerifyInstance(instance, node_image, instdisk[inst_uuid])

    # If the instance is not fully redundant we cannot survive losing its
    # primary node, so we are not N+1 compliant.
    inst_disks = self.cfg.GetInstanceDisks(instance.uuid)
    if not utils.AllDiskOfType(inst_disks, constants.DTS_MIRRORED):
      i_non_redundant.append(instance)

    if not cluster.FillBE(instance)[constants.BE_AUTO_BALANCE]:
      i_non_a_balanced.append(instance)

  feedback_fn("* Verifying orphan volumes")
  reserved = utils.FieldSet(*cluster.reserved_lvs)

  # We will get spurious "unknown volume" warnings if any node of this group
  # is secondary for an instance whose primary is in another group. To avoid
  # them, we find these instances and add their volumes to node_vol_should.
  for instance in self.all_inst_info.values():
    for secondary in self.cfg.GetInstanceSecondaryNodes(instance.uuid):
      # my_inst_info is keyed by instance UUID, so membership must be
      # tested with the UUID; the volumes of this group's own instances
      # were already added to node_vol_should above
      if (secondary in self.my_node_info
          and instance.uuid not in self.my_inst_info):
        self.cfg.GetInstanceLVsByNode(instance.uuid, lvmap=node_vol_should)
        break

  self._VerifyOrphanVolumes(vg_name, node_vol_should, node_image, reserved)

  if constants.VERIFY_NPLUSONE_MEM not in self.op.skip_checks:
    feedback_fn("* Verifying N+1 Memory redundancy")
    self._VerifyNPlusOneMemory(node_image, self.my_inst_info)

  self._VerifyOtherNotes(feedback_fn, i_non_redundant, i_non_a_balanced,
                         i_offline, n_offline, n_drained)

  return not self.bad
2182
def HooksCallBack(self, phase, hooks_results, feedback_fn, lu_result):
  """Analyze the post-hooks' result

  This method analyses the hook result, handles it, and sends some
  nicely-formatted feedback back to the user.

  @param phase: one of L{constants.HOOKS_PHASE_POST} or
      L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase
  @param hooks_results: the results of the multi-node hooks rpc call
  @param feedback_fn: function used send feedback back to the caller
  @param lu_result: previous Exec result
  @return: the new Exec result, based on the previous result
      and hook results

  """
  # Only POST phase hooks of non-empty groups are analysed; in every other
  # case the previous result is passed through untouched
  if not self.my_node_uuids or phase != constants.HOOKS_PHASE_POST:
    return lu_result

  feedback_fn("* Hooks Results")
  assert hooks_results, "invalid result from hooks"

  for node_name in hooks_results:
    node_res = hooks_results[node_name]
    fail_msg = node_res.fail_msg
    comm_failed = fail_msg and not node_res.offline
    self._ErrorIf(comm_failed, constants.CV_ENODEHOOKS, node_name,
                  "Communication failure in hooks execution: %s", fail_msg)
    if comm_failed:
      lu_result = False
      continue

    if node_res.offline:
      # No need to investigate payload if node is offline
      continue

    for (script, hkr, output) in node_res.payload:
      script_failed = (hkr == constants.HKR_FAIL)
      self._ErrorIf(script_failed, constants.CV_ENODEHOOKS, node_name,
                    "Script %s failed, output:", script)
      if not script_failed:
        continue
      # Used to change hooks' output to proper indentation
      indented = self._HOOKS_INDENT_RE.sub(" ", output)
      feedback_fn("%s" % indented)
      lu_result = False

  return lu_result