Do not add new Inotify watchers on a timer
[ganeti-github.git] / lib / cmdlib / cluster / verify.py
1 #
2 #
3
4 # Copyright (C) 2014 Google Inc.
5 # All rights reserved.
6 #
7 # Redistribution and use in source and binary forms, with or without
8 # modification, are permitted provided that the following conditions are
9 # met:
10 #
11 # 1. Redistributions of source code must retain the above copyright notice,
12 # this list of conditions and the following disclaimer.
13 #
14 # 2. Redistributions in binary form must reproduce the above copyright
15 # notice, this list of conditions and the following disclaimer in the
16 # documentation and/or other materials provided with the distribution.
17 #
18 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
19 # IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
20 # TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
21 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
22 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
25 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
26 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
27 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30 """Logical units for cluster verification."""
31
32 import itertools
33 import logging
34 import operator
35 import re
36 import time
37 import ganeti.masterd.instance
38 import ganeti.rpc.node as rpc
39
40 from ganeti import compat
41 from ganeti import constants
42 from ganeti import errors
43 from ganeti import locking
44 from ganeti import pathutils
45 from ganeti import utils
46 from ganeti import vcluster
47 from ganeti import hypervisor
48 from ganeti import opcodes
49
50 from ganeti.cmdlib.base import LogicalUnit, NoHooksLU, ResultWithJobs
51 from ganeti.cmdlib.common import ShareAll, ComputeAncillaryFiles, \
52 CheckNodePVs, ComputeIPolicyInstanceViolation, AnnotateDiskParams, \
53 SupportsOob
54
55
56 def _GetAllHypervisorParameters(cluster, instances):
57 """Compute the set of all hypervisor parameters.
58
59 @type cluster: L{objects.Cluster}
60 @param cluster: the cluster object
61 @param instances: list of L{objects.Instance}
62 @param instances: additional instances from which to obtain parameters
63 @rtype: list of (origin, hypervisor, parameters)
64 @return: a list with all parameters found, indicating the hypervisor they
65 apply to, and the origin (can be "cluster", "os X", or "instance Y")
66
67 """
68 hvp_data = []
69
70 for hv_name in cluster.enabled_hypervisors:
71 hvp_data.append(("cluster", hv_name, cluster.GetHVDefaults(hv_name)))
72
73 for os_name, os_hvp in cluster.os_hvp.items():
74 for hv_name, hv_params in os_hvp.items():
75 if hv_params:
76 full_params = cluster.GetHVDefaults(hv_name, os_name=os_name)
77 hvp_data.append(("os %s" % os_name, hv_name, full_params))
78
79 # TODO: collapse identical parameter values in a single one
80 for instance in instances:
81 if instance.hvparams:
82 hvp_data.append(("instance %s" % instance.name, instance.hypervisor,
83 cluster.FillHV(instance)))
84
85 return hvp_data
86
87
class _VerifyErrors(object):
  """Mix-in providing error reporting for cluster/group verify LUs.

  Supplies L{_Error} and L{_ErrorIf} and maintains the C{self.bad}
  boolean.  Users must provide C{self.op} and C{self._feedback_fn}.

  """

  ETYPE_FIELD = "code"
  ETYPE_ERROR = constants.CV_ERROR
  ETYPE_WARNING = constants.CV_WARNING

  def _Error(self, ecode, item, msg, *args, **kwargs):
    """Format and report one verification error.

    Depending on the opcode's C{error_codes} parameter the message is
    emitted either as a machine-parseable error code or as a plain
    human-readable string.

    This must be called only from Exec and functions called from Exec.

    """
    (itype, etxt, _) = ecode
    ltype = kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR)
    # Error codes the user asked to ignore are demoted to warnings
    if etxt in self.op.ignore_errors: # pylint: disable=E1101
      ltype = self.ETYPE_WARNING
    # complete the message with the positional arguments, if any
    if args:
      msg = msg % args
    if self.op.error_codes: # This is a mix-in. pylint: disable=E1101
      # machine-parseable representation
      msg = "%s:%s:%s:%s:%s" % (ltype, etxt, itype, item, msg)
    else:
      # human-readable representation
      item = " " + item if item else ""
      msg = "%s: %s%s: %s" % (ltype, itype, item, msg)
    self._feedback_fn(" - %s" % msg) # Mix-in. pylint: disable=E1101
    # only real errors (not warnings) mark the whole operation as failed
    if ltype == self.ETYPE_ERROR:
      self.bad = True

  def _ErrorIf(self, cond, *args, **kwargs):
    """Call L{_Error} when C{cond} holds (or when simulating errors).

    """
    if bool(cond) or self.op.debug_simulate_errors: # pylint: disable=E1101
      self._Error(*args, **kwargs)
140
141
class LUClusterVerify(NoHooksLU):
  """Submits all jobs necessary to verify the cluster.

  """
  REQ_BGL = False

  def ExpandNames(self):
    # No locks needed here; the submitted jobs acquire their own
    self.needed_locks = {}

  def Exec(self, feedback_fn):
    """Build and submit the verification jobs.

    @return: L{ResultWithJobs} containing one job per node group (plus,
        for a whole-cluster verify, a leading config-verification job)

    """
    jobs = []

    if self.op.group_name:
      # Single-group verify: no cluster-wide config verification job
      groups = [self.op.group_name]
      depends_fn = lambda: None
    else:
      groups = self.cfg.GetNodeGroupList()

      # Verify global configuration
      jobs.append([
        opcodes.OpClusterVerifyConfig(ignore_errors=self.op.ignore_errors),
        ])

      # Always depend on global verification; -len(jobs) is a relative job
      # ID pointing back at the config-verify job appended above
      depends_fn = lambda: [(-len(jobs), [])]

    jobs.extend(
      [opcodes.OpClusterVerifyGroup(group_name=group,
                                    ignore_errors=self.op.ignore_errors,
                                    depends=depends_fn(),
                                    verify_clutter=self.op.verify_clutter)]
      for group in groups)

    # Fix up all parameters
    for op in itertools.chain(*jobs): # pylint: disable=W0142
      op.debug_simulate_errors = self.op.debug_simulate_errors
      op.verbose = self.op.verbose
      op.error_codes = self.op.error_codes
      try:
        op.skip_checks = self.op.skip_checks
      except AttributeError:
        # only OpClusterVerifyGroup supports skip_checks
        assert not isinstance(op, opcodes.OpClusterVerifyGroup)

    return ResultWithJobs(jobs)
186
187
class LUClusterVerifyDisks(NoHooksLU):
  """Verifies the cluster disks status.

  """
  REQ_BGL = False

  def ExpandNames(self):
    # Shared locks on all node groups suffice; the per-group jobs
    # perform the actual disk verification
    self.needed_locks = {
      locking.LEVEL_NODEGROUP: locking.ALL_SET,
      }
    self.share_locks = ShareAll()

  def Exec(self, feedback_fn):
    """Submit one L{opcodes.OpGroupVerifyDisks} job per node group.

    """
    jobs = []
    for group in self.owned_locks(locking.LEVEL_NODEGROUP):
      jobs.append([opcodes.OpGroupVerifyDisks(group_name=group)])
    return ResultWithJobs(jobs)
206
207
class LUClusterVerifyConfig(NoHooksLU, _VerifyErrors):
  """Verifies the cluster config.

  """
  REQ_BGL = False

  def _VerifyHVP(self, hvp_data):
    """Verifies locally the syntax of the hypervisor parameters.

    @param hvp_data: list of (origin, hypervisor, parameters) tuples as
        returned by L{_GetAllHypervisorParameters}

    """
    for item, hv_name, hv_params in hvp_data:
      msg = ("hypervisor %s parameters syntax check (source %s): %%s" %
             (item, hv_name))
      try:
        hv_class = hypervisor.GetHypervisorClass(hv_name)
        utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
        hv_class.CheckParameterSyntax(hv_params)
      except errors.GenericError, err:
        self._ErrorIf(True, constants.CV_ECLUSTERCFG, None, msg % str(err))

  def ExpandNames(self):
    # Shared locks on everything: this LU only reads the configuration
    self.needed_locks = dict.fromkeys(locking.LEVELS, locking.ALL_SET)
    self.share_locks = ShareAll()

  def CheckPrereq(self):
    """Check prerequisites.

    """
    # Retrieve all information
    self.all_group_info = self.cfg.GetAllNodeGroupsInfo()
    self.all_node_info = self.cfg.GetAllNodesInfo()
    self.all_inst_info = self.cfg.GetAllInstancesInfo()

  def Exec(self, feedback_fn):
    """Verify integrity of cluster, performing various test on nodes.

    @rtype: boolean
    @return: True if no error was reported (warnings do not count)

    """
    self.bad = False
    self._feedback_fn = feedback_fn

    feedback_fn("* Verifying cluster config")

    for msg in self.cfg.VerifyConfig():
      self._ErrorIf(True, constants.CV_ECLUSTERCFG, None, msg)

    feedback_fn("* Verifying cluster certificate files")

    for cert_filename in pathutils.ALL_CERT_FILES:
      # errcode doubles as the truth value and as the severity code
      (errcode, msg) = utils.VerifyCertificate(cert_filename)
      self._ErrorIf(errcode, constants.CV_ECLUSTERCERT, None, msg, code=errcode)

    self._ErrorIf(not utils.CanRead(constants.LUXID_USER,
                                    pathutils.NODED_CERT_FILE),
                  constants.CV_ECLUSTERCERT,
                  None,
                  pathutils.NODED_CERT_FILE + " must be accessible by the " +
                  constants.LUXID_USER + " user")

    feedback_fn("* Verifying hypervisor parameters")

    self._VerifyHVP(_GetAllHypervisorParameters(self.cfg.GetClusterInfo(),
                                                self.all_inst_info.values()))

    feedback_fn("* Verifying all nodes belong to an existing group")

    # We do this verification here because, should this bogus circumstance
    # occur, it would never be caught by VerifyGroup, which only acts on
    # nodes/instances reachable from existing node groups.

    dangling_nodes = set(node for node in self.all_node_info.values()
                         if node.group not in self.all_group_info)

    dangling_instances = {}
    no_node_instances = []

    for inst in self.all_inst_info.values():
      if inst.primary_node in [node.uuid for node in dangling_nodes]:
        dangling_instances.setdefault(inst.primary_node, []).append(inst)
      elif inst.primary_node not in self.all_node_info:
        no_node_instances.append(inst)

    pretty_dangling = [
        "%s (%s)" %
        (node.name,
         utils.CommaJoin(inst.name for
                         inst in dangling_instances.get(node.uuid, [])))
        for node in dangling_nodes]

    self._ErrorIf(bool(dangling_nodes), constants.CV_ECLUSTERDANGLINGNODES,
                  None,
                  "the following nodes (and their instances) belong to a non"
                  " existing group: %s", utils.CommaJoin(pretty_dangling))

    self._ErrorIf(bool(no_node_instances), constants.CV_ECLUSTERDANGLINGINST,
                  None,
                  "the following instances have a non-existing primary-node:"
                  " %s", utils.CommaJoin(inst.name for
                                         inst in no_node_instances))

    return not self.bad
308
309
310 class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
311 """Verifies the status of a node group.
312
313 """
314 HPATH = "cluster-verify"
315 HTYPE = constants.HTYPE_CLUSTER
316 REQ_BGL = False
317
318 _HOOKS_INDENT_RE = re.compile("^", re.M)
319
320 class NodeImage(object):
321 """A class representing the logical and physical status of a node.
322
323 @type uuid: string
324 @ivar uuid: the node UUID to which this object refers
325 @ivar volumes: a structure as returned from
326 L{ganeti.backend.GetVolumeList} (runtime)
327 @ivar instances: a list of running instances (runtime)
328 @ivar pinst: list of configured primary instances (config)
329 @ivar sinst: list of configured secondary instances (config)
330 @ivar sbp: dictionary of {primary-node: list of instances} for all
331 instances for which this node is secondary (config)
332 @ivar mfree: free memory, as reported by hypervisor (runtime)
333 @ivar dfree: free disk, as reported by the node (runtime)
334 @ivar offline: the offline status (config)
335 @type rpc_fail: boolean
336 @ivar rpc_fail: whether the RPC verify call was successfull (overall,
337 not whether the individual keys were correct) (runtime)
338 @type lvm_fail: boolean
339 @ivar lvm_fail: whether the RPC call didn't return valid LVM data
340 @type hyp_fail: boolean
341 @ivar hyp_fail: whether the RPC call didn't return the instance list
342 @type ghost: boolean
343 @ivar ghost: whether this is a known node or not (config)
344 @type os_fail: boolean
345 @ivar os_fail: whether the RPC call didn't return valid OS data
346 @type oslist: list
347 @ivar oslist: list of OSes as diagnosed by DiagnoseOS
348 @type vm_capable: boolean
349 @ivar vm_capable: whether the node can host instances
350 @type pv_min: float
351 @ivar pv_min: size in MiB of the smallest PVs
352 @type pv_max: float
353 @ivar pv_max: size in MiB of the biggest PVs
354
355 """
356 def __init__(self, offline=False, uuid=None, vm_capable=True):
357 self.uuid = uuid
358 self.volumes = {}
359 self.instances = []
360 self.pinst = []
361 self.sinst = []
362 self.sbp = {}
363 self.mfree = 0
364 self.dfree = 0
365 self.offline = offline
366 self.vm_capable = vm_capable
367 self.rpc_fail = False
368 self.lvm_fail = False
369 self.hyp_fail = False
370 self.ghost = False
371 self.os_fail = False
372 self.oslist = {}
373 self.pv_min = None
374 self.pv_max = None
375
376 def ExpandNames(self):
377 # This raises errors.OpPrereqError on its own:
378 self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
379
380 # Get instances in node group; this is unsafe and needs verification later
381 inst_uuids = \
382 self.cfg.GetNodeGroupInstances(self.group_uuid, primary_only=True)
383
384 self.needed_locks = {
385 locking.LEVEL_INSTANCE: self.cfg.GetInstanceNames(inst_uuids),
386 locking.LEVEL_NODEGROUP: [self.group_uuid],
387 locking.LEVEL_NODE: [],
388 }
389
390 self.share_locks = ShareAll()
391
392 def DeclareLocks(self, level):
393 if level == locking.LEVEL_NODE:
394 # Get members of node group; this is unsafe and needs verification later
395 nodes = set(self.cfg.GetNodeGroup(self.group_uuid).members)
396
397 # In Exec(), we warn about mirrored instances that have primary and
398 # secondary living in separate node groups. To fully verify that
399 # volumes for these instances are healthy, we will need to do an
400 # extra call to their secondaries. We ensure here those nodes will
401 # be locked.
402 for inst_name in self.owned_locks(locking.LEVEL_INSTANCE):
403 # Important: access only the instances whose lock is owned
404 instance = self.cfg.GetInstanceInfoByName(inst_name)
405 disks = self.cfg.GetInstanceDisks(instance.uuid)
406 if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR):
407 nodes.update(self.cfg.GetInstanceSecondaryNodes(instance.uuid))
408
409 self.needed_locks[locking.LEVEL_NODE] = nodes
410
  def CheckPrereq(self):
    """Verify that all locks computed in ExpandNames/DeclareLocks are held.

    The node/instance membership of the group may have changed between
    lock computation and acquisition; any node or instance now in the
    group but not locked aborts the operation.

    @raise errors.OpPrereqError: if a needed node or instance lock is
        missing

    """
    assert self.group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
    self.group_info = self.cfg.GetNodeGroup(self.group_uuid)

    group_node_uuids = set(self.group_info.members)
    group_inst_uuids = \
      self.cfg.GetNodeGroupInstances(self.group_uuid, primary_only=True)

    unlocked_node_uuids = \
      group_node_uuids.difference(self.owned_locks(locking.LEVEL_NODE))

    unlocked_inst_uuids = \
      group_inst_uuids.difference(
        [self.cfg.GetInstanceInfoByName(name).uuid
         for name in self.owned_locks(locking.LEVEL_INSTANCE)])

    if unlocked_node_uuids:
      raise errors.OpPrereqError(
        "Missing lock for nodes: %s" %
        utils.CommaJoin(self.cfg.GetNodeNames(unlocked_node_uuids)),
        errors.ECODE_STATE)

    if unlocked_inst_uuids:
      raise errors.OpPrereqError(
        "Missing lock for instances: %s" %
        utils.CommaJoin(self.cfg.GetInstanceNames(unlocked_inst_uuids)),
        errors.ECODE_STATE)

    self.all_node_info = self.cfg.GetAllNodesInfo()
    self.all_inst_info = self.cfg.GetAllInstancesInfo()
    self.all_disks_info = self.cfg.GetAllDisksInfo()

    # restrict the node/instance views to this group
    self.my_node_uuids = group_node_uuids
    self.my_node_info = dict((node_uuid, self.all_node_info[node_uuid])
                             for node_uuid in group_node_uuids)

    self.my_inst_uuids = group_inst_uuids
    self.my_inst_info = dict((inst_uuid, self.all_inst_info[inst_uuid])
                             for inst_uuid in group_inst_uuids)

    # We detect here the nodes that will need the extra RPC calls for verifying
    # split LV volumes; they should be locked.
    extra_lv_nodes = set()

    for inst in self.my_inst_info.values():
      disks = self.cfg.GetInstanceDisks(inst.uuid)
      if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR):
        inst_nodes = self.cfg.GetInstanceNodes(inst.uuid)
        for nuuid in inst_nodes:
          if self.all_node_info[nuuid].group != self.group_uuid:
            extra_lv_nodes.add(nuuid)

    unlocked_lv_nodes = \
      extra_lv_nodes.difference(self.owned_locks(locking.LEVEL_NODE))

    if unlocked_lv_nodes:
      raise errors.OpPrereqError("Missing node locks for LV check: %s" %
                                 utils.CommaJoin(unlocked_lv_nodes),
                                 errors.ECODE_STATE)
    self.extra_lv_nodes = list(extra_lv_nodes)
471
  def _VerifyNode(self, ninfo, nresult):
    """Perform some basic validation on data returned from a node.

      - check the result data structure is well formed and has all the
        mandatory fields
      - check ganeti version

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nresult: the results from the node
    @rtype: boolean
    @return: whether overall this call was successful (and we can expect
         reasonable values in the response)

    """
    # main result, nresult should be a non-empty dict
    test = not nresult or not isinstance(nresult, dict)
    self._ErrorIf(test, constants.CV_ENODERPC, ninfo.name,
                  "unable to verify node: no data returned")
    if test:
      return False

    # compares ganeti version
    local_version = constants.PROTOCOL_VERSION
    remote_version = nresult.get("version", None)
    test = not (remote_version and
                isinstance(remote_version, (list, tuple)) and
                len(remote_version) == 2)
    self._ErrorIf(test, constants.CV_ENODERPC, ninfo.name,
                  "connection to node returned invalid data")
    if test:
      return False

    test = local_version != remote_version[0]
    self._ErrorIf(test, constants.CV_ENODEVERSION, ninfo.name,
                  "incompatible protocol versions: master %s,"
                  " node %s", local_version, remote_version[0])
    if test:
      return False

    # node seems compatible, we can actually try to look into its results

    # full package version; a mere mismatch is only a warning, as mixed
    # versions can legitimately occur during a rolling upgrade
    self._ErrorIf(constants.RELEASE_VERSION != remote_version[1],
                  constants.CV_ENODEVERSION, ninfo.name,
                  "software version mismatch: master %s, node %s",
                  constants.RELEASE_VERSION, remote_version[1],
                  code=self.ETYPE_WARNING)

    hyp_result = nresult.get(constants.NV_HYPERVISOR, None)
    if ninfo.vm_capable and isinstance(hyp_result, dict):
      for hv_name, hv_result in hyp_result.iteritems():
        # a non-None entry carries the error message from the
        # hypervisor's own verification
        test = hv_result is not None
        self._ErrorIf(test, constants.CV_ENODEHV, ninfo.name,
                      "hypervisor %s verify failure: '%s'", hv_name, hv_result)

    hvp_result = nresult.get(constants.NV_HVPARAMS, None)
    if ninfo.vm_capable and isinstance(hvp_result, list):
      # every entry in this list is a detected parameter failure
      for item, hv_name, hv_result in hvp_result:
        self._ErrorIf(True, constants.CV_ENODEHV, ninfo.name,
                      "hypervisor %s parameter verify failure (source %s): %s",
                      hv_name, item, hv_result)

    test = nresult.get(constants.NV_NODESETUP,
                       ["Missing NODESETUP results"])
    self._ErrorIf(test, constants.CV_ENODESETUP, ninfo.name,
                  "node setup error: %s", "; ".join(test))

    return True
541
542 def _VerifyNodeTime(self, ninfo, nresult,
543 nvinfo_starttime, nvinfo_endtime):
544 """Check the node time.
545
546 @type ninfo: L{objects.Node}
547 @param ninfo: the node to check
548 @param nresult: the remote results for the node
549 @param nvinfo_starttime: the start time of the RPC call
550 @param nvinfo_endtime: the end time of the RPC call
551
552 """
553 ntime = nresult.get(constants.NV_TIME, None)
554 try:
555 ntime_merged = utils.MergeTime(ntime)
556 except (ValueError, TypeError):
557 self._ErrorIf(True, constants.CV_ENODETIME, ninfo.name,
558 "Node returned invalid time")
559 return
560
561 if ntime_merged < (nvinfo_starttime - constants.NODE_MAX_CLOCK_SKEW):
562 ntime_diff = "%.01fs" % abs(nvinfo_starttime - ntime_merged)
563 elif ntime_merged > (nvinfo_endtime + constants.NODE_MAX_CLOCK_SKEW):
564 ntime_diff = "%.01fs" % abs(ntime_merged - nvinfo_endtime)
565 else:
566 ntime_diff = None
567
568 self._ErrorIf(ntime_diff is not None, constants.CV_ENODETIME, ninfo.name,
569 "Node time diverges by at least %s from master node time",
570 ntime_diff)
571
572 def _UpdateVerifyNodeLVM(self, ninfo, nresult, vg_name, nimg):
573 """Check the node LVM results and update info for cross-node checks.
574
575 @type ninfo: L{objects.Node}
576 @param ninfo: the node to check
577 @param nresult: the remote results for the node
578 @param vg_name: the configured VG name
579 @type nimg: L{NodeImage}
580 @param nimg: node image
581
582 """
583 if vg_name is None:
584 return
585
586 # checks vg existence and size > 20G
587 vglist = nresult.get(constants.NV_VGLIST, None)
588 test = not vglist
589 self._ErrorIf(test, constants.CV_ENODELVM, ninfo.name,
590 "unable to check volume groups")
591 if not test:
592 vgstatus = utils.CheckVolumeGroupSize(vglist, vg_name,
593 constants.MIN_VG_SIZE)
594 self._ErrorIf(vgstatus, constants.CV_ENODELVM, ninfo.name, vgstatus)
595
596 # Check PVs
597 (errmsgs, pvminmax) = CheckNodePVs(nresult, self._exclusive_storage)
598 for em in errmsgs:
599 self._Error(constants.CV_ENODELVM, ninfo.name, em)
600 if pvminmax is not None:
601 (nimg.pv_min, nimg.pv_max) = pvminmax
602
603 def _VerifyGroupDRBDVersion(self, node_verify_infos):
604 """Check cross-node DRBD version consistency.
605
606 @type node_verify_infos: dict
607 @param node_verify_infos: infos about nodes as returned from the
608 node_verify call.
609
610 """
611 node_versions = {}
612 for node_uuid, ndata in node_verify_infos.items():
613 nresult = ndata.payload
614 if nresult:
615 version = nresult.get(constants.NV_DRBDVERSION, None)
616 if version:
617 node_versions[node_uuid] = version
618
619 if len(set(node_versions.values())) > 1:
620 for node_uuid, version in sorted(node_versions.items()):
621 msg = "DRBD version mismatch: %s" % version
622 self._Error(constants.CV_ENODEDRBDHELPER, node_uuid, msg,
623 code=self.ETYPE_WARNING)
624
625 def _VerifyGroupLVM(self, node_image, vg_name):
626 """Check cross-node consistency in LVM.
627
628 @type node_image: dict
629 @param node_image: info about nodes, mapping from node to names to
630 L{NodeImage} objects
631 @param vg_name: the configured VG name
632
633 """
634 if vg_name is None:
635 return
636
637 # Only exclusive storage needs this kind of checks
638 if not self._exclusive_storage:
639 return
640
641 # exclusive_storage wants all PVs to have the same size (approximately),
642 # if the smallest and the biggest ones are okay, everything is fine.
643 # pv_min is None iff pv_max is None
644 vals = filter((lambda ni: ni.pv_min is not None), node_image.values())
645 if not vals:
646 return
647 (pvmin, minnode_uuid) = min((ni.pv_min, ni.uuid) for ni in vals)
648 (pvmax, maxnode_uuid) = max((ni.pv_max, ni.uuid) for ni in vals)
649 bad = utils.LvmExclusiveTestBadPvSizes(pvmin, pvmax)
650 self._ErrorIf(bad, constants.CV_EGROUPDIFFERENTPVSIZE, self.group_info.name,
651 "PV sizes differ too much in the group; smallest (%s MB) is"
652 " on %s, biggest (%s MB) is on %s",
653 pvmin, self.cfg.GetNodeName(minnode_uuid),
654 pvmax, self.cfg.GetNodeName(maxnode_uuid))
655
656 def _VerifyNodeBridges(self, ninfo, nresult, bridges):
657 """Check the node bridges.
658
659 @type ninfo: L{objects.Node}
660 @param ninfo: the node to check
661 @param nresult: the remote results for the node
662 @param bridges: the expected list of bridges
663
664 """
665 if not bridges:
666 return
667
668 missing = nresult.get(constants.NV_BRIDGES, None)
669 test = not isinstance(missing, list)
670 self._ErrorIf(test, constants.CV_ENODENET, ninfo.name,
671 "did not return valid bridge information")
672 if not test:
673 self._ErrorIf(bool(missing), constants.CV_ENODENET, ninfo.name,
674 "missing bridges: %s" % utils.CommaJoin(sorted(missing)))
675
676 def _VerifyNodeUserScripts(self, ninfo, nresult):
677 """Check the results of user scripts presence and executability on the node
678
679 @type ninfo: L{objects.Node}
680 @param ninfo: the node to check
681 @param nresult: the remote results for the node
682
683 """
684 test = not constants.NV_USERSCRIPTS in nresult
685 self._ErrorIf(test, constants.CV_ENODEUSERSCRIPTS, ninfo.name,
686 "did not return user scripts information")
687
688 broken_scripts = nresult.get(constants.NV_USERSCRIPTS, None)
689 if not test:
690 self._ErrorIf(broken_scripts, constants.CV_ENODEUSERSCRIPTS, ninfo.name,
691 "user scripts not present or not executable: %s" %
692 utils.CommaJoin(sorted(broken_scripts)))
693
694 def _VerifyNodeNetwork(self, ninfo, nresult):
695 """Check the node network connectivity results.
696
697 @type ninfo: L{objects.Node}
698 @param ninfo: the node to check
699 @param nresult: the remote results for the node
700
701 """
702 test = constants.NV_NODELIST not in nresult
703 self._ErrorIf(test, constants.CV_ENODESSH, ninfo.name,
704 "node hasn't returned node ssh connectivity data")
705 if not test:
706 if nresult[constants.NV_NODELIST]:
707 for a_node, a_msg in nresult[constants.NV_NODELIST].items():
708 self._ErrorIf(True, constants.CV_ENODESSH, ninfo.name,
709 "ssh communication with node '%s': %s", a_node, a_msg)
710
711 test = constants.NV_NODENETTEST not in nresult
712 self._ErrorIf(test, constants.CV_ENODENET, ninfo.name,
713 "node hasn't returned node tcp connectivity data")
714 if not test:
715 if nresult[constants.NV_NODENETTEST]:
716 nlist = utils.NiceSort(nresult[constants.NV_NODENETTEST].keys())
717 for anode in nlist:
718 self._ErrorIf(True, constants.CV_ENODENET, ninfo.name,
719 "tcp communication with node '%s': %s",
720 anode, nresult[constants.NV_NODENETTEST][anode])
721
722 test = constants.NV_MASTERIP not in nresult
723 self._ErrorIf(test, constants.CV_ENODENET, ninfo.name,
724 "node hasn't returned node master IP reachability data")
725 if not test:
726 if not nresult[constants.NV_MASTERIP]:
727 if ninfo.uuid == self.master_node:
728 msg = "the master node cannot reach the master IP (not configured?)"
729 else:
730 msg = "cannot reach the master IP"
731 self._ErrorIf(True, constants.CV_ENODENET, ninfo.name, msg)
732
  def _VerifyInstance(self, instance, node_image, diskstatus):
    """Verify an instance.

    This function checks to see if the required block devices are
    available on the instance's node, and that the nodes are in the correct
    state.

    @type instance: L{objects.Instance}
    @param instance: the instance to verify
    @param node_image: mapping of node identifier to L{NodeImage}
    @param diskstatus: per-node mapping to a list of (success, status)
        pairs, one per disk; keys must resolve in C{node_image}

    """
    pnode_uuid = instance.primary_node
    pnode_img = node_image[pnode_uuid]
    groupinfo = self.cfg.GetAllNodeGroupsInfo()

    node_vol_should = {}
    self.cfg.GetInstanceLVsByNode(instance.uuid, lvmap=node_vol_should)

    # instance policy violations are reported as warnings only
    cluster = self.cfg.GetClusterInfo()
    ipolicy = ganeti.masterd.instance.CalculateGroupIPolicy(cluster,
                                                            self.group_info)
    err = ComputeIPolicyInstanceViolation(ipolicy, instance, self.cfg)
    self._ErrorIf(err, constants.CV_EINSTANCEPOLICY, instance.name,
                  utils.CommaJoin(err), code=self.ETYPE_WARNING)

    # all expected LVs must be present on their nodes
    for node_uuid in node_vol_should:
      n_img = node_image[node_uuid]
      if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
        # ignore missing volumes on offline or broken nodes
        continue
      for volume in node_vol_should[node_uuid]:
        test = volume not in n_img.volumes
        self._ErrorIf(test, constants.CV_EINSTANCEMISSINGDISK, instance.name,
                      "volume %s missing on node %s", volume,
                      self.cfg.GetNodeName(node_uuid))

    if instance.admin_state == constants.ADMINST_UP:
      test = instance.uuid not in pnode_img.instances and not pnode_img.offline
      self._ErrorIf(test, constants.CV_EINSTANCEDOWN, instance.name,
                    "instance not running on its primary node %s",
                    self.cfg.GetNodeName(pnode_uuid))
      self._ErrorIf(pnode_img.offline, constants.CV_EINSTANCEBADNODE,
                    instance.name, "instance is marked as running and lives on"
                    " offline node %s", self.cfg.GetNodeName(pnode_uuid))

    # flatten diskstatus into (node, success, status, disk-index) tuples
    diskdata = [(nname, success, status, idx)
                for (nname, disks) in diskstatus.items()
                for idx, (success, status) in enumerate(disks)]

    for nname, success, bdev_status, idx in diskdata:
      # the 'ghost node' construction in Exec() ensures that we have a
      # node here
      snode = node_image[nname]
      bad_snode = snode.ghost or snode.offline
      self._ErrorIf(instance.disks_active and
                    not success and not bad_snode,
                    constants.CV_EINSTANCEFAULTYDISK, instance.name,
                    "couldn't retrieve status for disk/%s on %s: %s",
                    idx, self.cfg.GetNodeName(nname), bdev_status)

      if instance.disks_active and success and bdev_status.is_degraded:
        msg = "disk/%s on %s is degraded" % (idx, self.cfg.GetNodeName(nname))

        # a degraded disk is only a warning while its local state is
        # acceptable (okay or resyncing)
        code = self.ETYPE_ERROR
        accepted_lds = [constants.LDS_OKAY, constants.LDS_SYNC]

        if bdev_status.ldisk_status in accepted_lds:
          code = self.ETYPE_WARNING

        msg += "; local disk state is '%s'" % \
               constants.LDS_NAMES[bdev_status.ldisk_status]

        self._Error(constants.CV_EINSTANCEFAULTYDISK, instance.name, msg,
                    code=code)

    self._ErrorIf(pnode_img.rpc_fail and not pnode_img.offline,
                  constants.CV_ENODERPC, self.cfg.GetNodeName(pnode_uuid),
                  "instance %s, connection to primary node failed",
                  instance.name)

    secondary_nodes = self.cfg.GetInstanceSecondaryNodes(instance.uuid)
    self._ErrorIf(len(secondary_nodes) > 1,
                  constants.CV_EINSTANCELAYOUT, instance.name,
                  "instance has multiple secondary nodes: %s",
                  utils.CommaJoin(secondary_nodes),
                  code=self.ETYPE_WARNING)

    inst_nodes = self.cfg.GetInstanceNodes(instance.uuid)
    es_flags = rpc.GetExclusiveStorageForNodes(self.cfg, inst_nodes)
    disks = self.cfg.GetInstanceDisks(instance.uuid)
    if any(es_flags.values()):
      if not utils.AllDiskOfType(disks, constants.DTS_EXCL_STORAGE):
        # Disk template not compatible with exclusive_storage: no instance
        # node should have the flag set
        es_nodes = [n
                    for (n, es) in es_flags.items()
                    if es]
        unsupported = [d.dev_type for d in disks
                       if d.dev_type not in constants.DTS_EXCL_STORAGE]
        self._Error(constants.CV_EINSTANCEUNSUITABLENODE, instance.name,
                    "instance uses disk types %s, which are not supported on"
                    " nodes that have exclusive storage set: %s",
                    utils.CommaJoin(unsupported),
                    utils.CommaJoin(self.cfg.GetNodeNames(es_nodes)))
      # with exclusive storage every disk needs its spindles configured
      for (idx, disk) in enumerate(disks):
        self._ErrorIf(disk.spindles is None,
                      constants.CV_EINSTANCEMISSINGCFGPARAMETER, instance.name,
                      "number of spindles not configured for disk %s while"
                      " exclusive storage is enabled, try running"
                      " gnt-cluster repair-disk-sizes", idx)

    if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR):
      # internally-mirrored disks: primary and secondaries should all be
      # in this group, otherwise verification cannot be complete
      instance_nodes = utils.NiceSort(inst_nodes)
      instance_groups = {}

      for node_uuid in instance_nodes:
        instance_groups.setdefault(self.all_node_info[node_uuid].group,
                                   []).append(node_uuid)

      pretty_list = [
        "%s (group %s)" % (utils.CommaJoin(self.cfg.GetNodeNames(nodes)),
                           groupinfo[group].name)
        # Sort so that we always list the primary node first.
        for group, nodes in sorted(instance_groups.items(),
                                   key=lambda (_, nodes): pnode_uuid in nodes,
                                   reverse=True)]

      self._ErrorIf(len(instance_groups) > 1,
                    constants.CV_EINSTANCESPLITGROUPS,
                    instance.name, "instance has primary and secondary nodes in"
                    " different groups: %s", utils.CommaJoin(pretty_list),
                    code=self.ETYPE_WARNING)

    inst_nodes_offline = []
    for snode in secondary_nodes:
      s_img = node_image[snode]
      self._ErrorIf(s_img.rpc_fail and not s_img.offline, constants.CV_ENODERPC,
                    self.cfg.GetNodeName(snode),
                    "instance %s, connection to secondary node failed",
                    instance.name)

      if s_img.offline:
        inst_nodes_offline.append(snode)

    # warn that the instance lives on offline nodes
    self._ErrorIf(inst_nodes_offline, constants.CV_EINSTANCEBADNODE,
                  instance.name, "instance has offline secondary node(s) %s",
                  utils.CommaJoin(self.cfg.GetNodeNames(inst_nodes_offline)))
    # ... or ghost/non-vm_capable nodes
    for node_uuid in inst_nodes:
      self._ErrorIf(node_image[node_uuid].ghost, constants.CV_EINSTANCEBADNODE,
                    instance.name, "instance lives on ghost node %s",
                    self.cfg.GetNodeName(node_uuid))
      self._ErrorIf(not node_image[node_uuid].vm_capable,
                    constants.CV_EINSTANCEBADNODE, instance.name,
                    "instance lives on non-vm_capable node %s",
                    self.cfg.GetNodeName(node_uuid))
887
888 def _VerifyOrphanVolumes(self, vg_name, node_vol_should, node_image,
889 reserved):
890 """Verify if there are any unknown volumes in the cluster.
891
892 The .os, .swap and backup volumes are ignored. All other volumes are
893 reported as unknown.
894
895 @type vg_name: string
896 @param vg_name: the name of the Ganeti-administered volume group
897 @type reserved: L{ganeti.utils.FieldSet}
898 @param reserved: a FieldSet of reserved volume names
899
900 """
901 for node_uuid, n_img in node_image.items():
902 if (n_img.offline or n_img.rpc_fail or n_img.lvm_fail or
903 self.all_node_info[node_uuid].group != self.group_uuid):
904 # skip non-healthy nodes
905 continue
906 for volume in n_img.volumes:
907 # skip volumes not belonging to the ganeti-administered volume group
908 if volume.split('/')[0] != vg_name:
909 continue
910
911 test = ((node_uuid not in node_vol_should or
912 volume not in node_vol_should[node_uuid]) and
913 not reserved.Matches(volume))
914 self._ErrorIf(test, constants.CV_ENODEORPHANLV,
915 self.cfg.GetNodeName(node_uuid),
916 "volume %s is unknown", volume,
917 code=_VerifyErrors.ETYPE_WARNING)
918
919 def _VerifyNPlusOneMemory(self, node_image, all_insts):
920 """Verify N+1 Memory Resilience.
921
922 Check that if one single node dies we can still start all the
923 instances it was primary for.
924
925 """
926 cluster_info = self.cfg.GetClusterInfo()
927 for node_uuid, n_img in node_image.items():
928 # This code checks that every node which is now listed as
929 # secondary has enough memory to host all instances it is
930 # supposed to should a single other node in the cluster fail.
931 # FIXME: not ready for failover to an arbitrary node
932 # FIXME: does not support file-backed instances
933 # WARNING: we currently take into account down instances as well
934 # as up ones, considering that even if they're down someone
935 # might want to start them even in the event of a node failure.
936 if n_img.offline or \
937 self.all_node_info[node_uuid].group != self.group_uuid:
938 # we're skipping nodes marked offline and nodes in other groups from
939 # the N+1 warning, since most likely we don't have good memory
940 # information from them; we already list instances living on such
941 # nodes, and that's enough warning
942 continue
943 #TODO(dynmem): also consider ballooning out other instances
944 for prinode, inst_uuids in n_img.sbp.items():
945 needed_mem = 0
946 for inst_uuid in inst_uuids:
947 bep = cluster_info.FillBE(all_insts[inst_uuid])
948 if bep[constants.BE_AUTO_BALANCE]:
949 needed_mem += bep[constants.BE_MINMEM]
950 test = n_img.mfree < needed_mem
951 self._ErrorIf(test, constants.CV_ENODEN1,
952 self.cfg.GetNodeName(node_uuid),
953 "not enough memory to accomodate instance failovers"
954 " should node %s fail (%dMiB needed, %dMiB available)",
955 self.cfg.GetNodeName(prinode), needed_mem, n_img.mfree)
956
957 def _VerifyClientCertificates(self, nodes, all_nvinfo):
958 """Verifies the consistency of the client certificates.
959
960 This includes several aspects:
961 - the individual validation of all nodes' certificates
962 - the consistency of the master candidate certificate map
963 - the consistency of the master candidate certificate map with the
964 certificates that the master candidates are actually using.
965
966 @param nodes: the list of nodes to consider in this verification
967 @param all_nvinfo: the map of results of the verify_node call to
968 all nodes
969
970 """
971 candidate_certs = self.cfg.GetClusterInfo().candidate_certs
972 if candidate_certs is None or len(candidate_certs) == 0:
973 self._ErrorIf(
974 True, constants.CV_ECLUSTERCLIENTCERT, None,
975 "The cluster's list of master candidate certificates is empty."
976 " If you just updated the cluster, please run"
977 " 'gnt-cluster renew-crypto --new-node-certificates'.")
978 return
979
980 self._ErrorIf(
981 len(candidate_certs) != len(set(candidate_certs.values())),
982 constants.CV_ECLUSTERCLIENTCERT, None,
983 "There are at least two master candidates configured to use the same"
984 " certificate.")
985
986 # collect the client certificate
987 for node in nodes:
988 if node.offline:
989 continue
990
991 nresult = all_nvinfo[node.uuid]
992 if nresult.fail_msg or not nresult.payload:
993 continue
994
995 (errcode, msg) = nresult.payload.get(constants.NV_CLIENT_CERT, None)
996
997 self._ErrorIf(
998 errcode is not None, constants.CV_ECLUSTERCLIENTCERT, None,
999 "Client certificate of node '%s' failed validation: %s (code '%s')",
1000 node.uuid, msg, errcode)
1001
1002 if not errcode:
1003 digest = msg
1004 if node.master_candidate:
1005 if node.uuid in candidate_certs:
1006 self._ErrorIf(
1007 digest != candidate_certs[node.uuid],
1008 constants.CV_ECLUSTERCLIENTCERT, None,
1009 "Client certificate digest of master candidate '%s' does not"
1010 " match its entry in the cluster's map of master candidate"
1011 " certificates. Expected: %s Got: %s", node.uuid,
1012 digest, candidate_certs[node.uuid])
1013 else:
1014 self._ErrorIf(
1015 True, constants.CV_ECLUSTERCLIENTCERT, None,
1016 "The master candidate '%s' does not have an entry in the"
1017 " map of candidate certificates.", node.uuid)
1018 self._ErrorIf(
1019 digest in candidate_certs.values(),
1020 constants.CV_ECLUSTERCLIENTCERT, None,
1021 "Master candidate '%s' is using a certificate of another node.",
1022 node.uuid)
1023 else:
1024 self._ErrorIf(
1025 node.uuid in candidate_certs,
1026 constants.CV_ECLUSTERCLIENTCERT, None,
1027 "Node '%s' is not a master candidate, but still listed in the"
1028 " map of master candidate certificates.", node.uuid)
1029 self._ErrorIf(
1030 (node.uuid not in candidate_certs) and
1031 (digest in candidate_certs.values()),
1032 constants.CV_ECLUSTERCLIENTCERT, None,
1033 "Node '%s' is not a master candidate and is incorrectly using a"
1034 " certificate of another node which is master candidate.",
1035 node.uuid)
1036
1037 def _VerifySshSetup(self, nodes, all_nvinfo):
1038 """Evaluates the verification results of the SSH setup and clutter test.
1039
1040 @param nodes: List of L{objects.Node} objects
1041 @param all_nvinfo: RPC results
1042
1043 """
1044 for node in nodes:
1045 if not node.offline:
1046 nresult = all_nvinfo[node.uuid]
1047 if nresult.fail_msg or not nresult.payload:
1048 self._ErrorIf(True, constants.CV_ENODESSH, node.name,
1049 "Could not verify the SSH setup of this node.")
1050 return
1051 for ssh_test in [constants.NV_SSH_SETUP, constants.NV_SSH_CLUTTER]:
1052 result = nresult.payload.get(ssh_test, None)
1053 error_msg = ""
1054 if isinstance(result, list):
1055 error_msg = " ".join(result)
1056 self._ErrorIf(result,
1057 constants.CV_ENODESSH, None, error_msg)
1058
  def _VerifyFiles(self, nodes, master_node_uuid, all_nvinfo,
                   (files_all, files_opt, files_mc, files_vm)):
    """Verifies file checksums collected from all nodes.

    Note: the last parameter is a Python 2 tuple parameter; it unpacks the
    ancillary-file sets computed by C{ComputeAncillaryFiles}: files expected
    on all nodes, optional files, master-candidate-only files and
    vm-capable-only files.

    @param nodes: List of L{objects.Node} objects
    @param master_node_uuid: UUID of master node
    @param all_nvinfo: RPC results

    """
    # Define functions determining which nodes to consider for a file
    files2nodefn = [
      (files_all, None),
      (files_mc, lambda node: (node.master_candidate or
                               node.uuid == master_node_uuid)),
      (files_vm, lambda node: node.vm_capable),
      ]

    # Build mapping from filename to list of nodes which should have the file
    nodefiles = {}
    for (files, fn) in files2nodefn:
      if fn is None:
        filenodes = nodes
      else:
        filenodes = filter(fn, nodes)
      nodefiles.update((filename,
                        frozenset(map(operator.attrgetter("uuid"), filenodes)))
                       for filename in files)

    assert set(nodefiles) == (files_all | files_mc | files_vm)

    # filename -> {checksum -> set of node UUIDs reporting that checksum}
    fileinfo = dict((filename, {}) for filename in nodefiles)
    # nodes for which we have no usable data; excluded from all comparisons
    ignore_nodes = set()

    for node in nodes:
      if node.offline:
        ignore_nodes.add(node.uuid)
        continue

      nresult = all_nvinfo[node.uuid]

      if nresult.fail_msg or not nresult.payload:
        node_files = None
      else:
        # paths come back in "virtual cluster" form; translate them to the
        # local naming used by the configuration
        fingerprints = nresult.payload.get(constants.NV_FILELIST, {})
        node_files = dict((vcluster.LocalizeVirtualPath(key), value)
                          for (key, value) in fingerprints.items())
        del fingerprints

      test = not (node_files and isinstance(node_files, dict))
      self._ErrorIf(test, constants.CV_ENODEFILECHECK, node.name,
                    "Node did not return file checksum data")
      if test:
        ignore_nodes.add(node.uuid)
        continue

      # Build per-checksum mapping from filename to nodes having it
      for (filename, checksum) in node_files.items():
        assert filename in nodefiles
        fileinfo[filename].setdefault(checksum, set()).add(node.uuid)

    for (filename, checksums) in fileinfo.items():
      assert compat.all(len(i) > 10 for i in checksums), "Invalid checksum"

      # Nodes having the file
      with_file = frozenset(node_uuid
                            for node_uuids in fileinfo[filename].values()
                            for node_uuid in node_uuids) - ignore_nodes

      expected_nodes = nodefiles[filename] - ignore_nodes

      # Nodes missing file
      missing_file = expected_nodes - with_file

      if filename in files_opt:
        # All or no nodes
        self._ErrorIf(missing_file and missing_file != expected_nodes,
                      constants.CV_ECLUSTERFILECHECK, None,
                      "File %s is optional, but it must exist on all or no"
                      " nodes (not found on %s)",
                      filename,
                      utils.CommaJoin(
                        utils.NiceSort(
                          map(self.cfg.GetNodeName, missing_file))))
      else:
        self._ErrorIf(missing_file, constants.CV_ECLUSTERFILECHECK, None,
                      "File %s is missing from node(s) %s", filename,
                      utils.CommaJoin(
                        utils.NiceSort(
                          map(self.cfg.GetNodeName, missing_file))))

        # Warn if a node has a file it shouldn't
        unexpected = with_file - expected_nodes
        self._ErrorIf(unexpected,
                      constants.CV_ECLUSTERFILECHECK, None,
                      "File %s should not exist on node(s) %s",
                      filename, utils.CommaJoin(
                        utils.NiceSort(map(self.cfg.GetNodeName, unexpected))))

      # See if there are multiple versions of the file
      test = len(checksums) > 1
      if test:
        variants = ["variant %s on %s" %
                    (idx + 1,
                     utils.CommaJoin(utils.NiceSort(
                       map(self.cfg.GetNodeName, node_uuids))))
                    for (idx, (checksum, node_uuids)) in
                      enumerate(sorted(checksums.items()))]
      else:
        variants = []

      self._ErrorIf(test, constants.CV_ECLUSTERFILECHECK, None,
                    "File %s found with %s different checksums (%s)",
                    filename, len(checksums), "; ".join(variants))
1172
1173 def _VerifyNodeDrbdHelper(self, ninfo, nresult, drbd_helper):
1174 """Verify the drbd helper.
1175
1176 """
1177 if drbd_helper:
1178 helper_result = nresult.get(constants.NV_DRBDHELPER, None)
1179 test = (helper_result is None)
1180 self._ErrorIf(test, constants.CV_ENODEDRBDHELPER, ninfo.name,
1181 "no drbd usermode helper returned")
1182 if helper_result:
1183 status, payload = helper_result
1184 test = not status
1185 self._ErrorIf(test, constants.CV_ENODEDRBDHELPER, ninfo.name,
1186 "drbd usermode helper check unsuccessful: %s", payload)
1187 test = status and (payload != drbd_helper)
1188 self._ErrorIf(test, constants.CV_ENODEDRBDHELPER, ninfo.name,
1189 "wrong drbd usermode helper: %s", payload)
1190
1191 @staticmethod
1192 def _ComputeDrbdMinors(ninfo, instanceinfo, disks_info, drbd_map, error_if):
1193 """Gives the DRBD information in a map for a node.
1194
1195 @type ninfo: L{objects.Node}
1196 @param ninfo: the node to check
1197 @param instanceinfo: the dict of instances
1198 @param disks_info: the dict of disks
1199 @param drbd_map: the DRBD map as returned by
1200 L{ganeti.config.ConfigWriter.ComputeDRBDMap}
1201 @type error_if: callable like L{_ErrorIf}
1202 @param error_if: The error reporting function
1203 @return: dict from minor number to (disk_uuid, instance_uuid, active)
1204
1205 """
1206 node_drbd = {}
1207 for minor, disk_uuid in drbd_map[ninfo.uuid].items():
1208 test = disk_uuid not in disks_info
1209 error_if(test, constants.CV_ECLUSTERCFG, None,
1210 "ghost disk '%s' in temporary DRBD map", disk_uuid)
1211 # ghost disk should not be active, but otherwise we
1212 # don't give double warnings (both ghost disk and
1213 # unallocated minor in use)
1214 if test:
1215 node_drbd[minor] = (disk_uuid, None, False)
1216 else:
1217 disk_active = False
1218 disk_instance = None
1219 for (inst_uuid, inst) in instanceinfo.items():
1220 if disk_uuid in inst.disks:
1221 disk_active = inst.disks_active
1222 disk_instance = inst_uuid
1223 break
1224 node_drbd[minor] = (disk_uuid, disk_instance, disk_active)
1225 return node_drbd
1226
1227 def _VerifyNodeDrbd(self, ninfo, nresult, instanceinfo, disks_info,
1228 drbd_helper, drbd_map):
1229 """Verifies and the node DRBD status.
1230
1231 @type ninfo: L{objects.Node}
1232 @param ninfo: the node to check
1233 @param nresult: the remote results for the node
1234 @param instanceinfo: the dict of instances
1235 @param disks_info: the dict of disks
1236 @param drbd_helper: the configured DRBD usermode helper
1237 @param drbd_map: the DRBD map as returned by
1238 L{ganeti.config.ConfigWriter.ComputeDRBDMap}
1239
1240 """
1241 self._VerifyNodeDrbdHelper(ninfo, nresult, drbd_helper)
1242
1243 # compute the DRBD minors
1244 node_drbd = self._ComputeDrbdMinors(ninfo, instanceinfo, disks_info,
1245 drbd_map, self._ErrorIf)
1246
1247 # and now check them
1248 used_minors = nresult.get(constants.NV_DRBDLIST, [])
1249 test = not isinstance(used_minors, (tuple, list))
1250 self._ErrorIf(test, constants.CV_ENODEDRBD, ninfo.name,
1251 "cannot parse drbd status file: %s", str(used_minors))
1252 if test:
1253 # we cannot check drbd status
1254 return
1255
1256 for minor, (disk_uuid, inst_uuid, must_exist) in node_drbd.items():
1257 test = minor not in used_minors and must_exist
1258 if inst_uuid is not None:
1259 attached = "(attached in instance '%s')" % \
1260 self.cfg.GetInstanceName(inst_uuid)
1261 else:
1262 attached = "(detached)"
1263 self._ErrorIf(test, constants.CV_ENODEDRBD, ninfo.name,
1264 "drbd minor %d of disk %s %s is not active",
1265 minor, disk_uuid, attached)
1266 for minor in used_minors:
1267 test = minor not in node_drbd
1268 self._ErrorIf(test, constants.CV_ENODEDRBD, ninfo.name,
1269 "unallocated drbd minor %d is in use", minor)
1270
1271 def _UpdateNodeOS(self, ninfo, nresult, nimg):
1272 """Builds the node OS structures.
1273
1274 @type ninfo: L{objects.Node}
1275 @param ninfo: the node to check
1276 @param nresult: the remote results for the node
1277 @param nimg: the node image object
1278
1279 """
1280 remote_os = nresult.get(constants.NV_OSLIST, None)
1281 test = (not isinstance(remote_os, list) or
1282 not compat.all(isinstance(v, list) and len(v) == 8
1283 for v in remote_os))
1284
1285 self._ErrorIf(test, constants.CV_ENODEOS, ninfo.name,
1286 "node hasn't returned valid OS data")
1287
1288 nimg.os_fail = test
1289
1290 if test:
1291 return
1292
1293 os_dict = {}
1294
1295 for (name, os_path, status, diagnose,
1296 variants, parameters, api_ver,
1297 trusted) in nresult[constants.NV_OSLIST]:
1298
1299 if name not in os_dict:
1300 os_dict[name] = []
1301
1302 # parameters is a list of lists instead of list of tuples due to
1303 # JSON lacking a real tuple type, fix it:
1304 parameters = [tuple(v) for v in parameters]
1305 os_dict[name].append((os_path, status, diagnose,
1306 set(variants), set(parameters), set(api_ver),
1307 trusted))
1308
1309 nimg.oslist = os_dict
1310
  def _VerifyNodeOS(self, ninfo, nimg, base):
    """Verifies the node OS list.

    Compares this node's OS inventory (built by C{_UpdateNodeOS}) against
    the reference node's, reporting invalid, duplicated, extra, missing and
    differing OS definitions.

    @type ninfo: L{objects.Node}
    @param ninfo: the node to check
    @param nimg: the node image object
    @param base: the 'template' node we match against (e.g. from the master)

    """
    assert not nimg.os_fail, "Entered _VerifyNodeOS with failed OS rpc?"

    # render a parameter list as "key: value" strings for the messages below
    beautify_params = lambda l: ["%s: %s" % (k, v) for (k, v) in l]
    for os_name, os_data in nimg.oslist.items():
      assert os_data, "Empty OS status for OS %s?!" % os_name
      # only the first entry counts; extra entries are flagged just below
      f_path, f_status, f_diag, f_var, f_param, f_api, f_trusted = os_data[0]
      self._ErrorIf(not f_status, constants.CV_ENODEOS, ninfo.name,
                    "Invalid OS %s (located at %s): %s",
                    os_name, f_path, f_diag)
      self._ErrorIf(len(os_data) > 1, constants.CV_ENODEOS, ninfo.name,
                    "OS '%s' has multiple entries"
                    " (first one shadows the rest): %s",
                    os_name, utils.CommaJoin([v[0] for v in os_data]))
      # comparisons with the 'base' image
      test = os_name not in base.oslist
      self._ErrorIf(test, constants.CV_ENODEOS, ninfo.name,
                    "Extra OS %s not present on reference node (%s)",
                    os_name, self.cfg.GetNodeName(base.uuid))
      if test:
        continue
      assert base.oslist[os_name], "Base node has empty OS status?"
      _, b_status, _, b_var, b_param, b_api, b_trusted = base.oslist[os_name][0]
      if not b_status:
        # base OS is invalid, skipping
        continue
      # set-valued fields are compared element-wise and printed sorted
      for kind, a, b in [("API version", f_api, b_api),
                         ("variants list", f_var, b_var),
                         ("parameters", beautify_params(f_param),
                          beautify_params(b_param))]:
        self._ErrorIf(a != b, constants.CV_ENODEOS, ninfo.name,
                      "OS %s for %s differs from reference node %s:"
                      " [%s] vs. [%s]", kind, os_name,
                      self.cfg.GetNodeName(base.uuid),
                      utils.CommaJoin(sorted(a)), utils.CommaJoin(sorted(b)))
      # 'trusted' is a scalar flag, hence the separate (unsorted) message
      for kind, a, b in [("trusted", f_trusted, b_trusted)]:
        self._ErrorIf(a != b, constants.CV_ENODEOS, ninfo.name,
                      "OS %s for %s differs from reference node %s:"
                      " %s vs. %s", kind, os_name,
                      self.cfg.GetNodeName(base.uuid), a, b)

    # check any missing OSes
    missing = set(base.oslist.keys()).difference(nimg.oslist.keys())
    self._ErrorIf(missing, constants.CV_ENODEOS, ninfo.name,
                  "OSes present on reference node %s"
                  " but missing on this node: %s",
                  self.cfg.GetNodeName(base.uuid), utils.CommaJoin(missing))
1366
1367 def _VerifyAcceptedFileStoragePaths(self, ninfo, nresult, is_master):
1368 """Verifies paths in L{pathutils.FILE_STORAGE_PATHS_FILE}.
1369
1370 @type ninfo: L{objects.Node}
1371 @param ninfo: the node to check
1372 @param nresult: the remote results for the node
1373 @type is_master: bool
1374 @param is_master: Whether node is the master node
1375
1376 """
1377 cluster = self.cfg.GetClusterInfo()
1378 if (is_master and
1379 (cluster.IsFileStorageEnabled() or
1380 cluster.IsSharedFileStorageEnabled())):
1381 try:
1382 fspaths = nresult[constants.NV_ACCEPTED_STORAGE_PATHS]
1383 except KeyError:
1384 # This should never happen
1385 self._ErrorIf(True, constants.CV_ENODEFILESTORAGEPATHS, ninfo.name,
1386 "Node did not return forbidden file storage paths")
1387 else:
1388 self._ErrorIf(fspaths, constants.CV_ENODEFILESTORAGEPATHS, ninfo.name,
1389 "Found forbidden file storage paths: %s",
1390 utils.CommaJoin(fspaths))
1391 else:
1392 self._ErrorIf(constants.NV_ACCEPTED_STORAGE_PATHS in nresult,
1393 constants.CV_ENODEFILESTORAGEPATHS, ninfo.name,
1394 "Node should not have returned forbidden file storage"
1395 " paths")
1396
1397 def _VerifyStoragePaths(self, ninfo, nresult, file_disk_template,
1398 verify_key, error_key):
1399 """Verifies (file) storage paths.
1400
1401 @type ninfo: L{objects.Node}
1402 @param ninfo: the node to check
1403 @param nresult: the remote results for the node
1404 @type file_disk_template: string
1405 @param file_disk_template: file-based disk template, whose directory
1406 is supposed to be verified
1407 @type verify_key: string
1408 @param verify_key: key for the verification map of this file
1409 verification step
1410 @param error_key: error key to be added to the verification results
1411 in case something goes wrong in this verification step
1412
1413 """
1414 assert (file_disk_template in utils.storage.GetDiskTemplatesOfStorageTypes(
1415 constants.ST_FILE, constants.ST_SHARED_FILE, constants.ST_GLUSTER
1416 ))
1417
1418 cluster = self.cfg.GetClusterInfo()
1419 if cluster.IsDiskTemplateEnabled(file_disk_template):
1420 self._ErrorIf(
1421 verify_key in nresult,
1422 error_key, ninfo.name,
1423 "The configured %s storage path is unusable: %s" %
1424 (file_disk_template, nresult.get(verify_key)))
1425
1426 def _VerifyFileStoragePaths(self, ninfo, nresult):
1427 """Verifies (file) storage paths.
1428
1429 @see: C{_VerifyStoragePaths}
1430
1431 """
1432 self._VerifyStoragePaths(
1433 ninfo, nresult, constants.DT_FILE,
1434 constants.NV_FILE_STORAGE_PATH,
1435 constants.CV_ENODEFILESTORAGEPATHUNUSABLE)
1436
1437 def _VerifySharedFileStoragePaths(self, ninfo, nresult):
1438 """Verifies (file) storage paths.
1439
1440 @see: C{_VerifyStoragePaths}
1441
1442 """
1443 self._VerifyStoragePaths(
1444 ninfo, nresult, constants.DT_SHARED_FILE,
1445 constants.NV_SHARED_FILE_STORAGE_PATH,
1446 constants.CV_ENODESHAREDFILESTORAGEPATHUNUSABLE)
1447
1448 def _VerifyGlusterStoragePaths(self, ninfo, nresult):
1449 """Verifies (file) storage paths.
1450
1451 @see: C{_VerifyStoragePaths}
1452
1453 """
1454 self._VerifyStoragePaths(
1455 ninfo, nresult, constants.DT_GLUSTER,
1456 constants.NV_GLUSTER_STORAGE_PATH,
1457 constants.CV_ENODEGLUSTERSTORAGEPATHUNUSABLE)
1458
1459 def _VerifyOob(self, ninfo, nresult):
1460 """Verifies out of band functionality of a node.
1461
1462 @type ninfo: L{objects.Node}
1463 @param ninfo: the node to check
1464 @param nresult: the remote results for the node
1465
1466 """
1467 # We just have to verify the paths on master and/or master candidates
1468 # as the oob helper is invoked on the master
1469 if ((ninfo.master_candidate or ninfo.master_capable) and
1470 constants.NV_OOB_PATHS in nresult):
1471 for path_result in nresult[constants.NV_OOB_PATHS]:
1472 self._ErrorIf(path_result, constants.CV_ENODEOOBPATH,
1473 ninfo.name, path_result)
1474
1475 def _UpdateNodeVolumes(self, ninfo, nresult, nimg, vg_name):
1476 """Verifies and updates the node volume data.
1477
1478 This function will update a L{NodeImage}'s internal structures
1479 with data from the remote call.
1480
1481 @type ninfo: L{objects.Node}
1482 @param ninfo: the node to check
1483 @param nresult: the remote results for the node
1484 @param nimg: the node image object
1485 @param vg_name: the configured VG name
1486
1487 """
1488 nimg.lvm_fail = True
1489 lvdata = nresult.get(constants.NV_LVLIST, "Missing LV data")
1490 if vg_name is None:
1491 pass
1492 elif isinstance(lvdata, basestring):
1493 self._ErrorIf(True, constants.CV_ENODELVM, ninfo.name,
1494 "LVM problem on node: %s", utils.SafeEncode(lvdata))
1495 elif not isinstance(lvdata, dict):
1496 self._ErrorIf(True, constants.CV_ENODELVM, ninfo.name,
1497 "rpc call to node failed (lvlist)")
1498 else:
1499 nimg.volumes = lvdata
1500 nimg.lvm_fail = False
1501
1502 def _UpdateNodeInstances(self, ninfo, nresult, nimg):
1503 """Verifies and updates the node instance list.
1504
1505 If the listing was successful, then updates this node's instance
1506 list. Otherwise, it marks the RPC call as failed for the instance
1507 list key.
1508
1509 @type ninfo: L{objects.Node}
1510 @param ninfo: the node to check
1511 @param nresult: the remote results for the node
1512 @param nimg: the node image object
1513
1514 """
1515 idata = nresult.get(constants.NV_INSTANCELIST, None)
1516 test = not isinstance(idata, list)
1517 self._ErrorIf(test, constants.CV_ENODEHV, ninfo.name,
1518 "rpc call to node failed (instancelist): %s",
1519 utils.SafeEncode(str(idata)))
1520 if test:
1521 nimg.hyp_fail = True
1522 else:
1523 nimg.instances = [uuid for (uuid, _) in
1524 self.cfg.GetMultiInstanceInfoByName(idata)]
1525
1526 def _UpdateNodeInfo(self, ninfo, nresult, nimg, vg_name):
1527 """Verifies and computes a node information map
1528
1529 @type ninfo: L{objects.Node}
1530 @param ninfo: the node to check
1531 @param nresult: the remote results for the node
1532 @param nimg: the node image object
1533 @param vg_name: the configured VG name
1534
1535 """
1536 # try to read free memory (from the hypervisor)
1537 hv_info = nresult.get(constants.NV_HVINFO, None)
1538 test = not isinstance(hv_info, dict) or "memory_free" not in hv_info
1539 self._ErrorIf(test, constants.CV_ENODEHV, ninfo.name,
1540 "rpc call to node failed (hvinfo)")
1541 if not test:
1542 try:
1543 nimg.mfree = int(hv_info["memory_free"])
1544 except (ValueError, TypeError):
1545 self._ErrorIf(True, constants.CV_ENODERPC, ninfo.name,
1546 "node returned invalid nodeinfo, check hypervisor")
1547
1548 # FIXME: devise a free space model for file based instances as well
1549 if vg_name is not None:
1550 test = (constants.NV_VGLIST not in nresult or
1551 vg_name not in nresult[constants.NV_VGLIST])
1552 self._ErrorIf(test, constants.CV_ENODELVM, ninfo.name,
1553 "node didn't return data for the volume group '%s'"
1554 " - it is either missing or broken", vg_name)
1555 if not test:
1556 try:
1557 nimg.dfree = int(nresult[constants.NV_VGLIST][vg_name])
1558 except (ValueError, TypeError):
1559 self._ErrorIf(True, constants.CV_ENODERPC, ninfo.name,
1560 "node returned invalid LVM info, check LVM status")
1561
  def _CollectDiskInfo(self, node_uuids, node_image, instanceinfo):
    """Gets per-disk status information for all instances.

    @type node_uuids: list of strings
    @param node_uuids: Node UUIDs
    @type node_image: dict of (UUID, L{objects.Node})
    @param node_image: Node objects
    @type instanceinfo: dict of (UUID, L{objects.Instance})
    @param instanceinfo: Instance objects
    @rtype: {instance: {node: [(success, payload)]}}
    @return: a dictionary of per-instance dictionaries with nodes as
      keys and disk information as values; the disk information is a
      list of tuples (success, payload)

    """
    # per-node list of (instance_uuid, disk) pairs to query
    node_disks = {}
    # same disks, annotated and paired with their instance objects, as
    # required by the mirror-status RPC
    node_disks_dev_inst_only = {}
    # instances whose configuration has no disks at all
    diskless_instances = set()
    # instances with disks, none of which live on the queried nodes
    nodisk_instances = set()

    for nuuid in node_uuids:
      # consider instances with this node as primary or secondary
      node_inst_uuids = list(itertools.chain(node_image[nuuid].pinst,
                                             node_image[nuuid].sinst))
      diskless_instances.update(uuid for uuid in node_inst_uuids
                                if not instanceinfo[uuid].disks)
      disks = [(inst_uuid, disk)
               for inst_uuid in node_inst_uuids
               for disk in self.cfg.GetInstanceDisks(inst_uuid)]

      if not disks:
        nodisk_instances.update(uuid for uuid in node_inst_uuids
                                if instanceinfo[uuid].disks)
        # No need to collect data
        continue

      node_disks[nuuid] = disks

      # _AnnotateDiskParams makes already copies of the disks
      dev_inst_only = []
      for (inst_uuid, dev) in disks:
        (anno_disk,) = AnnotateDiskParams(instanceinfo[inst_uuid], [dev],
                                          self.cfg)
        dev_inst_only.append((anno_disk, instanceinfo[inst_uuid]))

      node_disks_dev_inst_only[nuuid] = dev_inst_only

    assert len(node_disks) == len(node_disks_dev_inst_only)

    # Collect data from all nodes with disks
    result = self.rpc.call_blockdev_getmirrorstatus_multi(
               node_disks.keys(), node_disks_dev_inst_only)

    assert len(result) == len(node_disks)

    instdisk = {}

    for (nuuid, nres) in result.items():
      node = self.cfg.GetNodeInfo(nuuid)
      disks = node_disks[node.uuid]

      if nres.offline:
        # No data from this node
        data = len(disks) * [(False, "node offline")]
      else:
        msg = nres.fail_msg
        self._ErrorIf(msg, constants.CV_ENODERPC, node.name,
                      "while getting disk information: %s", msg)
        if msg:
          # No data from this node
          data = len(disks) * [(False, msg)]
        else:
          data = []
          for idx, i in enumerate(nres.payload):
            if isinstance(i, (tuple, list)) and len(i) == 2:
              data.append(i)
            else:
              logging.warning("Invalid result from node %s, entry %d: %s",
                              node.name, idx, i)
              data.append((False, "Invalid result from the remote node"))

      # the RPC preserves ordering, so zip pairs each queried disk with its
      # status entry
      for ((inst_uuid, _), status) in zip(disks, data):
        instdisk.setdefault(inst_uuid, {}).setdefault(node.uuid, []) \
          .append(status)

    # Add empty entries for diskless instances.
    for inst_uuid in diskless_instances:
      assert inst_uuid not in instdisk
      instdisk[inst_uuid] = {}
    # ...and disk-full instances that happen to have no disks
    for inst_uuid in nodisk_instances:
      assert inst_uuid not in instdisk
      instdisk[inst_uuid] = {}

    assert compat.all(len(statuses) == len(instanceinfo[inst].disks) and
                      len(nuuids) <= len(
                        self.cfg.GetInstanceNodes(instanceinfo[inst].uuid)) and
                      compat.all(isinstance(s, (tuple, list)) and
                                 len(s) == 2 for s in statuses)
                      for inst, nuuids in instdisk.items()
                      for nuuid, statuses in nuuids.items())
    if __debug__:
      instdisk_keys = set(instdisk)
      instanceinfo_keys = set(instanceinfo)
      assert instdisk_keys == instanceinfo_keys, \
        ("instdisk keys (%s) do not match instanceinfo keys (%s)" %
         (instdisk_keys, instanceinfo_keys))

    return instdisk
1670
1671 @staticmethod
1672 def _SshNodeSelector(group_uuid, all_nodes):
1673 """Create endless iterators for all potential SSH check hosts.
1674
1675 """
1676 nodes = [node for node in all_nodes
1677 if (node.group != group_uuid and
1678 not node.offline)]
1679 keyfunc = operator.attrgetter("group")
1680
1681 return map(itertools.cycle,
1682 [sorted(map(operator.attrgetter("name"), names))
1683 for _, names in itertools.groupby(sorted(nodes, key=keyfunc),
1684 keyfunc)])
1685
1686 @classmethod
1687 def _SelectSshCheckNodes(cls, group_nodes, group_uuid, all_nodes):
1688 """Choose which nodes should talk to which other nodes.
1689
1690 We will make nodes contact all nodes in their group, and one node from
1691 every other group.
1692
1693 @rtype: tuple of (string, dict of strings to list of strings, string)
1694 @return: a tuple containing the list of all online nodes, a dictionary
1695 mapping node names to additional nodes of other node groups to which
1696 connectivity should be tested, and a list of all online master
1697 candidates
1698
1699 @warning: This algorithm has a known issue if one node group is much
1700 smaller than others (e.g. just one node). In such a case all other
1701 nodes will talk to the single node.
1702
1703 """
1704 online_nodes = sorted(node.name for node in group_nodes if not node.offline)
1705 online_mcs = sorted(node.name for node in group_nodes
1706 if (node.master_candidate and not node.offline))
1707 sel = cls._SshNodeSelector(group_uuid, all_nodes)
1708
1709 return (online_nodes,
1710 dict((name, sorted([i.next() for i in sel]))
1711 for name in online_nodes),
1712 online_mcs)
1713
1714 def _PrepareSshSetupCheck(self):
1715 """Prepare the input data for the SSH setup verification.
1716
1717 """
1718 all_nodes_info = self.cfg.GetAllNodesInfo()
1719 potential_master_candidates = self.cfg.GetPotentialMasterCandidates()
1720 node_status = [
1721 (uuid, node_info.name, node_info.master_candidate,
1722 node_info.name in potential_master_candidates, not node_info.offline)
1723 for (uuid, node_info) in all_nodes_info.items()]
1724 return node_status
1725
1726 def BuildHooksEnv(self):
1727 """Build hooks env.
1728
1729 Cluster-Verify hooks just ran in the post phase and their failure makes
1730 the output be logged in the verify output and the verification to fail.
1731
1732 """
1733 env = {
1734 "CLUSTER_TAGS": " ".join(self.cfg.GetClusterInfo().GetTags()),
1735 }
1736
1737 env.update(("NODE_TAGS_%s" % node.name, " ".join(node.GetTags()))
1738 for node in self.my_node_info.values())
1739
1740 return env
1741
1742 def BuildHooksNodes(self):
1743 """Build hooks nodes.
1744
1745 """
1746 return ([], list(self.my_node_info.keys()))
1747
1748 @staticmethod
1749 def _VerifyOtherNotes(feedback_fn, i_non_redundant, i_non_a_balanced,
1750 i_offline, n_offline, n_drained):
1751 feedback_fn("* Other Notes")
1752 if i_non_redundant:
1753 feedback_fn(" - NOTICE: %d non-redundant instance(s) found."
1754 % len(i_non_redundant))
1755
1756 if i_non_a_balanced:
1757 feedback_fn(" - NOTICE: %d non-auto-balanced instance(s) found."
1758 % len(i_non_a_balanced))
1759
1760 if i_offline:
1761 feedback_fn(" - NOTICE: %d offline instance(s) found." % i_offline)
1762
1763 if n_offline:
1764 feedback_fn(" - NOTICE: %d offline node(s) found." % n_offline)
1765
1766 if n_drained:
1767 feedback_fn(" - NOTICE: %d drained node(s) found." % n_drained)
1768
  def Exec(self, feedback_fn): # pylint: disable=R0915
    """Verify integrity of the node group, performing various test on nodes.

    @param feedback_fn: callable used to report progress and findings to
        the caller, one message string per call
    @rtype: boolean
    @return: True if no problem was recorded, False otherwise; an empty
        node group trivially verifies as True

    """
    # This method has too many local variables. pylint: disable=R0914
    feedback_fn("* Verifying group '%s'" % self.group_info.name)

    if not self.my_node_uuids:
      # empty node group
      feedback_fn("* Empty node group, skipping verification")
      return True

    # Overall failure flag for the run; returned negated at the end
    self.bad = False
    verbose = self.op.verbose
    self._feedback_fn = feedback_fn

    vg_name = self.cfg.GetVGName()
    drbd_helper = self.cfg.GetDRBDHelper()
    cluster = self.cfg.GetClusterInfo()
    hypervisors = cluster.enabled_hypervisors
    node_data_list = self.my_node_info.values()

    i_non_redundant = [] # Non redundant instances
    i_non_a_balanced = [] # Non auto-balanced instances
    i_offline = 0 # Count of offline instances
    n_offline = 0 # Count of offline nodes
    n_drained = 0 # Count of nodes being drained
    node_vol_should = {}

    # FIXME: verify OS list

    # File verification
    filemap = ComputeAncillaryFiles(cluster, False)

    # do local checksums
    master_node_uuid = self.master_node = self.cfg.GetMasterNode()
    master_ip = self.cfg.GetMasterIP()

    feedback_fn("* Gathering data (%d nodes)" % len(self.my_node_uuids))

    user_scripts = []
    if self.cfg.GetUseExternalMipScript():
      user_scripts.append(pathutils.EXTERNAL_MASTER_SETUP_SCRIPT)

    # Parameters for the node-side verification RPC, one entry per check
    node_verify_param = {
      constants.NV_FILELIST:
        # NOTE(review): map() is a lazy iterator on Python 3 — presumably
        # the RPC serialization layer materializes it; confirm
        map(vcluster.MakeVirtualPath,
            utils.UniqueSequence(filename
                                 for files in filemap
                                 for filename in files)),
      constants.NV_NODELIST:
        self._SelectSshCheckNodes(node_data_list, self.group_uuid,
                                  self.all_node_info.values()),
      constants.NV_HYPERVISOR: hypervisors,
      constants.NV_HVPARAMS:
        _GetAllHypervisorParameters(cluster, self.all_inst_info.values()),
      constants.NV_NODENETTEST: [(node.name, node.primary_ip, node.secondary_ip)
                                 for node in node_data_list
                                 if not node.offline],
      constants.NV_INSTANCELIST: hypervisors,
      constants.NV_VERSION: None,
      constants.NV_HVINFO: self.cfg.GetHypervisorType(),
      constants.NV_NODESETUP: None,
      constants.NV_TIME: None,
      constants.NV_MASTERIP: (self.cfg.GetMasterNodeName(), master_ip),
      constants.NV_OSLIST: None,
      constants.NV_NONVMNODES: self.cfg.GetNonVmCapableNodeNameList(),
      constants.NV_USERSCRIPTS: user_scripts,
      constants.NV_CLIENT_CERT: None,
      }

    if self.cfg.GetClusterInfo().modify_ssh_setup:
      node_verify_param[constants.NV_SSH_SETUP] = self._PrepareSshSetupCheck()
      if self.op.verify_clutter:
        node_verify_param[constants.NV_SSH_CLUTTER] = True

    if vg_name is not None:
      node_verify_param[constants.NV_VGLIST] = None
      node_verify_param[constants.NV_LVLIST] = vg_name
      node_verify_param[constants.NV_PVLIST] = [vg_name]

    if cluster.IsDiskTemplateEnabled(constants.DT_DRBD8):
      if drbd_helper:
        node_verify_param[constants.NV_DRBDVERSION] = None
        node_verify_param[constants.NV_DRBDLIST] = None
        node_verify_param[constants.NV_DRBDHELPER] = drbd_helper

    if cluster.IsFileStorageEnabled() or \
        cluster.IsSharedFileStorageEnabled():
      # Load file storage paths only from master node
      node_verify_param[constants.NV_ACCEPTED_STORAGE_PATHS] = \
        self.cfg.GetMasterNodeName()
      if cluster.IsFileStorageEnabled():
        node_verify_param[constants.NV_FILE_STORAGE_PATH] = \
          cluster.file_storage_dir
      if cluster.IsSharedFileStorageEnabled():
        node_verify_param[constants.NV_SHARED_FILE_STORAGE_PATH] = \
          cluster.shared_file_storage_dir

    # bridge checks
    # FIXME: this needs to be changed per node-group, not cluster-wide
    bridges = set()
    default_nicpp = cluster.nicparams[constants.PP_DEFAULT]
    if default_nicpp[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
      bridges.add(default_nicpp[constants.NIC_LINK])
    # NOTE(review): despite the name, "inst_uuid" here is an instance
    # *object* (values(), not keys) — it is dereferenced via .nics below
    for inst_uuid in self.my_inst_info.values():
      for nic in inst_uuid.nics:
        full_nic = cluster.SimpleFillNIC(nic.nicparams)
        if full_nic[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
          bridges.add(full_nic[constants.NIC_LINK])

    if bridges:
      node_verify_param[constants.NV_BRIDGES] = list(bridges)

    # Build our expected cluster state
    node_image = dict((node.uuid, self.NodeImage(offline=node.offline,
                                                 uuid=node.uuid,
                                                 vm_capable=node.vm_capable))
                      for node in node_data_list)

    # Gather OOB paths
    oob_paths = []
    for node in self.all_node_info.values():
      path = SupportsOob(self.cfg, node)
      if path and path not in oob_paths:
        oob_paths.append(path)

    if oob_paths:
      node_verify_param[constants.NV_OOB_PATHS] = oob_paths

    # Fill the expected state (node_image / node_vol_should) from the
    # instances of this group
    for inst_uuid in self.my_inst_uuids:
      instance = self.my_inst_info[inst_uuid]
      if instance.admin_state == constants.ADMINST_OFFLINE:
        i_offline += 1

      inst_nodes = self.cfg.GetInstanceNodes(instance.uuid)
      for nuuid in inst_nodes:
        if nuuid not in node_image:
          # Node referenced by an instance but outside this group's image;
          # it is a "ghost" if it is unknown cluster-wide as well
          gnode = self.NodeImage(uuid=nuuid)
          gnode.ghost = (nuuid not in self.all_node_info)
          node_image[nuuid] = gnode

      self.cfg.GetInstanceLVsByNode(instance.uuid, lvmap=node_vol_should)

      pnode = instance.primary_node
      node_image[pnode].pinst.append(instance.uuid)

      for snode in self.cfg.GetInstanceSecondaryNodes(instance.uuid):
        nimg = node_image[snode]
        nimg.sinst.append(instance.uuid)
        if pnode not in nimg.sbp:
          nimg.sbp[pnode] = []
        nimg.sbp[pnode].append(instance.uuid)

    es_flags = rpc.GetExclusiveStorageForNodes(self.cfg,
                                               self.my_node_info.keys())
    # The value of exclusive_storage should be the same across the group, so if
    # it's True for at least a node, we act as if it were set for all the nodes
    self._exclusive_storage = compat.any(es_flags.values())
    if self._exclusive_storage:
      node_verify_param[constants.NV_EXCLUSIVEPVS] = True

    node_group_uuids = dict(map(lambda n: (n.name, n.group),
                                self.cfg.GetAllNodesInfo().values()))
    groups_config = self.cfg.GetAllNodeGroupsInfoDict()

    # At this point, we have the in-memory data structures complete,
    # except for the runtime information, which we'll gather next

    # NOTE: Here we lock the configuration for the duration of RPC calls,
    # which means that the cluster configuration changes are blocked during
    # this period.
    # This is something that should be done only exceptionally and only for
    # justified cases!
    # In this case, we need the lock as we can only verify the integrity of
    # configuration files on MCs only if we know nobody else is modifying it.
    # FIXME: The check for integrity of config.data should be moved to
    # WConfD, which is the only one who can otherwise ensure nobody
    # will modify the configuration during the check.
    with self.cfg.GetConfigManager(shared=True, forcelock=True):
      feedback_fn("* Gathering information about nodes (%s nodes)" %
                  len(self.my_node_uuids))
      # Force the configuration to be fully distributed before doing any tests
      self.cfg.FlushConfig()
      # Due to the way our RPC system works, exact response times cannot be
      # guaranteed (e.g. a broken node could run into a timeout). By keeping
      # the time before and after executing the request, we can at least have
      # a time window.
      nvinfo_starttime = time.time()
      # Get lock on the configuration so that nobody modifies it concurrently.
      # Otherwise it can be modified by other jobs, failing the consistency
      # test.
      # NOTE: This is an exceptional situation, we should otherwise avoid
      # locking the configuration for something but very fast, pure operations.
      cluster_name = self.cfg.GetClusterName()
      hvparams = self.cfg.GetClusterInfo().hvparams
      all_nvinfo = self.rpc.call_node_verify(self.my_node_uuids,
                                             node_verify_param,
                                             cluster_name,
                                             hvparams,
                                             node_group_uuids,
                                             groups_config)
      nvinfo_endtime = time.time()

      if self.extra_lv_nodes and vg_name is not None:
        feedback_fn("* Gathering information about extra nodes (%s nodes)" %
                    len(self.extra_lv_nodes))
        extra_lv_nvinfo = \
            self.rpc.call_node_verify(self.extra_lv_nodes,
                                      {constants.NV_LVLIST: vg_name},
                                      self.cfg.GetClusterName(),
                                      self.cfg.GetClusterInfo().hvparams,
                                      node_group_uuids,
                                      groups_config)
      else:
        extra_lv_nvinfo = {}

      # If not all nodes are being checked, we need to make sure the master
      # node and a non-checked vm_capable node are in the list.
      absent_node_uuids = set(self.all_node_info).difference(self.my_node_info)
      if absent_node_uuids:
        vf_nvinfo = all_nvinfo.copy()
        vf_node_info = list(self.my_node_info.values())
        additional_node_uuids = []
        if master_node_uuid not in self.my_node_info:
          additional_node_uuids.append(master_node_uuid)
          vf_node_info.append(self.all_node_info[master_node_uuid])
        # Add the first vm_capable node we find which is not included,
        # excluding the master node (which we already have)
        for node_uuid in absent_node_uuids:
          nodeinfo = self.all_node_info[node_uuid]
          if (nodeinfo.vm_capable and not nodeinfo.offline and
              node_uuid != master_node_uuid):
            additional_node_uuids.append(node_uuid)
            vf_node_info.append(self.all_node_info[node_uuid])
            break
        key = constants.NV_FILELIST

        feedback_fn("* Gathering information about the master node")
        vf_nvinfo.update(self.rpc.call_node_verify(
            additional_node_uuids, {key: node_verify_param[key]},
            self.cfg.GetClusterName(), self.cfg.GetClusterInfo().hvparams,
            node_group_uuids,
            groups_config))
      else:
        vf_nvinfo = all_nvinfo
        vf_node_info = self.my_node_info.values()

    all_drbd_map = self.cfg.ComputeDRBDMap()

    feedback_fn("* Gathering disk information (%s nodes)" %
                len(self.my_node_uuids))
    instdisk = self._CollectDiskInfo(self.my_node_info.keys(), node_image,
                                     self.my_inst_info)

    feedback_fn("* Verifying configuration file consistency")

    self._VerifyClientCertificates(self.my_node_info.values(), all_nvinfo)
    if self.cfg.GetClusterInfo().modify_ssh_setup:
      self._VerifySshSetup(self.my_node_info.values(), all_nvinfo)
    self._VerifyFiles(vf_node_info, master_node_uuid, vf_nvinfo, filemap)

    feedback_fn("* Verifying node status")

    # First node whose OS scan succeeded; all other nodes' OS data is
    # compared against it
    refos_img = None

    for node_i in node_data_list:
      nimg = node_image[node_i.uuid]

      if node_i.offline:
        if verbose:
          feedback_fn("* Skipping offline node %s" % (node_i.name,))
        n_offline += 1
        continue

      if node_i.uuid == master_node_uuid:
        ntype = "master"
      elif node_i.master_candidate:
        ntype = "master candidate"
      elif node_i.drained:
        ntype = "drained"
        n_drained += 1
      else:
        ntype = "regular"
      if verbose:
        feedback_fn("* Verifying node %s (%s)" % (node_i.name, ntype))

      msg = all_nvinfo[node_i.uuid].fail_msg
      self._ErrorIf(msg, constants.CV_ENODERPC, node_i.name,
                    "while contacting node: %s", msg)
      if msg:
        nimg.rpc_fail = True
        continue

      nresult = all_nvinfo[node_i.uuid].payload

      nimg.call_ok = self._VerifyNode(node_i, nresult)
      self._VerifyNodeTime(node_i, nresult, nvinfo_starttime, nvinfo_endtime)
      self._VerifyNodeNetwork(node_i, nresult)
      self._VerifyNodeUserScripts(node_i, nresult)
      self._VerifyOob(node_i, nresult)
      self._VerifyAcceptedFileStoragePaths(node_i, nresult,
                                           node_i.uuid == master_node_uuid)
      self._VerifyFileStoragePaths(node_i, nresult)
      self._VerifySharedFileStoragePaths(node_i, nresult)
      self._VerifyGlusterStoragePaths(node_i, nresult)

      if nimg.vm_capable:
        self._UpdateVerifyNodeLVM(node_i, nresult, vg_name, nimg)
        if constants.DT_DRBD8 in cluster.enabled_disk_templates:
          self._VerifyNodeDrbd(node_i, nresult, self.all_inst_info,
                               self.all_disks_info, drbd_helper, all_drbd_map)

        if (constants.DT_PLAIN in cluster.enabled_disk_templates) or \
            (constants.DT_DRBD8 in cluster.enabled_disk_templates):
          self._UpdateNodeVolumes(node_i, nresult, nimg, vg_name)
        self._UpdateNodeInstances(node_i, nresult, nimg)
        self._UpdateNodeInfo(node_i, nresult, nimg, vg_name)
        self._UpdateNodeOS(node_i, nresult, nimg)

        if not nimg.os_fail:
          if refos_img is None:
            refos_img = nimg
          self._VerifyNodeOS(node_i, nimg, refos_img)
        self._VerifyNodeBridges(node_i, nresult, bridges)

        # Check whether all running instances are primary for the node. (This
        # can no longer be done from _VerifyInstance below, since some of the
        # wrong instances could be from other node groups.)
        non_primary_inst_uuids = set(nimg.instances).difference(nimg.pinst)

        for inst_uuid in non_primary_inst_uuids:
          test = inst_uuid in self.all_inst_info
          self._ErrorIf(test, constants.CV_EINSTANCEWRONGNODE,
                        self.cfg.GetInstanceName(inst_uuid),
                        "instance should not run on node %s", node_i.name)
          self._ErrorIf(not test, constants.CV_ENODEORPHANINSTANCE, node_i.name,
                        "node is running unknown instance %s", inst_uuid)

    self._VerifyGroupDRBDVersion(all_nvinfo)
    self._VerifyGroupLVM(node_image, vg_name)

    for node_uuid, result in extra_lv_nvinfo.items():
      self._UpdateNodeVolumes(self.all_node_info[node_uuid], result.payload,
                              node_image[node_uuid], vg_name)

    feedback_fn("* Verifying instance status")
    for inst_uuid in self.my_inst_uuids:
      instance = self.my_inst_info[inst_uuid]
      if verbose:
        feedback_fn("* Verifying instance %s" % instance.name)
      self._VerifyInstance(instance, node_image, instdisk[inst_uuid])

      # If the instance is not fully redundant we cannot survive losing its
      # primary node, so we are not N+1 compliant.
      inst_disks = self.cfg.GetInstanceDisks(instance.uuid)
      if not utils.AllDiskOfType(inst_disks, constants.DTS_MIRRORED):
        i_non_redundant.append(instance)

      if not cluster.FillBE(instance)[constants.BE_AUTO_BALANCE]:
        i_non_a_balanced.append(instance)

    feedback_fn("* Verifying orphan volumes")
    reserved = utils.FieldSet(*cluster.reserved_lvs)

    # We will get spurious "unknown volume" warnings if any node of this group
    # is secondary for an instance whose primary is in another group. To avoid
    # them, we find these instances and add their volumes to node_vol_should.
    for instance in self.all_inst_info.values():
      for secondary in self.cfg.GetInstanceSecondaryNodes(instance.uuid):
        # NOTE(review): self.my_inst_info appears to be keyed by instance
        # UUID elsewhere in this method, yet instance.name is looked up
        # here — verify whether this should be instance.uuid
        if (secondary in self.my_node_info
            and instance.name not in self.my_inst_info):
          self.cfg.GetInstanceLVsByNode(instance.uuid, lvmap=node_vol_should)
          break

    self._VerifyOrphanVolumes(vg_name, node_vol_should, node_image, reserved)

    if constants.VERIFY_NPLUSONE_MEM not in self.op.skip_checks:
      feedback_fn("* Verifying N+1 Memory redundancy")
      self._VerifyNPlusOneMemory(node_image, self.my_inst_info)

    self._VerifyOtherNotes(feedback_fn, i_non_redundant, i_non_a_balanced,
                           i_offline, n_offline, n_drained)

    return not self.bad
2154
2155 def HooksCallBack(self, phase, hooks_results, feedback_fn, lu_result):
2156 """Analyze the post-hooks' result
2157
2158 This method analyses the hook result, handles it, and sends some
2159 nicely-formatted feedback back to the user.
2160
2161 @param phase: one of L{constants.HOOKS_PHASE_POST} or
2162 L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase
2163 @param hooks_results: the results of the multi-node hooks rpc call
2164 @param feedback_fn: function used send feedback back to the caller
2165 @param lu_result: previous Exec result
2166 @return: the new Exec result, based on the previous result
2167 and hook results
2168
2169 """
2170 # We only really run POST phase hooks, only for non-empty groups,
2171 # and are only interested in their results
2172 if not self.my_node_uuids:
2173 # empty node group
2174 pass
2175 elif phase == constants.HOOKS_PHASE_POST:
2176 # Used to change hooks' output to proper indentation
2177 feedback_fn("* Hooks Results")
2178 assert hooks_results, "invalid result from hooks"
2179
2180 for node_name in hooks_results:
2181 res = hooks_results[node_name]
2182 msg = res.fail_msg
2183 test = msg and not res.offline
2184 self._ErrorIf(test, constants.CV_ENODEHOOKS, node_name,
2185 "Communication failure in hooks execution: %s", msg)
2186 if test:
2187 lu_result = False
2188 continue
2189 if res.offline:
2190 # No need to investigate payload if node is offline
2191 continue
2192 for script, hkr, output in res.payload:
2193 test = hkr == constants.HKR_FAIL
2194 self._ErrorIf(test, constants.CV_ENODEHOOKS, node_name,
2195 "Script %s failed, output:", script)
2196 if test:
2197 output = self._HOOKS_INDENT_RE.sub(" ", output)
2198 feedback_fn("%s" % output)
2199 lu_result = False
2200
2201 return lu_result