Bulk-removal of SSH keys
lib/backend.py (ganeti)
1 #
2 #
3
4 # Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014 Google Inc.
5 # All rights reserved.
6 #
7 # Redistribution and use in source and binary forms, with or without
8 # modification, are permitted provided that the following conditions are
9 # met:
10 #
11 # 1. Redistributions of source code must retain the above copyright notice,
12 # this list of conditions and the following disclaimer.
13 #
14 # 2. Redistributions in binary form must reproduce the above copyright
15 # notice, this list of conditions and the following disclaimer in the
16 # documentation and/or other materials provided with the distribution.
17 #
18 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
19 # IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
20 # TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
21 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
22 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
25 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
26 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
27 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30
31 """Functions used by the node daemon
32
33 @var _ALLOWED_UPLOAD_FILES: denotes which files are accepted in
34 the L{UploadFile} function
35 @var _ALLOWED_CLEAN_DIRS: denotes which directories are accepted
36 in the L{_CleanDirectory} function
37
38 """
39
40 # pylint: disable=E1103,C0302
41
42 # E1103: %s %r has no %r member (but some types could not be
43 # inferred), because the _TryOSFromDisk returns either (True, os_obj)
44 # or (False, "string") which confuses pylint
45
46 # C0302: This module has become too big and should be split up
47
48
49 import base64
50 import errno
51 import logging
52 import os
53 import os.path
54 import pycurl
55 import random
56 import re
57 import shutil
58 import signal
59 import stat
60 import tempfile
61 import time
62 import zlib
63 import contextlib
64 import collections
65
66 from ganeti import errors
67 from ganeti import http
68 from ganeti import utils
69 from ganeti import ssh
70 from ganeti import hypervisor
71 from ganeti.hypervisor import hv_base
72 from ganeti import constants
73 from ganeti.storage import bdev
74 from ganeti.storage import drbd
75 from ganeti.storage import extstorage
76 from ganeti.storage import filestorage
77 from ganeti import objects
78 from ganeti import ssconf
79 from ganeti import serializer
80 from ganeti import netutils
81 from ganeti import runtime
82 from ganeti import compat
83 from ganeti import pathutils
84 from ganeti import vcluster
85 from ganeti import ht
86 from ganeti.storage.base import BlockDev
87 from ganeti.storage.drbd import DRBD8
88 from ganeti import hooksmaster
89 import ganeti.metad as metad
90
91
92 _BOOT_ID_PATH = "/proc/sys/kernel/random/boot_id"
93 _ALLOWED_CLEAN_DIRS = compat.UniqueFrozenset([
94 pathutils.DATA_DIR,
95 pathutils.JOB_QUEUE_ARCHIVE_DIR,
96 pathutils.QUEUE_DIR,
97 pathutils.CRYPTO_KEYS_DIR,
98 ])
99 _MAX_SSL_CERT_VALIDITY = 7 * 24 * 60 * 60
100 _X509_KEY_FILE = "key"
101 _X509_CERT_FILE = "cert"
102 _IES_STATUS_FILE = "status"
103 _IES_PID_FILE = "pid"
104 _IES_CA_FILE = "ca"
105
106 #: Valid LVS output line regex
107 _LVSLINE_REGEX = re.compile(r"^ *([^|]+)\|([^|]+)\|([0-9.]+)\|([^|]{6,})\|?$")
108
109 # Actions for the master setup script
110 _MASTER_START = "start"
111 _MASTER_STOP = "stop"
112
113 #: Maximum file permissions for restricted command directory and executables
114 _RCMD_MAX_MODE = (stat.S_IRWXU |
115 stat.S_IRGRP | stat.S_IXGRP |
116 stat.S_IROTH | stat.S_IXOTH)
117
118 #: Delay before returning an error for restricted commands
119 _RCMD_INVALID_DELAY = 10
120
121 #: How long to wait to acquire lock for restricted commands (shorter than
122 #: L{_RCMD_INVALID_DELAY}) to reduce blockage of noded forks when many
123 #: command requests arrive
124 _RCMD_LOCK_TIMEOUT = _RCMD_INVALID_DELAY * 0.8
125
126
127 class RPCFail(Exception):
128 """Class denoting RPC failure.
129
130 Its argument is the error message.
131
132 """
133
134
135 def _GetInstReasonFilename(instance_name):
136 """Path of the file containing the reason of the instance status change.
137
138 @type instance_name: string
139 @param instance_name: The name of the instance
140 @rtype: string
141 @return: The path of the file
142
143 """
144 return utils.PathJoin(pathutils.INSTANCE_REASON_DIR, instance_name)
145
146
147 def _StoreInstReasonTrail(instance_name, trail):
148 """Serialize a reason trail related to an instance change of state to file.
149
150 The exact location of the file depends on the name of the instance and on
151 the configuration of the Ganeti cluster defined at deploy time.
152
153 @type instance_name: string
154 @param instance_name: The name of the instance
155
156 @type trail: list of reasons
157 @param trail: reason trail
158
159 @rtype: None
160
161 """
162 json = serializer.DumpJson(trail)
163 filename = _GetInstReasonFilename(instance_name)
164 utils.WriteFile(filename, data=json)
165
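# Editor's illustrative sketch, not part of the original module: a reason
# trail is a list of (source, reason, timestamp) entries.  The helper below
# shows how such a trail would be stored for a hypothetical instance; the
# instance name and trail contents are made up.
def _ExampleStoreReasonTrail():
  """Illustrative use of L{_StoreInstReasonTrail} with a fabricated trail."""
  trail = [("gnt:client:gnt-instance", "user requested shutdown",
            1414151521134990500)]
  _StoreInstReasonTrail("instance1.example.com", trail)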
166
167 def _Fail(msg, *args, **kwargs):
168   """Log an error and then raise an RPCFail exception.
169
170 This exception is then handled specially in the ganeti daemon and
171 turned into a 'failed' return type. As such, this function is a
172 useful shortcut for logging the error and returning it to the master
173 daemon.
174
175 @type msg: string
176 @param msg: the text of the exception
177   @raise RPCFail: always, with the given message
178
179 """
180 if args:
181 msg = msg % args
182 if "log" not in kwargs or kwargs["log"]: # if we should log this error
183 if "exc" in kwargs and kwargs["exc"]:
184 logging.exception(msg)
185 else:
186 logging.error(msg)
187 raise RPCFail(msg)
188
189
190 def _GetConfig():
191 """Simple wrapper to return a SimpleStore.
192
193 @rtype: L{ssconf.SimpleStore}
194 @return: a SimpleStore instance
195
196 """
197 return ssconf.SimpleStore()
198
199
200 def _GetSshRunner(cluster_name):
201 """Simple wrapper to return an SshRunner.
202
203 @type cluster_name: str
204 @param cluster_name: the cluster name, which is needed
205 by the SshRunner constructor
206 @rtype: L{ssh.SshRunner}
207 @return: an SshRunner instance
208
209 """
210 return ssh.SshRunner(cluster_name)
211
212
213 def _Decompress(data):
214 """Unpacks data compressed by the RPC client.
215
216 @type data: list or tuple
217 @param data: Data sent by RPC client
218 @rtype: str
219 @return: Decompressed data
220
221 """
222 assert isinstance(data, (list, tuple))
223 assert len(data) == 2
224 (encoding, content) = data
225 if encoding == constants.RPC_ENCODING_NONE:
226 return content
227 elif encoding == constants.RPC_ENCODING_ZLIB_BASE64:
228 return zlib.decompress(base64.b64decode(content))
229 else:
230 raise AssertionError("Unknown data encoding")
231
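# Editor's illustrative sketch, not part of the original module: the inverse
# of L{_Decompress}, as an RPC client would produce it for large payloads.
# It assumes the (encoding, content) tuple convention handled above.
def _ExampleCompressForRpc(data):
  """Illustrative counterpart of L{_Decompress} using zlib + base64."""
  return (constants.RPC_ENCODING_ZLIB_BASE64,
          base64.b64encode(zlib.compress(data)))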
232
233 def _CleanDirectory(path, exclude=None):
234 """Removes all regular files in a directory.
235
236 @type path: str
237 @param path: the directory to clean
238 @type exclude: list
239 @param exclude: list of files to be excluded, defaults
240 to the empty list
241
242 """
243 if path not in _ALLOWED_CLEAN_DIRS:
244 _Fail("Path passed to _CleanDirectory not in allowed clean targets: '%s'",
245 path)
246
247 if not os.path.isdir(path):
248 return
249 if exclude is None:
250 exclude = []
251 else:
252 # Normalize excluded paths
253 exclude = [os.path.normpath(i) for i in exclude]
254
255 for rel_name in utils.ListVisibleFiles(path):
256 full_name = utils.PathJoin(path, rel_name)
257 if full_name in exclude:
258 continue
259 if os.path.isfile(full_name) and not os.path.islink(full_name):
260 utils.RemoveFile(full_name)
261
262
263 def _BuildUploadFileList():
264 """Build the list of allowed upload files.
265
266 This is abstracted so that it's built only once at module import time.
267
268 """
269 allowed_files = set([
270 pathutils.CLUSTER_CONF_FILE,
271 pathutils.ETC_HOSTS,
272 pathutils.SSH_KNOWN_HOSTS_FILE,
273 pathutils.VNC_PASSWORD_FILE,
274 pathutils.RAPI_CERT_FILE,
275 pathutils.SPICE_CERT_FILE,
276 pathutils.SPICE_CACERT_FILE,
277 pathutils.RAPI_USERS_FILE,
278 pathutils.CONFD_HMAC_KEY,
279 pathutils.CLUSTER_DOMAIN_SECRET_FILE,
280 ])
281
282 for hv_name in constants.HYPER_TYPES:
283 hv_class = hypervisor.GetHypervisorClass(hv_name)
284 allowed_files.update(hv_class.GetAncillaryFiles()[0])
285
286 assert pathutils.FILE_STORAGE_PATHS_FILE not in allowed_files, \
287 "Allowed file storage paths should never be uploaded via RPC"
288
289 return frozenset(allowed_files)
290
291
292 _ALLOWED_UPLOAD_FILES = _BuildUploadFileList()
293
294
295 def JobQueuePurge():
296 """Removes job queue files and archived jobs.
297
298 @rtype: tuple
299 @return: True, None
300
301 """
302 _CleanDirectory(pathutils.QUEUE_DIR, exclude=[pathutils.JOB_QUEUE_LOCK_FILE])
303 _CleanDirectory(pathutils.JOB_QUEUE_ARCHIVE_DIR)
304
305
306 def GetMasterNodeName():
307 """Returns the master node name.
308
309 @rtype: string
310 @return: name of the master node
311 @raise RPCFail: in case of errors
312
313 """
314 try:
315 return _GetConfig().GetMasterNode()
316 except errors.ConfigurationError, err:
317 _Fail("Cluster configuration incomplete: %s", err, exc=True)
318
319
320 def RunLocalHooks(hook_opcode, hooks_path, env_builder_fn):
321 """Decorator that runs hooks before and after the decorated function.
322
323 @type hook_opcode: string
324 @param hook_opcode: opcode of the hook
325 @type hooks_path: string
326 @param hooks_path: path of the hooks
327 @type env_builder_fn: function
328 @param env_builder_fn: function that returns a dictionary containing the
329 environment variables for the hooks. Will get all the parameters of the
330 decorated function.
331 @raise RPCFail: in case of pre-hook failure
332
333 """
334 def decorator(fn):
335 def wrapper(*args, **kwargs):
336 _, myself = ssconf.GetMasterAndMyself()
337 nodes = ([myself], [myself]) # these hooks run locally
338
339 env_fn = compat.partial(env_builder_fn, *args, **kwargs)
340
341 cfg = _GetConfig()
342 hr = HooksRunner()
343 hm = hooksmaster.HooksMaster(hook_opcode, hooks_path, nodes,
344 hr.RunLocalHooks, None, env_fn, None,
345 logging.warning, cfg.GetClusterName(),
346 cfg.GetMasterNode())
347 hm.RunPhase(constants.HOOKS_PHASE_PRE)
348 result = fn(*args, **kwargs)
349 hm.RunPhase(constants.HOOKS_PHASE_POST)
350
351 return result
352 return wrapper
353 return decorator
354
355
356 def _BuildMasterIpEnv(master_params, use_external_mip_script=None):
357 """Builds environment variables for master IP hooks.
358
359 @type master_params: L{objects.MasterNetworkParameters}
360 @param master_params: network parameters of the master
361 @type use_external_mip_script: boolean
362 @param use_external_mip_script: whether to use an external master IP
363 address setup script (unused, but necessary per the implementation of the
364     RunLocalHooks decorator)
365
366 """
367 # pylint: disable=W0613
368 ver = netutils.IPAddress.GetVersionFromAddressFamily(master_params.ip_family)
369 env = {
370 "MASTER_NETDEV": master_params.netdev,
371 "MASTER_IP": master_params.ip,
372 "MASTER_NETMASK": str(master_params.netmask),
373 "CLUSTER_IP_VERSION": str(ver),
374 }
375
376 return env
377
378
379 def _RunMasterSetupScript(master_params, action, use_external_mip_script):
380 """Execute the master IP address setup script.
381
382 @type master_params: L{objects.MasterNetworkParameters}
383 @param master_params: network parameters of the master
384 @type action: string
385 @param action: action to pass to the script. Must be one of
386 L{backend._MASTER_START} or L{backend._MASTER_STOP}
387 @type use_external_mip_script: boolean
388 @param use_external_mip_script: whether to use an external master IP
389 address setup script
390 @raise backend.RPCFail: if there are errors during the execution of the
391 script
392
393 """
394 env = _BuildMasterIpEnv(master_params)
395
396 if use_external_mip_script:
397 setup_script = pathutils.EXTERNAL_MASTER_SETUP_SCRIPT
398 else:
399 setup_script = pathutils.DEFAULT_MASTER_SETUP_SCRIPT
400
401 result = utils.RunCmd([setup_script, action], env=env, reset_env=True)
402
403 if result.failed:
404 _Fail("Failed to %s the master IP. Script return value: %s, output: '%s'" %
405 (action, result.exit_code, result.output), log=True)
406
407
408 @RunLocalHooks(constants.FAKE_OP_MASTER_TURNUP, "master-ip-turnup",
409 _BuildMasterIpEnv)
410 def ActivateMasterIp(master_params, use_external_mip_script):
411 """Activate the IP address of the master daemon.
412
413 @type master_params: L{objects.MasterNetworkParameters}
414 @param master_params: network parameters of the master
415 @type use_external_mip_script: boolean
416 @param use_external_mip_script: whether to use an external master IP
417 address setup script
418 @raise RPCFail: in case of errors during the IP startup
419
420 """
421 _RunMasterSetupScript(master_params, _MASTER_START,
422 use_external_mip_script)
423
424
425 def StartMasterDaemons(no_voting):
426 """Activate local node as master node.
427
428 The function will start the master daemons (ganeti-masterd and ganeti-rapi).
429
430 @type no_voting: boolean
431 @param no_voting: whether to start ganeti-masterd without a node vote
432 but still non-interactively
433 @rtype: None
434
435 """
436
437 if no_voting:
438 masterd_args = "--no-voting --yes-do-it"
439 else:
440 masterd_args = ""
441
442 env = {
443 "EXTRA_MASTERD_ARGS": masterd_args,
444 }
445
446 result = utils.RunCmd([pathutils.DAEMON_UTIL, "start-master"], env=env)
447 if result.failed:
448 msg = "Can't start Ganeti master: %s" % result.output
449 logging.error(msg)
450 _Fail(msg)
451
452
453 @RunLocalHooks(constants.FAKE_OP_MASTER_TURNDOWN, "master-ip-turndown",
454 _BuildMasterIpEnv)
455 def DeactivateMasterIp(master_params, use_external_mip_script):
456 """Deactivate the master IP on this node.
457
458 @type master_params: L{objects.MasterNetworkParameters}
459 @param master_params: network parameters of the master
460 @type use_external_mip_script: boolean
461 @param use_external_mip_script: whether to use an external master IP
462 address setup script
463 @raise RPCFail: in case of errors during the IP turndown
464
465 """
466 _RunMasterSetupScript(master_params, _MASTER_STOP,
467 use_external_mip_script)
468
469
470 def StopMasterDaemons():
471 """Stop the master daemons on this node.
472
473 Stop the master daemons (ganeti-masterd and ganeti-rapi) on this node.
474
475 @rtype: None
476
477 """
478 # TODO: log and report back to the caller the error failures; we
479 # need to decide in which case we fail the RPC for this
480
481 result = utils.RunCmd([pathutils.DAEMON_UTIL, "stop-master"])
482 if result.failed:
483 logging.error("Could not stop Ganeti master, command %s had exitcode %s"
484 " and error %s",
485 result.cmd, result.exit_code, result.output)
486
487
488 def ChangeMasterNetmask(old_netmask, netmask, master_ip, master_netdev):
489 """Change the netmask of the master IP.
490
491 @param old_netmask: the old value of the netmask
492 @param netmask: the new value of the netmask
493 @param master_ip: the master IP
494 @param master_netdev: the master network device
495
496 """
497 if old_netmask == netmask:
498 return
499
500 if not netutils.IPAddress.Own(master_ip):
501 _Fail("The master IP address is not up, not attempting to change its"
502 " netmask")
503
504 result = utils.RunCmd([constants.IP_COMMAND_PATH, "address", "add",
505 "%s/%s" % (master_ip, netmask),
506 "dev", master_netdev, "label",
507 "%s:0" % master_netdev])
508 if result.failed:
509 _Fail("Could not set the new netmask on the master IP address")
510
511 result = utils.RunCmd([constants.IP_COMMAND_PATH, "address", "del",
512 "%s/%s" % (master_ip, old_netmask),
513 "dev", master_netdev, "label",
514 "%s:0" % master_netdev])
515 if result.failed:
516 _Fail("Could not bring down the master IP address with the old netmask")
517
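# Editor's note with made-up values: for master_ip "192.0.2.1", old_netmask 24,
# netmask 16 and master_netdev "eth0", the two RunCmd invocations above are
# equivalent to running:
#   ip address add 192.0.2.1/16 dev eth0 label eth0:0
#   ip address del 192.0.2.1/24 dev eth0 label eth0:0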
518
519 def EtcHostsModify(mode, host, ip):
520 """Modify a host entry in /etc/hosts.
521
522 @param mode: The mode to operate. Either add or remove entry
523 @param host: The host to operate on
524 @param ip: The ip associated with the entry
525
526 """
527 if mode == constants.ETC_HOSTS_ADD:
528 if not ip:
529       raise RPCFail("Mode 'add' needs 'ip' parameter, but parameter not"
530                     " present")
531 utils.AddHostToEtcHosts(host, ip)
532 elif mode == constants.ETC_HOSTS_REMOVE:
533 if ip:
534       raise RPCFail("Mode 'remove' does not allow 'ip' parameter, but"
535                     " parameter is present")
536 utils.RemoveHostFromEtcHosts(host)
537 else:
538     raise RPCFail("Mode not supported")
539
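# Editor's illustrative sketch, not part of the original module: typical
# invocations of L{EtcHostsModify}; host name and IP address are hypothetical.
def _ExampleEtcHostsModify():
  """Illustrative add and remove of an /etc/hosts entry."""
  EtcHostsModify(constants.ETC_HOSTS_ADD, "node4.example.com", "192.0.2.4")
  EtcHostsModify(constants.ETC_HOSTS_REMOVE, "node4.example.com", None)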
540
541 def LeaveCluster(modify_ssh_setup):
542   """Cleans up and removes the current node.
543
544 This function cleans up and prepares the current node to be removed
545 from the cluster.
546
547 If processing is successful, then it raises an
548 L{errors.QuitGanetiException} which is used as a special case to
549 shutdown the node daemon.
550
551 @param modify_ssh_setup: boolean
552
553 """
554 _CleanDirectory(pathutils.DATA_DIR)
555 _CleanDirectory(pathutils.CRYPTO_KEYS_DIR)
556 JobQueuePurge()
557
558 if modify_ssh_setup:
559 try:
560 priv_key, pub_key, auth_keys = ssh.GetUserFiles(constants.SSH_LOGIN_USER)
561
562 ssh.RemoveAuthorizedKey(auth_keys, utils.ReadFile(pub_key))
563
564 utils.RemoveFile(priv_key)
565 utils.RemoveFile(pub_key)
566 except errors.OpExecError:
567 logging.exception("Error while processing ssh files")
568
569 try:
570 utils.RemoveFile(pathutils.CONFD_HMAC_KEY)
571 utils.RemoveFile(pathutils.RAPI_CERT_FILE)
572 utils.RemoveFile(pathutils.SPICE_CERT_FILE)
573 utils.RemoveFile(pathutils.SPICE_CACERT_FILE)
574 utils.RemoveFile(pathutils.NODED_CERT_FILE)
575 except: # pylint: disable=W0702
576 logging.exception("Error while removing cluster secrets")
577
578 utils.StopDaemon(constants.CONFD)
579 utils.StopDaemon(constants.MOND)
580 utils.StopDaemon(constants.KVMD)
581
582 # Raise a custom exception (handled in ganeti-noded)
583 raise errors.QuitGanetiException(True, "Shutdown scheduled")
584
585
586 def _CheckStorageParams(params, num_params):
587 """Performs sanity checks for storage parameters.
588
589 @type params: list
590 @param params: list of storage parameters
591 @type num_params: int
592 @param num_params: expected number of parameters
593
594 """
595 if params is None:
596 raise errors.ProgrammerError("No storage parameters for storage"
597                                  " reporting were provided.")
598 if not isinstance(params, list):
599 raise errors.ProgrammerError("The storage parameters are not of type"
600 " list: '%s'" % params)
601 if not len(params) == num_params:
602 raise errors.ProgrammerError("Did not receive the expected number of"
603                                  " storage parameters: expected %s,"
604 " received '%s'" % (num_params, len(params)))
605
606
607 def _CheckLvmStorageParams(params):
608 """Performs sanity check for the 'exclusive storage' flag.
609
610 @see: C{_CheckStorageParams}
611
612 """
613 _CheckStorageParams(params, 1)
614 excl_stor = params[0]
615 if not isinstance(params[0], bool):
616 raise errors.ProgrammerError("Exclusive storage parameter is not"
617 " boolean: '%s'." % excl_stor)
618 return excl_stor
619
620
621 def _GetLvmVgSpaceInfo(name, params):
622 """Wrapper around C{_GetVgInfo} which checks the storage parameters.
623
624 @type name: string
625 @param name: name of the volume group
626 @type params: list
627   @param params: list of storage parameters, which in this case should
628     contain only the exclusive storage flag
629
630 """
631 excl_stor = _CheckLvmStorageParams(params)
632 return _GetVgInfo(name, excl_stor)
633
634
635 def _GetVgInfo(
636 name, excl_stor, info_fn=bdev.LogicalVolume.GetVGInfo):
637   """Retrieves information about an LVM volume group.
638
639 """
640 # TODO: GetVGInfo supports returning information for multiple VGs at once
641 vginfo = info_fn([name], excl_stor)
642 if vginfo:
643 vg_free = int(round(vginfo[0][0], 0))
644 vg_size = int(round(vginfo[0][1], 0))
645 else:
646 vg_free = None
647 vg_size = None
648
649 return {
650 "type": constants.ST_LVM_VG,
651 "name": name,
652 "storage_free": vg_free,
653 "storage_size": vg_size,
654 }
655
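# Editor's sketch with hypothetical values: the dictionary returned by
# L{_GetVgInfo} for a volume group "xenvg" with 20 GiB free out of 100 GiB;
# sizes are reported in MiB.
#   {
#     "type": constants.ST_LVM_VG,
#     "name": "xenvg",
#     "storage_free": 20480,
#     "storage_size": 102400,
#   }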
656
657 def _GetLvmPvSpaceInfo(name, params):
658 """Wrapper around C{_GetVgSpindlesInfo} with sanity checks.
659
660 @see: C{_GetLvmVgSpaceInfo}
661
662 """
663 excl_stor = _CheckLvmStorageParams(params)
664 return _GetVgSpindlesInfo(name, excl_stor)
665
666
667 def _GetVgSpindlesInfo(
668 name, excl_stor, info_fn=bdev.LogicalVolume.GetVgSpindlesInfo):
669 """Retrieves information about spindles in an LVM volume group.
670
671 @type name: string
672 @param name: VG name
673 @type excl_stor: bool
674 @param excl_stor: exclusive storage
675 @rtype: dict
676   @return: dictionary with keys "type", "name", "storage_free" (free
677     spindles) and "storage_size" (total spindles)
678
679 """
680 if excl_stor:
681 (vg_free, vg_size) = info_fn(name)
682 else:
683 vg_free = 0
684 vg_size = 0
685 return {
686 "type": constants.ST_LVM_PV,
687 "name": name,
688 "storage_free": vg_free,
689 "storage_size": vg_size,
690 }
691
692
693 def _GetHvInfo(name, hvparams, get_hv_fn=hypervisor.GetHypervisor):
694 """Retrieves node information from a hypervisor.
695
696 The information returned depends on the hypervisor. Common items:
697
698 - vg_size is the size of the configured volume group in MiB
699 - vg_free is the free size of the volume group in MiB
700 - memory_dom0 is the memory allocated for domain0 in MiB
701   - memory_free is the currently available (free) RAM in MiB
702   - memory_total is the total amount of RAM in MiB
703 - hv_version: the hypervisor version, if available
704
705 @type hvparams: dict of string
706 @param hvparams: the hypervisor's hvparams
707
708 """
709 return get_hv_fn(name).GetNodeInfo(hvparams=hvparams)
710
711
712 def _GetHvInfoAll(hv_specs, get_hv_fn=hypervisor.GetHypervisor):
713 """Retrieves node information for all hypervisors.
714
715 See C{_GetHvInfo} for information on the output.
716
717 @type hv_specs: list of pairs (string, dict of strings)
718 @param hv_specs: list of pairs of a hypervisor's name and its hvparams
719
720 """
721 if hv_specs is None:
722 return None
723
724 result = []
725 for hvname, hvparams in hv_specs:
726 result.append(_GetHvInfo(hvname, hvparams, get_hv_fn))
727 return result
728
729
730 def _GetNamedNodeInfo(names, fn):
731   """Calls C{fn} for all names in C{names} and returns the results as a list.
732 
733   @rtype: None or list
734
735 """
736 if names is None:
737 return None
738 else:
739 return map(fn, names)
740
741
742 def GetNodeInfo(storage_units, hv_specs):
743   """Gives back a tuple with different information about the node.
744
745 @type storage_units: list of tuples (string, string, list)
746 @param storage_units: List of tuples (storage unit, identifier, parameters) to
747 ask for disk space information. In case of lvm-vg, the identifier is
748 the VG name. The parameters can contain additional, storage-type-specific
749 parameters, for example exclusive storage for lvm storage.
750 @type hv_specs: list of pairs (string, dict of strings)
751 @param hv_specs: list of pairs of a hypervisor's name and its hvparams
752 @rtype: tuple; (string, None/dict, None/dict)
753 @return: Tuple containing boot ID, volume group information and hypervisor
754 information
755
756 """
757 bootid = utils.ReadFile(_BOOT_ID_PATH, size=128).rstrip("\n")
758 storage_info = _GetNamedNodeInfo(
759 storage_units,
760 (lambda (storage_type, storage_key, storage_params):
761 _ApplyStorageInfoFunction(storage_type, storage_key, storage_params)))
762 hv_info = _GetHvInfoAll(hv_specs)
763 return (bootid, storage_info, hv_info)
764
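# Editor's sketch with hypothetical values: a storage_units argument as
# described in the docstring above -- one LVM volume group without exclusive
# storage and one file storage directory (which takes no extra parameters):
#   [(constants.ST_LVM_VG, "xenvg", [False]),
#    (constants.ST_FILE, "/srv/ganeti/file-storage", [])]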
765
766 def _GetFileStorageSpaceInfo(path, params):
767 """Wrapper around filestorage.GetSpaceInfo.
768
769 The purpose of this wrapper is to call filestorage.GetFileStorageSpaceInfo
770 and ignore the *args parameter to not leak it into the filestorage
771 module's code.
772
773 @see: C{filestorage.GetFileStorageSpaceInfo} for description of the
774 parameters.
775
776 """
777 _CheckStorageParams(params, 0)
778 return filestorage.GetFileStorageSpaceInfo(path)
779
780
781 # FIXME: implement storage reporting for all missing storage types.
782 _STORAGE_TYPE_INFO_FN = {
783 constants.ST_BLOCK: None,
784 constants.ST_DISKLESS: None,
785 constants.ST_EXT: None,
786 constants.ST_FILE: _GetFileStorageSpaceInfo,
787 constants.ST_LVM_PV: _GetLvmPvSpaceInfo,
788 constants.ST_LVM_VG: _GetLvmVgSpaceInfo,
789 constants.ST_SHARED_FILE: None,
790 constants.ST_GLUSTER: None,
791 constants.ST_RADOS: None,
792 }
793
794
795 def _ApplyStorageInfoFunction(storage_type, storage_key, *args):
796 """Looks up and applies the correct function to calculate free and total
797 storage for the given storage type.
798
799 @type storage_type: string
800 @param storage_type: the storage type for which the storage shall be reported.
801 @type storage_key: string
802 @param storage_key: identifier of a storage unit, e.g. the volume group name
803 of an LVM storage unit
804 @type args: any
805 @param args: various parameters that can be used for storage reporting. These
806 parameters and their semantics vary from storage type to storage type and
807 are just propagated in this function.
808 @return: the results of the application of the storage space function (see
809 _STORAGE_TYPE_INFO_FN) if storage space reporting is implemented for that
810 storage type
811   @raises NotImplementedError: for storage types that don't support space
812 reporting yet
813 """
814 fn = _STORAGE_TYPE_INFO_FN[storage_type]
815 if fn is not None:
816 return fn(storage_key, *args)
817 else:
818 raise NotImplementedError
819
820
821 def _CheckExclusivePvs(pvi_list):
822 """Check that PVs are not shared among LVs
823
824 @type pvi_list: list of L{objects.LvmPvInfo} objects
825 @param pvi_list: information about the PVs
826
827 @rtype: list of tuples (string, list of strings)
828 @return: offending volumes, as tuples: (pv_name, [lv1_name, lv2_name...])
829
830 """
831 res = []
832 for pvi in pvi_list:
833 if len(pvi.lv_list) > 1:
834 res.append((pvi.name, pvi.lv_list))
835 return res
836
837
838 def _VerifyHypervisors(what, vm_capable, result, all_hvparams,
839 get_hv_fn=hypervisor.GetHypervisor):
840   """Verifies the hypervisor. Appends the results to the 'result' dict.
841
842 @type what: C{dict}
843 @param what: a dictionary of things to check
844 @type vm_capable: boolean
845 @param vm_capable: whether or not this node is vm capable
846 @type result: dict
847 @param result: dictionary of verification results; results of the
848 verifications in this function will be added here
849 @type all_hvparams: dict of dict of string
850 @param all_hvparams: dictionary mapping hypervisor names to hvparams
851 @type get_hv_fn: function
852 @param get_hv_fn: function to retrieve the hypervisor, to improve testability
853
854 """
855 if not vm_capable:
856 return
857
858 if constants.NV_HYPERVISOR in what:
859 result[constants.NV_HYPERVISOR] = {}
860 for hv_name in what[constants.NV_HYPERVISOR]:
861 hvparams = all_hvparams[hv_name]
862 try:
863 val = get_hv_fn(hv_name).Verify(hvparams=hvparams)
864 except errors.HypervisorError, err:
865 val = "Error while checking hypervisor: %s" % str(err)
866 result[constants.NV_HYPERVISOR][hv_name] = val
867
868
869 def _VerifyHvparams(what, vm_capable, result,
870 get_hv_fn=hypervisor.GetHypervisor):
871   """Verifies the hvparams. Appends the results to the 'result' dict.
872
873 @type what: C{dict}
874 @param what: a dictionary of things to check
875 @type vm_capable: boolean
876 @param vm_capable: whether or not this node is vm capable
877 @type result: dict
878 @param result: dictionary of verification results; results of the
879 verifications in this function will be added here
880 @type get_hv_fn: function
881 @param get_hv_fn: function to retrieve the hypervisor, to improve testability
882
883 """
884 if not vm_capable:
885 return
886
887 if constants.NV_HVPARAMS in what:
888 result[constants.NV_HVPARAMS] = []
889 for source, hv_name, hvparms in what[constants.NV_HVPARAMS]:
890 try:
891 logging.info("Validating hv %s, %s", hv_name, hvparms)
892 get_hv_fn(hv_name).ValidateParameters(hvparms)
893 except errors.HypervisorError, err:
894 result[constants.NV_HVPARAMS].append((source, hv_name, str(err)))
895
896
897 def _VerifyInstanceList(what, vm_capable, result, all_hvparams):
898 """Verifies the instance list.
899
900 @type what: C{dict}
901 @param what: a dictionary of things to check
902 @type vm_capable: boolean
903 @param vm_capable: whether or not this node is vm capable
904 @type result: dict
905 @param result: dictionary of verification results; results of the
906 verifications in this function will be added here
907 @type all_hvparams: dict of dict of string
908 @param all_hvparams: dictionary mapping hypervisor names to hvparams
909
910 """
911 if constants.NV_INSTANCELIST in what and vm_capable:
912 # GetInstanceList can fail
913 try:
914 val = GetInstanceList(what[constants.NV_INSTANCELIST],
915 all_hvparams=all_hvparams)
916 except RPCFail, err:
917 val = str(err)
918 result[constants.NV_INSTANCELIST] = val
919
920
921 def _VerifyNodeInfo(what, vm_capable, result, all_hvparams):
922 """Verifies the node info.
923
924 @type what: C{dict}
925 @param what: a dictionary of things to check
926 @type vm_capable: boolean
927 @param vm_capable: whether or not this node is vm capable
928 @type result: dict
929 @param result: dictionary of verification results; results of the
930 verifications in this function will be added here
931 @type all_hvparams: dict of dict of string
932 @param all_hvparams: dictionary mapping hypervisor names to hvparams
933
934 """
935 if constants.NV_HVINFO in what and vm_capable:
936 hvname = what[constants.NV_HVINFO]
937 hyper = hypervisor.GetHypervisor(hvname)
938 hvparams = all_hvparams[hvname]
939 result[constants.NV_HVINFO] = hyper.GetNodeInfo(hvparams=hvparams)
940
941
942 def _VerifyClientCertificate(cert_file=pathutils.NODED_CLIENT_CERT_FILE):
943   """Verify the existence and validity of the client SSL certificate.
944
945 Also, verify that the client certificate is not self-signed. Self-
946 signed client certificates stem from Ganeti versions 2.12.0 - 2.12.4
947 and should be replaced by client certificates signed by the server
948 certificate. Hence we output a warning when we encounter a self-signed
949 one.
950
951 """
952 create_cert_cmd = "gnt-cluster renew-crypto --new-node-certificates"
953 if not os.path.exists(cert_file):
954 return (constants.CV_ERROR,
955 "The client certificate does not exist. Run '%s' to create"
956 " client certificates for all nodes." % create_cert_cmd)
957
958 (errcode, msg) = utils.VerifyCertificate(cert_file)
959 if errcode is not None:
960 return (errcode, msg)
961
962 (errcode, msg) = utils.IsCertificateSelfSigned(cert_file)
963 if errcode is not None:
964 return (errcode, msg)
965
966 # if everything is fine, we return the digest to be compared to the config
967 return (None, utils.GetCertificateDigest(cert_filename=cert_file))
968
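# Editor's sketch with a made-up digest: the two kinds of results returned by
# L{_VerifyClientCertificate} -- an error tuple when the certificate is
# missing or invalid, and (None, digest) when it checks out:
#   (constants.CV_ERROR, "The client certificate does not exist. ...")
#   (None, "5f9a0bd6...")  # digest to be compared against the configuration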
969
970 def _VerifySshSetup(node_status_list, my_name,
971 pub_key_file=pathutils.SSH_PUB_KEYS):
972 """Verifies the state of the SSH key files.
973
974 @type node_status_list: list of tuples
975 @param node_status_list: list of nodes of the cluster associated with a
976 couple of flags: (uuid, name, is_master_candidate,
977 is_potential_master_candidate, online)
978 @type my_name: str
979 @param my_name: name of this node
980 @type pub_key_file: str
981 @param pub_key_file: filename of the public key file
982
983 """
984 if node_status_list is None:
985     return ["No node list was received to check the public key file against."]
986
987 my_status_list = [(my_uuid, name, mc, pot_mc, online) for
988 (my_uuid, name, mc, pot_mc, online)
989 in node_status_list if name == my_name]
990 if len(my_status_list) == 0:
991 return ["Cannot find node information for node '%s'." % my_name]
992 (my_uuid, _, _, potential_master_candidate, online) = \
993 my_status_list[0]
994
995 result = []
996
997 if not os.path.exists(pub_key_file):
998 result.append("The public key file '%s' does not exist. Consider running"
999 " 'gnt-cluster renew-crypto --new-ssh-keys"
1000 " [--no-ssh-key-check]' to fix this." % pub_key_file)
1001 return result
1002
1003 pot_mc_uuids = [uuid for (uuid, _, _, _, _) in node_status_list]
1004 offline_nodes = [uuid for (uuid, _, _, _, online) in node_status_list
1005 if not online]
1006 pub_keys = ssh.QueryPubKeyFile(None)
1007
1008   missing_uuids = set([])
1009   if potential_master_candidate:
1010     # Check that the set of potential master candidates matches the
1011     # public key file
1012     pub_uuids_set = set(pub_keys.keys()) - set(offline_nodes)
1013     pot_mc_uuids_set = set(pot_mc_uuids) - set(offline_nodes)
1014 if pub_uuids_set != pot_mc_uuids_set:
1015 unknown_uuids = pub_uuids_set - pot_mc_uuids_set
1016 if unknown_uuids:
1017 result.append("The following node UUIDs are listed in the public key"
1018 " file on node '%s', but are not potential master"
1019 " candidates: %s."
1020 % (my_name, ", ".join(list(unknown_uuids))))
1021 missing_uuids = pot_mc_uuids_set - pub_uuids_set
1022 if missing_uuids:
1023 result.append("The following node UUIDs of potential master candidates"
1024 " are missing in the public key file on node %s: %s."
1025 % (my_name, ", ".join(list(missing_uuids))))
1026
1027 (_, key_files) = \
1028 ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
1029 (_, dsa_pub_key_filename) = key_files[constants.SSHK_DSA]
1030
1031 my_keys = pub_keys[my_uuid]
1032
1033 dsa_pub_key = utils.ReadFile(dsa_pub_key_filename)
1034 if dsa_pub_key.strip() not in my_keys:
1035 result.append("The dsa key of node %s does not match this node's key"
1036 " in the pub key file." % (my_name))
1037 if len(my_keys) != 1:
1038 result.append("There is more than one key for node %s in the public key"
1039 " file." % my_name)
1040 else:
1041 if len(pub_keys.keys()) > 0:
1042 result.append("The public key file of node '%s' is not empty, although"
1043 " the node is not a potential master candidate."
1044 % my_name)
1045
1046 # Check that all master candidate keys are in the authorized_keys file
1047 (auth_key_file, _) = \
1048 ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
1049 for (uuid, name, mc, _, online) in node_status_list:
1050 if not online:
1051 continue
1052 if uuid in missing_uuids:
1053 continue
1054 if mc:
1055 for key in pub_keys[uuid]:
1056 if not ssh.HasAuthorizedKey(auth_key_file, key):
1057             result.append("An SSH key of master candidate '%s' (UUID: '%s') is"
1058 " not in the 'authorized_keys' file of node '%s'."
1059 % (name, uuid, my_name))
1060 else:
1061 for key in pub_keys[uuid]:
1062 if name != my_name and ssh.HasAuthorizedKey(auth_key_file, key):
1063             result.append("An SSH key of normal node '%s' (UUID: '%s') is in the"
1064 " 'authorized_keys' file of node '%s'."
1065 % (name, uuid, my_name))
1066 if name == my_name and not ssh.HasAuthorizedKey(auth_key_file, key):
1067             result.append("An SSH key of normal node '%s' (UUID: '%s') is not"
1068 " in the 'authorized_keys' file of itself."
1069 % (my_name, uuid))
1070
1071 return result
1072
1073
1074 def _VerifySshClutter(node_status_list, my_name):
1075 """Verifies that the 'authorized_keys' files are not cluttered up.
1076
1077 @type node_status_list: list of tuples
1078 @param node_status_list: list of nodes of the cluster associated with a
1079 couple of flags: (uuid, name, is_master_candidate,
1080 is_potential_master_candidate, online)
1081 @type my_name: str
1082 @param my_name: name of this node
1083
1084 """
1085 result = []
1086 (auth_key_file, _) = \
1087 ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
1088   node_names = [name for (_, name, _, _, _) in node_status_list]
1089 multiple_occurrences = ssh.CheckForMultipleKeys(auth_key_file, node_names)
1090 if multiple_occurrences:
1091 msg = "There are hosts which have more than one SSH key stored for the" \
1092 " same user in the 'authorized_keys' file of node %s. This can be" \
1093 " due to an unsuccessful operation which cluttered up the" \
1094            " 'authorized_keys' file. We recommend cleaning this up manually. " \
1095 % my_name
1096 for host, occ in multiple_occurrences.items():
1097 msg += "Entry for '%s' in lines %s. " % (host, utils.CommaJoin(occ))
1098 result.append(msg)
1099
1100 return result
1101
1102
1103 def VerifyNode(what, cluster_name, all_hvparams, node_groups, groups_cfg):
1104 """Verify the status of the local node.
1105
1106 Based on the input L{what} parameter, various checks are done on the
1107 local node.
1108
1109 If the I{filelist} key is present, this list of
1110 files is checksummed and the file/checksum pairs are returned.
1111
1112 If the I{nodelist} key is present, we check that we have
1113 connectivity via ssh with the target nodes (and check the hostname
1114 report).
1115
1116 If the I{node-net-test} key is present, we check that we have
1117 connectivity to the given nodes via both primary IP and, if
1118 applicable, secondary IPs.
1119
1120 @type what: C{dict}
1121 @param what: a dictionary of things to check:
1122 - filelist: list of files for which to compute checksums
1123 - nodelist: list of nodes we should check ssh communication with
1124 - node-net-test: list of nodes we should check node daemon port
1125 connectivity with
1126 - hypervisor: list with hypervisors to run the verify for
1127 @type cluster_name: string
1128 @param cluster_name: the cluster's name
1129 @type all_hvparams: dict of dict of strings
1130 @param all_hvparams: a dictionary mapping hypervisor names to hvparams
1131 @type node_groups: a dict of strings
1132 @param node_groups: node _names_ mapped to their group uuids (it's enough to
1133 have only those nodes that are in `what["nodelist"]`)
1134 @type groups_cfg: a dict of dict of strings
1135 @param groups_cfg: a dictionary mapping group uuids to their configuration
1136 @rtype: dict
1137 @return: a dictionary with the same keys as the input dict, and
1138 values representing the result of the checks
1139
1140 """
1141 result = {}
1142 my_name = netutils.Hostname.GetSysName()
1143 port = netutils.GetDaemonPort(constants.NODED)
1144 vm_capable = my_name not in what.get(constants.NV_NONVMNODES, [])
1145
1146 _VerifyHypervisors(what, vm_capable, result, all_hvparams)
1147 _VerifyHvparams(what, vm_capable, result)
1148
1149 if constants.NV_FILELIST in what:
1150 fingerprints = utils.FingerprintFiles(map(vcluster.LocalizeVirtualPath,
1151 what[constants.NV_FILELIST]))
1152 result[constants.NV_FILELIST] = \
1153 dict((vcluster.MakeVirtualPath(key), value)
1154 for (key, value) in fingerprints.items())
1155
1156 if constants.NV_CLIENT_CERT in what:
1157 result[constants.NV_CLIENT_CERT] = _VerifyClientCertificate()
1158
1159 if constants.NV_SSH_SETUP in what:
1160 result[constants.NV_SSH_SETUP] = \
1161 _VerifySshSetup(what[constants.NV_SSH_SETUP], my_name)
1162 if constants.NV_SSH_CLUTTER in what:
1163 result[constants.NV_SSH_CLUTTER] = \
1164 _VerifySshClutter(what[constants.NV_SSH_SETUP], my_name)
1165
1166 if constants.NV_NODELIST in what:
1167 (nodes, bynode, mcs) = what[constants.NV_NODELIST]
1168
1169 # Add nodes from other groups (different for each node)
1170 try:
1171 nodes.extend(bynode[my_name])
1172 except KeyError:
1173 pass
1174
1175 # Use a random order
1176 random.shuffle(nodes)
1177
1178 # Try to contact all nodes
1179 val = {}
1180 for node in nodes:
1181 params = groups_cfg.get(node_groups.get(node))
1182 ssh_port = params["ndparams"].get(constants.ND_SSH_PORT)
1183 logging.debug("Ssh port %s (None = default) for node %s",
1184 str(ssh_port), node)
1185
1186 # We only test if master candidates can communicate to other nodes.
1187 # We cannot test if normal nodes cannot communicate with other nodes,
1188 # because the administrator might have installed additional SSH keys,
1189 # over which Ganeti has no power.
1190 if my_name in mcs:
1191 success, message = _GetSshRunner(cluster_name). \
1192 VerifyNodeHostname(node, ssh_port)
1193 if not success:
1194 val[node] = message
1195
1196 result[constants.NV_NODELIST] = val
1197
1198 if constants.NV_NODENETTEST in what:
1199 result[constants.NV_NODENETTEST] = tmp = {}
1200 my_pip = my_sip = None
1201 for name, pip, sip in what[constants.NV_NODENETTEST]:
1202 if name == my_name:
1203 my_pip = pip
1204 my_sip = sip
1205 break
1206 if not my_pip:
1207 tmp[my_name] = ("Can't find my own primary/secondary IP"
1208 " in the node list")
1209 else:
1210 for name, pip, sip in what[constants.NV_NODENETTEST]:
1211 fail = []
1212 if not netutils.TcpPing(pip, port, source=my_pip):
1213 fail.append("primary")
1214 if sip != pip:
1215 if not netutils.TcpPing(sip, port, source=my_sip):
1216 fail.append("secondary")
1217 if fail:
1218 tmp[name] = ("failure using the %s interface(s)" %
1219 " and ".join(fail))
1220
1221 if constants.NV_MASTERIP in what:
1222 # FIXME: add checks on incoming data structures (here and in the
1223 # rest of the function)
1224 master_name, master_ip = what[constants.NV_MASTERIP]
1225 if master_name == my_name:
1226 source = constants.IP4_ADDRESS_LOCALHOST
1227 else:
1228 source = None
1229 result[constants.NV_MASTERIP] = netutils.TcpPing(master_ip, port,
1230 source=source)
1231
1232 if constants.NV_USERSCRIPTS in what:
1233 result[constants.NV_USERSCRIPTS] = \
1234 [script for script in what[constants.NV_USERSCRIPTS]
1235 if not utils.IsExecutable(script)]
1236
1237 if constants.NV_OOB_PATHS in what:
1238 result[constants.NV_OOB_PATHS] = tmp = []
1239 for path in what[constants.NV_OOB_PATHS]:
1240 try:
1241 st = os.stat(path)
1242 except OSError, err:
1243         tmp.append("error calling stat() on out of band helper: %s" % err)
1244 else:
1245 if stat.S_ISREG(st.st_mode):
1246 if stat.S_IMODE(st.st_mode) & stat.S_IXUSR:
1247 tmp.append(None)
1248 else:
1249 tmp.append("out of band helper %s is not executable" % path)
1250 else:
1251 tmp.append("out of band helper %s is not a file" % path)
1252
1253 if constants.NV_LVLIST in what and vm_capable:
1254 try:
1255 val = GetVolumeList(utils.ListVolumeGroups().keys())
1256 except RPCFail, err:
1257 val = str(err)
1258 result[constants.NV_LVLIST] = val
1259
1260 _VerifyInstanceList(what, vm_capable, result, all_hvparams)
1261
1262 if constants.NV_VGLIST in what and vm_capable:
1263 result[constants.NV_VGLIST] = utils.ListVolumeGroups()
1264
1265 if constants.NV_PVLIST in what and vm_capable:
1266 check_exclusive_pvs = constants.NV_EXCLUSIVEPVS in what
1267 val = bdev.LogicalVolume.GetPVInfo(what[constants.NV_PVLIST],
1268 filter_allocatable=False,
1269 include_lvs=check_exclusive_pvs)
1270 if check_exclusive_pvs:
1271 result[constants.NV_EXCLUSIVEPVS] = _CheckExclusivePvs(val)
1272 for pvi in val:
1273 # Avoid sending useless data on the wire
1274 pvi.lv_list = []
1275 result[constants.NV_PVLIST] = map(objects.LvmPvInfo.ToDict, val)
1276
1277 if constants.NV_VERSION in what:
1278 result[constants.NV_VERSION] = (constants.PROTOCOL_VERSION,
1279 constants.RELEASE_VERSION)
1280
1281 _VerifyNodeInfo(what, vm_capable, result, all_hvparams)
1282
1283 if constants.NV_DRBDVERSION in what and vm_capable:
1284 try:
1285 drbd_version = DRBD8.GetProcInfo().GetVersionString()
1286 except errors.BlockDeviceError, err:
1287 logging.warning("Can't get DRBD version", exc_info=True)
1288 drbd_version = str(err)
1289 result[constants.NV_DRBDVERSION] = drbd_version
1290
1291 if constants.NV_DRBDLIST in what and vm_capable:
1292 try:
1293 used_minors = drbd.DRBD8.GetUsedDevs()
1294 except errors.BlockDeviceError, err:
1295 logging.warning("Can't get used minors list", exc_info=True)
1296 used_minors = str(err)
1297 result[constants.NV_DRBDLIST] = used_minors
1298
1299 if constants.NV_DRBDHELPER in what and vm_capable:
1300 status = True
1301 try:
1302 payload = drbd.DRBD8.GetUsermodeHelper()
1303 except errors.BlockDeviceError, err:
1304 logging.error("Can't get DRBD usermode helper: %s", str(err))
1305 status = False
1306 payload = str(err)
1307 result[constants.NV_DRBDHELPER] = (status, payload)
1308
1309 if constants.NV_NODESETUP in what:
1310 result[constants.NV_NODESETUP] = tmpr = []
1311 if not os.path.isdir("/sys/block") or not os.path.isdir("/sys/class/net"):
1312       tmpr.append("The sysfs filesystem doesn't seem to be mounted"
1313 " under /sys, missing required directories /sys/block"
1314 " and /sys/class/net")
1315 if (not os.path.isdir("/proc/sys") or
1316 not os.path.isfile("/proc/sysrq-trigger")):
1317 tmpr.append("The procfs filesystem doesn't seem to be mounted"
1318 " under /proc, missing required directory /proc/sys and"
1319 " the file /proc/sysrq-trigger")
1320
1321 if constants.NV_TIME in what:
1322 result[constants.NV_TIME] = utils.SplitTime(time.time())
1323
1324 if constants.NV_OSLIST in what and vm_capable:
1325 result[constants.NV_OSLIST] = DiagnoseOS()
1326
1327 if constants.NV_BRIDGES in what and vm_capable:
1328 result[constants.NV_BRIDGES] = [bridge
1329 for bridge in what[constants.NV_BRIDGES]
1330 if not utils.BridgeExists(bridge)]
1331
1332 if what.get(constants.NV_ACCEPTED_STORAGE_PATHS) == my_name:
1333 result[constants.NV_ACCEPTED_STORAGE_PATHS] = \
1334 filestorage.ComputeWrongFileStoragePaths()
1335
1336 if what.get(constants.NV_FILE_STORAGE_PATH):
1337 pathresult = filestorage.CheckFileStoragePath(
1338 what[constants.NV_FILE_STORAGE_PATH])
1339 if pathresult:
1340 result[constants.NV_FILE_STORAGE_PATH] = pathresult
1341
1342 if what.get(constants.NV_SHARED_FILE_STORAGE_PATH):
1343 pathresult = filestorage.CheckFileStoragePath(
1344 what[constants.NV_SHARED_FILE_STORAGE_PATH])
1345 if pathresult:
1346 result[constants.NV_SHARED_FILE_STORAGE_PATH] = pathresult
1347
1348 return result
1349
1350
1351 def GetCryptoTokens(token_requests):
1352 """Perform actions on the node's cryptographic tokens.
1353
1354 Token types can be 'ssl' or 'ssh'. So far only some actions are implemented
1355   for 'ssl'. Action 'get' returns the digest of the public client SSL
1356   certificate. Action 'create' creates a new client certificate and private key
1357   and also returns the digest of the certificate. The third element of a
1358   token request is a dictionary of optional parameters for the action; so far
1359   only the filename is supported.
1360
1361 @type token_requests: list of tuples of (string, string, dict), where the
1362 first string is in constants.CRYPTO_TYPES, the second in
1363 constants.CRYPTO_ACTIONS. The third parameter is a dictionary of string
1364 to string.
1365 @param token_requests: list of requests of cryptographic tokens and actions
1366 to perform on them. The actions come with a dictionary of options.
1367 @rtype: list of tuples (string, string)
1368 @return: list of tuples of the token type and the public crypto token
1369
1370 """
1371 tokens = []
1372 for (token_type, action, _) in token_requests:
1373 if token_type not in constants.CRYPTO_TYPES:
1374 raise errors.ProgrammerError("Token type '%s' not supported." %
1375 token_type)
1376 if action not in constants.CRYPTO_ACTIONS:
1377 raise errors.ProgrammerError("Action '%s' is not supported." %
1378 action)
1379 if token_type == constants.CRYPTO_TYPE_SSL_DIGEST:
1380 tokens.append((token_type,
1381 utils.GetCertificateDigest()))
1382 return tokens
1383
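# Editor's sketch, not part of the original module: requesting the digest of
# the node's client SSL certificate.  The action constant name is assumed to
# follow the 'get' action mentioned in the docstring above.
#   GetCryptoTokens([(constants.CRYPTO_TYPE_SSL_DIGEST,
#                     constants.CRYPTO_ACTION_GET, {})])
#   would return something like
#   [(constants.CRYPTO_TYPE_SSL_DIGEST, "<digest of the client certificate>")]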
1384
1385 def EnsureDaemon(daemon_name, run):
1386 """Ensures the given daemon is running or stopped.
1387
1388 @type daemon_name: string
1389 @param daemon_name: name of the daemon (e.g., constants.KVMD)
1390
1391 @type run: bool
1392 @param run: whether to start or stop the daemon
1393
1394 @rtype: bool
1395 @return: 'True' if daemon successfully started/stopped,
1396 'False' otherwise
1397
1398 """
1399 allowed_daemons = [constants.KVMD]
1400
1401 if daemon_name not in allowed_daemons:
1402 fn = lambda _: False
1403 elif run:
1404 fn = utils.EnsureDaemon
1405 else:
1406 fn = utils.StopDaemon
1407
1408 return fn(daemon_name)
1409
1410
1411 def _InitSshUpdateData(data, noded_cert_file, ssconf_store):
1412 (_, noded_cert) = \
1413 utils.ExtractX509Certificate(utils.ReadFile(noded_cert_file))
1414 data[constants.SSHS_NODE_DAEMON_CERTIFICATE] = noded_cert
1415
1416 cluster_name = ssconf_store.GetClusterName()
1417 data[constants.SSHS_CLUSTER_NAME] = cluster_name
1418
1419
1420 def AddNodeSshKey(node_uuid, node_name,
1421 potential_master_candidates,
1422 to_authorized_keys=False,
1423 to_public_keys=False,
1424 get_public_keys=False,
1425 pub_key_file=pathutils.SSH_PUB_KEYS,
1426 ssconf_store=None,
1427 noded_cert_file=pathutils.NODED_CERT_FILE,
1428 run_cmd_fn=ssh.RunSshCmdWithStdin):
1429 """Distributes a node's public SSH key across the cluster.
1430
1431 Note that this function should only be executed on the master node, which
1432 then will copy the new node's key to all nodes in the cluster via SSH.
1433
1434 Also note: at least one of the flags C{to_authorized_keys},
1435 C{to_public_keys}, and C{get_public_keys} has to be set to C{True} for
1436 the function to actually perform any actions.
1437
1438 @type node_uuid: str
1439 @param node_uuid: the UUID of the node whose key is added
1440 @type node_name: str
1441 @param node_name: the name of the node whose key is added
1442 @type potential_master_candidates: list of str
1443 @param potential_master_candidates: list of node names of potential master
1444 candidates; this should match the list of uuids in the public key file
1445 @type to_authorized_keys: boolean
1446 @param to_authorized_keys: whether the key should be added to the
1447 C{authorized_keys} file of all nodes
1448 @type to_public_keys: boolean
1449 @param to_public_keys: whether the keys should be added to the public key file
1450 @type get_public_keys: boolean
1451   @param get_public_keys: whether the node should add the cluster's public keys
1452     to its C{ganeti_pub_keys} file
1453
1454 """
1455 node_list = [SshAddNodeInfo(name=node_name, uuid=node_uuid,
1456 to_authorized_keys=to_authorized_keys,
1457 to_public_keys=to_public_keys,
1458 get_public_keys=get_public_keys)]
1459 return AddNodeSshKeyBulk(node_list,
1460 potential_master_candidates,
1461 pub_key_file=pub_key_file,
1462 ssconf_store=ssconf_store,
1463 noded_cert_file=noded_cert_file,
1464 run_cmd_fn=run_cmd_fn)
1465
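# Editor's sketch with hypothetical node names and UUID: distributing the key
# of a newly added potential master candidate so that it lands both in every
# node's 'authorized_keys' file and in the ganeti_pub_keys file, while the new
# node also receives the cluster's public keys:
#   AddNodeSshKey("5430fa74-1b12-4687-8c2b-5b02e1d58cbd", "node5.example.com",
#                 ["node1.example.com", "node5.example.com"],
#                 to_authorized_keys=True, to_public_keys=True,
#                 get_public_keys=True)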
1466
1467 # Node info named tuple specifically for the use with AddNodeSshKeyBulk
1468 SshAddNodeInfo = collections.namedtuple(
1469 "SshAddNodeInfo",
1470 ["uuid",
1471 "name",
1472 "to_authorized_keys",
1473 "to_public_keys",
1474 "get_public_keys"])
1475
1476
1477 def AddNodeSshKeyBulk(node_list,
1478 potential_master_candidates,
1479 pub_key_file=pathutils.SSH_PUB_KEYS,
1480 ssconf_store=None,
1481 noded_cert_file=pathutils.NODED_CERT_FILE,
1482 run_cmd_fn=ssh.RunSshCmdWithStdin):
1483 """Distributes a node's public SSH key across the cluster.
1484
1485 Note that this function should only be executed on the master node, which
1486 then will copy the new node's key to all nodes in the cluster via SSH.
1487
1488 Also note: at least one of the flags C{to_authorized_keys},
1489 C{to_public_keys}, and C{get_public_keys} has to be set to C{True} for
1490 the function to actually perform any actions.
1491
1492 @type node_list: list of SshAddNodeInfo tuples
1493 @param node_list: list of tuples containing the necessary node information for
1494 adding their keys
1495 @type potential_master_candidates: list of str
1496 @param potential_master_candidates: list of node names of potential master
1497 candidates; this should match the list of uuids in the public key file
1498
1499 """
1500 # whether there are any keys to be added or retrieved at all
1501 to_authorized_keys = any([node_info.to_authorized_keys for node_info in
1502 node_list])
1503 to_public_keys = any([node_info.to_public_keys for node_info in
1504 node_list])
1505 get_public_keys = any([node_info.get_public_keys for node_info in
1506 node_list])
1507
1508 # assure that at least one of those flags is true, as the function would
1509 # not do anything otherwise
1510 assert (to_authorized_keys or to_public_keys or get_public_keys)
1511
1512 if not ssconf_store:
1513 ssconf_store = ssconf.SimpleStore()
1514
1515 for node_info in node_list:
1516 # replacement not necessary for keys that are not supposed to be in the
1517 # list of public keys
1518 if not node_info.to_public_keys:
1519 continue
1520 # Check and fix sanity of key file
1521 keys_by_name = ssh.QueryPubKeyFile([node_info.name], key_file=pub_key_file)
1522 keys_by_uuid = ssh.QueryPubKeyFile([node_info.uuid], key_file=pub_key_file)
1523
1524 if (not keys_by_name or node_info.name not in keys_by_name) \
1525 and (not keys_by_uuid or node_info.uuid not in keys_by_uuid):
1526 raise errors.SshUpdateError(
1527 "No keys found for the new node '%s' (UUID %s) in the list of public"
1528         " SSH keys, neither for the name nor the UUID" %
1529 (node_info.name, node_info.uuid))
1530 else:
1531 if node_info.name in keys_by_name:
1532 # Replace the name by UUID in the file as the name should only be used
1533 # temporarily
1534 ssh.ReplaceNameByUuid(node_info.uuid, node_info.name,
1535 error_fn=errors.SshUpdateError,
1536 key_file=pub_key_file)
1537
1538 # Retrieve updated map of UUIDs to keys
1539 keys_by_uuid = ssh.QueryPubKeyFile(
1540 [node_info.uuid for node_info in node_list], key_file=pub_key_file)
1541
1542 # Update the master node's key files
1543 (auth_key_file, _) = \
1544 ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
1545 for node_info in node_list:
1546 if node_info.to_authorized_keys:
1547 ssh.AddAuthorizedKeys(auth_key_file, keys_by_uuid[node_info.uuid])
1548
1549 base_data = {}
1550 _InitSshUpdateData(base_data, noded_cert_file, ssconf_store)
1551 cluster_name = base_data[constants.SSHS_CLUSTER_NAME]
1552
1553 ssh_port_map = ssconf_store.GetSshPortMap()
1554
1555 # Update the target nodes themselves
1556 for node_info in node_list:
1557 logging.debug("Updating SSH key files of target node '%s'.", node_info.name)
1558 if node_info.get_public_keys:
1559 node_data = {}
1560 _InitSshUpdateData(node_data, noded_cert_file, ssconf_store)
1561 all_keys = ssh.QueryPubKeyFile(None, key_file=pub_key_file)
1562 node_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1563 (constants.SSHS_OVERRIDE, all_keys)
1564
1565 try:
1566 utils.RetryByNumberOfTimes(
1567 constants.SSHS_MAX_RETRIES,
1568 errors.SshUpdateError,
1569 run_cmd_fn, cluster_name, node_info.name, pathutils.SSH_UPDATE,
1570 ssh_port_map.get(node_info.name), node_data,
1571 debug=False, verbose=False, use_cluster_key=False,
1572 ask_key=False, strict_host_check=False)
1573 except errors.SshUpdateError as e:
1574 # Clean up the master's public key file if adding key fails
1575 if node_info.to_public_keys:
1576 ssh.RemovePublicKey(node_info.uuid)
1577 raise e
1578
1579 # Update all nodes except master and the target nodes
1580 keys_by_uuid_auth = ssh.QueryPubKeyFile(
1581 [node_info.uuid for node_info in node_list
1582 if node_info.to_authorized_keys],
1583 key_file=pub_key_file)
1584 if to_authorized_keys:
1585 base_data[constants.SSHS_SSH_AUTHORIZED_KEYS] = \
1586 (constants.SSHS_ADD, keys_by_uuid_auth)
1587
1588 pot_mc_data = base_data.copy()
1589 keys_by_uuid_pub = ssh.QueryPubKeyFile(
1590 [node_info.uuid for node_info in node_list
1591 if node_info.to_public_keys],
1592 key_file=pub_key_file)
1593 if to_public_keys:
1594 pot_mc_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1595 (constants.SSHS_REPLACE_OR_ADD, keys_by_uuid_pub)
1596
1597 all_nodes = ssconf_store.GetNodeList()
1598 master_node = ssconf_store.GetMasterNode()
1599 online_nodes = ssconf_store.GetOnlineNodeList()
1600
1601 node_errors = []
1602 for node in all_nodes:
1603 if node == master_node:
1604 logging.debug("Skipping master node '%s'.", master_node)
1605 continue
1606 if node not in online_nodes:
1607 logging.debug("Skipping offline node '%s'.", node)
1608 continue
1609 if node in potential_master_candidates:
1610 logging.debug("Updating SSH key files of node '%s'.", node)
1611 try:
1612 utils.RetryByNumberOfTimes(
1613 constants.SSHS_MAX_RETRIES,
1614 errors.SshUpdateError,
1615 run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
1616 ssh_port_map.get(node), pot_mc_data,
1617 debug=False, verbose=False, use_cluster_key=False,
1618 ask_key=False, strict_host_check=False)
1619 except errors.SshUpdateError as last_exception:
1620 error_msg = ("When adding the key of node '%s', updating SSH key"
1621 " files of node '%s' failed after %s retries."
1622 " Not trying again. Last error was: %s." %
1623                      (node_info.name, node, constants.SSHS_MAX_RETRIES,
1624 last_exception))
1625 node_errors.append((node, error_msg))
1626 # We only log the error and don't throw an exception, because
1627 # one unreachable node shall not abort the entire procedure.
1628 logging.error(error_msg)
1629
1630 else:
1631 if to_authorized_keys:
1632 run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
1633 ssh_port_map.get(node), base_data,
1634 debug=False, verbose=False, use_cluster_key=False,
1635 ask_key=False, strict_host_check=False)
1636
1637 return node_errors
1638
1639
1640 def RemoveNodeSshKey(node_uuid, node_name,
1641 master_candidate_uuids,
1642 potential_master_candidates,
1643 master_uuid=None,
1644 keys_to_remove=None,
1645 from_authorized_keys=False,
1646 from_public_keys=False,
1647 clear_authorized_keys=False,
1648 clear_public_keys=False,
1649 pub_key_file=pathutils.SSH_PUB_KEYS,
1650 ssconf_store=None,
1651 noded_cert_file=pathutils.NODED_CERT_FILE,
1652 readd=False,
1653 run_cmd_fn=ssh.RunSshCmdWithStdin):
1654 """Removes the node's SSH keys from the key files and distributes those.
1655
1656 Note that at least one of the flags C{from_authorized_keys},
1657 C{from_public_keys}, C{clear_authorized_keys}, and C{clear_public_keys}
1658 has to be set to C{True} for the function to perform any action at all.
1659 Not doing so will trigger an assertion in the function.
1660
1661 @type node_uuid: str
1662 @param node_uuid: UUID of the node whose key is removed
1663 @type node_name: str
1664 @param node_name: name of the node whose key is removed
1665 @type master_candidate_uuids: list of str
1666 @param master_candidate_uuids: list of UUIDs of the current master candidates
1667 @type potential_master_candidates: list of str
1668 @param potential_master_candidates: list of names of potential master
1669 candidates
1670 @type keys_to_remove: dict of str to list of str
1671 @param keys_to_remove: a dictionary mapping node UUIDS to lists of SSH keys
1672 to be removed. This list is supposed to be used only if the keys are not
1673 in the public keys file. This is for example the case when removing a
1674 master node's key.
1675 @type from_authorized_keys: boolean
1676 @param from_authorized_keys: whether or not the key should be removed
1677 from the C{authorized_keys} file
1678 @type from_public_keys: boolean
1679 @param from_public_keys: whether or not the key should be remove from
1680 the C{ganeti_pub_keys} file
1681 @type clear_authorized_keys: boolean
1682 @param clear_authorized_keys: whether or not the C{authorized_keys} file
1683 should be cleared on the node whose keys are removed
1684 @type clear_public_keys: boolean
1685 @param clear_public_keys: whether to clear the node's C{ganeti_pub_keys} file
1686 @type readd: boolean
1687 @param readd: whether this is called during a readd operation.
1688 @rtype: list of string
1689 @returns: list of feedback messages
1690
1691 """
1692 # Non-disruptive error messages, list of (node, msg) pairs
1693 result_msgs = []
1694
1695 # Make sure at least one of these flags is true.
1696 if not (from_authorized_keys or from_public_keys or clear_authorized_keys
1697 or clear_public_keys):
1698 raise errors.SshUpdateError("No removal from any key file was requested.")
1699
1700 if not ssconf_store:
1701 ssconf_store = ssconf.SimpleStore()
1702
1703 master_node = ssconf_store.GetMasterNode()
1704 ssh_port_map = ssconf_store.GetSshPortMap()
1705
1706 if from_authorized_keys or from_public_keys:
1707 if keys_to_remove:
1708 keys = keys_to_remove
1709 else:
1710 keys = ssh.QueryPubKeyFile([node_uuid], key_file=pub_key_file)
1711 if (not keys or node_uuid not in keys) and not readd:
1712 raise errors.SshUpdateError("Node '%s' not found in the list of public"
1713 " SSH keys. It seems someone tries to"
1714 " remove a key from outside the cluster!"
1715 % node_uuid)
1716 # During an upgrade all nodes have the master key. In this case we
1717 # should not remove it to avoid accidentally shutting down cluster
1718 # SSH communication
1719 master_keys = None
1720 if master_uuid:
1721 master_keys = ssh.QueryPubKeyFile([master_uuid], key_file=pub_key_file)
1722 for master_key in master_keys:
1723 if master_key in keys[node_uuid]:
1724 keys[node_uuid].remove(master_key)
1725
1726 if node_name == master_node and not keys_to_remove:
1727 raise errors.SshUpdateError("Cannot remove the master node's keys.")
1728
1729 if node_uuid in keys:
1730 base_data = {}
1731 _InitSshUpdateData(base_data, noded_cert_file, ssconf_store)
1732 cluster_name = base_data[constants.SSHS_CLUSTER_NAME]
1733
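# The update payload handed to the ssh-update helper maps key-file
# identifiers to (action, data) tuples; roughly (illustrative):
#   {constants.SSHS_SSH_AUTHORIZED_KEYS:
#      (constants.SSHS_REMOVE, {node_uuid: [key1, key2, ...]})}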
1734 if from_authorized_keys:
1735 base_data[constants.SSHS_SSH_AUTHORIZED_KEYS] = \
1736 (constants.SSHS_REMOVE, keys)
1737 (auth_key_file, _) = \
1738 ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False,
1739 dircheck=False)
1740 ssh.RemoveAuthorizedKeys(auth_key_file, keys[node_uuid])
1741
1742 pot_mc_data = base_data.copy()
1743
1744 if from_public_keys:
1745 pot_mc_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1746 (constants.SSHS_REMOVE, keys)
1747 ssh.RemovePublicKey(node_uuid, key_file=pub_key_file)
1748
1749 all_nodes = ssconf_store.GetNodeList()
1750 online_nodes = ssconf_store.GetOnlineNodeList()
1751 logging.debug("Removing key of node '%s' from all nodes but itself and"
1752 " master.", node_name)
1753 for node in all_nodes:
1754 if node == master_node:
1755 logging.debug("Skipping master node '%s'.", master_node)
1756 continue
1757 if node not in online_nodes:
1758 logging.debug("Skipping offline node '%s'.", node)
1759 continue
1760 if node == node_name:
1761 logging.debug("Skipping node itself '%s'.", node_name)
1762 continue
1763 ssh_port = ssh_port_map.get(node)
1764 if not ssh_port:
1765 raise errors.OpExecError("No SSH port information available for"
1766 " node '%s', map: %s." %
1767 (node, ssh_port_map))
1768 error_msg_final = ("When removing the key of node '%s', updating the"
1769 " SSH key files of node '%s' failed. Last error"
1770 " was: %s.")
1771 if node in potential_master_candidates:
1772 logging.debug("Updating key setup of potential master candidate node"
1773 " %s.", node)
1774 try:
1775 utils.RetryByNumberOfTimes(
1776 constants.SSHS_MAX_RETRIES,
1777 errors.SshUpdateError,
1778 run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
1779 ssh_port, pot_mc_data,
1780 debug=False, verbose=False, use_cluster_key=False,
1781 ask_key=False, strict_host_check=False)
1782 except errors.SshUpdateError as last_exception:
1783 error_msg = error_msg_final % (
1784 node_name, node, last_exception)
1785 result_msgs.append((node, error_msg))
1786 logging.error(error_msg)
1787
1788 else:
1789 if from_authorized_keys:
1790 logging.debug("Updating key setup of normal node %s.", node)
1791 try:
1792 utils.RetryByNumberOfTimes(
1793 constants.SSHS_MAX_RETRIES,
1794 errors.SshUpdateError,
1795 run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
1796 ssh_port, base_data,
1797 debug=False, verbose=False, use_cluster_key=False,
1798 ask_key=False, strict_host_check=False)
1799 except errors.SshUpdateError as last_exception:
1800 error_msg = error_msg_final % (
1801 node_name, node, last_exception)
1802 result_msgs.append((node, error_msg))
1803 logging.error(error_msg)
1804
1805 if clear_authorized_keys or from_public_keys or clear_public_keys:
1806 data = {}
1807 _InitSshUpdateData(data, noded_cert_file, ssconf_store)
1808 cluster_name = data[constants.SSHS_CLUSTER_NAME]
1809 ssh_port = ssh_port_map.get(node_name)
1810 if not ssh_port:
1811 raise errors.OpExecError("No SSH port information available for"
1812 " node '%s', which is leaving the cluster.")
1813
1814 if clear_authorized_keys:
1815 # The 'authorized_keys' file is not solely managed by Ganeti. Therefore,
1816 # we have to specify exactly which keys to clear to leave keys untouched
1817 # that were not added by Ganeti.
1818 other_master_candidate_uuids = [uuid for uuid in master_candidate_uuids
1819 if uuid != node_uuid]
1820 candidate_keys = ssh.QueryPubKeyFile(other_master_candidate_uuids,
1821 key_file=pub_key_file)
1822 data[constants.SSHS_SSH_AUTHORIZED_KEYS] = \
1823 (constants.SSHS_REMOVE, candidate_keys)
1824
1825 if clear_public_keys:
1826 data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1827 (constants.SSHS_CLEAR, {})
1828 elif from_public_keys:
1829 # Since clearing the public keys subsumes removing just a single key,
1830 # we only do it if clear_public_keys is 'False'.
1831
1832 if keys[node_uuid]:
1833 data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1834 (constants.SSHS_REMOVE, keys)
1835
1836 # If we have no changes to any keyfile, just return
1837 if not (constants.SSHS_SSH_PUBLIC_KEYS in data or
1838 constants.SSHS_SSH_AUTHORIZED_KEYS in data):
1839 return
1840
1841 logging.debug("Updating SSH key setup of target node '%s'.", node_name)
1842 try:
1843 utils.RetryByNumberOfTimes(
1844 constants.SSHS_MAX_RETRIES,
1845 errors.SshUpdateError,
1846 run_cmd_fn, cluster_name, node_name, pathutils.SSH_UPDATE,
1847 ssh_port, data,
1848 debug=False, verbose=False, use_cluster_key=False,
1849 ask_key=False, strict_host_check=False)
1850 except errors.SshUpdateError as last_exception:
1851 result_msgs.append(
1852 (node_name,
1853 ("Removing SSH keys from node '%s' failed."
1854 " This can happen when the node is already unreachable."
1855 " Error: %s" % (node_name, last_exception))))
1856
1857 return result_msgs
1858
1859
1860 # Node info named tuple specifically for use with RemoveNodeSshKeyBulk
1861 SshRemoveNodeInfo = collections.namedtuple(
1862 "SshRemoveNodeInfo",
1863 ["uuid",
1864 "name",
1865 "from_authorized_keys",
1866 "from_public_keys",
1867 "clear_authorized_keys",
1868 "clear_public_keys"])
1869
1870
1871 def RemoveNodeSshKeyBulk(node_list,
1872 master_candidate_uuids,
1873 potential_master_candidates,
1874 master_uuid=None,
1875 keys_to_remove=None,
1876 pub_key_file=pathutils.SSH_PUB_KEYS,
1877 ssconf_store=None,
1878 noded_cert_file=pathutils.NODED_CERT_FILE,
1879 readd=False,
1880 run_cmd_fn=ssh.RunSshCmdWithStdin):
1881 """Removes the node's SSH keys from the key files and distributes those.
1882
1883 Note that at least one of the flags C{from_authorized_keys},
1884 C{from_public_keys}, C{clear_authorized_keys}, and C{clear_public_keys}
1885 of at least one node has to be set to C{True} for the function to perform any
1886 action at all. Not doing so will trigger an assertion in the function.
1887
1888 @type node_list: list of C{SshRemoveNodeInfo}.
1889 @param node_list: list of information about nodes whose keys are being removed
1890 @type master_candidate_uuids: list of str
1891 @param master_candidate_uuids: list of UUIDs of the current master candidates
1892 @type potential_master_candidates: list of str
1893 @param potential_master_candidates: list of names of potential master
1894 candidates
1895 @type keys_to_remove: dict of str to list of str
1896 @param keys_to_remove: a dictionary mapping node UUIDS to lists of SSH keys
1897 to be removed. This list is supposed to be used only if the keys are not
1898 in the public keys file. This is for example the case when removing a
1899 master node's key.
1900 @type readd: boolean
1901 @param readd: whether this is called during a readd operation.
1902 @rtype: list of string
1903 @returns: list of feedback messages
1904
1905 """
1906 # Non-disruptive error messages, list of (node, msg) pairs
1907 result_msgs = []
1908
1909 # whether there are any keys to be added or retrieved at all
1910 from_authorized_keys = any([node_info.from_authorized_keys for node_info in
1911 node_list])
1912 from_public_keys = any([node_info.from_public_keys for node_info in
1913 node_list])
1914 clear_authorized_keys = any([node_info.clear_authorized_keys for node_info in
1915 node_list])
1916 clear_public_keys = any([node_info.clear_public_keys for node_info in
1917 node_list])
1918
1919 # Make sure at least one of these flags is true.
1920 if not (from_authorized_keys or from_public_keys or clear_authorized_keys
1921 or clear_public_keys):
1922 raise errors.SshUpdateError("No removal from any key file was requested.")
1923
1924 if not ssconf_store:
1925 ssconf_store = ssconf.SimpleStore()
1926
1927 master_node = ssconf_store.GetMasterNode()
1928 ssh_port_map = ssconf_store.GetSshPortMap()
1929
1930 if from_authorized_keys or from_public_keys:
1931 all_keys_to_remove = {}
1932 for node_info in node_list:
1933 if node_info.name == master_node and not keys_to_remove:
1934 raise errors.SshUpdateError("Cannot remove the master node's keys.")
1935 if keys_to_remove:
1936 keys = keys_to_remove
1937 else:
1938 keys = ssh.QueryPubKeyFile([node_info.uuid], key_file=pub_key_file)
1939 if (not keys or node_info.uuid not in keys) and not readd:
1940 raise errors.SshUpdateError("Node '%s' not found in the list of"
1941 " public SSH keys. It seems someone"
1942 " tries to remove a key from outside"
1943 " the cluster!" % node_info.uuid)
1944 # During an upgrade all nodes have the master key. In this case we
1945 # should not remove it to avoid accidentally shutting down cluster
1946 # SSH communication
1947 master_keys = None
1948 if master_uuid:
1949 master_keys = ssh.QueryPubKeyFile([master_uuid],
1950 key_file=pub_key_file)
1951 for master_key in master_keys:
1952 if master_key in keys[node_info.uuid]:
1953 keys[node_info.uuid].remove(master_key)
1954
1955 all_keys_to_remove.update(keys)
1956
1957 if all_keys_to_remove:
1958 base_data = {}
1959 _InitSshUpdateData(base_data, noded_cert_file, ssconf_store)
1960 cluster_name = base_data[constants.SSHS_CLUSTER_NAME]
1961
1962 if from_authorized_keys:
1963 # UUIDs of nodes that are supposed to be removed from the
1964 # authorized_keys files.
1965 nodes_remove_from_authorized_keys = [
1966 node_info.uuid for node_info in node_list
1967 if node_info.from_authorized_keys]
1968 keys_to_remove_from_authorized_keys = dict([
1969 (uuid, keys) for (uuid, keys) in all_keys_to_remove.items()
1970 if uuid in nodes_remove_from_authorized_keys])
1971 base_data[constants.SSHS_SSH_AUTHORIZED_KEYS] = \
1972 (constants.SSHS_REMOVE, keys_to_remove_from_authorized_keys)
1973 (auth_key_file, _) = \
1974 ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False,
1975 dircheck=False)
1976
1977 for uuid in nodes_remove_from_authorized_keys:
1978 ssh.RemoveAuthorizedKeys(auth_key_file,
1979 keys_to_remove_from_authorized_keys[uuid])
1980
1981 pot_mc_data = base_data.copy()
1982
1983 if from_public_keys:
1984 nodes_remove_from_public_keys = [
1985 node_info.uuid for node_info in node_list
1986 if node_info.from_public_keys]
1987 keys_to_remove_from_public_keys = dict([
1988 (uuid, keys) for (uuid, keys) in all_keys_to_remove.items()
1989 if uuid in nodes_remove_from_public_keys])
1990 pot_mc_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1991 (constants.SSHS_REMOVE, keys_to_remove_from_public_keys)
1992
1993 all_nodes = ssconf_store.GetNodeList()
1994 online_nodes = ssconf_store.GetOnlineNodeList()
1995 all_nodes_to_remove = [node_info.name for node_info in node_list]
1996 logging.debug("Removing keys of nodes '%s' from all nodes but itself and"
1997 " master.", ", ".join(all_nodes_to_remove))
1998 for node in all_nodes:
1999 if node == master_node:
2000 logging.debug("Skipping master node '%s'.", master_node)
2001 continue
2002 if node not in online_nodes:
2003 logging.debug("Skipping offline node '%s'.", node)
2004 continue
2005 if node in all_nodes_to_remove:
2006 logging.debug("Skipping node whose key is removed itself '%s'.", node)
2007 continue
2008 ssh_port = ssh_port_map.get(node)
2009 if not ssh_port:
2010 raise errors.OpExecError("No SSH port information available for"
2011 " node '%s', map: %s." %
2012 (node, ssh_port_map))
2013 error_msg_final = ("When removing the key of node '%s', updating the"
2014 " SSH key files of node '%s' failed. Last error"
2015 " was: %s.")
2016 if node in potential_master_candidates:
2017 logging.debug("Updating key setup of potential master candidate node"
2018 " %s.", node)
2019 try:
2020 utils.RetryByNumberOfTimes(
2021 constants.SSHS_MAX_RETRIES,
2022 errors.SshUpdateError,
2023 run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
2024 ssh_port, pot_mc_data,
2025 debug=False, verbose=False, use_cluster_key=False,
2026 ask_key=False, strict_host_check=False)
2027 except errors.SshUpdateError as last_exception:
2028 error_msg = error_msg_final % (
2029 node_info.name, node, last_exception)
2030 result_msgs.append((node, error_msg))
2031 logging.error(error_msg)
2032
2033 else:
2034 if from_authorized_keys:
2035 logging.debug("Updating key setup of normal node %s.", node)
2036 try:
2037 utils.RetryByNumberOfTimes(
2038 constants.SSHS_MAX_RETRIES,
2039 errors.SshUpdateError,
2040 run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
2041 ssh_port, base_data,
2042 debug=False, verbose=False, use_cluster_key=False,
2043 ask_key=False, strict_host_check=False)
2044 except errors.SshUpdateError as last_exception:
2045 error_msg = error_msg_final % (
2046 node_info.name, node, last_exception)
2047 result_msgs.append((node, error_msg))
2048 logging.error(error_msg)
2049
2050 for node_info in node_list:
2051 if node_info.clear_authorized_keys or node_info.from_public_keys or \
2052 node_info.clear_public_keys:
2053 data = {}
2054 _InitSshUpdateData(data, noded_cert_file, ssconf_store)
2055 cluster_name = data[constants.SSHS_CLUSTER_NAME]
2056 ssh_port = ssh_port_map.get(node_info.name)
2057 if not ssh_port:
2058 raise errors.OpExecError("No SSH port information available for"
2059 " node '%s', which is leaving the cluster.")
2060
2061 if node_info.clear_authorized_keys:
2062 # The 'authorized_keys' file is not solely managed by Ganeti. Therefore,
2063 # we have to specify exactly which keys to clear to leave keys untouched
2064 # that were not added by Ganeti.
2065 other_master_candidate_uuids = [uuid for uuid in master_candidate_uuids
2066 if uuid != node_info.uuid]
2067 candidate_keys = ssh.QueryPubKeyFile(other_master_candidate_uuids,
2068 key_file=pub_key_file)
2069 data[constants.SSHS_SSH_AUTHORIZED_KEYS] = \
2070 (constants.SSHS_REMOVE, candidate_keys)
2071
2072 if node_info.clear_public_keys:
2073 data[constants.SSHS_SSH_PUBLIC_KEYS] = \
2074 (constants.SSHS_CLEAR, {})
2075 elif node_info.from_public_keys:
2076 # Since clearing the public keys subsumes removing just a single key,
2077 # we only do it if clear_public_keys is 'False'.
2078
2079 if all_keys_to_remove:
2080 data[constants.SSHS_SSH_PUBLIC_KEYS] = \
2081 (constants.SSHS_REMOVE, all_keys_to_remove)
2082
2083 # If we have no changes to any keyfile, just return
2084 if not (constants.SSHS_SSH_PUBLIC_KEYS in data or
2085 constants.SSHS_SSH_AUTHORIZED_KEYS in data):
2086 return
2087
2088 logging.debug("Updating SSH key setup of target node '%s'.",
2089 node_info.name)
2090 try:
2091 utils.RetryByNumberOfTimes(
2092 constants.SSHS_MAX_RETRIES,
2093 errors.SshUpdateError,
2094 run_cmd_fn, cluster_name, node_info.name, pathutils.SSH_UPDATE,
2095 ssh_port, data,
2096 debug=False, verbose=False, use_cluster_key=False,
2097 ask_key=False, strict_host_check=False)
2098 except errors.SshUpdateError as last_exception:
2099 result_msgs.append(
2100 (node_info.name,
2101 ("Removing SSH keys from node '%s' failed."
2102 " This can happen when the node is already unreachable."
2103 " Error: %s" % (node_info.name, last_exception))))
2104
2105 if all_keys_to_remove and from_public_keys:
2106 for node_uuid in nodes_remove_from_public_keys:
2107 ssh.RemovePublicKey(node_uuid, key_file=pub_key_file)
2108
2109 return result_msgs
2110
2111
2112 def _GenerateNodeSshKey(node_uuid, node_name, ssh_port_map,
2113 pub_key_file=pathutils.SSH_PUB_KEYS,
2114 ssconf_store=None,
2115 noded_cert_file=pathutils.NODED_CERT_FILE,
2116 run_cmd_fn=ssh.RunSshCmdWithStdin,
2117 suffix=""):
2118 """Generates the root SSH key pair on the node.
2119
2120 @type node_uuid: str
2121 @param node_uuid: UUID of the node whose key is generated
2122 @type node_name: str
2123 @param node_name: name of the node whose key is generated
2124 @type ssh_port_map: dict of str to int
2125 @param ssh_port_map: mapping of node names to their SSH port
2126
2127 """
2128 if not ssconf_store:
2129 ssconf_store = ssconf.SimpleStore()
2130
2131 keys_by_uuid = ssh.QueryPubKeyFile([node_uuid], key_file=pub_key_file)
2132 if not keys_by_uuid or node_uuid not in keys_by_uuid:
2133 raise errors.SshUpdateError("Node %s (UUID: %s) whose key is requested to"
2134 " be regenerated is not registered in the"
2135 " public keys file." % (node_name, node_uuid))
2136
2137 data = {}
2138 _InitSshUpdateData(data, noded_cert_file, ssconf_store)
2139 cluster_name = data[constants.SSHS_CLUSTER_NAME]
2140 data[constants.SSHS_GENERATE] = {constants.SSHS_SUFFIX: suffix}
2141
2142 run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE,
2143 ssh_port_map.get(node_name), data,
2144 debug=False, verbose=False, use_cluster_key=False,
2145 ask_key=False, strict_host_check=False)
2146
2147
2148 def _GetMasterNodeUUID(node_uuid_name_map, master_node_name):
2149 master_node_uuids = [node_uuid for (node_uuid, node_name)
2150 in node_uuid_name_map
2151 if node_name == master_node_name]
2152 if len(master_node_uuids) != 1:
2153 raise errors.SshUpdateError("No (unique) master UUID found. Master node"
2154 " name: '%s', Master UUID: '%s'" %
2155 (master_node_name, master_node_uuids))
2156 return master_node_uuids[0]
2157
2158
2159 def _GetOldMasterKeys(master_node_uuid, pub_key_file):
2160 old_master_keys_by_uuid = ssh.QueryPubKeyFile([master_node_uuid],
2161 key_file=pub_key_file)
2162 if not old_master_keys_by_uuid:
2163 raise errors.SshUpdateError("No public key of the master node (UUID '%s')"
2164 " found, not generating a new key."
2165 % master_node_uuid)
2166 return old_master_keys_by_uuid
2167
2168
2169 def _GetNewMasterKey(root_keyfiles, master_node_uuid):
2170 new_master_keys = []
2171 for (_, (_, public_key_file)) in root_keyfiles.items():
2172 public_key_dir = os.path.dirname(public_key_file)
2173 public_key_file_tmp_filename = \
2174 os.path.splitext(os.path.basename(public_key_file))[0] \
2175 + constants.SSHS_MASTER_SUFFIX + ".pub"
2176 public_key_path_tmp = os.path.join(public_key_dir,
2177 public_key_file_tmp_filename)
2178 if os.path.exists(public_key_path_tmp):
2179 # for some key types, there might not be any keys
2180 key = utils.ReadFile(public_key_path_tmp)
2181 new_master_keys.append(key)
2182 if not new_master_keys:
2183 raise errors.SshUpdateError("Cannot find any type of temporary SSH key.")
2184 return {master_node_uuid: new_master_keys}
2185
2186
2187 def _ReplaceMasterKeyOnMaster(root_keyfiles):
2188 number_of_moves = 0
2189 for (_, (private_key_file, public_key_file)) in root_keyfiles.items():
2190 key_dir = os.path.dirname(public_key_file)
2191 private_key_file_tmp = \
2192 os.path.basename(private_key_file) + constants.SSHS_MASTER_SUFFIX
2193 public_key_file_tmp = private_key_file_tmp + ".pub"
2194 private_key_path_tmp = os.path.join(key_dir,
2195 private_key_file_tmp)
2196 public_key_path_tmp = os.path.join(key_dir,
2197 public_key_file_tmp)
2198 if os.path.exists(public_key_file):
2199 utils.CreateBackup(public_key_file)
2200 utils.RemoveFile(public_key_file)
2201 if os.path.exists(private_key_file):
2202 utils.CreateBackup(private_key_file)
2203 utils.RemoveFile(private_key_file)
2204 if os.path.exists(public_key_path_tmp) and \
2205 os.path.exists(private_key_path_tmp):
2206 # for some key types, there might not be any keys
2207 shutil.move(public_key_path_tmp, public_key_file)
2208 shutil.move(private_key_path_tmp, private_key_file)
2209 number_of_moves += 1
2210 if not number_of_moves:
2211 raise errors.SshUpdateError("Could not move at least one master SSH key.")
2212
2213
2214 def RenewSshKeys(node_uuids, node_names, master_candidate_uuids,
2215 potential_master_candidates,
2216 pub_key_file=pathutils.SSH_PUB_KEYS,
2217 ssconf_store=None,
2218 noded_cert_file=pathutils.NODED_CERT_FILE,
2219 run_cmd_fn=ssh.RunSshCmdWithStdin):
2220 """Renews all SSH keys and updates authorized_keys and ganeti_pub_keys.
2221
2222 @type node_uuids: list of str
2223 @param node_uuids: list of node UUIDs whose keys should be renewed
2224 @type node_names: list of str
2225 @param node_names: list of node names whose keys should be renewed. This list
2226 should match the C{node_uuids} parameter
2227 @type master_candidate_uuids: list of str
2228 @param master_candidate_uuids: list of UUIDs of master candidates or
2229 master node
2230 @type pub_key_file: str
2231 @param pub_key_file: file path of the public key file
2232 @type noded_cert_file: str
2233 @param noded_cert_file: path of the noded SSL certificate file
2234 @type run_cmd_fn: function
2235 @param run_cmd_fn: function to run commands on remote nodes via SSH
2236 @raises ProgrammerError: if node_uuids and node_names don't match;
2237 SshUpdateError if a node's key is missing from the public key file,
2238 if a node's new SSH key could not be fetched from it, or if there is
2239 no entry or more than one entry in the public key list for the master
2240 node.
2241
2242 """
2243 if not ssconf_store:
2244 ssconf_store = ssconf.SimpleStore()
2245 cluster_name = ssconf_store.GetClusterName()
2246
2247 if len(node_uuids) != len(node_names):
2248 raise errors.ProgrammerError("List of node UUIDs and node names"
2249 " does not match in length.")
2250
2251 (_, root_keyfiles) = \
2252 ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
2253 (_, dsa_pub_keyfile) = root_keyfiles[constants.SSHK_DSA]
2254 old_master_key = utils.ReadFile(dsa_pub_keyfile)
2255
2256 node_uuid_name_map = zip(node_uuids, node_names)
2257
2258 master_node_name = ssconf_store.GetMasterNode()
2259 master_node_uuid = _GetMasterNodeUUID(node_uuid_name_map, master_node_name)
2260 ssh_port_map = ssconf_store.GetSshPortMap()
2261 # List of all node errors that happened, but which did not abort the
2262 # procedure as a whole. It is important that this is a list to have a
2263 # somewhat chronological history of events.
2264 all_node_errors = []
2265
2266 # process non-master nodes
2267
2268 # keys to add in bulk at the end
2269 node_keys_to_add = []
2270
2271 for node_uuid, node_name in node_uuid_name_map:
2272 if node_name == master_node_name:
2273 continue
2274 master_candidate = node_uuid in master_candidate_uuids
2275 potential_master_candidate = node_name in potential_master_candidates
2276
2277 keys_by_uuid = ssh.QueryPubKeyFile([node_uuid], key_file=pub_key_file)
2278 if not keys_by_uuid:
2279 raise errors.SshUpdateError("No public key of node %s (UUID %s) found,"
2280 " not generating a new key."
2281 % (node_name, node_uuid))
2282
2283 if master_candidate:
2284 logging.debug("Fetching old SSH key from node '%s'.", node_name)
2285 old_pub_key = ssh.ReadRemoteSshPubKeys(dsa_pub_keyfile,
2286 node_name, cluster_name,
2287 ssh_port_map[node_name],
2288 False, # ask_key
2289 False) # key_check
2290 if old_pub_key != old_master_key:
2291 # If we are already in a multi-key setup (that is past Ganeti 2.12),
2292 # we can safely remove the old key of the node. Otherwise, we cannot
2293 # remove that node's key, because it is also the master node's key
2294 # and that would terminate all communication from the master to the
2295 # node.
2296 logging.debug("Removing SSH key of node '%s'.", node_name)
2297 node_errors = RemoveNodeSshKey(
2298 node_uuid, node_name, master_candidate_uuids,
2299 potential_master_candidates,
2300 master_uuid=master_node_uuid, from_authorized_keys=master_candidate,
2301 from_public_keys=False, clear_authorized_keys=False,
2302 clear_public_keys=False)
2303 if node_errors:
2304 all_node_errors = all_node_errors + node_errors
2305 else:
2306 logging.debug("Old key of node '%s' is the same as the current master"
2307 " key. Not deleting that key on the node.", node_name)
2308
2309 logging.debug("Generating new SSH key for node '%s'.", node_name)
2310 _GenerateNodeSshKey(node_uuid, node_name, ssh_port_map,
2311 pub_key_file=pub_key_file,
2312 ssconf_store=ssconf_store,
2313 noded_cert_file=noded_cert_file,
2314 run_cmd_fn=run_cmd_fn)
2315
2316 try:
2317 logging.debug("Fetching newly created SSH key from node '%s'.", node_name)
2318 pub_key = ssh.ReadRemoteSshPubKeys(dsa_pub_keyfile,
2319 node_name, cluster_name,
2320 ssh_port_map[node_name],
2321 False, # ask_key
2322 False) # key_check
2323 except:
2324 raise errors.SshUpdateError("Could not fetch key of node %s"
2325 " (UUID %s)" % (node_name, node_uuid))
2326
2327 if potential_master_candidate:
2328 ssh.RemovePublicKey(node_uuid, key_file=pub_key_file)
2329 ssh.AddPublicKey(node_uuid, pub_key, key_file=pub_key_file)
2330
2331 logging.debug("Add ssh key of node '%s'.", node_name)
2332 node_info = SshAddNodeInfo(name=node_name,
2333 uuid=node_uuid,
2334 to_authorized_keys=master_candidate,
2335 to_public_keys=potential_master_candidate,
2336 get_public_keys=True)
2337 node_keys_to_add.append(node_info)
2338
2339 node_errors = AddNodeSshKeyBulk(
2340 node_keys_to_add, potential_master_candidates,
2341 pub_key_file=pub_key_file, ssconf_store=ssconf_store,
2342 noded_cert_file=noded_cert_file,
2343 run_cmd_fn=run_cmd_fn)
2344 if node_errors:
2345 all_node_errors = all_node_errors + node_errors
2346
2347 # Renewing the master node's key
2348
2349 # Preserve the old keys for now
2350 old_master_keys_by_uuid = _GetOldMasterKeys(master_node_uuid, pub_key_file)
2351
2352 # Generate a new master key with a suffix, don't touch the old one for now
2353 logging.debug("Generate new ssh key of master.")
2354 _GenerateNodeSshKey(master_node_uuid, master_node_name, ssh_port_map,
2355 pub_key_file=pub_key_file,
2356 ssconf_store=ssconf_store,
2357 noded_cert_file=noded_cert_file,
2358 run_cmd_fn=run_cmd_fn,
2359 suffix=constants.SSHS_MASTER_SUFFIX)
2360 # Read newly created master key
2361 new_master_key_dict = _GetNewMasterKey(root_keyfiles, master_node_uuid)
2362
2363 # Replace master key in the master nodes' public key file
2364 ssh.RemovePublicKey(master_node_uuid, key_file=pub_key_file)
2365 for pub_key in new_master_key_dict[master_node_uuid]:
2366 ssh.AddPublicKey(master_node_uuid, pub_key, key_file=pub_key_file)
2367
2368 # Add new master key to all node's public and authorized keys
2369 logging.debug("Add new master key to all nodes.")
2370 node_errors = AddNodeSshKey(
2371 master_node_uuid, master_node_name, potential_master_candidates,
2372 to_authorized_keys=True, to_public_keys=True,
2373 get_public_keys=False, pub_key_file=pub_key_file,
2374 ssconf_store=ssconf_store, noded_cert_file=noded_cert_file,
2375 run_cmd_fn=run_cmd_fn)
2376 if node_errors:
2377 all_node_errors = all_node_errors + node_errors
2378
2379 # Remove the old key file and rename the new key to the non-temporary filename
2380 _ReplaceMasterKeyOnMaster(root_keyfiles)
2381
2382 # Remove old key from authorized keys
2383 (auth_key_file, _) = \
2384 ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
2385 ssh.RemoveAuthorizedKeys(auth_key_file,
2386 old_master_keys_by_uuid[master_node_uuid])
2387
2388 # Remove the old key from all node's authorized keys file
2389 logging.debug("Remove the old master key from all nodes.")
2390 node_errors = RemoveNodeSshKey(
2391 master_node_uuid, master_node_name, master_candidate_uuids,
2392 potential_master_candidates,
2393 keys_to_remove=old_master_keys_by_uuid, from_authorized_keys=True,
2394 from_public_keys=False, clear_authorized_keys=False,
2395 clear_public_keys=False)
2396 if node_errors:
2397 all_node_errors = all_node_errors + node_errors
2398
2399 return all_node_errors
2400
2401
2402 def GetBlockDevSizes(devices):
2403 """Return the size of the given block devices
2404
2405 @type devices: list
2406 @param devices: list of block device nodes to query
2407 @rtype: dict
2408 @return:
2409 dictionary of all block devices under /dev (key). The value is their
2410 size in MiB.
2411
2412 {'/dev/disk/by-uuid/123456-12321231-312312-312': 124}
2413
2414 """
2415 DEV_PREFIX = "/dev/"
2416 blockdevs = {}
2417
2418 for devpath in devices:
2419 if not utils.IsBelowDir(DEV_PREFIX, devpath):
2420 continue
2421
2422 try:
2423 st = os.stat(devpath)
2424 except EnvironmentError, err:
2425 logging.warning("Error stat()'ing device %s: %s", devpath, str(err))
2426 continue
2427
2428 if stat.S_ISBLK(st.st_mode):
2429 result = utils.RunCmd(["blockdev", "--getsize64", devpath])
2430 if result.failed:
2431 # We don't want to fail, just do not list this device as available
2432 logging.warning("Cannot get size for block device %s", devpath)
2433 continue
2434
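# 'blockdev --getsize64' reports the size in bytes; convert it to MiB here
# (integer division, so sub-MiB remainders are dropped)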
2435 size = int(result.stdout) / (1024 * 1024)
2436 blockdevs[devpath] = size
2437 return blockdevs
2438
2439
2440 def GetVolumeList(vg_names):
2441 """Compute list of logical volumes and their size.
2442
2443 @type vg_names: list
2444 @param vg_names: the volume groups whose LVs we should list, or
2445 empty for all volume groups
2446 @rtype: dict
2447 @return:
2448 dictionary of all partitions (key) with value being a tuple of
2449 their size (in MiB), inactive and online status::
2450
2451 {'xenvg/test1': ('20.06', True, True)}
2452
2453 in case of errors, a string is returned with the error
2454 details.
2455
2456 """
2457 lvs = {}
2458 sep = "|"
2459 if not vg_names:
2460 vg_names = []
2461 result = utils.RunCmd(["lvs", "--noheadings", "--units=m", "--nosuffix",
2462 "--separator=%s" % sep,
2463 "-ovg_name,lv_name,lv_size,lv_attr"] + vg_names)
2464 if result.failed:
2465 _Fail("Failed to list logical volumes, lvs output: %s", result.output)
2466
2467 for line in result.stdout.splitlines():
2468 line = line.strip()
2469 match = _LVSLINE_REGEX.match(line)
2470 if not match:
2471 logging.error("Invalid line returned from lvs output: '%s'", line)
2472 continue
2473 vg_name, name, size, attr = match.groups()
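# 'lv_attr' is a string of single-character flags; the checks below use
# index 0 (volume type, 'v' = virtual), index 4 (state, '-' = inactive)
# and index 5 (device open status, 'o' = open/online)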
2474 inactive = attr[4] == "-"
2475 online = attr[5] == "o"
2476 virtual = attr[0] == "v"
2477 if virtual:
2478 # we don't want to report such volumes as existing, since they
2479 # don't really hold data
2480 continue
2481 lvs[vg_name + "/" + name] = (size, inactive, online)
2482
2483 return lvs
2484
2485
2486 def ListVolumeGroups():
2487 """List the volume groups and their size.
2488
2489 @rtype: dict
2490 @return: dictionary with volume group names as keys and their
2491 sizes as values
2492
2493 """
2494 return utils.ListVolumeGroups()
2495
2496
2497 def NodeVolumes():
2498 """List all volumes on this node.
2499
2500 @rtype: list
2501 @return:
2502 A list of dictionaries, each having four keys:
2503 - name: the logical volume name,
2504 - size: the size of the logical volume
2505 - dev: the physical device on which the LV lives
2506 - vg: the volume group to which it belongs
2507
2508 In case of errors, we return an empty list and log the
2509 error.
2510
2511 Note that since a logical volume can live on multiple physical
2512 volumes, the resulting list might include a logical volume
2513 multiple times.
2514
2515 """
2516 result = utils.RunCmd(["lvs", "--noheadings", "--units=m", "--nosuffix",
2517 "--separator=|",
2518 "--options=lv_name,lv_size,devices,vg_name"])
2519 if result.failed:
2520 _Fail("Failed to list logical volumes, lvs output: %s",
2521 result.output)
2522
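# The lvs 'devices' column typically looks like "/dev/sda1(0),/dev/sdb1(0)"
# (physical volume plus extent offset); parse_dev strips the "(...)" suffix
# and handle_dev splits the comma-separated list into individual device paths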
2523 def parse_dev(dev):
2524 return dev.split("(")[0]
2525
2526 def handle_dev(dev):
2527 return [parse_dev(x) for x in dev.split(",")]
2528
2529 def map_line(line):
2530 line = [v.strip() for v in line]
2531 return [{"name": line[0], "size": line[1],
2532 "dev": dev, "vg": line[3]} for dev in handle_dev(line[2])]
2533
2534 all_devs = []
2535 for line in result.stdout.splitlines():
2536 if line.count("|") >= 3:
2537 all_devs.extend(map_line(line.split("|")))
2538 else:
2539 logging.warning("Strange line in the output from lvs: '%s'", line)
2540 return all_devs
2541
2542
2543 def BridgesExist(bridges_list):
2544 """Check if a list of bridges exist on the current node.
2545
2546 @rtype: boolean
2547 @return: C{True} if all of them exist, C{False} otherwise
2548
2549 """
2550 missing = []
2551 for bridge in bridges_list:
2552 if not utils.BridgeExists(bridge):
2553 missing.append(bridge)
2554
2555 if missing:
2556 _Fail("Missing bridges %s", utils.CommaJoin(missing))
2557
2558
2559 def GetInstanceListForHypervisor(hname, hvparams=None,
2560 get_hv_fn=hypervisor.GetHypervisor):
2561 """Provides a list of instances of the given hypervisor.
2562
2563 @type hname: string
2564 @param hname: name of the hypervisor
2565 @type hvparams: dict of strings
2566 @param hvparams: hypervisor parameters for the given hypervisor
2567 @type get_hv_fn: function
2568 @param get_hv_fn: function that returns a hypervisor for the given hypervisor
2569 name; optional parameter to increase testability
2570
2571 @rtype: list
2572 @return: a list of all running instances on the current node
2573 - instance1.example.com
2574 - instance2.example.com
2575
2576 """
2577 try:
2578 return get_hv_fn(hname).ListInstances(hvparams=hvparams)
2579 except errors.HypervisorError, err:
2580 _Fail("Error enumerating instances (hypervisor %s): %s",
2581 hname, err, exc=True)
2582
2583
2584 def GetInstanceList(hypervisor_list, all_hvparams=None,
2585 get_hv_fn=hypervisor.GetHypervisor):
2586 """Provides a list of instances.
2587
2588 @type hypervisor_list: list
2589 @param hypervisor_list: the list of hypervisors to query information
2590 @type all_hvparams: dict of dict of strings
2591 @param all_hvparams: a dictionary mapping hypervisor types to respective
2592 cluster-wide hypervisor parameters
2593 @type get_hv_fn: function
2594 @param get_hv_fn: function that returns a hypervisor for the given hypervisor
2595 name; optional parameter to increase testability
2596
2597 @rtype: list
2598 @return: a list of all running instances on the current node
2599 - instance1.example.com
2600 - instance2.example.com
2601
2602 """
2603 results = []
2604 for hname in hypervisor_list:
2605 hvparams = all_hvparams[hname]
2606 results.extend(GetInstanceListForHypervisor(hname, hvparams=hvparams,
2607 get_hv_fn=get_hv_fn))
2608 return results
2609
2610
2611 def GetInstanceInfo(instance, hname, hvparams=None):
2612 """Gives back the information about an instance as a dictionary.
2613
2614 @type instance: string
2615 @param instance: the instance name
2616 @type hname: string
2617 @param hname: the hypervisor type of the instance
2618 @type hvparams: dict of strings
2619 @param hvparams: the instance's hvparams
2620
2621 @rtype: dict
2622 @return: dictionary with the following keys:
2623 - memory: memory size of instance (int)
2624 - state: state of instance (HvInstanceState)
2625 - time: cpu time of instance (float)
2626 - vcpus: the number of vcpus (int)
2627
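An example result, with illustrative values::

  {"memory": 512, "vcpus": 2, "state": <HvInstanceState>, "time": 123.4}
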
2628 """
2629 output = {}
2630
2631 iinfo = hypervisor.GetHypervisor(hname).GetInstanceInfo(instance,
2632 hvparams=hvparams)
2633 if iinfo is not None:
2634 output["memory"] = iinfo[2]
2635 output["vcpus"] = iinfo[3]
2636 output["state"] = iinfo[4]
2637 output["time"] = iinfo[5]
2638
2639 return output
2640
2641
2642 def GetInstanceMigratable(instance):
2643 """Computes whether an instance can be migrated.
2644
2645 @type instance: L{objects.Instance}
2646 @param instance: object representing the instance to be checked.
2647
2648 @rtype: tuple
2649 @return: tuple of (result, description) where:
2650 - result: whether the instance can be migrated or not
2651 - description: a description of the issue, if relevant
2652
2653 """
2654 hyper = hypervisor.GetHypervisor(instance.hypervisor)
2655 iname = instance.name
2656 if iname not in hyper.ListInstances(hvparams=instance.hvparams):
2657 _Fail("Instance %s is not running", iname)
2658
2659 for idx in range(len(instance.disks_info)):
2660 link_name = _GetBlockDevSymlinkPath(iname, idx)
2661 if not os.path.islink(link_name):
2662 logging.warning("Instance %s is missing symlink %s for disk %d",
2663 iname, link_name, idx)
2664
2665
2666 def GetAllInstancesInfo(hypervisor_list, all_hvparams):
2667 """Gather data about all instances.
2668
2669 This is the equivalent of L{GetInstanceInfo}, except that it
2670 computes data for all instances at once, thus being faster if one
2671 needs data about more than one instance.
2672
2673 @type hypervisor_list: list
2674 @param hypervisor_list: list of hypervisors to query for instance data
2675 @type all_hvparams: dict of dict of strings
2676 @param all_hvparams: mapping of hypervisor names to hvparams
2677
2678 @rtype: dict
2679 @return: dictionary of instance: data, with data having the following keys:
2680 - memory: memory size of instance (int)
2681 - state: xen state of instance (string)
2682 - time: cpu time of instance (float)
2683 - vcpus: the number of vcpus
2684
2685 """
2686 output = {}
2687 for hname in hypervisor_list:
2688 hvparams = all_hvparams[hname]
2689 iinfo = hypervisor.GetHypervisor(hname).GetAllInstancesInfo(hvparams)
2690 if iinfo:
2691 for name, _, memory, vcpus, state, times in iinfo:
2692 value = {
2693 "memory": memory,
2694 "vcpus": vcpus,
2695 "state": state,
2696 "time": times,
2697 }
2698 if name in output:
2699 # we only check static parameters, like memory and vcpus,
2700 # and not state and time which can change between the
2701 # invocations of the different hypervisors
2702 for key in "memory", "vcpus":
2703 if value[key] != output[name][key]:
2704 _Fail("Instance %s is running twice"
2705 " with different parameters", name)
2706 output[name] = value
2707
2708 return output
2709
2710
2711 def GetInstanceConsoleInfo(instance_param_dict,
2712 get_hv_fn=hypervisor.GetHypervisor):
2713 """Gather data about the console access of a set of instances of this node.
2714
2715 This function assumes that the caller already knows which instances are on
2716 this node, by calling a function such as L{GetAllInstancesInfo} or
2717 L{GetInstanceList}.
2718
2719 For every instance, a large amount of configuration data needs to be
2720 provided to the hypervisor interface in order to receive the console
2721 information. Whether this could or should be cut down can be discussed.
2722 The information is provided in a dictionary indexed by instance name,
2723 allowing any number of instance queries to be done.
2724
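The expected input shape is roughly (illustrative)::

  {"instance1.example.com": {"instance": <Instance dict>,
                             "node": <Node dict>,
                             "group": <NodeGroup dict>,
                             "hvParams": {...},
                             "beParams": {...}}}
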
2725 @type instance_param_dict: dict of string to dict of dictionaries, where
2726 the inner dictionaries represent: L{objects.Instance}, L{objects.Node},
2727 L{objects.NodeGroup}, HvParams, BeParams
2728 @param instance_param_dict: mapping of instance name to parameters necessary
2729 for console information retrieval
2730
2731 @rtype: dict
2732 @return: dictionary of instance: data, with data having the following keys:
2733 - instance: instance name
2734 - kind: console kind
2735 - message: used with kind == CONS_MESSAGE, indicates console to be
2736 unavailable, supplies error message
2737 - host: host to connect to
2738 - port: port to use
2739 - user: user for login
2740 - command: the command, broken into parts as an array
2741 - display: unknown, potentially unused?
2742
2743 """
2744
2745 output = {}
2746 for inst_name in instance_param_dict:
2747 instance = instance_param_dict[inst_name]["instance"]
2748 pnode = instance_param_dict[inst_name]["node"]
2749 group = instance_param_dict[inst_name]["group"]
2750 hvparams = instance_param_dict[inst_name]["hvParams"]
2751 beparams = instance_param_dict[inst_name]["beParams"]
2752
2753 instance = objects.Instance.FromDict(instance)
2754 pnode = objects.Node.FromDict(pnode)
2755 group = objects.NodeGroup.FromDict(group)
2756
2757 h = get_hv_fn(instance.hypervisor)
2758 output[inst_name] = h.GetInstanceConsole(instance, pnode, group,
2759 hvparams, beparams).ToDict()
2760
2761 return output
2762
2763
2764 def _InstanceLogName(kind, os_name, instance, component):
2765 """Compute the OS log filename for a given instance and operation.
2766
2767 The instance name and os name are passed in as strings since not all
2768 operations have these as part of an instance object.
2769
2770 @type kind: string
2771 @param kind: the operation type (e.g. add, import, etc.)
2772 @type os_name: string
2773 @param os_name: the os name
2774 @type instance: string
2775 @param instance: the name of the instance being imported/added/etc.
2776 @type component: string or None
2777 @param component: the name of the component of the instance being
2778 transferred
2779
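The resulting path has the form (values and timestamp format illustrative)::

  <LOG_OS_DIR>/import-debootstrap-instance1.example.com-disk0-<timestamp>.log
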
2780 """
2781 # TODO: Use tempfile.mkstemp to create unique filename
2782 if component:
2783 assert "/" not in component
2784 c_msg = "-%s" % component
2785 else:
2786 c_msg = ""
2787 base = ("%s-%s-%s%s-%s.log" %
2788 (kind, os_name, instance, c_msg, utils.TimestampForFilename()))
2789 return utils.PathJoin(pathutils.LOG_OS_DIR, base)
2790
2791
2792 def InstanceOsAdd(instance, reinstall, debug):
2793 """Add an OS to an instance.
2794
2795 @type instance: L{objects.Instance}
2796 @param instance: Instance whose OS is to be installed
2797 @type reinstall: boolean
2798 @param reinstall: whether this is an instance reinstall
2799 @type debug: integer
2800 @param debug: debug level, passed to the OS scripts
2801 @rtype: None
2802
2803 """
2804 inst_os = OSFromDisk(instance.os)
2805
2806 create_env = OSEnvironment(instance, inst_os, debug)
2807 if reinstall:
2808 create_env["INSTANCE_REINSTALL"] = "1"
2809
2810 logfile = _InstanceLogName("add", instance.os, instance.name, None)
2811
2812 result = utils.RunCmd([inst_os.create_script], env=create_env,
2813 cwd=inst_os.path, output=logfile, reset_env=True)
2814 if result.failed:
2815 logging.error("os create command '%s' returned error: %s, logfile: %s,"
2816 " output: %s", result.cmd, result.fail_reason, logfile,
2817 result.output)
2818 lines = [utils.SafeEncode(val)
2819 for val in utils.TailFile(logfile, lines=20)]
2820 _Fail("OS create script failed (%s), last lines in the"
2821 " log file:\n%s", result.fail_reason, "\n".join(lines), log=False)
2822
2823
2824 def RunRenameInstance(instance, old_name, debug):
2825 """Run the OS rename script for an instance.
2826
2827 @type instance: L{objects.Instance}
2828 @param instance: Instance whose OS is to be installed
2829 @type old_name: string
2830 @param old_name: previous instance name
2831 @type debug: integer
2832 @param debug: debug level, passed to the OS scripts
2833 @rtype: boolean
2834 @return: the success of the operation
2835
2836 """
2837 inst_os = OSFromDisk(instance.os)
2838
2839 rename_env = OSEnvironment(instance, inst_os, debug)
2840 rename_env["OLD_INSTANCE_NAME"] = old_name
2841
2842 logfile = _InstanceLogName("rename", instance.os,
2843 "%s-%s" % (old_name, instance.name), None)
2844
2845 result = utils.RunCmd([inst_os.rename_script], env=rename_env,
2846 cwd=inst_os.path, output=logfile, reset_env=True)
2847
2848 if result.failed:
2849 logging.error("os create command '%s' returned error: %s output: %s",
2850 result.cmd, result.fail_reason, result.output)
2851 lines = [utils.SafeEncode(val)
2852 for val in utils.TailFile(logfile, lines=20)]
2853 _Fail("OS rename script failed (%s), last lines in the"
2854 " log file:\n%s", result.fail_reason, "\n".join(lines), log=False)
2855
2856
2857 def _GetBlockDevSymlinkPath(instance_name, idx, _dir=None):
2858 """Returns symlink path for block device.
2859
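The link lives below pathutils.DISK_LINKS_DIR and is named
"<instance_name><DISK_SEPARATOR><idx>", e.g. roughly
"instance1.example.com:0" (separator value illustrative).
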
2860 """
2861 if _dir is None:
2862 _dir = pathutils.DISK_LINKS_DIR
2863
2864 return utils.PathJoin(_dir,
2865 ("%s%s%s" %
2866 (instance_name, constants.DISK_SEPARATOR, idx)))
2867
2868
2869 def _SymlinkBlockDev(instance_name, device_path, idx):
2870 """Set up symlinks to a instance's block device.
2871
2872 This is an auxiliary function run when an instance is start (on the primary
2873 node) or when an instance is migrated (on the target node).
2874
2875
2876 @param instance_name: the name of the target instance
2877 @param device_path: path of the physical block device, on the node
2878 @param idx: the disk index
2879 @return: absolute path to the disk's symlink
2880
2881 """
2882 # In case we have only a userspace access URI, device_path is None
2883 if not device_path:
2884 return None
2885
2886 link_name = _GetBlockDevSymlinkPath(instance_name, idx)
2887 try:
2888 os.symlink(device_path, link_name)
2889 except OSError, err:
2890 if err.errno == errno.EEXIST:
2891 if (not os.path.islink(link_name) or
2892 os.readlink(link_name) != device_path):
2893 os.remove(link_name)
2894 os.symlink(device_path, link_name)
2895 else:
2896 raise
2897
2898 return link_name
2899
2900
2901 def _RemoveBlockDevLinks(instance_name, disks):
2902 """Remove the block device symlinks belonging to the given instance.
2903
2904 """
2905 for idx, _ in enumerate(disks):
2906 link_name = _GetBlockDevSymlinkPath(instance_name, idx)
2907 if os.path.islink(link_name):
2908 try:
2909 os.remove(link_name)
2910 except OSError:
2911 logging.exception("Can't remove symlink '%s'", link_name)
2912
2913
2914 def _CalculateDeviceURI(instance, disk, device):
2915 """Get the URI for the device.
2916
2917 @type instance: L{objects.Instance}
2918 @param instance: the instance which disk belongs to
2919 @type disk: L{objects.Disk}
2920 @param disk: the target disk object
2921 @type device: L{bdev.BlockDev}
2922 @param device: the corresponding BlockDevice
2923 @rtype: string
2924 @return: the device uri if any else None
2925
2926 """
2927 access_mode = disk.params.get(constants.LDP_ACCESS,
2928 constants.DISK_KERNELSPACE)
2929 if access_mode == constants.DISK_USERSPACE:
2930 # This can raise errors.BlockDeviceError
2931 return device.GetUserspaceAccessUri(instance.hypervisor)
2932 else:
2933 return None
2934
2935
2936 def _GatherAndLinkBlockDevs(instance):
2937 """Set up an instance's block device(s).
2938
2939 This is run on the primary node at instance startup. The block
2940 devices must be already assembled.
2941
2942 @type instance: L{objects.Instance}
2943 @param instance: the instance whose disks we should assemble
2944 @rtype: list
2945 @return: list of (disk_object, link_name, drive_uri)
2946
2947 """
2948 block_devices = []
2949 for idx, disk in enumerate(instance.disks_info):
2950 device = _RecursiveFindBD(disk)
2951 if device is None:
2952 raise errors.BlockDeviceError("Block device '%s' is not set up." %
2953 str(disk))
2954 device.Open()
2955 try:
2956 link_name = _SymlinkBlockDev(instance.name, device.dev_path, idx)
2957 except OSError, e:
2958 raise errors.BlockDeviceError("Cannot create block device symlink: %s" %
2959 e.strerror)
2960 uri = _CalculateDeviceURI(instance, disk, device)
2961
2962 block_devices.append((disk, link_name, uri))
2963
2964 return block_devices
2965
2966
2967 def _IsInstanceUserDown(instance_info):
2968 return instance_info and \
2969 "state" in instance_info and \
2970 hv_base.HvInstanceState.IsShutdown(instance_info["state"])
2971
2972
2973 def _GetInstanceInfo(instance):
2974 """Helper function L{GetInstanceInfo}"""
2975 return GetInstanceInfo(instance.name, instance.hypervisor,
2976 hvparams=instance.hvparams)
2977
2978
2979 def StartInstance(instance, startup_paused, reason, store_reason=True):
2980 """Start an instance.
2981
2982 @type instance: L{objects.Instance}
2983 @param instance: the instance object
2984 @type startup_paused: bool
2985 @param startup_paused: whether to pause the instance at startup
2986 @type reason: list of reasons
2987 @param reason: the reason trail for this startup
2988 @type store_reason: boolean
2989 @param store_reason: whether to store the startup reason trail on file
2990 @rtype: None
2991
2992 """
2993 instance_info = _GetInstanceInfo(instance)
2994
2995 if instance_info and not _IsInstanceUserDown(instance_info):
2996 logging.info("Instance '%s' already running, not starting", instance.name)
2997 return
2998
2999 try:
3000 block_devices = _GatherAndLinkBlockDevs(instance)
3001 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3002 hyper.StartInstance(instance, block_devices, startup_paused)
3003 if store_reason:
3004 _StoreInstReasonTrail(instance.name, reason)
3005 except errors.BlockDeviceError, err:
3006 _Fail("Block device error: %s", err, exc=True)
3007 except errors.HypervisorError, err:
3008 _RemoveBlockDevLinks(instance.name, instance.disks_info)
3009 _Fail("Hypervisor error: %s", err, exc=True)
3010
3011
3012 def InstanceShutdown(instance, timeout, reason, store_reason=True):
3013 """Shut an instance down.
3014
3015 @note: this function uses polling with a hardcoded timeout.
3016
3017 @type instance: L{objects.Instance}
3018 @param instance: the instance object
3019 @type timeout: integer
3020 @param timeout: maximum timeout for soft shutdown
3021 @type reason: list of reasons
3022 @param reason: the reason trail for this shutdown
3023 @type store_reason: boolean
3024 @param store_reason: whether to store the shutdown reason trail on file
3025 @rtype: None
3026
3027 """
3028 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3029
3030 if not _GetInstanceInfo(instance):
3031 logging.info("Instance '%s' not running, doing nothing", instance.name)
3032 return
3033
3034 class _TryShutdown(object):
3035 def __init__(self):
3036 self.tried_once = False
3037
3038 def __call__(self):
3039 if not _GetInstanceInfo(instance):
3040 return
3041
3042 try:
3043 hyper.StopInstance(instance, retry=self.tried_once, timeout=timeout)
3044 if store_reason:
3045 _StoreInstReasonTrail(instance.name, reason)
3046 except errors.HypervisorError, err:
3047 # if the instance no longer exists, consider this a
3048 # success and go to cleanup
3049 if not _GetInstanceInfo(instance):
3050 return
3051
3052 _Fail("Failed to stop instance '%s': %s", instance.name, err)
3053
3054 self.tried_once = True
3055
3056 raise utils.RetryAgain()
3057
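# Poll via utils.Retry: _TryShutdown is invoked roughly every 5 seconds until
# either the instance is gone or 'timeout' seconds have passed; RetryAgain
# signals "not stopped yet"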
3058 try:
3059 utils.Retry(_TryShutdown(), 5, timeout)
3060 except utils.RetryTimeout:
3061 # the shutdown did not succeed
3062 logging.error("Shutdown of '%s' unsuccessful, forcing", instance.name)
3063
3064 try:
3065 hyper.StopInstance(instance, force=True)
3066 except errors.HypervisorError, err:
3067 # only raise an error if the instance still exists, otherwise
3068 # the error could simply be "instance ... unknown"!
3069 if _GetInstanceInfo(instance):
3070 _Fail("Failed to force stop instance '%s': %s", instance.name, err)
3071
3072 time.sleep(1)
3073
3074 if _GetInstanceInfo(instance):
3075 _Fail("Could not shutdown instance '%s' even by destroy", instance.name)
3076
3077 try:
3078 hyper.CleanupInstance(instance.name)
3079 except errors.HypervisorError, err:
3080 logging.warning("Failed to execute post-shutdown cleanup step: %s", err)
3081
3082 _RemoveBlockDevLinks(instance.name, instance.disks_info)
3083
3084
3085 def InstanceReboot(instance, reboot_type, shutdown_timeout, reason):
3086 """Reboot an instance.
3087
3088 @type instance: L{objects.Instance}
3089 @param instance: the instance object to reboot
3090 @type reboot_type: str
3091 @param reboot_type: the type of reboot, one of the following
3092 constants:
3093 - L{constants.INSTANCE_REBOOT_SOFT}: only reboot the
3094 instance OS, do not recreate the VM
3095 - L{constants.INSTANCE_REBOOT_HARD}: tear down and
3096 restart the VM (at the hypervisor level)
3097 - the other reboot type (L{constants.INSTANCE_REBOOT_FULL}) is
3098 not accepted here, since that mode is handled differently, in
3099 cmdlib, and translates into full stop and start of the
3100 instance (instead of a call_instance_reboot RPC)
3101 @type shutdown_timeout: integer
3102 @param shutdown_timeout: maximum timeout for soft shutdown
3103 @type reason: list of reasons
3104 @param reason: the reason trail for this reboot
3105 @rtype: None
3106
3107 """
3108 # TODO: this is inconsistent with 'StartInstance' and 'InstanceShutdown'
3109 # because those functions simply 'return' on error whereas this one
3110 # raises an exception with '_Fail'
3111 if not _GetInstanceInfo(instance):
3112 _Fail("Cannot reboot instance '%s' that is not running", instance.name)
3113
3114 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3115 if reboot_type == constants.INSTANCE_REBOOT_SOFT:
3116 try:
3117 hyper.RebootInstance(instance)
3118 except errors.HypervisorError, err:
3119 _Fail("Failed to soft reboot instance '%s': %s", instance.name, err)
3120 elif reboot_type == constants.INSTANCE_REBOOT_HARD:
3121 try:
3122 InstanceShutdown(instance, shutdown_timeout, reason, store_reason=False)
3123 result = StartInstance(instance, False, reason, store_reason=False)
3124 _StoreInstReasonTrail(instance.name, reason)
3125 return result
3126 except errors.HypervisorError, err:
3127 _Fail("Failed to hard reboot instance '%s': %s", instance.name, err)
3128 else:
3129 _Fail("Invalid reboot_type received: '%s'", reboot_type)
3130
3131
3132 def InstanceBalloonMemory(instance, memory):
3133 """Resize an instance's memory.
3134
3135 @type instance: L{objects.Instance}
3136 @param instance: the instance object
3137 @type memory: int
3138 @param memory: new memory amount in MB
3139 @rtype: None
3140
3141 """
3142 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3143 running = hyper.ListInstances(hvparams=instance.hvparams)
3144 if instance.name not in running:
3145 logging.info("Instance %s is not running, cannot balloon", instance.name)
3146 return
3147 try:
3148 hyper.BalloonInstanceMemory(instance, memory)
3149 except errors.HypervisorError, err:
3150 _Fail("Failed to balloon instance memory: %s", err, exc=True)
3151
3152
3153 def MigrationInfo(instance):
3154 """Gather information about an instance to be migrated.
3155
3156 @type instance: L{objects.Instance}
3157 @param instance: the instance definition
3158
3159 """
3160 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3161 try:
3162 info = hyper.MigrationInfo(instance)
3163 except errors.HypervisorError, err:
3164 _Fail("Failed to fetch migration information: %s", err, exc=True)
3165 return info
3166
3167
3168 def AcceptInstance(instance, info, target):
3169 """Prepare the node to accept an instance.
3170
3171 @type instance: L{objects.Instance}
3172 @param instance: the instance definition
3173 @type info: string/data (opaque)
3174 @param info: migration information, from the source node
3175 @type target: string
3176 @param target: target host (usually an IP address) on this node
3177
3178 """
3179 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3180 try:
3181 hyper.AcceptInstance(instance, info, target)
3182 except errors.HypervisorError, err:
3183 _Fail("Failed to accept instance: %s", err, exc=True)
3184
3185
3186 def FinalizeMigrationDst(instance, info, success):
3187 """Finalize any preparation to accept an instance.
3188
3189 @type instance: L{objects.Instance}
3190 @param instance: the instance definition
3191 @type info: string/data (opaque)
3192 @param info: migration information, from the source node
3193 @type success: boolean
3194 @param success: whether the migration was a success or a failure
3195
3196 """
3197 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3198 try:
3199 hyper.FinalizeMigrationDst(instance, info, success)
3200 except errors.HypervisorError, err:
3201 _Fail("Failed to finalize migration on the target node: %s", err, exc=True)
3202
3203
3204 def MigrateInstance(cluster_name, instance, target, live):
3205 """Migrates an instance to another node.
3206
3207 @type cluster_name: string
3208 @param cluster_name: name of the cluster
3209 @type instance: L{objects.Instance}
3210 @param instance: the instance definition
3211 @type target: string
3212 @param target: the target node name
3213 @type live: boolean
3214 @param live: whether the migration should be done live or not (the
3215 interpretation of this parameter is left to the hypervisor)
3216 @raise RPCFail: if migration fails for some reason
3217
3218 """
3219 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3220
3221 try:
3222 hyper.MigrateInstance(cluster_name, instance, target, live)
3223 except errors.HypervisorError, err:
3224 _Fail("Failed to migrate instance: %s", err, exc=True)
3225
3226
3227 def FinalizeMigrationSource(instance, success, live):
3228 """Finalize the instance migration on the source node.
3229
3230 @type instance: L{objects.Instance}
3231 @param instance: the instance definition of the migrated instance
3232 @type success: bool
3233 @param success: whether the migration succeeded or not
3234 @type live: bool
3235 @param live: whether the user requested a live migration or not
3236 @raise RPCFail: If the execution fails for some reason
3237
3238 """
3239 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3240
3241 try:
3242 hyper.FinalizeMigrationSource(instance, success, live)
3243 except Exception, err: # pylint: disable=W0703
3244 _Fail("Failed to finalize the migration on the source node: %s", err,
3245 exc=True)
3246
3247
3248 def GetMigrationStatus(instance):
3249 """Get the migration status
3250
3251 @type instance: L{objects.Instance}
3252 @param instance: the instance that is being migrated
3253 @rtype: L{objects.MigrationStatus}
3254 @return: the status of the current migration (one of
3255 L{constants.HV_MIGRATION_VALID_STATUSES}), plus any additional
3256 progress info that can be retrieved from the hypervisor
3257 @raise RPCFail: If the migration status cannot be retrieved
3258
3259 """
3260 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3261 try:
3262 return hyper.GetMigrationStatus(instance)
3263 except Exception, err: # pylint: disable=W0703
3264 _Fail("Failed to get migration status: %s", err, exc=True)
3265
3266
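# Editor's note: an illustrative sketch of how the migration helpers above fit
# together; it is not called by the node daemon.  In a real cluster the master
# drives this sequence over RPC, the source and destination halves run on
# different nodes, and GetMigrationStatus is polled while step 3 is in
# progress; plain local calls are used here purely to document the ordering.
# The function name is made up, and RPCFail is assumed to be the module-level
# exception raised by _Fail().
def _ExampleMigrationFlow(cluster_name, instance, target_ip, target_name, live):
  # 1. Source node: collect hypervisor-specific data for the migration.
  info = MigrationInfo(instance)
  # 2. Destination node: prepare to receive the instance on target_ip.
  AcceptInstance(instance, info, target_ip)
  try:
    # 3. Source node: perform the migration towards target_name.
    MigrateInstance(cluster_name, instance, target_name, live)
    success = True
  except RPCFail:
    success = False
  # 4. Both sides: finalize, keeping or cleaning up the instance depending on
  #    whether the migration succeeded.
  FinalizeMigrationDst(instance, info, success)
  FinalizeMigrationSource(instance, success, live)
  return success

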
3267 def HotplugDevice(instance, action, dev_type, device, extra, seq):
3268 """Hotplug a device
3269
3270 Hotplug is currently supported only for KVM Hypervisor.
3271 @type instance: L{objects.Instance}
3272 @param instance: the instance to which we hotplug a device
3273 @type action: string
3274 @param action: the hotplug action to perform
3275 @type dev_type: string
3276 @param dev_type: the device type to hotplug
3277 @type device: either L{objects.NIC} or L{objects.Disk}
3278 @param device: the device object to hotplug
3279 @type extra: tuple
3280 @param extra: extra info used for disk hotplug (disk link, drive uri)
3281 @type seq: int
3282 @param seq: the index of the device from the master's perspective
3283 @raise RPCFail: in case the instance does not use the KVM hypervisor
3284
3285 """
3286 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3287 try:
3288 hyper.VerifyHotplugSupport(instance, action, dev_type)
3289 except errors.HotplugError, err:
3290 _Fail("Hotplug is not supported: %s", err)
3291
3292 if action == constants.HOTPLUG_ACTION_ADD:
3293 fn = hyper.HotAddDevice
3294 elif action == constants.HOTPLUG_ACTION_REMOVE:
3295 fn = hyper.HotDelDevice
3296 elif action == constants.HOTPLUG_ACTION_MODIFY:
3297 fn = hyper.HotModDevice
3298 else:
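    # "Can't happen" guard: HOTPLUG_ALL_ACTIONS covers the actions handled
    # above, so any other value is a programming error.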
3299 assert action in constants.HOTPLUG_ALL_ACTIONS
3300
3301 return fn(instance, dev_type, device, extra, seq)
3302
3303
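# Editor's note: an illustrative alternative (not used by the daemon) to the
# if/elif chain in HotplugDevice above, expressed as a dispatch table mapping
# each hotplug action to the corresponding hypervisor method; only names
# already used in HotplugDevice appear here, and the function name is made up.
def _ExampleHotplugDispatch(hyper, action):
  # Map each action to the hypervisor method that implements it; an unknown
  # action raises KeyError here instead of tripping the assert used above.
  dispatch = {
    constants.HOTPLUG_ACTION_ADD: hyper.HotAddDevice,
    constants.HOTPLUG_ACTION_REMOVE: hyper.HotDelDevice,
    constants.HOTPLUG_ACTION_MODIFY: hyper.HotModDevice,
  }
  return dispatch[action]

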
3304 def HotplugSupported(instance):
3305 """Checks if hotplug is generally supported.
3306
3307 """
3308 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3309 try:
3310 hyper.HotplugSupported(instance)
3311 except errors.HotplugError, err:
3312 _Fail("Hotplug is not supported: %s", err)
3313
3314
3315 def ModifyInstanceMetadata(metadata):
3316 """Sends instance data to the metadata daemon.
3317
3318 Uses the Luxi transport layer to communicate with the metadata
3319 daemon configuration server. It starts the metadata daemon if it is
3320 not running.
3321 The daemon must be enabled at configuration time.
3322
3323 @type metadata: dict
3324 @param metadata: instance metadata obtained by calling
3325 L{objects.Instance.ToDict} on an instance object
3326
3327 """
3328 if not constants.ENABLE_METAD:
3329 raise errors.ProgrammerError("The metadata daemon is disabled, yet
3330 " ModifyInstanceMetadata has been called")
3331
3332 if not utils.IsDaemonAlive(constants.METAD):
3333 result = utils.RunCmd([pathutils.DAEMON_UTIL, "start", constants.METAD])