Propagate verbose/debug option to ssh_update calls
[ganeti-github.git] / lib / backend.py
1 #
2 #
3
4 # Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014 Google Inc.
5 # All rights reserved.
6 #
7 # Redistribution and use in source and binary forms, with or without
8 # modification, are permitted provided that the following conditions are
9 # met:
10 #
11 # 1. Redistributions of source code must retain the above copyright notice,
12 # this list of conditions and the following disclaimer.
13 #
14 # 2. Redistributions in binary form must reproduce the above copyright
15 # notice, this list of conditions and the following disclaimer in the
16 # documentation and/or other materials provided with the distribution.
17 #
18 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
19 # IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
20 # TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
21 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
22 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
25 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
26 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
27 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30
31 """Functions used by the node daemon
32
33 @var _ALLOWED_UPLOAD_FILES: denotes which files are accepted in
34 the L{UploadFile} function
35 @var _ALLOWED_CLEAN_DIRS: denotes which directories are accepted
36 in the L{_CleanDirectory} function
37
38 """
39
40 # pylint: disable=E1103,C0302
41
42 # E1103: %s %r has no %r member (but some types could not be
43 # inferred), because the _TryOSFromDisk returns either (True, os_obj)
44 # or (False, "string") which confuses pylint
45
46 # C0302: This module has become too big and should be split up
47
48
49 import base64
50 import errno
51 import logging
52 import os
53 import os.path
54 import pycurl
55 import random
56 import re
57 import shutil
58 import signal
59 import stat
60 import tempfile
61 import time
62 import zlib
63 import contextlib
64 import collections
65
66 from ganeti import errors
67 from ganeti import http
68 from ganeti import utils
69 from ganeti import ssh
70 from ganeti import hypervisor
71 from ganeti.hypervisor import hv_base
72 from ganeti import constants
73 from ganeti.storage import bdev
74 from ganeti.storage import drbd
75 from ganeti.storage import extstorage
76 from ganeti.storage import filestorage
77 from ganeti import objects
78 from ganeti import ssconf
79 from ganeti import serializer
80 from ganeti import netutils
81 from ganeti import runtime
82 from ganeti import compat
83 from ganeti import pathutils
84 from ganeti import vcluster
85 from ganeti import ht
86 from ganeti.storage.base import BlockDev
87 from ganeti.storage.drbd import DRBD8
88 from ganeti import hooksmaster
89 import ganeti.metad as metad
90
91
92 _BOOT_ID_PATH = "/proc/sys/kernel/random/boot_id"
93 _ALLOWED_CLEAN_DIRS = compat.UniqueFrozenset([
94 pathutils.DATA_DIR,
95 pathutils.JOB_QUEUE_ARCHIVE_DIR,
96 pathutils.QUEUE_DIR,
97 pathutils.CRYPTO_KEYS_DIR,
98 ])
99 _MAX_SSL_CERT_VALIDITY = 7 * 24 * 60 * 60
100 _X509_KEY_FILE = "key"
101 _X509_CERT_FILE = "cert"
102 _IES_STATUS_FILE = "status"
103 _IES_PID_FILE = "pid"
104 _IES_CA_FILE = "ca"
105
106 #: Valid LVS output line regex
107 _LVSLINE_REGEX = re.compile(r"^ *([^|]+)\|([^|]+)\|([0-9.]+)\|([^|]{6,})\|?$")
108
109 # Actions for the master setup script
110 _MASTER_START = "start"
111 _MASTER_STOP = "stop"
112
113 #: Maximum file permissions for restricted command directory and executables
114 _RCMD_MAX_MODE = (stat.S_IRWXU |
115 stat.S_IRGRP | stat.S_IXGRP |
116 stat.S_IROTH | stat.S_IXOTH)
117
118 #: Delay before returning an error for restricted commands
119 _RCMD_INVALID_DELAY = 10
120
121 #: How long to wait to acquire lock for restricted commands (shorter than
122 #: L{_RCMD_INVALID_DELAY}) to reduce blockage of noded forks when many
123 #: command requests arrive
124 _RCMD_LOCK_TIMEOUT = _RCMD_INVALID_DELAY * 0.8
125
126
127 class RPCFail(Exception):
128 """Class denoting RPC failure.
129
130 Its argument is the error message.
131
132 """
133
134
135 def _GetInstReasonFilename(instance_name):
136 """Path of the file containing the reason of the instance status change.
137
138 @type instance_name: string
139 @param instance_name: The name of the instance
140 @rtype: string
141 @return: The path of the file
142
143 """
144 return utils.PathJoin(pathutils.INSTANCE_REASON_DIR, instance_name)
145
146
147 def _StoreInstReasonTrail(instance_name, trail):
148 """Serialize a reason trail related to an instance change of state to file.
149
150 The exact location of the file depends on the name of the instance and on
151 the configuration of the Ganeti cluster defined at deploy time.
152
153 @type instance_name: string
154 @param instance_name: The name of the instance
155
156 @type trail: list of reasons
157 @param trail: reason trail
158
159 @rtype: None
160
161 """
162 json = serializer.DumpJson(trail)
163 filename = _GetInstReasonFilename(instance_name)
164 utils.WriteFile(filename, data=json)
165
166
167 def _Fail(msg, *args, **kwargs):
168 """Log an error and the raise an RPCFail exception.
169
170 This exception is then handled specially in the ganeti daemon and
171 turned into a 'failed' return type. As such, this function is a
172 useful shortcut for logging the error and returning it to the master
173 daemon.
174
175 @type msg: string
176 @param msg: the text of the exception
177 @raise RPCFail
178
179 """
180 if args:
181 msg = msg % args
182 if "log" not in kwargs or kwargs["log"]: # if we should log this error
183 if "exc" in kwargs and kwargs["exc"]:
184 logging.exception(msg)
185 else:
186 logging.error(msg)
187 raise RPCFail(msg)
188
189
190 def _GetConfig():
191 """Simple wrapper to return a SimpleStore.
192
193 @rtype: L{ssconf.SimpleStore}
194 @return: a SimpleStore instance
195
196 """
197 return ssconf.SimpleStore()
198
199
200 def _GetSshRunner(cluster_name):
201 """Simple wrapper to return an SshRunner.
202
203 @type cluster_name: str
204 @param cluster_name: the cluster name, which is needed
205 by the SshRunner constructor
206 @rtype: L{ssh.SshRunner}
207 @return: an SshRunner instance
208
209 """
210 return ssh.SshRunner(cluster_name)
211
212
213 def _Decompress(data):
214 """Unpacks data compressed by the RPC client.
215
216 @type data: list or tuple
217 @param data: Data sent by RPC client
218 @rtype: str
219 @return: Decompressed data
220
221 """
222 assert isinstance(data, (list, tuple))
223 assert len(data) == 2
224 (encoding, content) = data
225 if encoding == constants.RPC_ENCODING_NONE:
226 return content
227 elif encoding == constants.RPC_ENCODING_ZLIB_BASE64:
228 return zlib.decompress(base64.b64decode(content))
229 else:
230 raise AssertionError("Unknown data encoding")
231
232
233 def _CleanDirectory(path, exclude=None):
234 """Removes all regular files in a directory.
235
236 @type path: str
237 @param path: the directory to clean
238 @type exclude: list
239 @param exclude: list of files to be excluded, defaults
240 to the empty list
241
242 """
243 if path not in _ALLOWED_CLEAN_DIRS:
244 _Fail("Path passed to _CleanDirectory not in allowed clean targets: '%s'",
245 path)
246
247 if not os.path.isdir(path):
248 return
249 if exclude is None:
250 exclude = []
251 else:
252 # Normalize excluded paths
253 exclude = [os.path.normpath(i) for i in exclude]
254
255 for rel_name in utils.ListVisibleFiles(path):
256 full_name = utils.PathJoin(path, rel_name)
257 if full_name in exclude:
258 continue
259 if os.path.isfile(full_name) and not os.path.islink(full_name):
260 utils.RemoveFile(full_name)
261
262
263 def _BuildUploadFileList():
264 """Build the list of allowed upload files.
265
266 This is abstracted so that it's built only once at module import time.
267
268 """
269 allowed_files = set([
270 pathutils.CLUSTER_CONF_FILE,
271 pathutils.ETC_HOSTS,
272 pathutils.SSH_KNOWN_HOSTS_FILE,
273 pathutils.VNC_PASSWORD_FILE,
274 pathutils.RAPI_CERT_FILE,
275 pathutils.SPICE_CERT_FILE,
276 pathutils.SPICE_CACERT_FILE,
277 pathutils.RAPI_USERS_FILE,
278 pathutils.CONFD_HMAC_KEY,
279 pathutils.CLUSTER_DOMAIN_SECRET_FILE,
280 ])
281
282 for hv_name in constants.HYPER_TYPES:
283 hv_class = hypervisor.GetHypervisorClass(hv_name)
284 allowed_files.update(hv_class.GetAncillaryFiles()[0])
285
286 assert pathutils.FILE_STORAGE_PATHS_FILE not in allowed_files, \
287 "Allowed file storage paths should never be uploaded via RPC"
288
289 return frozenset(allowed_files)
290
291
292 _ALLOWED_UPLOAD_FILES = _BuildUploadFileList()
293
294
295 def JobQueuePurge():
296 """Removes job queue files and archived jobs.
297
298 @rtype: tuple
299 @return: True, None
300
301 """
302 _CleanDirectory(pathutils.QUEUE_DIR, exclude=[pathutils.JOB_QUEUE_LOCK_FILE])
303 _CleanDirectory(pathutils.JOB_QUEUE_ARCHIVE_DIR)
304
305
306 def GetMasterNodeName():
307 """Returns the master node name.
308
309 @rtype: string
310 @return: name of the master node
311 @raise RPCFail: in case of errors
312
313 """
314 try:
315 return _GetConfig().GetMasterNode()
316 except errors.ConfigurationError, err:
317 _Fail("Cluster configuration incomplete: %s", err, exc=True)
318
319
320 def RunLocalHooks(hook_opcode, hooks_path, env_builder_fn):
321 """Decorator that runs hooks before and after the decorated function.
322
323 @type hook_opcode: string
324 @param hook_opcode: opcode of the hook
325 @type hooks_path: string
326 @param hooks_path: path of the hooks
327 @type env_builder_fn: function
328 @param env_builder_fn: function that returns a dictionary containing the
329 environment variables for the hooks. Will get all the parameters of the
330 decorated function.
331 @raise RPCFail: in case of pre-hook failure
332
333 """
334 def decorator(fn):
335 def wrapper(*args, **kwargs):
336 _, myself = ssconf.GetMasterAndMyself()
337 nodes = ([myself], [myself]) # these hooks run locally
338
339 env_fn = compat.partial(env_builder_fn, *args, **kwargs)
340
341 cfg = _GetConfig()
342 hr = HooksRunner()
343 hm = hooksmaster.HooksMaster(hook_opcode, hooks_path, nodes,
344 hr.RunLocalHooks, None, env_fn, None,
345 logging.warning, cfg.GetClusterName(),
346 cfg.GetMasterNode())
347 hm.RunPhase(constants.HOOKS_PHASE_PRE)
348 result = fn(*args, **kwargs)
349 hm.RunPhase(constants.HOOKS_PHASE_POST)
350
351 return result
352 return wrapper
353 return decorator
354
355
356 def _BuildMasterIpEnv(master_params, use_external_mip_script=None):
357 """Builds environment variables for master IP hooks.
358
359 @type master_params: L{objects.MasterNetworkParameters}
360 @param master_params: network parameters of the master
361 @type use_external_mip_script: boolean
362 @param use_external_mip_script: whether to use an external master IP
363 address setup script (unused, but necessary per the implementation of the
364 _RunLocalHooks decorator)
365
366 """
367 # pylint: disable=W0613
368 ver = netutils.IPAddress.GetVersionFromAddressFamily(master_params.ip_family)
369 env = {
370 "MASTER_NETDEV": master_params.netdev,
371 "MASTER_IP": master_params.ip,
372 "MASTER_NETMASK": str(master_params.netmask),
373 "CLUSTER_IP_VERSION": str(ver),
374 }
375
376 return env
377
378
379 def _RunMasterSetupScript(master_params, action, use_external_mip_script):
380 """Execute the master IP address setup script.
381
382 @type master_params: L{objects.MasterNetworkParameters}
383 @param master_params: network parameters of the master
384 @type action: string
385 @param action: action to pass to the script. Must be one of
386 L{backend._MASTER_START} or L{backend._MASTER_STOP}
387 @type use_external_mip_script: boolean
388 @param use_external_mip_script: whether to use an external master IP
389 address setup script
390 @raise backend.RPCFail: if there are errors during the execution of the
391 script
392
393 """
394 env = _BuildMasterIpEnv(master_params)
395
396 if use_external_mip_script:
397 setup_script = pathutils.EXTERNAL_MASTER_SETUP_SCRIPT
398 else:
399 setup_script = pathutils.DEFAULT_MASTER_SETUP_SCRIPT
400
401 result = utils.RunCmd([setup_script, action], env=env, reset_env=True)
402
403 if result.failed:
404 _Fail("Failed to %s the master IP. Script return value: %s, output: '%s'" %
405 (action, result.exit_code, result.output), log=True)
406
407
408 @RunLocalHooks(constants.FAKE_OP_MASTER_TURNUP, "master-ip-turnup",
409 _BuildMasterIpEnv)
410 def ActivateMasterIp(master_params, use_external_mip_script):
411 """Activate the IP address of the master daemon.
412
413 @type master_params: L{objects.MasterNetworkParameters}
414 @param master_params: network parameters of the master
415 @type use_external_mip_script: boolean
416 @param use_external_mip_script: whether to use an external master IP
417 address setup script
418 @raise RPCFail: in case of errors during the IP startup
419
420 """
421 _RunMasterSetupScript(master_params, _MASTER_START,
422 use_external_mip_script)
423
424
425 def StartMasterDaemons(no_voting):
426 """Activate local node as master node.
427
428 The function will start the master daemons (ganeti-masterd and ganeti-rapi).
429
430 @type no_voting: boolean
431 @param no_voting: whether to start ganeti-masterd without a node vote
432 but still non-interactively
433 @rtype: None
434
435 """
436
437 if no_voting:
438 masterd_args = "--no-voting --yes-do-it"
439 else:
440 masterd_args = ""
441
442 env = {
443 "EXTRA_MASTERD_ARGS": masterd_args,
444 }
445
446 result = utils.RunCmd([pathutils.DAEMON_UTIL, "start-master"], env=env)
447 if result.failed:
448 msg = "Can't start Ganeti master: %s" % result.output
449 logging.error(msg)
450 _Fail(msg)
451
452
453 @RunLocalHooks(constants.FAKE_OP_MASTER_TURNDOWN, "master-ip-turndown",
454 _BuildMasterIpEnv)
455 def DeactivateMasterIp(master_params, use_external_mip_script):
456 """Deactivate the master IP on this node.
457
458 @type master_params: L{objects.MasterNetworkParameters}
459 @param master_params: network parameters of the master
460 @type use_external_mip_script: boolean
461 @param use_external_mip_script: whether to use an external master IP
462 address setup script
463 @raise RPCFail: in case of errors during the IP turndown
464
465 """
466 _RunMasterSetupScript(master_params, _MASTER_STOP,
467 use_external_mip_script)
468
469
470 def StopMasterDaemons():
471 """Stop the master daemons on this node.
472
473 Stop the master daemons (ganeti-masterd and ganeti-rapi) on this node.
474
475 @rtype: None
476
477 """
478 # TODO: log and report back to the caller the error failures; we
479 # need to decide in which case we fail the RPC for this
480
481 result = utils.RunCmd([pathutils.DAEMON_UTIL, "stop-master"])
482 if result.failed:
483 logging.error("Could not stop Ganeti master, command %s had exitcode %s"
484 " and error %s",
485 result.cmd, result.exit_code, result.output)
486
487
488 def ChangeMasterNetmask(old_netmask, netmask, master_ip, master_netdev):
489 """Change the netmask of the master IP.
490
491 @param old_netmask: the old value of the netmask
492 @param netmask: the new value of the netmask
493 @param master_ip: the master IP
494 @param master_netdev: the master network device
495
496 """
497 if old_netmask == netmask:
498 return
499
500 if not netutils.IPAddress.Own(master_ip):
501 _Fail("The master IP address is not up, not attempting to change its"
502 " netmask")
503
504 result = utils.RunCmd([constants.IP_COMMAND_PATH, "address", "add",
505 "%s/%s" % (master_ip, netmask),
506 "dev", master_netdev, "label",
507 "%s:0" % master_netdev])
508 if result.failed:
509 _Fail("Could not set the new netmask on the master IP address")
510
511 result = utils.RunCmd([constants.IP_COMMAND_PATH, "address", "del",
512 "%s/%s" % (master_ip, old_netmask),
513 "dev", master_netdev, "label",
514 "%s:0" % master_netdev])
515 if result.failed:
516 _Fail("Could not bring down the master IP address with the old netmask")
517
518
519 def EtcHostsModify(mode, host, ip):
520 """Modify a host entry in /etc/hosts.
521
522 @param mode: The mode to operate. Either add or remove entry
523 @param host: The host to operate on
524 @param ip: The ip associated with the entry
525
526 """
527 if mode == constants.ETC_HOSTS_ADD:
528 if not ip:
529 RPCFail("Mode 'add' needs 'ip' parameter, but parameter not"
530 " present")
531 utils.AddHostToEtcHosts(host, ip)
532 elif mode == constants.ETC_HOSTS_REMOVE:
533 if ip:
534 RPCFail("Mode 'remove' does not allow 'ip' parameter, but"
535 " parameter is present")
536 utils.RemoveHostFromEtcHosts(host)
537 else:
538 RPCFail("Mode not supported")
539
540
541 def LeaveCluster(modify_ssh_setup):
542 """Cleans up and remove the current node.
543
544 This function cleans up and prepares the current node to be removed
545 from the cluster.
546
547 If processing is successful, then it raises an
548 L{errors.QuitGanetiException} which is used as a special case to
549 shutdown the node daemon.
550
551 @param modify_ssh_setup: boolean
552
553 """
554 _CleanDirectory(pathutils.DATA_DIR)
555 _CleanDirectory(pathutils.CRYPTO_KEYS_DIR)
556 JobQueuePurge()
557
558 if modify_ssh_setup:
559 try:
560 priv_key, pub_key, auth_keys = ssh.GetUserFiles(constants.SSH_LOGIN_USER)
561
562 ssh.RemoveAuthorizedKey(auth_keys, utils.ReadFile(pub_key))
563
564 utils.RemoveFile(priv_key)
565 utils.RemoveFile(pub_key)
566 except errors.OpExecError:
567 logging.exception("Error while processing ssh files")
568
569 try:
570 utils.RemoveFile(pathutils.CONFD_HMAC_KEY)
571 utils.RemoveFile(pathutils.RAPI_CERT_FILE)
572 utils.RemoveFile(pathutils.SPICE_CERT_FILE)
573 utils.RemoveFile(pathutils.SPICE_CACERT_FILE)
574 utils.RemoveFile(pathutils.NODED_CERT_FILE)
575 except: # pylint: disable=W0702
576 logging.exception("Error while removing cluster secrets")
577
578 utils.StopDaemon(constants.CONFD)
579 utils.StopDaemon(constants.MOND)
580 utils.StopDaemon(constants.KVMD)
581
582 # Raise a custom exception (handled in ganeti-noded)
583 raise errors.QuitGanetiException(True, "Shutdown scheduled")
584
585
586 def _CheckStorageParams(params, num_params):
587 """Performs sanity checks for storage parameters.
588
589 @type params: list
590 @param params: list of storage parameters
591 @type num_params: int
592 @param num_params: expected number of parameters
593
594 """
595 if params is None:
596 raise errors.ProgrammerError("No storage parameters for storage"
597 " reporting is provided.")
598 if not isinstance(params, list):
599 raise errors.ProgrammerError("The storage parameters are not of type"
600 " list: '%s'" % params)
601 if not len(params) == num_params:
602 raise errors.ProgrammerError("Did not receive the expected number of"
603 "storage parameters: expected %s,"
604 " received '%s'" % (num_params, len(params)))
605
606
607 def _CheckLvmStorageParams(params):
608 """Performs sanity check for the 'exclusive storage' flag.
609
610 @see: C{_CheckStorageParams}
611
612 """
613 _CheckStorageParams(params, 1)
614 excl_stor = params[0]
615 if not isinstance(params[0], bool):
616 raise errors.ProgrammerError("Exclusive storage parameter is not"
617 " boolean: '%s'." % excl_stor)
618 return excl_stor
619
620
621 def _GetLvmVgSpaceInfo(name, params):
622 """Wrapper around C{_GetVgInfo} which checks the storage parameters.
623
624 @type name: string
625 @param name: name of the volume group
626 @type params: list
627 @param params: list of storage parameters, which in this case should be
628 containing only one for exclusive storage
629
630 """
631 excl_stor = _CheckLvmStorageParams(params)
632 return _GetVgInfo(name, excl_stor)
633
634
635 def _GetVgInfo(
636 name, excl_stor, info_fn=bdev.LogicalVolume.GetVGInfo):
637 """Retrieves information about a LVM volume group.
638
639 """
640 # TODO: GetVGInfo supports returning information for multiple VGs at once
641 vginfo = info_fn([name], excl_stor)
642 if vginfo:
643 vg_free = int(round(vginfo[0][0], 0))
644 vg_size = int(round(vginfo[0][1], 0))
645 else:
646 vg_free = None
647 vg_size = None
648
649 return {
650 "type": constants.ST_LVM_VG,
651 "name": name,
652 "storage_free": vg_free,
653 "storage_size": vg_size,
654 }
655
656
657 def _GetLvmPvSpaceInfo(name, params):
658 """Wrapper around C{_GetVgSpindlesInfo} with sanity checks.
659
660 @see: C{_GetLvmVgSpaceInfo}
661
662 """
663 excl_stor = _CheckLvmStorageParams(params)
664 return _GetVgSpindlesInfo(name, excl_stor)
665
666
667 def _GetVgSpindlesInfo(
668 name, excl_stor, info_fn=bdev.LogicalVolume.GetVgSpindlesInfo):
669 """Retrieves information about spindles in an LVM volume group.
670
671 @type name: string
672 @param name: VG name
673 @type excl_stor: bool
674 @param excl_stor: exclusive storage
675 @rtype: dict
676 @return: dictionary whose keys are "name", "vg_free", "vg_size" for VG name,
677 free spindles, total spindles respectively
678
679 """
680 if excl_stor:
681 (vg_free, vg_size) = info_fn(name)
682 else:
683 vg_free = 0
684 vg_size = 0
685 return {
686 "type": constants.ST_LVM_PV,
687 "name": name,
688 "storage_free": vg_free,
689 "storage_size": vg_size,
690 }
691
692
693 def _GetHvInfo(name, hvparams, get_hv_fn=hypervisor.GetHypervisor):
694 """Retrieves node information from a hypervisor.
695
696 The information returned depends on the hypervisor. Common items:
697
698 - vg_size is the size of the configured volume group in MiB
699 - vg_free is the free size of the volume group in MiB
700 - memory_dom0 is the memory allocated for domain0 in MiB
701 - memory_free is the currently available (free) ram in MiB
702 - memory_total is the total number of ram in MiB
703 - hv_version: the hypervisor version, if available
704
705 @type hvparams: dict of string
706 @param hvparams: the hypervisor's hvparams
707
708 """
709 return get_hv_fn(name).GetNodeInfo(hvparams=hvparams)
710
711
712 def _GetHvInfoAll(hv_specs, get_hv_fn=hypervisor.GetHypervisor):
713 """Retrieves node information for all hypervisors.
714
715 See C{_GetHvInfo} for information on the output.
716
717 @type hv_specs: list of pairs (string, dict of strings)
718 @param hv_specs: list of pairs of a hypervisor's name and its hvparams
719
720 """
721 if hv_specs is None:
722 return None
723
724 result = []
725 for hvname, hvparams in hv_specs:
726 result.append(_GetHvInfo(hvname, hvparams, get_hv_fn))
727 return result
728
729
730 def _GetNamedNodeInfo(names, fn):
731 """Calls C{fn} for all names in C{names} and returns a dictionary.
732
733 @rtype: None or dict
734
735 """
736 if names is None:
737 return None
738 else:
739 return map(fn, names)
740
741
742 def GetNodeInfo(storage_units, hv_specs):
743 """Gives back a hash with different information about the node.
744
745 @type storage_units: list of tuples (string, string, list)
746 @param storage_units: List of tuples (storage unit, identifier, parameters) to
747 ask for disk space information. In case of lvm-vg, the identifier is
748 the VG name. The parameters can contain additional, storage-type-specific
749 parameters, for example exclusive storage for lvm storage.
750 @type hv_specs: list of pairs (string, dict of strings)
751 @param hv_specs: list of pairs of a hypervisor's name and its hvparams
752 @rtype: tuple; (string, None/dict, None/dict)
753 @return: Tuple containing boot ID, volume group information and hypervisor
754 information
755
756 """
757 bootid = utils.ReadFile(_BOOT_ID_PATH, size=128).rstrip("\n")
758 storage_info = _GetNamedNodeInfo(
759 storage_units,
760 (lambda (storage_type, storage_key, storage_params):
761 _ApplyStorageInfoFunction(storage_type, storage_key, storage_params)))
762 hv_info = _GetHvInfoAll(hv_specs)
763 return (bootid, storage_info, hv_info)
764
765
766 def _GetFileStorageSpaceInfo(path, params):
767 """Wrapper around filestorage.GetSpaceInfo.
768
769 The purpose of this wrapper is to call filestorage.GetFileStorageSpaceInfo
770 and ignore the *args parameter to not leak it into the filestorage
771 module's code.
772
773 @see: C{filestorage.GetFileStorageSpaceInfo} for description of the
774 parameters.
775
776 """
777 _CheckStorageParams(params, 0)
778 return filestorage.GetFileStorageSpaceInfo(path)
779
780
781 # FIXME: implement storage reporting for all missing storage types.
782 _STORAGE_TYPE_INFO_FN = {
783 constants.ST_BLOCK: None,
784 constants.ST_DISKLESS: None,
785 constants.ST_EXT: None,
786 constants.ST_FILE: _GetFileStorageSpaceInfo,
787 constants.ST_LVM_PV: _GetLvmPvSpaceInfo,
788 constants.ST_LVM_VG: _GetLvmVgSpaceInfo,
789 constants.ST_SHARED_FILE: None,
790 constants.ST_GLUSTER: None,
791 constants.ST_RADOS: None,
792 }
793
794
795 def _ApplyStorageInfoFunction(storage_type, storage_key, *args):
796 """Looks up and applies the correct function to calculate free and total
797 storage for the given storage type.
798
799 @type storage_type: string
800 @param storage_type: the storage type for which the storage shall be reported.
801 @type storage_key: string
802 @param storage_key: identifier of a storage unit, e.g. the volume group name
803 of an LVM storage unit
804 @type args: any
805 @param args: various parameters that can be used for storage reporting. These
806 parameters and their semantics vary from storage type to storage type and
807 are just propagated in this function.
808 @return: the results of the application of the storage space function (see
809 _STORAGE_TYPE_INFO_FN) if storage space reporting is implemented for that
810 storage type
811 @raises NotImplementedError: for storage types who don't support space
812 reporting yet
813 """
814 fn = _STORAGE_TYPE_INFO_FN[storage_type]
815 if fn is not None:
816 return fn(storage_key, *args)
817 else:
818 raise NotImplementedError
819
820
821 def _CheckExclusivePvs(pvi_list):
822 """Check that PVs are not shared among LVs
823
824 @type pvi_list: list of L{objects.LvmPvInfo} objects
825 @param pvi_list: information about the PVs
826
827 @rtype: list of tuples (string, list of strings)
828 @return: offending volumes, as tuples: (pv_name, [lv1_name, lv2_name...])
829
830 """
831 res = []
832 for pvi in pvi_list:
833 if len(pvi.lv_list) > 1:
834 res.append((pvi.name, pvi.lv_list))
835 return res
836
837
838 def _VerifyHypervisors(what, vm_capable, result, all_hvparams,
839 get_hv_fn=hypervisor.GetHypervisor):
840 """Verifies the hypervisor. Appends the results to the 'results' list.
841
842 @type what: C{dict}
843 @param what: a dictionary of things to check
844 @type vm_capable: boolean
845 @param vm_capable: whether or not this node is vm capable
846 @type result: dict
847 @param result: dictionary of verification results; results of the
848 verifications in this function will be added here
849 @type all_hvparams: dict of dict of string
850 @param all_hvparams: dictionary mapping hypervisor names to hvparams
851 @type get_hv_fn: function
852 @param get_hv_fn: function to retrieve the hypervisor, to improve testability
853
854 """
855 if not vm_capable:
856 return
857
858 if constants.NV_HYPERVISOR in what:
859 result[constants.NV_HYPERVISOR] = {}
860 for hv_name in what[constants.NV_HYPERVISOR]:
861 hvparams = all_hvparams[hv_name]
862 try:
863 val = get_hv_fn(hv_name).Verify(hvparams=hvparams)
864 except errors.HypervisorError, err:
865 val = "Error while checking hypervisor: %s" % str(err)
866 result[constants.NV_HYPERVISOR][hv_name] = val
867
868
869 def _VerifyHvparams(what, vm_capable, result,
870 get_hv_fn=hypervisor.GetHypervisor):
871 """Verifies the hvparams. Appends the results to the 'results' list.
872
873 @type what: C{dict}
874 @param what: a dictionary of things to check
875 @type vm_capable: boolean
876 @param vm_capable: whether or not this node is vm capable
877 @type result: dict
878 @param result: dictionary of verification results; results of the
879 verifications in this function will be added here
880 @type get_hv_fn: function
881 @param get_hv_fn: function to retrieve the hypervisor, to improve testability
882
883 """
884 if not vm_capable:
885 return
886
887 if constants.NV_HVPARAMS in what:
888 result[constants.NV_HVPARAMS] = []
889 for source, hv_name, hvparms in what[constants.NV_HVPARAMS]:
890 try:
891 logging.info("Validating hv %s, %s", hv_name, hvparms)
892 get_hv_fn(hv_name).ValidateParameters(hvparms)
893 except errors.HypervisorError, err:
894 result[constants.NV_HVPARAMS].append((source, hv_name, str(err)))
895
896
897 def _VerifyInstanceList(what, vm_capable, result, all_hvparams):
898 """Verifies the instance list.
899
900 @type what: C{dict}
901 @param what: a dictionary of things to check
902 @type vm_capable: boolean
903 @param vm_capable: whether or not this node is vm capable
904 @type result: dict
905 @param result: dictionary of verification results; results of the
906 verifications in this function will be added here
907 @type all_hvparams: dict of dict of string
908 @param all_hvparams: dictionary mapping hypervisor names to hvparams
909
910 """
911 if constants.NV_INSTANCELIST in what and vm_capable:
912 # GetInstanceList can fail
913 try:
914 val = GetInstanceList(what[constants.NV_INSTANCELIST],
915 all_hvparams=all_hvparams)
916 except RPCFail, err:
917 val = str(err)
918 result[constants.NV_INSTANCELIST] = val
919
920
921 def _VerifyNodeInfo(what, vm_capable, result, all_hvparams):
922 """Verifies the node info.
923
924 @type what: C{dict}
925 @param what: a dictionary of things to check
926 @type vm_capable: boolean
927 @param vm_capable: whether or not this node is vm capable
928 @type result: dict
929 @param result: dictionary of verification results; results of the
930 verifications in this function will be added here
931 @type all_hvparams: dict of dict of string
932 @param all_hvparams: dictionary mapping hypervisor names to hvparams
933
934 """
935 if constants.NV_HVINFO in what and vm_capable:
936 hvname = what[constants.NV_HVINFO]
937 hyper = hypervisor.GetHypervisor(hvname)
938 hvparams = all_hvparams[hvname]
939 result[constants.NV_HVINFO] = hyper.GetNodeInfo(hvparams=hvparams)
940
941
942 def _VerifyClientCertificate(cert_file=pathutils.NODED_CLIENT_CERT_FILE):
943 """Verify the existance and validity of the client SSL certificate.
944
945 Also, verify that the client certificate is not self-signed. Self-
946 signed client certificates stem from Ganeti versions 2.12.0 - 2.12.4
947 and should be replaced by client certificates signed by the server
948 certificate. Hence we output a warning when we encounter a self-signed
949 one.
950
951 """
952 create_cert_cmd = "gnt-cluster renew-crypto --new-node-certificates"
953 if not os.path.exists(cert_file):
954 return (constants.CV_ERROR,
955 "The client certificate does not exist. Run '%s' to create"
956 " client certificates for all nodes." % create_cert_cmd)
957
958 (errcode, msg) = utils.VerifyCertificate(cert_file)
959 if errcode is not None:
960 return (errcode, msg)
961
962 (errcode, msg) = utils.IsCertificateSelfSigned(cert_file)
963 if errcode is not None:
964 return (errcode, msg)
965
966 # if everything is fine, we return the digest to be compared to the config
967 return (None, utils.GetCertificateDigest(cert_filename=cert_file))
968
969
970 def _VerifySshSetup(node_status_list, my_name, ssh_key_type,
971 ganeti_pub_keys_file=pathutils.SSH_PUB_KEYS):
972 """Verifies the state of the SSH key files.
973
974 @type node_status_list: list of tuples
975 @param node_status_list: list of nodes of the cluster associated with a
976 couple of flags: (uuid, name, is_master_candidate,
977 is_potential_master_candidate, online)
978 @type my_name: str
979 @param my_name: name of this node
980 @type ssh_key_type: one of L{constants.SSHK_ALL}
981 @param ssh_key_type: type of key used on nodes
982 @type ganeti_pub_keys_file: str
983 @param ganeti_pub_keys_file: filename of the public keys file
984
985 """
986 if node_status_list is None:
987 return ["No node list to check against the pub_key_file received."]
988
989 my_status_list = [(my_uuid, name, mc, pot_mc, online) for
990 (my_uuid, name, mc, pot_mc, online)
991 in node_status_list if name == my_name]
992 if len(my_status_list) == 0:
993 return ["Cannot find node information for node '%s'." % my_name]
994 (my_uuid, _, _, potential_master_candidate, online) = \
995 my_status_list[0]
996
997 result = []
998
999 if not os.path.exists(ganeti_pub_keys_file):
1000 result.append("The public key file '%s' does not exist. Consider running"
1001 " 'gnt-cluster renew-crypto --new-ssh-keys"
1002 " [--no-ssh-key-check]' to fix this." % ganeti_pub_keys_file)
1003 return result
1004
1005 pot_mc_uuids = [uuid for (uuid, _, _, _, _) in node_status_list]
1006 offline_nodes = [uuid for (uuid, _, _, _, online) in node_status_list
1007 if not online]
1008 pub_keys = ssh.QueryPubKeyFile(None, key_file=ganeti_pub_keys_file)
1009
1010 if potential_master_candidate:
1011 # Check that the set of potential master candidates matches the
1012 # public key file
1013 pub_uuids_set = set(pub_keys.keys()) - set(offline_nodes)
1014 pot_mc_uuids_set = set(pot_mc_uuids) - set(offline_nodes)
1015 missing_uuids = set([])
1016 if pub_uuids_set != pot_mc_uuids_set:
1017 unknown_uuids = pub_uuids_set - pot_mc_uuids_set
1018 if unknown_uuids:
1019 result.append("The following node UUIDs are listed in the public key"
1020 " file on node '%s', but are not potential master"
1021 " candidates: %s."
1022 % (my_name, ", ".join(list(unknown_uuids))))
1023 missing_uuids = pot_mc_uuids_set - pub_uuids_set
1024 if missing_uuids:
1025 result.append("The following node UUIDs of potential master candidates"
1026 " are missing in the public key file on node %s: %s."
1027 % (my_name, ", ".join(list(missing_uuids))))
1028
1029 (_, key_files) = \
1030 ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
1031 (_, node_pub_key_file) = key_files[ssh_key_type]
1032
1033 my_keys = pub_keys[my_uuid]
1034
1035 node_pub_key = utils.ReadFile(node_pub_key_file)
1036 if node_pub_key.strip() not in my_keys:
1037 result.append("The dsa key of node %s does not match this node's key"
1038 " in the pub key file." % my_name)
1039 if len(my_keys) != 1:
1040 result.append("There is more than one key for node %s in the public key"
1041 " file." % my_name)
1042 else:
1043 if len(pub_keys.keys()) > 0:
1044 result.append("The public key file of node '%s' is not empty, although"
1045 " the node is not a potential master candidate."
1046 % my_name)
1047
1048 # Check that all master candidate keys are in the authorized_keys file
1049 (auth_key_file, _) = \
1050 ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
1051 for (uuid, name, mc, _, online) in node_status_list:
1052 if not online:
1053 continue
1054 if uuid in missing_uuids:
1055 continue
1056 if mc:
1057 for key in pub_keys[uuid]:
1058 if not ssh.HasAuthorizedKey(auth_key_file, key):
1059 result.append("A SSH key of master candidate '%s' (UUID: '%s') is"
1060 " not in the 'authorized_keys' file of node '%s'."
1061 % (name, uuid, my_name))
1062 else:
1063 for key in pub_keys[uuid]:
1064 if name != my_name and ssh.HasAuthorizedKey(auth_key_file, key):
1065 result.append("A SSH key of normal node '%s' (UUID: '%s') is in the"
1066 " 'authorized_keys' file of node '%s'."
1067 % (name, uuid, my_name))
1068 if name == my_name and not ssh.HasAuthorizedKey(auth_key_file, key):
1069 result.append("A SSH key of normal node '%s' (UUID: '%s') is not"
1070 " in the 'authorized_keys' file of itself."
1071 % (my_name, uuid))
1072
1073 return result
1074
1075
1076 def _VerifySshClutter(node_status_list, my_name):
1077 """Verifies that the 'authorized_keys' files are not cluttered up.
1078
1079 @type node_status_list: list of tuples
1080 @param node_status_list: list of nodes of the cluster associated with a
1081 couple of flags: (uuid, name, is_master_candidate,
1082 is_potential_master_candidate, online)
1083 @type my_name: str
1084 @param my_name: name of this node
1085
1086 """
1087 result = []
1088 (auth_key_file, _) = \
1089 ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
1090 node_names = [name for (_, name, _, _) in node_status_list]
1091 multiple_occurrences = ssh.CheckForMultipleKeys(auth_key_file, node_names)
1092 if multiple_occurrences:
1093 msg = "There are hosts which have more than one SSH key stored for the" \
1094 " same user in the 'authorized_keys' file of node %s. This can be" \
1095 " due to an unsuccessful operation which cluttered up the" \
1096 " 'authorized_keys' file. We recommend to clean this up manually. " \
1097 % my_name
1098 for host, occ in multiple_occurrences.items():
1099 msg += "Entry for '%s' in lines %s. " % (host, utils.CommaJoin(occ))
1100 result.append(msg)
1101
1102 return result
1103
1104
1105 def VerifyNode(what, cluster_name, all_hvparams):
1106 """Verify the status of the local node.
1107
1108 Based on the input L{what} parameter, various checks are done on the
1109 local node.
1110
1111 If the I{filelist} key is present, this list of
1112 files is checksummed and the file/checksum pairs are returned.
1113
1114 If the I{nodelist} key is present, we check that we have
1115 connectivity via ssh with the target nodes (and check the hostname
1116 report).
1117
1118 If the I{node-net-test} key is present, we check that we have
1119 connectivity to the given nodes via both primary IP and, if
1120 applicable, secondary IPs.
1121
1122 @type what: C{dict}
1123 @param what: a dictionary of things to check:
1124 - filelist: list of files for which to compute checksums
1125 - nodelist: list of nodes we should check ssh communication with
1126 - node-net-test: list of nodes we should check node daemon port
1127 connectivity with
1128 - hypervisor: list with hypervisors to run the verify for
1129 @type cluster_name: string
1130 @param cluster_name: the cluster's name
1131 @type all_hvparams: dict of dict of strings
1132 @param all_hvparams: a dictionary mapping hypervisor names to hvparams
1133 @rtype: dict
1134 @return: a dictionary with the same keys as the input dict, and
1135 values representing the result of the checks
1136
1137 """
1138 result = {}
1139 my_name = netutils.Hostname.GetSysName()
1140 port = netutils.GetDaemonPort(constants.NODED)
1141 vm_capable = my_name not in what.get(constants.NV_NONVMNODES, [])
1142
1143 _VerifyHypervisors(what, vm_capable, result, all_hvparams)
1144 _VerifyHvparams(what, vm_capable, result)
1145
1146 if constants.NV_FILELIST in what:
1147 fingerprints = utils.FingerprintFiles(map(vcluster.LocalizeVirtualPath,
1148 what[constants.NV_FILELIST]))
1149 result[constants.NV_FILELIST] = \
1150 dict((vcluster.MakeVirtualPath(key), value)
1151 for (key, value) in fingerprints.items())
1152
1153 if constants.NV_CLIENT_CERT in what:
1154 result[constants.NV_CLIENT_CERT] = _VerifyClientCertificate()
1155
1156 if constants.NV_SSH_SETUP in what:
1157 node_status_list, key_type = what[constants.NV_SSH_SETUP]
1158 result[constants.NV_SSH_SETUP] = \
1159 _VerifySshSetup(node_status_list, my_name, key_type)
1160 if constants.NV_SSH_CLUTTER in what:
1161 result[constants.NV_SSH_CLUTTER] = \
1162 _VerifySshClutter(what[constants.NV_SSH_SETUP], my_name)
1163
1164 if constants.NV_NODELIST in what:
1165 (nodes, bynode, mcs) = what[constants.NV_NODELIST]
1166
1167 # Add nodes from other groups (different for each node)
1168 try:
1169 nodes.extend(bynode[my_name])
1170 except KeyError:
1171 pass
1172
1173 # Use a random order
1174 random.shuffle(nodes)
1175
1176 # Try to contact all nodes
1177 val = {}
1178 ssh_port_map = ssconf.SimpleStore().GetSshPortMap()
1179 for node in nodes:
1180 # We only test if master candidates can communicate to other nodes.
1181 # We cannot test if normal nodes cannot communicate with other nodes,
1182 # because the administrator might have installed additional SSH keys,
1183 # over which Ganeti has no power.
1184 if my_name in mcs:
1185 success, message = _GetSshRunner(cluster_name). \
1186 VerifyNodeHostname(node, ssh_port_map[node])
1187 if not success:
1188 val[node] = message
1189
1190 result[constants.NV_NODELIST] = val
1191
1192 if constants.NV_NODENETTEST in what:
1193 result[constants.NV_NODENETTEST] = tmp = {}
1194 my_pip = my_sip = None
1195 for name, pip, sip in what[constants.NV_NODENETTEST]:
1196 if name == my_name:
1197 my_pip = pip
1198 my_sip = sip
1199 break
1200 if not my_pip:
1201 tmp[my_name] = ("Can't find my own primary/secondary IP"
1202 " in the node list")
1203 else:
1204 for name, pip, sip in what[constants.NV_NODENETTEST]:
1205 fail = []
1206 if not netutils.TcpPing(pip, port, source=my_pip):
1207 fail.append("primary")
1208 if sip != pip:
1209 if not netutils.TcpPing(sip, port, source=my_sip):
1210 fail.append("secondary")
1211 if fail:
1212 tmp[name] = ("failure using the %s interface(s)" %
1213 " and ".join(fail))
1214
1215 if constants.NV_MASTERIP in what:
1216 # FIXME: add checks on incoming data structures (here and in the
1217 # rest of the function)
1218 master_name, master_ip = what[constants.NV_MASTERIP]
1219 if master_name == my_name:
1220 source = constants.IP4_ADDRESS_LOCALHOST
1221 else:
1222 source = None
1223 result[constants.NV_MASTERIP] = netutils.TcpPing(master_ip, port,
1224 source=source)
1225
1226 if constants.NV_USERSCRIPTS in what:
1227 result[constants.NV_USERSCRIPTS] = \
1228 [script for script in what[constants.NV_USERSCRIPTS]
1229 if not utils.IsExecutable(script)]
1230
1231 if constants.NV_OOB_PATHS in what:
1232 result[constants.NV_OOB_PATHS] = tmp = []
1233 for path in what[constants.NV_OOB_PATHS]:
1234 try:
1235 st = os.stat(path)
1236 except OSError, err:
1237 tmp.append("error stating out of band helper: %s" % err)
1238 else:
1239 if stat.S_ISREG(st.st_mode):
1240 if stat.S_IMODE(st.st_mode) & stat.S_IXUSR:
1241 tmp.append(None)
1242 else:
1243 tmp.append("out of band helper %s is not executable" % path)
1244 else:
1245 tmp.append("out of band helper %s is not a file" % path)
1246
1247 if constants.NV_LVLIST in what and vm_capable:
1248 try:
1249 val = GetVolumeList(utils.ListVolumeGroups().keys())
1250 except RPCFail, err:
1251 val = str(err)
1252 result[constants.NV_LVLIST] = val
1253
1254 _VerifyInstanceList(what, vm_capable, result, all_hvparams)
1255
1256 if constants.NV_VGLIST in what and vm_capable:
1257 result[constants.NV_VGLIST] = utils.ListVolumeGroups()
1258
1259 if constants.NV_PVLIST in what and vm_capable:
1260 check_exclusive_pvs = constants.NV_EXCLUSIVEPVS in what
1261 val = bdev.LogicalVolume.GetPVInfo(what[constants.NV_PVLIST],
1262 filter_allocatable=False,
1263 include_lvs=check_exclusive_pvs)
1264 if check_exclusive_pvs:
1265 result[constants.NV_EXCLUSIVEPVS] = _CheckExclusivePvs(val)
1266 for pvi in val:
1267 # Avoid sending useless data on the wire
1268 pvi.lv_list = []
1269 result[constants.NV_PVLIST] = map(objects.LvmPvInfo.ToDict, val)
1270
1271 if constants.NV_VERSION in what:
1272 result[constants.NV_VERSION] = (constants.PROTOCOL_VERSION,
1273 constants.RELEASE_VERSION)
1274
1275 _VerifyNodeInfo(what, vm_capable, result, all_hvparams)
1276
1277 if constants.NV_DRBDVERSION in what and vm_capable:
1278 try:
1279 drbd_version = DRBD8.GetProcInfo().GetVersionString()
1280 except errors.BlockDeviceError, err:
1281 logging.warning("Can't get DRBD version", exc_info=True)
1282 drbd_version = str(err)
1283 result[constants.NV_DRBDVERSION] = drbd_version
1284
1285 if constants.NV_DRBDLIST in what and vm_capable:
1286 try:
1287 used_minors = drbd.DRBD8.GetUsedDevs()
1288 except errors.BlockDeviceError, err:
1289 logging.warning("Can't get used minors list", exc_info=True)
1290 used_minors = str(err)
1291 result[constants.NV_DRBDLIST] = used_minors
1292
1293 if constants.NV_DRBDHELPER in what and vm_capable:
1294 status = True
1295 try:
1296 payload = drbd.DRBD8.GetUsermodeHelper()
1297 except errors.BlockDeviceError, err:
1298 logging.error("Can't get DRBD usermode helper: %s", str(err))
1299 status = False
1300 payload = str(err)
1301 result[constants.NV_DRBDHELPER] = (status, payload)
1302
1303 if constants.NV_NODESETUP in what:
1304 result[constants.NV_NODESETUP] = tmpr = []
1305 if not os.path.isdir("/sys/block") or not os.path.isdir("/sys/class/net"):
1306 tmpr.append("The sysfs filesytem doesn't seem to be mounted"
1307 " under /sys, missing required directories /sys/block"
1308 " and /sys/class/net")
1309 if (not os.path.isdir("/proc/sys") or
1310 not os.path.isfile("/proc/sysrq-trigger")):
1311 tmpr.append("The procfs filesystem doesn't seem to be mounted"
1312 " under /proc, missing required directory /proc/sys and"
1313 " the file /proc/sysrq-trigger")
1314
1315 if constants.NV_TIME in what:
1316 result[constants.NV_TIME] = utils.SplitTime(time.time())
1317
1318 if constants.NV_OSLIST in what and vm_capable:
1319 result[constants.NV_OSLIST] = DiagnoseOS()
1320
1321 if constants.NV_BRIDGES in what and vm_capable:
1322 result[constants.NV_BRIDGES] = [bridge
1323 for bridge in what[constants.NV_BRIDGES]
1324 if not utils.BridgeExists(bridge)]
1325
1326 if what.get(constants.NV_ACCEPTED_STORAGE_PATHS) == my_name:
1327 result[constants.NV_ACCEPTED_STORAGE_PATHS] = \
1328 filestorage.ComputeWrongFileStoragePaths()
1329
1330 if what.get(constants.NV_FILE_STORAGE_PATH):
1331 pathresult = filestorage.CheckFileStoragePath(
1332 what[constants.NV_FILE_STORAGE_PATH])
1333 if pathresult:
1334 result[constants.NV_FILE_STORAGE_PATH] = pathresult
1335
1336 if what.get(constants.NV_SHARED_FILE_STORAGE_PATH):
1337 pathresult = filestorage.CheckFileStoragePath(
1338 what[constants.NV_SHARED_FILE_STORAGE_PATH])
1339 if pathresult:
1340 result[constants.NV_SHARED_FILE_STORAGE_PATH] = pathresult
1341
1342 return result
1343
1344
1345 def GetCryptoTokens(token_requests):
1346 """Perform actions on the node's cryptographic tokens.
1347
1348 Token types can be 'ssl' or 'ssh'. So far only some actions are implemented
1349 for 'ssl'. Action 'get' returns the digest of the public client ssl
1350 certificate. Action 'create' creates a new client certificate and private key
1351 and also returns the digest of the certificate. The third parameter of a
1352 token request are optional parameters for the actions, so far only the
1353 filename is supported.
1354
1355 @type token_requests: list of tuples of (string, string, dict), where the
1356 first string is in constants.CRYPTO_TYPES, the second in
1357 constants.CRYPTO_ACTIONS. The third parameter is a dictionary of string
1358 to string.
1359 @param token_requests: list of requests of cryptographic tokens and actions
1360 to perform on them. The actions come with a dictionary of options.
1361 @rtype: list of tuples (string, string)
1362 @return: list of tuples of the token type and the public crypto token
1363
1364 """
1365 tokens = []
1366 for (token_type, action, _) in token_requests:
1367 if token_type not in constants.CRYPTO_TYPES:
1368 raise errors.ProgrammerError("Token type '%s' not supported." %
1369 token_type)
1370 if action not in constants.CRYPTO_ACTIONS:
1371 raise errors.ProgrammerError("Action '%s' is not supported." %
1372 action)
1373 if token_type == constants.CRYPTO_TYPE_SSL_DIGEST:
1374 tokens.append((token_type,
1375 utils.GetCertificateDigest()))
1376 return tokens
1377
1378
1379 def EnsureDaemon(daemon_name, run):
1380 """Ensures the given daemon is running or stopped.
1381
1382 @type daemon_name: string
1383 @param daemon_name: name of the daemon (e.g., constants.KVMD)
1384
1385 @type run: bool
1386 @param run: whether to start or stop the daemon
1387
1388 @rtype: bool
1389 @return: 'True' if daemon successfully started/stopped,
1390 'False' otherwise
1391
1392 """
1393 allowed_daemons = [constants.KVMD]
1394
1395 if daemon_name not in allowed_daemons:
1396 fn = lambda _: False
1397 elif run:
1398 fn = utils.EnsureDaemon
1399 else:
1400 fn = utils.StopDaemon
1401
1402 return fn(daemon_name)
1403
1404
1405 def _InitSshUpdateData(data, noded_cert_file, ssconf_store):
1406 (_, noded_cert) = \
1407 utils.ExtractX509Certificate(utils.ReadFile(noded_cert_file))
1408 data[constants.SSHS_NODE_DAEMON_CERTIFICATE] = noded_cert
1409
1410 cluster_name = ssconf_store.GetClusterName()
1411 data[constants.SSHS_CLUSTER_NAME] = cluster_name
1412
1413
1414 def AddNodeSshKey(node_uuid, node_name,
1415 potential_master_candidates,
1416 to_authorized_keys=False,
1417 to_public_keys=False,
1418 get_public_keys=False,
1419 pub_key_file=pathutils.SSH_PUB_KEYS,
1420 ssconf_store=None,
1421 noded_cert_file=pathutils.NODED_CERT_FILE,
1422 run_cmd_fn=ssh.RunSshCmdWithStdin,
1423 ssh_update_debug=False,
1424 ssh_update_verbose=False):
1425 """Distributes a node's public SSH key across the cluster.
1426
1427 Note that this function should only be executed on the master node, which
1428 then will copy the new node's key to all nodes in the cluster via SSH.
1429
1430 Also note: at least one of the flags C{to_authorized_keys},
1431 C{to_public_keys}, and C{get_public_keys} has to be set to C{True} for
1432 the function to actually perform any actions.
1433
1434 @type node_uuid: str
1435 @param node_uuid: the UUID of the node whose key is added
1436 @type node_name: str
1437 @param node_name: the name of the node whose key is added
1438 @type potential_master_candidates: list of str
1439 @param potential_master_candidates: list of node names of potential master
1440 candidates; this should match the list of uuids in the public key file
1441 @type to_authorized_keys: boolean
1442 @param to_authorized_keys: whether the key should be added to the
1443 C{authorized_keys} file of all nodes
1444 @type to_public_keys: boolean
1445 @param to_public_keys: whether the keys should be added to the public key file
1446 @type get_public_keys: boolean
1447 @param get_public_keys: whether the node should add the clusters' public keys
1448 to its {ganeti_pub_keys} file
1449
1450 """
1451 node_list = [SshAddNodeInfo(name=node_name, uuid=node_uuid,
1452 to_authorized_keys=to_authorized_keys,
1453 to_public_keys=to_public_keys,
1454 get_public_keys=get_public_keys)]
1455 return AddNodeSshKeyBulk(node_list,
1456 potential_master_candidates,
1457 pub_key_file=pub_key_file,
1458 ssconf_store=ssconf_store,
1459 noded_cert_file=noded_cert_file,
1460 run_cmd_fn=run_cmd_fn,
1461 ssh_update_debug=ssh_update_debug,
1462 ssh_update_verbose=ssh_update_verbose)
1463
1464
1465 # Node info named tuple specifically for the use with AddNodeSshKeyBulk
1466 SshAddNodeInfo = collections.namedtuple(
1467 "SshAddNodeInfo",
1468 ["uuid",
1469 "name",
1470 "to_authorized_keys",
1471 "to_public_keys",
1472 "get_public_keys"])
1473
1474
1475 def AddNodeSshKeyBulk(node_list,
1476 potential_master_candidates,
1477 pub_key_file=pathutils.SSH_PUB_KEYS,
1478 ssconf_store=None,
1479 noded_cert_file=pathutils.NODED_CERT_FILE,
1480 run_cmd_fn=ssh.RunSshCmdWithStdin,
1481 ssh_update_debug=False,
1482 ssh_update_verbose=False):
1483 """Distributes a node's public SSH key across the cluster.
1484
1485 Note that this function should only be executed on the master node, which
1486 then will copy the new node's key to all nodes in the cluster via SSH.
1487
1488 Also note: at least one of the flags C{to_authorized_keys},
1489 C{to_public_keys}, and C{get_public_keys} has to be set to C{True} for
1490 the function to actually perform any actions.
1491
1492 @type node_list: list of SshAddNodeInfo tuples
1493 @param node_list: list of tuples containing the necessary node information for
1494 adding their keys
1495 @type potential_master_candidates: list of str
1496 @param potential_master_candidates: list of node names of potential master
1497 candidates; this should match the list of uuids in the public key file
1498
1499 """
1500 # whether there are any keys to be added or retrieved at all
1501 to_authorized_keys = any([node_info.to_authorized_keys for node_info in
1502 node_list])
1503 to_public_keys = any([node_info.to_public_keys for node_info in
1504 node_list])
1505 get_public_keys = any([node_info.get_public_keys for node_info in
1506 node_list])
1507
1508 # assure that at least one of those flags is true, as the function would
1509 # not do anything otherwise
1510 assert (to_authorized_keys or to_public_keys or get_public_keys)
1511
1512 if not ssconf_store:
1513 ssconf_store = ssconf.SimpleStore()
1514
1515 for node_info in node_list:
1516 # replacement not necessary for keys that are not supposed to be in the
1517 # list of public keys
1518 if not node_info.to_public_keys:
1519 continue
1520 # Check and fix sanity of key file
1521 keys_by_name = ssh.QueryPubKeyFile([node_info.name], key_file=pub_key_file)
1522 keys_by_uuid = ssh.QueryPubKeyFile([node_info.uuid], key_file=pub_key_file)
1523
1524 if (not keys_by_name or node_info.name not in keys_by_name) \
1525 and (not keys_by_uuid or node_info.uuid not in keys_by_uuid):
1526 raise errors.SshUpdateError(
1527 "No keys found for the new node '%s' (UUID %s) in the list of public"
1528 " SSH keys, neither for the name or the UUID" %
1529 (node_info.name, node_info.uuid))
1530 else:
1531 if node_info.name in keys_by_name:
1532 # Replace the name by UUID in the file as the name should only be used
1533 # temporarily
1534 ssh.ReplaceNameByUuid(node_info.uuid, node_info.name,
1535 error_fn=errors.SshUpdateError,
1536 key_file=pub_key_file)
1537
1538 # Retrieve updated map of UUIDs to keys
1539 keys_by_uuid = ssh.QueryPubKeyFile(
1540 [node_info.uuid for node_info in node_list], key_file=pub_key_file)
1541
1542 # Update the master node's key files
1543 (auth_key_file, _) = \
1544 ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
1545 for node_info in node_list:
1546 if node_info.to_authorized_keys:
1547 ssh.AddAuthorizedKeys(auth_key_file, keys_by_uuid[node_info.uuid])
1548
1549 base_data = {}
1550 _InitSshUpdateData(base_data, noded_cert_file, ssconf_store)
1551 cluster_name = base_data[constants.SSHS_CLUSTER_NAME]
1552
1553 ssh_port_map = ssconf_store.GetSshPortMap()
1554
1555 # Update the target nodes themselves
1556 for node_info in node_list:
1557 logging.debug("Updating SSH key files of target node '%s'.", node_info.name)
1558 if node_info.get_public_keys:
1559 node_data = {}
1560 _InitSshUpdateData(node_data, noded_cert_file, ssconf_store)
1561 all_keys = ssh.QueryPubKeyFile(None, key_file=pub_key_file)
1562 node_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1563 (constants.SSHS_OVERRIDE, all_keys)
1564
1565 try:
1566 utils.RetryByNumberOfTimes(
1567 constants.SSHS_MAX_RETRIES,
1568 errors.SshUpdateError,
1569 run_cmd_fn, cluster_name, node_info.name, pathutils.SSH_UPDATE,
1570 ssh_port_map.get(node_info.name), node_data,
1571 debug=ssh_update_debug, verbose=ssh_update_verbose,
1572 use_cluster_key=False, ask_key=False, strict_host_check=False)
1573 except errors.SshUpdateError as e:
1574 # Clean up the master's public key file if adding key fails
1575 if node_info.to_public_keys:
1576 ssh.RemovePublicKey(node_info.uuid)
1577 raise e
1578
1579 # Update all nodes except master and the target nodes
1580 keys_by_uuid_auth = ssh.QueryPubKeyFile(
1581 [node_info.uuid for node_info in node_list
1582 if node_info.to_authorized_keys],
1583 key_file=pub_key_file)
1584 if to_authorized_keys:
1585 base_data[constants.SSHS_SSH_AUTHORIZED_KEYS] = \
1586 (constants.SSHS_ADD, keys_by_uuid_auth)
1587
1588 pot_mc_data = base_data.copy()
1589 keys_by_uuid_pub = ssh.QueryPubKeyFile(
1590 [node_info.uuid for node_info in node_list
1591 if node_info.to_public_keys],
1592 key_file=pub_key_file)
1593 if to_public_keys:
1594 pot_mc_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1595 (constants.SSHS_REPLACE_OR_ADD, keys_by_uuid_pub)
1596
1597 all_nodes = ssconf_store.GetNodeList()
1598 master_node = ssconf_store.GetMasterNode()
1599 online_nodes = ssconf_store.GetOnlineNodeList()
1600
1601 node_errors = []
1602 for node in all_nodes:
1603 if node == master_node:
1604 logging.debug("Skipping master node '%s'.", master_node)
1605 continue
1606 if node not in online_nodes:
1607 logging.debug("Skipping offline node '%s'.", node)
1608 continue
1609 if node in potential_master_candidates:
1610 logging.debug("Updating SSH key files of node '%s'.", node)
1611 try:
1612 utils.RetryByNumberOfTimes(
1613 constants.SSHS_MAX_RETRIES,
1614 errors.SshUpdateError,
1615 run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
1616 ssh_port_map.get(node), pot_mc_data,
1617 debug=ssh_update_debug, verbose=ssh_update_verbose,
1618 use_cluster_key=False, ask_key=False, strict_host_check=False)
1619 except errors.SshUpdateError as last_exception:
1620 error_msg = ("When adding the key of node '%s', updating SSH key"
1621 " files of node '%s' failed after %s retries."
1622 " Not trying again. Last error was: %s." %
1623 (node, node_info.name, constants.SSHS_MAX_RETRIES,
1624 last_exception))
1625 node_errors.append((node, error_msg))
1626 # We only log the error and don't throw an exception, because
1627 # one unreachable node shall not abort the entire procedure.
1628 logging.error(error_msg)
1629
1630 else:
1631 if to_authorized_keys:
1632 run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
1633 ssh_port_map.get(node), base_data,
1634 debug=ssh_update_debug, verbose=ssh_update_verbose,
1635 use_cluster_key=False, ask_key=False,
1636 strict_host_check=False)
1637
1638 return node_errors
1639
1640
1641 # TODO: will be fixed with pending patch series.
1642 # pylint: disable=R0913
1643 def RemoveNodeSshKey(node_uuid, node_name,
1644 master_candidate_uuids,
1645 potential_master_candidates,
1646 master_uuid=None,
1647 keys_to_remove=None,
1648 from_authorized_keys=False,
1649 from_public_keys=False,
1650 clear_authorized_keys=False,
1651 clear_public_keys=False,
1652 pub_key_file=pathutils.SSH_PUB_KEYS,
1653 ssconf_store=None,
1654 noded_cert_file=pathutils.NODED_CERT_FILE,
1655 readd=False,
1656 run_cmd_fn=ssh.RunSshCmdWithStdin,
1657 ssh_update_debug=False,
1658 ssh_update_verbose=False):
1659 """Removes the node's SSH keys from the key files and distributes those.
1660
1661 Note that at least one of the flags C{from_authorized_keys},
1662 C{from_public_keys}, C{clear_authorized_keys}, and C{clear_public_keys}
1663 has to be set to C{True} for the function to perform any action at all.
1664 Not doing so will trigger an assertion in the function.
1665
1666 @type node_uuid: str
1667 @param node_uuid: UUID of the node whose key is removed
1668 @type node_name: str
1669 @param node_name: name of the node whose key is remove
1670 @type master_candidate_uuids: list of str
1671 @param master_candidate_uuids: list of UUIDs of the current master candidates
1672 @type potential_master_candidates: list of str
1673 @param potential_master_candidates: list of names of potential master
1674 candidates
1675 @type keys_to_remove: dict of str to list of str
1676 @param keys_to_remove: a dictionary mapping node UUIDS to lists of SSH keys
1677 to be removed. This list is supposed to be used only if the keys are not
1678 in the public keys file. This is for example the case when removing a
1679 master node's key.
1680 @type from_authorized_keys: boolean
1681 @param from_authorized_keys: whether or not the key should be removed
1682 from the C{authorized_keys} file
1683 @type from_public_keys: boolean
1684 @param from_public_keys: whether or not the key should be remove from
1685 the C{ganeti_pub_keys} file
1686 @type clear_authorized_keys: boolean
1687 @param clear_authorized_keys: whether or not the C{authorized_keys} file
1688 should be cleared on the node whose keys are removed
1689 @type clear_public_keys: boolean
1690 @param clear_public_keys: whether to clear the node's C{ganeti_pub_key} file
1691 @type readd: boolean
1692 @param readd: whether this is called during a readd operation.
1693 @rtype: list of string
1694 @returns: list of feedback messages
1695
1696 """
1697 node_list = [SshRemoveNodeInfo(uuid=node_uuid,
1698 name=node_name,
1699 from_authorized_keys=from_authorized_keys,
1700 from_public_keys=from_public_keys,
1701 clear_authorized_keys=clear_authorized_keys,
1702 clear_public_keys=clear_public_keys)]
1703 return RemoveNodeSshKeyBulk(node_list,
1704 master_candidate_uuids,
1705 potential_master_candidates,
1706 master_uuid=master_uuid,
1707 keys_to_remove=keys_to_remove,
1708 pub_key_file=pub_key_file,
1709 ssconf_store=ssconf_store,
1710 noded_cert_file=noded_cert_file,
1711 readd=readd,
1712 run_cmd_fn=run_cmd_fn,
1713 ssh_update_debug=ssh_update_debug,
1714 ssh_update_verbose=ssh_update_verbose)
1715
1716
1717 # Node info named tuple specifically for the use with RemoveNodeSshKeyBulk
1718 SshRemoveNodeInfo = collections.namedtuple(
1719 "SshRemoveNodeInfo",
1720 ["uuid",
1721 "name",
1722 "from_authorized_keys",
1723 "from_public_keys",
1724 "clear_authorized_keys",
1725 "clear_public_keys"])
1726
1727
1728 def RemoveNodeSshKeyBulk(node_list,
1729 master_candidate_uuids,
1730 potential_master_candidates,
1731 master_uuid=None,
1732 keys_to_remove=None,
1733 pub_key_file=pathutils.SSH_PUB_KEYS,
1734 ssconf_store=None,
1735 noded_cert_file=pathutils.NODED_CERT_FILE,
1736 readd=False,
1737 run_cmd_fn=ssh.RunSshCmdWithStdin,
1738 ssh_update_debug=False,
1739 ssh_update_verbose=False):
1740 """Removes the node's SSH keys from the key files and distributes those.
1741
1742 Note that at least one of the flags C{from_authorized_keys},
1743 C{from_public_keys}, C{clear_authorized_keys}, and C{clear_public_keys}
1744 of at least one node has to be set to C{True} for the function to perform any
1745 action at all. Not doing so will trigger an assertion in the function.
1746
1747 @type node_list: list of C{SshRemoveNodeInfo}.
1748 @param node_list: list of information about nodes whose keys are being removed
1749 @type master_candidate_uuids: list of str
1750 @param master_candidate_uuids: list of UUIDs of the current master candidates
1751 @type potential_master_candidates: list of str
1752 @param potential_master_candidates: list of names of potential master
1753 candidates
1754 @type keys_to_remove: dict of str to list of str
1755 @param keys_to_remove: a dictionary mapping node UUIDS to lists of SSH keys
1756 to be removed. This list is supposed to be used only if the keys are not
1757 in the public keys file. This is for example the case when removing a
1758 master node's key.
1759 @type readd: boolean
1760 @param readd: whether this is called during a readd operation.
1761 @rtype: list of string
1762 @returns: list of feedback messages
1763
1764 """
1765 # Non-disruptive error messages, list of (node, msg) pairs
1766 result_msgs = []
1767
1768 # whether there are any keys to be added or retrieved at all
1769 from_authorized_keys = any([node_info.from_authorized_keys for node_info in
1770 node_list])
1771 from_public_keys = any([node_info.from_public_keys for node_info in
1772 node_list])
1773 clear_authorized_keys = any([node_info.clear_authorized_keys for node_info in
1774 node_list])
1775 clear_public_keys = any([node_info.clear_public_keys for node_info in
1776 node_list])
1777
1778 # Make sure at least one of these flags is true.
1779 if not (from_authorized_keys or from_public_keys or clear_authorized_keys
1780 or clear_public_keys):
1781 raise errors.SshUpdateError("No removal from any key file was requested.")
1782
1783 if not ssconf_store:
1784 ssconf_store = ssconf.SimpleStore()
1785
1786 master_node = ssconf_store.GetMasterNode()
1787 ssh_port_map = ssconf_store.GetSshPortMap()
1788
1789 all_keys_to_remove = {}
1790 if from_authorized_keys or from_public_keys:
1791 for node_info in node_list:
1792 # Skip nodes that don't actually need any keys to be removed.
1793 if not (node_info.from_authorized_keys or node_info.from_public_keys):
1794 continue
1795 if node_info.name == master_node and not keys_to_remove:
1796 raise errors.SshUpdateError("Cannot remove the master node's keys.")
1797 if keys_to_remove:
1798 keys = keys_to_remove
1799 else:
1800 keys = ssh.QueryPubKeyFile([node_info.uuid], key_file=pub_key_file)
1801 if (not keys or node_info.uuid not in keys) and not readd:
1802 raise errors.SshUpdateError("Node '%s' not found in the list of"
1803 " public SSH keys. It seems someone"
1804 " tries to remove a key from outside"
1805 " the cluster!" % node_info.uuid)
1806 # During an upgrade all nodes have the master key. In this case we
1807 # should not remove it to avoid accidentally shutting down cluster
1808 # SSH communication
1809 master_keys = None
1810 if master_uuid:
1811 master_keys = ssh.QueryPubKeyFile([master_uuid],
1812 key_file=pub_key_file)
1813 for master_key in master_keys:
1814 if master_key in keys[node_info.uuid]:
1815 keys[node_info.uuid].remove(master_key)
1816
1817 all_keys_to_remove.update(keys)
1818
1819 if all_keys_to_remove:
1820 base_data = {}
1821 _InitSshUpdateData(base_data, noded_cert_file, ssconf_store)
1822 cluster_name = base_data[constants.SSHS_CLUSTER_NAME]
1823
1824 if from_authorized_keys:
1825 # UUIDs of nodes that are supposed to be removed from the
1826 # authorized_keys files.
1827 nodes_remove_from_authorized_keys = [
1828 node_info.uuid for node_info in node_list
1829 if node_info.from_authorized_keys]
1830 keys_to_remove_from_authorized_keys = dict([
1831 (uuid, keys) for (uuid, keys) in all_keys_to_remove.items()
1832 if uuid in nodes_remove_from_authorized_keys])
1833 base_data[constants.SSHS_SSH_AUTHORIZED_KEYS] = \
1834 (constants.SSHS_REMOVE, keys_to_remove_from_authorized_keys)
1835 (auth_key_file, _) = \
1836 ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False,
1837 dircheck=False)
1838
1839 for uuid in nodes_remove_from_authorized_keys:
1840 ssh.RemoveAuthorizedKeys(auth_key_file,
1841 keys_to_remove_from_authorized_keys[uuid])
1842
1843 pot_mc_data = base_data.copy()
1844
1845 if from_public_keys:
1846 nodes_remove_from_public_keys = [
1847 node_info.uuid for node_info in node_list
1848 if node_info.from_public_keys]
1849 keys_to_remove_from_public_keys = dict([
1850 (uuid, keys) for (uuid, keys) in all_keys_to_remove.items()
1851 if uuid in nodes_remove_from_public_keys])
1852 pot_mc_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1853 (constants.SSHS_REMOVE, keys_to_remove_from_public_keys)
1854
1855 all_nodes = ssconf_store.GetNodeList()
1856 online_nodes = ssconf_store.GetOnlineNodeList()
1857 all_nodes_to_remove = [node_info.name for node_info in node_list]
1858 logging.debug("Removing keys of nodes '%s' from all nodes but itself and"
1859 " master.", ", ".join(all_nodes_to_remove))
1860 for node in all_nodes:
1861 if node == master_node:
1862 logging.debug("Skipping master node '%s'.", master_node)
1863 continue
1864 if node not in online_nodes:
1865 logging.debug("Skipping offline node '%s'.", node)
1866 continue
1867 if node in all_nodes_to_remove:
1868 logging.debug("Skipping node whose key is removed itself '%s'.", node)
1869 continue
1870 ssh_port = ssh_port_map.get(node)
1871 if not ssh_port:
1872 raise errors.OpExecError("No SSH port information available for"
1873 " node '%s', map: %s." %
1874 (node, ssh_port_map))
1875 error_msg_final = ("When removing the key of node '%s', updating the"
1876 " SSH key files of node '%s' failed. Last error"
1877 " was: %s.")
1878 if node in potential_master_candidates:
1879 logging.debug("Updating key setup of potential master candidate node"
1880 " %s.", node)
1881 try:
1882 utils.RetryByNumberOfTimes(
1883 constants.SSHS_MAX_RETRIES,
1884 errors.SshUpdateError,
1885 run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
1886 ssh_port, pot_mc_data,
1887 debug=ssh_update_debug, verbose=ssh_update_verbose,
1888 use_cluster_key=False, ask_key=False, strict_host_check=False)
1889 except errors.SshUpdateError as last_exception:
1890 error_msg = error_msg_final % (
1891 node_info.name, node, last_exception)
1892 result_msgs.append((node, error_msg))
1893 logging.error(error_msg)
1894
1895 else:
1896 if from_authorized_keys:
1897 logging.debug("Updating key setup of normal node %s.", node)
1898 try:
1899 utils.RetryByNumberOfTimes(
1900 constants.SSHS_MAX_RETRIES,
1901 errors.SshUpdateError,
1902 run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
1903 ssh_port, base_data,
1904 debug=ssh_update_debug, verbose=ssh_update_verbose,
1905 use_cluster_key=False, ask_key=False, strict_host_check=False)
1906 except errors.SshUpdateError as last_exception:
1907 error_msg = error_msg_final % (
1908 node_info.name, node, last_exception)
1909 result_msgs.append((node, error_msg))
1910 logging.error(error_msg)
1911
1912 for node_info in node_list:
1913 if node_info.clear_authorized_keys or node_info.from_public_keys or \
1914 node_info.clear_public_keys:
1915 data = {}
1916 _InitSshUpdateData(data, noded_cert_file, ssconf_store)
1917 cluster_name = data[constants.SSHS_CLUSTER_NAME]
1918 ssh_port = ssh_port_map.get(node_info.name)
1919 if not ssh_port:
1920 raise errors.OpExecError("No SSH port information available for"
1921 " node '%s', which is leaving the cluster.")
1922
1923 if node_info.clear_authorized_keys:
1924 # The 'authorized_keys' file is not solely managed by Ganeti. Therefore,
1925 # we have to specify exactly which keys to clear to leave keys untouched
1926 # that were not added by Ganeti.
1927 other_master_candidate_uuids = [uuid for uuid in master_candidate_uuids
1928 if uuid != node_info.uuid]
1929 candidate_keys = ssh.QueryPubKeyFile(other_master_candidate_uuids,
1930 key_file=pub_key_file)
1931 data[constants.SSHS_SSH_AUTHORIZED_KEYS] = \
1932 (constants.SSHS_REMOVE, candidate_keys)
1933
1934 if node_info.clear_public_keys:
1935 data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1936 (constants.SSHS_CLEAR, {})
1937 elif node_info.from_public_keys:
1938 # Since clearing the public keys subsumes removing just a single key,
1939 # we only do it if clear_public_keys is 'False'.
1940
1941 if all_keys_to_remove:
1942 data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1943 (constants.SSHS_REMOVE, all_keys_to_remove)
1944
1945 # If we have no changes to any keyfile, just return
1946 if not (constants.SSHS_SSH_PUBLIC_KEYS in data or
1947 constants.SSHS_SSH_AUTHORIZED_KEYS in data):
1948 return
1949
1950 logging.debug("Updating SSH key setup of target node '%s'.",
1951 node_info.name)
1952 try:
1953 utils.RetryByNumberOfTimes(
1954 constants.SSHS_MAX_RETRIES,
1955 errors.SshUpdateError,
1956 run_cmd_fn, cluster_name, node_info.name, pathutils.SSH_UPDATE,
1957 ssh_port, data,
1958 debug=ssh_update_debug, verbose=ssh_update_verbose,
1959 use_cluster_key=False, ask_key=False, strict_host_check=False)
1960 except errors.SshUpdateError as last_exception:
1961 result_msgs.append(
1962 (node_info.name,
1963 ("Removing SSH keys from node '%s' failed."
1964 " This can happen when the node is already unreachable."
1965 " Error: %s" % (node_info.name, last_exception))))
1966
1967 if all_keys_to_remove and from_public_keys:
1968 for node_uuid in nodes_remove_from_public_keys:
1969 ssh.RemovePublicKey(node_uuid, key_file=pub_key_file)
1970
1971 return result_msgs
1972 # pylint: enable=R0913
1973
1974
1975 def _GenerateNodeSshKey(node_uuid, node_name, ssh_port_map, ssh_key_type,
1976 ssh_key_bits, pub_key_file=pathutils.SSH_PUB_KEYS,
1977 ssconf_store=None,
1978 noded_cert_file=pathutils.NODED_CERT_FILE,
1979 run_cmd_fn=ssh.RunSshCmdWithStdin,
1980 suffix="",
1981 ssh_update_debug=False,
1982 ssh_update_verbose=False):
1983 """Generates the root SSH key pair on the node.
1984
1985 @type node_uuid: str
1986 @param node_uuid: UUID of the node whose key is removed
1987 @type node_name: str
1988 @param node_name: name of the node whose key is remove
1989 @type ssh_port_map: dict of str to int
1990 @param ssh_port_map: mapping of node names to their SSH port
1991 @type ssh_key_type: One of L{constants.SSHK_ALL}
1992 @param ssh_key_type: the type of SSH key to be generated
1993 @type ssh_key_bits: int
1994 @param ssh_key_bits: the length of the key to be generated
1995
1996 """
1997 if not ssconf_store:
1998 ssconf_store = ssconf.SimpleStore()
1999
2000 keys_by_uuid = ssh.QueryPubKeyFile([node_uuid], key_file=pub_key_file)
2001 if not keys_by_uuid or node_uuid not in keys_by_uuid:
2002 raise errors.SshUpdateError("Node %s (UUID: %s) whose key is requested to"
2003 " be regenerated is not registered in the"
2004 " public keys file." % (node_name, node_uuid))
2005
2006 data = {}
2007 _InitSshUpdateData(data, noded_cert_file, ssconf_store)
2008 cluster_name = data[constants.SSHS_CLUSTER_NAME]
2009 data[constants.SSHS_GENERATE] = (ssh_key_type, ssh_key_bits, suffix)
2010
2011 run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE,
2012 ssh_port_map.get(node_name), data,
2013 debug=ssh_update_debug, verbose=ssh_update_verbose,
2014 use_cluster_key=False, ask_key=False, strict_host_check=False)
2015
2016
2017 def _GetMasterNodeUUID(node_uuid_name_map, master_node_name):
2018 master_node_uuids = [node_uuid for (node_uuid, node_name)
2019 in node_uuid_name_map
2020 if node_name == master_node_name]
2021 if len(master_node_uuids) != 1:
2022 raise errors.SshUpdateError("No (unique) master UUID found. Master node"
2023 " name: '%s', Master UUID: '%s'" %
2024 (master_node_name, master_node_uuids))
2025 return master_node_uuids[0]
2026
2027
2028 def _GetOldMasterKeys(master_node_uuid, pub_key_file):
2029 old_master_keys_by_uuid = ssh.QueryPubKeyFile([master_node_uuid],
2030 key_file=pub_key_file)
2031 if not old_master_keys_by_uuid:
2032 raise errors.SshUpdateError("No public key of the master node (UUID '%s')"
2033 " found, not generating a new key."
2034 % master_node_uuid)
2035 return old_master_keys_by_uuid
2036
2037
2038 def _GetNewMasterKey(root_keyfiles, master_node_uuid):
2039 new_master_keys = []
2040 for (_, (_, public_key_file)) in root_keyfiles.items():
2041 public_key_dir = os.path.dirname(public_key_file)
2042 public_key_file_tmp_filename = \
2043 os.path.splitext(os.path.basename(public_key_file))[0] \
2044 + constants.SSHS_MASTER_SUFFIX + ".pub"
2045 public_key_path_tmp = os.path.join(public_key_dir,
2046 public_key_file_tmp_filename)
2047 if os.path.exists(public_key_path_tmp):
2048 # for some key types, there might not be any keys
2049 key = utils.ReadFile(public_key_path_tmp)
2050 new_master_keys.append(key)
2051 if not new_master_keys:
2052 raise errors.SshUpdateError("Cannot find any type of temporary SSH key.")
2053 return {master_node_uuid: new_master_keys}
2054
2055
2056 def _ReplaceMasterKeyOnMaster(root_keyfiles):
2057 number_of_moves = 0
2058 for (_, (private_key_file, public_key_file)) in root_keyfiles.items():
2059 key_dir = os.path.dirname(public_key_file)
2060 private_key_file_tmp = \
2061 os.path.basename(private_key_file) + constants.SSHS_MASTER_SUFFIX
2062 public_key_file_tmp = private_key_file_tmp + ".pub"
2063 private_key_path_tmp = os.path.join(key_dir,
2064 private_key_file_tmp)
2065 public_key_path_tmp = os.path.join(key_dir,
2066 public_key_file_tmp)
2067 if os.path.exists(public_key_file):
2068 utils.CreateBackup(public_key_file)
2069 utils.RemoveFile(public_key_file)
2070 if os.path.exists(private_key_file):
2071 utils.CreateBackup(private_key_file)
2072 utils.RemoveFile(private_key_file)
2073 if os.path.exists(public_key_path_tmp) and \
2074 os.path.exists(private_key_path_tmp):
2075 # for some key types, there might not be any keys
2076 shutil.move(public_key_path_tmp, public_key_file)
2077 shutil.move(private_key_path_tmp, private_key_file)
2078 number_of_moves += 1
2079 if not number_of_moves:
2080 raise errors.SshUpdateError("Could not move at least one master SSH key.")
2081
2082
2083 def RenewSshKeys(node_uuids, node_names, master_candidate_uuids,
2084 potential_master_candidates, old_key_type, new_key_type,
2085 new_key_bits,
2086 ganeti_pub_keys_file=pathutils.SSH_PUB_KEYS,
2087 ssconf_store=None,
2088 noded_cert_file=pathutils.NODED_CERT_FILE,
2089 run_cmd_fn=ssh.RunSshCmdWithStdin,
2090 ssh_update_debug=False,
2091 ssh_update_verbose=False):
2092 """Renews all SSH keys and updates authorized_keys and ganeti_pub_keys.
2093
2094 @type node_uuids: list of str
2095 @param node_uuids: list of node UUIDs whose keys should be renewed
2096 @type node_names: list of str
2097 @param node_names: list of node names whose keys should be removed. This list
2098 should match the C{node_uuids} parameter
2099 @type master_candidate_uuids: list of str
2100 @param master_candidate_uuids: list of UUIDs of master candidates or
2101 master node
2102 @type old_key_type: One of L{constants.SSHK_ALL}
2103 @param old_key_type: the type of SSH key already present on nodes
2104 @type new_key_type: One of L{constants.SSHK_ALL}
2105 @param new_key_type: the type of SSH key to be generated
2106 @type new_key_bits: int
2107 @param new_key_bits: the length of the key to be generated
2108 @type ganeti_pub_keys_file: str
2109 @param ganeti_pub_keys_file: file path of the the public key file
2110 @type noded_cert_file: str
2111 @param noded_cert_file: path of the noded SSL certificate file
2112 @type run_cmd_fn: function
2113 @param run_cmd_fn: function to run commands on remote nodes via SSH
2114 @raises ProgrammerError: if node_uuids and node_names don't match;
2115 SshUpdateError if a node's key is missing from the public key file,
2116 if a node's new SSH key could not be fetched from it, if there is
2117 none or more than one entry in the public key list for the master
2118 node.
2119
2120 """
2121 if not ssconf_store:
2122 ssconf_store = ssconf.SimpleStore()
2123 cluster_name = ssconf_store.GetClusterName()
2124
2125 if not len(node_uuids) == len(node_names):
2126 raise errors.ProgrammerError("List of nodes UUIDs and node names"
2127 " does not match in length.")
2128
2129 (_, root_keyfiles) = \
2130 ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
2131 (_, old_pub_keyfile) = root_keyfiles[old_key_type]
2132 (_, new_pub_keyfile) = root_keyfiles[new_key_type]
2133 old_master_key = utils.ReadFile(old_pub_keyfile)
2134
2135 node_uuid_name_map = zip(node_uuids, node_names)
2136
2137 master_node_name = ssconf_store.GetMasterNode()
2138 master_node_uuid = _GetMasterNodeUUID(node_uuid_name_map, master_node_name)
2139 ssh_port_map = ssconf_store.GetSshPortMap()
2140 # List of all node errors that happened, but which did not abort the
2141 # procedure as a whole. It is important that this is a list to have a
2142 # somewhat chronological history of events.
2143 all_node_errors = []
2144
2145 # process non-master nodes
2146
2147 # keys to add in bulk at the end
2148 node_keys_to_add = []
2149
2150 # list of all nodes
2151 node_list = []
2152
2153 # list of keys to be removed before generating new keys
2154 node_info_to_remove = []
2155
2156 for node_uuid, node_name in node_uuid_name_map:
2157 if node_name == master_node_name:
2158 continue
2159 master_candidate = node_uuid in master_candidate_uuids
2160 potential_master_candidate = node_name in potential_master_candidates
2161 node_list.append((node_uuid, node_name, master_candidate,
2162 potential_master_candidate))
2163
2164 keys_by_uuid = ssh.QueryPubKeyFile([node_uuid],
2165 key_file=ganeti_pub_keys_file)
2166 if not keys_by_uuid:
2167 raise errors.SshUpdateError("No public key of node %s (UUID %s) found,"
2168 " not generating a new key."
2169 % (node_name, node_uuid))
2170
2171 if master_candidate:
2172 logging.debug("Fetching old SSH key from node '%s'.", node_name)
2173 old_pub_key = ssh.ReadRemoteSshPubKeys(old_pub_keyfile,
2174 node_name, cluster_name,
2175 ssh_port_map[node_name],
2176 False, # ask_key
2177 False) # key_check
2178 if old_pub_key != old_master_key:
2179 # If we are already in a multi-key setup (that is past Ganeti 2.12),
2180 # we can safely remove the old key of the node. Otherwise, we cannot
2181 # remove that node's key, because it is also the master node's key
2182 # and that would terminate all communication from the master to the
2183 # node.
2184 node_info_to_remove.append(SshRemoveNodeInfo(
2185 uuid=node_uuid,
2186 name=node_name,
2187 from_authorized_keys=master_candidate,
2188 from_public_keys=False,
2189 clear_authorized_keys=False,
2190 clear_public_keys=False))
2191 else:
2192 logging.debug("Old key of node '%s' is the same as the current master"
2193 " key. Not deleting that key on the node.", node_name)
2194
2195 logging.debug("Removing old SSH keys of all master candidates.")
2196 if node_info_to_remove:
2197 node_errors = RemoveNodeSshKeyBulk(
2198 node_info_to_remove,
2199 master_candidate_uuids,
2200 potential_master_candidates,
2201 master_uuid=master_node_uuid,
2202 ssh_update_debug=ssh_update_debug,
2203 ssh_update_verbose=ssh_update_verbose)
2204 if node_errors:
2205 all_node_errors = all_node_errors + node_errors
2206
2207 for (node_uuid, node_name, master_candidate, potential_master_candidate) \
2208 in node_list:
2209
2210 logging.debug("Generating new SSH key for node '%s'.", node_name)
2211 _GenerateNodeSshKey(node_uuid, node_name, ssh_port_map, new_key_type,
2212 new_key_bits, pub_key_file=ganeti_pub_keys_file,
2213 ssconf_store=ssconf_store,
2214 noded_cert_file=noded_cert_file,
2215 run_cmd_fn=run_cmd_fn,
2216 ssh_update_verbose=ssh_update_verbose,
2217 ssh_update_debug=ssh_update_debug)
2218
2219 try:
2220 logging.debug("Fetching newly created SSH key from node '%s'.", node_name)
2221 pub_key = ssh.ReadRemoteSshPubKeys(new_pub_keyfile,
2222 node_name, cluster_name,
2223 ssh_port_map[node_name],
2224 False, # ask_key
2225 False) # key_check
2226 except:
2227 raise errors.SshUpdateError("Could not fetch key of node %s"
2228 " (UUID %s)" % (node_name, node_uuid))
2229
2230 if potential_master_candidate:
2231 ssh.RemovePublicKey(node_uuid, key_file=ganeti_pub_keys_file)
2232 ssh.AddPublicKey(node_uuid, pub_key, key_file=ganeti_pub_keys_file)
2233
2234 node_info = SshAddNodeInfo(name=node_name,
2235 uuid=node_uuid,
2236 to_authorized_keys=master_candidate,
2237 to_public_keys=potential_master_candidate,
2238 get_public_keys=True)
2239 node_keys_to_add.append(node_info)
2240
2241 node_errors = AddNodeSshKeyBulk(
2242 node_keys_to_add, potential_master_candidates,
2243 pub_key_file=ganeti_pub_keys_file, ssconf_store=ssconf_store,
2244 noded_cert_file=noded_cert_file,
2245 run_cmd_fn=run_cmd_fn,
2246 ssh_update_debug=ssh_update_debug,
2247 ssh_update_verbose=ssh_update_verbose)
2248 if node_errors:
2249 all_node_errors = all_node_errors + node_errors
2250
2251 # Renewing the master node's key
2252
2253 # Preserve the old keys for now
2254 old_master_keys_by_uuid = _GetOldMasterKeys(master_node_uuid,
2255 ganeti_pub_keys_file)
2256
2257 # Generate a new master key with a suffix, don't touch the old one for now
2258 logging.debug("Generate new ssh key of master.")
2259 _GenerateNodeSshKey(master_node_uuid, master_node_name, ssh_port_map,
2260 new_key_type, new_key_bits,
2261 pub_key_file=ganeti_pub_keys_file,
2262 ssconf_store=ssconf_store,
2263 noded_cert_file=noded_cert_file,
2264 run_cmd_fn=run_cmd_fn,
2265 suffix=constants.SSHS_MASTER_SUFFIX,
2266 ssh_update_debug=ssh_update_debug,
2267 ssh_update_verbose=ssh_update_verbose)
2268 # Read newly created master key
2269 new_master_key_dict = _GetNewMasterKey(root_keyfiles, master_node_uuid)
2270
2271 # Replace master key in the master nodes' public key file
2272 ssh.RemovePublicKey(master_node_uuid, key_file=ganeti_pub_keys_file)
2273 for pub_key in new_master_key_dict[master_node_uuid]:
2274 ssh.AddPublicKey(master_node_uuid, pub_key, key_file=ganeti_pub_keys_file)
2275
2276 # Add new master key to all node's public and authorized keys
2277 logging.debug("Add new master key to all nodes.")
2278 node_errors = AddNodeSshKey(
2279 master_node_uuid, master_node_name, potential_master_candidates,
2280 to_authorized_keys=True, to_public_keys=True,
2281 get_public_keys=False, pub_key_file=ganeti_pub_keys_file,
2282 ssconf_store=ssconf_store, noded_cert_file=noded_cert_file,
2283 run_cmd_fn=run_cmd_fn,
2284 ssh_update_debug=ssh_update_debug,
2285 ssh_update_verbose=ssh_update_verbose)
2286 if node_errors:
2287 all_node_errors = all_node_errors + node_errors
2288
2289 # Remove the old key file and rename the new key to the non-temporary filename
2290 _ReplaceMasterKeyOnMaster(root_keyfiles)
2291
2292 # Remove old key from authorized keys
2293 (auth_key_file, _) = \
2294 ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
2295 ssh.RemoveAuthorizedKeys(auth_key_file,
2296 old_master_keys_by_uuid[master_node_uuid])
2297
2298 # Remove the old key from all node's authorized keys file
2299 logging.debug("Remove the old master key from all nodes.")
2300 node_errors = RemoveNodeSshKey(
2301 master_node_uuid, master_node_name, master_candidate_uuids,
2302 potential_master_candidates,
2303 keys_to_remove=old_master_keys_by_uuid, from_authorized_keys=True,
2304 from_public_keys=False, clear_authorized_keys=False,
2305 clear_public_keys=False,
2306 ssh_update_debug=ssh_update_debug,
2307 ssh_update_verbose=ssh_update_verbose)
2308 if node_errors:
2309 all_node_errors = all_node_errors + node_errors
2310
2311 return all_node_errors
2312
2313
2314 def GetBlockDevSizes(devices):
2315 """Return the size of the given block devices
2316
2317 @type devices: list
2318 @param devices: list of block device nodes to query
2319 @rtype: dict
2320 @return:
2321 dictionary of all block devices under /dev (key). The value is their
2322 size in MiB.
2323
2324 {'/dev/disk/by-uuid/123456-12321231-312312-312': 124}
2325
2326 """
2327 DEV_PREFIX = "/dev/"
2328 blockdevs = {}
2329
2330 for devpath in devices:
2331 if not utils.IsBelowDir(DEV_PREFIX, devpath):
2332 continue
2333
2334 try:
2335 st = os.stat(devpath)
2336 except EnvironmentError, err:
2337 logging.warning("Error stat()'ing device %s: %s", devpath, str(err))
2338 continue
2339
2340 if stat.S_ISBLK(st.st_mode):
2341 result = utils.RunCmd(["blockdev", "--getsize64", devpath])
2342 if result.failed:
2343 # We don't want to fail, just do not list this device as available
2344 logging.warning("Cannot get size for block device %s", devpath)
2345 continue
2346
2347 size = int(result.stdout) / (1024 * 1024)
2348 blockdevs[devpath] = size
2349 return blockdevs
2350
2351
2352 def GetVolumeList(vg_names):
2353 """Compute list of logical volumes and their size.
2354
2355 @type vg_names: list
2356 @param vg_names: the volume groups whose LVs we should list, or
2357 empty for all volume groups
2358 @rtype: dict
2359 @return:
2360 dictionary of all partions (key) with value being a tuple of
2361 their size (in MiB), inactive and online status::
2362
2363 {'xenvg/test1': ('20.06', True, True)}
2364
2365 in case of errors, a string is returned with the error
2366 details.
2367
2368 """
2369 lvs = {}
2370 sep = "|"
2371 if not vg_names:
2372 vg_names = []
2373 result = utils.RunCmd(["lvs", "--noheadings", "--units=m", "--nosuffix",
2374 "--separator=%s" % sep,
2375 "-ovg_name,lv_name,lv_size,lv_attr"] + vg_names)
2376 if result.failed:
2377 _Fail("Failed to list logical volumes, lvs output: %s", result.output)
2378
2379 for line in result.stdout.splitlines():
2380 line = line.strip()
2381 match = _LVSLINE_REGEX.match(line)
2382 if not match:
2383 logging.error("Invalid line returned from lvs output: '%s'", line)
2384 continue
2385 vg_name, name, size, attr = match.groups()
2386 inactive = attr[4] == "-"
2387 online = attr[5] == "o"
2388 virtual = attr[0] == "v"
2389 if virtual:
2390 # we don't want to report such volumes as existing, since they
2391 # don't really hold data
2392 continue
2393 lvs[vg_name + "/" + name] = (size, inactive, online)
2394
2395 return lvs
2396
2397
2398 def ListVolumeGroups():
2399 """List the volume groups and their size.
2400
2401 @rtype: dict
2402 @return: dictionary with keys volume name and values the
2403 size of the volume
2404
2405 """
2406 return utils.ListVolumeGroups()
2407
2408
2409 def NodeVolumes():
2410 """List all volumes on this node.
2411
2412 @rtype: list
2413 @return:
2414 A list of dictionaries, each having four keys:
2415 - name: the logical volume name,
2416 - size: the size of the logical volume
2417 - dev: the physical device on which the LV lives
2418 - vg: the volume group to which it belongs
2419
2420 In case of errors, we return an empty list and log the
2421 error.
2422
2423 Note that since a logical volume can live on multiple physical
2424 volumes, the resulting list might include a logical volume
2425 multiple times.
2426
2427 """
2428 result = utils.RunCmd(["lvs", "--noheadings", "--units=m", "--nosuffix",
2429 "--separator=|",
2430 "--options=lv_name,lv_size,devices,vg_name"])
2431 if result.failed:
2432 _Fail("Failed to list logical volumes, lvs output: %s",
2433 result.output)
2434
2435 def parse_dev(dev):
2436 return dev.split("(")[0]
2437
2438 def handle_dev(dev):
2439 return [parse_dev(x) for x in dev.split(",")]
2440
2441 def map_line(line):
2442 line = [v.strip() for v in line]
2443 return [{"name": line[0], "size": line[1],
2444 "dev": dev, "vg": line[3]} for dev in handle_dev(line[2])]
2445
2446 all_devs = []
2447 for line in result.stdout.splitlines():
2448 if line.count("|") >= 3:
2449 all_devs.extend(map_line(line.split("|")))
2450 else:
2451 logging.warning("Strange line in the output from lvs: '%s'", line)
2452 return all_devs
2453
2454
2455 def BridgesExist(bridges_list):
2456 """Check if a list of bridges exist on the current node.
2457
2458 @rtype: boolean
2459 @return: C{True} if all of them exist, C{False} otherwise
2460
2461 """
2462 missing = []
2463 for bridge in bridges_list:
2464 if not utils.BridgeExists(bridge):
2465 missing.append(bridge)
2466
2467 if missing:
2468 _Fail("Missing bridges %s", utils.CommaJoin(missing))
2469
2470
2471 def GetInstanceListForHypervisor(hname, hvparams=None,
2472 get_hv_fn=hypervisor.GetHypervisor):
2473 """Provides a list of instances of the given hypervisor.
2474
2475 @type hname: string
2476 @param hname: name of the hypervisor
2477 @type hvparams: dict of strings
2478 @param hvparams: hypervisor parameters for the given hypervisor
2479 @type get_hv_fn: function
2480 @param get_hv_fn: function that returns a hypervisor for the given hypervisor
2481 name; optional parameter to increase testability
2482
2483 @rtype: list
2484 @return: a list of all running instances on the current node
2485 - instance1.example.com
2486 - instance2.example.com
2487
2488 """
2489 try:
2490 return get_hv_fn(hname).ListInstances(hvparams=hvparams)
2491 except errors.HypervisorError, err:
2492 _Fail("Error enumerating instances (hypervisor %s): %s",
2493 hname, err, exc=True)
2494
2495
2496 def GetInstanceList(hypervisor_list, all_hvparams=None,
2497 get_hv_fn=hypervisor.GetHypervisor):
2498 """Provides a list of instances.
2499
2500 @type hypervisor_list: list
2501 @param hypervisor_list: the list of hypervisors to query information
2502 @type all_hvparams: dict of dict of strings
2503 @param all_hvparams: a dictionary mapping hypervisor types to respective
2504 cluster-wide hypervisor parameters
2505 @type get_hv_fn: function
2506 @param get_hv_fn: function that returns a hypervisor for the given hypervisor
2507 name; optional parameter to increase testability
2508
2509 @rtype: list
2510 @return: a list of all running instances on the current node
2511 - instance1.example.com
2512 - instance2.example.com
2513
2514 """
2515 results = []
2516 for hname in hypervisor_list:
2517 hvparams = all_hvparams[hname]
2518 results.extend(GetInstanceListForHypervisor(hname, hvparams=hvparams,
2519 get_hv_fn=get_hv_fn))
2520 return results
2521
2522
2523 def GetInstanceInfo(instance, hname, hvparams=None):
2524 """Gives back the information about an instance as a dictionary.
2525
2526 @type instance: string
2527 @param instance: the instance name
2528 @type hname: string
2529 @param hname: the hypervisor type of the instance
2530 @type hvparams: dict of strings
2531 @param hvparams: the instance's hvparams
2532
2533 @rtype: dict
2534 @return: dictionary with the following keys:
2535 - memory: memory size of instance (int)
2536 - state: state of instance (HvInstanceState)
2537 - time: cpu time of instance (float)
2538 - vcpus: the number of vcpus (int)
2539
2540 """
2541 output = {}
2542
2543 iinfo = hypervisor.GetHypervisor(hname).GetInstanceInfo(instance,
2544 hvparams=hvparams)
2545 if iinfo is not None:
2546 output["memory"] = iinfo[2]
2547 output["vcpus"] = iinfo[3]
2548 output["state"] = iinfo[4]
2549 output["time"] = iinfo[5]
2550
2551 return output
2552
2553
2554 def GetInstanceMigratable(instance):
2555 """Computes whether an instance can be migrated.
2556
2557 @type instance: L{objects.Instance}
2558 @param instance: object representing the instance to be checked.
2559
2560 @rtype: tuple
2561 @return: tuple of (result, description) where:
2562 - result: whether the instance can be migrated or not
2563 - description: a description of the issue, if relevant
2564
2565 """
2566 hyper = hypervisor.GetHypervisor(instance.hypervisor)
2567 iname = instance.name
2568 if iname not in hyper.ListInstances(hvparams=instance.hvparams):
2569 _Fail("Instance %s is not running", iname)
2570
2571 for idx in range(len(instance.disks_info)):
2572 link_name = _GetBlockDevSymlinkPath(iname, idx)
2573 if not os.path.islink(link_name):
2574 logging.warning("Instance %s is missing symlink %s for disk %d",
2575 iname, link_name, idx)
2576
2577
2578 def GetAllInstancesInfo(hypervisor_list, all_hvparams):
2579 """Gather data about all instances.
2580
2581 This is the equivalent of L{GetInstanceInfo}, except that it
2582 computes data for all instances at once, thus being faster if one
2583 needs data about more than one instance.
2584
2585 @type hypervisor_list: list
2586 @param hypervisor_list: list of hypervisors to query for instance data
2587 @type all_hvparams: dict of dict of strings
2588 @param all_hvparams: mapping of hypervisor names to hvparams
2589
2590 @rtype: dict
2591 @return: dictionary of instance: data, with data having the following keys:
2592 - memory: memory size of instance (int)
2593 - state: xen state of instance (string)
2594 - time: cpu time of instance (float)
2595 - vcpus: the number of vcpus
2596
2597 """
2598 output = {}
2599 for hname in hypervisor_list:
2600 hvparams = all_hvparams[hname]
2601 iinfo = hypervisor.GetHypervisor(hname).GetAllInstancesInfo(hvparams)
2602 if iinfo:
2603 for name, _, memory, vcpus, state, times in iinfo:
2604 value = {
2605 "memory": memory,
2606 "vcpus": vcpus,
2607 "state": state,
2608 "time": times,
2609 }
2610 if name in output:
2611 # we only check static parameters, like memory and vcpus,
2612 # and not state and time which can change between the
2613 # invocations of the different hypervisors
2614 for key in "memory", "vcpus":
2615 if value[key] != output[name][key]:
2616 _Fail("Instance %s is running twice"
2617 " with different parameters", name)
2618 output[name] = value
2619
2620 return output
2621
2622
2623 def GetInstanceConsoleInfo(instance_param_dict,
2624 get_hv_fn=hypervisor.GetHypervisor):
2625 """Gather data about the console access of a set of instances of this node.
2626
2627 This function assumes that the caller already knows which instances are on
2628 this node, by calling a function such as L{GetAllInstancesInfo} or
2629 L{GetInstanceList}.
2630
2631 For every instance, a large amount of configuration data needs to be
2632 provided to the hypervisor interface in order to receive the console
2633 information. Whether this could or should be cut down can be discussed.
2634 The information is provided in a dictionary indexed by instance name,
2635 allowing any number of instance queries to be done.
2636
2637 @type instance_param_dict: dict of string to tuple of dictionaries, where the
2638 dictionaries represent: L{objects.Instance}, L{objects.Node},
2639 L{objects.NodeGroup}, HvParams, BeParams
2640 @param instance_param_dict: mapping of instance name to parameters necessary
2641 for console information retrieval
2642
2643 @rtype: dict
2644 @return: dictionary of instance: data, with data having the following keys:
2645 - instance: instance name
2646 - kind: console kind
2647 - message: used with kind == CONS_MESSAGE, indicates console to be
2648 unavailable, supplies error message
2649 - host: host to connect to
2650 - port: port to use
2651 - user: user for login
2652 - command: the command, broken into parts as an array
2653 - display: unknown, potentially unused?
2654
2655 """
2656
2657 output = {}
2658 for inst_name in instance_param_dict:
2659 instance = instance_param_dict[inst_name]["instance"]
2660 pnode = instance_param_dict[inst_name]["node"]
2661 group = instance_param_dict[inst_name]["group"]
2662 hvparams = instance_param_dict[inst_name]["hvParams"]
2663 beparams = instance_param_dict[inst_name]["beParams"]
2664
2665 instance = objects.Instance.FromDict(instance)
2666 pnode = objects.Node.FromDict(pnode)
2667 group = objects.NodeGroup.FromDict(group)
2668
2669 h = get_hv_fn(instance.hypervisor)
2670 output[inst_name] = h.GetInstanceConsole(instance, pnode, group,
2671 hvparams, beparams).ToDict()
2672
2673 return output
2674
2675
2676 def _InstanceLogName(kind, os_name, instance, component):
2677 """Compute the OS log filename for a given instance and operation.
2678
2679 The instance name and os name are passed in as strings since not all
2680 operations have these as part of an instance object.
2681
2682 @type kind: string
2683 @param kind: the operation type (e.g. add, import, etc.)
2684 @type os_name: string
2685 @param os_name: the os name
2686 @type instance: string
2687 @param instance: the name of the instance being imported/added/etc.
2688 @type component: string or None
2689 @param component: the name of the component of the instance being
2690 transferred
2691
2692 """
2693 # TODO: Use tempfile.mkstemp to create unique filename
2694 if component:
2695 assert "/" not in component
2696 c_msg = "-%s" % component
2697 else:
2698 c_msg = ""
2699 base = ("%s-%s-%s%s-%s.log" %
2700 (kind, os_name, instance, c_msg, utils.TimestampForFilename()))
2701 return utils.PathJoin(pathutils.LOG_OS_DIR, base)
2702
2703
2704 def InstanceOsAdd(instance, reinstall, debug):
2705 """Add an OS to an instance.
2706
2707 @type instance: L{objects.Instance}
2708 @param instance: Instance whose OS is to be installed
2709 @type reinstall: boolean
2710 @param reinstall: whether this is an instance reinstall
2711 @type debug: integer
2712 @param debug: debug level, passed to the OS scripts
2713 @rtype: None
2714
2715 """
2716 inst_os = OSFromDisk(instance.os)
2717
2718 create_env = OSEnvironment(instance, inst_os, debug)
2719 if reinstall:
2720 create_env["INSTANCE_REINSTALL"] = "1"
2721
2722 logfile = _InstanceLogName("add", instance.os, instance.name, None)
2723
2724 result = utils.RunCmd([inst_os.create_script], env=create_env,
2725 cwd=inst_os.path, output=logfile, reset_env=True)
2726 if result.failed:
2727 logging.error("os create command '%s' returned error: %s, logfile: %s,"
2728 " output: %s", result.cmd, result.fail_reason, logfile,
2729 result.output)
2730 lines = [utils.SafeEncode(val)
2731 for val in utils.TailFile(logfile, lines=20)]
2732 _Fail("OS create script failed (%s), last lines in the"
2733 " log file:\n%s", result.fail_reason, "\n".join(lines), log=False)
2734
2735
2736 def RunRenameInstance(instance, old_name, debug):
2737 """Run the OS rename script for an instance.
2738
2739 @type instance: L{objects.Instance}
2740 @param instance: Instance whose OS is to be installed
2741 @type old_name: string
2742 @param old_name: previous instance name
2743 @type debug: integer
2744 @param debug: debug level, passed to the OS scripts
2745 @rtype: boolean
2746 @return: the success of the operation
2747
2748 """
2749 inst_os = OSFromDisk(instance.os)
2750
2751 rename_env = OSEnvironment(instance, inst_os, debug)
2752 rename_env["OLD_INSTANCE_NAME"] = old_name
2753
2754 logfile = _InstanceLogName("rename", instance.os,
2755 "%s-%s" % (old_name, instance.name), None)
2756
2757 result = utils.RunCmd([inst_os.rename_script], env=rename_env,
2758 cwd=inst_os.path, output=logfile, reset_env=True)
2759
2760 if result.failed:
2761 logging.error("os create command '%s' returned error: %s output: %s",
2762 result.cmd, result.fail_reason, result.output)
2763 lines = [utils.SafeEncode(val)
2764 for val in utils.TailFile(logfile, lines=20)]
2765 _Fail("OS rename script failed (%s), last lines in the"
2766 " log file:\n%s", result.fail_reason, "\n".join(lines), log=False)
2767
2768
2769 def _GetBlockDevSymlinkPath(instance_name, idx, _dir=None):
2770 """Returns symlink path for block device.
2771
2772 """
2773 if _dir is None:
2774 _dir = pathutils.DISK_LINKS_DIR
2775
2776 return utils.PathJoin(_dir,
2777 ("%s%s%s" %
2778 (instance_name, constants.DISK_SEPARATOR, idx)))
2779
2780
2781 def _SymlinkBlockDev(instance_name, device_path, idx):
2782 """Set up symlinks to a instance's block device.
2783
2784 This is an auxiliary function run when an instance is start (on the primary
2785 node) or when an instance is migrated (on the target node).
2786
2787
2788 @param instance_name: the name of the target instance
2789 @param device_path: path of the physical block device, on the node
2790 @param idx: the disk index
2791 @return: absolute path to the disk's symlink
2792
2793 """
2794 # In case we have only a userspace access URI, device_path is None
2795 if not device_path:
2796 return None
2797
2798 link_name = _GetBlockDevSymlinkPath(instance_name, idx)
2799 try:
2800 os.symlink(device_path, link_name)
2801 except OSError, err:
2802 if err.errno == errno.EEXIST:
2803 if (not os.path.islink(link_name) or
2804 os.readlink(link_name) != device_path):
2805 os.remove(link_name)
2806 os.symlink(device_path, link_name)
2807 else:
2808 raise
2809
2810 return link_name
2811
2812
2813 def _RemoveBlockDevLinks(instance_name, disks):
2814 """Remove the block device symlinks belonging to the given instance.
2815
2816 """
2817 for idx, _ in enumerate(disks):
2818 link_name = _GetBlockDevSymlinkPath(instance_name, idx)
2819 if os.path.islink(link_name):
2820 try:
2821 os.remove(link_name)
2822 except OSError:
2823 logging.exception("Can't remove symlink '%s'", link_name)
2824
2825
2826 def _CalculateDeviceURI(instance, disk, device):
2827 """Get the URI for the device.
2828
2829 @type instance: L{objects.Instance}
2830 @param instance: the instance which disk belongs to
2831 @type disk: L{objects.Disk}
2832 @param disk: the target disk object
2833 @type device: L{bdev.BlockDev}
2834 @param device: the corresponding BlockDevice
2835 @rtype: string
2836 @return: the device uri if any else None
2837
2838 """
2839 access_mode = disk.params.get(constants.LDP_ACCESS,
2840 constants.DISK_KERNELSPACE)
2841 if access_mode == constants.DISK_USERSPACE:
2842 # This can raise errors.BlockDeviceError
2843 return device.GetUserspaceAccessUri(instance.hypervisor)
2844 else:
2845 return None
2846
2847
2848 def _GatherAndLinkBlockDevs(instance):
2849 """Set up an instance's block device(s).
2850
2851 This is run on the primary node at instance startup. The block
2852 devices must be already assembled.
2853
2854 @type instance: L{objects.Instance}
2855 @param instance: the instance whose disks we should assemble
2856 @rtype: list
2857 @return: list of (disk_object, link_name, drive_uri)
2858
2859 """
2860 block_devices = []
2861 for idx, disk in enumerate(instance.disks_info):
2862 device = _RecursiveFindBD(disk)
2863 if device is None:
2864 raise errors.BlockDeviceError("Block device '%s' is not set up." %
2865 str(disk))
2866 device.Open()
2867 try:
2868 link_name = _SymlinkBlockDev(instance.name, device.dev_path, idx)
2869 except OSError, e:
2870 raise errors.BlockDeviceError("Cannot create block device symlink: %s" %
2871 e.strerror)
2872 uri = _CalculateDeviceURI(instance, disk, device)
2873
2874 block_devices.append((disk, link_name, uri))
2875
2876 return block_devices
2877
2878
2879 def _IsInstanceUserDown(instance_info):
2880 return instance_info and \
2881 "state" in instance_info and \
2882 hv_base.HvInstanceState.IsShutdown(instance_info["state"])
2883
2884
2885 def _GetInstanceInfo(instance):
2886 """Helper function L{GetInstanceInfo}"""
2887 return GetInstanceInfo(instance.name, instance.hypervisor,
2888 hvparams=instance.hvparams)
2889
2890
2891 def StartInstance(instance, startup_paused, reason, store_reason=True):
2892 """Start an instance.
2893
2894 @type instance: L{objects.Instance}
2895 @param instance: the instance object
2896 @type startup_paused: bool
2897 @param instance: pause instance at startup?
2898 @type reason: list of reasons
2899 @param reason: the reason trail for this startup
2900 @type store_reason: boolean
2901 @param store_reason: whether to store the shutdown reason trail on file
2902 @rtype: None
2903
2904 """
2905 instance_info = _GetInstanceInfo(instance)
2906
2907 if instance_info and not _IsInstanceUserDown(instance_info):
2908 logging.info("Instance '%s' already running, not starting", instance.name)
2909 return
2910
2911 try:
2912 block_devices = _GatherAndLinkBlockDevs(instance)
2913 hyper = hypervisor.GetHypervisor(instance.hypervisor)
2914 hyper.StartInstance(instance, block_devices, startup_paused)
2915 if store_reason:
2916 _StoreInstReasonTrail(instance.name, reason)
2917 except errors.BlockDeviceError, err:
2918 _Fail("Block device error: %s", err, exc=True)
2919 except errors.HypervisorError, err:
2920 _RemoveBlockDevLinks(instance.name, instance.disks_info)
2921 _Fail("Hypervisor error: %s", err, exc=True)
2922
2923
2924 def InstanceShutdown(instance, timeout, reason, store_reason=True):
2925 """Shut an instance down.
2926
2927 @note: this functions uses polling with a hardcoded timeout.
2928
2929 @type instance: L{objects.Instance}
2930 @param instance: the instance object
2931 @type timeout: integer
2932 @param timeout: maximum timeout for soft shutdown
2933 @type reason: list of reasons
2934 @param reason: the reason trail for this shutdown
2935 @type store_reason: boolean
2936 @param store_reason: whether to store the shutdown reason trail on file
2937 @rtype: None
2938
2939 """
2940 hyper = hypervisor.GetHypervisor(instance.hypervisor)
2941
2942 if not _GetInstanceInfo(instance):
2943 logging.info("Instance '%s' not running, doing nothing", instance.name)
2944 return
2945
2946 class _TryShutdown(object):
2947 def __init__(self):
2948 self.tried_once = False
2949
2950 def __call__(self):
2951 if not _GetInstanceInfo(instance):
2952 return
2953
2954 try:
2955 hyper.StopInstance(instance, retry=self.tried_once, timeout=timeout)
2956 if store_reason:
2957 _StoreInstReasonTrail(instance.name, reason)
2958 except errors.HypervisorError, err:
2959 # if the instance is no longer existing, consider this a
2960 # success and go to cleanup
2961 if not _GetInstanceInfo(instance):
2962 return
2963
2964 _Fail("Failed to stop instance '%s': %s", instance.name, err)
2965
2966 self.tried_once = True
2967
2968 raise utils.RetryAgain()
2969
2970 try:
2971 utils.Retry(_TryShutdown(), 5, timeout)
2972 except utils.RetryTimeout:
2973 # the shutdown did not succeed
2974 logging.error("Shutdown of '%s' unsuccessful, forcing", instance.name)
2975
2976 try:
2977 hyper.StopInstance(instance, force=True)
2978 except errors.HypervisorError, err:
2979 # only raise an error if the instance still exists, otherwise
2980 # the error could simply be "instance ... unknown"!
2981 if _GetInstanceInfo(instance):
2982 _Fail("Failed to force stop instance '%s': %s", instance.name, err)
2983
2984 time.sleep(1)
2985
2986 if _GetInstanceInfo(instance):
2987 _Fail("Could not shutdown instance '%s' even by destroy", instance.name)
2988
2989 try:
2990 hyper.CleanupInstance(instance.name)
2991 except errors.HypervisorError, err:
2992 logging.warning("Failed to execute post-shutdown cleanup step: %s", err)
2993
2994 _RemoveBlockDevLinks(instance.name, instance.disks_info)
2995
2996
2997 def InstanceReboot(instance, reboot_type, shutdown_timeout, reason):
2998 """Reboot an instance.
2999
3000 @type instance: L{objects.Instance}
3001 @param instance: the instance object to reboot
3002 @type reboot_type: str
3003 @param reboot_type: the type of reboot, one the following
3004 constants:
3005 - L{constants.INSTANCE_REBOOT_SOFT}: only reboot the
3006 instance OS, do not recreate the VM
3007 - L{constants.INSTANCE_REBOOT_HARD}: tear down and
3008 restart the VM (at the hypervisor level)
3009 - the other reboot type (L{constants.INSTANCE_REBOOT_FULL}) is
3010 not accepted here, since that mode is handled differently, in
3011 cmdlib, and translates into full stop and start of the
3012 instance (instead of a call_instance_reboot RPC)
3013 @type shutdown_timeout: integer
3014 @param shutdown_timeout: maximum timeout for soft shutdown
3015 @type reason: list of reasons
3016 @param reason: the reason trail for this reboot
3017 @rtype: None
3018
3019 """
3020 # TODO: this is inconsistent with 'StartInstance' and 'InstanceShutdown'
3021 # because those functions simply 'return' on error whereas this one
3022 # raises an exception with '_Fail'
3023 if not _GetInstanceInfo(instance):
3024 _Fail("Cannot reboot instance '%s' that is not running", instance.name)
3025
3026 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3027 if reboot_type == constants.INSTANCE_REBOOT_SOFT:
3028 try:
3029 hyper.RebootInstance(instance)
3030 except errors.HypervisorError, err:
3031 _Fail("Failed to soft reboot instance '%s': %s", instance.name, err)
3032 elif reboot_type == constants.INSTANCE_REBOOT_HARD:
3033 try:
3034 InstanceShutdown(instance, shutdown_timeout, reason, store_reason=False)
3035 result = StartInstance(instance, False, reason, store_reason=False)
3036 _StoreInstReasonTrail(instance.name, reason)
3037 return result
3038 except errors.HypervisorError, err:
3039 _Fail("Failed to hard reboot instance '%s': %s", instance.name, err)
3040 else:
3041 _Fail("Invalid reboot_type received: '%s'", reboot_type)
3042
3043
3044 def InstanceBalloonMemory(instance, memory):
3045 """Resize an instance's memory.
3046
3047 @type instance: L{objects.Instance}
3048 @param instance: the instance object
3049 @type memory: int
3050 @param memory: new memory amount in MB
3051 @rtype: None
3052
3053 """
3054 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3055 running = hyper.ListInstances(hvparams=instance.hvparams)
3056 if instance.name not in running:
3057 logging.info("Instance %s is not running, cannot balloon", instance.name)
3058 return
3059 try:
3060 hyper.BalloonInstanceMemory(instance, memory)
3061 except errors.HypervisorError, err:
3062 _Fail("Failed to balloon instance memory: %s", err, exc=True)
3063
3064
3065 def MigrationInfo(instance):
3066 """Gather information about an instance to be migrated.
3067
3068 @type instance: L{objects.Instance}
3069 @param instance: the instance definition
3070
3071 """
3072 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3073 try:
3074 info = hyper.MigrationInfo(instance)
3075 except errors.HypervisorError, err:
3076 _Fail("Failed to fetch migration information: %s", err, exc=True)
3077 return info
3078
3079
3080 def AcceptInstance(instance, info, target):
3081 """Prepare the node to accept an instance.
3082
3083 @type instance: L{objects.Instance}
3084 @param instance: the instance definition
3085 @type info: string/data (opaque)
3086 @param info: migration information, from the source node
3087 @type target: string
3088 @param target: target host (usually ip), on this node
3089
3090 """
3091 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3092 try:
3093 hyper.AcceptInstance(instance, info, target)
3094 except errors.HypervisorError, err:
3095 _Fail("Failed to accept instance: %s", err, exc=True)
3096
3097
3098 def FinalizeMigrationDst(instance, info, success):
3099 """Finalize any preparation to accept an instance.
3100
3101 @type instance: L{objects.Instance}
3102 @param instance: the instance definition
3103 @type info: string/data (opaque)
3104 @param info: migration information, from the source node
3105 @type success: boolean
3106 @param success: whether the migration was a success or a failure
3107
3108 """
3109 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3110 try:
3111 hyper.FinalizeMigrationDst(instance, info, success)
3112 except errors.HypervisorError, err:
3113 _Fail("Failed to finalize migration on the target node: %s", err, exc=True)
3114
3115
3116 def MigrateInstance(cluster_name, instance, target, live):
3117 """Migrates an instance to another node.
3118
3119 @type cluster_name: string
3120 @param cluster_name: name of the cluster
3121 @type instance: L{objects.Instance}
3122 @param instance: the instance definition
3123 @type target: string
3124 @param target: the target node name
3125 @type live: boolean
3126 @param live: whether the migration should be done live or not (the
3127 interpretation of this parameter is left to the hypervisor)
3128 @raise RPCFail: if migration fails for some reason
3129
3130 """
3131 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3132
3133 try:
3134 hyper.MigrateInstance(cluster_name, instance, target, live)
3135 except errors.HypervisorError, err:
3136 _Fail("Failed to migrate instance: %s", err, exc=True)
3137
3138
3139 def FinalizeMigrationSource(instance, success, live):
3140 """Finalize the instance migration on the source node.
3141
3142 @type instance: L{objects.Instance}
3143 @param instance: the instance definition of the migrated instance
3144 @type success: bool
3145 @param success: whether the migration succeeded or not
3146 @type live: bool
3147 @param live: whether the user requested a live migration or not
3148 @raise RPCFail: If the execution fails for some reason
3149
3150 """
3151 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3152
3153 try:
3154 hyper.FinalizeMigrationSource(instance, success, live)
3155 except Exception, err: # pylint: disable=W0703
3156 _Fail("Failed to finalize the migration on the source node: %s", err,
3157 exc=True)
3158
3159
3160 def GetMigrationStatus(instance):
3161 """Get the migration status
3162
3163 @type instance: L{objects.Instance}
3164 @param instance: the instance that is being migrated
3165 @rtype: L{objects.MigrationStatus}
3166 @return: the status of the current migration (one of
3167 L{constants.HV_MIGRATION_VALID_STATUSES}), plus any additional
3168 progress info that can be retrieved from the hypervisor
3169 @raise RPCFail: If the migration status cannot be retrieved
3170
3171 """
3172 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3173 try:
3174 return hyper.GetMigrationStatus(instance)
3175 except Exception, err: # pylint: disable=W0703
3176 _Fail("Failed to get migration status: %s", err, exc=True)
3177
3178
3179 def HotplugDevice(instance, action, dev_type, device, extra, seq):
3180 """Hotplug a device
3181
3182 Hotplug is currently supported only for KVM Hypervisor.
3183 @type instance: L{objects.Instance}
3184 @param instance: the instance to which we hotplug a device
3185 @type action: string
3186 @param action: the hotplug action to perform
3187 @type dev_type: string
3188 @param dev_type: the device type to hotplug
3189 @type device: either L{objects.NIC} or L{objects.Disk}
3190 @param device: the device object to hotplug
3191 @type extra: tuple
3192 @param extra: extra info used for disk hotplug (disk link, drive uri)
3193 @type seq: int
3194 @param seq: the index of the device from master perspective
3195 @raise RPCFail: in case instance does not have KVM hypervisor
3196
3197 """
3198 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3199 try:
3200 hyper.VerifyHotplugSupport(instance, action, dev_type)
3201 except errors.HotplugError, err:
3202 _Fail("Hotplug is not supported: %s", err)
3203
3204 if action == constants.HOTPLUG_ACTION_ADD:
3205 fn = hyper.HotAddDevice
3206 elif action == constants.HOTPLUG_ACTION_REMOVE:
3207 fn = hyper.HotDelDevice
3208 elif action == constants.HOTPLUG_ACTION_MODIFY:
3209 fn = hyper.HotModDevice
3210 else:
3211 assert action in constants.HOTPLUG_ALL_ACTIONS
3212
3213 return fn(instance, dev_type, device, extra, seq)
3214
3215
3216 def HotplugSupported(instance):
3217 """Checks if hotplug is generally supported.
3218
3219 """
3220 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3221 try:
3222 hyper.HotplugSupported(instance)
3223 except errors.HotplugError, err:
3224 _Fail("Hotplug is not supported: %s", err)
3225
3226
3227 def ModifyInstanceMetadata(metadata):
3228 """Sends instance data to the metadata daemon.
3229
3230 Uses the Luxi transport layer to communicate with the metadata
3231 daemon configuration server. It starts the metadata daemon if it is
3232 not running.
3233 The daemon must be enabled during at configuration time.
3234
3235 @type metadata: dict
3236 @param metadata: instance metadata obtained by calling
3237 L{objects.Instance.ToDict} on an instance object
3238
3239 """
3240 if not constants.ENABLE_METAD:
3241 raise errors.ProgrammerError("The metadata deamon is disabled, yet"
3242 " ModifyInstanceMetadata has been called")
3243
3244 if not utils.IsDaemonAlive(constants.METAD):
3245 result = utils.RunCmd([pathutils.DAEMON_UTIL, "start", constants.METAD])
3246 if result.failed:
3247 raise errors.HypervisorError("Failed to start metadata daemon")
3248
3249 with contextlib.closing(metad.Client()) as client:
3250 client.UpdateConfig(metadata)
3251
3252
3253 def BlockdevCreate(disk, size, owner, on_primary, info, excl_stor):
3254 """Creates a block device for an instance.
3255
3256 @type disk: L{objects.Disk}
3257 @param disk: the object describing the disk we should create
3258 @type size: int
3259 @param size: the size of the physical underlying device, in MiB
3260 @type owner: str
3261 @param owner: the name of the instance for which disk is created,
3262 used for device cache data
3263 @type on_primary: boolean
3264 @param on_primary: indicates if it is the primary node or not
3265 @type info: string
3266 @param info: string that will be sent to the physical device
3267 creation, used for example to set (LVM) tags on LVs
3268 @type excl_stor: boolean
3269 @param excl_stor: Whether exclusive_storage is active
3270
3271 @return: the new unique_id of the device (this can sometime be
3272 computed only after creation), or None. On secondary nodes,
3273 it's not required to return anything.
3274
3275 """
3276 # TODO: remove the obsolete "size" argument
3277 # pylint: disable=W0613
3278 clist = []
3279 if disk.children:
3280 for child in disk.children:
3281 try:
3282 crdev = _RecursiveAssembleBD(child, owner, on_primary)
3283 except errors.BlockDeviceError, err:
3284 _Fail("Can't assemble device %s: %s", child, err)
3285 if on_primary or disk.AssembleOnSecondary():
3286 # we need the children open in case the device itself has to
3287 # be assembled
3288 try:
3289 # pylint: disable=E1103
3290 crdev.Open()
3291 except errors.BlockDeviceError, err:
3292 _Fail("Can't make child '%s' read-write: %s", child, err)
3293 clist.append(crdev)
3294
3295 try:
3296 device = bdev.Create(disk, clist, excl_stor)
3297 except errors.BlockDeviceError, err:
3298 _Fail("Can't create block device: %s", err)
3299
3300 if on_primary or disk.AssembleOnSecondary():
3301 try:
3302 device.Assemble()
3303 except errors.BlockDeviceError, err:
3304 _Fail("Can't assemble device after creation, unusual event: %s", err)
3305 if on_primary or disk.OpenOnSecondary():
3306 try:
3307 device.Open(force=True)
3308 except errors.BlockDeviceError, err:
3309 _Fail("Can't make device r/w after creation, unusual event: %s", err)
3310 DevCacheManager.UpdateCache(device.dev_path, owner,
3311 on_primary, disk.iv_name)
3312
3313 device.SetInfo(info)
3314
3315 return device.unique_id
3316
3317
3318 def _DumpDevice(source_path, target_path, offset, size, truncate):
3319 """This function images/wipes the device using a local file.
3320
3321 @type source_path: string
3322 @param source_path: path of the image or data source (e.g., "/dev/zero")
3323
3324 @type target_path: string
3325 @param target_path: path of the device to image/wipe
3326
3327 @type offset: int
3328 @param offset: offset in MiB in the output file
3329
3330 @type size: int
3331 @param size: maximum size in MiB to write (data source might be smaller)
3332
3333 @type truncate: bool
3334 @param truncate: whether the file should be truncated
3335
3336 @return: None
3337 @raise RPCFail: in case of failure
3338
3339 """
3340 # Internal sizes are always in Mebibytes; if the following "dd" command
3341 # should use a different block size the offset and size given to this
3342 # function must be adjusted accordingly before being passed to "dd".
3343 block_size = constants.DD_BLOCK_SIZE
3344
3345 cmd = [constants.DD_CMD, "if=%s" % source_path, "seek=%d" % offset,
3346 "bs=%s" % block_size, "oflag=direct", "of=%s" % target_path,
3347 "count=%d" % size]
3348