#
#

# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014 Google Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


"""Functions used by the node daemon

@var _ALLOWED_UPLOAD_FILES: denotes which files are accepted in
  the L{UploadFile} function
@var _ALLOWED_CLEAN_DIRS: denotes which directories are accepted
  in the L{_CleanDirectory} function

"""

# pylint: disable=E1103,C0302

# E1103: %s %r has no %r member (but some types could not be
# inferred), because the _TryOSFromDisk returns either (True, os_obj)
# or (False, "string") which confuses pylint

# C0302: This module has become too big and should be split up


import base64
import errno
import logging
import os
import os.path
import pycurl
import random
import re
import shutil
import signal
import stat
import tempfile
import time
import zlib
import contextlib
import collections

from ganeti import errors
from ganeti import http
from ganeti import utils
from ganeti import ssh
from ganeti import hypervisor
from ganeti.hypervisor import hv_base
from ganeti import constants
from ganeti.storage import bdev
from ganeti.storage import drbd
from ganeti.storage import extstorage
from ganeti.storage import filestorage
from ganeti import objects
from ganeti import ssconf
from ganeti import serializer
from ganeti import netutils
from ganeti import runtime
from ganeti import compat
from ganeti import pathutils
from ganeti import vcluster
from ganeti import ht
from ganeti.storage.base import BlockDev
from ganeti.storage.drbd import DRBD8
from ganeti import hooksmaster
import ganeti.metad as metad


_BOOT_ID_PATH = "/proc/sys/kernel/random/boot_id"
_ALLOWED_CLEAN_DIRS = compat.UniqueFrozenset([
  pathutils.DATA_DIR,
  pathutils.JOB_QUEUE_ARCHIVE_DIR,
  pathutils.QUEUE_DIR,
  pathutils.CRYPTO_KEYS_DIR,
  ])
_MAX_SSL_CERT_VALIDITY = 7 * 24 * 60 * 60
_X509_KEY_FILE = "key"
_X509_CERT_FILE = "cert"
_IES_STATUS_FILE = "status"
_IES_PID_FILE = "pid"
_IES_CA_FILE = "ca"

#: Valid LVS output line regex
_LVSLINE_REGEX = re.compile(r"^ *([^|]+)\|([^|]+)\|([0-9.]+)\|([^|]{6,})\|?$")

# Actions for the master setup script
_MASTER_START = "start"
_MASTER_STOP = "stop"

#: Maximum file permissions for restricted command directory and executables
_RCMD_MAX_MODE = (stat.S_IRWXU |
                  stat.S_IRGRP | stat.S_IXGRP |
                  stat.S_IROTH | stat.S_IXOTH)

#: Delay before returning an error for restricted commands
_RCMD_INVALID_DELAY = 10

#: How long to wait to acquire lock for restricted commands (shorter than
#: L{_RCMD_INVALID_DELAY}) to reduce blockage of noded forks when many
#: command requests arrive
_RCMD_LOCK_TIMEOUT = _RCMD_INVALID_DELAY * 0.8


class RPCFail(Exception):
  """Class denoting RPC failure.

  Its argument is the error message.

  """


def _GetInstReasonFilename(instance_name):
  """Path of the file containing the reason of the instance status change.

  @type instance_name: string
  @param instance_name: The name of the instance
  @rtype: string
  @return: The path of the file

  """
  return utils.PathJoin(pathutils.INSTANCE_REASON_DIR, instance_name)


def _StoreInstReasonTrail(instance_name, trail):
  """Serialize a reason trail related to an instance change of state to file.

  The exact location of the file depends on the name of the instance and on
  the configuration of the Ganeti cluster defined at deploy time.

  @type instance_name: string
  @param instance_name: The name of the instance

  @type trail: list of reasons
  @param trail: reason trail

  @rtype: None

  """
  json = serializer.DumpJson(trail)
  filename = _GetInstReasonFilename(instance_name)
  utils.WriteFile(filename, data=json)


def _Fail(msg, *args, **kwargs):
  """Log an error and then raise an RPCFail exception.

  This exception is then handled specially in the ganeti daemon and
  turned into a 'failed' return type. As such, this function is a
  useful shortcut for logging the error and returning it to the master
  daemon.

  @type msg: string
  @param msg: the text of the exception
  @raise RPCFail

  """
  if args:
    msg = msg % args
  if "log" not in kwargs or kwargs["log"]: # if we should log this error
    if "exc" in kwargs and kwargs["exc"]:
      logging.exception(msg)
    else:
      logging.error(msg)
  raise RPCFail(msg)
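
# Illustrative use of _Fail (a sketch; the node name and error text are
# made up):
#
#   try:
#     _Fail("Cannot contact node %s: %s", "node1.example.com", "timeout")
#   except RPCFail, err:
#     ...  # noded turns this into a 'failed' RPC result carrying str(err)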


def _GetConfig():
  """Simple wrapper to return a SimpleStore.

  @rtype: L{ssconf.SimpleStore}
  @return: a SimpleStore instance

  """
  return ssconf.SimpleStore()


def _GetSshRunner(cluster_name):
  """Simple wrapper to return an SshRunner.

  @type cluster_name: str
  @param cluster_name: the cluster name, which is needed
    by the SshRunner constructor
  @rtype: L{ssh.SshRunner}
  @return: an SshRunner instance

  """
  return ssh.SshRunner(cluster_name)


def _Decompress(data):
  """Unpacks data compressed by the RPC client.

  @type data: list or tuple
  @param data: Data sent by RPC client
  @rtype: str
  @return: Decompressed data

  """
  assert isinstance(data, (list, tuple))
  assert len(data) == 2
  (encoding, content) = data
  if encoding == constants.RPC_ENCODING_NONE:
    return content
  elif encoding == constants.RPC_ENCODING_ZLIB_BASE64:
    return zlib.decompress(base64.b64decode(content))
  else:
    raise AssertionError("Unknown data encoding")
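
# A minimal round-trip sketch for the RPC payload encoding handled above
# (constants.RPC_ENCODING_ZLIB_BASE64 marks zlib-compressed, base64-encoded
# content; the sample payload is made up):
#
#   packed = (constants.RPC_ENCODING_ZLIB_BASE64,
#             base64.b64encode(zlib.compress("some payload")))
#   assert _Decompress(packed) == "some payload"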


def _CleanDirectory(path, exclude=None):
  """Removes all regular files in a directory.

  @type path: str
  @param path: the directory to clean
  @type exclude: list
  @param exclude: list of files to be excluded, defaults
    to the empty list

  """
  if path not in _ALLOWED_CLEAN_DIRS:
    _Fail("Path passed to _CleanDirectory not in allowed clean targets: '%s'",
          path)

  if not os.path.isdir(path):
    return
  if exclude is None:
    exclude = []
  else:
    # Normalize excluded paths
    exclude = [os.path.normpath(i) for i in exclude]

  for rel_name in utils.ListVisibleFiles(path):
    full_name = utils.PathJoin(path, rel_name)
    if full_name in exclude:
      continue
    if os.path.isfile(full_name) and not os.path.islink(full_name):
      utils.RemoveFile(full_name)
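
# Example call (illustrative): remove everything from the job queue
# directory except the lock file, mirroring what JobQueuePurge does below:
#
#   _CleanDirectory(pathutils.QUEUE_DIR,
#                   exclude=[pathutils.JOB_QUEUE_LOCK_FILE])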


def _BuildUploadFileList():
  """Build the list of allowed upload files.

  This is abstracted so that it's built only once at module import time.

  """
  allowed_files = set([
    pathutils.CLUSTER_CONF_FILE,
    pathutils.ETC_HOSTS,
    pathutils.SSH_KNOWN_HOSTS_FILE,
    pathutils.VNC_PASSWORD_FILE,
    pathutils.RAPI_CERT_FILE,
    pathutils.SPICE_CERT_FILE,
    pathutils.SPICE_CACERT_FILE,
    pathutils.RAPI_USERS_FILE,
    pathutils.CONFD_HMAC_KEY,
    pathutils.CLUSTER_DOMAIN_SECRET_FILE,
    ])

  for hv_name in constants.HYPER_TYPES:
    hv_class = hypervisor.GetHypervisorClass(hv_name)
    allowed_files.update(hv_class.GetAncillaryFiles()[0])

  assert pathutils.FILE_STORAGE_PATHS_FILE not in allowed_files, \
    "Allowed file storage paths should never be uploaded via RPC"

  return frozenset(allowed_files)


_ALLOWED_UPLOAD_FILES = _BuildUploadFileList()


def JobQueuePurge():
  """Removes job queue files and archived jobs.

  @rtype: None

  """
  _CleanDirectory(pathutils.QUEUE_DIR, exclude=[pathutils.JOB_QUEUE_LOCK_FILE])
  _CleanDirectory(pathutils.JOB_QUEUE_ARCHIVE_DIR)


def GetMasterNodeName():
  """Returns the master node name.

  @rtype: string
  @return: name of the master node
  @raise RPCFail: in case of errors

  """
  try:
    return _GetConfig().GetMasterNode()
  except errors.ConfigurationError, err:
    _Fail("Cluster configuration incomplete: %s", err, exc=True)


def RunLocalHooks(hook_opcode, hooks_path, env_builder_fn):
  """Decorator that runs hooks before and after the decorated function.

  @type hook_opcode: string
  @param hook_opcode: opcode of the hook
  @type hooks_path: string
  @param hooks_path: path of the hooks
  @type env_builder_fn: function
  @param env_builder_fn: function that returns a dictionary containing the
    environment variables for the hooks. Will get all the parameters of the
    decorated function.
  @raise RPCFail: in case of pre-hook failure

  """
  def decorator(fn):
    def wrapper(*args, **kwargs):
      _, myself = ssconf.GetMasterAndMyself()
      nodes = ([myself], [myself]) # these hooks run locally

      env_fn = compat.partial(env_builder_fn, *args, **kwargs)

      cfg = _GetConfig()
      hr = HooksRunner()
      hm = hooksmaster.HooksMaster(hook_opcode, hooks_path, nodes,
                                   hr.RunLocalHooks, None, env_fn, None,
                                   logging.warning, cfg.GetClusterName(),
                                   cfg.GetMasterNode())
      hm.RunPhase(constants.HOOKS_PHASE_PRE)
      result = fn(*args, **kwargs)
      hm.RunPhase(constants.HOOKS_PHASE_POST)

      return result
    return wrapper
  return decorator
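
# Sketch of how this decorator is used elsewhere in this module (see
# ActivateMasterIp below): the env-builder function receives the decorated
# function's arguments and produces the hook environment:
#
#   @RunLocalHooks(constants.FAKE_OP_MASTER_TURNUP, "master-ip-turnup",
#                  _BuildMasterIpEnv)
#   def ActivateMasterIp(master_params, use_external_mip_script):
#     ...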


def _BuildMasterIpEnv(master_params, use_external_mip_script=None):
  """Builds environment variables for master IP hooks.

  @type master_params: L{objects.MasterNetworkParameters}
  @param master_params: network parameters of the master
  @type use_external_mip_script: boolean
  @param use_external_mip_script: whether to use an external master IP
    address setup script (unused, but necessary per the implementation of the
    L{RunLocalHooks} decorator)

  """
  # pylint: disable=W0613
  ver = netutils.IPAddress.GetVersionFromAddressFamily(master_params.ip_family)
  env = {
    "MASTER_NETDEV": master_params.netdev,
    "MASTER_IP": master_params.ip,
    "MASTER_NETMASK": str(master_params.netmask),
    "CLUSTER_IP_VERSION": str(ver),
  }

  return env


def _RunMasterSetupScript(master_params, action, use_external_mip_script):
  """Execute the master IP address setup script.

  @type master_params: L{objects.MasterNetworkParameters}
  @param master_params: network parameters of the master
  @type action: string
  @param action: action to pass to the script. Must be one of
    L{backend._MASTER_START} or L{backend._MASTER_STOP}
  @type use_external_mip_script: boolean
  @param use_external_mip_script: whether to use an external master IP
    address setup script
  @raise backend.RPCFail: if there are errors during the execution of the
    script

  """
  env = _BuildMasterIpEnv(master_params)

  if use_external_mip_script:
    setup_script = pathutils.EXTERNAL_MASTER_SETUP_SCRIPT
  else:
    setup_script = pathutils.DEFAULT_MASTER_SETUP_SCRIPT

  result = utils.RunCmd([setup_script, action], env=env, reset_env=True)

  if result.failed:
    _Fail("Failed to %s the master IP. Script return value: %s, output: '%s'" %
          (action, result.exit_code, result.output), log=True)


@RunLocalHooks(constants.FAKE_OP_MASTER_TURNUP, "master-ip-turnup",
               _BuildMasterIpEnv)
def ActivateMasterIp(master_params, use_external_mip_script):
  """Activate the IP address of the master daemon.

  @type master_params: L{objects.MasterNetworkParameters}
  @param master_params: network parameters of the master
  @type use_external_mip_script: boolean
  @param use_external_mip_script: whether to use an external master IP
    address setup script
  @raise RPCFail: in case of errors during the IP startup

  """
  _RunMasterSetupScript(master_params, _MASTER_START,
                        use_external_mip_script)


def StartMasterDaemons(no_voting):
  """Activate local node as master node.

  The function will start the master daemons (such as ganeti-luxid and
  ganeti-wconfd).

  @type no_voting: boolean
  @param no_voting: whether to start the master daemons without a node vote
    but still non-interactively
  @rtype: None

  """

  if no_voting:
    daemon_args = "--no-voting --yes-do-it"
  else:
    daemon_args = ""

  env = {
    "EXTRA_LUXID_ARGS": daemon_args,
    "EXTRA_WCONFD_ARGS": daemon_args,
    }

  result = utils.RunCmd([pathutils.DAEMON_UTIL, "start-master"], env=env)
  if result.failed:
    msg = "Can't start Ganeti master: %s" % result.output
    logging.error(msg)
    _Fail(msg)


@RunLocalHooks(constants.FAKE_OP_MASTER_TURNDOWN, "master-ip-turndown",
               _BuildMasterIpEnv)
def DeactivateMasterIp(master_params, use_external_mip_script):
  """Deactivate the master IP on this node.

  @type master_params: L{objects.MasterNetworkParameters}
  @param master_params: network parameters of the master
  @type use_external_mip_script: boolean
  @param use_external_mip_script: whether to use an external master IP
    address setup script
  @raise RPCFail: in case of errors during the IP turndown

  """
  _RunMasterSetupScript(master_params, _MASTER_STOP,
                        use_external_mip_script)


def StopMasterDaemons():
  """Stop the master daemons on this node.

  Stop the master daemons (such as ganeti-luxid and ganeti-wconfd) on this
  node.

  @rtype: None

  """
  # TODO: log and report back to the caller the error failures; we
  # need to decide in which case we fail the RPC for this

  result = utils.RunCmd([pathutils.DAEMON_UTIL, "stop-master"])
  if result.failed:
    logging.error("Could not stop Ganeti master, command %s had exitcode %s"
                  " and error %s",
                  result.cmd, result.exit_code, result.output)


def ChangeMasterNetmask(old_netmask, netmask, master_ip, master_netdev):
  """Change the netmask of the master IP.

  @param old_netmask: the old value of the netmask
  @param netmask: the new value of the netmask
  @param master_ip: the master IP
  @param master_netdev: the master network device

  """
  if old_netmask == netmask:
    return

  if not netutils.IPAddress.Own(master_ip):
    _Fail("The master IP address is not up, not attempting to change its"
          " netmask")

  result = utils.RunCmd([constants.IP_COMMAND_PATH, "address", "add",
                         "%s/%s" % (master_ip, netmask),
                         "dev", master_netdev, "label",
                         "%s:0" % master_netdev])
  if result.failed:
    _Fail("Could not set the new netmask on the master IP address")

  result = utils.RunCmd([constants.IP_COMMAND_PATH, "address", "del",
                         "%s/%s" % (master_ip, old_netmask),
                         "dev", master_netdev, "label",
                         "%s:0" % master_netdev])
  if result.failed:
    _Fail("Could not bring down the master IP address with the old netmask")


def EtcHostsModify(mode, host, ip):
  """Modify a host entry in /etc/hosts.

  @param mode: The mode to operate. Either add or remove entry
  @param host: The host to operate on
  @param ip: The ip associated with the entry

  """
  if mode == constants.ETC_HOSTS_ADD:
    if not ip:
      _Fail("Mode 'add' needs 'ip' parameter, but parameter not"
            " present")
    utils.AddHostToEtcHosts(host, ip)
  elif mode == constants.ETC_HOSTS_REMOVE:
    if ip:
      _Fail("Mode 'remove' does not allow 'ip' parameter, but"
            " parameter is present")
    utils.RemoveHostFromEtcHosts(host)
  else:
    _Fail("Mode not supported")
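
# Illustrative calls (host and IP values are made up):
#
#   EtcHostsModify(constants.ETC_HOSTS_ADD, "node1.example.com", "192.0.2.10")
#   EtcHostsModify(constants.ETC_HOSTS_REMOVE, "node1.example.com", None)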


def LeaveCluster(modify_ssh_setup):
  """Cleans up and removes the current node.

  This function cleans up and prepares the current node to be removed
  from the cluster.

  If processing is successful, then it raises an
  L{errors.QuitGanetiException} which is used as a special case to
  shutdown the node daemon.

  @param modify_ssh_setup: boolean

  """
  _CleanDirectory(pathutils.DATA_DIR)
  _CleanDirectory(pathutils.CRYPTO_KEYS_DIR)
  JobQueuePurge()

  if modify_ssh_setup:
    try:
      priv_key, pub_key, auth_keys = ssh.GetUserFiles(constants.SSH_LOGIN_USER)

      ssh.RemoveAuthorizedKey(auth_keys, utils.ReadFile(pub_key))

      utils.RemoveFile(priv_key)
      utils.RemoveFile(pub_key)
    except errors.OpExecError:
      logging.exception("Error while processing ssh files")
    except IOError:
      logging.exception("At least one SSH file was not accessible.")

  try:
    utils.RemoveFile(pathutils.CONFD_HMAC_KEY)
    utils.RemoveFile(pathutils.RAPI_CERT_FILE)
    utils.RemoveFile(pathutils.SPICE_CERT_FILE)
    utils.RemoveFile(pathutils.SPICE_CACERT_FILE)
    utils.RemoveFile(pathutils.NODED_CERT_FILE)
  except: # pylint: disable=W0702
    logging.exception("Error while removing cluster secrets")

  utils.StopDaemon(constants.CONFD)
  utils.StopDaemon(constants.MOND)
  utils.StopDaemon(constants.KVMD)

  # Raise a custom exception (handled in ganeti-noded)
  raise errors.QuitGanetiException(True, "Shutdown scheduled")


def _CheckStorageParams(params, num_params):
  """Performs sanity checks for storage parameters.

  @type params: list
  @param params: list of storage parameters
  @type num_params: int
  @param num_params: expected number of parameters

  """
  if params is None:
    raise errors.ProgrammerError("No storage parameters for storage"
                                 " reporting were provided.")
  if not isinstance(params, list):
    raise errors.ProgrammerError("The storage parameters are not of type"
                                 " list: '%s'" % params)
  if len(params) != num_params:
    raise errors.ProgrammerError("Did not receive the expected number of"
                                 " storage parameters: expected %s,"
                                 " received '%s'" % (num_params, len(params)))


def _CheckLvmStorageParams(params):
  """Performs sanity check for the 'exclusive storage' flag.

  @see: C{_CheckStorageParams}

  """
  _CheckStorageParams(params, 1)
  excl_stor = params[0]
  if not isinstance(excl_stor, bool):
    raise errors.ProgrammerError("Exclusive storage parameter is not"
                                 " boolean: '%s'." % excl_stor)
  return excl_stor


def _GetLvmVgSpaceInfo(name, params):
  """Wrapper around C{_GetVgInfo} which checks the storage parameters.

  @type name: string
  @param name: name of the volume group
  @type params: list
  @param params: list of storage parameters, which in this case should
    contain only one entry, the exclusive storage flag

  """
  excl_stor = _CheckLvmStorageParams(params)
  return _GetVgInfo(name, excl_stor)
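
# Example (illustrative): LVM storage units take a single parameter, the
# exclusive-storage flag, so a call for a volume group named "xenvg" is:
#
#   _GetLvmVgSpaceInfo("xenvg", [False])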


def _GetVgInfo(
    name, excl_stor, info_fn=bdev.LogicalVolume.GetVGInfo):
  """Retrieves information about a LVM volume group.

  """
  # TODO: GetVGInfo supports returning information for multiple VGs at once
  vginfo = info_fn([name], excl_stor)
  if vginfo:
    vg_free = int(round(vginfo[0][0], 0))
    vg_size = int(round(vginfo[0][1], 0))
  else:
    vg_free = None
    vg_size = None

  return {
    "type": constants.ST_LVM_VG,
    "name": name,
    "storage_free": vg_free,
    "storage_size": vg_size,
    }


def _GetLvmPvSpaceInfo(name, params):
  """Wrapper around C{_GetVgSpindlesInfo} with sanity checks.

  @see: C{_GetLvmVgSpaceInfo}

  """
  excl_stor = _CheckLvmStorageParams(params)
  return _GetVgSpindlesInfo(name, excl_stor)


def _GetVgSpindlesInfo(
    name, excl_stor, info_fn=bdev.LogicalVolume.GetVgSpindlesInfo):
  """Retrieves information about spindles in an LVM volume group.

  @type name: string
  @param name: VG name
  @type excl_stor: bool
  @param excl_stor: exclusive storage
  @rtype: dict
  @return: dictionary with keys "type", "name", "storage_free" and
    "storage_size" for the storage type, VG name, free spindles and total
    spindles respectively

  """
  if excl_stor:
    (vg_free, vg_size) = info_fn(name)
  else:
    vg_free = 0
    vg_size = 0
  return {
    "type": constants.ST_LVM_PV,
    "name": name,
    "storage_free": vg_free,
    "storage_size": vg_size,
    }


def _GetHvInfo(name, hvparams, get_hv_fn=hypervisor.GetHypervisor):
  """Retrieves node information from a hypervisor.

  The information returned depends on the hypervisor. Common items:

    - vg_size is the size of the configured volume group in MiB
    - vg_free is the free size of the volume group in MiB
    - memory_dom0 is the memory allocated for domain0 in MiB
    - memory_free is the currently available (free) RAM in MiB
    - memory_total is the total amount of RAM in MiB
    - hv_version: the hypervisor version, if available

  @type hvparams: dict of string
  @param hvparams: the hypervisor's hvparams

  """
  return get_hv_fn(name).GetNodeInfo(hvparams=hvparams)


def _GetHvInfoAll(hv_specs, get_hv_fn=hypervisor.GetHypervisor):
  """Retrieves node information for all hypervisors.

  See C{_GetHvInfo} for information on the output.

  @type hv_specs: list of pairs (string, dict of strings)
  @param hv_specs: list of pairs of a hypervisor's name and its hvparams

  """
  if hv_specs is None:
    return None

  result = []
  for hvname, hvparams in hv_specs:
    result.append(_GetHvInfo(hvname, hvparams, get_hv_fn))
  return result


def _GetNamedNodeInfo(names, fn):
  """Calls C{fn} for all names in C{names} and returns a list of the results.

  @rtype: None or list

  """
  if names is None:
    return None
  else:
    return map(fn, names)


def GetNodeInfo(storage_units, hv_specs):
  """Gives back a hash with different information about the node.

  @type storage_units: list of tuples (string, string, list)
  @param storage_units: List of tuples (storage unit, identifier, parameters)
    to ask for disk space information. In case of lvm-vg, the identifier is
    the VG name. The parameters can contain additional, storage-type-specific
    parameters, for example exclusive storage for lvm storage.
  @type hv_specs: list of pairs (string, dict of strings)
  @param hv_specs: list of pairs of a hypervisor's name and its hvparams
  @rtype: tuple; (string, None/list, None/list)
  @return: Tuple containing boot ID, storage unit information and hypervisor
    information

  """
  bootid = utils.ReadFile(_BOOT_ID_PATH, size=128).rstrip("\n")
  storage_info = _GetNamedNodeInfo(
    storage_units,
    (lambda (storage_type, storage_key, storage_params):
        _ApplyStorageInfoFunction(storage_type, storage_key, storage_params)))
  hv_info = _GetHvInfoAll(hv_specs)
  return (bootid, storage_info, hv_info)
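
# Sketch of a GetNodeInfo call (VG name and hypervisor choice are
# illustrative only):
#
#   storage_units = [(constants.ST_LVM_VG, "xenvg", [True]),
#                    (constants.ST_LVM_PV, "xenvg", [True])]
#   hv_specs = [(constants.HT_XEN_PVM, {})]
#   (bootid, storage_info, hv_info) = GetNodeInfo(storage_units, hv_specs)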


def _GetFileStorageSpaceInfo(path, params):
  """Wrapper around filestorage.GetSpaceInfo.

  The purpose of this wrapper is to call filestorage.GetFileStorageSpaceInfo
  and ignore the *args parameter to not leak it into the filestorage
  module's code.

  @see: C{filestorage.GetFileStorageSpaceInfo} for description of the
    parameters.

  """
  _CheckStorageParams(params, 0)
  return filestorage.GetFileStorageSpaceInfo(path)


# FIXME: implement storage reporting for all missing storage types.
_STORAGE_TYPE_INFO_FN = {
  constants.ST_BLOCK: None,
  constants.ST_DISKLESS: None,
  constants.ST_EXT: None,
  constants.ST_FILE: _GetFileStorageSpaceInfo,
  constants.ST_LVM_PV: _GetLvmPvSpaceInfo,
  constants.ST_LVM_VG: _GetLvmVgSpaceInfo,
  constants.ST_SHARED_FILE: None,
  constants.ST_GLUSTER: None,
  constants.ST_RADOS: None,
  }


def _ApplyStorageInfoFunction(storage_type, storage_key, *args):
  """Looks up and applies the correct function to calculate free and total
  storage for the given storage type.

  @type storage_type: string
  @param storage_type: the storage type for which the storage shall be
    reported.
  @type storage_key: string
  @param storage_key: identifier of a storage unit, e.g. the volume group name
    of an LVM storage unit
  @type args: any
  @param args: various parameters that can be used for storage reporting.
    These parameters and their semantics vary from storage type to storage
    type and are just propagated in this function.
  @return: the results of the application of the storage space function (see
    _STORAGE_TYPE_INFO_FN) if storage space reporting is implemented for that
    storage type
  @raises NotImplementedError: for storage types that don't support space
    reporting yet

  """
  fn = _STORAGE_TYPE_INFO_FN[storage_type]
  if fn is not None:
    return fn(storage_key, *args)
  else:
    raise NotImplementedError
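
# Dispatch example (illustrative): for an LVM VG this resolves to
# _GetLvmVgSpaceInfo, while types with no reporting function raise
# NotImplementedError:
#
#   _ApplyStorageInfoFunction(constants.ST_LVM_VG, "xenvg", [False])
#   _ApplyStorageInfoFunction(constants.ST_BLOCK, "sda")   # raises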


def _CheckExclusivePvs(pvi_list):
  """Check that PVs are not shared among LVs

  @type pvi_list: list of L{objects.LvmPvInfo} objects
  @param pvi_list: information about the PVs

  @rtype: list of tuples (string, list of strings)
  @return: offending volumes, as tuples: (pv_name, [lv1_name, lv2_name...])

  """
  res = []
  for pvi in pvi_list:
    if len(pvi.lv_list) > 1:
      res.append((pvi.name, pvi.lv_list))
  return res


def _VerifyHypervisors(what, vm_capable, result, all_hvparams,
                       get_hv_fn=hypervisor.GetHypervisor):
  """Verifies the hypervisor. Appends the results to the 'results' list.

  @type what: C{dict}
  @param what: a dictionary of things to check
  @type vm_capable: boolean
  @param vm_capable: whether or not this node is vm capable
  @type result: dict
  @param result: dictionary of verification results; results of the
    verifications in this function will be added here
  @type all_hvparams: dict of dict of string
  @param all_hvparams: dictionary mapping hypervisor names to hvparams
  @type get_hv_fn: function
  @param get_hv_fn: function to retrieve the hypervisor, to improve testability

  """
  if not vm_capable:
    return

  if constants.NV_HYPERVISOR in what:
    result[constants.NV_HYPERVISOR] = {}
    for hv_name in what[constants.NV_HYPERVISOR]:
      hvparams = all_hvparams[hv_name]
      try:
        val = get_hv_fn(hv_name).Verify(hvparams=hvparams)
      except errors.HypervisorError, err:
        val = "Error while checking hypervisor: %s" % str(err)
      result[constants.NV_HYPERVISOR][hv_name] = val


def _VerifyHvparams(what, vm_capable, result,
                    get_hv_fn=hypervisor.GetHypervisor):
  """Verifies the hvparams. Appends the results to the 'results' list.

  @type what: C{dict}
  @param what: a dictionary of things to check
  @type vm_capable: boolean
  @param vm_capable: whether or not this node is vm capable
  @type result: dict
  @param result: dictionary of verification results; results of the
    verifications in this function will be added here
  @type get_hv_fn: function
  @param get_hv_fn: function to retrieve the hypervisor, to improve testability

  """
  if not vm_capable:
    return

  if constants.NV_HVPARAMS in what:
    result[constants.NV_HVPARAMS] = []
    for source, hv_name, hvparms in what[constants.NV_HVPARAMS]:
      try:
        logging.info("Validating hv %s, %s", hv_name, hvparms)
        get_hv_fn(hv_name).ValidateParameters(hvparms)
      except errors.HypervisorError, err:
        result[constants.NV_HVPARAMS].append((source, hv_name, str(err)))


def _VerifyInstanceList(what, vm_capable, result, all_hvparams):
  """Verifies the instance list.

  @type what: C{dict}
  @param what: a dictionary of things to check
  @type vm_capable: boolean
  @param vm_capable: whether or not this node is vm capable
  @type result: dict
  @param result: dictionary of verification results; results of the
    verifications in this function will be added here
  @type all_hvparams: dict of dict of string
  @param all_hvparams: dictionary mapping hypervisor names to hvparams

  """
  if constants.NV_INSTANCELIST in what and vm_capable:
    # GetInstanceList can fail
    try:
      val = GetInstanceList(what[constants.NV_INSTANCELIST],
                            all_hvparams=all_hvparams)
    except RPCFail, err:
      val = str(err)
    result[constants.NV_INSTANCELIST] = val


def _VerifyNodeInfo(what, vm_capable, result, all_hvparams):
  """Verifies the node info.

  @type what: C{dict}
  @param what: a dictionary of things to check
  @type vm_capable: boolean
  @param vm_capable: whether or not this node is vm capable
  @type result: dict
  @param result: dictionary of verification results; results of the
    verifications in this function will be added here
  @type all_hvparams: dict of dict of string
  @param all_hvparams: dictionary mapping hypervisor names to hvparams

  """
  if constants.NV_HVINFO in what and vm_capable:
    hvname = what[constants.NV_HVINFO]
    hyper = hypervisor.GetHypervisor(hvname)
    hvparams = all_hvparams[hvname]
    result[constants.NV_HVINFO] = hyper.GetNodeInfo(hvparams=hvparams)


def _VerifyClientCertificate(cert_file=pathutils.NODED_CLIENT_CERT_FILE):
  """Verify the existence and validity of the client SSL certificate.

  Also, verify that the client certificate is not self-signed. Self-
  signed client certificates stem from Ganeti versions 2.12.0 - 2.12.4
  and should be replaced by client certificates signed by the server
  certificate. Hence we output a warning when we encounter a self-signed
  one.

  """
  create_cert_cmd = "gnt-cluster renew-crypto --new-node-certificates"
  if not os.path.exists(cert_file):
    return (constants.CV_ERROR,
            "The client certificate does not exist. Run '%s' to create"
            " client certificates for all nodes." % create_cert_cmd)

  (errcode, msg) = utils.VerifyCertificate(cert_file)
  if errcode is not None:
    return (errcode, msg)

  (errcode, msg) = utils.IsCertificateSelfSigned(cert_file)
  if errcode is not None:
    return (errcode, msg)

  # if everything is fine, we return the digest to be compared to the config
  return (None, utils.GetCertificateDigest(cert_filename=cert_file))


def _VerifySshSetup(node_status_list, my_name, ssh_key_type,
                    ganeti_pub_keys_file=pathutils.SSH_PUB_KEYS):
  """Verifies the state of the SSH key files.

  @type node_status_list: list of tuples
  @param node_status_list: list of nodes of the cluster associated with a
    couple of flags: (uuid, name, is_master_candidate,
    is_potential_master_candidate, online)
  @type my_name: str
  @param my_name: name of this node
  @type ssh_key_type: one of L{constants.SSHK_ALL}
  @param ssh_key_type: type of key used on nodes
  @type ganeti_pub_keys_file: str
  @param ganeti_pub_keys_file: filename of the public keys file

  """
  if node_status_list is None:
    return ["No node list to check against the pub_key_file received."]

  my_status_list = [(my_uuid, name, mc, pot_mc, online) for
                    (my_uuid, name, mc, pot_mc, online)
                    in node_status_list if name == my_name]
  if len(my_status_list) == 0:
    return ["Cannot find node information for node '%s'." % my_name]
  (my_uuid, _, _, potential_master_candidate, online) = \
     my_status_list[0]

  result = []

  if not os.path.exists(ganeti_pub_keys_file):
    result.append("The public key file '%s' does not exist. Consider running"
                  " 'gnt-cluster renew-crypto --new-ssh-keys"
                  " [--no-ssh-key-check]' to fix this." % ganeti_pub_keys_file)
    return result

  pot_mc_uuids = [uuid for (uuid, _, _, _, _) in node_status_list]
  offline_nodes = [uuid for (uuid, _, _, _, online) in node_status_list
                   if not online]
  pub_keys = ssh.QueryPubKeyFile(None, key_file=ganeti_pub_keys_file)

  # initialized here, as it is consulted below for all nodes, not only for
  # potential master candidates
  missing_uuids = set([])

  if potential_master_candidate:
    # Check that the set of potential master candidates matches the
    # public key file
    pub_uuids_set = set(pub_keys.keys()) - set(offline_nodes)
    pot_mc_uuids_set = set(pot_mc_uuids) - set(offline_nodes)
    if pub_uuids_set != pot_mc_uuids_set:
      unknown_uuids = pub_uuids_set - pot_mc_uuids_set
      if unknown_uuids:
        result.append("The following node UUIDs are listed in the public key"
                      " file on node '%s', but are not potential master"
                      " candidates: %s."
                      % (my_name, ", ".join(list(unknown_uuids))))
      missing_uuids = pot_mc_uuids_set - pub_uuids_set
      if missing_uuids:
        result.append("The following node UUIDs of potential master candidates"
                      " are missing in the public key file on node %s: %s."
                      % (my_name, ", ".join(list(missing_uuids))))

    (_, key_files) = \
      ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False,
                          dircheck=False)
    (_, node_pub_key_file) = key_files[ssh_key_type]

    my_keys = pub_keys[my_uuid]

    node_pub_key = utils.ReadFile(node_pub_key_file)
    if node_pub_key.strip() not in my_keys:
      result.append("The SSH key of node %s does not match this node's key"
                    " in the pub key file." % my_name)
    if len(my_keys) != 1:
      result.append("There is more than one key for node %s in the public key"
                    " file." % my_name)
  else:
    if len(pub_keys.keys()) > 0:
      result.append("The public key file of node '%s' is not empty, although"
                    " the node is not a potential master candidate."
                    % my_name)

  # Check that all master candidate keys are in the authorized_keys file
  (auth_key_file, _) = \
    ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
  for (uuid, name, mc, _, online) in node_status_list:
    if not online:
      continue
    if uuid in missing_uuids:
      continue
    if mc:
      for key in pub_keys[uuid]:
        if not ssh.HasAuthorizedKey(auth_key_file, key):
          result.append("An SSH key of master candidate '%s' (UUID: '%s') is"
                        " not in the 'authorized_keys' file of node '%s'."
                        % (name, uuid, my_name))
    else:
      for key in pub_keys[uuid]:
        if name != my_name and ssh.HasAuthorizedKey(auth_key_file, key):
          result.append("An SSH key of normal node '%s' (UUID: '%s') is in"
                        " the 'authorized_keys' file of node '%s'."
                        % (name, uuid, my_name))
        if name == my_name and not ssh.HasAuthorizedKey(auth_key_file, key):
          result.append("An SSH key of normal node '%s' (UUID: '%s') is not"
                        " in the 'authorized_keys' file of itself."
                        % (my_name, uuid))

  return result


def _VerifySshClutter(node_status_list, my_name):
  """Verifies that the 'authorized_keys' files are not cluttered up.

  @type node_status_list: list of tuples
  @param node_status_list: list of nodes of the cluster associated with a
    couple of flags: (uuid, name, is_master_candidate,
    is_potential_master_candidate, online)
  @type my_name: str
  @param my_name: name of this node

  """
  result = []
  (auth_key_file, _) = \
    ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
  node_names = [name for (_, name, _, _, _) in node_status_list]
  multiple_occurrences = ssh.CheckForMultipleKeys(auth_key_file, node_names)
  if multiple_occurrences:
    msg = "There are hosts which have more than one SSH key stored for the" \
          " same user in the 'authorized_keys' file of node %s. This can be" \
          " due to an unsuccessful operation which cluttered up the" \
          " 'authorized_keys' file. We recommend cleaning this up manually. " \
          % my_name
    for host, occ in multiple_occurrences.items():
      msg += "Entry for '%s' in lines %s. " % (host, utils.CommaJoin(occ))
    result.append(msg)

  return result


def VerifyNodeNetTest(my_name, test_config):
  """Verify nodes are reachable.

  @type my_name: string
  @param my_name: name of the node this test is running on

  @type test_config: tuple (node_list, master_candidate_list)
  @param test_config: configuration for test as passed from
    LUClusterVerify() in what[constants.NV_NODENETTEST]

  @rtype: dict
  @return: a dictionary with node names as keys and error messages
    as values

  """
  result = {}
  nodes, master_candidates = test_config
  port = netutils.GetDaemonPort(constants.NODED)

  if my_name not in master_candidates:
    return result

  my_pip, my_sip = next(
    ((pip, sip) for name, pip, sip in nodes if name == my_name),
    (None, None)
  )
  if not my_pip:
    result[my_name] = ("Can't find my own primary/secondary IP"
                       " in the node list")
    return result

  for name, pip, sip in nodes:
    fail = []
    if not netutils.TcpPing(pip, port, source=my_pip):
      fail.append("primary")
    if sip != pip:
      if not netutils.TcpPing(sip, port, source=my_sip):
        fail.append("secondary")
    if fail:
      result[name] = ("failure using the %s interface(s)" %
                      " and ".join(fail))
  return result
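
# Shape of the test_config argument (values are illustrative): a list of
# (name, primary_ip, secondary_ip) triples plus the master candidate names:
#
#   nodes = [("node1.example.com", "192.0.2.10", "198.51.100.10"),
#            ("node2.example.com", "192.0.2.11", "198.51.100.11")]
#   VerifyNodeNetTest("node1.example.com", (nodes, ["node1.example.com"]))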


def VerifyMasterIP(my_name, test_config):
  """Verify master IP is reachable.

  @type my_name: string
  @param my_name: name of the node this test is running on

  @type test_config: tuple (master_name, master_ip, master_candidates)
  @param test_config: configuration for test as passed from
    LUClusterVerify() in what[constants.NV_MASTERIP]

  @rtype: bool or None
  @return: Boolean test result, None if skipped

  """
  master_name, master_ip, master_candidates = test_config
  port = netutils.GetDaemonPort(constants.NODED)
  if my_name not in master_candidates:
    return None

  if master_name == my_name:
    source = constants.IP4_ADDRESS_LOCALHOST
  else:
    source = None
  return netutils.TcpPing(master_ip, port, source=source)


def VerifyNode(what, cluster_name, all_hvparams):
  """Verify the status of the local node.

  Based on the input L{what} parameter, various checks are done on the
  local node.

  If the I{filelist} key is present, this list of
  files is checksummed and the file/checksum pairs are returned.

  If the I{nodelist} key is present, we check that we have
  connectivity via ssh with the target nodes (and check the hostname
  report).

  If the I{node-net-test} key is present, we check that we have
  connectivity to the given nodes via both primary IP and, if
  applicable, secondary IPs.

  @type what: C{dict}
  @param what: a dictionary of things to check:
      - filelist: list of files for which to compute checksums
      - nodelist: list of nodes we should check ssh communication with
      - node-net-test: list of nodes we should check node daemon port
        connectivity with
      - hypervisor: list with hypervisors to run the verify for
  @type cluster_name: string
  @param cluster_name: the cluster's name
  @type all_hvparams: dict of dict of strings
  @param all_hvparams: a dictionary mapping hypervisor names to hvparams
  @rtype: dict
  @return: a dictionary with the same keys as the input dict, and
      values representing the result of the checks

  """
  result = {}
  my_name = netutils.Hostname.GetSysName()
  vm_capable = my_name not in what.get(constants.NV_NONVMNODES, [])

  _VerifyHypervisors(what, vm_capable, result, all_hvparams)
  _VerifyHvparams(what, vm_capable, result)

  if constants.NV_FILELIST in what:
    fingerprints = utils.FingerprintFiles(map(vcluster.LocalizeVirtualPath,
                                              what[constants.NV_FILELIST]))
    result[constants.NV_FILELIST] = \
      dict((vcluster.MakeVirtualPath(key), value)
           for (key, value) in fingerprints.items())

  if constants.NV_CLIENT_CERT in what:
    result[constants.NV_CLIENT_CERT] = _VerifyClientCertificate()

  if constants.NV_SSH_SETUP in what:
    node_status_list, key_type = what[constants.NV_SSH_SETUP]
    result[constants.NV_SSH_SETUP] = \
      _VerifySshSetup(node_status_list, my_name, key_type)
    if constants.NV_SSH_CLUTTER in what:
      result[constants.NV_SSH_CLUTTER] = \
        _VerifySshClutter(node_status_list, my_name)

  if constants.NV_NODELIST in what:
    (nodes, bynode, mcs) = what[constants.NV_NODELIST]

    # Add nodes from other groups (different for each node)
    try:
      nodes.extend(bynode[my_name])
    except KeyError:
      pass

    # Use a random order
    random.shuffle(nodes)

    # Try to contact all nodes
    val = {}
    ssh_port_map = ssconf.SimpleStore().GetSshPortMap()
    for node in nodes:
      # We only test if master candidates can communicate to other nodes.
      # We cannot test if normal nodes cannot communicate with other nodes,
      # because the administrator might have installed additional SSH keys,
      # over which Ganeti has no power.
      if my_name in mcs:
        success, message = _GetSshRunner(cluster_name). \
                              VerifyNodeHostname(node, ssh_port_map[node])
        if not success:
          val[node] = message

    result[constants.NV_NODELIST] = val

  if constants.NV_NODENETTEST in what:
    result[constants.NV_NODENETTEST] = VerifyNodeNetTest(
      my_name, what[constants.NV_NODENETTEST])

  if constants.NV_MASTERIP in what:
    result[constants.NV_MASTERIP] = VerifyMasterIP(
      my_name, what[constants.NV_MASTERIP])

  if constants.NV_USERSCRIPTS in what:
    result[constants.NV_USERSCRIPTS] = \
      [script for script in what[constants.NV_USERSCRIPTS]
       if not utils.IsExecutable(script)]

  if constants.NV_OOB_PATHS in what:
    result[constants.NV_OOB_PATHS] = tmp = []
    for path in what[constants.NV_OOB_PATHS]:
      try:
        st = os.stat(path)
      except OSError, err:
        tmp.append("error calling stat on out of band helper: %s" % err)
      else:
        if stat.S_ISREG(st.st_mode):
          if stat.S_IMODE(st.st_mode) & stat.S_IXUSR:
            tmp.append(None)
          else:
            tmp.append("out of band helper %s is not executable" % path)
        else:
          tmp.append("out of band helper %s is not a file" % path)

  if constants.NV_LVLIST in what and vm_capable:
    try:
      val = GetVolumeList(utils.ListVolumeGroups().keys())
    except RPCFail, err:
      val = str(err)
    result[constants.NV_LVLIST] = val

  _VerifyInstanceList(what, vm_capable, result, all_hvparams)

  if constants.NV_VGLIST in what and vm_capable:
    result[constants.NV_VGLIST] = utils.ListVolumeGroups()

  if constants.NV_PVLIST in what and vm_capable:
    check_exclusive_pvs = constants.NV_EXCLUSIVEPVS in what
    val = bdev.LogicalVolume.GetPVInfo(what[constants.NV_PVLIST],
                                       filter_allocatable=False,
                                       include_lvs=check_exclusive_pvs)
    if check_exclusive_pvs:
      result[constants.NV_EXCLUSIVEPVS] = _CheckExclusivePvs(val)
      for pvi in val:
        # Avoid sending useless data on the wire
        pvi.lv_list = []
    result[constants.NV_PVLIST] = map(objects.LvmPvInfo.ToDict, val)

  if constants.NV_VERSION in what:
    result[constants.NV_VERSION] = (constants.PROTOCOL_VERSION,
                                    constants.RELEASE_VERSION)

  _VerifyNodeInfo(what, vm_capable, result, all_hvparams)

  if constants.NV_DRBDVERSION in what and vm_capable:
    try:
      drbd_version = DRBD8.GetProcInfo().GetVersionString()
    except errors.BlockDeviceError, err:
      logging.warning("Can't get DRBD version", exc_info=True)
      drbd_version = str(err)
    result[constants.NV_DRBDVERSION] = drbd_version

  if constants.NV_DRBDLIST in what and vm_capable:
    try:
      used_minors = drbd.DRBD8.GetUsedDevs()
    except errors.BlockDeviceError, err:
      logging.warning("Can't get used minors list", exc_info=True)
      used_minors = str(err)
    result[constants.NV_DRBDLIST] = used_minors

  if constants.NV_DRBDHELPER in what and vm_capable:
    status = True
    try:
      payload = drbd.DRBD8.GetUsermodeHelper()
    except errors.BlockDeviceError, err:
      logging.error("Can't get DRBD usermode helper: %s", str(err))
      status = False
      payload = str(err)
    result[constants.NV_DRBDHELPER] = (status, payload)

  if constants.NV_NODESETUP in what:
    result[constants.NV_NODESETUP] = tmpr = []
    if not os.path.isdir("/sys/block") or not os.path.isdir("/sys/class/net"):
      tmpr.append("The sysfs filesystem doesn't seem to be mounted"
                  " under /sys, missing required directories /sys/block"
                  " and /sys/class/net")
    if (not os.path.isdir("/proc/sys") or
        not os.path.isfile("/proc/sysrq-trigger")):
      tmpr.append("The procfs filesystem doesn't seem to be mounted"
                  " under /proc, missing required directory /proc/sys and"
                  " the file /proc/sysrq-trigger")

  if constants.NV_TIME in what:
    result[constants.NV_TIME] = utils.SplitTime(time.time())

  if constants.NV_OSLIST in what and vm_capable:
    result[constants.NV_OSLIST] = DiagnoseOS()

  if constants.NV_BRIDGES in what and vm_capable:
    result[constants.NV_BRIDGES] = [bridge
                                    for bridge in what[constants.NV_BRIDGES]
                                    if not utils.BridgeExists(bridge)]

  if what.get(constants.NV_ACCEPTED_STORAGE_PATHS) == my_name:
    result[constants.NV_ACCEPTED_STORAGE_PATHS] = \
      filestorage.ComputeWrongFileStoragePaths()

  if what.get(constants.NV_FILE_STORAGE_PATH):
    pathresult = filestorage.CheckFileStoragePath(
      what[constants.NV_FILE_STORAGE_PATH])
    if pathresult:
      result[constants.NV_FILE_STORAGE_PATH] = pathresult

  if what.get(constants.NV_SHARED_FILE_STORAGE_PATH):
    pathresult = filestorage.CheckFileStoragePath(
      what[constants.NV_SHARED_FILE_STORAGE_PATH])
    if pathresult:
      result[constants.NV_SHARED_FILE_STORAGE_PATH] = pathresult

  return result


def GetCryptoTokens(token_requests):
  """Perform actions on the node's cryptographic tokens.

  Token types can be 'ssl' or 'ssh'. So far only some actions are implemented
  for 'ssl'. Action 'get' returns the digest of the public client ssl
  certificate. Action 'create' creates a new client certificate and private
  key and also returns the digest of the certificate. The third parameter of
  a token request is a dictionary of optional parameters for the actions; so
  far only a filename is supported.

  @type token_requests: list of tuples of (string, string, dict), where the
    first string is in constants.CRYPTO_TYPES, the second in
    constants.CRYPTO_ACTIONS. The third parameter is a dictionary of string
    to string.
  @param token_requests: list of requests of cryptographic tokens and actions
    to perform on them. The actions come with a dictionary of options.
  @rtype: list of tuples (string, string)
  @return: list of tuples of the token type and the public crypto token

  """
  tokens = []
  for (token_type, action, _) in token_requests:
    if token_type not in constants.CRYPTO_TYPES:
      raise errors.ProgrammerError("Token type '%s' not supported." %
                                   token_type)
    if action not in constants.CRYPTO_ACTIONS:
      raise errors.ProgrammerError("Action '%s' is not supported." %
                                   action)
    if token_type == constants.CRYPTO_TYPE_SSL_DIGEST:
      tokens.append((token_type,
                     utils.GetCertificateDigest()))
  return tokens
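
# Example request (illustrative; assumes constants.CRYPTO_ACTION_GET is the
# 'get' action): fetch the digest of this node's client SSL certificate,
# with no extra options:
#
#   GetCryptoTokens([(constants.CRYPTO_TYPE_SSL_DIGEST,
#                     constants.CRYPTO_ACTION_GET, {})])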


def EnsureDaemon(daemon_name, run):
  """Ensures the given daemon is running or stopped.

  @type daemon_name: string
  @param daemon_name: name of the daemon (e.g., constants.KVMD)

  @type run: bool
  @param run: whether to start or stop the daemon

  @rtype: bool
  @return: 'True' if daemon successfully started/stopped,
    'False' otherwise

  """
  allowed_daemons = [constants.KVMD]

  if daemon_name not in allowed_daemons:
    fn = lambda _: False
  elif run:
    fn = utils.EnsureDaemon
  else:
    fn = utils.StopDaemon

  return fn(daemon_name)
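
# Example (illustrative): make sure the KVM daemon is up; daemons outside
# the allow-list above always yield False:
#
#   EnsureDaemon(constants.KVMD, True)    # start if not running
#   EnsureDaemon(constants.CONFD, True)   # -> False, not in allowed_daemons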


def _InitSshUpdateData(data, noded_cert_file, ssconf_store):
  (_, noded_cert) = \
    utils.ExtractX509Certificate(utils.ReadFile(noded_cert_file))
  data[constants.SSHS_NODE_DAEMON_CERTIFICATE] = noded_cert

  cluster_name = ssconf_store.GetClusterName()
  data[constants.SSHS_CLUSTER_NAME] = cluster_name


def AddNodeSshKey(node_uuid, node_name,
                  potential_master_candidates,
                  to_authorized_keys=False,
                  to_public_keys=False,
                  get_public_keys=False,
                  pub_key_file=pathutils.SSH_PUB_KEYS,
                  ssconf_store=None,
                  noded_cert_file=pathutils.NODED_CERT_FILE,
                  run_cmd_fn=ssh.RunSshCmdWithStdin,
                  ssh_update_debug=False,
                  ssh_update_verbose=False):
  """Distributes a node's public SSH key across the cluster.

  Note that this function should only be executed on the master node, which
  then will copy the new node's key to all nodes in the cluster via SSH.

  Also note: at least one of the flags C{to_authorized_keys},
  C{to_public_keys}, and C{get_public_keys} has to be set to C{True} for
  the function to actually perform any actions.

  @type node_uuid: str
  @param node_uuid: the UUID of the node whose key is added
  @type node_name: str
  @param node_name: the name of the node whose key is added
  @type potential_master_candidates: list of str
  @param potential_master_candidates: list of node names of potential master
    candidates; this should match the list of uuids in the public key file
  @type to_authorized_keys: boolean
  @param to_authorized_keys: whether the key should be added to the
    C{authorized_keys} file of all nodes
  @type to_public_keys: boolean
  @param to_public_keys: whether the keys should be added to the public key
    file
  @type get_public_keys: boolean
  @param get_public_keys: whether the node should add the cluster's public
    keys to its C{ganeti_pub_keys} file

  """
  node_list = [SshAddNodeInfo(name=node_name, uuid=node_uuid,
                              to_authorized_keys=to_authorized_keys,
                              to_public_keys=to_public_keys,
                              get_public_keys=get_public_keys)]
  return AddNodeSshKeyBulk(node_list,
                           potential_master_candidates,
                           pub_key_file=pub_key_file,
                           ssconf_store=ssconf_store,
                           noded_cert_file=noded_cert_file,
                           run_cmd_fn=run_cmd_fn,
                           ssh_update_debug=ssh_update_debug,
                           ssh_update_verbose=ssh_update_verbose)
1507
1508
1509 # Node info named tuple specifically for the use with AddNodeSshKeyBulk
1510 SshAddNodeInfo = collections.namedtuple(
1511 "SshAddNodeInfo",
1512 ["uuid",
1513 "name",
1514 "to_authorized_keys",
1515 "to_public_keys",
1516 "get_public_keys"])
1517
1518
1519 def AddNodeSshKeyBulk(node_list,
1520 potential_master_candidates,
1521 pub_key_file=pathutils.SSH_PUB_KEYS,
1522 ssconf_store=None,
1523 noded_cert_file=pathutils.NODED_CERT_FILE,
1524 run_cmd_fn=ssh.RunSshCmdWithStdin,
1525 ssh_update_debug=False,
1526 ssh_update_verbose=False):
1527 """Distributes a node's public SSH key across the cluster.
1528
1529 Note that this function should only be executed on the master node, which
1530 then will copy the new node's key to all nodes in the cluster via SSH.
1531
1532 Also note: at least one of the flags C{to_authorized_keys},
1533 C{to_public_keys}, and C{get_public_keys} has to be set to C{True} for
1534 the function to actually perform any actions.
1535
1536 @type node_list: list of SshAddNodeInfo tuples
1537 @param node_list: list of tuples containing the necessary node information for
1538 adding their keys
1539 @type potential_master_candidates: list of str
1540 @param potential_master_candidates: list of node names of potential master
1541 candidates; this should match the list of uuids in the public key file
1542
1543 """
1544 # whether there are any keys to be added or retrieved at all
1545 to_authorized_keys = any([node_info.to_authorized_keys for node_info in
1546 node_list])
1547 to_public_keys = any([node_info.to_public_keys for node_info in
1548 node_list])
1549
1550 if not ssconf_store:
1551 ssconf_store = ssconf.SimpleStore()
1552
1553 for node_info in node_list:
1554 # replacement not necessary for keys that are not supposed to be in the
1555 # list of public keys
1556 if not node_info.to_public_keys:
1557 continue
1558 # Check and fix sanity of key file
1559 keys_by_name = ssh.QueryPubKeyFile([node_info.name], key_file=pub_key_file)
1560 keys_by_uuid = ssh.QueryPubKeyFile([node_info.uuid], key_file=pub_key_file)
1561
1562 if (not keys_by_name or node_info.name not in keys_by_name) \
1563 and (not keys_by_uuid or node_info.uuid not in keys_by_uuid):
1564 raise errors.SshUpdateError(
1565 "No keys found for the new node '%s' (UUID %s) in the list of public"
1566 " SSH keys, neither for the name or the UUID" %
1567 (node_info.name, node_info.uuid))
1568 else:
1569 if node_info.name in keys_by_name:
1570 # Replace the name by UUID in the file as the name should only be used
1571 # temporarily
1572 ssh.ReplaceNameByUuid(node_info.uuid, node_info.name,
1573 error_fn=errors.SshUpdateError,
1574 key_file=pub_key_file)
1575
1576 # Retrieve updated map of UUIDs to keys
1577 keys_by_uuid = ssh.QueryPubKeyFile(
1578 [node_info.uuid for node_info in node_list], key_file=pub_key_file)
1579
1580 # Update the master node's key files
1581 (auth_key_file, _) = \
1582 ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
1583 for node_info in node_list:
1584 if node_info.to_authorized_keys:
1585 ssh.AddAuthorizedKeys(auth_key_file, keys_by_uuid[node_info.uuid])
1586
1587 base_data = {}
1588 _InitSshUpdateData(base_data, noded_cert_file, ssconf_store)
1589 cluster_name = base_data[constants.SSHS_CLUSTER_NAME]
1590
1591 ssh_port_map = ssconf_store.GetSshPortMap()
1592
1593 # Update the target nodes themselves
1594 for node_info in node_list:
1595 logging.debug("Updating SSH key files of target node '%s'.", node_info.name)
1596 if node_info.get_public_keys:
1597 node_data = {}
1598 _InitSshUpdateData(node_data, noded_cert_file, ssconf_store)
1599 all_keys = ssh.QueryPubKeyFile(None, key_file=pub_key_file)
1600 node_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1601 (constants.SSHS_OVERRIDE, all_keys)
1602
1603 try:
1604 backoff = 5 # seconds
1605 utils.RetryByNumberOfTimes(
1606 constants.SSHS_MAX_RETRIES, backoff,
1607 errors.SshUpdateError,
1608 run_cmd_fn, cluster_name, node_info.name, pathutils.SSH_UPDATE,
1609 ssh_port_map.get(node_info.name), node_data,
1610 debug=ssh_update_debug, verbose=ssh_update_verbose,
1611 use_cluster_key=False, ask_key=False, strict_host_check=False)
1612 except errors.SshUpdateError as e:
1613 # Clean up the master's public key file if adding key fails
1614 if node_info.to_public_keys:
1615 ssh.RemovePublicKey(node_info.uuid)
1616 raise e
1617
1618 # Update all nodes except master and the target nodes
1619 keys_by_uuid_auth = ssh.QueryPubKeyFile(
1620 [node_info.uuid for node_info in node_list
1621 if node_info.to_authorized_keys],
1622 key_file=pub_key_file)
1623 if to_authorized_keys:
1624 base_data[constants.SSHS_SSH_AUTHORIZED_KEYS] = \
1625 (constants.SSHS_ADD, keys_by_uuid_auth)
1626
1627 pot_mc_data = base_data.copy()
1628 keys_by_uuid_pub = ssh.QueryPubKeyFile(
1629 [node_info.uuid for node_info in node_list
1630 if node_info.to_public_keys],
1631 key_file=pub_key_file)
1632 if to_public_keys:
1633 pot_mc_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1634 (constants.SSHS_REPLACE_OR_ADD, keys_by_uuid_pub)
1635
1636 all_nodes = ssconf_store.GetNodeList()
1637 master_node = ssconf_store.GetMasterNode()
1638 online_nodes = ssconf_store.GetOnlineNodeList()
1639
1640 node_errors = []
1641 for node in all_nodes:
1642 if node == master_node:
1643 logging.debug("Skipping master node '%s'.", master_node)
1644 continue
1645 if node not in online_nodes:
1646 logging.debug("Skipping offline node '%s'.", node)
1647 continue
1648 if node in potential_master_candidates:
1649 logging.debug("Updating SSH key files of node '%s'.", node)
1650 try:
1651 backoff = 5 # seconds
1652 utils.RetryByNumberOfTimes(
1653 constants.SSHS_MAX_RETRIES, backoff, errors.SshUpdateError,
1654 run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
1655 ssh_port_map.get(node), pot_mc_data,
1656 debug=ssh_update_debug, verbose=ssh_update_verbose,
1657 use_cluster_key=False, ask_key=False, strict_host_check=False)
1658 except errors.SshUpdateError as last_exception:
1659 error_msg = ("When adding the keys of nodes '%s', updating SSH key"
1660 " files of node '%s' failed after %s retries."
1661 " Not trying again. Last error was: %s." %
1662 (", ".join([ni.name for ni in node_list]), node,
1663 constants.SSHS_MAX_RETRIES, last_exception))
1664 node_errors.append((node, error_msg))
1665 # We only log the error and don't throw an exception, because
1666 # one unreachable node shall not abort the entire procedure.
1667 logging.error(error_msg)
1668
1669 else:
1670 if to_authorized_keys:
1671 run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
1672 ssh_port_map.get(node), base_data,
1673 debug=ssh_update_debug, verbose=ssh_update_verbose,
1674 use_cluster_key=False, ask_key=False,
1675 strict_host_check=False)
1676
1677 return node_errors
1678
1679
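# The bulk SSH key update functions in this file all wrap 'run_cmd_fn' in
# the same retry idiom. Below is a minimal illustrative sketch of that
# pattern, factored into one place; the helper name '_ExampleRetrySshUpdate'
# is hypothetical and the 'utils.RetryByNumberOfTimes' argument order is
# assumed to match the call sites above.
def _ExampleRetrySshUpdate(run_cmd_fn, cluster_name, node_name, ssh_port,
                           data, ssh_update_debug=False,
                           ssh_update_verbose=False):
  """Illustrative sketch only: retried invocation of the ssh_update tool.

  Raises L{errors.SshUpdateError} once all retries are exhausted.

  """
  backoff = 5  # seconds between attempts, as in the call sites above
  utils.RetryByNumberOfTimes(
      constants.SSHS_MAX_RETRIES, backoff, errors.SshUpdateError,
      run_cmd_fn, cluster_name, node_name, pathutils.SSH_UPDATE,
      ssh_port, data,
      debug=ssh_update_debug, verbose=ssh_update_verbose,
      use_cluster_key=False, ask_key=False, strict_host_check=False)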
1680 # TODO: will be fixed with pending patch series.
1681 # pylint: disable=R0913
1682 def RemoveNodeSshKey(node_uuid, node_name,
1683 master_candidate_uuids,
1684 potential_master_candidates,
1685 master_uuid=None,
1686 keys_to_remove=None,
1687 from_authorized_keys=False,
1688 from_public_keys=False,
1689 clear_authorized_keys=False,
1690 clear_public_keys=False,
1691 pub_key_file=pathutils.SSH_PUB_KEYS,
1692 ssconf_store=None,
1693 noded_cert_file=pathutils.NODED_CERT_FILE,
1694 readd=False,
1695 run_cmd_fn=ssh.RunSshCmdWithStdin,
1696 ssh_update_debug=False,
1697 ssh_update_verbose=False):
1698 """Removes the node's SSH keys from the key files and distributes those.
1699
1700 Note that at least one of the flags C{from_authorized_keys},
1701 C{from_public_keys}, C{clear_authorized_keys}, and C{clear_public_keys}
1702 has to be set to C{True} for the function to perform any action at all.
1703 Not doing so will cause the function to raise an C{SshUpdateError}.
1704
1705 @type node_uuid: str
1706 @param node_uuid: UUID of the node whose key is removed
1707 @type node_name: str
1708 @param node_name: name of the node whose key is removed
1709 @type master_candidate_uuids: list of str
1710 @param master_candidate_uuids: list of UUIDs of the current master candidates
1711 @type potential_master_candidates: list of str
1712 @param potential_master_candidates: list of names of potential master
1713 candidates
1714 @type keys_to_remove: dict of str to list of str
1715 @param keys_to_remove: a dictionary mapping node UUIDs to lists of SSH keys
1716 to be removed. This list is supposed to be used only if the keys are not
1717 in the public keys file. This is for example the case when removing a
1718 master node's key.
1719 @type from_authorized_keys: boolean
1720 @param from_authorized_keys: whether or not the key should be removed
1721 from the C{authorized_keys} file
1722 @type from_public_keys: boolean
1723 @param from_public_keys: whether or not the key should be removed from
1724 the C{ganeti_pub_keys} file
1725 @type clear_authorized_keys: boolean
1726 @param clear_authorized_keys: whether or not the C{authorized_keys} file
1727 should be cleared on the node whose keys are removed
1728 @type clear_public_keys: boolean
1729 @param clear_public_keys: whether to clear the node's C{ganeti_pub_keys} file
1730 @type readd: boolean
1731 @param readd: whether this is called during a readd operation.
1732 @rtype: list of string
1733 @returns: list of feedback messages
1734
1735 """
1736 node_list = [SshRemoveNodeInfo(uuid=node_uuid,
1737 name=node_name,
1738 from_authorized_keys=from_authorized_keys,
1739 from_public_keys=from_public_keys,
1740 clear_authorized_keys=clear_authorized_keys,
1741 clear_public_keys=clear_public_keys)]
1742 return RemoveNodeSshKeyBulk(node_list,
1743 master_candidate_uuids,
1744 potential_master_candidates,
1745 master_uuid=master_uuid,
1746 keys_to_remove=keys_to_remove,
1747 pub_key_file=pub_key_file,
1748 ssconf_store=ssconf_store,
1749 noded_cert_file=noded_cert_file,
1750 readd=readd,
1751 run_cmd_fn=run_cmd_fn,
1752 ssh_update_debug=ssh_update_debug,
1753 ssh_update_verbose=ssh_update_verbose)
1754
1755
1756 # Node info named tuple specifically for the use with RemoveNodeSshKeyBulk
1757 SshRemoveNodeInfo = collections.namedtuple(
1758 "SshRemoveNodeInfo",
1759 ["uuid",
1760 "name",
1761 "from_authorized_keys",
1762 "from_public_keys",
1763 "clear_authorized_keys",
1764 "clear_public_keys"])
1765
1766
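# A hedged usage sketch for the tuple above: how a caller might assemble
# the 'node_list' argument of L{RemoveNodeSshKeyBulk} below. The UUID and
# name are invented for illustration.
def _ExampleBuildRemoveNodeList():
  """Illustrative sketch only: one entry per node whose keys go away."""
  return [
    SshRemoveNodeInfo(uuid="11111111-2222-3333-4444-555555555555",
                      name="node1.example.com",
                      from_authorized_keys=True,
                      from_public_keys=True,
                      clear_authorized_keys=False,
                      clear_public_keys=False),
    ]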
1767 def RemoveNodeSshKeyBulk(node_list,
1768 master_candidate_uuids,
1769 potential_master_candidates,
1770 master_uuid=None,
1771 keys_to_remove=None,
1772 pub_key_file=pathutils.SSH_PUB_KEYS,
1773 ssconf_store=None,
1774 noded_cert_file=pathutils.NODED_CERT_FILE,
1775 readd=False,
1776 run_cmd_fn=ssh.RunSshCmdWithStdin,
1777 ssh_update_debug=False,
1778 ssh_update_verbose=False):
1779 """Removes the node's SSH keys from the key files and distributes those.
1780
1781 Note that at least one of the flags C{from_authorized_keys},
1782 C{from_public_keys}, C{clear_authorized_keys}, and C{clear_public_keys}
1783 of at least one node has to be set to C{True} for the function to perform any
1784 action at all. Not doing so will cause an C{SshUpdateError} to be raised.
1785
1786 @type node_list: list of C{SshRemoveNodeInfo}.
1787 @param node_list: list of information about nodes whose keys are being removed
1788 @type master_candidate_uuids: list of str
1789 @param master_candidate_uuids: list of UUIDs of the current master candidates
1790 @type potential_master_candidates: list of str
1791 @param potential_master_candidates: list of names of potential master
1792 candidates
1793 @type keys_to_remove: dict of str to list of str
1794 @param keys_to_remove: a dictionary mapping node UUIDs to lists of SSH keys
1795 to be removed. This list is supposed to be used only if the keys are not
1796 in the public keys file. This is for example the case when removing a
1797 master node's key.
1798 @type readd: boolean
1799 @param readd: whether this is called during a readd operation.
1800 @rtype: list of string
1801 @returns: list of feedback messages
1802
1803 """
1804 # Non-disruptive error messages, list of (node, msg) pairs
1805 result_msgs = []
1806
1807 # whether there are any keys to be added or retrieved at all
1808 from_authorized_keys = any([node_info.from_authorized_keys for node_info in
1809 node_list])
1810 from_public_keys = any([node_info.from_public_keys for node_info in
1811 node_list])
1812 clear_authorized_keys = any([node_info.clear_authorized_keys for node_info in
1813 node_list])
1814 clear_public_keys = any([node_info.clear_public_keys for node_info in
1815 node_list])
1816
1817 # Make sure at least one of these flags is true.
1818 if not (from_authorized_keys or from_public_keys or clear_authorized_keys
1819 or clear_public_keys):
1820 raise errors.SshUpdateError("No removal from any key file was requested.")
1821
1822 if not ssconf_store:
1823 ssconf_store = ssconf.SimpleStore()
1824
1825 master_node = ssconf_store.GetMasterNode()
1826 ssh_port_map = ssconf_store.GetSshPortMap()
1827
1828 all_keys_to_remove = {}
1829 if from_authorized_keys or from_public_keys:
1830 for node_info in node_list:
1831 # Skip nodes that don't actually need any keys to be removed.
1832 if not (node_info.from_authorized_keys or node_info.from_public_keys):
1833 continue
1834 if node_info.name == master_node and not keys_to_remove:
1835 raise errors.SshUpdateError("Cannot remove the master node's keys.")
1836 if keys_to_remove:
1837 keys = keys_to_remove
1838 else:
1839 keys = ssh.QueryPubKeyFile([node_info.uuid], key_file=pub_key_file)
1840 if (not keys or node_info.uuid not in keys) and not readd:
1841 raise errors.SshUpdateError("Node '%s' not found in the list of"
1842 " public SSH keys. It seems someone"
1843 " tries to remove a key from outside"
1844 " the cluster!" % node_info.uuid)
1845 # During an upgrade all nodes have the master key. In this case we
1846 # should not remove it to avoid accidentally shutting down cluster
1847 # SSH communication
1848 master_keys = None
1849 if master_uuid:
1850 master_keys = ssh.QueryPubKeyFile([master_uuid],
1851 key_file=pub_key_file)
1852 for master_key in master_keys[master_uuid]:
1853 if master_key in keys[node_info.uuid]:
1854 keys[node_info.uuid].remove(master_key)
1855
1856 all_keys_to_remove.update(keys)
1857
1858 if all_keys_to_remove:
1859 base_data = {}
1860 _InitSshUpdateData(base_data, noded_cert_file, ssconf_store)
1861 cluster_name = base_data[constants.SSHS_CLUSTER_NAME]
1862
1863 if from_authorized_keys:
1864 # UUIDs of nodes that are supposed to be removed from the
1865 # authorized_keys files.
1866 nodes_remove_from_authorized_keys = [
1867 node_info.uuid for node_info in node_list
1868 if node_info.from_authorized_keys]
1869 keys_to_remove_from_authorized_keys = dict([
1870 (uuid, keys) for (uuid, keys) in all_keys_to_remove.items()
1871 if uuid in nodes_remove_from_authorized_keys])
1872 base_data[constants.SSHS_SSH_AUTHORIZED_KEYS] = \
1873 (constants.SSHS_REMOVE, keys_to_remove_from_authorized_keys)
1874 (auth_key_file, _) = \
1875 ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False,
1876 dircheck=False)
1877
1878 for uuid in nodes_remove_from_authorized_keys:
1879 ssh.RemoveAuthorizedKeys(auth_key_file,
1880 keys_to_remove_from_authorized_keys[uuid])
1881
1882 pot_mc_data = base_data.copy()
1883
1884 if from_public_keys:
1885 nodes_remove_from_public_keys = [
1886 node_info.uuid for node_info in node_list
1887 if node_info.from_public_keys]
1888 keys_to_remove_from_public_keys = dict([
1889 (uuid, keys) for (uuid, keys) in all_keys_to_remove.items()
1890 if uuid in nodes_remove_from_public_keys])
1891 pot_mc_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1892 (constants.SSHS_REMOVE, keys_to_remove_from_public_keys)
1893
1894 all_nodes = ssconf_store.GetNodeList()
1895 online_nodes = ssconf_store.GetOnlineNodeList()
1896 all_nodes_to_remove = [node_info.name for node_info in node_list]
1897 logging.debug("Removing keys of nodes '%s' from all nodes but itself and"
1898 " master.", ", ".join(all_nodes_to_remove))
1899 for node in all_nodes:
1900 if node == master_node:
1901 logging.debug("Skipping master node '%s'.", master_node)
1902 continue
1903 if node not in online_nodes:
1904 logging.debug("Skipping offline node '%s'.", node)
1905 continue
1906 if node in all_nodes_to_remove:
1907 logging.debug("Skipping node whose key is removed itself '%s'.", node)
1908 continue
1909 ssh_port = ssh_port_map.get(node)
1910 if not ssh_port:
1911 raise errors.OpExecError("No SSH port information available for"
1912 " node '%s', map: %s." %
1913 (node, ssh_port_map))
1914 error_msg_final = ("When removing the key of node '%s', updating the"
1915 " SSH key files of node '%s' failed. Last error"
1916 " was: %s.")
1917 if node in potential_master_candidates:
1918 logging.debug("Updating key setup of potential master candidate node"
1919 " %s.", node)
1920 try:
1921 backoff = 5 # seconds
1922 utils.RetryByNumberOfTimes(
1923 constants.SSHS_MAX_RETRIES, backoff, errors.SshUpdateError,
1924 run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
1925 ssh_port, pot_mc_data,
1926 debug=ssh_update_debug, verbose=ssh_update_verbose,
1927 use_cluster_key=False, ask_key=False, strict_host_check=False)
1928 except errors.SshUpdateError as last_exception:
1929 error_msg = error_msg_final % (
1930 ", ".join(all_nodes_to_remove), node, last_exception)
1931 result_msgs.append((node, error_msg))
1932 logging.error(error_msg)
1933
1934 else:
1935 if from_authorized_keys:
1936 logging.debug("Updating key setup of normal node %s.", node)
1937 try:
1938 backoff = 5 # seconds
1939 utils.RetryByNumberOfTimes(
1940 constants.SSHS_MAX_RETRIES, backoff, errors.SshUpdateError,
1941 run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
1942 ssh_port, base_data,
1943 debug=ssh_update_debug, verbose=ssh_update_verbose,
1944 use_cluster_key=False, ask_key=False, strict_host_check=False)
1945 except errors.SshUpdateError as last_exception:
1946 error_msg = error_msg_final % (
1947 ", ".join(all_nodes_to_remove), node, last_exception)
1948 result_msgs.append((node, error_msg))
1949 logging.error(error_msg)
1950
1951 for node_info in node_list:
1952 if node_info.clear_authorized_keys or node_info.from_public_keys or \
1953 node_info.clear_public_keys:
1954 data = {}
1955 _InitSshUpdateData(data, noded_cert_file, ssconf_store)
1956 cluster_name = data[constants.SSHS_CLUSTER_NAME]
1957 ssh_port = ssh_port_map.get(node_info.name)
1958 if not ssh_port:
1959 raise errors.OpExecError("No SSH port information available for"
1960 " node '%s', which is leaving the cluster.")
1961
1962 if node_info.clear_authorized_keys:
1963 # The 'authorized_keys' file is not solely managed by Ganeti. Therefore,
1964 # we have to specify exactly which keys to clear, so that keys not
1965 # added by Ganeti are left untouched.
1966 other_master_candidate_uuids = [uuid for uuid in master_candidate_uuids
1967 if uuid != node_info.uuid]
1968 candidate_keys = ssh.QueryPubKeyFile(other_master_candidate_uuids,
1969 key_file=pub_key_file)
1970 data[constants.SSHS_SSH_AUTHORIZED_KEYS] = \
1971 (constants.SSHS_REMOVE, candidate_keys)
1972
1973 if node_info.clear_public_keys:
1974 data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1975 (constants.SSHS_CLEAR, {})
1976 elif node_info.from_public_keys:
1977 # Since clearing the public keys subsumes removing just a single key,
1978 # we only remove individual keys if clear_public_keys is 'False'.
1979
1980 if all_keys_to_remove:
1981 data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1982 (constants.SSHS_REMOVE, all_keys_to_remove)
1983
1984 # If we have no changes to any keyfile for this node, skip to the next
1985 if not (constants.SSHS_SSH_PUBLIC_KEYS in data or
1986 constants.SSHS_SSH_AUTHORIZED_KEYS in data):
1987 continue
1988
1989 logging.debug("Updating SSH key setup of target node '%s'.",
1990 node_info.name)
1991 try:
1992 backoff = 5 # seconds
1993 utils.RetryByNumberOfTimes(
1994 constants.SSHS_MAX_RETRIES, backoff,
1995 errors.SshUpdateError,
1996 run_cmd_fn, cluster_name, node_info.name, pathutils.SSH_UPDATE,
1997 ssh_port, data,
1998 debug=ssh_update_debug, verbose=ssh_update_verbose,
1999 use_cluster_key=False, ask_key=False, strict_host_check=False)
2000 except errors.SshUpdateError as last_exception:
2001 result_msgs.append(
2002 (node_info.name,
2003 ("Removing SSH keys from node '%s' failed."
2004 " This can happen when the node is already unreachable."
2005 " Error: %s" % (node_info.name, last_exception))))
2006
2007 if all_keys_to_remove and from_public_keys:
2008 for node_uuid in nodes_remove_from_public_keys:
2009 ssh.RemovePublicKey(node_uuid, key_file=pub_key_file)
2010
2011 return result_msgs
2012 # pylint: enable=R0913
2013
2014
2015 def RemoveSshKeyFromPublicKeyFile(node_name,
2016 pub_key_file=pathutils.SSH_PUB_KEYS,
2017 ssconf_store=None):
2018 """Removes a SSH key from the master's public key file.
2019
2020 This operation is only used to clean up after failed operations
2021 (for example failed hooks before adding a node). To avoid abuse of this
2022 function (and the matching RPC call), a safety check makes sure that
2023 only stray keys belonging to nodes that are not (or no longer) part of
2024 the cluster can be removed.
2025
2026 @type node_name: string
2027 @param node_name: the name of the node whose key is removed
2028
2029 """
2030 if not ssconf_store:
2031 ssconf_store = ssconf.SimpleStore()
2032
2033 node_list = ssconf_store.GetNodeList()
2034
2035 if node_name in node_list:
2036 raise errors.SshUpdateError("Cannot remove key of node '%s',"
2037 " because it still belongs to the cluster."
2038 % node_name)
2039
2040 keys_by_name = ssh.QueryPubKeyFile([node_name], key_file=pub_key_file)
2041 if not keys_by_name or node_name not in keys_by_name:
2042 logging.info("The node '%s' whose key is supposed to be removed does not"
2043 " have an entry in the public key file. Hence, there is"
2044 " nothing left to do.", node_name)
2045
2046 ssh.RemovePublicKey(node_name, key_file=pub_key_file)
2047
2048
2049 def _GenerateNodeSshKey(node_name, ssh_port_map, ssh_key_type, ssh_key_bits,
2050 ssconf_store=None,
2051 noded_cert_file=pathutils.NODED_CERT_FILE,
2052 run_cmd_fn=ssh.RunSshCmdWithStdin,
2053 suffix="",
2054 ssh_update_debug=False,
2055 ssh_update_verbose=False):
2056 """Generates the root SSH key pair on the node.
2057
2058 @type node_name: str
2059 @param node_name: name of the node on which the key pair is generated
2060 @type ssh_port_map: dict of str to int
2061 @param ssh_port_map: mapping of node names to their SSH port
2062 @type ssh_key_type: One of L{constants.SSHK_ALL}
2063 @param ssh_key_type: the type of SSH key to be generated
2064 @type ssh_key_bits: int
2065 @param ssh_key_bits: the length of the key to be generated
2066
2067 """
2068 if not ssconf_store:
2069 ssconf_store = ssconf.SimpleStore()
2070
2071 data = {}
2072 _InitSshUpdateData(data, noded_cert_file, ssconf_store)
2073 cluster_name = data[constants.SSHS_CLUSTER_NAME]
2074 data[constants.SSHS_GENERATE] = (ssh_key_type, ssh_key_bits, suffix)
2075
2076 run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE,
2077 ssh_port_map.get(node_name), data,
2078 debug=ssh_update_debug, verbose=ssh_update_verbose,
2079 use_cluster_key=False, ask_key=False, strict_host_check=False)
2080
2081
2082 def _GetMasterNodeUUID(node_uuid_name_map, master_node_name):
2083 master_node_uuids = [node_uuid for (node_uuid, node_name)
2084 in node_uuid_name_map
2085 if node_name == master_node_name]
2086 if len(master_node_uuids) != 1:
2087 raise errors.SshUpdateError("No (unique) master UUID found. Master node"
2088 " name: '%s', Master UUID: '%s'" %
2089 (master_node_name, master_node_uuids))
2090 return master_node_uuids[0]
2091
2092
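# A minimal sketch of the input L{_GetMasterNodeUUID} expects; the UUIDs
# and names below are invented for illustration.
def _ExampleGetMasterNodeUuidUsage():
  """Illustrative sketch only."""
  node_uuid_name_map = [
    ("aaaaaaaa-0000", "node1.example.com"),
    ("bbbbbbbb-1111", "master.example.com"),
    ]
  # Returns "bbbbbbbb-1111"; raises SshUpdateError on zero or several hits.
  return _GetMasterNodeUUID(node_uuid_name_map, "master.example.com")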
2093 def _GetOldMasterKeys(master_node_uuid, pub_key_file):
2094 old_master_keys_by_uuid = ssh.QueryPubKeyFile([master_node_uuid],
2095 key_file=pub_key_file)
2096 if not old_master_keys_by_uuid:
2097 raise errors.SshUpdateError("No public key of the master node (UUID '%s')"
2098 " found, not generating a new key."
2099 % master_node_uuid)
2100 return old_master_keys_by_uuid
2101
2102
2103 def RenewSshKeys(node_uuids, node_names, master_candidate_uuids,
2104 potential_master_candidates, old_key_type, new_key_type,
2105 new_key_bits,
2106 ganeti_pub_keys_file=pathutils.SSH_PUB_KEYS,
2107 ssconf_store=None,
2108 noded_cert_file=pathutils.NODED_CERT_FILE,
2109 run_cmd_fn=ssh.RunSshCmdWithStdin,
2110 ssh_update_debug=False,
2111 ssh_update_verbose=False):
2112 """Renews all SSH keys and updates authorized_keys and ganeti_pub_keys.
2113
2114 @type node_uuids: list of str
2115 @param node_uuids: list of node UUIDs whose keys should be renewed
2116 @type node_names: list of str
2117 @param node_names: list of node names whose keys should be renewed. This list
2118 should match the C{node_uuids} parameter
2119 @type master_candidate_uuids: list of str
2120 @param master_candidate_uuids: list of UUIDs of master candidates or
2121 master node
2122 @type old_key_type: One of L{constants.SSHK_ALL}
2123 @param old_key_type: the type of SSH key already present on nodes
2124 @type new_key_type: One of L{constants.SSHK_ALL}
2125 @param new_key_type: the type of SSH key to be generated
2126 @type new_key_bits: int
2127 @param new_key_bits: the length of the key to be generated
2128 @type ganeti_pub_keys_file: str
2129 @param ganeti_pub_keys_file: file path of the public key file
2130 @type noded_cert_file: str
2131 @param noded_cert_file: path of the noded SSL certificate file
2132 @type run_cmd_fn: function
2133 @param run_cmd_fn: function to run commands on remote nodes via SSH
2134 @raises ProgrammerError: if node_uuids and node_names don't match;
2135 SshUpdateError if a node's key is missing from the public key file,
2136 if a node's new SSH key could not be fetched from it, or if there is
2137 no entry or more than one entry in the public key list for the master
2138 node.
2139
2140 """
2141 if not ssconf_store:
2142 ssconf_store = ssconf.SimpleStore()
2143 cluster_name = ssconf_store.GetClusterName()
2144
2145 if len(node_uuids) != len(node_names):
2146 raise errors.ProgrammerError("List of nodes UUIDs and node names"
2147 " does not match in length.")
2148
2149 old_pub_keyfile = ssh.GetSshPubKeyFilename(old_key_type)
2150 new_pub_keyfile = ssh.GetSshPubKeyFilename(new_key_type)
2151 old_master_key = ssh.ReadLocalSshPubKeys([old_key_type])
2152
2153 node_uuid_name_map = zip(node_uuids, node_names)
2154
2155 master_node_name = ssconf_store.GetMasterNode()
2156 master_node_uuid = _GetMasterNodeUUID(node_uuid_name_map, master_node_name)
2157 ssh_port_map = ssconf_store.GetSshPortMap()
2158 # List of all node errors that happened, but which did not abort the
2159 # procedure as a whole. It is important that this is a list to have a
2160 # somewhat chronological history of events.
2161 all_node_errors = []
2162
2163 # process non-master nodes
2164
2165 # keys to add in bulk at the end
2166 node_keys_to_add = []
2167
2168 # list of all nodes
2169 node_list = []
2170
2171 # list of keys to be removed before generating new keys
2172 node_info_to_remove = []
2173
2174 for node_uuid, node_name in node_uuid_name_map:
2175 if node_name == master_node_name:
2176 continue
2177 master_candidate = node_uuid in master_candidate_uuids
2178 potential_master_candidate = node_name in potential_master_candidates
2179 node_list.append((node_uuid, node_name, master_candidate,
2180 potential_master_candidate))
2181
2182 if master_candidate:
2183 logging.debug("Fetching old SSH key from node '%s'.", node_name)
2184 old_pub_key = ssh.ReadRemoteSshPubKey(old_pub_keyfile,
2185 node_name, cluster_name,
2186 ssh_port_map[node_name],
2187 False, # ask_key
2188 False) # key_check
2189 if old_pub_key != old_master_key:
2190 # If we are already in a multi-key setup (that is past Ganeti 2.12),
2191 # we can safely remove the old key of the node. Otherwise, we cannot
2192 # remove that node's key, because it is also the master node's key
2193 # and that would terminate all communication from the master to the
2194 # node.
2195 node_info_to_remove.append(SshRemoveNodeInfo(
2196 uuid=node_uuid,
2197 name=node_name,
2198 from_authorized_keys=master_candidate,
2199 from_public_keys=False,
2200 clear_authorized_keys=False,
2201 clear_public_keys=False))
2202 else:
2203 logging.debug("Old key of node '%s' is the same as the current master"
2204 " key. Not deleting that key on the node.", node_name)
2205
2206 logging.debug("Removing old SSH keys of all master candidates.")
2207 if node_info_to_remove:
2208 node_errors = RemoveNodeSshKeyBulk(
2209 node_info_to_remove,
2210 master_candidate_uuids,
2211 potential_master_candidates,
2212 master_uuid=master_node_uuid,
2213 pub_key_file=ganeti_pub_keys_file,
2214 ssconf_store=ssconf_store,
2215 noded_cert_file=noded_cert_file,
2216 run_cmd_fn=run_cmd_fn,
2217 ssh_update_debug=ssh_update_debug,
2218 ssh_update_verbose=ssh_update_verbose)
2219 if node_errors:
2220 all_node_errors = all_node_errors + node_errors
2221
2222 for (node_uuid, node_name, master_candidate, potential_master_candidate) \
2223 in node_list:
2224
2225 logging.debug("Generating new SSH key for node '%s'.", node_name)
2226 _GenerateNodeSshKey(node_name, ssh_port_map, new_key_type, new_key_bits,
2227 ssconf_store=ssconf_store,
2228 noded_cert_file=noded_cert_file,
2229 run_cmd_fn=run_cmd_fn,
2230 ssh_update_verbose=ssh_update_verbose,
2231 ssh_update_debug=ssh_update_debug)
2232
2233 try:
2234 logging.debug("Fetching newly created SSH key from node '%s'.", node_name)
2235 pub_key = ssh.ReadRemoteSshPubKey(new_pub_keyfile,
2236 node_name, cluster_name,
2237 ssh_port_map[node_name],
2238 False, # ask_key
2239 False) # key_check
2240 except Exception, err: # pylint: disable=W0703
2241 raise errors.SshUpdateError("Could not fetch key of node %s"
2242 " (UUID %s): %s" % (node_name, node_uuid, err))
2243
2244 if potential_master_candidate:
2245 ssh.RemovePublicKey(node_uuid, key_file=ganeti_pub_keys_file)
2246 ssh.AddPublicKey(node_uuid, pub_key, key_file=ganeti_pub_keys_file)
2247
2248 node_info = SshAddNodeInfo(name=node_name,
2249 uuid=node_uuid,
2250 to_authorized_keys=master_candidate,
2251 to_public_keys=potential_master_candidate,
2252 get_public_keys=True)
2253 node_keys_to_add.append(node_info)
2254
2255 node_errors = AddNodeSshKeyBulk(
2256 node_keys_to_add, potential_master_candidates,
2257 pub_key_file=ganeti_pub_keys_file, ssconf_store=ssconf_store,
2258 noded_cert_file=noded_cert_file,
2259 run_cmd_fn=run_cmd_fn,
2260 ssh_update_debug=ssh_update_debug,
2261 ssh_update_verbose=ssh_update_verbose)
2262 if node_errors:
2263 all_node_errors = all_node_errors + node_errors
2264
2265 # Renewing the master node's key
2266
2267 # Preserve the old keys for now
2268 old_master_keys_by_uuid = _GetOldMasterKeys(master_node_uuid,
2269 ganeti_pub_keys_file)
2270
2271 # Generate a new master key with a suffix, don't touch the old one for now
2272 logging.debug("Generate new ssh key of master.")
2273 _GenerateNodeSshKey(master_node_name, ssh_port_map,
2274 new_key_type, new_key_bits,
2275 ssconf_store=ssconf_store,
2276 noded_cert_file=noded_cert_file,
2277 run_cmd_fn=run_cmd_fn,
2278 suffix=constants.SSHS_MASTER_SUFFIX,
2279 ssh_update_debug=ssh_update_debug,
2280 ssh_update_verbose=ssh_update_verbose)
2281 # Read newly created master key
2282 new_master_keys = ssh.ReadLocalSshPubKeys(
2283 [new_key_type], suffix=constants.SSHS_MASTER_SUFFIX)
2284
2285 # Replace master key in the master node's public key file
2286 ssh.RemovePublicKey(master_node_uuid, key_file=ganeti_pub_keys_file)
2287 for pub_key in new_master_keys:
2288 ssh.AddPublicKey(master_node_uuid, pub_key, key_file=ganeti_pub_keys_file)
2289
2290 # Add new master key to all nodes' public and authorized keys
2291 logging.debug("Add new master key to all nodes.")
2292 node_errors = AddNodeSshKey(
2293 master_node_uuid, master_node_name, potential_master_candidates,
2294 to_authorized_keys=True, to_public_keys=True,
2295 get_public_keys=False, pub_key_file=ganeti_pub_keys_file,
2296 ssconf_store=ssconf_store, noded_cert_file=noded_cert_file,
2297 run_cmd_fn=run_cmd_fn,
2298 ssh_update_debug=ssh_update_debug,
2299 ssh_update_verbose=ssh_update_verbose)
2300 if node_errors:
2301 all_node_errors = all_node_errors + node_errors
2302
2303 # Remove the old key file and rename the new key to the non-temporary filename
2304 ssh.ReplaceSshKeys(new_key_type, new_key_type,
2305 src_key_suffix=constants.SSHS_MASTER_SUFFIX)
2306
2307 # Remove old key from authorized keys
2308 (auth_key_file, _) = \
2309 ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
2310 ssh.RemoveAuthorizedKeys(auth_key_file,
2311 old_master_keys_by_uuid[master_node_uuid])
2312
2313 # Remove the old key from all nodes' authorized keys files
2314 logging.debug("Remove the old master key from all nodes.")
2315 node_errors = RemoveNodeSshKey(
2316 master_node_uuid, master_node_name, master_candidate_uuids,
2317 potential_master_candidates,
2318 keys_to_remove=old_master_keys_by_uuid, from_authorized_keys=True,
2319 from_public_keys=False, clear_authorized_keys=False,
2320 clear_public_keys=False,
2321 pub_key_file=ganeti_pub_keys_file,
2322 ssconf_store=ssconf_store,
2323 noded_cert_file=noded_cert_file,
2324 run_cmd_fn=run_cmd_fn,
2325 ssh_update_debug=ssh_update_debug,
2326 ssh_update_verbose=ssh_update_verbose)
2327 if node_errors:
2328 all_node_errors = all_node_errors + node_errors
2329
2330 return all_node_errors
2331
2332
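# A hedged sketch of a L{RenewSshKeys} invocation for a two-node cluster;
# all identifiers below are invented for illustration, the defaults are
# used for the file/daemon related keyword arguments, and SSHK_RSA is
# assumed to be one of L{constants.SSHK_ALL}.
def _ExampleRenewSshKeysUsage():
  """Illustrative sketch only."""
  return RenewSshKeys(
      ["uuid-master", "uuid-node2"],                    # node_uuids
      ["master.example.com", "node2.example.com"],      # node_names
      ["uuid-master", "uuid-node2"],                    # master candidates
      ["master.example.com", "node2.example.com"],      # potential MCs
      constants.SSHK_RSA,                               # old_key_type
      constants.SSHK_RSA,                               # new_key_type
      2048)                                             # new_key_bits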
2333 def GetBlockDevSizes(devices):
2334 """Return the size of the given block devices
2335
2336 @type devices: list
2337 @param devices: list of block device nodes to query
2338 @rtype: dict
2339 @return:
2340 dictionary of all block devices under /dev (key). The value is their
2341 size in MiB.
2342
2343 {'/dev/disk/by-uuid/123456-12321231-312312-312': 124}
2344
2345 """
2346 DEV_PREFIX = "/dev/"
2347 blockdevs = {}
2348
2349 for devpath in devices:
2350 if not utils.IsBelowDir(DEV_PREFIX, devpath):
2351 continue
2352
2353 try:
2354 st = os.stat(devpath)
2355 except EnvironmentError, err:
2356 logging.warning("Error stat()'ing device %s: %s", devpath, str(err))
2357 continue
2358
2359 if stat.S_ISBLK(st.st_mode):
2360 result = utils.RunCmd(["blockdev", "--getsize64", devpath])
2361 if result.failed:
2362 # We don't want to fail, just do not list this device as available
2363 logging.warning("Cannot get size for block device %s", devpath)
2364 continue
2365
2366 size = int(result.stdout) / (1024 * 1024)
2367 blockdevs[devpath] = size
2368 return blockdevs
2369
2370
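# "blockdev --getsize64" reports bytes; GetBlockDevSizes above converts
# that to MiB. A minimal sketch of the conversion ('//' makes the integer
# division explicit; it matches the Python 2 '/' semantics used above):
def _ExampleBlockdevBytesToMib(stdout):
  """Illustrative sketch only: parse 'blockdev --getsize64' output."""
  return int(stdout) // (1024 * 1024)  # e.g. "10737418240\n" -> 10240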
2371 def GetVolumeList(vg_names):
2372 """Compute list of logical volumes and their size.
2373
2374 @type vg_names: list
2375 @param vg_names: the volume groups whose LVs we should list, or
2376 empty for all volume groups
2377 @rtype: dict
2378 @return:
2379 dictionary of all logical volumes (key) with value being a tuple of
2380 their size (in MiB), inactive and online status::
2381
2382 {'xenvg/test1': ('20.06', True, True)}
2383
2384 in case of errors, the function fails via L{_Fail} with the error
2385 details.
2386
2387 """
2388 lvs = {}
2389 sep = "|"
2390 if not vg_names:
2391 vg_names = []
2392 result = utils.RunCmd(["lvs", "--noheadings", "--units=m", "--nosuffix",
2393 "--separator=%s" % sep,
2394 "-ovg_name,lv_name,lv_size,lv_attr"] + vg_names)
2395 if result.failed:
2396 _Fail("Failed to list logical volumes, lvs output: %s", result.output)
2397
2398 for line in result.stdout.splitlines():
2399 line = line.strip()
2400 match = _LVSLINE_REGEX.match(line)
2401 if not match:
2402 logging.error("Invalid line returned from lvs output: '%s'", line)
2403 continue
2404 vg_name, name, size, attr = match.groups()
2405 inactive = attr[4] == "-"
2406 online = attr[5] == "o"
2407 virtual = attr[0] == "v"
2408 if virtual:
2409 # we don't want to report such volumes as existing, since they
2410 # don't really hold data
2411 continue
2412 lvs[vg_name + "/" + name] = (size, inactive, online)
2413
2414 return lvs
2415
2416
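# GetVolumeList above keys off single characters of the LVM "lv_attr"
# field. A hedged sketch of that decoding, assuming the attribute layout
# documented in lvs(8) (char 0: volume type, char 4: state, char 5:
# device open):
def _ExampleDecodeLvAttr(attr):
  """Illustrative sketch only: returns (virtual, inactive, online)."""
  virtual = attr[0] == "v"   # virtual volumes hold no data of their own
  inactive = attr[4] == "-"  # "a" would mean the LV is active
  online = attr[5] == "o"    # the device is currently open
  return (virtual, inactive, online)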
2417 def ListVolumeGroups():
2418 """List the volume groups and their size.
2419
2420 @rtype: dict
2421 @return: dictionary with keys volume name and values the
2422 size of the volume
2423
2424 """
2425 return utils.ListVolumeGroups()
2426
2427
2428 def NodeVolumes():
2429 """List all volumes on this node.
2430
2431 @rtype: list
2432 @return:
2433 A list of dictionaries, each having four keys:
2434 - name: the logical volume name,
2435 - size: the size of the logical volume (in MiB)
2436 - dev: the physical device on which the LV lives
2437 - vg: the volume group to which it belongs
2438
2439 In case of errors, we return an empty list and log the
2440 error.
2441
2442 Note that since a logical volume can live on multiple physical
2443 volumes, the resulting list might include a logical volume
2444 multiple times.
2445
2446 """
2447 result = utils.RunCmd(["lvs", "--noheadings", "--units=m", "--nosuffix",
2448 "--separator=|",
2449 "--options=lv_name,lv_size,devices,vg_name"])
2450 if result.failed:
2451 _Fail("Failed to list logical volumes, lvs output: %s",
2452 result.output)
2453
2454 def parse_dev(dev):
2455 return dev.split("(")[0]
2456
2457 def handle_dev(dev):
2458 return [parse_dev(x) for x in dev.split(",")]
2459
2460 def map_line(line):
2461 line = [v.strip() for v in line]
2462 return [{"name": line[0], "size": line[1],
2463 "dev": dev, "vg": line[3]} for dev in handle_dev(line[2])]
2464
2465 all_devs = []
2466 for line in result.stdout.splitlines():
2467 if line.count("|") >= 3:
2468 all_devs.extend(map_line(line.split("|")))
2469 else:
2470 logging.warning("Strange line in the output from lvs: '%s'", line)
2471 return all_devs
2472
2473
2474 def BridgesExist(bridges_list):
2475 """Check if a list of bridges exist on the current node.
2476
2477 @rtype: boolean
2478 @return: C{True} if all of them exist, C{False} otherwise
2479
2480 """
2481 missing = []
2482 for bridge in bridges_list:
2483 if not utils.BridgeExists(bridge):
2484 missing.append(bridge)
2485
2486 if missing:
2487 _Fail("Missing bridges %s", utils.CommaJoin(missing))
2488
2489
2490 def GetInstanceListForHypervisor(hname, hvparams=None,
2491 get_hv_fn=hypervisor.GetHypervisor):
2492 """Provides a list of instances of the given hypervisor.
2493
2494 @type hname: string
2495 @param hname: name of the hypervisor
2496 @type hvparams: dict of strings
2497 @param hvparams: hypervisor parameters for the given hypervisor
2498 @type get_hv_fn: function
2499 @param get_hv_fn: function that returns a hypervisor for the given hypervisor
2500 name; optional parameter to increase testability
2501
2502 @rtype: list
2503 @return: a list of all running instances on the current node
2504 - instance1.example.com
2505 - instance2.example.com
2506
2507 """
2508 try:
2509 return get_hv_fn(hname).ListInstances(hvparams=hvparams)
2510 except errors.HypervisorError, err:
2511 _Fail("Error enumerating instances (hypervisor %s): %s",
2512 hname, err, exc=True)
2513
2514
2515 def GetInstanceList(hypervisor_list, all_hvparams=None,
2516 get_hv_fn=hypervisor.GetHypervisor):
2517 """Provides a list of instances.
2518
2519 @type hypervisor_list: list
2520 @param hypervisor_list: the list of hypervisors to query information
2521 @type all_hvparams: dict of dict of strings
2522 @param all_hvparams: a dictionary mapping hypervisor types to respective
2523 cluster-wide hypervisor parameters
2524 @type get_hv_fn: function
2525 @param get_hv_fn: function that returns a hypervisor for the given hypervisor
2526 name; optional parameter to increase testability
2527
2528 @rtype: list
2529 @return: a list of all running instances on the current node
2530 - instance1.example.com
2531 - instance2.example.com
2532
2533 """
2534 results = []
2535 for hname in hypervisor_list:
2536 hvparams = all_hvparams[hname]
2537 results.extend(GetInstanceListForHypervisor(hname, hvparams=hvparams,
2538 get_hv_fn=get_hv_fn))
2539 return results
2540
2541
2542 def GetInstanceInfo(instance, hname, hvparams=None):
2543 """Gives back the information about an instance as a dictionary.
2544
2545 @type instance: string
2546 @param instance: the instance name
2547 @type hname: string
2548 @param hname: the hypervisor type of the instance
2549 @type hvparams: dict of strings
2550 @param hvparams: the instance's hvparams
2551
2552 @rtype: dict
2553 @return: dictionary with the following keys:
2554 - memory: memory size of instance (int)
2555 - state: state of instance (HvInstanceState)
2556 - time: cpu time of instance (float)
2557 - vcpus: the number of vcpus (int)
2558
2559 """
2560 output = {}
2561
2562 iinfo = hypervisor.GetHypervisor(hname).GetInstanceInfo(instance,
2563 hvparams=hvparams)
2564 if iinfo is not None:
2565 output["memory"] = iinfo[2]
2566 output["vcpus"] = iinfo[3]
2567 output["state"] = iinfo[4]
2568 output["time"] = iinfo[5]
2569
2570 return output
2571
2572
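# GetInstanceInfo above relies on the positional layout of the tuple the
# hypervisor returns. A sketch with an invented example tuple; only the
# positions used above (2..5) are assumed to be meaningful:
def _ExampleUnpackIinfo():
  """Illustrative sketch only."""
  #        name                    ?     mem  vcpus state time
  iinfo = ("instance1.example.com", None, 1024, 2,   0,   12.5)
  return {"memory": iinfo[2], "vcpus": iinfo[3],
          "state": iinfo[4], "time": iinfo[5]}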
2573 def GetInstanceMigratable(instance):
2574 """Computes whether an instance can be migrated.
2575
2576 @type instance: L{objects.Instance}
2577 @param instance: object representing the instance to be checked.
2578
2579 @rtype: None
2580 @return: None; fails via L{_Fail} if the instance is not running, and
2581 logs warnings for missing disk symlinks
2583
2584 """
2585 hyper = hypervisor.GetHypervisor(instance.hypervisor)
2586 iname = instance.name
2587 if iname not in hyper.ListInstances(hvparams=instance.hvparams):
2588 _Fail("Instance %s is not running", iname)
2589
2590 for idx in range(len(instance.disks_info)):
2591 link_name = _GetBlockDevSymlinkPath(iname, idx)
2592 if not os.path.islink(link_name):
2593 logging.warning("Instance %s is missing symlink %s for disk %d",
2594 iname, link_name, idx)
2595
2596
2597 def GetAllInstancesInfo(hypervisor_list, all_hvparams):
2598 """Gather data about all instances.
2599
2600 This is the equivalent of L{GetInstanceInfo}, except that it
2601 computes data for all instances at once, thus being faster if one
2602 needs data about more than one instance.
2603
2604 @type hypervisor_list: list
2605 @param hypervisor_list: list of hypervisors to query for instance data
2606 @type all_hvparams: dict of dict of strings
2607 @param all_hvparams: mapping of hypervisor names to hvparams
2608
2609 @rtype: dict
2610 @return: dictionary of instance: data, with data having the following keys:
2611 - memory: memory size of instance (int)
2612 - state: state of instance (HvInstanceState)
2613 - time: cpu time of instance (float)
2614 - vcpus: the number of vcpus
2615
2616 """
2617 output = {}
2618 for hname in hypervisor_list:
2619 hvparams = all_hvparams[hname]
2620 iinfo = hypervisor.GetHypervisor(hname).GetAllInstancesInfo(hvparams)
2621 if iinfo:
2622 for name, _, memory, vcpus, state, times in iinfo:
2623 value = {
2624 "memory": memory,
2625 "vcpus": vcpus,
2626 "state": state,
2627 "time": times,
2628 }
2629 if name in output:
2630 # we only check static parameters, like memory and vcpus,
2631 # and not state and time which can change between the
2632 # invocations of the different hypervisors
2633 for key in "memory", "vcpus":
2634 if value[key] != output[name][key]:
2635 _Fail("Instance %s is running twice"
2636 " with different parameters", name)
2637 output[name] = value
2638
2639 return output
2640
2641
2642 def GetInstanceConsoleInfo(instance_param_dict,
2643 get_hv_fn=hypervisor.GetHypervisor):
2644 """Gather data about the console access of a set of instances of this node.
2645
2646 This function assumes that the caller already knows which instances are on
2647 this node, by calling a function such as L{GetAllInstancesInfo} or
2648 L{GetInstanceList}.
2649
2650 For every instance, a large amount of configuration data needs to be
2651 provided to the hypervisor interface in order to receive the console
2652 information. Whether this could or should be cut down can be discussed.
2653 The information is provided in a dictionary indexed by instance name,
2654 allowing any number of instance queries to be done.
2655
2656 @type instance_param_dict: dict of string to tuple of dictionaries, where the
2657 dictionaries represent: L{objects.Instance}, L{objects.Node},
2658 L{objects.NodeGroup}, HvParams, BeParams
2659 @param instance_param_dict: mapping of instance name to parameters necessary
2660 for console information retrieval
2661
2662 @rtype: dict
2663 @return: dictionary of instance: data, with data having the following keys:
2664 - instance: instance name
2665 - kind: console kind
2666 - message: used with kind == CONS_MESSAGE, indicates console to be
2667 unavailable, supplies error message
2668 - host: host to connect to
2669 - port: port to use
2670 - user: user for login
2671 - command: the command, broken into parts as an array
2672 - display: unknown, potentially unused?
2673
2674 """
2675
2676 output = {}
2677 for inst_name in instance_param_dict:
2678 instance = instance_param_dict[inst_name]["instance"]
2679 pnode = instance_param_dict[inst_name]["node"]
2680 group = instance_param_dict[inst_name]["group"]
2681 hvparams = instance_param_dict[inst_name]["hvParams"]
2682 beparams = instance_param_dict[inst_name]["beParams"]
2683
2684 instance = objects.Instance.FromDict(instance)
2685 pnode = objects.Node.FromDict(pnode)
2686 group = objects.NodeGroup.FromDict(group)
2687
2688 h = get_hv_fn(instance.hypervisor)
2689 output[inst_name] = h.GetInstanceConsole(instance, pnode, group,
2690 hvparams, beparams).ToDict()
2691
2692 return output
2693
2694
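# A hedged sketch of the nested 'instance_param_dict' argument expected by
# L{GetInstanceConsoleInfo} above; 'inst', 'node' and 'group' stand for
# pre-built configuration objects and are invented for illustration.
def _ExampleBuildConsoleQuery(inst, node, group, hvparams, beparams):
  """Illustrative sketch only."""
  return {
    inst.name: {
      "instance": inst.ToDict(),
      "node": node.ToDict(),
      "group": group.ToDict(),
      "hvParams": hvparams,
      "beParams": beparams,
      },
    }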
2695 def _InstanceLogName(kind, os_name, instance, component):
2696 """Compute the OS log filename for a given instance and operation.
2697
2698 The instance name and os name are passed in as strings since not all
2699 operations have these as part of an instance object.
2700
2701 @type kind: string
2702 @param kind: the operation type (e.g. add, import, etc.)
2703 @type os_name: string
2704 @param os_name: the os name
2705 @type instance: string
2706 @param instance: the name of the instance being imported/added/etc.
2707 @type component: string or None
2708 @param component: the name of the component of the instance being
2709 transferred
2710
2711 """
2712 # TODO: Use tempfile.mkstemp to create unique filename
2713 if component:
2714 assert "/" not in component
2715 c_msg = "-%s" % component
2716 else:
2717 c_msg = ""
2718 base = ("%s-%s-%s%s-%s.log" %
2719 (kind, os_name, instance, c_msg, utils.TimestampForFilename()))
2720 return utils.PathJoin(pathutils.LOG_OS_DIR, base)
2721
2722
2723 def InstanceOsAdd(instance, reinstall, debug):
2724 """Add an OS to an instance.
2725
2726 @type instance: L{objects.Instance}
2727 @param instance: Instance whose OS is to be installed
2728 @type reinstall: boolean
2729 @param reinstall: whether this is an instance reinstall
2730 @type debug: integer
2731 @param debug: debug level, passed to the OS scripts
2732 @rtype: None
2733
2734 """
2735 inst_os = OSFromDisk(instance.os)
2736
2737 create_env = OSEnvironment(instance, inst_os, debug)
2738 if reinstall:
2739 create_env["INSTANCE_REINSTALL"] = "1"
2740
2741 logfile = _InstanceLogName("add", instance.os, instance.name, None)
2742
2743 result = utils.RunCmd([inst_os.create_script], env=create_env,
2744 cwd=inst_os.path, output=logfile, reset_env=True)
2745 if result.failed:
2746 logging.error("os create command '%s' returned error: %s, logfile: %s,"
2747 " output: %s", result.cmd, result.fail_reason, logfile,
2748 result.output)
2749 lines = [utils.SafeEncode(val)
2750 for val in utils.TailFile(logfile, lines=20)]
2751 _Fail("OS create script failed (%s), last lines in the"
2752 " log file:\n%s", result.fail_reason, "\n".join(lines), log=False)
2753
2754
2755 def RunRenameInstance(instance, old_name, debug):
2756 """Run the OS rename script for an instance.
2757
2758 @type instance: L{objects.Instance}
2759 @param instance: Instance whose OS is to be installed
2760 @type old_name: string
2761 @param old_name: previous instance name
2762 @type debug: integer
2763 @param debug: debug level, passed to the OS scripts
2764 @rtype: boolean
2765 @return: the success of the operation
2766
2767 """
2768 inst_os = OSFromDisk(instance.os)
2769
2770 rename_env = OSEnvironment(instance, inst_os, debug)
2771 rename_env["OLD_INSTANCE_NAME"] = old_name
2772
2773 logfile = _InstanceLogName("rename", instance.os,
2774 "%s-%s" % (old_name, instance.name), None)
2775
2776 result = utils.RunCmd([inst_os.rename_script], env=rename_env,
2777 cwd=inst_os.path, output=logfile, reset_env=True)
2778
2779 if result.failed:
2780 logging.error("os create command '%s' returned error: %s output: %s",
2781 result.cmd, result.fail_reason, result.output)
2782 lines = [utils.SafeEncode(val)
2783 for val in utils.TailFile(logfile, lines=20)]
2784 _Fail("OS rename script failed (%s), last lines in the"
2785 " log file:\n%s", result.fail_reason, "\n".join(lines), log=False)
2786
2787
2788 def _GetBlockDevSymlinkPath(instance_name, idx, _dir=None):
2789 """Returns symlink path for block device.
2790
2791 """
2792 if _dir is None:
2793 _dir = pathutils.DISK_LINKS_DIR
2794
2795 return utils.PathJoin(_dir,
2796 ("%s%s%s" %
2797 (instance_name, constants.DISK_SEPARATOR, idx)))
2798
2799
2800 def _SymlinkBlockDev(instance_name, device_path, idx):
2801 """Set up symlinks to a instance's block device.
2802
2803 This is an auxiliary function run when an instance is start (on the primary
2804 node) or when an instance is migrated (on the target node).
2805
2806
2807 @param instance_name: the name of the target instance
2808 @param device_path: path of the physical block device, on the node
2809 @param idx: the disk index
2810 @return: absolute path to the disk's symlink
2811
2812 """
2813 # In case we have only a userspace access URI, device_path is None
2814 if not device_path:
2815 return None
2816
2817 link_name = _GetBlockDevSymlinkPath(instance_name, idx)
2818 try:
2819 os.symlink(device_path, link_name)
2820 except OSError, err:
2821 if err.errno == errno.EEXIST:
2822 if (not os.path.islink(link_name) or
2823 os.readlink(link_name) != device_path):
2824 os.remove(link_name)
2825 os.symlink(device_path, link_name)
2826 else:
2827 raise
2828
2829 return link_name
2830
2831
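# _SymlinkBlockDev above implements an idempotent "create or fix" symlink.
# A generic sketch of the same pattern, independent of instances/disks:
def _ExampleEnsureSymlink(target, link_name):
  """Illustrative sketch only: make link_name point at target."""
  try:
    os.symlink(target, link_name)
  except OSError, err:
    if err.errno != errno.EEXIST:
      raise
    # Something already exists there; replace it only if it is not the
    # link we want.
    if not os.path.islink(link_name) or os.readlink(link_name) != target:
      os.remove(link_name)
      os.symlink(target, link_name)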
2832 def _RemoveBlockDevLinks(instance_name, disks):
2833 """Remove the block device symlinks belonging to the given instance.
2834
2835 """
2836 for idx, _ in enumerate(disks):
2837 link_name = _GetBlockDevSymlinkPath(instance_name, idx)
2838 if os.path.islink(link_name):
2839 try:
2840 os.remove(link_name)
2841 except OSError:
2842 logging.exception("Can't remove symlink '%s'", link_name)
2843
2844
2845 def _CalculateDeviceURI(instance, disk, device):
2846 """Get the URI for the device.
2847
2848 @type instance: L{objects.Instance}
2849 @param instance: the instance which disk belongs to
2850 @type disk: L{objects.Disk}
2851 @param disk: the target disk object
2852 @type device: L{bdev.BlockDev}
2853 @param device: the corresponding BlockDevice
2854 @rtype: string
2855 @return: the device URI, if any, else None
2856
2857 """
2858 access_mode = disk.params.get(constants.LDP_ACCESS,
2859 constants.DISK_KERNELSPACE)
2860 if access_mode == constants.DISK_USERSPACE:
2861 # This can raise errors.BlockDeviceError
2862 return device.GetUserspaceAccessUri(instance.hypervisor)
2863 else:
2864 return None
2865
2866
2867 def _GatherAndLinkBlockDevs(instance):
2868 """Set up an instance's block device(s).
2869
2870 This is run on the primary node at instance startup. The block
2871 devices must be already assembled.
2872
2873 @type instance: L{objects.Instance}
2874 @param instance: the instance whose disks we should assemble
2875 @rtype: list
2876 @return: list of (disk_object, link_name, drive_uri)
2877
2878 """
2879 block_devices = []
2880 for idx, disk in enumerate(instance.disks_info):
2881 device = _RecursiveFindBD(disk)
2882 if device is None:
2883 raise errors.BlockDeviceError("Block device '%s' is not set up." %
2884 str(disk))
2885 device.Open()
2886 try:
2887 link_name = _SymlinkBlockDev(instance.name, device.dev_path, idx)
2888 except OSError, e:
2889 raise errors.BlockDeviceError("Cannot create block device symlink: %s" %
2890 e.strerror)
2891 uri = _CalculateDeviceURI(instance, disk, device)
2892
2893 block_devices.append((disk, link_name, uri))
2894
2895 return block_devices
2896
2897
2898 def _IsInstanceUserDown(instance_info):
2899 return instance_info and \
2900 "state" in instance_info and \
2901 hv_base.HvInstanceState.IsShutdown(instance_info["state"])
2902
2903
2904 def _GetInstanceInfo(instance):
2905 """Helper function L{GetInstanceInfo}"""
2906 return GetInstanceInfo(instance.name, instance.hypervisor,
2907 hvparams=instance.hvparams)
2908
2909
2910 def StartInstance(instance, startup_paused, reason, store_reason=True):
2911 """Start an instance.
2912
2913 @type instance: L{objects.Instance}
2914 @param instance: the instance object
2915 @type startup_paused: bool
2916 @param startup_paused: whether to pause the instance at startup
2917 @type reason: list of reasons
2918 @param reason: the reason trail for this startup
2919 @type store_reason: boolean
2920 @param store_reason: whether to store the shutdown reason trail on file
2921 @rtype: None
2922
2923 """
2924 instance_info = _GetInstanceInfo(instance)
2925
2926 if instance_info and not _IsInstanceUserDown(instance_info):
2927 logging.info("Instance '%s' already running, not starting", instance.name)
2928 return
2929
2930 try:
2931 block_devices = _GatherAndLinkBlockDevs(instance)
2932 hyper = hypervisor.GetHypervisor(instance.hypervisor)
2933 hyper.StartInstance(instance, block_devices, startup_paused)
2934 if store_reason:
2935 _StoreInstReasonTrail(instance.name, reason)
2936 except errors.BlockDeviceError, err:
2937 _Fail("Block device error: %s", err, exc=True)
2938 except errors.HypervisorError, err:
2939 _RemoveBlockDevLinks(instance.name, instance.disks_info)
2940 _Fail("Hypervisor error: %s", err, exc=True)
2941
2942
2943 def InstanceShutdown(instance, timeout, reason, store_reason=True):
2944 """Shut an instance down.
2945
2946 @note: this function uses polling with a hardcoded timeout.
2947
2948 @type instance: L{objects.Instance}
2949 @param instance: the instance object
2950 @type timeout: integer
2951 @param timeout: maximum timeout for soft shutdown
2952 @type reason: list of reasons
2953 @param reason: the reason trail for this shutdown
2954 @type store_reason: boolean
2955 @param store_reason: whether to store the shutdown reason trail on file
2956 @rtype: None
2957
2958 """
2959 hyper = hypervisor.GetHypervisor(instance.hypervisor)
2960
2961 if not _GetInstanceInfo(instance):
2962 logging.info("Instance '%s' not running, doing nothing", instance.name)
2963 return
2964
2965 class _TryShutdown(object):
2966 def __init__(self):
2967 self.tried_once = False
2968
2969 def __call__(self):
2970 try:
2971 hyper.StopInstance(instance, retry=self.tried_once, timeout=timeout)
2972 if store_reason:
2973 _StoreInstReasonTrail(instance.name, reason)
2974 except errors.HypervisorError, err:
2975 # if the instance no longer exists, consider this a success and go to
2976 # cleanup, otherwise fail without retrying
2977 if _GetInstanceInfo(instance):
2978 _Fail("Failed to stop instance '%s': %s", instance.name, err)
2979 return
2980
2981 # TODO: Cleanup hypervisor implementations to prevent them from failing
2982 # silently. We could easily decide if we want to retry or not by using
2983 # HypervisorSoftError()/HypervisorHardError()
2984 self.tried_once = True
2985 if _GetInstanceInfo(instance):
2986 raise utils.RetryAgain()
2987
2988 try:
2989 utils.Retry(_TryShutdown(), 5, timeout)
2990 except utils.RetryTimeout:
2991 # the shutdown did not succeed
2992 logging.error("Shutdown of '%s' unsuccessful, forcing", instance.name)
2993
2994 try:
2995 hyper.StopInstance(instance, force=True)
2996 except errors.HypervisorError, err:
2997 # only raise an error if the instance still exists, otherwise
2998 # the error could simply be "instance ... unknown"!
2999 if _GetInstanceInfo(instance):
3000 _Fail("Failed to force stop instance '%s': %s", instance.name, err)
3001
3002 time.sleep(1)
3003
3004 if _GetInstanceInfo(instance):
3005 _Fail("Could not shutdown instance '%s' even by destroy", instance.name)
3006
3007 try:
3008 hyper.CleanupInstance(instance.name)
3009 except errors.HypervisorError, err:
3010 logging.warning("Failed to execute post-shutdown cleanup step: %s", err)
3011
3012 _RemoveBlockDevLinks(instance.name, instance.disks_info)
3013
3014
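# InstanceShutdown above drives a stateful callable through L{utils.Retry}:
# the first attempt is polite, later attempts may escalate, and RetryAgain
# signals "not done yet". A stripped-down sketch of that idiom with
# hypothetical callback arguments:
class _ExampleRetriedAction(object):
  """Illustrative sketch only."""
  def __init__(self, action_fn, done_fn):
    self.tried_once = False
    self._action_fn = action_fn  # performs one attempt, gets retry=bool
    self._done_fn = done_fn      # returns True once the goal is reached

  def __call__(self):
    self._action_fn(retry=self.tried_once)
    self.tried_once = True
    if not self._done_fn():
      raise utils.RetryAgain()

# Hypothetical usage: utils.Retry(_ExampleRetriedAction(act, done), 5, 60)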
3015 def InstanceReboot(instance, reboot_type, shutdown_timeout, reason):
3016 """Reboot an instance.
3017
3018 @type instance: L{objects.Instance}
3019 @param instance: the instance object to reboot
3020 @type reboot_type: str
3021 @param reboot_type: the type of reboot, one of the following
3022 constants:
3023 - L{constants.INSTANCE_REBOOT_SOFT}: only reboot the
3024 instance OS, do not recreate the VM
3025 - L{constants.INSTANCE_REBOOT_HARD}: tear down and
3026 restart the VM (at the hypervisor level)
3027 - the other reboot type (L{constants.INSTANCE_REBOOT_FULL}) is
3028 not accepted here, since that mode is handled differently, in
3029 cmdlib, and translates into full stop and start of the
3030 instance (instead of a call_instance_reboot RPC)
3031 @type shutdown_timeout: integer
3032 @param shutdown_timeout: maximum timeout for soft shutdown
3033 @type reason: list of reasons
3034 @param reason: the reason trail for this reboot
3035 @rtype: None
3036
3037 """
3038 # TODO: this is inconsistent with 'StartInstance' and 'InstanceShutdown'
3039 # because those functions simply 'return' on error whereas this one
3040 # raises an exception with '_Fail'
3041 if not _GetInstanceInfo(instance):
3042 _Fail("Cannot reboot instance '%s' that is not running", instance.name)
3043
3044 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3045 if reboot_type == constants.INSTANCE_REBOOT_SOFT:
3046 try:
3047 hyper.RebootInstance(instance)
3048 except errors.HypervisorError, err:
3049 _Fail("Failed to soft reboot instance '%s': %s", instance.name, err)
3050 elif reboot_type == constants.INSTANCE_REBOOT_HARD:
3051 try:
3052 InstanceShutdown(instance, shutdown_timeout, reason, store_reason=False)
3053 result = StartInstance(instance, False, reason, store_reason=False)
3054 _StoreInstReasonTrail(instance.name, reason)
3055 return result
3056 except errors.HypervisorError, err:
3057 _Fail("Failed to hard reboot instance '%s': %s", instance.name, err)
3058 else:
3059 _Fail("Invalid reboot_type received: '%s'", reboot_type)
3060
3061
3062 def InstanceBalloonMemory(instance, memory):
3063 """Resize an instance's memory.
3064
3065 @type instance: L{objects.Instance}
3066 @param instance: the instance object
3067 @type memory: int
3068 @param memory: new memory amount in MB
3069 @rtype: None
3070
3071 """
3072 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3073 running = hyper.ListInstances(hvparams=instance.hvparams)
3074 if instance.name not in running:
3075 logging.info("Instance %s is not running, cannot balloon", instance.name)
3076 return
3077 try:
3078 hyper.BalloonInstanceMemory(instance, memory)
3079 except errors.HypervisorError, err:
3080 _Fail("Failed to balloon instance memory: %s", err, exc=True)
3081
3082
3083 def MigrationInfo(instance):
3084 """Gather information about an instance to be migrated.
3085
3086 @type instance: L{objects.Instance}
3087 @param instance: the instance definition
3088
3089 """
3090 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3091 try:
3092 info = hyper.MigrationInfo(instance)
3093 except errors.HypervisorError, err:
3094 _Fail("Failed to fetch migration information: %s", err, exc=True)
3095 return info
3096
3097
3098 def AcceptInstance(instance, info, target):
3099 """Prepare the node to accept an instance.
3100
3101 @type instance: L{objects.Instance}
3102 @param instance: the instance definition
3103 @type info: string/data (opaque)
3104 @param info: migration information, from the source node
3105 @type target: string
3106 @param target: target host (usually ip), on this node
3107
3108 """
3109 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3110 try:
3111 hyper.AcceptInstance(instance, info, target)
3112 except errors.HypervisorError, err:
3113 _Fail("Failed to accept instance: %s", err, exc=True)
3114
3115
3116 def FinalizeMigrationDst(instance, info, success):
3117 """Finalize any preparation to accept an instance.
3118
3119 @type instance: L{objects.Instance}
3120 @param instance: the instance definition
3121 @type info: string/data (opaque)
3122 @param info: migration information, from the source node
3123 @type success: boolean
3124 @param success: whether the migration was a success or a failure
3125
3126 """
3127 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3128 try:
3129 hyper.FinalizeMigrationDst(instance, info, success)
3130 except errors.HypervisorError, err:
3131 _Fail("Failed to finalize migration on the target node: %s", err, exc=True)
3132
3133
3134 def MigrateInstance(cluster_name, instance, target, live):
3135 """Migrates an instance to another node.
3136
3137 @type cluster_name: string
3138 @param cluster_name: name of the cluster
3139 @type instance: L{objects.Instance}
3140 @param instance: the instance definition
3141 @type target: string
3142 @param target: the target node name
3143 @type live: boolean
3144 @param live: whether the migration should be done live or not (the
3145 interpretation of this parameter is left to the hypervisor)
3146 @raise RPCFail: if migration fails for some reason
3147
3148 """
3149 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3150
3151 try:
3152 hyper.MigrateInstance(cluster_name, instance, target, live)
3153 except errors.HypervisorError, err:
3154 _Fail("Failed to migrate instance: %s", err, exc=True)
3155
3156
3157 def FinalizeMigrationSource(instance, success, live):
3158 """Finalize the instance migration on the source node.
3159
3160 @type instance: L{objects.Instance}
3161 @param instance: the instance definition of the migrated instance
3162 @type success: bool
3163 @param success: whether the migration succeeded or not
3164 @type live: bool
3165 @param live: whether the user requested a live migration or not
3166 @raise RPCFail: If the execution fails for some reason
3167
3168 """
3169 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3170
3171 try:
3172 hyper.FinalizeMigrationSource(instance, success, live)
3173 except Exception, err: # pylint: disable=W0703
3174 _Fail("Failed to finalize the migration on the source node: %s", err,
3175 exc=True)
3176
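# For orientation, the migration RPCs above are driven by the master in
# roughly this order (illustrative sketch; each call really happens on
# the node named in the comment, not in a single process):
#
#   info = MigrationInfo(instance)                   # source node
#   AcceptInstance(instance, info, target_ip)        # target node
#   MigrateInstance(cluster_name, instance, target,
#                   live)                            # source node
#   FinalizeMigrationDst(instance, info, success)    # target node
#   FinalizeMigrationSource(instance, success, live) # source node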
3177
3178 def GetMigrationStatus(instance):
3179 """Get the migration status
3180
3181 @type instance: L{objects.Instance}
3182 @param instance: the instance that is being migrated
3183 @rtype: L{objects.MigrationStatus}
3184 @return: the status of the current migration (one of
3185 L{constants.HV_MIGRATION_VALID_STATUSES}), plus any additional
3186 progress info that can be retrieved from the hypervisor
3187 @raise RPCFail: If the migration status cannot be retrieved
3188
3189 """
3190 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3191 try:
3192 return hyper.GetMigrationStatus(instance)
3193 except Exception, err: # pylint: disable=W0703
3194 _Fail("Failed to get migration status: %s", err, exc=True)
3195
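# Hedged polling sketch: wait on the source node until the migration is
# no longer active. Assumes constants.HV_MIGRATION_ACTIVE marks a
# still-running migration; "time" is imported at module level.
def _ExampleWaitForMigration(instance, poll_seconds=5):
  while True:
    status = GetMigrationStatus(instance)
    if status.status != constants.HV_MIGRATION_ACTIVE:
      return status
    time.sleep(poll_seconds)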
3196
3197 def HotplugDevice(instance, action, dev_type, device, extra, seq):
3198 """Hotplug a device
3199
3200 Hotplug is currently supported only for the KVM hypervisor.
3201 @type instance: L{objects.Instance}
3202 @param instance: the instance to which we hotplug a device
3203 @type action: string
3204 @param action: the hotplug action to perform
3205 @type dev_type: string
3206 @param dev_type: the device type to hotplug
3207 @type device: either L{objects.NIC} or L{objects.Disk}
3208 @param device: the device object to hotplug
3209 @type extra: tuple
3210 @param extra: extra info used for disk hotplug (disk link, drive URI)
3211 @type seq: int
3212 @param seq: the index of the device from the master's perspective
3213 @raise RPCFail: in case the instance does not use the KVM hypervisor
3214
3215 """
3216 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3217 try:
3218 hyper.VerifyHotplugSupport(instance, action, dev_type)
3219 except errors.HotplugError, err:
3220 _Fail("Hotplug is not supported: %s", err)
3221
3222 if action == constants.HOTPLUG_ACTION_ADD:
3223 fn = hyper.HotAddDevice
3224 elif action == constants.HOTPLUG_ACTION_REMOVE:
3225 fn = hyper.HotDelDevice
3226 elif action == constants.HOTPLUG_ACTION_MODIFY:
3227 fn = hyper.HotModDevice
3228 else:
3229 assert action in constants.HOTPLUG_ALL_ACTIONS  # fails for unknown actions
3230
3231 return fn(instance, dev_type, device, extra, seq)
3232
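# Usage sketch (hypothetical values): hot-add a NIC as the second device
# of a KVM instance. "inst" and "nic" stand in for real
# L{objects.Instance} and L{objects.NIC} objects; the "extra" tuple is
# only meaningful for disks, so None is passed here.
#
#   HotplugDevice(inst, constants.HOTPLUG_ACTION_ADD,
#                 constants.HOTPLUG_TARGET_NIC, nic, None, 1)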
3233
3234 def HotplugSupported(instance):
3235 """Checks if hotplug is generally supported.
3236
3237 """
3238 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3239 try:
3240 hyper.HotplugSupported(instance)
3241 except errors.HotplugError, err:
3242 _Fail("Hotplug is not supported: %s", err)
3243
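# HotplugSupported reports problems by raising; a caller that only wants
# a boolean can wrap it (illustrative sketch, assuming the module-level
# RPCFail exception raised by _Fail):
def _ExampleCanHotplug(instance):
  try:
    HotplugSupported(instance)
    return True
  except RPCFail:
    return False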
3244
3245 def ModifyInstanceMetadata(metadata):
3246 """Sends instance data to the metadata daemon.
3247
3248 Uses the Luxi transport layer to communicate with the metadata
3249 daemon configuration server. It starts the metadata daemon if it is
3250 not running.
3251 The daemon must be enabled at configuration time.
3252
3253 @type metadata: dict
3254 @param metadata: instance metadata obtained by calling
3255 L{objects.Instance.ToDict} on an instance object
3256
3257 """
3258 if not constants.ENABLE_METAD:
3259 raise errors.ProgrammerError("The metadata daemon is disabled, yet"
3260 " ModifyInstanceMetadata has been called")
3261
3262 if not utils.IsDaemonAlive(constants.METAD):
3263 result = utils.RunCmd([pathutils.DAEMON_UTIL, "start", constants.METAD])
3264 if result.failed:
3265 raise errors.HypervisorError("Failed to start metadata daemon")
3266
3267 with contextlib.closing(metad.Client()) as client:
3268 client.UpdateConfig(metadata)
3269
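# Hedged usage sketch: push an instance's data to the metadata daemon,
# mirroring the ENABLE_METAD guard instead of catching the
# ProgrammerError. "instance" is assumed to be an L{objects.Instance}.
def _ExampleUpdateMetadata(instance):
  if constants.ENABLE_METAD:
    ModifyInstanceMetadata(instance.ToDict())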
3270
3271 def BlockdevCreate(disk, size, owner, on_primary, info, excl_stor):
3272 """Creates a block device for an instance.
3273
3274 @type disk: L{objects.Disk}
3275 @param disk: the object describing the disk we should create
3276 @type size: int
3277 @param size: the size of the physical underlying device, in MiB
3278 @type owner: str
3279 @param owner: the name of the instance for which disk is created,
3280 used for device cache data
3281 @type on_primary: boolean
3282 @param on_primary: indicates if it is the primary node or not
3283 @type info: string
3284 @param info: string that will be sent to the physical device
3285 creation, used for example to set (LVM) tags on LVs
3286 @type excl_stor: boolean
3287 @param excl_stor: Whether exclusive_storage is active
3288
3289 @return: the new unique_id of the device (this can sometimes be
3290 computed only after creation), or None. On secondary nodes,
3291 it's not required to return anything.
3292
3293 """
3294 # TODO: remove the obsolete "size" argument
3295 # pylint: disable=W0613
3296 clist = []
3297 if disk.children:
3298 for child in disk.children:
3299 try:
3300 crdev = _RecursiveAssembleBD(child, owner, on_primary)
3301 except errors.BlockDeviceError, err:
3302 _Fail("Can't assemble device %s: %s", child, err)
3303 if on_primary or disk.AssembleOnSecondary():
3304 # we need the children open in case the device itself has to
3305 # be assembled
3306 try:
3307 # pylint: disable=E1103
3308 crdev.Open()
3309 except errors.BlockDeviceError, err:
3310 _Fail("Can't make child '%s' read-write: %s", child, err)
3311 clist.append(crdev)
3312
3313 try:
3314 device = bdev.Create(disk, clist, excl_stor)
3315 except errors.BlockDeviceError, err:
3316 _Fail("Can't create block device: %s", err)
3317
3318 if on_primary or disk.AssembleOnSecondary():
3319 try:
3320 device.Assemble()
3321 except errors.BlockDeviceError, err:
3322 _Fail("Can't assemble device after creation, unusual event: %s", err)
3323 if on_primary or disk.OpenOnSecondary():
3324 try:
3325 device.Open(force=True)
3326 except errors.BlockDeviceError, err:
3327 _Fail("Can't make device r/w after creation, unusual event: %s", err)
3328 DevCacheManager.UpdateCache(device.dev_path, owner,
3329 on_primary, disk.iv_name)
3330
3331 device.SetInfo(info)
3332
3333 return device.unique_id
3334
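# Illustrative sketch of a primary-node creation call for a plain LVM
# disk. The volume group "xenvg", the LV name, the instance name and the
# tag string are all hypothetical; real callers receive a fully
# populated L{objects.Disk} from the master.
#
#   disk = objects.Disk(dev_type=constants.DT_PLAIN, size=1024,
#                       logical_id=("xenvg", "example.disk0"),
#                       children=[], iv_name="disk/0")
#   BlockdevCreate(disk, 1024, "inst1.example.com", True,
#                  "originstname+inst1.example.com", False)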
3335
3336 def _DumpDevice(source_path, target_path, offset, size, truncate):
3337 """This function images/wipes the device using a local file.
3338
3339 @type source_path: string
3340 @param source_path: path of the image or data source (e.g., "/dev/zero")
3341
3342 @type target_path: string
3343 @param target_path: path of the device to image/wipe
3344
3345 @type offset: int
3346 @param offset: offset in MiB in the output file
3347
3348 @type size: int
3349 @param size: maximum size in MiB to write (data source might be smaller)
3350
3351 @type truncate: bool
3352 @param truncate: whether the file should be truncated
3353
3354 @return: None
3355 @raise RPCFail: in case of failure
3356
3357 """
3358 # Internal sizes are always in Mebibytes; if the following "dd" command
3359 # should use a different block size, the offset and size given to this
3360 # function must be adjusted accordingly before being passed to "dd".
3361 block_size = constants.DD_BLOCK_SIZE