Catch IOError of SSH files when removing node
lib/backend.py
#
#

# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014 Google Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


"""Functions used by the node daemon

@var _ALLOWED_UPLOAD_FILES: denotes which files are accepted in
     the L{UploadFile} function
@var _ALLOWED_CLEAN_DIRS: denotes which directories are accepted
     in the L{_CleanDirectory} function

"""

# pylint: disable=E1103,C0302

# E1103: %s %r has no %r member (but some types could not be
# inferred), because the _TryOSFromDisk returns either (True, os_obj)
# or (False, "string") which confuses pylint

# C0302: This module has become too big and should be split up


import base64
import errno
import logging
import os
import os.path
import pycurl
import random
import re
import shutil
import signal
import stat
import tempfile
import time
import zlib
import contextlib
import collections

from ganeti import errors
from ganeti import http
from ganeti import utils
from ganeti import ssh
from ganeti import hypervisor
from ganeti.hypervisor import hv_base
from ganeti import constants
from ganeti.storage import bdev
from ganeti.storage import drbd
from ganeti.storage import extstorage
from ganeti.storage import filestorage
from ganeti import objects
from ganeti import ssconf
from ganeti import serializer
from ganeti import netutils
from ganeti import runtime
from ganeti import compat
from ganeti import pathutils
from ganeti import vcluster
from ganeti import ht
from ganeti.storage.base import BlockDev
from ganeti.storage.drbd import DRBD8
from ganeti import hooksmaster
import ganeti.metad as metad


_BOOT_ID_PATH = "/proc/sys/kernel/random/boot_id"
_ALLOWED_CLEAN_DIRS = compat.UniqueFrozenset([
  pathutils.DATA_DIR,
  pathutils.JOB_QUEUE_ARCHIVE_DIR,
  pathutils.QUEUE_DIR,
  pathutils.CRYPTO_KEYS_DIR,
  ])
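#: Maximum SSL certificate validity, in seconds:
#: 7 * 24 * 60 * 60 = 604800, i.e. one week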
_MAX_SSL_CERT_VALIDITY = 7 * 24 * 60 * 60
_X509_KEY_FILE = "key"
_X509_CERT_FILE = "cert"
_IES_STATUS_FILE = "status"
_IES_PID_FILE = "pid"
_IES_CA_FILE = "ca"

#: Valid LVS output line regex
_LVSLINE_REGEX = re.compile(r"^ *([^|]+)\|([^|]+)\|([0-9.]+)\|([^|]{6,})\|?$")
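# For illustration, a pipe-separated line such as the following (VG name,
# LV name, size in megabytes, attribute string; values are hypothetical)
# matches the regex above:
#   "  xenvg|test-lv|1024.00|-wi-ao----"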

# Actions for the master setup script
_MASTER_START = "start"
_MASTER_STOP = "stop"

#: Maximum file permissions for restricted command directory and executables
_RCMD_MAX_MODE = (stat.S_IRWXU |
                  stat.S_IRGRP | stat.S_IXGRP |
                  stat.S_IROTH | stat.S_IXOTH)

#: Delay before returning an error for restricted commands
_RCMD_INVALID_DELAY = 10

#: How long to wait to acquire lock for restricted commands (shorter than
#: L{_RCMD_INVALID_DELAY}) to reduce blockage of noded forks when many
#: command requests arrive
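#: (with the default above: 10 * 0.8 = 8 seconds)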
_RCMD_LOCK_TIMEOUT = _RCMD_INVALID_DELAY * 0.8


class RPCFail(Exception):
  """Class denoting RPC failure.

  Its argument is the error message.

  """


def _GetInstReasonFilename(instance_name):
  """Path of the file containing the reason of the instance status change.

  @type instance_name: string
  @param instance_name: The name of the instance
  @rtype: string
  @return: The path of the file

  """
  return utils.PathJoin(pathutils.INSTANCE_REASON_DIR, instance_name)


def _StoreInstReasonTrail(instance_name, trail):
  """Serialize a reason trail related to an instance change of state to file.

  The exact location of the file depends on the name of the instance and on
  the configuration of the Ganeti cluster defined at deploy time.

  @type instance_name: string
  @param instance_name: The name of the instance

  @type trail: list of reasons
  @param trail: reason trail

  @rtype: None

  """
  json = serializer.DumpJson(trail)
  filename = _GetInstReasonFilename(instance_name)
  utils.WriteFile(filename, data=json)


def _Fail(msg, *args, **kwargs):
  """Log an error and then raise an RPCFail exception.

  This exception is then handled specially in the ganeti daemon and
  turned into a 'failed' return type. As such, this function is a
  useful shortcut for logging the error and returning it to the master
  daemon.

  @type msg: string
  @param msg: the text of the exception
  @raise RPCFail

  """
  if args:
    msg = msg % args
  if "log" not in kwargs or kwargs["log"]:  # if we should log this error
    if "exc" in kwargs and kwargs["exc"]:
      logging.exception(msg)
    else:
      logging.error(msg)
  raise RPCFail(msg)

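# Illustrative use of _Fail, mirroring calls elsewhere in this module:
#   _Fail("Cluster configuration incomplete: %s", err, exc=True)
# interpolates the arguments into the message, logs it (with a traceback,
# because of exc=True) and raises RPCFail.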

def _GetConfig():
  """Simple wrapper to return a SimpleStore.

  @rtype: L{ssconf.SimpleStore}
  @return: a SimpleStore instance

  """
  return ssconf.SimpleStore()


def _GetSshRunner(cluster_name):
  """Simple wrapper to return an SshRunner.

  @type cluster_name: str
  @param cluster_name: the cluster name, which is needed
    by the SshRunner constructor
  @rtype: L{ssh.SshRunner}
  @return: an SshRunner instance

  """
  return ssh.SshRunner(cluster_name)


def _Decompress(data):
  """Unpacks data compressed by the RPC client.

  @type data: list or tuple
  @param data: Data sent by RPC client
  @rtype: str
  @return: Decompressed data

  """
  assert isinstance(data, (list, tuple))
  assert len(data) == 2
  (encoding, content) = data
  if encoding == constants.RPC_ENCODING_NONE:
    return content
  elif encoding == constants.RPC_ENCODING_ZLIB_BASE64:
    return zlib.decompress(base64.b64decode(content))
  else:
    raise AssertionError("Unknown data encoding")

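# Illustrative round-trip (hypothetical payload), mirroring what the RPC
# client sends for zlib+base64-encoded data:
#   packed = (constants.RPC_ENCODING_ZLIB_BASE64,
#             base64.b64encode(zlib.compress("payload")))
#   assert _Decompress(packed) == "payload"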

def _CleanDirectory(path, exclude=None):
  """Removes all regular files in a directory.

  @type path: str
  @param path: the directory to clean
  @type exclude: list
  @param exclude: list of files to be excluded, defaults
    to the empty list

  """
  if path not in _ALLOWED_CLEAN_DIRS:
    _Fail("Path passed to _CleanDirectory not in allowed clean targets: '%s'",
          path)

  if not os.path.isdir(path):
    return
  if exclude is None:
    exclude = []
  else:
    # Normalize excluded paths
    exclude = [os.path.normpath(i) for i in exclude]

  for rel_name in utils.ListVisibleFiles(path):
    full_name = utils.PathJoin(path, rel_name)
    if full_name in exclude:
      continue
    if os.path.isfile(full_name) and not os.path.islink(full_name):
      utils.RemoveFile(full_name)


def _BuildUploadFileList():
  """Build the list of allowed upload files.

  This is abstracted so that it's built only once at module import time.

  """
  allowed_files = set([
    pathutils.CLUSTER_CONF_FILE,
    pathutils.ETC_HOSTS,
    pathutils.SSH_KNOWN_HOSTS_FILE,
    pathutils.VNC_PASSWORD_FILE,
    pathutils.RAPI_CERT_FILE,
    pathutils.SPICE_CERT_FILE,
    pathutils.SPICE_CACERT_FILE,
    pathutils.RAPI_USERS_FILE,
    pathutils.CONFD_HMAC_KEY,
    pathutils.CLUSTER_DOMAIN_SECRET_FILE,
    ])

  for hv_name in constants.HYPER_TYPES:
    hv_class = hypervisor.GetHypervisorClass(hv_name)
    allowed_files.update(hv_class.GetAncillaryFiles()[0])

  assert pathutils.FILE_STORAGE_PATHS_FILE not in allowed_files, \
    "Allowed file storage paths should never be uploaded via RPC"

  return frozenset(allowed_files)


_ALLOWED_UPLOAD_FILES = _BuildUploadFileList()


def JobQueuePurge():
  """Removes job queue files and archived jobs.

  @rtype: tuple
  @return: True, None

  """
  _CleanDirectory(pathutils.QUEUE_DIR, exclude=[pathutils.JOB_QUEUE_LOCK_FILE])
  _CleanDirectory(pathutils.JOB_QUEUE_ARCHIVE_DIR)


def GetMasterNodeName():
  """Returns the master node name.

  @rtype: string
  @return: name of the master node
  @raise RPCFail: in case of errors

  """
  try:
    return _GetConfig().GetMasterNode()
  except errors.ConfigurationError, err:
    _Fail("Cluster configuration incomplete: %s", err, exc=True)


def RunLocalHooks(hook_opcode, hooks_path, env_builder_fn):
  """Decorator that runs hooks before and after the decorated function.

  @type hook_opcode: string
  @param hook_opcode: opcode of the hook
  @type hooks_path: string
  @param hooks_path: path of the hooks
  @type env_builder_fn: function
  @param env_builder_fn: function that returns a dictionary containing the
    environment variables for the hooks. Will get all the parameters of the
    decorated function.
  @raise RPCFail: in case of pre-hook failure

  """
  def decorator(fn):
    def wrapper(*args, **kwargs):
      _, myself = ssconf.GetMasterAndMyself()
      nodes = ([myself], [myself])  # these hooks run locally

      env_fn = compat.partial(env_builder_fn, *args, **kwargs)

      cfg = _GetConfig()
      hr = HooksRunner()
      hm = hooksmaster.HooksMaster(hook_opcode, hooks_path, nodes,
                                   hr.RunLocalHooks, None, env_fn, None,
                                   logging.warning, cfg.GetClusterName(),
                                   cfg.GetMasterNode())
      hm.RunPhase(constants.HOOKS_PHASE_PRE)
      result = fn(*args, **kwargs)
      hm.RunPhase(constants.HOOKS_PHASE_POST)

      return result
    return wrapper
  return decorator


def _BuildMasterIpEnv(master_params, use_external_mip_script=None):
  """Builds environment variables for master IP hooks.

  @type master_params: L{objects.MasterNetworkParameters}
  @param master_params: network parameters of the master
  @type use_external_mip_script: boolean
  @param use_external_mip_script: whether to use an external master IP
    address setup script (unused, but necessary per the implementation of the
    _RunLocalHooks decorator)

  """
  # pylint: disable=W0613
  ver = netutils.IPAddress.GetVersionFromAddressFamily(master_params.ip_family)
  env = {
    "MASTER_NETDEV": master_params.netdev,
    "MASTER_IP": master_params.ip,
    "MASTER_NETMASK": str(master_params.netmask),
    "CLUSTER_IP_VERSION": str(ver),
    }

  return env


def _RunMasterSetupScript(master_params, action, use_external_mip_script):
  """Execute the master IP address setup script.

  @type master_params: L{objects.MasterNetworkParameters}
  @param master_params: network parameters of the master
  @type action: string
  @param action: action to pass to the script. Must be one of
    L{backend._MASTER_START} or L{backend._MASTER_STOP}
  @type use_external_mip_script: boolean
  @param use_external_mip_script: whether to use an external master IP
    address setup script
  @raise backend.RPCFail: if there are errors during the execution of the
    script

  """
  env = _BuildMasterIpEnv(master_params)

  if use_external_mip_script:
    setup_script = pathutils.EXTERNAL_MASTER_SETUP_SCRIPT
  else:
    setup_script = pathutils.DEFAULT_MASTER_SETUP_SCRIPT

  result = utils.RunCmd([setup_script, action], env=env, reset_env=True)

  if result.failed:
    _Fail("Failed to %s the master IP. Script return value: %s, output: '%s'" %
          (action, result.exit_code, result.output), log=True)


@RunLocalHooks(constants.FAKE_OP_MASTER_TURNUP, "master-ip-turnup",
               _BuildMasterIpEnv)
def ActivateMasterIp(master_params, use_external_mip_script):
  """Activate the IP address of the master daemon.

  @type master_params: L{objects.MasterNetworkParameters}
  @param master_params: network parameters of the master
  @type use_external_mip_script: boolean
  @param use_external_mip_script: whether to use an external master IP
    address setup script
  @raise RPCFail: in case of errors during the IP startup

  """
  _RunMasterSetupScript(master_params, _MASTER_START,
                        use_external_mip_script)


def StartMasterDaemons(no_voting):
  """Activate local node as master node.

  The function will start the master daemons (ganeti-masterd and ganeti-rapi).

  @type no_voting: boolean
  @param no_voting: whether to start ganeti-masterd without a node vote
    but still non-interactively
  @rtype: None

  """

  if no_voting:
    masterd_args = "--no-voting --yes-do-it"
  else:
    masterd_args = ""

  env = {
    "EXTRA_MASTERD_ARGS": masterd_args,
    }

  result = utils.RunCmd([pathutils.DAEMON_UTIL, "start-master"], env=env)
  if result.failed:
    msg = "Can't start Ganeti master: %s" % result.output
    logging.error(msg)
    _Fail(msg)


@RunLocalHooks(constants.FAKE_OP_MASTER_TURNDOWN, "master-ip-turndown",
               _BuildMasterIpEnv)
def DeactivateMasterIp(master_params, use_external_mip_script):
  """Deactivate the master IP on this node.

  @type master_params: L{objects.MasterNetworkParameters}
  @param master_params: network parameters of the master
  @type use_external_mip_script: boolean
  @param use_external_mip_script: whether to use an external master IP
    address setup script
  @raise RPCFail: in case of errors during the IP turndown

  """
  _RunMasterSetupScript(master_params, _MASTER_STOP,
                        use_external_mip_script)


def StopMasterDaemons():
  """Stop the master daemons on this node.

  Stop the master daemons (ganeti-masterd and ganeti-rapi) on this node.

  @rtype: None

  """
  # TODO: log and report back to the caller the error failures; we
  # need to decide in which case we fail the RPC for this

  result = utils.RunCmd([pathutils.DAEMON_UTIL, "stop-master"])
  if result.failed:
    logging.error("Could not stop Ganeti master, command %s had exitcode %s"
                  " and error %s",
                  result.cmd, result.exit_code, result.output)


def ChangeMasterNetmask(old_netmask, netmask, master_ip, master_netdev):
  """Change the netmask of the master IP.

  @param old_netmask: the old value of the netmask
  @param netmask: the new value of the netmask
  @param master_ip: the master IP
  @param master_netdev: the master network device

  """
  if old_netmask == netmask:
    return

  if not netutils.IPAddress.Own(master_ip):
    _Fail("The master IP address is not up, not attempting to change its"
          " netmask")

  result = utils.RunCmd([constants.IP_COMMAND_PATH, "address", "add",
                         "%s/%s" % (master_ip, netmask),
                         "dev", master_netdev, "label",
                         "%s:0" % master_netdev])
  if result.failed:
    _Fail("Could not set the new netmask on the master IP address")

  result = utils.RunCmd([constants.IP_COMMAND_PATH, "address", "del",
                         "%s/%s" % (master_ip, old_netmask),
                         "dev", master_netdev, "label",
                         "%s:0" % master_netdev])
  if result.failed:
    _Fail("Could not bring down the master IP address with the old netmask")

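# For illustration, with hypothetical values the two "ip" invocations above
# amount to:
#   ip address add 192.0.2.1/32 dev eth0 label eth0:0
#   ip address del 192.0.2.1/24 dev eth0 label eth0:0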

def EtcHostsModify(mode, host, ip):
  """Modify a host entry in /etc/hosts.

  @param mode: The mode to operate in; either add or remove an entry
  @param host: The host to operate on
  @param ip: The ip associated with the entry

  """
  if mode == constants.ETC_HOSTS_ADD:
    if not ip:
      raise RPCFail("Mode 'add' needs 'ip' parameter, but parameter not"
                    " present")
    utils.AddHostToEtcHosts(host, ip)
  elif mode == constants.ETC_HOSTS_REMOVE:
    if ip:
      raise RPCFail("Mode 'remove' does not allow 'ip' parameter, but"
                    " parameter is present")
    utils.RemoveHostFromEtcHosts(host)
  else:
    raise RPCFail("Mode not supported")

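# Illustrative calls (hypothetical host and address):
#   EtcHostsModify(constants.ETC_HOSTS_ADD, "node3.example.com", "192.0.2.13")
#   EtcHostsModify(constants.ETC_HOSTS_REMOVE, "node3.example.com", None)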

def LeaveCluster(modify_ssh_setup):
  """Cleans up and removes the current node.

  This function cleans up and prepares the current node to be removed
  from the cluster.

  If processing is successful, then it raises an
  L{errors.QuitGanetiException} which is used as a special case to
  shutdown the node daemon.

  @param modify_ssh_setup: boolean

  """
  _CleanDirectory(pathutils.DATA_DIR)
  _CleanDirectory(pathutils.CRYPTO_KEYS_DIR)
  JobQueuePurge()

  if modify_ssh_setup:
    try:
      priv_key, pub_key, auth_keys = ssh.GetUserFiles(constants.SSH_LOGIN_USER)

      ssh.RemoveAuthorizedKey(auth_keys, utils.ReadFile(pub_key))

      utils.RemoveFile(priv_key)
      utils.RemoveFile(pub_key)
    except errors.OpExecError:
      logging.exception("Error while processing ssh files")
    except IOError:
      logging.exception("At least one SSH file was not accessible.")

  try:
    utils.RemoveFile(pathutils.CONFD_HMAC_KEY)
    utils.RemoveFile(pathutils.RAPI_CERT_FILE)
    utils.RemoveFile(pathutils.SPICE_CERT_FILE)
    utils.RemoveFile(pathutils.SPICE_CACERT_FILE)
    utils.RemoveFile(pathutils.NODED_CERT_FILE)
  except:  # pylint: disable=W0702
    logging.exception("Error while removing cluster secrets")

  utils.StopDaemon(constants.CONFD)
  utils.StopDaemon(constants.MOND)
  utils.StopDaemon(constants.KVMD)

  # Raise a custom exception (handled in ganeti-noded)
  raise errors.QuitGanetiException(True, "Shutdown scheduled")


def _CheckStorageParams(params, num_params):
  """Performs sanity checks for storage parameters.

  @type params: list
  @param params: list of storage parameters
  @type num_params: int
  @param num_params: expected number of parameters

  """
  if params is None:
    raise errors.ProgrammerError("No storage parameters for storage"
                                 " reporting is provided.")
  if not isinstance(params, list):
    raise errors.ProgrammerError("The storage parameters are not of type"
                                 " list: '%s'" % params)
  if len(params) != num_params:
    raise errors.ProgrammerError("Did not receive the expected number of"
                                 " storage parameters: expected %s,"
                                 " received '%s'" % (num_params, len(params)))

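# For illustration: LVM-based reporting passes exactly one parameter, the
# exclusive-storage flag, so a well-formed call looks like:
#   _CheckStorageParams([True], 1)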

def _CheckLvmStorageParams(params):
  """Performs sanity check for the 'exclusive storage' flag.

  @see: C{_CheckStorageParams}

  """
  _CheckStorageParams(params, 1)
  excl_stor = params[0]
  if not isinstance(params[0], bool):
    raise errors.ProgrammerError("Exclusive storage parameter is not"
                                 " boolean: '%s'." % excl_stor)
  return excl_stor


def _GetLvmVgSpaceInfo(name, params):
  """Wrapper around C{_GetVgInfo} which checks the storage parameters.

  @type name: string
  @param name: name of the volume group
  @type params: list
  @param params: list of storage parameters, which in this case should
    contain only one entry, the exclusive storage flag

  """
  excl_stor = _CheckLvmStorageParams(params)
  return _GetVgInfo(name, excl_stor)


def _GetVgInfo(
    name, excl_stor, info_fn=bdev.LogicalVolume.GetVGInfo):
  """Retrieves information about an LVM volume group.

  """
  # TODO: GetVGInfo supports returning information for multiple VGs at once
  vginfo = info_fn([name], excl_stor)
  if vginfo:
    vg_free = int(round(vginfo[0][0], 0))
    vg_size = int(round(vginfo[0][1], 0))
  else:
    vg_free = None
    vg_size = None

  return {
    "type": constants.ST_LVM_VG,
    "name": name,
    "storage_free": vg_free,
    "storage_size": vg_size,
    }


def _GetLvmPvSpaceInfo(name, params):
  """Wrapper around C{_GetVgSpindlesInfo} with sanity checks.

  @see: C{_GetLvmVgSpaceInfo}

  """
  excl_stor = _CheckLvmStorageParams(params)
  return _GetVgSpindlesInfo(name, excl_stor)


def _GetVgSpindlesInfo(
    name, excl_stor, info_fn=bdev.LogicalVolume.GetVgSpindlesInfo):
  """Retrieves information about spindles in an LVM volume group.

  @type name: string
  @param name: VG name
  @type excl_stor: bool
  @param excl_stor: exclusive storage
  @rtype: dict
  @return: dictionary whose keys are "type", "name", "storage_free" and
    "storage_size" for the storage type, VG name, free spindles and total
    spindles respectively

  """
  if excl_stor:
    (vg_free, vg_size) = info_fn(name)
  else:
    vg_free = 0
    vg_size = 0
  return {
    "type": constants.ST_LVM_PV,
    "name": name,
    "storage_free": vg_free,
    "storage_size": vg_size,
    }


def _GetHvInfo(name, hvparams, get_hv_fn=hypervisor.GetHypervisor):
  """Retrieves node information from a hypervisor.

  The information returned depends on the hypervisor. Common items:

    - vg_size is the size of the configured volume group in MiB
    - vg_free is the free size of the volume group in MiB
    - memory_dom0 is the memory allocated for domain0 in MiB
    - memory_free is the currently available (free) RAM in MiB
    - memory_total is the total amount of RAM in MiB
    - hv_version: the hypervisor version, if available

  @type hvparams: dict of string
  @param hvparams: the hypervisor's hvparams

  """
  return get_hv_fn(name).GetNodeInfo(hvparams=hvparams)


def _GetHvInfoAll(hv_specs, get_hv_fn=hypervisor.GetHypervisor):
  """Retrieves node information for all hypervisors.

  See C{_GetHvInfo} for information on the output.

  @type hv_specs: list of pairs (string, dict of strings)
  @param hv_specs: list of pairs of a hypervisor's name and its hvparams

  """
  if hv_specs is None:
    return None

  result = []
  for hvname, hvparams in hv_specs:
    result.append(_GetHvInfo(hvname, hvparams, get_hv_fn))
  return result


def _GetNamedNodeInfo(names, fn):
  """Calls C{fn} for all names in C{names} and returns a list of results.

  @rtype: None or list

  """
  if names is None:
    return None
  else:
    return map(fn, names)


def GetNodeInfo(storage_units, hv_specs):
  """Gives back a hash with different information about the node.

  @type storage_units: list of tuples (string, string, list)
  @param storage_units: List of tuples (storage unit, identifier, parameters)
    to ask for disk space information. In case of lvm-vg, the identifier is
    the VG name. The parameters can contain additional, storage-type-specific
    parameters, for example exclusive storage for lvm storage.
  @type hv_specs: list of pairs (string, dict of strings)
  @param hv_specs: list of pairs of a hypervisor's name and its hvparams
  @rtype: tuple; (string, None/dict, None/dict)
  @return: Tuple containing boot ID, volume group information and hypervisor
    information

  """
  bootid = utils.ReadFile(_BOOT_ID_PATH, size=128).rstrip("\n")
  storage_info = _GetNamedNodeInfo(
    storage_units,
    (lambda (storage_type, storage_key, storage_params):
     _ApplyStorageInfoFunction(storage_type, storage_key, storage_params)))
  hv_info = _GetHvInfoAll(hv_specs)
  return (bootid, storage_info, hv_info)


def _GetFileStorageSpaceInfo(path, params):
  """Wrapper around filestorage.GetSpaceInfo.

  The purpose of this wrapper is to call filestorage.GetFileStorageSpaceInfo
  and ignore the *args parameter to not leak it into the filestorage
  module's code.

  @see: C{filestorage.GetFileStorageSpaceInfo} for description of the
    parameters.

  """
  _CheckStorageParams(params, 0)
  return filestorage.GetFileStorageSpaceInfo(path)


# FIXME: implement storage reporting for all missing storage types.
_STORAGE_TYPE_INFO_FN = {
  constants.ST_BLOCK: None,
  constants.ST_DISKLESS: None,
  constants.ST_EXT: None,
  constants.ST_FILE: _GetFileStorageSpaceInfo,
  constants.ST_LVM_PV: _GetLvmPvSpaceInfo,
  constants.ST_LVM_VG: _GetLvmVgSpaceInfo,
  constants.ST_SHARED_FILE: None,
  constants.ST_GLUSTER: None,
  constants.ST_RADOS: None,
  }


def _ApplyStorageInfoFunction(storage_type, storage_key, *args):
  """Looks up and applies the correct function to calculate free and total
  storage for the given storage type.

  @type storage_type: string
  @param storage_type: the storage type for which the storage shall be
    reported.
  @type storage_key: string
  @param storage_key: identifier of a storage unit, e.g. the volume group name
    of an LVM storage unit
  @type args: any
  @param args: various parameters that can be used for storage reporting. These
    parameters and their semantics vary from storage type to storage type and
    are just propagated in this function.
  @return: the results of the application of the storage space function (see
    _STORAGE_TYPE_INFO_FN) if storage space reporting is implemented for that
    storage type
  @raises NotImplementedError: for storage types that don't support space
    reporting yet

  """
  fn = _STORAGE_TYPE_INFO_FN[storage_type]
  if fn is not None:
    return fn(storage_key, *args)
  else:
    raise NotImplementedError

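# Illustrative dispatch through _STORAGE_TYPE_INFO_FN (hypothetical VG name;
# the single extra parameter is the exclusive-storage flag):
#   _ApplyStorageInfoFunction(constants.ST_LVM_VG, "xenvg", [True])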

def _CheckExclusivePvs(pvi_list):
  """Check that PVs are not shared among LVs

  @type pvi_list: list of L{objects.LvmPvInfo} objects
  @param pvi_list: information about the PVs

  @rtype: list of tuples (string, list of strings)
  @return: offending volumes, as tuples: (pv_name, [lv1_name, lv2_name...])

  """
  res = []
  for pvi in pvi_list:
    if len(pvi.lv_list) > 1:
      res.append((pvi.name, pvi.lv_list))
  return res

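# For illustration: a PV carrying two LVs would be reported (hypothetical
# names) as [("/dev/sda3", ["lv-a", "lv-b"])], while PVs with zero or one
# LV produce no entry.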

def _VerifyHypervisors(what, vm_capable, result, all_hvparams,
                       get_hv_fn=hypervisor.GetHypervisor):
  """Verifies the hypervisor. Adds the results to the 'result' dict.

  @type what: C{dict}
  @param what: a dictionary of things to check
  @type vm_capable: boolean
  @param vm_capable: whether or not this node is vm capable
  @type result: dict
  @param result: dictionary of verification results; results of the
    verifications in this function will be added here
  @type all_hvparams: dict of dict of string
  @param all_hvparams: dictionary mapping hypervisor names to hvparams
  @type get_hv_fn: function
  @param get_hv_fn: function to retrieve the hypervisor, to improve testability

  """
  if not vm_capable:
    return

  if constants.NV_HYPERVISOR in what:
    result[constants.NV_HYPERVISOR] = {}
    for hv_name in what[constants.NV_HYPERVISOR]:
      hvparams = all_hvparams[hv_name]
      try:
        val = get_hv_fn(hv_name).Verify(hvparams=hvparams)
      except errors.HypervisorError, err:
        val = "Error while checking hypervisor: %s" % str(err)
      result[constants.NV_HYPERVISOR][hv_name] = val


def _VerifyHvparams(what, vm_capable, result,
                    get_hv_fn=hypervisor.GetHypervisor):
  """Verifies the hvparams. Adds the results to the 'result' dict.

  @type what: C{dict}
  @param what: a dictionary of things to check
  @type vm_capable: boolean
  @param vm_capable: whether or not this node is vm capable
  @type result: dict
  @param result: dictionary of verification results; results of the
    verifications in this function will be added here
  @type get_hv_fn: function
  @param get_hv_fn: function to retrieve the hypervisor, to improve testability

  """
  if not vm_capable:
    return

  if constants.NV_HVPARAMS in what:
    result[constants.NV_HVPARAMS] = []
    for source, hv_name, hvparms in what[constants.NV_HVPARAMS]:
      try:
        logging.info("Validating hv %s, %s", hv_name, hvparms)
        get_hv_fn(hv_name).ValidateParameters(hvparms)
      except errors.HypervisorError, err:
        result[constants.NV_HVPARAMS].append((source, hv_name, str(err)))


def _VerifyInstanceList(what, vm_capable, result, all_hvparams):
  """Verifies the instance list.

  @type what: C{dict}
  @param what: a dictionary of things to check
  @type vm_capable: boolean
  @param vm_capable: whether or not this node is vm capable
  @type result: dict
  @param result: dictionary of verification results; results of the
    verifications in this function will be added here
  @type all_hvparams: dict of dict of string
  @param all_hvparams: dictionary mapping hypervisor names to hvparams

  """
  if constants.NV_INSTANCELIST in what and vm_capable:
    # GetInstanceList can fail
    try:
      val = GetInstanceList(what[constants.NV_INSTANCELIST],
                            all_hvparams=all_hvparams)
    except RPCFail, err:
      val = str(err)
    result[constants.NV_INSTANCELIST] = val


def _VerifyNodeInfo(what, vm_capable, result, all_hvparams):
  """Verifies the node info.

  @type what: C{dict}
  @param what: a dictionary of things to check
  @type vm_capable: boolean
  @param vm_capable: whether or not this node is vm capable
  @type result: dict
  @param result: dictionary of verification results; results of the
    verifications in this function will be added here
  @type all_hvparams: dict of dict of string
  @param all_hvparams: dictionary mapping hypervisor names to hvparams

  """
  if constants.NV_HVINFO in what and vm_capable:
    hvname = what[constants.NV_HVINFO]
    hyper = hypervisor.GetHypervisor(hvname)
    hvparams = all_hvparams[hvname]
    result[constants.NV_HVINFO] = hyper.GetNodeInfo(hvparams=hvparams)


def _VerifyClientCertificate(cert_file=pathutils.NODED_CLIENT_CERT_FILE):
  """Verify the existence and validity of the client SSL certificate.

  Also, verify that the client certificate is not self-signed. Self-
  signed client certificates stem from Ganeti versions 2.12.0 - 2.12.4
  and should be replaced by client certificates signed by the server
  certificate. Hence we output a warning when we encounter a self-signed
  one.

  """
  create_cert_cmd = "gnt-cluster renew-crypto --new-node-certificates"
  if not os.path.exists(cert_file):
    return (constants.CV_ERROR,
            "The client certificate does not exist. Run '%s' to create"
            " client certificates for all nodes." % create_cert_cmd)

  (errcode, msg) = utils.VerifyCertificate(cert_file)
  if errcode is not None:
    return (errcode, msg)

  (errcode, msg) = utils.IsCertificateSelfSigned(cert_file)
  if errcode is not None:
    return (errcode, msg)

  # if everything is fine, we return the digest to be compared to the config
  return (None, utils.GetCertificateDigest(cert_filename=cert_file))


def _VerifySshSetup(node_status_list, my_name, ssh_key_type,
                    ganeti_pub_keys_file=pathutils.SSH_PUB_KEYS):
  """Verifies the state of the SSH key files.

  @type node_status_list: list of tuples
  @param node_status_list: list of nodes of the cluster associated with a
    couple of flags: (uuid, name, is_master_candidate,
    is_potential_master_candidate, online)
  @type my_name: str
  @param my_name: name of this node
  @type ssh_key_type: one of L{constants.SSHK_ALL}
  @param ssh_key_type: type of key used on nodes
  @type ganeti_pub_keys_file: str
  @param ganeti_pub_keys_file: filename of the public keys file

  """
  if node_status_list is None:
    return ["No node list to check against the pub_key_file received."]

  my_status_list = [(my_uuid, name, mc, pot_mc, online) for
                    (my_uuid, name, mc, pot_mc, online)
                    in node_status_list if name == my_name]
  if len(my_status_list) == 0:
    return ["Cannot find node information for node '%s'." % my_name]
  (my_uuid, _, _, potential_master_candidate, online) = \
     my_status_list[0]

  result = []

  if not os.path.exists(ganeti_pub_keys_file):
    result.append("The public key file '%s' does not exist. Consider running"
                  " 'gnt-cluster renew-crypto --new-ssh-keys"
                  " [--no-ssh-key-check]' to fix this." % ganeti_pub_keys_file)
    return result

  pot_mc_uuids = [uuid for (uuid, _, _, _, _) in node_status_list]
  offline_nodes = [uuid for (uuid, _, _, _, online) in node_status_list
                   if not online]
  pub_keys = ssh.QueryPubKeyFile(None, key_file=ganeti_pub_keys_file)

  if potential_master_candidate:
    # Check that the set of potential master candidates matches the
    # public key file
    pub_uuids_set = set(pub_keys.keys()) - set(offline_nodes)
    pot_mc_uuids_set = set(pot_mc_uuids) - set(offline_nodes)
    missing_uuids = set([])
    if pub_uuids_set != pot_mc_uuids_set:
      unknown_uuids = pub_uuids_set - pot_mc_uuids_set
      if unknown_uuids:
        result.append("The following node UUIDs are listed in the public key"
                      " file on node '%s', but are not potential master"
                      " candidates: %s."
                      % (my_name, ", ".join(list(unknown_uuids))))
      missing_uuids = pot_mc_uuids_set - pub_uuids_set
      if missing_uuids:
        result.append("The following node UUIDs of potential master candidates"
                      " are missing in the public key file on node %s: %s."
                      % (my_name, ", ".join(list(missing_uuids))))

    (_, key_files) = \
      ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False,
                          dircheck=False)
    (_, node_pub_key_file) = key_files[ssh_key_type]

    my_keys = pub_keys[my_uuid]

    node_pub_key = utils.ReadFile(node_pub_key_file)
    if node_pub_key.strip() not in my_keys:
      result.append("The SSH key of node %s does not match this node's key"
                    " in the pub key file." % my_name)
    if len(my_keys) != 1:
      result.append("There is more than one key for node %s in the public key"
                    " file." % my_name)
  else:
    if len(pub_keys.keys()) > 0:
      result.append("The public key file of node '%s' is not empty, although"
                    " the node is not a potential master candidate."
                    % my_name)

  # Check that all master candidate keys are in the authorized_keys file
  (auth_key_file, _) = \
    ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
  for (uuid, name, mc, _, online) in node_status_list:
    if not online:
      continue
    if uuid in missing_uuids:
      continue
    if mc:
      for key in pub_keys[uuid]:
        if not ssh.HasAuthorizedKey(auth_key_file, key):
          result.append("A SSH key of master candidate '%s' (UUID: '%s') is"
                        " not in the 'authorized_keys' file of node '%s'."
                        % (name, uuid, my_name))
    else:
      for key in pub_keys[uuid]:
        if name != my_name and ssh.HasAuthorizedKey(auth_key_file, key):
          result.append("A SSH key of normal node '%s' (UUID: '%s') is in the"
                        " 'authorized_keys' file of node '%s'."
                        % (name, uuid, my_name))
        if name == my_name and not ssh.HasAuthorizedKey(auth_key_file, key):
          result.append("A SSH key of normal node '%s' (UUID: '%s') is not"
                        " in the 'authorized_keys' file of itself."
                        % (my_name, uuid))

  return result


def _VerifySshClutter(node_status_list, my_name):
  """Verifies that the 'authorized_keys' files are not cluttered up.

  @type node_status_list: list of tuples
  @param node_status_list: list of nodes of the cluster associated with a
    couple of flags: (uuid, name, is_master_candidate,
    is_potential_master_candidate, online)
  @type my_name: str
  @param my_name: name of this node

  """
  result = []
  (auth_key_file, _) = \
    ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
  node_names = [name for (_, name, _, _, _) in node_status_list]
  multiple_occurrences = ssh.CheckForMultipleKeys(auth_key_file, node_names)
  if multiple_occurrences:
    msg = "There are hosts which have more than one SSH key stored for the" \
          " same user in the 'authorized_keys' file of node %s. This can be" \
          " due to an unsuccessful operation which cluttered up the" \
          " 'authorized_keys' file. We recommend to clean this up manually. " \
          % my_name
    for host, occ in multiple_occurrences.items():
      msg += "Entry for '%s' in lines %s. " % (host, utils.CommaJoin(occ))
    result.append(msg)

  return result


def VerifyNode(what, cluster_name, all_hvparams):
  """Verify the status of the local node.

  Based on the input L{what} parameter, various checks are done on the
  local node.

  If the I{filelist} key is present, this list of
  files is checksummed and the file/checksum pairs are returned.

  If the I{nodelist} key is present, we check that we have
  connectivity via ssh with the target nodes (and check the hostname
  report).

  If the I{node-net-test} key is present, we check that we have
  connectivity to the given nodes via both primary IP and, if
  applicable, secondary IPs.

  @type what: C{dict}
  @param what: a dictionary of things to check:
      - filelist: list of files for which to compute checksums
      - nodelist: list of nodes we should check ssh communication with
      - node-net-test: list of nodes we should check node daemon port
        connectivity with
      - hypervisor: list with hypervisors to run the verify for
  @type cluster_name: string
  @param cluster_name: the cluster's name
  @type all_hvparams: dict of dict of strings
  @param all_hvparams: a dictionary mapping hypervisor names to hvparams
  @rtype: dict
  @return: a dictionary with the same keys as the input dict, and
      values representing the result of the checks

  """
  result = {}
  my_name = netutils.Hostname.GetSysName()
  port = netutils.GetDaemonPort(constants.NODED)
  vm_capable = my_name not in what.get(constants.NV_NONVMNODES, [])

  _VerifyHypervisors(what, vm_capable, result, all_hvparams)
  _VerifyHvparams(what, vm_capable, result)

  if constants.NV_FILELIST in what:
    fingerprints = utils.FingerprintFiles(map(vcluster.LocalizeVirtualPath,
                                              what[constants.NV_FILELIST]))
    result[constants.NV_FILELIST] = \
      dict((vcluster.MakeVirtualPath(key), value)
           for (key, value) in fingerprints.items())

  if constants.NV_CLIENT_CERT in what:
    result[constants.NV_CLIENT_CERT] = _VerifyClientCertificate()

  if constants.NV_SSH_SETUP in what:
    node_status_list, key_type = what[constants.NV_SSH_SETUP]
    result[constants.NV_SSH_SETUP] = \
      _VerifySshSetup(node_status_list, my_name, key_type)
    if constants.NV_SSH_CLUTTER in what:
      result[constants.NV_SSH_CLUTTER] = \
        _VerifySshClutter(what[constants.NV_SSH_SETUP], my_name)

  if constants.NV_NODELIST in what:
    (nodes, bynode, mcs) = what[constants.NV_NODELIST]

    # Add nodes from other groups (different for each node)
    try:
      nodes.extend(bynode[my_name])
    except KeyError:
      pass

    # Use a random order
    random.shuffle(nodes)

    # Try to contact all nodes
    val = {}
    ssh_port_map = ssconf.SimpleStore().GetSshPortMap()
    for node in nodes:
      # We only test if master candidates can communicate to other nodes.
      # We cannot test if normal nodes cannot communicate with other nodes,
      # because the administrator might have installed additional SSH keys,
      # over which Ganeti has no power.
      if my_name in mcs:
        success, message = _GetSshRunner(cluster_name). \
                              VerifyNodeHostname(node, ssh_port_map[node])
        if not success:
          val[node] = message

    result[constants.NV_NODELIST] = val

  if constants.NV_NODENETTEST in what:
    result[constants.NV_NODENETTEST] = tmp = {}
    my_pip = my_sip = None
    for name, pip, sip in what[constants.NV_NODENETTEST]:
      if name == my_name:
        my_pip = pip
        my_sip = sip
        break
    if not my_pip:
      tmp[my_name] = ("Can't find my own primary/secondary IP"
                      " in the node list")
    else:
      for name, pip, sip in what[constants.NV_NODENETTEST]:
        fail = []
        if not netutils.TcpPing(pip, port, source=my_pip):
          fail.append("primary")
        if sip != pip:
          if not netutils.TcpPing(sip, port, source=my_sip):
            fail.append("secondary")
        if fail:
          tmp[name] = ("failure using the %s interface(s)" %
                       " and ".join(fail))

  if constants.NV_MASTERIP in what:
    # FIXME: add checks on incoming data structures (here and in the
    # rest of the function)
    master_name, master_ip = what[constants.NV_MASTERIP]
    if master_name == my_name:
      source = constants.IP4_ADDRESS_LOCALHOST
    else:
      source = None
    result[constants.NV_MASTERIP] = netutils.TcpPing(master_ip, port,
                                                     source=source)

  if constants.NV_USERSCRIPTS in what:
    result[constants.NV_USERSCRIPTS] = \
      [script for script in what[constants.NV_USERSCRIPTS]
       if not utils.IsExecutable(script)]

  if constants.NV_OOB_PATHS in what:
    result[constants.NV_OOB_PATHS] = tmp = []
    for path in what[constants.NV_OOB_PATHS]:
      try:
        st = os.stat(path)
      except OSError, err:
        tmp.append("error stating out of band helper: %s" % err)
      else:
        if stat.S_ISREG(st.st_mode):
          if stat.S_IMODE(st.st_mode) & stat.S_IXUSR:
            tmp.append(None)
          else:
            tmp.append("out of band helper %s is not executable" % path)
        else:
          tmp.append("out of band helper %s is not a file" % path)

  if constants.NV_LVLIST in what and vm_capable:
    try:
      val = GetVolumeList(utils.ListVolumeGroups().keys())
    except RPCFail, err:
      val = str(err)
    result[constants.NV_LVLIST] = val

  _VerifyInstanceList(what, vm_capable, result, all_hvparams)

  if constants.NV_VGLIST in what and vm_capable:
    result[constants.NV_VGLIST] = utils.ListVolumeGroups()

  if constants.NV_PVLIST in what and vm_capable:
    check_exclusive_pvs = constants.NV_EXCLUSIVEPVS in what
    val = bdev.LogicalVolume.GetPVInfo(what[constants.NV_PVLIST],
                                       filter_allocatable=False,
                                       include_lvs=check_exclusive_pvs)
    if check_exclusive_pvs:
      result[constants.NV_EXCLUSIVEPVS] = _CheckExclusivePvs(val)
      for pvi in val:
        # Avoid sending useless data on the wire
        pvi.lv_list = []
    result[constants.NV_PVLIST] = map(objects.LvmPvInfo.ToDict, val)

  if constants.NV_VERSION in what:
    result[constants.NV_VERSION] = (constants.PROTOCOL_VERSION,
                                    constants.RELEASE_VERSION)

  _VerifyNodeInfo(what, vm_capable, result, all_hvparams)

  if constants.NV_DRBDVERSION in what and vm_capable:
    try:
      drbd_version = DRBD8.GetProcInfo().GetVersionString()
    except errors.BlockDeviceError, err:
      logging.warning("Can't get DRBD version", exc_info=True)
      drbd_version = str(err)
    result[constants.NV_DRBDVERSION] = drbd_version

  if constants.NV_DRBDLIST in what and vm_capable:
    try:
      used_minors = drbd.DRBD8.GetUsedDevs()
    except errors.BlockDeviceError, err:
      logging.warning("Can't get used minors list", exc_info=True)
      used_minors = str(err)
    result[constants.NV_DRBDLIST] = used_minors

  if constants.NV_DRBDHELPER in what and vm_capable:
    status = True
    try:
      payload = drbd.DRBD8.GetUsermodeHelper()
    except errors.BlockDeviceError, err:
      logging.error("Can't get DRBD usermode helper: %s", str(err))
      status = False
      payload = str(err)
    result[constants.NV_DRBDHELPER] = (status, payload)

  if constants.NV_NODESETUP in what:
    result[constants.NV_NODESETUP] = tmpr = []
    if not os.path.isdir("/sys/block") or not os.path.isdir("/sys/class/net"):
      tmpr.append("The sysfs filesystem doesn't seem to be mounted"
                  " under /sys, missing required directories /sys/block"
                  " and /sys/class/net")
    if (not os.path.isdir("/proc/sys") or
        not os.path.isfile("/proc/sysrq-trigger")):
      tmpr.append("The procfs filesystem doesn't seem to be mounted"
                  " under /proc, missing required directory /proc/sys and"
                  " the file /proc/sysrq-trigger")

  if constants.NV_TIME in what:
    result[constants.NV_TIME] = utils.SplitTime(time.time())

  if constants.NV_OSLIST in what and vm_capable:
    result[constants.NV_OSLIST] = DiagnoseOS()

  if constants.NV_BRIDGES in what and vm_capable:
    result[constants.NV_BRIDGES] = [bridge
                                    for bridge in what[constants.NV_BRIDGES]
                                    if not utils.BridgeExists(bridge)]

  if what.get(constants.NV_ACCEPTED_STORAGE_PATHS) == my_name:
    result[constants.NV_ACCEPTED_STORAGE_PATHS] = \
      filestorage.ComputeWrongFileStoragePaths()

  if what.get(constants.NV_FILE_STORAGE_PATH):
    pathresult = filestorage.CheckFileStoragePath(
      what[constants.NV_FILE_STORAGE_PATH])
    if pathresult:
      result[constants.NV_FILE_STORAGE_PATH] = pathresult

  if what.get(constants.NV_SHARED_FILE_STORAGE_PATH):
    pathresult = filestorage.CheckFileStoragePath(
      what[constants.NV_SHARED_FILE_STORAGE_PATH])
    if pathresult:
      result[constants.NV_SHARED_FILE_STORAGE_PATH] = pathresult

  return result

def GetCryptoTokens(token_requests):
  """Perform actions on the node's cryptographic tokens.

  Token types can be 'ssl' or 'ssh'. So far only some actions are implemented
  for 'ssl'. Action 'get' returns the digest of the public client ssl
  certificate. Action 'create' creates a new client certificate and private
  key and also returns the digest of the certificate. The third element of a
  token request is a dictionary of optional parameters for the actions; so
  far, only a filename is supported.

  @type token_requests: list of tuples of (string, string, dict), where the
    first string is in constants.CRYPTO_TYPES, the second in
    constants.CRYPTO_ACTIONS. The third parameter is a dictionary of string
    to string.
  @param token_requests: list of requests of cryptographic tokens and actions
    to perform on them. The actions come with a dictionary of options.
  @rtype: list of tuples (string, string)
  @return: list of tuples of the token type and the public crypto token

  """
  tokens = []
  for (token_type, action, _) in token_requests:
    if token_type not in constants.CRYPTO_TYPES:
      raise errors.ProgrammerError("Token type '%s' not supported." %
                                   token_type)
    if action not in constants.CRYPTO_ACTIONS:
      raise errors.ProgrammerError("Action '%s' is not supported." %
                                   action)
    if token_type == constants.CRYPTO_TYPE_SSL_DIGEST:
      tokens.append((token_type,
                     utils.GetCertificateDigest()))
  return tokens

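# Illustrative request list (the literal "get" action follows the docstring
# above; the options dict is currently unused for this action):
#   GetCryptoTokens([(constants.CRYPTO_TYPE_SSL_DIGEST, "get", {})])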

def EnsureDaemon(daemon_name, run):
  """Ensures the given daemon is running or stopped.

  @type daemon_name: string
  @param daemon_name: name of the daemon (e.g., constants.KVMD)

  @type run: bool
  @param run: whether to start or stop the daemon

  @rtype: bool
  @return: 'True' if daemon successfully started/stopped,
    'False' otherwise

  """
  allowed_daemons = [constants.KVMD]

  if daemon_name not in allowed_daemons:
    fn = lambda _: False
  elif run:
    fn = utils.EnsureDaemon
  else:
    fn = utils.StopDaemon

  return fn(daemon_name)

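# Illustrative calls: EnsureDaemon(constants.KVMD, True) ensures the KVM
# daemon is running, EnsureDaemon(constants.KVMD, False) stops it; any
# other daemon name is rejected and yields False.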

def _InitSshUpdateData(data, noded_cert_file, ssconf_store):
  (_, noded_cert) = \
    utils.ExtractX509Certificate(utils.ReadFile(noded_cert_file))
  data[constants.SSHS_NODE_DAEMON_CERTIFICATE] = noded_cert

  cluster_name = ssconf_store.GetClusterName()
  data[constants.SSHS_CLUSTER_NAME] = cluster_name


def AddNodeSshKey(node_uuid, node_name,
                  potential_master_candidates,
                  to_authorized_keys=False,
                  to_public_keys=False,
                  get_public_keys=False,
                  pub_key_file=pathutils.SSH_PUB_KEYS,
                  ssconf_store=None,
                  noded_cert_file=pathutils.NODED_CERT_FILE,
                  run_cmd_fn=ssh.RunSshCmdWithStdin,
                  ssh_update_debug=False,
                  ssh_update_verbose=False):
  """Distributes a node's public SSH key across the cluster.

  Note that this function should only be executed on the master node, which
  then will copy the new node's key to all nodes in the cluster via SSH.

  Also note: at least one of the flags C{to_authorized_keys},
  C{to_public_keys}, and C{get_public_keys} has to be set to C{True} for
  the function to actually perform any actions.

  @type node_uuid: str
  @param node_uuid: the UUID of the node whose key is added
  @type node_name: str
  @param node_name: the name of the node whose key is added
  @type potential_master_candidates: list of str
  @param potential_master_candidates: list of node names of potential master
    candidates; this should match the list of uuids in the public key file
  @type to_authorized_keys: boolean
  @param to_authorized_keys: whether the key should be added to the
    C{authorized_keys} file of all nodes
  @type to_public_keys: boolean
  @param to_public_keys: whether the keys should be added to the public key
    file
  @type get_public_keys: boolean
  @param get_public_keys: whether the node should add the clusters' public
    keys to its C{ganeti_pub_keys} file

  """
  node_list = [SshAddNodeInfo(name=node_name, uuid=node_uuid,
                              to_authorized_keys=to_authorized_keys,
                              to_public_keys=to_public_keys,
                              get_public_keys=get_public_keys)]
  return AddNodeSshKeyBulk(node_list,
                           potential_master_candidates,
                           pub_key_file=pub_key_file,
                           ssconf_store=ssconf_store,
                           noded_cert_file=noded_cert_file,
                           run_cmd_fn=run_cmd_fn,
                           ssh_update_debug=ssh_update_debug,
                           ssh_update_verbose=ssh_update_verbose)


# Node info named tuple specifically for the use with AddNodeSshKeyBulk
SshAddNodeInfo = collections.namedtuple(
  "SshAddNodeInfo",
  ["uuid",
   "name",
   "to_authorized_keys",
   "to_public_keys",
   "get_public_keys"])

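# Illustrative construction of such a tuple (hypothetical node data):
#   info = SshAddNodeInfo(uuid="0123-4567", name="node4.example.com",
#                         to_authorized_keys=True, to_public_keys=True,
#                         get_public_keys=True)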

def AddNodeSshKeyBulk(node_list,
                      potential_master_candidates,
                      pub_key_file=pathutils.SSH_PUB_KEYS,
                      ssconf_store=None,
                      noded_cert_file=pathutils.NODED_CERT_FILE,
                      run_cmd_fn=ssh.RunSshCmdWithStdin,
                      ssh_update_debug=False,
                      ssh_update_verbose=False):
  """Distributes a node's public SSH key across the cluster.

  Note that this function should only be executed on the master node, which
  then will copy the new node's key to all nodes in the cluster via SSH.

  Also note: at least one of the flags C{to_authorized_keys},
  C{to_public_keys}, and C{get_public_keys} has to be set to C{True} for
  the function to actually perform any actions.

  @type node_list: list of SshAddNodeInfo tuples
  @param node_list: list of tuples containing the necessary node information
    for adding their keys
  @type potential_master_candidates: list of str
  @param potential_master_candidates: list of node names of potential master
    candidates; this should match the list of uuids in the public key file

  """
  # whether there are any keys to be added or retrieved at all
  to_authorized_keys = any([node_info.to_authorized_keys for node_info in
                            node_list])
  to_public_keys = any([node_info.to_public_keys for node_info in
                        node_list])

  if not ssconf_store:
    ssconf_store = ssconf.SimpleStore()

  for node_info in node_list:
    # replacement not necessary for keys that are not supposed to be in the
    # list of public keys
    if not node_info.to_public_keys:
      continue
    # Check and fix sanity of key file
    keys_by_name = ssh.QueryPubKeyFile([node_info.name], key_file=pub_key_file)
    keys_by_uuid = ssh.QueryPubKeyFile([node_info.uuid], key_file=pub_key_file)

    if (not keys_by_name or node_info.name not in keys_by_name) \
        and (not keys_by_uuid or node_info.uuid not in keys_by_uuid):
      raise errors.SshUpdateError(
        "No keys found for the new node '%s' (UUID %s) in the list of public"
        " SSH keys, neither for the name nor the UUID" %
        (node_info.name, node_info.uuid))
    else:
      if node_info.name in keys_by_name:
        # Replace the name by UUID in the file as the name should only be used
        # temporarily
        ssh.ReplaceNameByUuid(node_info.uuid, node_info.name,
                              error_fn=errors.SshUpdateError,
                              key_file=pub_key_file)

  # Retrieve updated map of UUIDs to keys
  keys_by_uuid = ssh.QueryPubKeyFile(
    [node_info.uuid for node_info in node_list], key_file=pub_key_file)

  # Update the master node's key files
  (auth_key_file, _) = \
    ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
  for node_info in node_list:
    if node_info.to_authorized_keys:
      ssh.AddAuthorizedKeys(auth_key_file, keys_by_uuid[node_info.uuid])

  base_data = {}
  _InitSshUpdateData(base_data, noded_cert_file, ssconf_store)
  cluster_name = base_data[constants.SSHS_CLUSTER_NAME]

  ssh_port_map = ssconf_store.GetSshPortMap()

  # Update the target nodes themselves
  for node_info in node_list:
    logging.debug("Updating SSH key files of target node '%s'.",
                  node_info.name)
    if node_info.get_public_keys:
      node_data = {}
      _InitSshUpdateData(node_data, noded_cert_file, ssconf_store)
      all_keys = ssh.QueryPubKeyFile(None, key_file=pub_key_file)
      node_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
        (constants.SSHS_OVERRIDE, all_keys)

      try:
        utils.RetryByNumberOfTimes(
            constants.SSHS_MAX_RETRIES,
            errors.SshUpdateError,
            run_cmd_fn, cluster_name, node_info.name, pathutils.SSH_UPDATE,
            ssh_port_map.get(node_info.name), node_data,
            debug=ssh_update_debug, verbose=ssh_update_verbose,
            use_cluster_key=False, ask_key=False, strict_host_check=False)
      except errors.SshUpdateError as e:
        # Clean up the master's public key file if adding key fails
        if node_info.to_public_keys:
          ssh.RemovePublicKey(node_info.uuid)
        raise e

  # Update all nodes except master and the target nodes
  keys_by_uuid_auth = ssh.QueryPubKeyFile(
    [node_info.uuid for node_info in node_list
     if node_info.to_authorized_keys],
    key_file=pub_key_file)
  if to_authorized_keys:
    base_data[constants.SSHS_SSH_AUTHORIZED_KEYS] = \
      (constants.SSHS_ADD, keys_by_uuid_auth)

  pot_mc_data = base_data.copy()
  keys_by_uuid_pub = ssh.QueryPubKeyFile(
    [node_info.uuid for node_info in node_list
     if node_info.to_public_keys],
    key_file=pub_key_file)
  if to_public_keys:
    pot_mc_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
      (constants.SSHS_REPLACE_OR_ADD, keys_by_uuid_pub)

  all_nodes = ssconf_store.GetNodeList()
  master_node = ssconf_store.GetMasterNode()
  online_nodes = ssconf_store.GetOnlineNodeList()

  node_errors = []
  for node in all_nodes:
    if node == master_node:
      logging.debug("Skipping master node '%s'.", master_node)
      continue
    if node not in online_nodes:
      logging.debug("Skipping offline node '%s'.", node)
      continue
    if node in potential_master_candidates:
      logging.debug("Updating SSH key files of node '%s'.", node)
      try:
        utils.RetryByNumberOfTimes(
            constants.SSHS_MAX_RETRIES,
            errors.SshUpdateError,
            run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
            ssh_port_map.get(node), pot_mc_data,
            debug=ssh_update_debug, verbose=ssh_update_verbose,
            use_cluster_key=False, ask_key=False, strict_host_check=False)
      except errors.SshUpdateError as last_exception:
        error_msg = ("When adding the key of node '%s', updating SSH key"
                     " files of node '%s' failed after %s retries."
                     " Not trying again. Last error was: %s." %
                     (node_info.name, node, constants.SSHS_MAX_RETRIES,
                      last_exception))
        node_errors.append((node, error_msg))
        # We only log the error and don't throw an exception, because
        # one unreachable node shall not abort the entire procedure.
        logging.error(error_msg)

1626 else:
1627 if to_authorized_keys:
1628 run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
1629 ssh_port_map.get(node), base_data,
1630 debug=ssh_update_debug, verbose=ssh_update_verbose,
1631 use_cluster_key=False, ask_key=False,
1632 strict_host_check=False)
1633
1634 return node_errors
1635
1636
1637 # TODO: will be fixed with pending patch series.
1638 # pylint: disable=R0913
1639 def RemoveNodeSshKey(node_uuid, node_name,
1640 master_candidate_uuids,
1641 potential_master_candidates,
1642 master_uuid=None,
1643 keys_to_remove=None,
1644 from_authorized_keys=False,
1645 from_public_keys=False,
1646 clear_authorized_keys=False,
1647 clear_public_keys=False,
1648 pub_key_file=pathutils.SSH_PUB_KEYS,
1649 ssconf_store=None,
1650 noded_cert_file=pathutils.NODED_CERT_FILE,
1651 readd=False,
1652 run_cmd_fn=ssh.RunSshCmdWithStdin,
1653 ssh_update_debug=False,
1654 ssh_update_verbose=False):
1655 """Removes the node's SSH keys from the key files and distributes those.
1656
1657 Note that at least one of the flags C{from_authorized_keys},
1658 C{from_public_keys}, C{clear_authorized_keys}, and C{clear_public_keys}
1659 has to be set to C{True} for the function to perform any action at all.
1660 Not doing so will trigger an assertion in the function.
1661
1662 @type node_uuid: str
1663 @param node_uuid: UUID of the node whose key is removed
1664 @type node_name: str
  @param node_name: name of the node whose key is removed
1666 @type master_candidate_uuids: list of str
1667 @param master_candidate_uuids: list of UUIDs of the current master candidates
1668 @type potential_master_candidates: list of str
1669 @param potential_master_candidates: list of names of potential master
1670 candidates
1671 @type keys_to_remove: dict of str to list of str
  @param keys_to_remove: a dictionary mapping node UUIDs to lists of SSH keys
1673 to be removed. This list is supposed to be used only if the keys are not
1674 in the public keys file. This is for example the case when removing a
1675 master node's key.
1676 @type from_authorized_keys: boolean
1677 @param from_authorized_keys: whether or not the key should be removed
1678 from the C{authorized_keys} file
1679 @type from_public_keys: boolean
1680 @param from_public_keys: whether or not the key should be remove from
1681 the C{ganeti_pub_keys} file
1682 @type clear_authorized_keys: boolean
1683 @param clear_authorized_keys: whether or not the C{authorized_keys} file
1684 should be cleared on the node whose keys are removed
1685 @type clear_public_keys: boolean
1686 @param clear_public_keys: whether to clear the node's C{ganeti_pub_key} file
1687 @type readd: boolean
1688 @param readd: whether this is called during a readd operation.
1689 @rtype: list of string
1690 @returns: list of feedback messages
1691
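  Illustrative call (hypothetical UUID and name; the candidate lists are
  assumed to be supplied by the caller)::

    RemoveNodeSshKey("9b2f3a", "node3.example.com",
                     master_candidate_uuids, potential_master_candidates,
                     from_authorized_keys=True, from_public_keys=True)
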
1692 """
1693 node_list = [SshRemoveNodeInfo(uuid=node_uuid,
1694 name=node_name,
1695 from_authorized_keys=from_authorized_keys,
1696 from_public_keys=from_public_keys,
1697 clear_authorized_keys=clear_authorized_keys,
1698 clear_public_keys=clear_public_keys)]
1699 return RemoveNodeSshKeyBulk(node_list,
1700 master_candidate_uuids,
1701 potential_master_candidates,
1702 master_uuid=master_uuid,
1703 keys_to_remove=keys_to_remove,
1704 pub_key_file=pub_key_file,
1705 ssconf_store=ssconf_store,
1706 noded_cert_file=noded_cert_file,
1707 readd=readd,
1708 run_cmd_fn=run_cmd_fn,
1709 ssh_update_debug=ssh_update_debug,
1710 ssh_update_verbose=ssh_update_verbose)
1711
1712
1713 # Node info named tuple specifically for the use with RemoveNodeSshKeyBulk
1714 SshRemoveNodeInfo = collections.namedtuple(
1715 "SshRemoveNodeInfo",
1716 ["uuid",
1717 "name",
1718 "from_authorized_keys",
1719 "from_public_keys",
1720 "clear_authorized_keys",
1721 "clear_public_keys"])
1722
1723
1724 def RemoveNodeSshKeyBulk(node_list,
1725 master_candidate_uuids,
1726 potential_master_candidates,
1727 master_uuid=None,
1728 keys_to_remove=None,
1729 pub_key_file=pathutils.SSH_PUB_KEYS,
1730 ssconf_store=None,
1731 noded_cert_file=pathutils.NODED_CERT_FILE,
1732 readd=False,
1733 run_cmd_fn=ssh.RunSshCmdWithStdin,
1734 ssh_update_debug=False,
1735 ssh_update_verbose=False):
1736 """Removes the node's SSH keys from the key files and distributes those.
1737
1738 Note that at least one of the flags C{from_authorized_keys},
1739 C{from_public_keys}, C{clear_authorized_keys}, and C{clear_public_keys}
1740 of at least one node has to be set to C{True} for the function to perform any
1741 action at all. Not doing so will trigger an assertion in the function.
1742
1743 @type node_list: list of C{SshRemoveNodeInfo}.
  @param node_list: list of information about nodes whose keys are being
    removed
1745 @type master_candidate_uuids: list of str
1746 @param master_candidate_uuids: list of UUIDs of the current master candidates
1747 @type potential_master_candidates: list of str
1748 @param potential_master_candidates: list of names of potential master
1749 candidates
1750 @type keys_to_remove: dict of str to list of str
  @param keys_to_remove: a dictionary mapping node UUIDs to lists of SSH keys
1752 to be removed. This list is supposed to be used only if the keys are not
1753 in the public keys file. This is for example the case when removing a
1754 master node's key.
1755 @type readd: boolean
1756 @param readd: whether this is called during a readd operation.
1757 @rtype: list of string
1758 @returns: list of feedback messages
1759
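  A minimal sketch of assembling the bulk input (hypothetical UUID and
  name values)::

    node_list = [SshRemoveNodeInfo(uuid="1d2e3f",
                                   name="node4.example.com",
                                   from_authorized_keys=True,
                                   from_public_keys=True,
                                   clear_authorized_keys=False,
                                   clear_public_keys=False)]
    RemoveNodeSshKeyBulk(node_list, master_candidate_uuids,
                         potential_master_candidates)
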
1760 """
1761 # Non-disruptive error messages, list of (node, msg) pairs
1762 result_msgs = []
1763
1764 # whether there are any keys to be added or retrieved at all
1765 from_authorized_keys = any([node_info.from_authorized_keys for node_info in
1766 node_list])
1767 from_public_keys = any([node_info.from_public_keys for node_info in
1768 node_list])
1769 clear_authorized_keys = any([node_info.clear_authorized_keys for node_info in
1770 node_list])
1771 clear_public_keys = any([node_info.clear_public_keys for node_info in
1772 node_list])
1773
1774 # Make sure at least one of these flags is true.
1775 if not (from_authorized_keys or from_public_keys or clear_authorized_keys
1776 or clear_public_keys):
1777 raise errors.SshUpdateError("No removal from any key file was requested.")
1778
1779 if not ssconf_store:
1780 ssconf_store = ssconf.SimpleStore()
1781
1782 master_node = ssconf_store.GetMasterNode()
1783 ssh_port_map = ssconf_store.GetSshPortMap()
1784
1785 all_keys_to_remove = {}
1786 if from_authorized_keys or from_public_keys:
1787 for node_info in node_list:
1788 # Skip nodes that don't actually need any keys to be removed.
1789 if not (node_info.from_authorized_keys or node_info.from_public_keys):
1790 continue
1791 if node_info.name == master_node and not keys_to_remove:
1792 raise errors.SshUpdateError("Cannot remove the master node's keys.")
1793 if keys_to_remove:
1794 keys = keys_to_remove
1795 else:
1796 keys = ssh.QueryPubKeyFile([node_info.uuid], key_file=pub_key_file)
1797 if (not keys or node_info.uuid not in keys) and not readd:
          raise errors.SshUpdateError("Node '%s' not found in the list of"
                                      " public SSH keys. It seems someone is"
                                      " trying to remove a key from outside"
                                      " the cluster!" % node_info.uuid)
1802 # During an upgrade all nodes have the master key. In this case we
1803 # should not remove it to avoid accidentally shutting down cluster
1804 # SSH communication
1805 master_keys = None
1806 if master_uuid:
1807 master_keys = ssh.QueryPubKeyFile([master_uuid],
1808 key_file=pub_key_file)
        for master_key in master_keys.get(master_uuid, []):
1810 if master_key in keys[node_info.uuid]:
1811 keys[node_info.uuid].remove(master_key)
1812
1813 all_keys_to_remove.update(keys)
1814
1815 if all_keys_to_remove:
1816 base_data = {}
1817 _InitSshUpdateData(base_data, noded_cert_file, ssconf_store)
1818 cluster_name = base_data[constants.SSHS_CLUSTER_NAME]
1819
1820 if from_authorized_keys:
1821 # UUIDs of nodes that are supposed to be removed from the
1822 # authorized_keys files.
1823 nodes_remove_from_authorized_keys = [
1824 node_info.uuid for node_info in node_list
1825 if node_info.from_authorized_keys]
1826 keys_to_remove_from_authorized_keys = dict([
1827 (uuid, keys) for (uuid, keys) in all_keys_to_remove.items()
1828 if uuid in nodes_remove_from_authorized_keys])
1829 base_data[constants.SSHS_SSH_AUTHORIZED_KEYS] = \
1830 (constants.SSHS_REMOVE, keys_to_remove_from_authorized_keys)
1831 (auth_key_file, _) = \
1832 ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False,
1833 dircheck=False)
1834
1835 for uuid in nodes_remove_from_authorized_keys:
1836 ssh.RemoveAuthorizedKeys(auth_key_file,
1837 keys_to_remove_from_authorized_keys[uuid])
1838
1839 pot_mc_data = base_data.copy()
1840
1841 if from_public_keys:
1842 nodes_remove_from_public_keys = [
1843 node_info.uuid for node_info in node_list
1844 if node_info.from_public_keys]
1845 keys_to_remove_from_public_keys = dict([
1846 (uuid, keys) for (uuid, keys) in all_keys_to_remove.items()
1847 if uuid in nodes_remove_from_public_keys])
1848 pot_mc_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1849 (constants.SSHS_REMOVE, keys_to_remove_from_public_keys)
1850
1851 all_nodes = ssconf_store.GetNodeList()
1852 online_nodes = ssconf_store.GetOnlineNodeList()
1853 all_nodes_to_remove = [node_info.name for node_info in node_list]
1854 logging.debug("Removing keys of nodes '%s' from all nodes but itself and"
1855 " master.", ", ".join(all_nodes_to_remove))
1856 for node in all_nodes:
1857 if node == master_node:
1858 logging.debug("Skipping master node '%s'.", master_node)
1859 continue
1860 if node not in online_nodes:
1861 logging.debug("Skipping offline node '%s'.", node)
1862 continue
1863 if node in all_nodes_to_remove:
1864 logging.debug("Skipping node whose key is removed itself '%s'.", node)
1865 continue
1866 ssh_port = ssh_port_map.get(node)
1867 if not ssh_port:
1868 raise errors.OpExecError("No SSH port information available for"
1869 " node '%s', map: %s." %
1870 (node, ssh_port_map))
1871 error_msg_final = ("When removing the key of node '%s', updating the"
1872 " SSH key files of node '%s' failed. Last error"
1873 " was: %s.")
1874 if node in potential_master_candidates:
1875 logging.debug("Updating key setup of potential master candidate node"
1876 " %s.", node)
1877 try:
1878 utils.RetryByNumberOfTimes(
1879 constants.SSHS_MAX_RETRIES,
1880 errors.SshUpdateError,
1881 run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
1882 ssh_port, pot_mc_data,
1883 debug=ssh_update_debug, verbose=ssh_update_verbose,
1884 use_cluster_key=False, ask_key=False, strict_host_check=False)
1885 except errors.SshUpdateError as last_exception:
1886 error_msg = error_msg_final % (
1887 node_info.name, node, last_exception)
1888 result_msgs.append((node, error_msg))
1889 logging.error(error_msg)
1890
1891 else:
1892 if from_authorized_keys:
1893 logging.debug("Updating key setup of normal node %s.", node)
1894 try:
1895 utils.RetryByNumberOfTimes(
1896 constants.SSHS_MAX_RETRIES,
1897 errors.SshUpdateError,
1898 run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
1899 ssh_port, base_data,
1900 debug=ssh_update_debug, verbose=ssh_update_verbose,
1901 use_cluster_key=False, ask_key=False, strict_host_check=False)
1902 except errors.SshUpdateError as last_exception:
1903 error_msg = error_msg_final % (
1904 node_info.name, node, last_exception)
1905 result_msgs.append((node, error_msg))
1906 logging.error(error_msg)
1907
1908 for node_info in node_list:
1909 if node_info.clear_authorized_keys or node_info.from_public_keys or \
1910 node_info.clear_public_keys:
1911 data = {}
1912 _InitSshUpdateData(data, noded_cert_file, ssconf_store)
1913 cluster_name = data[constants.SSHS_CLUSTER_NAME]
1914 ssh_port = ssh_port_map.get(node_info.name)
1915 if not ssh_port:
        raise errors.OpExecError("No SSH port information available for"
                                 " node '%s', which is leaving the cluster." %
                                 node_info.name)
1918
1919 if node_info.clear_authorized_keys:
1920 # The 'authorized_keys' file is not solely managed by Ganeti. Therefore,
1921 # we have to specify exactly which keys to clear to leave keys untouched
1922 # that were not added by Ganeti.
1923 other_master_candidate_uuids = [uuid for uuid in master_candidate_uuids
1924 if uuid != node_info.uuid]
1925 candidate_keys = ssh.QueryPubKeyFile(other_master_candidate_uuids,
1926 key_file=pub_key_file)
1927 data[constants.SSHS_SSH_AUTHORIZED_KEYS] = \
1928 (constants.SSHS_REMOVE, candidate_keys)
1929
1930 if node_info.clear_public_keys:
1931 data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1932 (constants.SSHS_CLEAR, {})
1933 elif node_info.from_public_keys:
1934 # Since clearing the public keys subsumes removing just a single key,
1935 # we only do it if clear_public_keys is 'False'.
1936
1937 if all_keys_to_remove:
1938 data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1939 (constants.SSHS_REMOVE, all_keys_to_remove)
1940
      # If there are no changes for this node's key files, skip this node
      if not (constants.SSHS_SSH_PUBLIC_KEYS in data or
              constants.SSHS_SSH_AUTHORIZED_KEYS in data):
        continue
1945
1946 logging.debug("Updating SSH key setup of target node '%s'.",
1947 node_info.name)
1948 try:
1949 utils.RetryByNumberOfTimes(
1950 constants.SSHS_MAX_RETRIES,
1951 errors.SshUpdateError,
1952 run_cmd_fn, cluster_name, node_info.name, pathutils.SSH_UPDATE,
1953 ssh_port, data,
1954 debug=ssh_update_debug, verbose=ssh_update_verbose,
1955 use_cluster_key=False, ask_key=False, strict_host_check=False)
1956 except errors.SshUpdateError as last_exception:
1957 result_msgs.append(
1958 (node_info.name,
1959 ("Removing SSH keys from node '%s' failed."
1960 " This can happen when the node is already unreachable."
1961 " Error: %s" % (node_info.name, last_exception))))
1962
1963 if all_keys_to_remove and from_public_keys:
1964 for node_uuid in nodes_remove_from_public_keys:
1965 ssh.RemovePublicKey(node_uuid, key_file=pub_key_file)
1966
1967 return result_msgs
1968 # pylint: enable=R0913
1969
1970
1971 def _GenerateNodeSshKey(node_uuid, node_name, ssh_port_map, ssh_key_type,
1972 ssh_key_bits, pub_key_file=pathutils.SSH_PUB_KEYS,
1973 ssconf_store=None,
1974 noded_cert_file=pathutils.NODED_CERT_FILE,
1975 run_cmd_fn=ssh.RunSshCmdWithStdin,
1976 suffix="",
1977 ssh_update_debug=False,
1978 ssh_update_verbose=False):
1979 """Generates the root SSH key pair on the node.
1980
1981 @type node_uuid: str
  @param node_uuid: UUID of the node whose key is generated
  @type node_name: str
  @param node_name: name of the node whose key is generated
1985 @type ssh_port_map: dict of str to int
1986 @param ssh_port_map: mapping of node names to their SSH port
1987 @type ssh_key_type: One of L{constants.SSHK_ALL}
1988 @param ssh_key_type: the type of SSH key to be generated
1989 @type ssh_key_bits: int
1990 @param ssh_key_bits: the length of the key to be generated
1991
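  The request sent to the node via C{run_cmd_fn} carries a single generate
  entry; for example (hypothetical parameters), a 2048-bit RSA key with no
  suffix::

    data[constants.SSHS_GENERATE] = (constants.SSHK_RSA, 2048, "")
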
1992 """
1993 if not ssconf_store:
1994 ssconf_store = ssconf.SimpleStore()
1995
1996 keys_by_uuid = ssh.QueryPubKeyFile([node_uuid], key_file=pub_key_file)
1997 if not keys_by_uuid or node_uuid not in keys_by_uuid:
1998 raise errors.SshUpdateError("Node %s (UUID: %s) whose key is requested to"
1999 " be regenerated is not registered in the"
2000 " public keys file." % (node_name, node_uuid))
2001
2002 data = {}
2003 _InitSshUpdateData(data, noded_cert_file, ssconf_store)
2004 cluster_name = data[constants.SSHS_CLUSTER_NAME]
2005 data[constants.SSHS_GENERATE] = (ssh_key_type, ssh_key_bits, suffix)
2006
2007 run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE,
2008 ssh_port_map.get(node_name), data,
2009 debug=ssh_update_debug, verbose=ssh_update_verbose,
2010 use_cluster_key=False, ask_key=False, strict_host_check=False)
2011
2012
2013 def _GetMasterNodeUUID(node_uuid_name_map, master_node_name):
2014 master_node_uuids = [node_uuid for (node_uuid, node_name)
2015 in node_uuid_name_map
2016 if node_name == master_node_name]
2017 if len(master_node_uuids) != 1:
    raise errors.SshUpdateError("No unique master UUID found. Master node"
                                " name: '%s', matching UUIDs: %s" %
                                (master_node_name, master_node_uuids))
2021 return master_node_uuids[0]
2022
2023
2024 def _GetOldMasterKeys(master_node_uuid, pub_key_file):
2025 old_master_keys_by_uuid = ssh.QueryPubKeyFile([master_node_uuid],
2026 key_file=pub_key_file)
2027 if not old_master_keys_by_uuid:
2028 raise errors.SshUpdateError("No public key of the master node (UUID '%s')"
2029 " found, not generating a new key."
2030 % master_node_uuid)
2031 return old_master_keys_by_uuid
2032
2033
2034 def _GetNewMasterKey(root_keyfiles, master_node_uuid):
2035 new_master_keys = []
2036 for (_, (_, public_key_file)) in root_keyfiles.items():
2037 public_key_dir = os.path.dirname(public_key_file)
2038 public_key_file_tmp_filename = \
2039 os.path.splitext(os.path.basename(public_key_file))[0] \
2040 + constants.SSHS_MASTER_SUFFIX + ".pub"
2041 public_key_path_tmp = os.path.join(public_key_dir,
2042 public_key_file_tmp_filename)
2043 if os.path.exists(public_key_path_tmp):
2044 # for some key types, there might not be any keys
2045 key = utils.ReadFile(public_key_path_tmp)
2046 new_master_keys.append(key)
2047 if not new_master_keys:
2048 raise errors.SshUpdateError("Cannot find any type of temporary SSH key.")
2049 return {master_node_uuid: new_master_keys}
2050
2051
2052 def _ReplaceMasterKeyOnMaster(root_keyfiles):
2053 number_of_moves = 0
2054 for (_, (private_key_file, public_key_file)) in root_keyfiles.items():
2055 key_dir = os.path.dirname(public_key_file)
2056 private_key_file_tmp = \
2057 os.path.basename(private_key_file) + constants.SSHS_MASTER_SUFFIX
2058 public_key_file_tmp = private_key_file_tmp + ".pub"
2059 private_key_path_tmp = os.path.join(key_dir,
2060 private_key_file_tmp)
2061 public_key_path_tmp = os.path.join(key_dir,
2062 public_key_file_tmp)
2063 if os.path.exists(public_key_file):
2064 utils.CreateBackup(public_key_file)
2065 utils.RemoveFile(public_key_file)
2066 if os.path.exists(private_key_file):
2067 utils.CreateBackup(private_key_file)
2068 utils.RemoveFile(private_key_file)
2069 if os.path.exists(public_key_path_tmp) and \
2070 os.path.exists(private_key_path_tmp):
2071 # for some key types, there might not be any keys
2072 shutil.move(public_key_path_tmp, public_key_file)
2073 shutil.move(private_key_path_tmp, private_key_file)
2074 number_of_moves += 1
2075 if not number_of_moves:
2076 raise errors.SshUpdateError("Could not move at least one master SSH key.")
2077
2078
2079 def RenewSshKeys(node_uuids, node_names, master_candidate_uuids,
2080 potential_master_candidates, old_key_type, new_key_type,
2081 new_key_bits,
2082 ganeti_pub_keys_file=pathutils.SSH_PUB_KEYS,
2083 ssconf_store=None,
2084 noded_cert_file=pathutils.NODED_CERT_FILE,
2085 run_cmd_fn=ssh.RunSshCmdWithStdin,
2086 ssh_update_debug=False,
2087 ssh_update_verbose=False):
2088 """Renews all SSH keys and updates authorized_keys and ganeti_pub_keys.
2089
2090 @type node_uuids: list of str
2091 @param node_uuids: list of node UUIDs whose keys should be renewed
2092 @type node_names: list of str
  @param node_names: list of node names whose keys should be renewed. This
    list should match the C{node_uuids} parameter
2095 @type master_candidate_uuids: list of str
2096 @param master_candidate_uuids: list of UUIDs of master candidates or
2097 master node
2098 @type old_key_type: One of L{constants.SSHK_ALL}
2099 @param old_key_type: the type of SSH key already present on nodes
2100 @type new_key_type: One of L{constants.SSHK_ALL}
2101 @param new_key_type: the type of SSH key to be generated
2102 @type new_key_bits: int
2103 @param new_key_bits: the length of the key to be generated
2104 @type ganeti_pub_keys_file: str
  @param ganeti_pub_keys_file: file path of the public key file
2106 @type noded_cert_file: str
2107 @param noded_cert_file: path of the noded SSL certificate file
2108 @type run_cmd_fn: function
2109 @param run_cmd_fn: function to run commands on remote nodes via SSH
  @raise ProgrammerError: if node_uuids and node_names don't match;
    SshUpdateError if a node's key is missing from the public key file,
    if a node's new SSH key could not be fetched from it, or if there is
    no entry or more than one entry for the master node in the public
    key list.
2115
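  Illustrative call, renewing from DSA to 2048-bit RSA keys (the list
  arguments are assumed to be provided by the caller)::

    RenewSshKeys(node_uuids, node_names, master_candidate_uuids,
                 potential_master_candidates,
                 constants.SSHK_DSA, constants.SSHK_RSA, 2048)
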
2116 """
2117 if not ssconf_store:
2118 ssconf_store = ssconf.SimpleStore()
2119 cluster_name = ssconf_store.GetClusterName()
2120
  if len(node_uuids) != len(node_names):
    raise errors.ProgrammerError("Lists of node UUIDs and node names"
                                 " do not match in length.")
2124
2125 (_, root_keyfiles) = \
2126 ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
2127 (_, old_pub_keyfile) = root_keyfiles[old_key_type]
2128 (_, new_pub_keyfile) = root_keyfiles[new_key_type]
2129 old_master_key = utils.ReadFile(old_pub_keyfile)
2130
2131 node_uuid_name_map = zip(node_uuids, node_names)
2132
2133 master_node_name = ssconf_store.GetMasterNode()
2134 master_node_uuid = _GetMasterNodeUUID(node_uuid_name_map, master_node_name)
2135 ssh_port_map = ssconf_store.GetSshPortMap()
2136 # List of all node errors that happened, but which did not abort the
2137 # procedure as a whole. It is important that this is a list to have a
2138 # somewhat chronological history of events.
2139 all_node_errors = []
2140
2141 # process non-master nodes
2142
2143 # keys to add in bulk at the end
2144 node_keys_to_add = []
2145
2146 # list of all nodes
2147 node_list = []
2148
2149 # list of keys to be removed before generating new keys
2150 node_info_to_remove = []
2151
2152 for node_uuid, node_name in node_uuid_name_map:
2153 if node_name == master_node_name:
2154 continue
2155 master_candidate = node_uuid in master_candidate_uuids
2156 potential_master_candidate = node_name in potential_master_candidates
2157 node_list.append((node_uuid, node_name, master_candidate,
2158 potential_master_candidate))
2159
2160 keys_by_uuid = ssh.QueryPubKeyFile([node_uuid],
2161 key_file=ganeti_pub_keys_file)
2162 if not keys_by_uuid:
2163 raise errors.SshUpdateError("No public key of node %s (UUID %s) found,"
2164 " not generating a new key."
2165 % (node_name, node_uuid))
2166
2167 if master_candidate:
2168 logging.debug("Fetching old SSH key from node '%s'.", node_name)
2169 old_pub_key = ssh.ReadRemoteSshPubKeys(old_pub_keyfile,
2170 node_name, cluster_name,
2171 ssh_port_map[node_name],
2172 False, # ask_key
2173 False) # key_check
2174 if old_pub_key != old_master_key:
2175 # If we are already in a multi-key setup (that is past Ganeti 2.12),
2176 # we can safely remove the old key of the node. Otherwise, we cannot
2177 # remove that node's key, because it is also the master node's key
2178 # and that would terminate all communication from the master to the
2179 # node.
2180 node_info_to_remove.append(SshRemoveNodeInfo(
2181 uuid=node_uuid,
2182 name=node_name,
2183 from_authorized_keys=master_candidate,
2184 from_public_keys=False,
2185 clear_authorized_keys=False,
2186 clear_public_keys=False))
2187 else:
2188 logging.debug("Old key of node '%s' is the same as the current master"
2189 " key. Not deleting that key on the node.", node_name)
2190
2191 logging.debug("Removing old SSH keys of all master candidates.")
2192 if node_info_to_remove:
2193 node_errors = RemoveNodeSshKeyBulk(
2194 node_info_to_remove,
2195 master_candidate_uuids,
2196 potential_master_candidates,
2197 master_uuid=master_node_uuid,
2198 ssh_update_debug=ssh_update_debug,
2199 ssh_update_verbose=ssh_update_verbose)
2200 if node_errors:
2201 all_node_errors = all_node_errors + node_errors
2202
2203 for (node_uuid, node_name, master_candidate, potential_master_candidate) \
2204 in node_list:
2205
2206 logging.debug("Generating new SSH key for node '%s'.", node_name)
2207 _GenerateNodeSshKey(node_uuid, node_name, ssh_port_map, new_key_type,
2208 new_key_bits, pub_key_file=ganeti_pub_keys_file,
2209 ssconf_store=ssconf_store,
2210 noded_cert_file=noded_cert_file,
2211 run_cmd_fn=run_cmd_fn,
2212 ssh_update_verbose=ssh_update_verbose,
2213 ssh_update_debug=ssh_update_debug)
2214
2215 try:
2216 logging.debug("Fetching newly created SSH key from node '%s'.", node_name)
2217 pub_key = ssh.ReadRemoteSshPubKeys(new_pub_keyfile,
2218 node_name, cluster_name,
2219 ssh_port_map[node_name],
2220 False, # ask_key
2221 False) # key_check
    except Exception as err:  # pylint: disable=W0703
      raise errors.SshUpdateError("Could not fetch key of node %s"
                                  " (UUID %s): %s" % (node_name, node_uuid,
                                                      err))
2225
2226 if potential_master_candidate:
2227 ssh.RemovePublicKey(node_uuid, key_file=ganeti_pub_keys_file)
2228 ssh.AddPublicKey(node_uuid, pub_key, key_file=ganeti_pub_keys_file)
2229
2230 node_info = SshAddNodeInfo(name=node_name,
2231 uuid=node_uuid,
2232 to_authorized_keys=master_candidate,
2233 to_public_keys=potential_master_candidate,
2234 get_public_keys=True)
2235 node_keys_to_add.append(node_info)
2236
2237 node_errors = AddNodeSshKeyBulk(
2238 node_keys_to_add, potential_master_candidates,
2239 pub_key_file=ganeti_pub_keys_file, ssconf_store=ssconf_store,
2240 noded_cert_file=noded_cert_file,
2241 run_cmd_fn=run_cmd_fn,
2242 ssh_update_debug=ssh_update_debug,
2243 ssh_update_verbose=ssh_update_verbose)
2244 if node_errors:
2245 all_node_errors = all_node_errors + node_errors
2246
2247 # Renewing the master node's key
2248
2249 # Preserve the old keys for now
2250 old_master_keys_by_uuid = _GetOldMasterKeys(master_node_uuid,
2251 ganeti_pub_keys_file)
2252
  # Generate a new master key with a suffix; don't touch the old one for now
  logging.debug("Generating new SSH key for the master node.")
2255 _GenerateNodeSshKey(master_node_uuid, master_node_name, ssh_port_map,
2256 new_key_type, new_key_bits,
2257 pub_key_file=ganeti_pub_keys_file,
2258 ssconf_store=ssconf_store,
2259 noded_cert_file=noded_cert_file,
2260 run_cmd_fn=run_cmd_fn,
2261 suffix=constants.SSHS_MASTER_SUFFIX,
2262 ssh_update_debug=ssh_update_debug,
2263 ssh_update_verbose=ssh_update_verbose)
2264 # Read newly created master key
2265 new_master_key_dict = _GetNewMasterKey(root_keyfiles, master_node_uuid)
2266
  # Replace master key in the master node's public key file
2268 ssh.RemovePublicKey(master_node_uuid, key_file=ganeti_pub_keys_file)
2269 for pub_key in new_master_key_dict[master_node_uuid]:
2270 ssh.AddPublicKey(master_node_uuid, pub_key, key_file=ganeti_pub_keys_file)
2271
  # Add the new master key to all nodes' public and authorized keys
2273 logging.debug("Add new master key to all nodes.")
2274 node_errors = AddNodeSshKey(
2275 master_node_uuid, master_node_name, potential_master_candidates,
2276 to_authorized_keys=True, to_public_keys=True,
2277 get_public_keys=False, pub_key_file=ganeti_pub_keys_file,
2278 ssconf_store=ssconf_store, noded_cert_file=noded_cert_file,
2279 run_cmd_fn=run_cmd_fn,
2280 ssh_update_debug=ssh_update_debug,
2281 ssh_update_verbose=ssh_update_verbose)
2282 if node_errors:
2283 all_node_errors = all_node_errors + node_errors
2284
  # Remove the old key file and rename the new key to its final filename
2286 _ReplaceMasterKeyOnMaster(root_keyfiles)
2287
2288 # Remove old key from authorized keys
2289 (auth_key_file, _) = \
2290 ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
2291 ssh.RemoveAuthorizedKeys(auth_key_file,
2292 old_master_keys_by_uuid[master_node_uuid])
2293
  # Remove the old key from all nodes' authorized_keys files
2295 logging.debug("Remove the old master key from all nodes.")
2296 node_errors = RemoveNodeSshKey(
2297 master_node_uuid, master_node_name, master_candidate_uuids,
2298 potential_master_candidates,
2299 keys_to_remove=old_master_keys_by_uuid, from_authorized_keys=True,
2300 from_public_keys=False, clear_authorized_keys=False,
2301 clear_public_keys=False,
2302 ssh_update_debug=ssh_update_debug,
2303 ssh_update_verbose=ssh_update_verbose)
2304 if node_errors:
2305 all_node_errors = all_node_errors + node_errors
2306
2307 return all_node_errors
2308
2309
2310 def GetBlockDevSizes(devices):
2311 """Return the size of the given block devices
2312
2313 @type devices: list
2314 @param devices: list of block device nodes to query
2315 @rtype: dict
2316 @return:
2317 dictionary of all block devices under /dev (key). The value is their
2318 size in MiB.
2319
2320 {'/dev/disk/by-uuid/123456-12321231-312312-312': 124}
2321
2322 """
2323 DEV_PREFIX = "/dev/"
2324 blockdevs = {}
2325
2326 for devpath in devices:
2327 if not utils.IsBelowDir(DEV_PREFIX, devpath):
2328 continue
2329
2330 try:
2331 st = os.stat(devpath)
2332 except EnvironmentError, err:
2333 logging.warning("Error stat()'ing device %s: %s", devpath, str(err))
2334 continue
2335
2336 if stat.S_ISBLK(st.st_mode):
2337 result = utils.RunCmd(["blockdev", "--getsize64", devpath])
2338 if result.failed:
2339 # We don't want to fail, just do not list this device as available
2340 logging.warning("Cannot get size for block device %s", devpath)
2341 continue
2342
2343 size = int(result.stdout) / (1024 * 1024)
2344 blockdevs[devpath] = size
2345 return blockdevs
2346
2347
2348 def GetVolumeList(vg_names):
2349 """Compute list of logical volumes and their size.
2350
2351 @type vg_names: list
2352 @param vg_names: the volume groups whose LVs we should list, or
2353 empty for all volume groups
2354 @rtype: dict
2355 @return:
    dictionary of all partitions (key) with value being a tuple of
2357 their size (in MiB), inactive and online status::
2358
2359 {'xenvg/test1': ('20.06', True, True)}
2360
2361 in case of errors, a string is returned with the error
2362 details.
2363
2364 """
2365 lvs = {}
2366 sep = "|"
2367 if not vg_names:
2368 vg_names = []
2369 result = utils.RunCmd(["lvs", "--noheadings", "--units=m", "--nosuffix",
2370 "--separator=%s" % sep,
2371 "-ovg_name,lv_name,lv_size,lv_attr"] + vg_names)
2372 if result.failed:
2373 _Fail("Failed to list logical volumes, lvs output: %s", result.output)
2374
2375 for line in result.stdout.splitlines():
2376 line = line.strip()
2377 match = _LVSLINE_REGEX.match(line)
2378 if not match:
2379 logging.error("Invalid line returned from lvs output: '%s'", line)
2380 continue
2381 vg_name, name, size, attr = match.groups()
2382 inactive = attr[4] == "-"
2383 online = attr[5] == "o"
2384 virtual = attr[0] == "v"
2385 if virtual:
2386 # we don't want to report such volumes as existing, since they
2387 # don't really hold data
2388 continue
2389 lvs[vg_name + "/" + name] = (size, inactive, online)
2390
2391 return lvs
2392
2393
2394 def ListVolumeGroups():
2395 """List the volume groups and their size.
2396
2397 @rtype: dict
2398 @return: dictionary with keys volume name and values the
2399 size of the volume
2400
2401 """
2402 return utils.ListVolumeGroups()
2403
2404
2405 def NodeVolumes():
2406 """List all volumes on this node.
2407
2408 @rtype: list
2409 @return:
2410 A list of dictionaries, each having four keys:
2411 - name: the logical volume name,
2412 - size: the size of the logical volume
2413 - dev: the physical device on which the LV lives
2414 - vg: the volume group to which it belongs
2415
2416 In case of errors, we return an empty list and log the
2417 error.
2418
2419 Note that since a logical volume can live on multiple physical
2420 volumes, the resulting list might include a logical volume
2421 multiple times.
2422
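  Example result entry (illustrative values)::

    {"name": "test1", "size": "20.06", "dev": "/dev/sda1", "vg": "xenvg"}
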
2423 """
2424 result = utils.RunCmd(["lvs", "--noheadings", "--units=m", "--nosuffix",
2425 "--separator=|",
2426 "--options=lv_name,lv_size,devices,vg_name"])
2427 if result.failed:
2428 _Fail("Failed to list logical volumes, lvs output: %s",
2429 result.output)
2430
2431 def parse_dev(dev):
2432 return dev.split("(")[0]
2433
2434 def handle_dev(dev):
2435 return [parse_dev(x) for x in dev.split(",")]
2436
2437 def map_line(line):
2438 line = [v.strip() for v in line]
2439 return [{"name": line[0], "size": line[1],
2440 "dev": dev, "vg": line[3]} for dev in handle_dev(line[2])]
2441
2442 all_devs = []
2443 for line in result.stdout.splitlines():
2444 if line.count("|") >= 3:
2445 all_devs.extend(map_line(line.split("|")))
2446 else:
2447 logging.warning("Strange line in the output from lvs: '%s'", line)
2448 return all_devs
2449
2450
2451 def BridgesExist(bridges_list):
2452 """Check if a list of bridges exist on the current node.
2453
2454 @rtype: boolean
2455 @return: C{True} if all of them exist, C{False} otherwise
2456
2457 """
2458 missing = []
2459 for bridge in bridges_list:
2460 if not utils.BridgeExists(bridge):
2461 missing.append(bridge)
2462
2463 if missing:
2464 _Fail("Missing bridges %s", utils.CommaJoin(missing))
2465
2466
2467 def GetInstanceListForHypervisor(hname, hvparams=None,
2468 get_hv_fn=hypervisor.GetHypervisor):
2469 """Provides a list of instances of the given hypervisor.
2470
2471 @type hname: string
2472 @param hname: name of the hypervisor
2473 @type hvparams: dict of strings
2474 @param hvparams: hypervisor parameters for the given hypervisor
2475 @type get_hv_fn: function
2476 @param get_hv_fn: function that returns a hypervisor for the given hypervisor
2477 name; optional parameter to increase testability
2478
2479 @rtype: list
2480 @return: a list of all running instances on the current node
2481 - instance1.example.com
2482 - instance2.example.com
2483
2484 """
2485 try:
2486 return get_hv_fn(hname).ListInstances(hvparams=hvparams)
2487 except errors.HypervisorError, err:
2488 _Fail("Error enumerating instances (hypervisor %s): %s",
2489 hname, err, exc=True)
2490
2491
2492 def GetInstanceList(hypervisor_list, all_hvparams=None,
2493 get_hv_fn=hypervisor.GetHypervisor):
2494 """Provides a list of instances.
2495
2496 @type hypervisor_list: list
2497 @param hypervisor_list: the list of hypervisors to query information
2498 @type all_hvparams: dict of dict of strings
2499 @param all_hvparams: a dictionary mapping hypervisor types to respective
2500 cluster-wide hypervisor parameters
2501 @type get_hv_fn: function
2502 @param get_hv_fn: function that returns a hypervisor for the given hypervisor
2503 name; optional parameter to increase testability
2504
2505 @rtype: list
2506 @return: a list of all running instances on the current node
2507 - instance1.example.com
2508 - instance2.example.com
2509
2510 """
2511 results = []
2512 for hname in hypervisor_list:
2513 hvparams = all_hvparams[hname]
2514 results.extend(GetInstanceListForHypervisor(hname, hvparams=hvparams,
2515 get_hv_fn=get_hv_fn))
2516 return results
2517
2518
2519 def GetInstanceInfo(instance, hname, hvparams=None):
2520 """Gives back the information about an instance as a dictionary.
2521
2522 @type instance: string
2523 @param instance: the instance name
2524 @type hname: string
2525 @param hname: the hypervisor type of the instance
2526 @type hvparams: dict of strings
2527 @param hvparams: the instance's hvparams
2528
2529 @rtype: dict
2530 @return: dictionary with the following keys:
2531 - memory: memory size of instance (int)
2532 - state: state of instance (HvInstanceState)
2533 - time: cpu time of instance (float)
2534 - vcpus: the number of vcpus (int)
2535
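  Example result (illustrative values; C{0} standing for a running
  L{hv_base.HvInstanceState})::

    {"memory": 512, "state": 0, "time": 12.5, "vcpus": 2}
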
2536 """
2537 output = {}
2538
2539 iinfo = hypervisor.GetHypervisor(hname).GetInstanceInfo(instance,
2540 hvparams=hvparams)
2541 if iinfo is not None:
2542 output["memory"] = iinfo[2]
2543 output["vcpus"] = iinfo[3]
2544 output["state"] = iinfo[4]
2545 output["time"] = iinfo[5]
2546
2547 return output
2548
2549
2550 def GetInstanceMigratable(instance):
2551 """Computes whether an instance can be migrated.
2552
2553 @type instance: L{objects.Instance}
2554 @param instance: object representing the instance to be checked.
2555
  @rtype: None
  @raise RPCFail: if the instance is not running on this node; missing
    disk symlinks are only logged as warnings
2560
2561 """
2562 hyper = hypervisor.GetHypervisor(instance.hypervisor)
2563 iname = instance.name
2564 if iname not in hyper.ListInstances(hvparams=instance.hvparams):
2565 _Fail("Instance %s is not running", iname)
2566
2567 for idx in range(len(instance.disks_info)):
2568 link_name = _GetBlockDevSymlinkPath(iname, idx)
2569 if not os.path.islink(link_name):
2570 logging.warning("Instance %s is missing symlink %s for disk %d",
2571 iname, link_name, idx)
2572
2573
2574 def GetAllInstancesInfo(hypervisor_list, all_hvparams):
2575 """Gather data about all instances.
2576
2577 This is the equivalent of L{GetInstanceInfo}, except that it
2578 computes data for all instances at once, thus being faster if one
2579 needs data about more than one instance.
2580
2581 @type hypervisor_list: list
2582 @param hypervisor_list: list of hypervisors to query for instance data
2583 @type all_hvparams: dict of dict of strings
2584 @param all_hvparams: mapping of hypervisor names to hvparams
2585
2586 @rtype: dict
2587 @return: dictionary of instance: data, with data having the following keys:
2588 - memory: memory size of instance (int)
2589 - state: xen state of instance (string)
2590 - time: cpu time of instance (float)
2591 - vcpus: the number of vcpus
2592
2593 """
2594 output = {}
2595 for hname in hypervisor_list:
2596 hvparams = all_hvparams[hname]
2597 iinfo = hypervisor.GetHypervisor(hname).GetAllInstancesInfo(hvparams)
2598 if iinfo:
2599 for name, _, memory, vcpus, state, times in iinfo:
2600 value = {
2601 "memory": memory,
2602 "vcpus": vcpus,
2603 "state": state,
2604 "time": times,
2605 }
2606 if name in output:
2607 # we only check static parameters, like memory and vcpus,
2608 # and not state and time which can change between the
2609 # invocations of the different hypervisors
2610 for key in "memory", "vcpus":
2611 if value[key] != output[name][key]:
2612 _Fail("Instance %s is running twice"
2613 " with different parameters", name)
2614 output[name] = value
2615
2616 return output
2617
2618
2619 def GetInstanceConsoleInfo(instance_param_dict,
2620 get_hv_fn=hypervisor.GetHypervisor):
2621 """Gather data about the console access of a set of instances of this node.
2622
2623 This function assumes that the caller already knows which instances are on
2624 this node, by calling a function such as L{GetAllInstancesInfo} or
2625 L{GetInstanceList}.
2626
2627 For every instance, a large amount of configuration data needs to be
2628 provided to the hypervisor interface in order to receive the console
2629 information. Whether this could or should be cut down can be discussed.
2630 The information is provided in a dictionary indexed by instance name,
2631 allowing any number of instance queries to be done.
2632
2633 @type instance_param_dict: dict of string to tuple of dictionaries, where the
2634 dictionaries represent: L{objects.Instance}, L{objects.Node},
2635 L{objects.NodeGroup}, HvParams, BeParams
2636 @param instance_param_dict: mapping of instance name to parameters necessary
2637 for console information retrieval
2638
2639 @rtype: dict
2640 @return: dictionary of instance: data, with data having the following keys:
2641 - instance: instance name
2642 - kind: console kind
2643 - message: used with kind == CONS_MESSAGE, indicates console to be
2644 unavailable, supplies error message
2645 - host: host to connect to
2646 - port: port to use
2647 - user: user for login
2648 - command: the command, broken into parts as an array
2649 - display: unknown, potentially unused?
2650
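  A sketch of the expected input shape (hypothetical instance name; every
  value is a dictionary representation of the respective object)::

    {"instance1.example.com": {"instance": ..., "node": ..., "group": ...,
                               "hvParams": ..., "beParams": ...}}
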
2651 """
2652
2653 output = {}
2654 for inst_name in instance_param_dict:
2655 instance = instance_param_dict[inst_name]["instance"]
2656 pnode = instance_param_dict[inst_name]["node"]
2657 group = instance_param_dict[inst_name]["group"]
2658 hvparams = instance_param_dict[inst_name]["hvParams"]
2659 beparams = instance_param_dict[inst_name]["beParams"]
2660
2661 instance = objects.Instance.FromDict(instance)
2662 pnode = objects.Node.FromDict(pnode)
2663 group = objects.NodeGroup.FromDict(group)
2664
2665 h = get_hv_fn(instance.hypervisor)
2666 output[inst_name] = h.GetInstanceConsole(instance, pnode, group,
2667 hvparams, beparams).ToDict()
2668
2669 return output
2670
2671
2672 def _InstanceLogName(kind, os_name, instance, component):
2673 """Compute the OS log filename for a given instance and operation.
2674
2675 The instance name and os name are passed in as strings since not all
2676 operations have these as part of an instance object.
2677
2678 @type kind: string
2679 @param kind: the operation type (e.g. add, import, etc.)
2680 @type os_name: string
2681 @param os_name: the os name
2682 @type instance: string
2683 @param instance: the name of the instance being imported/added/etc.
2684 @type component: string or None
2685 @param component: the name of the component of the instance being
2686 transferred
2687
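  Example result under C{pathutils.LOG_OS_DIR} (hypothetical OS, instance
  and component names; C{<timestamp>} is the value returned by
  L{utils.TimestampForFilename})::

    add-debootstrap-instance1.example.com-disk0-<timestamp>.log
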
2688 """
2689 # TODO: Use tempfile.mkstemp to create unique filename
2690 if component:
2691 assert "/" not in component
2692 c_msg = "-%s" % component
2693 else:
2694 c_msg = ""
2695 base = ("%s-%s-%s%s-%s.log" %
2696 (kind, os_name, instance, c_msg, utils.TimestampForFilename()))
2697 return utils.PathJoin(pathutils.LOG_OS_DIR, base)
2698
2699
2700 def InstanceOsAdd(instance, reinstall, debug):
2701 """Add an OS to an instance.
2702
2703 @type instance: L{objects.Instance}
2704 @param instance: Instance whose OS is to be installed
2705 @type reinstall: boolean
2706 @param reinstall: whether this is an instance reinstall
2707 @type debug: integer
2708 @param debug: debug level, passed to the OS scripts
2709 @rtype: None
2710
2711 """
2712 inst_os = OSFromDisk(instance.os)
2713
2714 create_env = OSEnvironment(instance, inst_os, debug)
2715 if reinstall:
2716 create_env["INSTANCE_REINSTALL"] = "1"
2717
2718 logfile = _InstanceLogName("add", instance.os, instance.name, None)
2719
2720 result = utils.RunCmd([inst_os.create_script], env=create_env,
2721 cwd=inst_os.path, output=logfile, reset_env=True)
2722 if result.failed:
2723 logging.error("os create command '%s' returned error: %s, logfile: %s,"
2724 " output: %s", result.cmd, result.fail_reason, logfile,
2725 result.output)
2726 lines = [utils.SafeEncode(val)
2727 for val in utils.TailFile(logfile, lines=20)]
2728 _Fail("OS create script failed (%s), last lines in the"
2729 " log file:\n%s", result.fail_reason, "\n".join(lines), log=False)
2730
2731
2732 def RunRenameInstance(instance, old_name, debug):
2733 """Run the OS rename script for an instance.
2734
2735 @type instance: L{objects.Instance}
2736 @param instance: Instance whose OS is to be installed
2737 @type old_name: string
2738 @param old_name: previous instance name
2739 @type debug: integer
2740 @param debug: debug level, passed to the OS scripts
2741 @rtype: boolean
2742 @return: the success of the operation
2743
2744 """
2745 inst_os = OSFromDisk(instance.os)
2746
2747 rename_env = OSEnvironment(instance, inst_os, debug)
2748 rename_env["OLD_INSTANCE_NAME"] = old_name
2749
2750 logfile = _InstanceLogName("rename", instance.os,
2751 "%s-%s" % (old_name, instance.name), None)
2752
2753 result = utils.RunCmd([inst_os.rename_script], env=rename_env,
2754 cwd=inst_os.path, output=logfile, reset_env=True)
2755
2756 if result.failed:
2757 logging.error("os create command '%s' returned error: %s output: %s",
2758 result.cmd, result.fail_reason, result.output)
2759 lines = [utils.SafeEncode(val)
2760 for val in utils.TailFile(logfile, lines=20)]
2761 _Fail("OS rename script failed (%s), last lines in the"
2762 " log file:\n%s", result.fail_reason, "\n".join(lines), log=False)
2763
2764
2765 def _GetBlockDevSymlinkPath(instance_name, idx, _dir=None):
2766 """Returns symlink path for block device.
2767
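  The resulting link name, assuming the default C{":"} disk separator and
  a hypothetical instance name, looks like::

    <pathutils.DISK_LINKS_DIR>/instance1.example.com:0
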
2768 """
2769 if _dir is None:
2770 _dir = pathutils.DISK_LINKS_DIR
2771
2772 return utils.PathJoin(_dir,
2773 ("%s%s%s" %
2774 (instance_name, constants.DISK_SEPARATOR, idx)))
2775
2776
2777 def _SymlinkBlockDev(instance_name, device_path, idx):
2778 """Set up symlinks to a instance's block device.
2779
  This is an auxiliary function run when an instance is started (on the
  primary node) or when an instance is migrated (on the target node).

2784 @param instance_name: the name of the target instance
2785 @param device_path: path of the physical block device, on the node
2786 @param idx: the disk index
2787 @return: absolute path to the disk's symlink
2788
2789 """
2790 # In case we have only a userspace access URI, device_path is None
2791 if not device_path:
2792 return None
2793
2794 link_name = _GetBlockDevSymlinkPath(instance_name, idx)
2795 try:
2796 os.symlink(device_path, link_name)
2797 except OSError, err:
2798 if err.errno == errno.EEXIST:
2799 if (not os.path.islink(link_name) or
2800 os.readlink(link_name) != device_path):
2801 os.remove(link_name)
2802 os.symlink(device_path, link_name)
2803 else:
2804 raise
2805
2806 return link_name
2807
2808
2809 def _RemoveBlockDevLinks(instance_name, disks):
2810 """Remove the block device symlinks belonging to the given instance.
2811
2812 """
2813 for idx, _ in enumerate(disks):
2814 link_name = _GetBlockDevSymlinkPath(instance_name, idx)
2815 if os.path.islink(link_name):
2816 try:
2817 os.remove(link_name)
2818 except OSError:
2819 logging.exception("Can't remove symlink '%s'", link_name)
2820
2821
2822 def _CalculateDeviceURI(instance, disk, device):
2823 """Get the URI for the device.
2824
2825 @type instance: L{objects.Instance}
2826 @param instance: the instance which disk belongs to
2827 @type disk: L{objects.Disk}
2828 @param disk: the target disk object
2829 @type device: L{bdev.BlockDev}
2830 @param device: the corresponding BlockDevice
2831 @rtype: string
  @return: the device URI, if any, otherwise None
2833
2834 """
2835 access_mode = disk.params.get(constants.LDP_ACCESS,
2836 constants.DISK_KERNELSPACE)
2837 if access_mode == constants.DISK_USERSPACE:
2838 # This can raise errors.BlockDeviceError
2839 return device.GetUserspaceAccessUri(instance.hypervisor)
2840 else:
2841 return None
2842
2843
2844 def _GatherAndLinkBlockDevs(instance):
2845 """Set up an instance's block device(s).
2846
2847 This is run on the primary node at instance startup. The block
2848 devices must be already assembled.
2849
2850 @type instance: L{objects.Instance}
2851 @param instance: the instance whose disks we should assemble
2852 @rtype: list
2853 @return: list of (disk_object, link_name, drive_uri)
2854
2855 """
2856 block_devices = []
2857 for idx, disk in enumerate(instance.disks_info):
2858 device = _RecursiveFindBD(disk)
2859 if device is None:
2860 raise errors.BlockDeviceError("Block device '%s' is not set up." %
2861 str(disk))
2862 device.Open()
2863 try:
2864 link_name = _SymlinkBlockDev(instance.name, device.dev_path, idx)
2865 except OSError, e:
2866 raise errors.BlockDeviceError("Cannot create block device symlink: %s" %
2867 e.strerror)
2868 uri = _CalculateDeviceURI(instance, disk, device)
2869
2870 block_devices.append((disk, link_name, uri))
2871
2872 return block_devices
2873
2874
2875 def _IsInstanceUserDown(instance_info):
2876 return instance_info and \
2877 "state" in instance_info and \
2878 hv_base.HvInstanceState.IsShutdown(instance_info["state"])
2879
2880
2881 def _GetInstanceInfo(instance):
2882 """Helper function L{GetInstanceInfo}"""
2883 return GetInstanceInfo(instance.name, instance.hypervisor,
2884 hvparams=instance.hvparams)
2885
2886
2887 def StartInstance(instance, startup_paused, reason, store_reason=True):
2888 """Start an instance.
2889
2890 @type instance: L{objects.Instance}
2891 @param instance: the instance object
2892 @type startup_paused: bool
  @param startup_paused: whether to pause the instance at startup
2894 @type reason: list of reasons
2895 @param reason: the reason trail for this startup
2896 @type store_reason: boolean
  @param store_reason: whether to store the startup reason trail on file
2898 @rtype: None
2899
2900 """
2901 instance_info = _GetInstanceInfo(instance)
2902
2903 if instance_info and not _IsInstanceUserDown(instance_info):
2904 logging.info("Instance '%s' already running, not starting", instance.name)
2905 return
2906
2907 try:
2908 block_devices = _GatherAndLinkBlockDevs(instance)
2909 hyper = hypervisor.GetHypervisor(instance.hypervisor)
2910 hyper.StartInstance(instance, block_devices, startup_paused)
2911 if store_reason:
2912 _StoreInstReasonTrail(instance.name, reason)
2913 except errors.BlockDeviceError, err:
2914 _Fail("Block device error: %s", err, exc=True)
2915 except errors.HypervisorError, err:
2916 _RemoveBlockDevLinks(instance.name, instance.disks_info)
2917 _Fail("Hypervisor error: %s", err, exc=True)
2918
2919
2920 def InstanceShutdown(instance, timeout, reason, store_reason=True):
2921 """Shut an instance down.
2922
  @note: this function uses polling with a hardcoded timeout.
2924
2925 @type instance: L{objects.Instance}
2926 @param instance: the instance object
2927 @type timeout: integer
2928 @param timeout: maximum timeout for soft shutdown
2929 @type reason: list of reasons
2930 @param reason: the reason trail for this shutdown
2931 @type store_reason: boolean
2932 @param store_reason: whether to store the shutdown reason trail on file
2933 @rtype: None
2934
2935 """
2936 hyper = hypervisor.GetHypervisor(instance.hypervisor)
2937
2938 if not _GetInstanceInfo(instance):
2939 logging.info("Instance '%s' not running, doing nothing", instance.name)
2940 return
2941
2942 class _TryShutdown(object):
2943 def __init__(self):
2944 self.tried_once = False
2945
2946 def __call__(self):
2947 if not _GetInstanceInfo(instance):
2948 return
2949
2950 try:
2951 hyper.StopInstance(instance, retry=self.tried_once, timeout=timeout)
2952 if store_reason:
2953 _StoreInstReasonTrail(instance.name, reason)
2954 except errors.HypervisorError, err:
2955 # if the instance is no longer existing, consider this a
2956 # success and go to cleanup
2957 if not _GetInstanceInfo(instance):
2958 return
2959
2960 _Fail("Failed to stop instance '%s': %s", instance.name, err)
2961
2962 self.tried_once = True
2963
2964 raise utils.RetryAgain()
2965
2966 try:
2967 utils.Retry(_TryShutdown(), 5, timeout)
2968 except utils.RetryTimeout:
2969 # the shutdown did not succeed
2970 logging.error("Shutdown of '%s' unsuccessful, forcing", instance.name)
2971
2972 try:
2973 hyper.StopInstance(instance, force=True)
2974 except errors.HypervisorError, err:
2975 # only raise an error if the instance still exists, otherwise
2976 # the error could simply be "instance ... unknown"!
2977 if _GetInstanceInfo(instance):
2978 _Fail("Failed to force stop instance '%s': %s", instance.name, err)
2979
2980 time.sleep(1)
2981
2982 if _GetInstanceInfo(instance):
2983 _Fail("Could not shutdown instance '%s' even by destroy", instance.name)
2984
2985 try:
2986 hyper.CleanupInstance(instance.name)
2987 except errors.HypervisorError, err:
2988 logging.warning("Failed to execute post-shutdown cleanup step: %s", err)
2989
2990 _RemoveBlockDevLinks(instance.name, instance.disks_info)
2991
2992
2993 def InstanceReboot(instance, reboot_type, shutdown_timeout, reason):
2994 """Reboot an instance.
2995
2996 @type instance: L{objects.Instance}
2997 @param instance: the instance object to reboot
2998 @type reboot_type: str
  @param reboot_type: the type of reboot, one of the following
3000 constants:
3001 - L{constants.INSTANCE_REBOOT_SOFT}: only reboot the
3002 instance OS, do not recreate the VM
3003 - L{constants.INSTANCE_REBOOT_HARD}: tear down and
3004 restart the VM (at the hypervisor level)
3005 - the other reboot type (L{constants.INSTANCE_REBOOT_FULL}) is
3006 not accepted here, since that mode is handled differently, in
3007 cmdlib, and translates into full stop and start of the
3008 instance (instead of a call_instance_reboot RPC)
3009 @type shutdown_timeout: integer
3010 @param shutdown_timeout: maximum timeout for soft shutdown
3011 @type reason: list of reasons
3012 @param reason: the reason trail for this reboot
3013 @rtype: None
3014
3015 """
3016 # TODO: this is inconsistent with 'StartInstance' and 'InstanceShutdown'
3017 # because those functions simply 'return' on error whereas this one
3018 # raises an exception with '_Fail'
3019 if not _GetInstanceInfo(instance):
3020 _Fail("Cannot reboot instance '%s' that is not running", instance.name)
3021
3022 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3023 if reboot_type == constants.INSTANCE_REBOOT_SOFT:
3024 try:
3025 hyper.RebootInstance(instance)
3026 except errors.HypervisorError, err:
3027 _Fail("Failed to soft reboot instance '%s': %s", instance.name, err)
3028 elif reboot_type == constants.INSTANCE_REBOOT_HARD:
3029 try:
3030 InstanceShutdown(instance, shutdown_timeout, reason, store_reason=False)
3031 result = StartInstance(instance, False, reason, store_reason=False)
3032 _StoreInstReasonTrail(instance.name, reason)
3033 return result
3034 except errors.HypervisorError, err:
3035 _Fail("Failed to hard reboot instance '%s': %s", instance.name, err)
3036 else:
3037 _Fail("Invalid reboot_type received: '%s'", reboot_type)
3038
3039
3040 def InstanceBalloonMemory(instance, memory):
3041 """Resize an instance's memory.
3042
3043 @type instance: L{objects.Instance}
3044 @param instance: the instance object
3045 @type memory: int
3046 @param memory: new memory amount in MB
3047 @rtype: None
3048
3049 """
3050 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3051 running = hyper.ListInstances(hvparams=instance.hvparams)
3052 if instance.name not in running:
3053 logging.info("Instance %s is not running, cannot balloon", instance.name)
3054 return
3055 try:
3056 hyper.BalloonInstanceMemory(instance, memory)
3057 except errors.HypervisorError, err:
3058 _Fail("Failed to balloon instance memory: %s", err, exc=True)
3059
3060
3061 def MigrationInfo(instance):
3062 """Gather information about an instance to be migrated.
3063
3064 @type instance: L{objects.Instance}
3065 @param instance: the instance definition
3066
3067 """
3068 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3069 try:
3070 info = hyper.MigrationInfo(instance)
3071 except errors.HypervisorError, err:
3072 _Fail("Failed to fetch migration information: %s", err, exc=True)
3073 return info
3074
3075
3076 def AcceptInstance(instance, info, target):
3077 """Prepare the node to accept an instance.
3078
3079 @type instance: L{objects.Instance}
3080 @param instance: the instance definition
3081 @type info: string/data (opaque)
3082 @param info: migration information, from the source node
3083 @type target: string
3084 @param target: target host (usually ip), on this node
3085
3086 """
3087 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3088 try:
3089 hyper.AcceptInstance(instance, info, target)
3090 except errors.HypervisorError, err:
3091 _Fail("Failed to accept instance: %s", err, exc=True)
3092
3093
3094 def FinalizeMigrationDst(instance, info, success):
3095 """Finalize any preparation to accept an instance.
3096
3097 @type instance: L{objects.Instance}
3098 @param instance: the instance definition
3099 @type info: string/data (opaque)
3100 @param info: migration information, from the source node
3101 @type success: boolean
3102 @param success: whether the migration was a success or a failure
3103
3104 """
3105 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3106 try:
3107 hyper.FinalizeMigrationDst(instance, info, success)
3108 except errors.HypervisorError, err:
3109 _Fail("Failed to finalize migration on the target node: %s", err, exc=True)
3110
3111
3112 def MigrateInstance(cluster_name, instance, target, live):
3113 """Migrates an instance to another node.
3114
3115 @type cluster_name: string
3116 @param cluster_name: name of the cluster
3117 @type instance: L{objects.Instance}
3118 @param instance: the instance definition
3119 @type target: string
3120 @param target: the target node name
3121 @type live: boolean
3122 @param live: whether the migration should be done live or not (the
3123 interpretation of this parameter is left to the hypervisor)
3124 @raise RPCFail: if migration fails for some reason
3125
3126 """
3127 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3128
3129 try:
3130 hyper.MigrateInstance(cluster_name, instance, target, live)
3131 except errors.HypervisorError, err:
3132 _Fail("Failed to migrate instance: %s", err, exc=True)
3133
3134
def FinalizeMigrationSource(instance, success, live):
  """Finalize the instance migration on the source node.

  @type instance: L{objects.Instance}
  @param instance: the instance definition of the migrated instance
  @type success: bool
  @param success: whether the migration succeeded or not
  @type live: bool
  @param live: whether the user requested a live migration or not
  @raise RPCFail: If the execution fails for some reason

  """
  hyper = hypervisor.GetHypervisor(instance.hypervisor)

  try:
    hyper.FinalizeMigrationSource(instance, success, live)
  except Exception, err:  # pylint: disable=W0703
    _Fail("Failed to finalize the migration on the source node: %s", err,
          exc=True)


def GetMigrationStatus(instance):
  """Get the migration status.

  @type instance: L{objects.Instance}
  @param instance: the instance that is being migrated
  @rtype: L{objects.MigrationStatus}
  @return: the status of the current migration (one of
      L{constants.HV_MIGRATION_VALID_STATUSES}), plus any additional
      progress info that can be retrieved from the hypervisor
  @raise RPCFail: If the migration status cannot be retrieved

  """
  hyper = hypervisor.GetHypervisor(instance.hypervisor)
  try:
    return hyper.GetMigrationStatus(instance)
  except Exception, err:  # pylint: disable=W0703
    _Fail("Failed to get migration status: %s", err, exc=True)


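# Illustrative sketch (not part of the module): the six functions above form
# the node-side half of the instance migration protocol. A master-side caller
# would drive them over RPC roughly in this order, where "inst" stands for a
# hypothetical L{objects.Instance} and the host names/IP are placeholders:
#
#   info = MigrationInfo(inst)                      # on the source node
#   AcceptInstance(inst, info, "192.0.2.10")        # on the target node
#   MigrateInstance("cluster.example.com", inst, "node2.example.com", True)
#   status = GetMigrationStatus(inst)               # polled on the source
#   FinalizeMigrationDst(inst, info, True)          # clean up the target
#   FinalizeMigrationSource(inst, True, True)       # clean up the source

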
def HotplugDevice(instance, action, dev_type, device, extra, seq):
  """Hotplug a device.

  Hotplug is currently supported only for the KVM hypervisor.

  @type instance: L{objects.Instance}
  @param instance: the instance to which we hotplug a device
  @type action: string
  @param action: the hotplug action to perform
  @type dev_type: string
  @param dev_type: the device type to hotplug
  @type device: either L{objects.NIC} or L{objects.Disk}
  @param device: the device object to hotplug
  @type extra: tuple
  @param extra: extra info used for disk hotplug (disk link, drive uri)
  @type seq: int
  @param seq: the index of the device from the master's perspective
  @raise RPCFail: in case the instance does not use the KVM hypervisor

  """
  hyper = hypervisor.GetHypervisor(instance.hypervisor)
  try:
    hyper.VerifyHotplugSupport(instance, action, dev_type)
  except errors.HotplugError, err:
    _Fail("Hotplug is not supported: %s", err)

  if action == constants.HOTPLUG_ACTION_ADD:
    fn = hyper.HotAddDevice
  elif action == constants.HOTPLUG_ACTION_REMOVE:
    fn = hyper.HotDelDevice
  elif action == constants.HOTPLUG_ACTION_MODIFY:
    fn = hyper.HotModDevice
  else:
    # All valid actions are handled above; reaching this branch means an
    # unknown action was passed, so fail the assertion loudly.
    assert action in constants.HOTPLUG_ALL_ACTIONS

  return fn(instance, dev_type, device, extra, seq)


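# Illustrative sketch (not part of the module): hot-adding a NIC as the first
# device would dispatch to the hypervisor's HotAddDevice; "inst" and "nic"
# are hypothetical L{objects.Instance} and L{objects.NIC} objects, and extra
# is None because the extra tuple is only used for disk hotplug:
#
#   HotplugDevice(inst, constants.HOTPLUG_ACTION_ADD,
#                 constants.HOTPLUG_TARGET_NIC, nic, None, 0)

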
def HotplugSupported(instance):
  """Checks if hotplug is generally supported.

  """
  hyper = hypervisor.GetHypervisor(instance.hypervisor)
  try:
    hyper.HotplugSupported(instance)
  except errors.HotplugError, err:
    _Fail("Hotplug is not supported: %s", err)


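# Illustrative sketch (not part of the module): callers typically probe
# general hotplug support before attempting a concrete action; with a
# hypothetical instance "inst":
#
#   HotplugSupported(inst)  # raises RPCFail via _Fail() if unsupported

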
def ModifyInstanceMetadata(metadata):
  """Sends instance data to the metadata daemon.

  Uses the Luxi transport layer to communicate with the metadata
  daemon configuration server. It starts the metadata daemon if it is
  not running. The daemon must have been enabled at configuration time.

  @type metadata: dict
  @param metadata: instance metadata obtained by calling
      L{objects.Instance.ToDict} on an instance object

  """
  if not constants.ENABLE_METAD:
    raise errors.ProgrammerError("The metadata daemon is disabled, yet"
                                 " ModifyInstanceMetadata has been called")

  if not utils.IsDaemonAlive(constants.METAD):
    result = utils.RunCmd([pathutils.DAEMON_UTIL, "start", constants.METAD])
    if result.failed:
      raise errors.HypervisorError("Failed to start metadata daemon")

  with contextlib.closing(metad.Client()) as client:
    client.UpdateConfig(metadata)


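# Illustrative sketch (not part of the module): pushing a hypothetical
# instance "inst" to the metadata daemon uses its dict form, as the
# docstring above requires:
#
#   ModifyInstanceMetadata(inst.ToDict())

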
def BlockdevCreate(disk, size, owner, on_primary, info, excl_stor):
  """Creates a block device for an instance.

  @type disk: L{objects.Disk}
  @param disk: the object describing the disk we should create
  @type size: int
  @param size: the size of the physical underlying device, in MiB
  @type owner: str
  @param owner: the name of the instance for which disk is created,
      used for device cache data
  @type on_primary: boolean
  @param on_primary: indicates if it is the primary node or not
  @type info: string
  @param info: string that will be sent to the physical device
      creation, used for example to set (LVM) tags on LVs
  @type excl_stor: boolean
  @param excl_stor: Whether exclusive_storage is active

  @return: the new unique_id of the device (this can sometimes be
      computed only after creation), or None. On secondary nodes,
      it's not required to return anything.

  """
  # TODO: remove the obsolete "size" argument
  # pylint: disable=W0613
  clist = []
  if disk.children:
    for child in disk.children:
      try:
        crdev = _RecursiveAssembleBD(child, owner, on_primary)
      except errors.BlockDeviceError, err:
        _Fail("Can't assemble device %s: %s", child, err)
      if on_primary or disk.AssembleOnSecondary():
        # we need the children open in case the device itself has to
        # be assembled
        try:
          # pylint: disable=E1103
          crdev.Open()
        except errors.BlockDeviceError, err:
          _Fail("Can't make child '%s' read-write: %s", child, err)
      clist.append(crdev)

  try:
    device = bdev.Create(disk, clist, excl_stor)
  except errors.BlockDeviceError, err:
    _Fail("Can't create block device: %s", err)

  if on_primary or disk.AssembleOnSecondary():
    try:
      device.Assemble()
    except errors.BlockDeviceError, err:
      _Fail("Can't assemble device after creation, unusual event: %s", err)
    if on_primary or disk.OpenOnSecondary():
      try:
        device.Open(force=True)
      except errors.BlockDeviceError, err:
        _Fail("Can't make device r/w after creation, unusual event: %s", err)
    DevCacheManager.UpdateCache(device.dev_path, owner,
                                on_primary, disk.iv_name)

  device.SetInfo(info)

  return device.unique_id


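# Illustrative sketch (not part of the module): creating a 1 GiB disk on the
# primary node without exclusive storage; "disk" is a hypothetical
# L{objects.Disk} and the info tag is a placeholder (the "size" argument is
# obsolete but still part of the signature):
#
#   unique_id = BlockdevCreate(disk, 1024, "instance1.example.com",
#                              True, "originstname+instance1", False)

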
def _DumpDevice(source_path, target_path, offset, size, truncate):
  """This function images/wipes the device using a local file.

  @type source_path: string
  @param source_path: path of the image or data source (e.g., "/dev/zero")

  @type target_path: string
  @param target_path: path of the device to image/wipe

  @type offset: int
  @param offset: offset in MiB in the output file

  @type size: int
  @param size: maximum size in MiB to write (data source might be smaller)

  @type truncate: bool
  @param truncate: whether the file should be truncated

  @return: None
  @raise RPCFail: in case of failure

  """
  # Internal sizes are always in Mebibytes; if the following "dd" command
  # should use a different block size, the offset and size given to this
  # function must be adjusted accordingly before being passed to "dd".
  block_size = constants.DD_BLOCK_SIZE

  cmd = [constants.DD_CMD, "if=%s" % source_path, "seek=%d" % offset,
         "bs=%s" % block_size, "oflag=direct", "of=%s" % target_path,
         "count=%d" % size]

  if not truncate:
    cmd.append("conv=notrunc")

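  # Illustrative example (assuming a block size of one MiB): wiping the first
  # GiB of a device, i.e. offset=0, size=1024, truncate=False, would expand
  # to roughly
  #   dd if=/dev/zero seek=0 bs=1048576 oflag=direct of=<target> count=1024 \
  #      conv=notrunc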
  result = utils.RunCmd(cmd)