Fix renew-crypto on one-node-cluster
[ganeti-github.git] / lib / backend.py
#
#

# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014 Google Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


"""Functions used by the node daemon

@var _ALLOWED_UPLOAD_FILES: denotes which files are accepted in
  the L{UploadFile} function
@var _ALLOWED_CLEAN_DIRS: denotes which directories are accepted
  in the L{_CleanDirectory} function

"""

# pylint: disable=E1103,C0302

# E1103: %s %r has no %r member (but some types could not be
# inferred), because the _TryOSFromDisk returns either (True, os_obj)
# or (False, "string") which confuses pylint

# C0302: This module has become too big and should be split up


import base64
import errno
import logging
import os
import os.path
import pycurl
import random
import re
import shutil
import signal
import stat
import tempfile
import time
import zlib
import contextlib
import collections

from ganeti import errors
from ganeti import http
from ganeti import utils
from ganeti import ssh
from ganeti import hypervisor
from ganeti.hypervisor import hv_base
from ganeti import constants
from ganeti.storage import bdev
from ganeti.storage import drbd
from ganeti.storage import extstorage
from ganeti.storage import filestorage
from ganeti import objects
from ganeti import ssconf
from ganeti import serializer
from ganeti import netutils
from ganeti import runtime
from ganeti import compat
from ganeti import pathutils
from ganeti import vcluster
from ganeti import ht
from ganeti.storage.base import BlockDev
from ganeti.storage.drbd import DRBD8
from ganeti import hooksmaster
import ganeti.metad as metad


_BOOT_ID_PATH = "/proc/sys/kernel/random/boot_id"
_ALLOWED_CLEAN_DIRS = compat.UniqueFrozenset([
  pathutils.DATA_DIR,
  pathutils.JOB_QUEUE_ARCHIVE_DIR,
  pathutils.QUEUE_DIR,
  pathutils.CRYPTO_KEYS_DIR,
  ])
_MAX_SSL_CERT_VALIDITY = 7 * 24 * 60 * 60
_X509_KEY_FILE = "key"
_X509_CERT_FILE = "cert"
_IES_STATUS_FILE = "status"
_IES_PID_FILE = "pid"
_IES_CA_FILE = "ca"

#: Valid LVS output line regex
_LVSLINE_REGEX = re.compile(r"^ *([^|]+)\|([^|]+)\|([0-9.]+)\|([^|]{6,})\|?$")

# Actions for the master setup script
_MASTER_START = "start"
_MASTER_STOP = "stop"

#: Maximum file permissions for restricted command directory and executables
_RCMD_MAX_MODE = (stat.S_IRWXU |
                  stat.S_IRGRP | stat.S_IXGRP |
                  stat.S_IROTH | stat.S_IXOTH)

#: Delay before returning an error for restricted commands
_RCMD_INVALID_DELAY = 10

#: How long to wait to acquire lock for restricted commands (shorter than
#: L{_RCMD_INVALID_DELAY}) to reduce blockage of noded forks when many
#: command requests arrive
_RCMD_LOCK_TIMEOUT = _RCMD_INVALID_DELAY * 0.8


class RPCFail(Exception):
  """Class denoting RPC failure.

  Its argument is the error message.

  """


def _GetInstReasonFilename(instance_name):
  """Path of the file containing the reason of the instance status change.

  @type instance_name: string
  @param instance_name: The name of the instance
  @rtype: string
  @return: The path of the file

  """
  return utils.PathJoin(pathutils.INSTANCE_REASON_DIR, instance_name)


def _StoreInstReasonTrail(instance_name, trail):
  """Serialize a reason trail related to an instance change of state to file.

  The exact location of the file depends on the name of the instance and on
  the configuration of the Ganeti cluster defined at deploy time.

  @type instance_name: string
  @param instance_name: The name of the instance

  @type trail: list of reasons
  @param trail: reason trail

  @rtype: None

  """
  json = serializer.DumpJson(trail)
  filename = _GetInstReasonFilename(instance_name)
  utils.WriteFile(filename, data=json)

def _Fail(msg, *args, **kwargs):
  """Log an error and then raise an RPCFail exception.

  This exception is then handled specially in the ganeti daemon and
  turned into a 'failed' return type. As such, this function is a
  useful shortcut for logging the error and returning it to the master
  daemon.

  @type msg: string
  @param msg: the text of the exception
  @raise RPCFail

  """
  if args:
    msg = msg % args
  if "log" not in kwargs or kwargs["log"]:  # if we should log this error
    if "exc" in kwargs and kwargs["exc"]:
      logging.exception(msg)
    else:
      logging.error(msg)
  raise RPCFail(msg)

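# Illustrative call patterns (editor's sketch, not part of the original
# module); the variable names below are hypothetical:
#
#   >>> _Fail("Can't read file %s: %s", path, err)           # log and raise
#   >>> _Fail("Unexpected failure: %s", err, exc=True)       # log a traceback
#   >>> _Fail("Expected condition on %s", node, log=False)   # raise silently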

def _GetConfig():
  """Simple wrapper to return a SimpleStore.

  @rtype: L{ssconf.SimpleStore}
  @return: a SimpleStore instance

  """
  return ssconf.SimpleStore()


def _GetSshRunner(cluster_name):
  """Simple wrapper to return an SshRunner.

  @type cluster_name: str
  @param cluster_name: the cluster name, which is needed
      by the SshRunner constructor
  @rtype: L{ssh.SshRunner}
  @return: an SshRunner instance

  """
  return ssh.SshRunner(cluster_name)


def _Decompress(data):
  """Unpacks data compressed by the RPC client.

  @type data: list or tuple
  @param data: Data sent by RPC client
  @rtype: str
  @return: Decompressed data

  """
  assert isinstance(data, (list, tuple))
  assert len(data) == 2
  (encoding, content) = data
  if encoding == constants.RPC_ENCODING_NONE:
    return content
  elif encoding == constants.RPC_ENCODING_ZLIB_BASE64:
    return zlib.decompress(base64.b64decode(content))
  else:
    raise AssertionError("Unknown data encoding")

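# Round-trip sketch (editor's example, not part of the original module),
# matching the two encodings handled above:
#
#   >>> _Decompress((constants.RPC_ENCODING_NONE, "payload"))
#   'payload'
#   >>> blob = base64.b64encode(zlib.compress("payload"))
#   >>> _Decompress((constants.RPC_ENCODING_ZLIB_BASE64, blob))
#   'payload'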

def _CleanDirectory(path, exclude=None):
  """Removes all regular files in a directory.

  @type path: str
  @param path: the directory to clean
  @type exclude: list
  @param exclude: list of files to be excluded, defaults
      to the empty list

  """
  if path not in _ALLOWED_CLEAN_DIRS:
    _Fail("Path passed to _CleanDirectory not in allowed clean targets: '%s'",
          path)

  if not os.path.isdir(path):
    return
  if exclude is None:
    exclude = []
  else:
    # Normalize excluded paths
    exclude = [os.path.normpath(i) for i in exclude]

  for rel_name in utils.ListVisibleFiles(path):
    full_name = utils.PathJoin(path, rel_name)
    if full_name in exclude:
      continue
    if os.path.isfile(full_name) and not os.path.islink(full_name):
      utils.RemoveFile(full_name)


def _BuildUploadFileList():
  """Build the list of allowed upload files.

  This is abstracted so that it's built only once at module import time.

  """
  allowed_files = set([
    pathutils.CLUSTER_CONF_FILE,
    pathutils.ETC_HOSTS,
    pathutils.SSH_KNOWN_HOSTS_FILE,
    pathutils.VNC_PASSWORD_FILE,
    pathutils.RAPI_CERT_FILE,
    pathutils.SPICE_CERT_FILE,
    pathutils.SPICE_CACERT_FILE,
    pathutils.RAPI_USERS_FILE,
    pathutils.CONFD_HMAC_KEY,
    pathutils.CLUSTER_DOMAIN_SECRET_FILE,
    ])

  for hv_name in constants.HYPER_TYPES:
    hv_class = hypervisor.GetHypervisorClass(hv_name)
    allowed_files.update(hv_class.GetAncillaryFiles()[0])

  assert pathutils.FILE_STORAGE_PATHS_FILE not in allowed_files, \
    "Allowed file storage paths should never be uploaded via RPC"

  return frozenset(allowed_files)


_ALLOWED_UPLOAD_FILES = _BuildUploadFileList()


def JobQueuePurge():
  """Removes job queue files and archived jobs.

  @rtype: tuple
  @return: True, None

  """
  _CleanDirectory(pathutils.QUEUE_DIR, exclude=[pathutils.JOB_QUEUE_LOCK_FILE])
  _CleanDirectory(pathutils.JOB_QUEUE_ARCHIVE_DIR)


def GetMasterNodeName():
  """Returns the master node name.

  @rtype: string
  @return: name of the master node
  @raise RPCFail: in case of errors

  """
  try:
    return _GetConfig().GetMasterNode()
  except errors.ConfigurationError, err:
    _Fail("Cluster configuration incomplete: %s", err, exc=True)


def RunLocalHooks(hook_opcode, hooks_path, env_builder_fn):
  """Decorator that runs hooks before and after the decorated function.

  @type hook_opcode: string
  @param hook_opcode: opcode of the hook
  @type hooks_path: string
  @param hooks_path: path of the hooks
  @type env_builder_fn: function
  @param env_builder_fn: function that returns a dictionary containing the
    environment variables for the hooks. Will get all the parameters of the
    decorated function.
  @raise RPCFail: in case of pre-hook failure

  """
  def decorator(fn):
    def wrapper(*args, **kwargs):
      _, myself = ssconf.GetMasterAndMyself()
      nodes = ([myself], [myself])  # these hooks run locally

      env_fn = compat.partial(env_builder_fn, *args, **kwargs)

      cfg = _GetConfig()
      hr = HooksRunner()
      hm = hooksmaster.HooksMaster(hook_opcode, hooks_path, nodes,
                                   hr.RunLocalHooks, None, env_fn, None,
                                   logging.warning, cfg.GetClusterName(),
                                   cfg.GetMasterNode())
      hm.RunPhase(constants.HOOKS_PHASE_PRE)
      result = fn(*args, **kwargs)
      hm.RunPhase(constants.HOOKS_PHASE_POST)

      return result
    return wrapper
  return decorator

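# Usage sketch (editor's example; ActivateMasterIp below is a real use in
# this module): the decorator runs the named hooks before and after the
# wrapped function, building the hook environment from its arguments. The
# function name here is a hypothetical placeholder:
#
#   @RunLocalHooks(constants.FAKE_OP_MASTER_TURNUP, "master-ip-turnup",
#                  _BuildMasterIpEnv)
#   def MyMasterIpOperation(master_params, use_external_mip_script):
#     ...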

def _BuildMasterIpEnv(master_params, use_external_mip_script=None):
  """Builds environment variables for master IP hooks.

  @type master_params: L{objects.MasterNetworkParameters}
  @param master_params: network parameters of the master
  @type use_external_mip_script: boolean
  @param use_external_mip_script: whether to use an external master IP
    address setup script (unused, but necessary per the implementation of
    the L{RunLocalHooks} decorator)

  """
  # pylint: disable=W0613
  ver = netutils.IPAddress.GetVersionFromAddressFamily(master_params.ip_family)
  env = {
    "MASTER_NETDEV": master_params.netdev,
    "MASTER_IP": master_params.ip,
    "MASTER_NETMASK": str(master_params.netmask),
    "CLUSTER_IP_VERSION": str(ver),
  }

  return env

def _RunMasterSetupScript(master_params, action, use_external_mip_script):
  """Execute the master IP address setup script.

  @type master_params: L{objects.MasterNetworkParameters}
  @param master_params: network parameters of the master
  @type action: string
  @param action: action to pass to the script. Must be one of
    L{backend._MASTER_START} or L{backend._MASTER_STOP}
  @type use_external_mip_script: boolean
  @param use_external_mip_script: whether to use an external master IP
    address setup script
  @raise backend.RPCFail: if there are errors during the execution of the
    script

  """
  env = _BuildMasterIpEnv(master_params)

  if use_external_mip_script:
    setup_script = pathutils.EXTERNAL_MASTER_SETUP_SCRIPT
  else:
    setup_script = pathutils.DEFAULT_MASTER_SETUP_SCRIPT

  result = utils.RunCmd([setup_script, action], env=env, reset_env=True)

  if result.failed:
    _Fail("Failed to %s the master IP. Script return value: %s, output: '%s'" %
          (action, result.exit_code, result.output), log=True)


@RunLocalHooks(constants.FAKE_OP_MASTER_TURNUP, "master-ip-turnup",
               _BuildMasterIpEnv)
def ActivateMasterIp(master_params, use_external_mip_script):
  """Activate the IP address of the master daemon.

  @type master_params: L{objects.MasterNetworkParameters}
  @param master_params: network parameters of the master
  @type use_external_mip_script: boolean
  @param use_external_mip_script: whether to use an external master IP
    address setup script
  @raise RPCFail: in case of errors during the IP startup

  """
  _RunMasterSetupScript(master_params, _MASTER_START,
                        use_external_mip_script)


def StartMasterDaemons(no_voting):
  """Activate local node as master node.

  The function will start the master daemons (ganeti-masterd and ganeti-rapi).

  @type no_voting: boolean
  @param no_voting: whether to start ganeti-masterd without a node vote
      but still non-interactively
  @rtype: None

  """

  if no_voting:
    masterd_args = "--no-voting --yes-do-it"
  else:
    masterd_args = ""

  env = {
    "EXTRA_MASTERD_ARGS": masterd_args,
  }

  result = utils.RunCmd([pathutils.DAEMON_UTIL, "start-master"], env=env)
  if result.failed:
    msg = "Can't start Ganeti master: %s" % result.output
    logging.error(msg)
    _Fail(msg)


@RunLocalHooks(constants.FAKE_OP_MASTER_TURNDOWN, "master-ip-turndown",
               _BuildMasterIpEnv)
def DeactivateMasterIp(master_params, use_external_mip_script):
  """Deactivate the master IP on this node.

  @type master_params: L{objects.MasterNetworkParameters}
  @param master_params: network parameters of the master
  @type use_external_mip_script: boolean
  @param use_external_mip_script: whether to use an external master IP
    address setup script
  @raise RPCFail: in case of errors during the IP turndown

  """
  _RunMasterSetupScript(master_params, _MASTER_STOP,
                        use_external_mip_script)


def StopMasterDaemons():
  """Stop the master daemons on this node.

  Stop the master daemons (ganeti-masterd and ganeti-rapi) on this node.

  @rtype: None

  """
  # TODO: log and report back to the caller the error failures; we
  # need to decide in which case we fail the RPC for this

  result = utils.RunCmd([pathutils.DAEMON_UTIL, "stop-master"])
  if result.failed:
    logging.error("Could not stop Ganeti master, command %s had exitcode %s"
                  " and error %s",
                  result.cmd, result.exit_code, result.output)


def ChangeMasterNetmask(old_netmask, netmask, master_ip, master_netdev):
  """Change the netmask of the master IP.

  @param old_netmask: the old value of the netmask
  @param netmask: the new value of the netmask
  @param master_ip: the master IP
  @param master_netdev: the master network device

  """
  if old_netmask == netmask:
    return

  if not netutils.IPAddress.Own(master_ip):
    _Fail("The master IP address is not up, not attempting to change its"
          " netmask")

  result = utils.RunCmd([constants.IP_COMMAND_PATH, "address", "add",
                         "%s/%s" % (master_ip, netmask),
                         "dev", master_netdev, "label",
                         "%s:0" % master_netdev])
  if result.failed:
    _Fail("Could not set the new netmask on the master IP address")

  result = utils.RunCmd([constants.IP_COMMAND_PATH, "address", "del",
                         "%s/%s" % (master_ip, old_netmask),
                         "dev", master_netdev, "label",
                         "%s:0" % master_netdev])
  if result.failed:
    _Fail("Could not bring down the master IP address with the old netmask")

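# For reference (editor's note): with illustrative values, the two RunCmd
# calls above amount to adding the address under the new netmask before
# removing it under the old one, e.g.:
#
#   ip address add 192.0.2.1/16 dev eth0 label eth0:0
#   ip address del 192.0.2.1/24 dev eth0 label eth0:0
#
# The address, netmasks and device name are placeholders.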

def EtcHostsModify(mode, host, ip):
  """Modify a host entry in /etc/hosts.

  @param mode: The mode to operate. Either add or remove entry
  @param host: The host to operate on
  @param ip: The ip associated with the entry

  """
  if mode == constants.ETC_HOSTS_ADD:
    if not ip:
      # Note: this previously only instantiated RPCFail without raising it,
      # so the check had no effect; _Fail logs and raises properly.
      _Fail("Mode 'add' needs 'ip' parameter, but parameter not"
            " present")
    utils.AddHostToEtcHosts(host, ip)
  elif mode == constants.ETC_HOSTS_REMOVE:
    if ip:
      _Fail("Mode 'remove' does not allow 'ip' parameter, but"
            " parameter is present")
    utils.RemoveHostFromEtcHosts(host)
  else:
    _Fail("Mode not supported")


def LeaveCluster(modify_ssh_setup):
  """Cleans up and removes the current node.

  This function cleans up and prepares the current node to be removed
  from the cluster.

  If processing is successful, then it raises an
  L{errors.QuitGanetiException} which is used as a special case to
  shutdown the node daemon.

  @param modify_ssh_setup: boolean

  """
  _CleanDirectory(pathutils.DATA_DIR)
  _CleanDirectory(pathutils.CRYPTO_KEYS_DIR)
  JobQueuePurge()

  if modify_ssh_setup:
    try:
      priv_key, pub_key, auth_keys = ssh.GetUserFiles(constants.SSH_LOGIN_USER)

      ssh.RemoveAuthorizedKey(auth_keys, utils.ReadFile(pub_key))

      utils.RemoveFile(priv_key)
      utils.RemoveFile(pub_key)
    except errors.OpExecError:
      logging.exception("Error while processing ssh files")

  try:
    utils.RemoveFile(pathutils.CONFD_HMAC_KEY)
    utils.RemoveFile(pathutils.RAPI_CERT_FILE)
    utils.RemoveFile(pathutils.SPICE_CERT_FILE)
    utils.RemoveFile(pathutils.SPICE_CACERT_FILE)
    utils.RemoveFile(pathutils.NODED_CERT_FILE)
  except:  # pylint: disable=W0702
    logging.exception("Error while removing cluster secrets")

  utils.StopDaemon(constants.CONFD)
  utils.StopDaemon(constants.MOND)
  utils.StopDaemon(constants.KVMD)

  # Raise a custom exception (handled in ganeti-noded)
  raise errors.QuitGanetiException(True, "Shutdown scheduled")


def _CheckStorageParams(params, num_params):
  """Performs sanity checks for storage parameters.

  @type params: list
  @param params: list of storage parameters
  @type num_params: int
  @param num_params: expected number of parameters

  """
  if params is None:
    raise errors.ProgrammerError("No storage parameters for storage"
                                 " reporting is provided.")
  if not isinstance(params, list):
    raise errors.ProgrammerError("The storage parameters are not of type"
                                 " list: '%s'" % params)
  if len(params) != num_params:
    raise errors.ProgrammerError("Did not receive the expected number of"
                                 " storage parameters: expected %s,"
                                 " received '%s'" % (num_params, len(params)))


def _CheckLvmStorageParams(params):
  """Performs sanity check for the 'exclusive storage' flag.

  @see: C{_CheckStorageParams}

  """
  _CheckStorageParams(params, 1)
  excl_stor = params[0]
  if not isinstance(excl_stor, bool):
    raise errors.ProgrammerError("Exclusive storage parameter is not"
                                 " boolean: '%s'." % excl_stor)
  return excl_stor


def _GetLvmVgSpaceInfo(name, params):
  """Wrapper around C{_GetVgInfo} which checks the storage parameters.

  @type name: string
  @param name: name of the volume group
  @type params: list
  @param params: list of storage parameters, which in this case should be
    containing only one for exclusive storage

  """
  excl_stor = _CheckLvmStorageParams(params)
  return _GetVgInfo(name, excl_stor)


def _GetVgInfo(
    name, excl_stor, info_fn=bdev.LogicalVolume.GetVGInfo):
  """Retrieves information about a LVM volume group.

  """
  # TODO: GetVGInfo supports returning information for multiple VGs at once
  vginfo = info_fn([name], excl_stor)
  if vginfo:
    vg_free = int(round(vginfo[0][0], 0))
    vg_size = int(round(vginfo[0][1], 0))
  else:
    vg_free = None
    vg_size = None

  return {
    "type": constants.ST_LVM_VG,
    "name": name,
    "storage_free": vg_free,
    "storage_size": vg_size,
    }

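# Testability sketch (editor's example): the info_fn parameter above exists
# so a stub can replace bdev.LogicalVolume.GetVGInfo, e.g.:
#
#   >>> stub = lambda names, excl_stor: [(1024.0, 4096.0)]
#   >>> _GetVgInfo("xenvg", False, info_fn=stub)["storage_free"]
#   1024
#
# The VG name and sizes are placeholders.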

def _GetLvmPvSpaceInfo(name, params):
  """Wrapper around C{_GetVgSpindlesInfo} with sanity checks.

  @see: C{_GetLvmVgSpaceInfo}

  """
  excl_stor = _CheckLvmStorageParams(params)
  return _GetVgSpindlesInfo(name, excl_stor)


def _GetVgSpindlesInfo(
    name, excl_stor, info_fn=bdev.LogicalVolume.GetVgSpindlesInfo):
  """Retrieves information about spindles in an LVM volume group.

  @type name: string
  @param name: VG name
  @type excl_stor: bool
  @param excl_stor: exclusive storage
  @rtype: dict
  @return: dictionary whose keys are "name", "vg_free", "vg_size" for VG name,
    free spindles, total spindles respectively

  """
  if excl_stor:
    (vg_free, vg_size) = info_fn(name)
  else:
    vg_free = 0
    vg_size = 0
  return {
    "type": constants.ST_LVM_PV,
    "name": name,
    "storage_free": vg_free,
    "storage_size": vg_size,
    }


def _GetHvInfo(name, hvparams, get_hv_fn=hypervisor.GetHypervisor):
  """Retrieves node information from a hypervisor.

  The information returned depends on the hypervisor. Common items:

    - vg_size is the size of the configured volume group in MiB
    - vg_free is the free size of the volume group in MiB
    - memory_dom0 is the memory allocated for domain0 in MiB
    - memory_free is the currently available (free) RAM in MiB
    - memory_total is the total amount of RAM in MiB
    - hv_version: the hypervisor version, if available

  @type hvparams: dict of string
  @param hvparams: the hypervisor's hvparams

  """
  return get_hv_fn(name).GetNodeInfo(hvparams=hvparams)


def _GetHvInfoAll(hv_specs, get_hv_fn=hypervisor.GetHypervisor):
  """Retrieves node information for all hypervisors.

  See C{_GetHvInfo} for information on the output.

  @type hv_specs: list of pairs (string, dict of strings)
  @param hv_specs: list of pairs of a hypervisor's name and its hvparams

  """
  if hv_specs is None:
    return None

  result = []
  for hvname, hvparams in hv_specs:
    result.append(_GetHvInfo(hvname, hvparams, get_hv_fn))
  return result


def _GetNamedNodeInfo(names, fn):
  """Calls C{fn} for all names in C{names} and returns a dictionary.

  @rtype: None or dict

  """
  if names is None:
    return None
  else:
    return map(fn, names)


def GetNodeInfo(storage_units, hv_specs):
  """Gives back a hash with different information about the node.

  @type storage_units: list of tuples (string, string, list)
  @param storage_units: List of tuples (storage unit, identifier, parameters)
    to ask for disk space information. In case of lvm-vg, the identifier is
    the VG name. The parameters can contain additional, storage-type-specific
    parameters, for example exclusive storage for lvm storage.
  @type hv_specs: list of pairs (string, dict of strings)
  @param hv_specs: list of pairs of a hypervisor's name and its hvparams
  @rtype: tuple; (string, None/dict, None/dict)
  @return: Tuple containing boot ID, volume group information and hypervisor
    information

  """
  bootid = utils.ReadFile(_BOOT_ID_PATH, size=128).rstrip("\n")
  storage_info = _GetNamedNodeInfo(
    storage_units,
    (lambda (storage_type, storage_key, storage_params):
        _ApplyStorageInfoFunction(storage_type, storage_key, storage_params)))
  hv_info = _GetHvInfoAll(hv_specs)
  return (bootid, storage_info, hv_info)


def _GetFileStorageSpaceInfo(path, params):
  """Wrapper around filestorage.GetSpaceInfo.

  The purpose of this wrapper is to call filestorage.GetFileStorageSpaceInfo
  and ignore the *args parameter to not leak it into the filestorage
  module's code.

  @see: C{filestorage.GetFileStorageSpaceInfo} for description of the
    parameters.

  """
  _CheckStorageParams(params, 0)
  return filestorage.GetFileStorageSpaceInfo(path)


# FIXME: implement storage reporting for all missing storage types.
_STORAGE_TYPE_INFO_FN = {
  constants.ST_BLOCK: None,
  constants.ST_DISKLESS: None,
  constants.ST_EXT: None,
  constants.ST_FILE: _GetFileStorageSpaceInfo,
  constants.ST_LVM_PV: _GetLvmPvSpaceInfo,
  constants.ST_LVM_VG: _GetLvmVgSpaceInfo,
  constants.ST_SHARED_FILE: None,
  constants.ST_GLUSTER: None,
  constants.ST_RADOS: None,
}


def _ApplyStorageInfoFunction(storage_type, storage_key, *args):
  """Looks up and applies the correct function to calculate free and total
  storage for the given storage type.

  @type storage_type: string
  @param storage_type: the storage type for which the storage shall be
    reported.
  @type storage_key: string
  @param storage_key: identifier of a storage unit, e.g. the volume group name
    of an LVM storage unit
  @type args: any
  @param args: various parameters that can be used for storage reporting. These
    parameters and their semantics vary from storage type to storage type and
    are just propagated in this function.
  @return: the results of the application of the storage space function (see
    _STORAGE_TYPE_INFO_FN) if storage space reporting is implemented for that
    storage type
  @raises NotImplementedError: for storage types that don't support space
    reporting yet

  """
  fn = _STORAGE_TYPE_INFO_FN[storage_type]
  if fn is not None:
    return fn(storage_key, *args)
  else:
    raise NotImplementedError

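# Dispatch sketch (editor's example): for an LVM volume group the lookup
# above resolves to _GetLvmVgSpaceInfo, whose single storage parameter is
# the exclusive-storage flag:
#
#   >>> _ApplyStorageInfoFunction(constants.ST_LVM_VG, "xenvg", [False])
#
# while types mapped to None (e.g. constants.ST_BLOCK) currently raise
# NotImplementedError. The VG name is a placeholder.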

def _CheckExclusivePvs(pvi_list):
  """Check that PVs are not shared among LVs

  @type pvi_list: list of L{objects.LvmPvInfo} objects
  @param pvi_list: information about the PVs

  @rtype: list of tuples (string, list of strings)
  @return: offending volumes, as tuples: (pv_name, [lv1_name, lv2_name...])

  """
  res = []
  for pvi in pvi_list:
    if len(pvi.lv_list) > 1:
      res.append((pvi.name, pvi.lv_list))
  return res


def _VerifyHypervisors(what, vm_capable, result, all_hvparams,
                       get_hv_fn=hypervisor.GetHypervisor):
  """Verifies the hypervisor. Appends the results to the 'result' dictionary.

  @type what: C{dict}
  @param what: a dictionary of things to check
  @type vm_capable: boolean
  @param vm_capable: whether or not this node is vm capable
  @type result: dict
  @param result: dictionary of verification results; results of the
    verifications in this function will be added here
  @type all_hvparams: dict of dict of string
  @param all_hvparams: dictionary mapping hypervisor names to hvparams
  @type get_hv_fn: function
  @param get_hv_fn: function to retrieve the hypervisor, to improve testability

  """
  if not vm_capable:
    return

  if constants.NV_HYPERVISOR in what:
    result[constants.NV_HYPERVISOR] = {}
    for hv_name in what[constants.NV_HYPERVISOR]:
      hvparams = all_hvparams[hv_name]
      try:
        val = get_hv_fn(hv_name).Verify(hvparams=hvparams)
      except errors.HypervisorError, err:
        val = "Error while checking hypervisor: %s" % str(err)
      result[constants.NV_HYPERVISOR][hv_name] = val


def _VerifyHvparams(what, vm_capable, result,
                    get_hv_fn=hypervisor.GetHypervisor):
  """Verifies the hvparams. Appends the results to the 'result' dictionary.

  @type what: C{dict}
  @param what: a dictionary of things to check
  @type vm_capable: boolean
  @param vm_capable: whether or not this node is vm capable
  @type result: dict
  @param result: dictionary of verification results; results of the
    verifications in this function will be added here
  @type get_hv_fn: function
  @param get_hv_fn: function to retrieve the hypervisor, to improve testability

  """
  if not vm_capable:
    return

  if constants.NV_HVPARAMS in what:
    result[constants.NV_HVPARAMS] = []
    for source, hv_name, hvparms in what[constants.NV_HVPARAMS]:
      try:
        logging.info("Validating hv %s, %s", hv_name, hvparms)
        get_hv_fn(hv_name).ValidateParameters(hvparms)
      except errors.HypervisorError, err:
        result[constants.NV_HVPARAMS].append((source, hv_name, str(err)))


def _VerifyInstanceList(what, vm_capable, result, all_hvparams):
  """Verifies the instance list.

  @type what: C{dict}
  @param what: a dictionary of things to check
  @type vm_capable: boolean
  @param vm_capable: whether or not this node is vm capable
  @type result: dict
  @param result: dictionary of verification results; results of the
    verifications in this function will be added here
  @type all_hvparams: dict of dict of string
  @param all_hvparams: dictionary mapping hypervisor names to hvparams

  """
  if constants.NV_INSTANCELIST in what and vm_capable:
    # GetInstanceList can fail
    try:
      val = GetInstanceList(what[constants.NV_INSTANCELIST],
                            all_hvparams=all_hvparams)
    except RPCFail, err:
      val = str(err)
    result[constants.NV_INSTANCELIST] = val


def _VerifyNodeInfo(what, vm_capable, result, all_hvparams):
  """Verifies the node info.

  @type what: C{dict}
  @param what: a dictionary of things to check
  @type vm_capable: boolean
  @param vm_capable: whether or not this node is vm capable
  @type result: dict
  @param result: dictionary of verification results; results of the
    verifications in this function will be added here
  @type all_hvparams: dict of dict of string
  @param all_hvparams: dictionary mapping hypervisor names to hvparams

  """
  if constants.NV_HVINFO in what and vm_capable:
    hvname = what[constants.NV_HVINFO]
    hyper = hypervisor.GetHypervisor(hvname)
    hvparams = all_hvparams[hvname]
    result[constants.NV_HVINFO] = hyper.GetNodeInfo(hvparams=hvparams)


def _VerifyClientCertificate(cert_file=pathutils.NODED_CLIENT_CERT_FILE):
  """Verify the existence and validity of the client SSL certificate.

  Also, verify that the client certificate is not self-signed. Self-
  signed client certificates stem from Ganeti versions 2.12.0 - 2.12.4
  and should be replaced by client certificates signed by the server
  certificate. Hence we output a warning when we encounter a self-signed
  one.

  """
  create_cert_cmd = "gnt-cluster renew-crypto --new-node-certificates"
  if not os.path.exists(cert_file):
    return (constants.CV_ERROR,
            "The client certificate does not exist. Run '%s' to create"
            " client certificates for all nodes." % create_cert_cmd)

  (errcode, msg) = utils.VerifyCertificate(cert_file)
  if errcode is not None:
    return (errcode, msg)

  (errcode, msg) = utils.IsCertificateSelfSigned(cert_file)
  if errcode is not None:
    return (errcode, msg)

  # if everything is fine, we return the digest to be compared to the config
  return (None, utils.GetCertificateDigest(cert_filename=cert_file))

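# Return convention (editor's note): the first tuple element is None on
# success and an error code otherwise, so a caller can tell the digest
# apart from an error message, e.g.:
#
#   >>> (errcode, payload) = _VerifyClientCertificate()
#   >>> # errcode is None      -> payload is the certificate digest
#   >>> # errcode is not None  -> payload is an error message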

def _VerifySshSetup(node_status_list, my_name,
                    pub_key_file=pathutils.SSH_PUB_KEYS):
  """Verifies the state of the SSH key files.

  @type node_status_list: list of tuples
  @param node_status_list: list of nodes of the cluster associated with a
    couple of flags: (uuid, name, is_master_candidate,
    is_potential_master_candidate, online)
  @type my_name: str
  @param my_name: name of this node
  @type pub_key_file: str
  @param pub_key_file: filename of the public key file

  """
  if node_status_list is None:
    return ["No node list to check against the pub_key_file received."]

  my_status_list = [(my_uuid, name, mc, pot_mc, online) for
                    (my_uuid, name, mc, pot_mc, online)
                    in node_status_list if name == my_name]
  if len(my_status_list) == 0:
    return ["Cannot find node information for node '%s'." % my_name]
  (my_uuid, _, _, potential_master_candidate, online) = \
     my_status_list[0]

  result = []

  if not os.path.exists(pub_key_file):
    result.append("The public key file '%s' does not exist. Consider running"
                  " 'gnt-cluster renew-crypto --new-ssh-keys"
                  " [--no-ssh-key-check]' to fix this." % pub_key_file)
    return result

  pot_mc_uuids = [uuid for (uuid, _, _, _, _) in node_status_list]
  offline_nodes = [uuid for (uuid, _, _, _, online) in node_status_list
                   if not online]
  pub_keys = ssh.QueryPubKeyFile(None)

  # Initialized here (and not only in the 'potential master candidate'
  # branch below) because the loop at the end uses it unconditionally
  missing_uuids = set([])

  if potential_master_candidate:
    # Check that the set of potential master candidates matches the
    # public key file
    pub_uuids_set = set(pub_keys.keys()) - set(offline_nodes)
    pot_mc_uuids_set = set(pot_mc_uuids) - set(offline_nodes)
    if pub_uuids_set != pot_mc_uuids_set:
      unknown_uuids = pub_uuids_set - pot_mc_uuids_set
      if unknown_uuids:
        result.append("The following node UUIDs are listed in the public key"
                      " file on node '%s', but are not potential master"
                      " candidates: %s."
                      % (my_name, ", ".join(list(unknown_uuids))))
      missing_uuids = pot_mc_uuids_set - pub_uuids_set
      if missing_uuids:
        result.append("The following node UUIDs of potential master candidates"
                      " are missing in the public key file on node %s: %s."
                      % (my_name, ", ".join(list(missing_uuids))))

    (_, key_files) = \
      ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
    (_, dsa_pub_key_filename) = key_files[constants.SSHK_DSA]

    my_keys = pub_keys[my_uuid]

    dsa_pub_key = utils.ReadFile(dsa_pub_key_filename)
    if dsa_pub_key.strip() not in my_keys:
      result.append("The DSA key of node %s does not match this node's key"
                    " in the pub key file." % my_name)
    if len(my_keys) != 1:
      result.append("There is more than one key for node %s in the public key"
                    " file." % my_name)
  else:
    if len(pub_keys.keys()) > 0:
      result.append("The public key file of node '%s' is not empty, although"
                    " the node is not a potential master candidate."
                    % my_name)

  # Check that all master candidate keys are in the authorized_keys file
  (auth_key_file, _) = \
    ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
  for (uuid, name, mc, _, online) in node_status_list:
    if not online:
      continue
    if uuid in missing_uuids:
      continue
    if mc:
      for key in pub_keys[uuid]:
        if not ssh.HasAuthorizedKey(auth_key_file, key):
          result.append("A SSH key of master candidate '%s' (UUID: '%s') is"
                        " not in the 'authorized_keys' file of node '%s'."
                        % (name, uuid, my_name))
    else:
      for key in pub_keys[uuid]:
        if name != my_name and ssh.HasAuthorizedKey(auth_key_file, key):
          result.append("A SSH key of normal node '%s' (UUID: '%s') is in the"
                        " 'authorized_keys' file of node '%s'."
                        % (name, uuid, my_name))
        if name == my_name and not ssh.HasAuthorizedKey(auth_key_file, key):
          result.append("A SSH key of normal node '%s' (UUID: '%s') is not"
                        " in the 'authorized_keys' file of itself."
                        % (my_name, uuid))

  return result


def _VerifySshClutter(node_status_list, my_name):
  """Verifies that the 'authorized_keys' files are not cluttered up.

  @type node_status_list: list of tuples
  @param node_status_list: list of nodes of the cluster associated with a
    couple of flags: (uuid, name, is_master_candidate,
    is_potential_master_candidate, online)
  @type my_name: str
  @param my_name: name of this node

  """
  result = []
  (auth_key_file, _) = \
    ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
  # the status tuples documented above have five fields
  node_names = [name for (_, name, _, _, _) in node_status_list]
  multiple_occurrences = ssh.CheckForMultipleKeys(auth_key_file, node_names)
  if multiple_occurrences:
    msg = "There are hosts which have more than one SSH key stored for the" \
          " same user in the 'authorized_keys' file of node %s. This can be" \
          " due to an unsuccessful operation which cluttered up the" \
          " 'authorized_keys' file. We recommend cleaning this up manually. " \
          % my_name
    for host, occ in multiple_occurrences.items():
      msg += "Entry for '%s' in lines %s. " % (host, utils.CommaJoin(occ))
    result.append(msg)

  return result


def VerifyNode(what, cluster_name, all_hvparams, node_groups, groups_cfg):
  """Verify the status of the local node.

  Based on the input L{what} parameter, various checks are done on the
  local node.

  If the I{filelist} key is present, this list of
  files is checksummed and the file/checksum pairs are returned.

  If the I{nodelist} key is present, we check that we have
  connectivity via ssh with the target nodes (and check the hostname
  report).

  If the I{node-net-test} key is present, we check that we have
  connectivity to the given nodes via both primary IP and, if
  applicable, secondary IPs.

  @type what: C{dict}
  @param what: a dictionary of things to check:
    - filelist: list of files for which to compute checksums
    - nodelist: list of nodes we should check ssh communication with
    - node-net-test: list of nodes we should check node daemon port
      connectivity with
    - hypervisor: list with hypervisors to run the verify for
  @type cluster_name: string
  @param cluster_name: the cluster's name
  @type all_hvparams: dict of dict of strings
  @param all_hvparams: a dictionary mapping hypervisor names to hvparams
  @type node_groups: a dict of strings
  @param node_groups: node _names_ mapped to their group uuids (it's enough to
      have only those nodes that are in `what["nodelist"]`)
  @type groups_cfg: a dict of dict of strings
  @param groups_cfg: a dictionary mapping group uuids to their configuration
  @rtype: dict
  @return: a dictionary with the same keys as the input dict, and
      values representing the result of the checks

  """
  result = {}
  my_name = netutils.Hostname.GetSysName()
  port = netutils.GetDaemonPort(constants.NODED)
  vm_capable = my_name not in what.get(constants.NV_NONVMNODES, [])

  _VerifyHypervisors(what, vm_capable, result, all_hvparams)
  _VerifyHvparams(what, vm_capable, result)

  if constants.NV_FILELIST in what:
    fingerprints = utils.FingerprintFiles(map(vcluster.LocalizeVirtualPath,
                                              what[constants.NV_FILELIST]))
    result[constants.NV_FILELIST] = \
      dict((vcluster.MakeVirtualPath(key), value)
           for (key, value) in fingerprints.items())

  if constants.NV_CLIENT_CERT in what:
    result[constants.NV_CLIENT_CERT] = _VerifyClientCertificate()

  if constants.NV_SSH_SETUP in what:
    result[constants.NV_SSH_SETUP] = \
      _VerifySshSetup(what[constants.NV_SSH_SETUP], my_name)
    if constants.NV_SSH_CLUTTER in what:
      result[constants.NV_SSH_CLUTTER] = \
        _VerifySshClutter(what[constants.NV_SSH_SETUP], my_name)

  if constants.NV_NODELIST in what:
    (nodes, bynode, mcs) = what[constants.NV_NODELIST]

    # Add nodes from other groups (different for each node)
    try:
      nodes.extend(bynode[my_name])
    except KeyError:
      pass

    # Use a random order
    random.shuffle(nodes)

    # Try to contact all nodes
    val = {}
    for node in nodes:
      params = groups_cfg.get(node_groups.get(node))
      ssh_port = params["ndparams"].get(constants.ND_SSH_PORT)
      logging.debug("Ssh port %s (None = default) for node %s",
                    str(ssh_port), node)

      # We only test if master candidates can communicate to other nodes.
      # We cannot test if normal nodes cannot communicate with other nodes,
      # because the administrator might have installed additional SSH keys,
      # over which Ganeti has no power.
      if my_name in mcs:
        success, message = _GetSshRunner(cluster_name). \
                              VerifyNodeHostname(node, ssh_port)
        if not success:
          val[node] = message

    result[constants.NV_NODELIST] = val

  if constants.NV_NODENETTEST in what:
    result[constants.NV_NODENETTEST] = tmp = {}
    my_pip = my_sip = None
    for name, pip, sip in what[constants.NV_NODENETTEST]:
      if name == my_name:
        my_pip = pip
        my_sip = sip
        break
    if not my_pip:
      tmp[my_name] = ("Can't find my own primary/secondary IP"
                      " in the node list")
    else:
      for name, pip, sip in what[constants.NV_NODENETTEST]:
        fail = []
        if not netutils.TcpPing(pip, port, source=my_pip):
          fail.append("primary")
        if sip != pip:
          if not netutils.TcpPing(sip, port, source=my_sip):
            fail.append("secondary")
        if fail:
          tmp[name] = ("failure using the %s interface(s)" %
                       " and ".join(fail))

  if constants.NV_MASTERIP in what:
    # FIXME: add checks on incoming data structures (here and in the
    # rest of the function)
    master_name, master_ip = what[constants.NV_MASTERIP]
    if master_name == my_name:
      source = constants.IP4_ADDRESS_LOCALHOST
    else:
      source = None
    result[constants.NV_MASTERIP] = netutils.TcpPing(master_ip, port,
                                                     source=source)

  if constants.NV_USERSCRIPTS in what:
    result[constants.NV_USERSCRIPTS] = \
      [script for script in what[constants.NV_USERSCRIPTS]
       if not utils.IsExecutable(script)]

  if constants.NV_OOB_PATHS in what:
    result[constants.NV_OOB_PATHS] = tmp = []
    for path in what[constants.NV_OOB_PATHS]:
      try:
        st = os.stat(path)
      except OSError, err:
        tmp.append("error stating out of band helper: %s" % err)
      else:
        if stat.S_ISREG(st.st_mode):
          if stat.S_IMODE(st.st_mode) & stat.S_IXUSR:
            tmp.append(None)
          else:
            tmp.append("out of band helper %s is not executable" % path)
        else:
          tmp.append("out of band helper %s is not a file" % path)

  if constants.NV_LVLIST in what and vm_capable:
    try:
      val = GetVolumeList(utils.ListVolumeGroups().keys())
    except RPCFail, err:
      val = str(err)
    result[constants.NV_LVLIST] = val

  _VerifyInstanceList(what, vm_capable, result, all_hvparams)

  if constants.NV_VGLIST in what and vm_capable:
    result[constants.NV_VGLIST] = utils.ListVolumeGroups()

  if constants.NV_PVLIST in what and vm_capable:
    check_exclusive_pvs = constants.NV_EXCLUSIVEPVS in what
    val = bdev.LogicalVolume.GetPVInfo(what[constants.NV_PVLIST],
                                       filter_allocatable=False,
                                       include_lvs=check_exclusive_pvs)
    if check_exclusive_pvs:
      result[constants.NV_EXCLUSIVEPVS] = _CheckExclusivePvs(val)
      for pvi in val:
        # Avoid sending useless data on the wire
        pvi.lv_list = []
    result[constants.NV_PVLIST] = map(objects.LvmPvInfo.ToDict, val)

  if constants.NV_VERSION in what:
    result[constants.NV_VERSION] = (constants.PROTOCOL_VERSION,
                                    constants.RELEASE_VERSION)

  _VerifyNodeInfo(what, vm_capable, result, all_hvparams)

  if constants.NV_DRBDVERSION in what and vm_capable:
    try:
      drbd_version = DRBD8.GetProcInfo().GetVersionString()
    except errors.BlockDeviceError, err:
      logging.warning("Can't get DRBD version", exc_info=True)
      drbd_version = str(err)
    result[constants.NV_DRBDVERSION] = drbd_version

  if constants.NV_DRBDLIST in what and vm_capable:
    try:
      used_minors = drbd.DRBD8.GetUsedDevs()
    except errors.BlockDeviceError, err:
      logging.warning("Can't get used minors list", exc_info=True)
      used_minors = str(err)
    result[constants.NV_DRBDLIST] = used_minors

  if constants.NV_DRBDHELPER in what and vm_capable:
    status = True
    try:
      payload = drbd.DRBD8.GetUsermodeHelper()
    except errors.BlockDeviceError, err:
      logging.error("Can't get DRBD usermode helper: %s", str(err))
      status = False
      payload = str(err)
    result[constants.NV_DRBDHELPER] = (status, payload)

  if constants.NV_NODESETUP in what:
    result[constants.NV_NODESETUP] = tmpr = []
    if not os.path.isdir("/sys/block") or not os.path.isdir("/sys/class/net"):
      tmpr.append("The sysfs filesystem doesn't seem to be mounted"
                  " under /sys, missing required directories /sys/block"
                  " and /sys/class/net")
    if (not os.path.isdir("/proc/sys") or
        not os.path.isfile("/proc/sysrq-trigger")):
      tmpr.append("The procfs filesystem doesn't seem to be mounted"
                  " under /proc, missing required directory /proc/sys and"
                  " the file /proc/sysrq-trigger")

  if constants.NV_TIME in what:
    result[constants.NV_TIME] = utils.SplitTime(time.time())

  if constants.NV_OSLIST in what and vm_capable:
    result[constants.NV_OSLIST] = DiagnoseOS()

  if constants.NV_BRIDGES in what and vm_capable:
    result[constants.NV_BRIDGES] = [bridge
                                    for bridge in what[constants.NV_BRIDGES]
                                    if not utils.BridgeExists(bridge)]

  if what.get(constants.NV_ACCEPTED_STORAGE_PATHS) == my_name:
    result[constants.NV_ACCEPTED_STORAGE_PATHS] = \
      filestorage.ComputeWrongFileStoragePaths()

  if what.get(constants.NV_FILE_STORAGE_PATH):
    pathresult = filestorage.CheckFileStoragePath(
      what[constants.NV_FILE_STORAGE_PATH])
    if pathresult:
      result[constants.NV_FILE_STORAGE_PATH] = pathresult

  if what.get(constants.NV_SHARED_FILE_STORAGE_PATH):
    pathresult = filestorage.CheckFileStoragePath(
      what[constants.NV_SHARED_FILE_STORAGE_PATH])
    if pathresult:
      result[constants.NV_SHARED_FILE_STORAGE_PATH] = pathresult

  return result


def GetCryptoTokens(token_requests):
  """Perform actions on the node's cryptographic tokens.

  Token types can be 'ssl' or 'ssh'. So far only some actions are implemented
  for 'ssl'. Action 'get' returns the digest of the public client ssl
  certificate. Action 'create' creates a new client certificate and private
  key and also returns the digest of the certificate. The third element of a
  token request is a dictionary of optional parameters for the actions; so
  far only the filename is supported.

  @type token_requests: list of tuples of (string, string, dict), where the
    first string is in constants.CRYPTO_TYPES, the second in
    constants.CRYPTO_ACTIONS. The third parameter is a dictionary of string
    to string.
  @param token_requests: list of requests of cryptographic tokens and actions
    to perform on them. The actions come with a dictionary of options.
  @rtype: list of tuples (string, string)
  @return: list of tuples of the token type and the public crypto token

  """
  tokens = []
  for (token_type, action, _) in token_requests:
    if token_type not in constants.CRYPTO_TYPES:
      raise errors.ProgrammerError("Token type '%s' not supported." %
                                   token_type)
    if action not in constants.CRYPTO_ACTIONS:
      raise errors.ProgrammerError("Action '%s' is not supported." %
                                   action)
    if token_type == constants.CRYPTO_TYPE_SSL_DIGEST:
      tokens.append((token_type,
                     utils.GetCertificateDigest()))
  return tokens

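# Request sketch (editor's example, assuming the 'get' action constant from
# constants.CRYPTO_ACTIONS): asking for the digest of this node's client SSL
# certificate; the returned digest string shown is a placeholder:
#
#   >>> GetCryptoTokens([(constants.CRYPTO_TYPE_SSL_DIGEST,
#   ...                   constants.CRYPTO_ACTION_GET, {})])
#   [('ssl', '<hex digest of the client certificate>')]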

def EnsureDaemon(daemon_name, run):
  """Ensures the given daemon is running or stopped.

  @type daemon_name: string
  @param daemon_name: name of the daemon (e.g., constants.KVMD)

  @type run: bool
  @param run: whether to start or stop the daemon

  @rtype: bool
  @return: 'True' if daemon successfully started/stopped,
    'False' otherwise

  """
  allowed_daemons = [constants.KVMD]

  if daemon_name not in allowed_daemons:
    fn = lambda _: False
  elif run:
    fn = utils.EnsureDaemon
  else:
    fn = utils.StopDaemon

  return fn(daemon_name)


def _InitSshUpdateData(data, noded_cert_file, ssconf_store):
  (_, noded_cert) = \
    utils.ExtractX509Certificate(utils.ReadFile(noded_cert_file))
  data[constants.SSHS_NODE_DAEMON_CERTIFICATE] = noded_cert

  cluster_name = ssconf_store.GetClusterName()
  data[constants.SSHS_CLUSTER_NAME] = cluster_name

def AddNodeSshKey(node_uuid, node_name,
                  potential_master_candidates,
                  to_authorized_keys=False,
                  to_public_keys=False,
                  get_public_keys=False,
                  pub_key_file=pathutils.SSH_PUB_KEYS,
                  ssconf_store=None,
                  noded_cert_file=pathutils.NODED_CERT_FILE,
                  run_cmd_fn=ssh.RunSshCmdWithStdin):
  """Distributes a node's public SSH key across the cluster.

  Note that this function should only be executed on the master node, which
  then will copy the new node's key to all nodes in the cluster via SSH.

  Also note: at least one of the flags C{to_authorized_keys},
  C{to_public_keys}, and C{get_public_keys} has to be set to C{True} for
  the function to actually perform any actions.

  @type node_uuid: str
  @param node_uuid: the UUID of the node whose key is added
  @type node_name: str
  @param node_name: the name of the node whose key is added
  @type potential_master_candidates: list of str
  @param potential_master_candidates: list of node names of potential master
    candidates; this should match the list of uuids in the public key file
  @type to_authorized_keys: boolean
  @param to_authorized_keys: whether the key should be added to the
    C{authorized_keys} file of all nodes
  @type to_public_keys: boolean
  @param to_public_keys: whether the keys should be added to the public key
    file
  @type get_public_keys: boolean
  @param get_public_keys: whether the node should add the cluster's public
    keys to its C{ganeti_pub_keys} file

  """
  node_list = [SshAddNodeInfo(name=node_name, uuid=node_uuid,
                              to_authorized_keys=to_authorized_keys,
                              to_public_keys=to_public_keys,
                              get_public_keys=get_public_keys)]
  return AddNodeSshKeyBulk(node_list,
                           potential_master_candidates,
                           pub_key_file=pub_key_file,
                           ssconf_store=ssconf_store,
                           noded_cert_file=noded_cert_file,
                           run_cmd_fn=run_cmd_fn)


# Node info named tuple specifically for the use with AddNodeSshKeyBulk
SshAddNodeInfo = collections.namedtuple(
  "SshAddNodeInfo",
  ["uuid",
   "name",
   "to_authorized_keys",
   "to_public_keys",
   "get_public_keys"])

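# Construction sketch (editor's example): AddNodeSshKeyBulk consumes a list
# of these tuples, one per node to process. The UUID and node names below
# are placeholders:
#
#   >>> info = SshAddNodeInfo(uuid="<node uuid>", name="node3.example.com",
#   ...                       to_authorized_keys=True, to_public_keys=True,
#   ...                       get_public_keys=True)
#   >>> AddNodeSshKeyBulk([info], ["node1.example.com", "node2.example.com"])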

def AddNodeSshKeyBulk(node_list,
                      potential_master_candidates,
                      pub_key_file=pathutils.SSH_PUB_KEYS,
                      ssconf_store=None,
                      noded_cert_file=pathutils.NODED_CERT_FILE,
                      run_cmd_fn=ssh.RunSshCmdWithStdin):
  """Distributes a node's public SSH key across the cluster.

  Note that this function should only be executed on the master node, which
  then will copy the new node's key to all nodes in the cluster via SSH.

  Also note: at least one of the flags C{to_authorized_keys},
  C{to_public_keys}, and C{get_public_keys} has to be set to C{True} for
  the function to actually perform any actions.

  @type node_list: list of SshAddNodeInfo tuples
  @param node_list: list of tuples containing the necessary node information
    for adding their keys
  @type potential_master_candidates: list of str
  @param potential_master_candidates: list of node names of potential master
    candidates; this should match the list of uuids in the public key file

  """
  # whether there are any keys to be added or retrieved at all
  to_authorized_keys = any([node_info.to_authorized_keys for node_info in
                            node_list])
  to_public_keys = any([node_info.to_public_keys for node_info in
                        node_list])

  if not ssconf_store:
    ssconf_store = ssconf.SimpleStore()

  for node_info in node_list:
    # replacement not necessary for keys that are not supposed to be in the
    # list of public keys
    if not node_info.to_public_keys:
      continue
    # Check and fix sanity of key file
    keys_by_name = ssh.QueryPubKeyFile([node_info.name], key_file=pub_key_file)
    keys_by_uuid = ssh.QueryPubKeyFile([node_info.uuid], key_file=pub_key_file)

    if (not keys_by_name or node_info.name not in keys_by_name) \
        and (not keys_by_uuid or node_info.uuid not in keys_by_uuid):
      raise errors.SshUpdateError(
        "No keys found for the new node '%s' (UUID %s) in the list of public"
        " SSH keys, neither for the name nor the UUID" %
        (node_info.name, node_info.uuid))
    else:
      if node_info.name in keys_by_name:
        # Replace the name by UUID in the file as the name should only be used
        # temporarily
        ssh.ReplaceNameByUuid(node_info.uuid, node_info.name,
                              error_fn=errors.SshUpdateError,
                              key_file=pub_key_file)

  # Retrieve updated map of UUIDs to keys
  keys_by_uuid = ssh.QueryPubKeyFile(
    [node_info.uuid for node_info in node_list], key_file=pub_key_file)

  # Update the master node's key files
  (auth_key_file, _) = \
    ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
  for node_info in node_list:
    if node_info.to_authorized_keys:
      ssh.AddAuthorizedKeys(auth_key_file, keys_by_uuid[node_info.uuid])

  base_data = {}
  _InitSshUpdateData(base_data, noded_cert_file, ssconf_store)
  cluster_name = base_data[constants.SSHS_CLUSTER_NAME]

  ssh_port_map = ssconf_store.GetSshPortMap()

  # Update the target nodes themselves
  for node_info in node_list:
    logging.debug("Updating SSH key files of target node '%s'.",
                  node_info.name)
    if node_info.get_public_keys:
      node_data = {}
      _InitSshUpdateData(node_data, noded_cert_file, ssconf_store)
      all_keys = ssh.QueryPubKeyFile(None, key_file=pub_key_file)
      node_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
        (constants.SSHS_OVERRIDE, all_keys)

      try:
        utils.RetryByNumberOfTimes(
          constants.SSHS_MAX_RETRIES,
          errors.SshUpdateError,
          run_cmd_fn, cluster_name, node_info.name, pathutils.SSH_UPDATE,
          ssh_port_map.get(node_info.name), node_data,
          debug=False, verbose=False, use_cluster_key=False,
          ask_key=False, strict_host_check=False)
      except errors.SshUpdateError as e:
        # Clean up the master's public key file if adding key fails
        if node_info.to_public_keys:
          ssh.RemovePublicKey(node_info.uuid)
        raise e

  # Update all nodes except master and the target nodes
  keys_by_uuid_auth = ssh.QueryPubKeyFile(
    [node_info.uuid for node_info in node_list
     if node_info.to_authorized_keys],
    key_file=pub_key_file)
  if to_authorized_keys:
    base_data[constants.SSHS_SSH_AUTHORIZED_KEYS] = \
      (constants.SSHS_ADD, keys_by_uuid_auth)

  pot_mc_data = base_data.copy()
  keys_by_uuid_pub = ssh.QueryPubKeyFile(
    [node_info.uuid for node_info in node_list
     if node_info.to_public_keys],
    key_file=pub_key_file)
  if to_public_keys:
    pot_mc_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
      (constants.SSHS_REPLACE_OR_ADD, keys_by_uuid_pub)

  all_nodes = ssconf_store.GetNodeList()
  master_node = ssconf_store.GetMasterNode()
  online_nodes = ssconf_store.GetOnlineNodeList()

  node_errors = []
  for node in all_nodes:
    if node == master_node:
      logging.debug("Skipping master node '%s'.", master_node)
      continue
    if node not in online_nodes:
      logging.debug("Skipping offline node '%s'.", node)
      continue
    if node in potential_master_candidates:
      logging.debug("Updating SSH key files of node '%s'.", node)
      try:
        utils.RetryByNumberOfTimes(
          constants.SSHS_MAX_RETRIES,
          errors.SshUpdateError,
          run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
          ssh_port_map.get(node), pot_mc_data,
          debug=False, verbose=False, use_cluster_key=False,
          ask_key=False, strict_host_check=False)
      except errors.SshUpdateError as last_exception:
        # the added node's name comes first, then the node whose key files
        # could not be updated (the arguments were previously swapped)
        error_msg = ("When adding the key of node '%s', updating SSH key"
                     " files of node '%s' failed after %s retries."
                     " Not trying again. Last error was: %s." %
                     (node_info.name, node, constants.SSHS_MAX_RETRIES,
                      last_exception))
        node_errors.append((node, error_msg))
        # We only log the error and don't throw an exception, because
        # one unreachable node shall not abort the entire procedure.
        logging.error(error_msg)

    else:
      if to_authorized_keys:
        run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
                   ssh_port_map.get(node), base_data,
                   debug=False, verbose=False, use_cluster_key=False,
                   ask_key=False, strict_host_check=False)

  return node_errors

1633
1634 def RemoveNodeSshKey(node_uuid, node_name,
1635 master_candidate_uuids,
1636 potential_master_candidates,
1637 master_uuid=None,
1638 keys_to_remove=None,
1639 from_authorized_keys=False,
1640 from_public_keys=False,
1641 clear_authorized_keys=False,
1642 clear_public_keys=False,
1643 pub_key_file=pathutils.SSH_PUB_KEYS,
1644 ssconf_store=None,
1645 noded_cert_file=pathutils.NODED_CERT_FILE,
1646 readd=False,
1647 run_cmd_fn=ssh.RunSshCmdWithStdin):
1648 """Removes the node's SSH keys from the key files and distributes those.
1649
1650 Note that at least one of the flags C{from_authorized_keys},
1651 C{from_public_keys}, C{clear_authorized_keys}, and C{clear_public_keys}
1652 has to be set to C{True} for the function to perform any action at all.
1653 Otherwise, the function raises an L{errors.SshUpdateError}.
1654
1655 @type node_uuid: str
1656 @param node_uuid: UUID of the node whose key is removed
1657 @type node_name: str
1658 @param node_name: name of the node whose key is removed
1659 @type master_candidate_uuids: list of str
1660 @param master_candidate_uuids: list of UUIDs of the current master candidates
1661 @type potential_master_candidates: list of str
1662 @param potential_master_candidates: list of names of potential master
1663 candidates
1664 @type keys_to_remove: dict of str to list of str
1665 @param keys_to_remove: a dictionary mapping node UUIDs to lists of SSH keys
1666 to be removed. This list is supposed to be used only if the keys are not
1667 in the public keys file. This is for example the case when removing a
1668 master node's key.
1669 @type from_authorized_keys: boolean
1670 @param from_authorized_keys: whether or not the key should be removed
1671 from the C{authorized_keys} file
1672 @type from_public_keys: boolean
1673 @param from_public_keys: whether or not the key should be remove from
1674 the C{ganeti_pub_keys} file
1675 @type clear_authorized_keys: boolean
1676 @param clear_authorized_keys: whether or not the C{authorized_keys} file
1677 should be cleared on the node whose keys are removed
1678 @type clear_public_keys: boolean
1679 @param clear_public_keys: whether to clear the node's C{ganeti_pub_keys} file
1680 @type readd: boolean
1681 @param readd: whether this is called during a readd operation.
1682 @rtype: list of string
1683 @returns: list of feedback messages
1684
1685 """
1686 node_list = [SshRemoveNodeInfo(uuid=node_uuid,
1687 name=node_name,
1688 from_authorized_keys=from_authorized_keys,
1689 from_public_keys=from_public_keys,
1690 clear_authorized_keys=clear_authorized_keys,
1691 clear_public_keys=clear_public_keys)]
1692 return RemoveNodeSshKeyBulk(node_list,
1693 master_candidate_uuids,
1694 potential_master_candidates,
1695 master_uuid=master_uuid,
1696 keys_to_remove=keys_to_remove,
1697 pub_key_file=pub_key_file,
1698 ssconf_store=ssconf_store,
1699 noded_cert_file=noded_cert_file,
1700 readd=readd,
1701 run_cmd_fn=run_cmd_fn)
1702
1703
1704 # Node info named tuple specifically for the use with RemoveNodeSshKeyBulk
1705 SshRemoveNodeInfo = collections.namedtuple(
1706 "SshRemoveNodeInfo",
1707 ["uuid",
1708 "name",
1709 "from_authorized_keys",
1710 "from_public_keys",
1711 "clear_authorized_keys",
1712 "clear_public_keys"])
1713
1714
1715 def RemoveNodeSshKeyBulk(node_list,
1716 master_candidate_uuids,
1717 potential_master_candidates,
1718 master_uuid=None,
1719 keys_to_remove=None,
1720 pub_key_file=pathutils.SSH_PUB_KEYS,
1721 ssconf_store=None,
1722 noded_cert_file=pathutils.NODED_CERT_FILE,
1723 readd=False,
1724 run_cmd_fn=ssh.RunSshCmdWithStdin):
1725 """Removes the node's SSH keys from the key files and distributes those.
1726
1727 Note that at least one of the flags C{from_authorized_keys},
1728 C{from_public_keys}, C{clear_authorized_keys}, and C{clear_public_keys}
1729 of at least one node has to be set to C{True} for the function to perform any
1730 action at all. Otherwise, an L{errors.SshUpdateError} is raised.
1731
1732 @type node_list: list of C{SshRemoveNodeInfo}.
1733 @param node_list: list of information about nodes whose keys are being removed
1734 @type master_candidate_uuids: list of str
1735 @param master_candidate_uuids: list of UUIDs of the current master candidates
1736 @type potential_master_candidates: list of str
1737 @param potential_master_candidates: list of names of potential master
1738 candidates
1739 @type keys_to_remove: dict of str to list of str
1740 @param keys_to_remove: a dictionary mapping node UUIDs to lists of SSH keys
1741 to be removed. This list is supposed to be used only if the keys are not
1742 in the public keys file. This is for example the case when removing a
1743 master node's key.
1744 @type readd: boolean
1745 @param readd: whether this is called during a readd operation.
1746 @rtype: list of string
1747 @returns: list of feedback messages
1748
1749 """
1750 # Non-disruptive error messages, list of (node, msg) pairs
1751 result_msgs = []
1752
1753 # whether there are any keys to be added or retrieved at all
1754 from_authorized_keys = any(node_info.from_authorized_keys
1755 for node_info in node_list)
1756 from_public_keys = any(node_info.from_public_keys
1757 for node_info in node_list)
1758 clear_authorized_keys = any(node_info.clear_authorized_keys
1759 for node_info in node_list)
1760 clear_public_keys = any(node_info.clear_public_keys
1761 for node_info in node_list)
1762
1763 # Make sure at least one of these flags is true.
1764 if not (from_authorized_keys or from_public_keys or clear_authorized_keys
1765 or clear_public_keys):
1766 raise errors.SshUpdateError("No removal from any key file was requested.")
1767
1768 if not ssconf_store:
1769 ssconf_store = ssconf.SimpleStore()
1770
1771 master_node = ssconf_store.GetMasterNode()
1772 ssh_port_map = ssconf_store.GetSshPortMap()
1773
1774 all_keys_to_remove = {}
1775 if from_authorized_keys or from_public_keys:
1776 for node_info in node_list:
1777 # Skip nodes that don't actually need any keys to be removed.
1778 if not (node_info.from_authorized_keys or node_info.from_public_keys):
1779 continue
1780 if node_info.name == master_node and not keys_to_remove:
1781 raise errors.SshUpdateError("Cannot remove the master node's keys.")
1782 if keys_to_remove:
1783 keys = keys_to_remove
1784 else:
1785 keys = ssh.QueryPubKeyFile([node_info.uuid], key_file=pub_key_file)
1786 if (not keys or node_info.uuid not in keys) and not readd:
1787 raise errors.SshUpdateError("Node '%s' not found in the list of"
1788 " public SSH keys. It seems someone"
1789 " tries to remove a key from outside"
1790 " the cluster!" % node_info.uuid)
1791 # During an upgrade all nodes have the master key. In this case we
1792 # should not remove it to avoid accidentally shutting down cluster
1793 # SSH communication
1794 master_keys = None
1795 if master_uuid:
1796 master_keys = ssh.QueryPubKeyFile([master_uuid],
1797 key_file=pub_key_file)
1798 for master_key in master_keys.get(master_uuid, []):
1799 if master_key in keys[node_info.uuid]:
1800 keys[node_info.uuid].remove(master_key)
1801
1802 all_keys_to_remove.update(keys)
1803
1804 if all_keys_to_remove:
1805 base_data = {}
1806 _InitSshUpdateData(base_data, noded_cert_file, ssconf_store)
1807 cluster_name = base_data[constants.SSHS_CLUSTER_NAME]
1808
1809 if from_authorized_keys:
1810 # UUIDs of nodes that are supposed to be removed from the
1811 # authorized_keys files.
1812 nodes_remove_from_authorized_keys = [
1813 node_info.uuid for node_info in node_list
1814 if node_info.from_authorized_keys]
1815 keys_to_remove_from_authorized_keys = dict([
1816 (uuid, keys) for (uuid, keys) in all_keys_to_remove.items()
1817 if uuid in nodes_remove_from_authorized_keys])
1818 base_data[constants.SSHS_SSH_AUTHORIZED_KEYS] = \
1819 (constants.SSHS_REMOVE, keys_to_remove_from_authorized_keys)
1820 (auth_key_file, _) = \
1821 ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False,
1822 dircheck=False)
1823
1824 for uuid in nodes_remove_from_authorized_keys:
1825 ssh.RemoveAuthorizedKeys(auth_key_file,
1826 keys_to_remove_from_authorized_keys[uuid])
1827
1828 pot_mc_data = base_data.copy()
1829
1830 if from_public_keys:
1831 nodes_remove_from_public_keys = [
1832 node_info.uuid for node_info in node_list
1833 if node_info.from_public_keys]
1834 keys_to_remove_from_public_keys = dict([
1835 (uuid, keys) for (uuid, keys) in all_keys_to_remove.items()
1836 if uuid in nodes_remove_from_public_keys])
1837 pot_mc_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1838 (constants.SSHS_REMOVE, keys_to_remove_from_public_keys)
1839
1840 all_nodes = ssconf_store.GetNodeList()
1841 online_nodes = ssconf_store.GetOnlineNodeList()
1842 all_nodes_to_remove = [node_info.name for node_info in node_list]
1843 logging.debug("Removing keys of nodes '%s' from all nodes but itself and"
1844 " master.", ", ".join(all_nodes_to_remove))
1845 for node in all_nodes:
1846 if node == master_node:
1847 logging.debug("Skipping master node '%s'.", master_node)
1848 continue
1849 if node not in online_nodes:
1850 logging.debug("Skipping offline node '%s'.", node)
1851 continue
1852 if node in all_nodes_to_remove:
1853 logging.debug("Skipping node whose key is removed itself '%s'.", node)
1854 continue
1855 ssh_port = ssh_port_map.get(node)
1856 if not ssh_port:
1857 raise errors.OpExecError("No SSH port information available for"
1858 " node '%s', map: %s." %
1859 (node, ssh_port_map))
1860 error_msg_final = ("When removing the key of node '%s', updating the"
1861 " SSH key files of node '%s' failed. Last error"
1862 " was: %s.")
1863 if node in potential_master_candidates:
1864 logging.debug("Updating key setup of potential master candidate node"
1865 " %s.", node)
1866 try:
1867 utils.RetryByNumberOfTimes(
1868 constants.SSHS_MAX_RETRIES,
1869 errors.SshUpdateError,
1870 run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
1871 ssh_port, pot_mc_data,
1872 debug=False, verbose=False, use_cluster_key=False,
1873 ask_key=False, strict_host_check=False)
1874 except errors.SshUpdateError as last_exception:
1875 error_msg = error_msg_final % (
1876 node_info.name, node, last_exception)
1877 result_msgs.append((node, error_msg))
1878 logging.error(error_msg)
1879
1880 else:
1881 if from_authorized_keys:
1882 logging.debug("Updating key setup of normal node %s.", node)
1883 try:
1884 utils.RetryByNumberOfTimes(
1885 constants.SSHS_MAX_RETRIES,
1886 errors.SshUpdateError,
1887 run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
1888 ssh_port, base_data,
1889 debug=False, verbose=False, use_cluster_key=False,
1890 ask_key=False, strict_host_check=False)
1891 except errors.SshUpdateError as last_exception:
1892 error_msg = error_msg_final % (
1893 node_info.name, node, last_exception)
1894 result_msgs.append((node, error_msg))
1895 logging.error(error_msg)
1896
1897 for node_info in node_list:
1898 if node_info.clear_authorized_keys or node_info.from_public_keys or \
1899 node_info.clear_public_keys:
1900 data = {}
1901 _InitSshUpdateData(data, noded_cert_file, ssconf_store)
1902 cluster_name = data[constants.SSHS_CLUSTER_NAME]
1903 ssh_port = ssh_port_map.get(node_info.name)
1904 if not ssh_port:
1905 raise errors.OpExecError("No SSH port information available for"
1906 " node '%s', which is leaving the cluster.")
1907
1908 if node_info.clear_authorized_keys:
1909 # The 'authorized_keys' file is not solely managed by Ganeti. Therefore,
1910 # we have to specify exactly which keys to clear to leave keys untouched
1911 # that were not added by Ganeti.
1912 other_master_candidate_uuids = [uuid for uuid in master_candidate_uuids
1913 if uuid != node_info.uuid]
1914 candidate_keys = ssh.QueryPubKeyFile(other_master_candidate_uuids,
1915 key_file=pub_key_file)
1916 data[constants.SSHS_SSH_AUTHORIZED_KEYS] = \
1917 (constants.SSHS_REMOVE, candidate_keys)
1918
1919 if node_info.clear_public_keys:
1920 data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1921 (constants.SSHS_CLEAR, {})
1922 elif node_info.from_public_keys:
1923 # Since clearing the public keys subsumes removing just a single key,
1924 # we only do it if clear_public_keys is 'False'.
1925
1926 if all_keys_to_remove:
1927 data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1928 (constants.SSHS_REMOVE, all_keys_to_remove)
1929
1930 # If there are no changes to any key file for this node, skip it
1931 if not (constants.SSHS_SSH_PUBLIC_KEYS in data or
1932 constants.SSHS_SSH_AUTHORIZED_KEYS in data):
1933 continue
1934
1935 logging.debug("Updating SSH key setup of target node '%s'.",
1936 node_info.name)
1937 try:
1938 utils.RetryByNumberOfTimes(
1939 constants.SSHS_MAX_RETRIES,
1940 errors.SshUpdateError,
1941 run_cmd_fn, cluster_name, node_info.name, pathutils.SSH_UPDATE,
1942 ssh_port, data,
1943 debug=False, verbose=False, use_cluster_key=False,
1944 ask_key=False, strict_host_check=False)
1945 except errors.SshUpdateError as last_exception:
1946 result_msgs.append(
1947 (node_info.name,
1948 ("Removing SSH keys from node '%s' failed."
1949 " This can happen when the node is already unreachable."
1950 " Error: %s" % (node_info.name, last_exception))))
1951
1952 if all_keys_to_remove and from_public_keys:
1953 for node_uuid in nodes_remove_from_public_keys:
1954 ssh.RemovePublicKey(node_uuid, key_file=pub_key_file)
1955
1956 return result_msgs
1957
1958
1959 def _GenerateNodeSshKey(node_uuid, node_name, ssh_port_map,
1960 pub_key_file=pathutils.SSH_PUB_KEYS,
1961 ssconf_store=None,
1962 noded_cert_file=pathutils.NODED_CERT_FILE,
1963 run_cmd_fn=ssh.RunSshCmdWithStdin,
1964 suffix=""):
1965 """Generates the root SSH key pair on the node.
1966
1967 @type node_uuid: str
1968 @param node_uuid: UUID of the node whose key is generated
1969 @type node_name: str
1970 @param node_name: name of the node whose key is generated
1971 @type ssh_port_map: dict of str to int
1972 @param ssh_port_map: mapping of node names to their SSH port
1973
1974 """
1975 if not ssconf_store:
1976 ssconf_store = ssconf.SimpleStore()
1977
1978 keys_by_uuid = ssh.QueryPubKeyFile([node_uuid], key_file=pub_key_file)
1979 if not keys_by_uuid or node_uuid not in keys_by_uuid:
1980 raise errors.SshUpdateError("Node %s (UUID: %s) whose key is requested to"
1981 " be regenerated is not registered in the"
1982 " public keys file." % (node_name, node_uuid))
1983
1984 data = {}
1985 _InitSshUpdateData(data, noded_cert_file, ssconf_store)
1986 cluster_name = data[constants.SSHS_CLUSTER_NAME]
1987 data[constants.SSHS_GENERATE] = {constants.SSHS_SUFFIX: suffix}
1988
1989 run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE,
1990 ssh_port_map.get(node_name), data,
1991 debug=False, verbose=False, use_cluster_key=False,
1992 ask_key=False, strict_host_check=False)
1993
1994
1995 def _GetMasterNodeUUID(node_uuid_name_map, master_node_name):
1996 master_node_uuids = [node_uuid for (node_uuid, node_name)
1997 in node_uuid_name_map
1998 if node_name == master_node_name]
1999 if len(master_node_uuids) != 1:
2000 raise errors.SshUpdateError("No (unique) master UUID found. Master node"
2001 " name: '%s', Master UUID: '%s'" %
2002 (master_node_name, master_node_uuids))
2003 return master_node_uuids[0]
2004
2005
2006 def _GetOldMasterKeys(master_node_uuid, pub_key_file):
2007 old_master_keys_by_uuid = ssh.QueryPubKeyFile([master_node_uuid],
2008 key_file=pub_key_file)
2009 if not old_master_keys_by_uuid:
2010 raise errors.SshUpdateError("No public key of the master node (UUID '%s')"
2011 " found, not generating a new key."
2012 % master_node_uuid)
2013 return old_master_keys_by_uuid
2014
2015
2016 def _GetNewMasterKey(root_keyfiles, master_node_uuid):
2017 new_master_keys = []
2018 for (_, (_, public_key_file)) in root_keyfiles.items():
2019 public_key_dir = os.path.dirname(public_key_file)
2020 public_key_file_tmp_filename = \
2021 os.path.splitext(os.path.basename(public_key_file))[0] \
2022 + constants.SSHS_MASTER_SUFFIX + ".pub"
2023 public_key_path_tmp = os.path.join(public_key_dir,
2024 public_key_file_tmp_filename)
2025 if os.path.exists(public_key_path_tmp):
2026 # for some key types, there might not be any keys
2027 key = utils.ReadFile(public_key_path_tmp)
2028 new_master_keys.append(key)
2029 if not new_master_keys:
2030 raise errors.SshUpdateError("Cannot find any type of temporary SSH key.")
2031 return {master_node_uuid: new_master_keys}
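
# Illustrative example of the temporary filename derivation above (paths
# are hypothetical; the suffix is whatever constants.SSHS_MASTER_SUFFIX
# holds):
#
#   public_key_file = "/root/.ssh/id_rsa.pub"
#   -> basename without extension: "id_rsa"
#   -> temporary key read from "/root/.ssh/id_rsa<SSHS_MASTER_SUFFIX>.pub"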
2032
2033
2034 def _ReplaceMasterKeyOnMaster(root_keyfiles):
2035 number_of_moves = 0
2036 for (_, (private_key_file, public_key_file)) in root_keyfiles.items():
2037 key_dir = os.path.dirname(public_key_file)
2038 private_key_file_tmp = \
2039 os.path.basename(private_key_file) + constants.SSHS_MASTER_SUFFIX
2040 public_key_file_tmp = private_key_file_tmp + ".pub"
2041 private_key_path_tmp = os.path.join(key_dir,
2042 private_key_file_tmp)
2043 public_key_path_tmp = os.path.join(key_dir,
2044 public_key_file_tmp)
2045 if os.path.exists(public_key_file):
2046 utils.CreateBackup(public_key_file)
2047 utils.RemoveFile(public_key_file)
2048 if os.path.exists(private_key_file):
2049 utils.CreateBackup(private_key_file)
2050 utils.RemoveFile(private_key_file)
2051 if os.path.exists(public_key_path_tmp) and \
2052 os.path.exists(private_key_path_tmp):
2053 # for some key types, there might not be any keys
2054 shutil.move(public_key_path_tmp, public_key_file)
2055 shutil.move(private_key_path_tmp, private_key_file)
2056 number_of_moves += 1
2057 if not number_of_moves:
2058 raise errors.SshUpdateError("Could not move at least one master SSH key.")
2059
2060
2061 def RenewSshKeys(node_uuids, node_names, master_candidate_uuids,
2062 potential_master_candidates,
2063 pub_key_file=pathutils.SSH_PUB_KEYS,
2064 ssconf_store=None,
2065 noded_cert_file=pathutils.NODED_CERT_FILE,
2066 run_cmd_fn=ssh.RunSshCmdWithStdin):
2067 """Renews all SSH keys and updates authorized_keys and ganeti_pub_keys.
2068
2069 @type node_uuids: list of str
2070 @param node_uuids: list of node UUIDs whose keys should be renewed
2071 @type node_names: list of str
2072 @param node_names: list of node names whose keys should be removed. This list
2073 should match the C{node_uuids} parameter
2074 @type master_candidate_uuids: list of str
2075 @param master_candidate_uuids: list of UUIDs of master candidates or
2076 master node
2077 @type pub_key_file: str
2078 @param pub_key_file: path of the public key file
2079 @type noded_cert_file: str
2080 @param noded_cert_file: path of the noded SSL certificate file
2081 @type run_cmd_fn: function
2082 @param run_cmd_fn: function to run commands on remote nodes via SSH
2083 @raise errors.ProgrammerError: if node_uuids and node_names don't match
2084 @raise errors.SshUpdateError: if a node's key is missing from the public
2085 key file, if a node's new SSH key could not be fetched from it, or if
2086 there is no entry or more than one entry for the master node in the
2087 public key list
2088
2089 """
2090 if not ssconf_store:
2091 ssconf_store = ssconf.SimpleStore()
2092 cluster_name = ssconf_store.GetClusterName()
2093
2094 if len(node_uuids) != len(node_names):
2095 raise errors.ProgrammerError("Lists of node UUIDs and node names"
2096 " do not match in length.")
2097
2098 (_, root_keyfiles) = \
2099 ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
2100 (_, dsa_pub_keyfile) = root_keyfiles[constants.SSHK_DSA]
2101 old_master_key = utils.ReadFile(dsa_pub_keyfile)
2102
2103 node_uuid_name_map = zip(node_uuids, node_names)
2104
2105 master_node_name = ssconf_store.GetMasterNode()
2106 master_node_uuid = _GetMasterNodeUUID(node_uuid_name_map, master_node_name)
2107 ssh_port_map = ssconf_store.GetSshPortMap()
2108 # List of all node errors that happened, but which did not abort the
2109 # procedure as a whole. It is important that this is a list to have a
2110 # somewhat chronological history of events.
2111 all_node_errors = []
2112
2113 # process non-master nodes
2114
2115 # keys to add in bulk at the end
2116 node_keys_to_add = []
2117
2118 # list of all nodes
2119 node_list = []
2120
2121 # list of keys to be removed before generating new keys
2122 node_info_to_remove = []
2123
2124 for node_uuid, node_name in node_uuid_name_map:
2125 if node_name == master_node_name:
2126 continue
2127 master_candidate = node_uuid in master_candidate_uuids
2128 potential_master_candidate = node_name in potential_master_candidates
2129 node_list.append((node_uuid, node_name, master_candidate,
2130 potential_master_candidate))
2131
2132 keys_by_uuid = ssh.QueryPubKeyFile([node_uuid], key_file=pub_key_file)
2133 if not keys_by_uuid:
2134 raise errors.SshUpdateError("No public key of node %s (UUID %s) found,"
2135 " not generating a new key."
2136 % (node_name, node_uuid))
2137
2138 if master_candidate:
2139 logging.debug("Fetching old SSH key from node '%s'.", node_name)
2140 old_pub_key = ssh.ReadRemoteSshPubKeys(dsa_pub_keyfile,
2141 node_name, cluster_name,
2142 ssh_port_map[node_name],
2143 False, # ask_key
2144 False) # key_check
2145 if old_pub_key != old_master_key:
2146 # If we are already in a multi-key setup (that is past Ganeti 2.12),
2147 # we can safely remove the old key of the node. Otherwise, we cannot
2148 # remove that node's key, because it is also the master node's key
2149 # and that would terminate all communication from the master to the
2150 # node.
2151 node_info_to_remove.append(SshRemoveNodeInfo(
2152 uuid=node_uuid,
2153 name=node_name,
2154 from_authorized_keys=master_candidate,
2155 from_public_keys=False,
2156 clear_authorized_keys=False,
2157 clear_public_keys=False))
2158 else:
2159 logging.debug("Old key of node '%s' is the same as the current master"
2160 " key. Not deleting that key on the node.", node_name)
2161
2162 logging.debug("Removing old SSH keys of all master candidates.")
2163 if node_info_to_remove:
2164 node_errors = RemoveNodeSshKeyBulk(
2165 node_info_to_remove,
2166 master_candidate_uuids,
2167 potential_master_candidates,
2168 master_uuid=master_node_uuid)
2169 if node_errors:
2170 all_node_errors = all_node_errors + node_errors
2171
2172 for (node_uuid, node_name, master_candidate, potential_master_candidate) \
2173 in node_list:
2174
2175 logging.debug("Generating new SSH key for node '%s'.", node_name)
2176 _GenerateNodeSshKey(node_uuid, node_name, ssh_port_map,
2177 pub_key_file=pub_key_file,
2178 ssconf_store=ssconf_store,
2179 noded_cert_file=noded_cert_file,
2180 run_cmd_fn=run_cmd_fn)
2181
2182 try:
2183 logging.debug("Fetching newly created SSH key from node '%s'.", node_name)
2184 pub_key = ssh.ReadRemoteSshPubKeys(dsa_pub_keyfile,
2185 node_name, cluster_name,
2186 ssh_port_map[node_name],
2187 False, # ask_key
2188 False) # key_check
2189 except Exception, err: # pylint: disable=W0703
2190 raise errors.SshUpdateError("Could not fetch key of node %s"
2191 " (UUID %s): %s" % (node_name, node_uuid, err))
2192
2193 if potential_master_candidate:
2194 ssh.RemovePublicKey(node_uuid, key_file=pub_key_file)
2195 ssh.AddPublicKey(node_uuid, pub_key, key_file=pub_key_file)
2196
2197 logging.debug("Add ssh key of node '%s'.", node_name)
2198 node_info = SshAddNodeInfo(name=node_name,
2199 uuid=node_uuid,
2200 to_authorized_keys=master_candidate,
2201 to_public_keys=potential_master_candidate,
2202 get_public_keys=True)
2203 node_keys_to_add.append(node_info)
2204
2205 node_errors = AddNodeSshKeyBulk(
2206 node_keys_to_add, potential_master_candidates,
2207 pub_key_file=pub_key_file, ssconf_store=ssconf_store,
2208 noded_cert_file=noded_cert_file,
2209 run_cmd_fn=run_cmd_fn)
2210 if node_errors:
2211 all_node_errors = all_node_errors + node_errors
2212
2213 # Renewing the master node's key
2214
2215 # Preserve the old keys for now
2216 old_master_keys_by_uuid = _GetOldMasterKeys(master_node_uuid, pub_key_file)
2217
2218 # Generate a new master key with a suffix, don't touch the old one for now
2219 logging.debug("Generate new ssh key of master.")
2220 _GenerateNodeSshKey(master_node_uuid, master_node_name, ssh_port_map,
2221 pub_key_file=pub_key_file,
2222 ssconf_store=ssconf_store,
2223 noded_cert_file=noded_cert_file,
2224 run_cmd_fn=run_cmd_fn,
2225 suffix=constants.SSHS_MASTER_SUFFIX)
2226 # Read newly created master key
2227 new_master_key_dict = _GetNewMasterKey(root_keyfiles, master_node_uuid)
2228
2229 # Replace master key in the master nodes' public key file
2230 ssh.RemovePublicKey(master_node_uuid, key_file=pub_key_file)
2231 for pub_key in new_master_key_dict[master_node_uuid]:
2232 ssh.AddPublicKey(master_node_uuid, pub_key, key_file=pub_key_file)
2233
2234 # Add the new master key to all nodes' public and authorized keys files
2235 logging.debug("Add new master key to all nodes.")
2236 node_errors = AddNodeSshKey(
2237 master_node_uuid, master_node_name, potential_master_candidates,
2238 to_authorized_keys=True, to_public_keys=True,
2239 get_public_keys=False, pub_key_file=pub_key_file,
2240 ssconf_store=ssconf_store, noded_cert_file=noded_cert_file,
2241 run_cmd_fn=run_cmd_fn)
2242 if node_errors:
2243 all_node_errors = all_node_errors + node_errors
2244
2245 # Remove the old key file and rename the new key to the non-temporary filename
2246 _ReplaceMasterKeyOnMaster(root_keyfiles)
2247
2248 # Remove old key from authorized keys
2249 (auth_key_file, _) = \
2250 ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
2251 ssh.RemoveAuthorizedKeys(auth_key_file,
2252 old_master_keys_by_uuid[master_node_uuid])
2253
2254 # Remove the old key from all nodes' authorized_keys files
2255 logging.debug("Remove the old master key from all nodes.")
2256 node_errors = RemoveNodeSshKey(
2257 master_node_uuid, master_node_name, master_candidate_uuids,
2258 potential_master_candidates,
2259 keys_to_remove=old_master_keys_by_uuid, from_authorized_keys=True,
2260 from_public_keys=False, clear_authorized_keys=False,
2261 clear_public_keys=False)
2262 if node_errors:
2263 all_node_errors = all_node_errors + node_errors
2264
2265 return all_node_errors
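
# Example invocation (sketch; assumes UUID and name lists of equal length
# and a working ssconf/noded setup; all names are hypothetical):
#
#   all_errors = RenewSshKeys(
#       ["node1-uuid", "node2-uuid"],
#       ["node1.example.com", "node2.example.com"],
#       ["node1-uuid"],  # master candidate UUIDs
#       ["node1.example.com", "node2.example.com"])  # potential candidates
#   for (node, msg) in all_errors:
#     logging.warning("SSH key renewal issue on '%s': %s", node, msg)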
2266
2267
2268 def GetBlockDevSizes(devices):
2269 """Return the size of the given block devices
2270
2271 @type devices: list
2272 @param devices: list of block device nodes to query
2273 @rtype: dict
2274 @return:
2275 dictionary of all block devices under /dev (key). The value is their
2276 size in MiB.
2277
2278 {'/dev/disk/by-uuid/123456-12321231-312312-312': 124}
2279
2280 """
2281 DEV_PREFIX = "/dev/"
2282 blockdevs = {}
2283
2284 for devpath in devices:
2285 if not utils.IsBelowDir(DEV_PREFIX, devpath):
2286 continue
2287
2288 try:
2289 st = os.stat(devpath)
2290 except EnvironmentError, err:
2291 logging.warning("Error stat()'ing device %s: %s", devpath, str(err))
2292 continue
2293
2294 if stat.S_ISBLK(st.st_mode):
2295 result = utils.RunCmd(["blockdev", "--getsize64", devpath])
2296 if result.failed:
2297 # We don't want to fail, just do not list this device as available
2298 logging.warning("Cannot get size for block device %s", devpath)
2299 continue
2300
2301 size = int(result.stdout) / (1024 * 1024)
2302 blockdevs[devpath] = size
2303 return blockdevs
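
# Example (sketch; the device path and size are hypothetical):
#
#   GetBlockDevSizes(["/dev/sdb", "/tmp/not-a-device"])
#   -> {"/dev/sdb": 10240}
#
# Paths outside /dev and devices whose size cannot be queried are silently
# skipped.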
2304
2305
2306 def GetVolumeList(vg_names):
2307 """Compute list of logical volumes and their size.
2308
2309 @type vg_names: list
2310 @param vg_names: the volume groups whose LVs we should list, or
2311 empty for all volume groups
2312 @rtype: dict
2313 @return:
2314 dictionary of all partitions (key) with value being a tuple of
2315 their size (in MiB), inactive and online status::
2316
2317 {'xenvg/test1': ('20.06', True, True)}
2318
2319 in case of errors, a string is returned with the error
2320 details.
2321
2322 """
2323 lvs = {}
2324 sep = "|"
2325 if not vg_names:
2326 vg_names = []
2327 result = utils.RunCmd(["lvs", "--noheadings", "--units=m", "--nosuffix",
2328 "--separator=%s" % sep,
2329 "-ovg_name,lv_name,lv_size,lv_attr"] + vg_names)
2330 if result.failed:
2331 _Fail("Failed to list logical volumes, lvs output: %s", result.output)
2332
2333 for line in result.stdout.splitlines():
2334 line = line.strip()
2335 match = _LVSLINE_REGEX.match(line)
2336 if not match:
2337 logging.error("Invalid line returned from lvs output: '%s'", line)
2338 continue
2339 vg_name, name, size, attr = match.groups()
2340 inactive = attr[4] == "-"
2341 online = attr[5] == "o"
2342 virtual = attr[0] == "v"
2343 if virtual:
2344 # we don't want to report such volumes as existing, since they
2345 # don't really hold data
2346 continue
2347 lvs[vg_name + "/" + name] = (size, inactive, online)
2348
2349 return lvs
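
# Example of the lv_attr parsing above (sketch): for an active, open LV
# the attribute string looks like "-wi-ao----", so attr[4] == "a" (not
# inactive) and attr[5] == "o" (online):
#
#   GetVolumeList(["xenvg"])
#   -> {"xenvg/test1": ("20.06", False, True)}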
2350
2351
2352 def ListVolumeGroups():
2353 """List the volume groups and their size.
2354
2355 @rtype: dict
2356 @return: dictionary mapping volume group names to their
2357 size
2358
2359 """
2360 return utils.ListVolumeGroups()
2361
2362
2363 def NodeVolumes():
2364 """List all volumes on this node.
2365
2366 @rtype: list
2367 @return:
2368 A list of dictionaries, each having four keys:
2369 - name: the logical volume name,
2370 - size: the size of the logical volume
2371 - dev: the physical device on which the LV lives
2372 - vg: the volume group to which it belongs
2373
2374 In case of errors, we return an empty list and log the
2375 error.
2376
2377 Note that since a logical volume can live on multiple physical
2378 volumes, the resulting list might include a logical volume
2379 multiple times.
2380
2381 """
2382 result = utils.RunCmd(["lvs", "--noheadings", "--units=m", "--nosuffix",
2383 "--separator=|",
2384 "--options=lv_name,lv_size,devices,vg_name"])
2385 if result.failed:
2386 _Fail("Failed to list logical volumes, lvs output: %s",
2387 result.output)
2388
2389 def parse_dev(dev):
2390 return dev.split("(")[0]
2391
2392 def handle_dev(dev):
2393 return [parse_dev(x) for x in dev.split(",")]
2394
2395 def map_line(line):
2396 line = [v.strip() for v in line]
2397 return [{"name": line[0], "size": line[1],
2398 "dev": dev, "vg": line[3]} for dev in handle_dev(line[2])]
2399
2400 all_devs = []
2401 for line in result.stdout.splitlines():
2402 if line.count("|") >= 3:
2403 all_devs.extend(map_line(line.split("|")))
2404 else:
2405 logging.warning("Strange line in the output from lvs: '%s'", line)
2406 return all_devs
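
# Example (sketch): an lvs output line such as
#
#   lv0|1024.00|/dev/sda5(0),/dev/sdb1(0)|xenvg
#
# expands into one entry per physical device:
#
#   [{"name": "lv0", "size": "1024.00", "dev": "/dev/sda5", "vg": "xenvg"},
#    {"name": "lv0", "size": "1024.00", "dev": "/dev/sdb1", "vg": "xenvg"}]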
2407
2408
2409 def BridgesExist(bridges_list):
2410 """Check if a list of bridges exist on the current node.
2411
2412 @rtype: boolean
2413 @return: C{True} if all of them exist, C{False} otherwise
2414
2415 """
2416 missing = []
2417 for bridge in bridges_list:
2418 if not utils.BridgeExists(bridge):
2419 missing.append(bridge)
2420
2421 if missing:
2422 _Fail("Missing bridges %s", utils.CommaJoin(missing))
2423
2424
2425 def GetInstanceListForHypervisor(hname, hvparams=None,
2426 get_hv_fn=hypervisor.GetHypervisor):
2427 """Provides a list of instances of the given hypervisor.
2428
2429 @type hname: string
2430 @param hname: name of the hypervisor
2431 @type hvparams: dict of strings
2432 @param hvparams: hypervisor parameters for the given hypervisor
2433 @type get_hv_fn: function
2434 @param get_hv_fn: function that returns a hypervisor for the given hypervisor
2435 name; optional parameter to increase testability
2436
2437 @rtype: list
2438 @return: a list of all running instances on the current node
2439 - instance1.example.com
2440 - instance2.example.com
2441
2442 """
2443 try:
2444 return get_hv_fn(hname).ListInstances(hvparams=hvparams)
2445 except errors.HypervisorError, err:
2446 _Fail("Error enumerating instances (hypervisor %s): %s",
2447 hname, err, exc=True)
2448
2449
2450 def GetInstanceList(hypervisor_list, all_hvparams=None,
2451 get_hv_fn=hypervisor.GetHypervisor):
2452 """Provides a list of instances.
2453
2454 @type hypervisor_list: list
2455 @param hypervisor_list: the list of hypervisors to query information
2456 @type all_hvparams: dict of dict of strings
2457 @param all_hvparams: a dictionary mapping hypervisor types to respective
2458 cluster-wide hypervisor parameters
2459 @type get_hv_fn: function
2460 @param get_hv_fn: function that returns a hypervisor for the given hypervisor
2461 name; optional parameter to increase testability
2462
2463 @rtype: list
2464 @return: a list of all running instances on the current node
2465 - instance1.example.com
2466 - instance2.example.com
2467
2468 """
2469 results = []
2470 for hname in hypervisor_list:
2471 hvparams = all_hvparams[hname]
2472 results.extend(GetInstanceListForHypervisor(hname, hvparams=hvparams,
2473 get_hv_fn=get_hv_fn))
2474 return results
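
# Example (sketch; the hypervisor parameters are hypothetical):
#
#   all_hvparams = {constants.HT_KVM: {}}
#   GetInstanceList([constants.HT_KVM], all_hvparams)
#   -> ["instance1.example.com", "instance2.example.com"]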
2475
2476
2477 def GetInstanceInfo(instance, hname, hvparams=None):
2478 """Gives back the information about an instance as a dictionary.
2479
2480 @type instance: string
2481 @param instance: the instance name
2482 @type hname: string
2483 @param hname: the hypervisor type of the instance
2484 @type hvparams: dict of strings
2485 @param hvparams: the instance's hvparams
2486
2487 @rtype: dict
2488 @return: dictionary with the following keys:
2489 - memory: memory size of instance (int)
2490 - state: state of instance (HvInstanceState)
2491 - time: cpu time of instance (float)
2492 - vcpus: the number of vcpus (int)
2493
2494 """
2495 output = {}
2496
2497 iinfo = hypervisor.GetHypervisor(hname).GetInstanceInfo(instance,
2498 hvparams=hvparams)
2499 if iinfo is not None:
2500 output["memory"] = iinfo[2]
2501 output["vcpus"] = iinfo[3]
2502 output["state"] = iinfo[4]
2503 output["time"] = iinfo[5]
2504
2505 return output
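
# Example return value (sketch; the numbers are hypothetical):
#
#   GetInstanceInfo("instance1.example.com", constants.HT_KVM)
#   -> {"memory": 2048, "vcpus": 2, "state": <HvInstanceState>,
#       "time": 142.5}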
2506
2507
2508 def GetInstanceMigratable(instance):
2509 """Computes whether an instance can be migrated.
2510
2511 @type instance: L{objects.Instance}
2512 @param instance: object representing the instance to be checked.
2513
2514 @rtype: None
2515 @raise RPCFail: if the instance is not running on this node; missing
2516 disk symlinks are only logged as warnings
2518
2519 """
2520 hyper = hypervisor.GetHypervisor(instance.hypervisor)
2521 iname = instance.name
2522 if iname not in hyper.ListInstances(hvparams=instance.hvparams):
2523 _Fail("Instance %s is not running", iname)
2524
2525 for idx in range(len(instance.disks_info)):
2526 link_name = _GetBlockDevSymlinkPath(iname, idx)
2527 if not os.path.islink(link_name):
2528 logging.warning("Instance %s is missing symlink %s for disk %d",
2529 iname, link_name, idx)
2530
2531
2532 def GetAllInstancesInfo(hypervisor_list, all_hvparams):
2533 """Gather data about all instances.
2534
2535 This is the equivalent of L{GetInstanceInfo}, except that it
2536 computes data for all instances at once, thus being faster if one
2537 needs data about more than one instance.
2538
2539 @type hypervisor_list: list
2540 @param hypervisor_list: list of hypervisors to query for instance data
2541 @type all_hvparams: dict of dict of strings
2542 @param all_hvparams: mapping of hypervisor names to hvparams
2543
2544 @rtype: dict
2545 @return: dictionary of instance: data, with data having the following keys:
2546 - memory: memory size of instance (int)
2547 - state: xen state of instance (string)
2548 - time: cpu time of instance (float)
2549 - vcpus: the number of vcpus
2550
2551 """
2552 output = {}
2553 for hname in hypervisor_list:
2554 hvparams = all_hvparams[hname]
2555 iinfo = hypervisor.GetHypervisor(hname).GetAllInstancesInfo(hvparams)
2556 if iinfo:
2557 for name, _, memory, vcpus, state, times in iinfo:
2558 value = {
2559 "memory": memory,
2560 "vcpus": vcpus,
2561 "state": state,
2562 "time": times,
2563 }
2564 if name in output:
2565 # we only check static parameters, like memory and vcpus,
2566 # and not state and time which can change between the
2567 # invocations of the different hypervisors
2568 for key in "memory", "vcpus":
2569 if value[key] != output[name][key]:
2570 _Fail("Instance %s is running twice"
2571 " with different parameters", name)
2572 output[name] = value
2573
2574 return output
2575
2576
2577 def GetInstanceConsoleInfo(instance_param_dict,
2578 get_hv_fn=hypervisor.GetHypervisor):
2579 """Gather data about the console access of a set of instances of this node.
2580
2581 This function assumes that the caller already knows which instances are on
2582 this node, by calling a function such as L{GetAllInstancesInfo} or
2583 L{GetInstanceList}.
2584
2585 For every instance, a large amount of configuration data needs to be
2586 provided to the hypervisor interface in order to receive the console
2587 information. Whether this could or should be cut down can be discussed.
2588 The information is provided in a dictionary indexed by instance name,
2589 allowing any number of instance queries to be done.
2590
2591 @type instance_param_dict: dict of string to tuple of dictionaries, where the
2592 dictionaries represent: L{objects.Instance}, L{objects.Node},
2593 L{objects.NodeGroup}, HvParams, BeParams
2594 @param instance_param_dict: mapping of instance name to parameters necessary
2595 for console information retrieval
2596
2597 @rtype: dict
2598 @return: dictionary of instance: data, with data having the following keys:
2599 - instance: instance name
2600 - kind: console kind
2601 - message: used with kind == CONS_MESSAGE, indicates console to be
2602 unavailable, supplies error message
2603 - host: host to connect to
2604 - port: port to use
2605 - user: user for login
2606 - command: the command, broken into parts as an array
2607 - display: unknown, potentially unused?
2608
2609 """
2610
2611 output = {}
2612 for inst_name in instance_param_dict:
2613 instance = instance_param_dict[inst_name]["instance"]
2614 pnode = instance_param_dict[inst_name]["node"]
2615 group = instance_param_dict[inst_name]["group"]
2616 hvparams = instance_param_dict[inst_name]["hvParams"]
2617 beparams = instance_param_dict[inst_name]["beParams"]
2618
2619 instance = objects.Instance.FromDict(instance)
2620 pnode = objects.Node.FromDict(pnode)
2621 group = objects.NodeGroup.FromDict(group)
2622
2623 h = get_hv_fn(instance.hypervisor)
2624 output[inst_name] = h.GetInstanceConsole(instance, pnode, group,
2625 hvparams, beparams).ToDict()
2626
2627 return output
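
# Example shape of the input mapping (sketch; the dictionaries are the
# ToDict() serializations of the respective objects):
#
#   instance_param_dict = {
#     "instance1.example.com": {
#       "instance": inst.ToDict(),
#       "node": pnode.ToDict(),
#       "group": group.ToDict(),
#       "hvParams": hvparams,
#       "beParams": beparams,
#     },
#   }
#   GetInstanceConsoleInfo(instance_param_dict)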
2628
2629
2630 def _InstanceLogName(kind, os_name, instance, component):
2631 """Compute the OS log filename for a given instance and operation.
2632
2633 The instance name and os name are passed in as strings since not all
2634 operations have these as part of an instance object.
2635
2636 @type kind: string
2637 @param kind: the operation type (e.g. add, import, etc.)
2638 @type os_name: string
2639 @param os_name: the os name
2640 @type instance: string
2641 @param instance: the name of the instance being imported/added/etc.
2642 @type component: string or None
2643 @param component: the name of the component of the instance being
2644 transferred
2645
2646 """
2647 # TODO: Use tempfile.mkstemp to create unique filename
2648 if component:
2649 assert "/" not in component
2650 c_msg = "-%s" % component
2651 else:
2652 c_msg = ""
2653 base = ("%s-%s-%s%s-%s.log" %
2654 (kind, os_name, instance, c_msg, utils.TimestampForFilename()))
2655 return utils.PathJoin(pathutils.LOG_OS_DIR, base)
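
# Example (sketch): importing the first disk of "instance1" with the
# "debootstrap" OS yields a log file name like
#
#   <LOG_OS_DIR>/import-debootstrap-instance1-disk0-<timestamp>.log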
2656
2657
2658 def InstanceOsAdd(instance, reinstall, debug):
2659 """Add an OS to an instance.
2660
2661 @type instance: L{objects.Instance}
2662 @param instance: Instance whose OS is to be installed
2663 @type reinstall: boolean
2664 @param reinstall: whether this is an instance reinstall
2665 @type debug: integer
2666 @param debug: debug level, passed to the OS scripts
2667 @rtype: None
2668
2669 """
2670 inst_os = OSFromDisk(instance.os)
2671
2672 create_env = OSEnvironment(instance, inst_os, debug)
2673 if reinstall:
2674 create_env["INSTANCE_REINSTALL"] = "1"
2675
2676 logfile = _InstanceLogName("add", instance.os, instance.name, None)
2677
2678 result = utils.RunCmd([inst_os.create_script], env=create_env,
2679 cwd=inst_os.path, output=logfile, reset_env=True)
2680 if result.failed:
2681 logging.error("os create command '%s' returned error: %s, logfile: %s,"
2682 " output: %s", result.cmd, result.fail_reason, logfile,
2683 result.output)
2684 lines = [utils.SafeEncode(val)
2685 for val in utils.TailFile(logfile, lines=20)]
2686 _Fail("OS create script failed (%s), last lines in the"
2687 " log file:\n%s", result.fail_reason, "\n".join(lines), log=False)
2688
2689
2690 def RunRenameInstance(instance, old_name, debug):
2691 """Run the OS rename script for an instance.
2692
2693 @type instance: L{objects.Instance}
2694 @param instance: Instance whose OS is to be installed
2695 @type old_name: string
2696 @param old_name: previous instance name
2697 @type debug: integer
2698 @param debug: debug level, passed to the OS scripts
2699 @rtype: boolean
2700 @return: the success of the operation
2701
2702 """
2703 inst_os = OSFromDisk(instance.os)
2704
2705 rename_env = OSEnvironment(instance, inst_os, debug)
2706 rename_env["OLD_INSTANCE_NAME"] = old_name
2707
2708 logfile = _InstanceLogName("rename", instance.os,
2709 "%s-%s" % (old_name, instance.name), None)
2710
2711 result = utils.RunCmd([inst_os.rename_script], env=rename_env,
2712 cwd=inst_os.path, output=logfile, reset_env=True)
2713
2714 if result.failed:
2715 logging.error("os create command '%s' returned error: %s output: %s",
2716 result.cmd, result.fail_reason, result.output)
2717 lines = [utils.SafeEncode(val)
2718 for val in utils.TailFile(logfile, lines=20)]
2719 _Fail("OS rename script failed (%s), last lines in the"
2720 " log file:\n%s", result.fail_reason, "\n".join(lines), log=False)
2721
2722
2723 def _GetBlockDevSymlinkPath(instance_name, idx, _dir=None):
2724 """Returns symlink path for block device.
2725
2726 """
2727 if _dir is None:
2728 _dir = pathutils.DISK_LINKS_DIR
2729
2730 return utils.PathJoin(_dir,
2731 ("%s%s%s" %
2732 (instance_name, constants.DISK_SEPARATOR, idx)))
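
# Example (sketch; assumes constants.DISK_SEPARATOR is ":"):
#
#   _GetBlockDevSymlinkPath("instance1.example.com", 0)
#   -> "<DISK_LINKS_DIR>/instance1.example.com:0"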
2733
2734
2735 def _SymlinkBlockDev(instance_name, device_path, idx):
2736 """Set up symlinks to a instance's block device.
2737
2738 This is an auxiliary function run when an instance is started (on the
2739 primary node) or when an instance is migrated (on the target node).
2740
2741
2742 @param instance_name: the name of the target instance
2743 @param device_path: path of the physical block device, on the node
2744 @param idx: the disk index
2745 @return: absolute path to the disk's symlink
2746
2747 """
2748 # In case we have only a userspace access URI, device_path is None
2749 if not device_path:
2750 return None
2751
2752 link_name = _GetBlockDevSymlinkPath(instance_name, idx)
2753 try:
2754 os.symlink(device_path, link_name)
2755 except OSError, err:
2756 if err.errno == errno.EEXIST:
2757 if (not os.path.islink(link_name) or
2758 os.readlink(link_name) != device_path):
2759 os.remove(link_name)
2760 os.symlink(device_path, link_name)
2761 else:
2762 raise
2763
2764 return link_name
2765
2766
2767 def _RemoveBlockDevLinks(instance_name, disks):
2768 """Remove the block device symlinks belonging to the given instance.
2769
2770 """
2771 for idx, _ in enumerate(disks):
2772 link_name = _GetBlockDevSymlinkPath(instance_name, idx)
2773 if os.path.islink(link_name):
2774 try:
2775 os.remove(link_name)
2776 except OSError:
2777 logging.exception("Can't remove symlink '%s'", link_name)
2778
2779
2780 def _CalculateDeviceURI(instance, disk, device):
2781 """Get the URI for the device.
2782
2783 @type instance: L{objects.Instance}
2784 @param instance: the instance which disk belongs to
2785 @type disk: L{objects.Disk}
2786 @param disk: the target disk object
2787 @type device: L{bdev.BlockDev}
2788 @param device: the corresponding BlockDevice
2789 @rtype: string
2790 @return: the device uri if any else None
2791
2792 """
2793 access_mode = disk.params.get(constants.LDP_ACCESS,
2794 constants.DISK_KERNELSPACE)
2795 if access_mode == constants.DISK_USERSPACE:
2796 # This can raise errors.BlockDeviceError
2797 return device.GetUserspaceAccessUri(instance.hypervisor)
2798 else:
2799 return None
2800
2801
2802 def _GatherAndLinkBlockDevs(instance):
2803 """Set up an instance's block device(s).
2804
2805 This is run on the primary node at instance startup. The block
2806 devices must be already assembled.
2807
2808 @type instance: L{objects.Instance}
2809 @param instance: the instance whose disks we should assemble
2810 @rtype: list
2811 @return: list of (disk_object, link_name, drive_uri)
2812
2813 """
2814 block_devices = []
2815 for idx, disk in enumerate(instance.disks_info):
2816 device = _RecursiveFindBD(disk)
2817 if device is None:
2818 raise errors.BlockDeviceError("Block device '%s' is not set up." %
2819 str(disk))
2820 device.Open()
2821 try:
2822 link_name = _SymlinkBlockDev(instance.name, device.dev_path, idx)
2823 except OSError, e:
2824 raise errors.BlockDeviceError("Cannot create block device symlink: %s" %
2825 e.strerror)
2826 uri = _CalculateDeviceURI(instance, disk, device)
2827
2828 block_devices.append((disk, link_name, uri))
2829
2830 return block_devices
2831
2832
2833 def _IsInstanceUserDown(instance_info):
2834 return instance_info and \
2835 "state" in instance_info and \
2836 hv_base.HvInstanceState.IsShutdown(instance_info["state"])
2837
2838
2839 def _GetInstanceInfo(instance):
2840 """Helper function L{GetInstanceInfo}"""
2841 return GetInstanceInfo(instance.name, instance.hypervisor,
2842 hvparams=instance.hvparams)
2843
2844
2845 def StartInstance(instance, startup_paused, reason, store_reason=True):
2846 """Start an instance.
2847
2848 @type instance: L{objects.Instance}
2849 @param instance: the instance object
2850 @type startup_paused: bool
2851 @param startup_paused: whether to pause the instance at startup
2852 @type reason: list of reasons
2853 @param reason: the reason trail for this startup
2854 @type store_reason: boolean
2855 @param store_reason: whether to store the shutdown reason trail on file
2856 @rtype: None
2857
2858 """
2859 instance_info = _GetInstanceInfo(instance)
2860
2861 if instance_info and not _IsInstanceUserDown(instance_info):
2862 logging.info("Instance '%s' already running, not starting", instance.name)
2863 return
2864
2865 try:
2866 block_devices = _GatherAndLinkBlockDevs(instance)
2867 hyper = hypervisor.GetHypervisor(instance.hypervisor)
2868 hyper.StartInstance(instance, block_devices, startup_paused)
2869 if store_reason:
2870 _StoreInstReasonTrail(instance.name, reason)
2871 except errors.BlockDeviceError, err:
2872 _Fail("Block device error: %s", err, exc=True)
2873 except errors.HypervisorError, err:
2874 _RemoveBlockDevLinks(instance.name, instance.disks_info)
2875 _Fail("Hypervisor error: %s", err, exc=True)
2876
2877
2878 def InstanceShutdown(instance, timeout, reason, store_reason=True):
2879 """Shut an instance down.
2880
2881 @note: this function uses polling with a hardcoded timeout.
2882
2883 @type instance: L{objects.Instance}
2884 @param instance: the instance object
2885 @type timeout: integer
2886 @param timeout: maximum timeout for soft shutdown
2887 @type reason: list of reasons
2888 @param reason: the reason trail for this shutdown
2889 @type store_reason: boolean
2890 @param store_reason: whether to store the shutdown reason trail on file
2891 @rtype: None
2892
2893 """
2894 hyper = hypervisor.GetHypervisor(instance.hypervisor)
2895
2896 if not _GetInstanceInfo(instance):
2897 logging.info("Instance '%s' not running, doing nothing", instance.name)
2898 return
2899
2900 class _TryShutdown(object):
2901 def __init__(self):
2902 self.tried_once = False
2903
2904 def __call__(self):
2905 if not _GetInstanceInfo(instance):
2906 return
2907
2908 try:
2909 hyper.StopInstance(instance, retry=self.tried_once, timeout=timeout)
2910 if store_reason:
2911 _StoreInstReasonTrail(instance.name, reason)
2912 except errors.HypervisorError, err:
2913 # if the instance is no longer existing, consider this a
2914 # success and go to cleanup
2915 if not _GetInstanceInfo(instance):
2916 return
2917
2918 _Fail("Failed to stop instance '%s': %s", instance.name, err)
2919
2920 self.tried_once = True
2921
2922 raise utils.RetryAgain()
2923
2924 try:
2925 utils.Retry(_TryShutdown(), 5, timeout)
2926 except utils.RetryTimeout:
2927 # the shutdown did not succeed
2928 logging.error("Shutdown of '%s' unsuccessful, forcing", instance.name)
2929
2930 try:
2931 hyper.StopInstance(instance, force=True)
2932 except errors.HypervisorError, err:
2933 # only raise an error if the instance still exists, otherwise
2934 # the error could simply be "instance ... unknown"!
2935 if _GetInstanceInfo(instance):
2936 _Fail("Failed to force stop instance '%s': %s", instance.name, err)
2937
2938 time.sleep(1)
2939
2940 if _GetInstanceInfo(instance):
2941 _Fail("Could not shutdown instance '%s' even by destroy", instance.name)
2942
2943 try:
2944 hyper.CleanupInstance(instance.name)
2945 except errors.HypervisorError, err:
2946 logging.warning("Failed to execute post-shutdown cleanup step: %s", err)
2947
2948 _RemoveBlockDevLinks(instance.name, instance.disks_info)
2949
2950
2951 def InstanceReboot(instance, reboot_type, shutdown_timeout, reason):
2952 """Reboot an instance.
2953
2954 @type instance: L{objects.Instance}
2955 @param instance: the instance object to reboot
2956 @type reboot_type: str
2957 @param reboot_type: the type of reboot, one of the following
2958 constants:
2959 - L{constants.INSTANCE_REBOOT_SOFT}: only reboot the
2960 instance OS, do not recreate the VM
2961 - L{constants.INSTANCE_REBOOT_HARD}: tear down and
2962 restart the VM (at the hypervisor level)
2963 - the other reboot type (L{constants.INSTANCE_REBOOT_FULL}) is
2964 not accepted here, since that mode is handled differently, in
2965 cmdlib, and translates into full stop and start of the
2966 instance (instead of a call_instance_reboot RPC)
2967 @type shutdown_timeout: integer
2968 @param shutdown_timeout: maximum timeout for soft shutdown
2969 @type reason: list of reasons
2970 @param reason: the reason trail for this reboot
2971 @rtype: None
2972
2973 """
2974 # TODO: this is inconsistent with 'StartInstance' and 'InstanceShutdown'
2975 # because those functions simply 'return' on error whereas this one
2976 # raises an exception with '_Fail'
2977 if not _GetInstanceInfo(instance):
2978 _Fail("Cannot reboot instance '%s' that is not running", instance.name)
2979
2980 hyper = hypervisor.GetHypervisor(instance.hypervisor)
2981 if reboot_type == constants.INSTANCE_REBOOT_SOFT:
2982 try:
2983 hyper.RebootInstance(instance)
2984 except errors.HypervisorError, err:
2985 _Fail("Failed to soft reboot instance '%s': %s", instance.name, err)
2986 elif reboot_type == constants.INSTANCE_REBOOT_HARD:
2987 try:
2988 InstanceShutdown(instance, shutdown_timeout, reason, store_reason=False)
2989 result = StartInstance(instance, False, reason, store_reason=False)
2990 _StoreInstReasonTrail(instance.name, reason)
2991 return result
2992 except errors.HypervisorError, err:
2993 _Fail("Failed to hard reboot instance '%s': %s", instance.name, err)
2994 else:
2995 _Fail("Invalid reboot_type received: '%s'", reboot_type)
2996
2997
2998 def InstanceBalloonMemory(instance, memory):
2999 """Resize an instance's memory.
3000
3001 @type instance: L{objects.Instance}
3002 @param instance: the instance object
3003 @type memory: int
3004 @param memory: new memory amount in MB
3005 @rtype: None
3006
3007 """
3008 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3009 running = hyper.ListInstances(hvparams=instance.hvparams)
3010 if instance.name not in running:
3011 logging.info("Instance %s is not running, cannot balloon", instance.name)
3012 return
3013 try:
3014 hyper.BalloonInstanceMemory(instance, memory)
3015 except errors.HypervisorError, err:
3016 _Fail("Failed to balloon instance memory: %s", err, exc=True)
3017
3018
3019 def MigrationInfo(instance):
3020 """Gather information about an instance to be migrated.
3021
3022 @type instance: L{objects.Instance}
3023 @param instance: the instance definition
3024
3025 """
3026 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3027 try:
3028 info = hyper.MigrationInfo(instance)
3029 except errors.HypervisorError, err:
3030 _Fail("Failed to fetch migration information: %s", err, exc=True)
3031 return info
3032
3033
3034 def AcceptInstance(instance, info, target):
3035 """Prepare the node to accept an instance.
3036
3037 @type instance: L{objects.Instance}
3038 @param instance: the instance definition
3039 @type info: string/data (opaque)
3040 @param info: migration information, from the source node
3041 @type target: string
3042 @param target: target host (usually ip), on this node
3043
3044 """
3045 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3046 try:
3047 hyper.AcceptInstance(instance, info, target)
3048 except errors.HypervisorError, err:
3049 _Fail("Failed to accept instance: %s", err, exc=True)
3050
3051
3052 def FinalizeMigrationDst(instance, info, success):
3053 """Finalize any preparation to accept an instance.
3054
3055 @type instance: L{objects.Instance}
3056 @param instance: the instance definition
3057 @type info: string/data (opaque)
3058 @param info: migration information, from the source node
3059 @type success: boolean
3060 @param success: whether the migration was a success or a failure
3061
3062 """
3063 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3064 try:
3065 hyper.FinalizeMigrationDst(instance, info, success)
3066 except errors.HypervisorError, err:
3067 _Fail("Failed to finalize migration on the target node: %s", err, exc=True)
3068
3069
3070 def MigrateInstance(cluster_name, instance, target, live):
3071 """Migrates an instance to another node.
3072
3073 @type cluster_name: string
3074 @param cluster_name: name of the cluster
3075 @type instance: L{objects.Instance}
3076 @param instance: the instance definition
3077 @type target: string
3078 @param target: the target node name
3079 @type live: boolean
3080 @param live: whether the migration should be done live or not (the
3081 interpretation of this parameter is left to the hypervisor)
3082 @raise RPCFail: if migration fails for some reason
3083
3084 """
3085 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3086
3087 try:
3088 hyper.MigrateInstance(cluster_name, instance, target, live)
3089 except errors.HypervisorError, err:
3090 _Fail("Failed to migrate instance: %s", err, exc=True)
3091
3092
3093 def FinalizeMigrationSource(instance, success, live):
3094 """Finalize the instance migration on the source node.
3095
3096 @type instance: L{objects.Instance}
3097 @param instance: the instance definition of the migrated instance
3098 @type success: bool
3099 @param success: whether the migration succeeded or not
3100 @type live: bool
3101 @param live: whether the user requested a live migration or not
3102 @raise RPCFail: If the execution fails for some reason
3103
3104 """
3105 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3106
3107 try:
3108 hyper.FinalizeMigrationSource(instance, success, live)
3109 except Exception, err: # pylint: disable=W0703
3110 _Fail("Failed to finalize the migration on the source node: %s", err,
3111 exc=True)
3112
3113
3114 def GetMigrationStatus(instance):
3115 """Get the migration status
3116
3117 @type instance: L{objects.Instance}
3118 @param instance: the instance that is being migrated
3119 @rtype: L{objects.MigrationStatus}
3120 @return: the status of the current migration (one of
3121 L{constants.HV_MIGRATION_VALID_STATUSES}), plus any additional
3122 progress info that can be retrieved from the hypervisor
3123 @raise RPCFail: If the migration status cannot be retrieved
3124
3125 """
3126 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3127 try:
3128 return hyper.GetMigrationStatus(instance)
3129 except Exception, err: # pylint: disable=W0703
3130 _Fail("Failed to get migration status: %s", err, exc=True)
3131
3132
3133 def HotplugDevice(instance, action, dev_type, device, extra, seq):
3134 """Hotplug a device
3135
3136 Hotplug is currently supported only for the KVM hypervisor.
3137 @type instance: L{objects.Instance}
3138 @param instance: the instance to which we hotplug a device
3139 @type action: string
3140 @param action: the hotplug action to perform
3141 @type dev_type: string
3142 @param dev_type: the device type to hotplug
3143 @type device: either L{objects.NIC} or L{objects.Disk}
3144 @param device: the device object to hotplug
3145 @type extra: tuple
3146 @param extra: extra info used for disk hotplug (disk link, drive uri)
3147 @type seq: int
3148 @param seq: the index of the device from master perspective
3149 @raise RPCFail: in case the instance does not use the KVM hypervisor
3150
3151 """
3152 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3153 try:
3154 hyper.VerifyHotplugSupport(instance, action, dev_type)
3155 except errors.HotplugError, err:
3156 _Fail("Hotplug is not supported: %s", err)
3157
3158 if action == constants.HOTPLUG_ACTION_ADD:
3159 fn = hyper.HotAddDevice
3160 elif action == constants.HOTPLUG_ACTION_REMOVE:
3161 fn = hyper.HotDelDevice
3162 elif action == constants.HOTPLUG_ACTION_MODIFY:
3163 fn = hyper.HotModDevice
3164 else:
3165 assert action in constants.HOTPLUG_ALL_ACTIONS
3166
3167 return fn(instance, dev_type, device, extra, seq)
3168
3169
def HotplugSupported(instance):
  """Checks if hotplug is generally supported.

  """
  hyper = hypervisor.GetHypervisor(instance.hypervisor)
  try:
    hyper.HotplugSupported(instance)
  except errors.HotplugError, err:
    _Fail("Hotplug is not supported: %s", err)


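# Illustrative sketch only: the check-then-act pattern implied by the two
# helpers above. The NIC object and index are hypothetical; per the
# HotplugDevice docstring, "extra" is only used for disk hotplug, hence
# None here.
def _ExampleHotAddNic(instance, nic, seq):
  """Example: verify hotplug support, then hot-add a NIC."""
  HotplugSupported(instance)
  return HotplugDevice(instance, constants.HOTPLUG_ACTION_ADD,
                       constants.HOTPLUG_TARGET_NIC, nic, None, seq)

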
def ModifyInstanceMetadata(metadata):
  """Sends instance data to the metadata daemon.

  Uses the Luxi transport layer to communicate with the metadata
  daemon configuration server. It starts the metadata daemon if it is
  not running.
  The daemon must be enabled at configuration time.

  @type metadata: dict
  @param metadata: instance metadata obtained by calling
      L{objects.Instance.ToDict} on an instance object

  """
  if not constants.ENABLE_METAD:
    raise errors.ProgrammerError("The metadata daemon is disabled, yet"
                                 " ModifyInstanceMetadata has been called")

  if not utils.IsDaemonAlive(constants.METAD):
    result = utils.RunCmd([pathutils.DAEMON_UTIL, "start", constants.METAD])
    if result.failed:
      raise errors.HypervisorError("Failed to start metadata daemon")

  with contextlib.closing(metad.Client()) as client:
    client.UpdateConfig(metadata)


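# Illustrative sketch only: pushing one instance's data to the metadata
# daemon; per the docstring above, the expected payload is the dict
# produced by L{objects.Instance.ToDict}.
def _ExamplePushInstanceMetadata(instance):
  """Example: send an instance's serialized data to the metadata daemon."""
  ModifyInstanceMetadata(instance.ToDict())

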
def BlockdevCreate(disk, size, owner, on_primary, info, excl_stor):
  """Creates a block device for an instance.

  @type disk: L{objects.Disk}
  @param disk: the object describing the disk we should create
  @type size: int
  @param size: the size of the physical underlying device, in MiB
  @type owner: str
  @param owner: the name of the instance for which the disk is created,
      used for device cache data
  @type on_primary: boolean
  @param on_primary: indicates if it is the primary node or not
  @type info: string
  @param info: string that will be sent to the physical device
      creation, used for example to set (LVM) tags on LVs
  @type excl_stor: boolean
  @param excl_stor: Whether exclusive_storage is active

  @return: the new unique_id of the device (this can sometimes be
      computed only after creation), or None. On secondary nodes,
      it's not required to return anything.

  """
  # TODO: remove the obsolete "size" argument
  # pylint: disable=W0613
  clist = []
  if disk.children:
    for child in disk.children:
      try:
        crdev = _RecursiveAssembleBD(child, owner, on_primary)
      except errors.BlockDeviceError, err:
        _Fail("Can't assemble device %s: %s", child, err)
      if on_primary or disk.AssembleOnSecondary():
        # we need the children open in case the device itself has to
        # be assembled
        try:
          # pylint: disable=E1103
          crdev.Open()
        except errors.BlockDeviceError, err:
          _Fail("Can't make child '%s' read-write: %s", child, err)
      clist.append(crdev)

  try:
    device = bdev.Create(disk, clist, excl_stor)
  except errors.BlockDeviceError, err:
    _Fail("Can't create block device: %s", err)

  if on_primary or disk.AssembleOnSecondary():
    try:
      device.Assemble()
    except errors.BlockDeviceError, err:
      _Fail("Can't assemble device after creation, unusual event: %s", err)
    if on_primary or disk.OpenOnSecondary():
      try:
        device.Open(force=True)
      except errors.BlockDeviceError, err:
        _Fail("Can't make device r/w after creation, unusual event: %s", err)
    DevCacheManager.UpdateCache(device.dev_path, owner,
                                on_primary, disk.iv_name)

  device.SetInfo(info)

  return device.unique_id


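# Illustrative sketch only: creating a plain LVM-backed disk on the
# primary node. The volume group, LV name, size and info text are all
# hypothetical, and only a commonly used subset of L{objects.Disk}
# fields is shown.
def _ExampleCreatePlainDisk(owner):
  """Example: create a 1 GiB plain disk for the given instance name."""
  disk = objects.Disk(dev_type=constants.DT_PLAIN, size=1024,
                      logical_id=("xenvg", "example.disk0"), children=[],
                      iv_name="disk/0")
  return BlockdevCreate(disk, disk.size, owner, True, "example info",
                        False)

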
def _DumpDevice(source_path, target_path, offset, size, truncate):
  """This function images/wipes the device using a local file.

  @type source_path: string
  @param source_path: path of the image or data source (e.g., "/dev/zero")

  @type target_path: string
  @param target_path: path of the device to image/wipe

  @type offset: int
  @param offset: offset in MiB in the output file

  @type size: int
  @param size: maximum size in MiB to write (data source might be smaller)

  @type truncate: bool
  @param truncate: whether the file should be truncated

  @return: None
  @raise RPCFail: in case of failure

  """
  # Internal sizes are always in mebibytes; if the following "dd" command
  # should use a different block size, the offset and size given to this
  # function must be adjusted accordingly before being passed to "dd".
  block_size = constants.DD_BLOCK_SIZE

  cmd = [constants.DD_CMD, "if=%s" % source_path, "seek=%d" % offset,
         "bs=%s" % block_size, "oflag=direct", "of=%s" % target_path,
         "count=%d" % size]

  if not truncate:
    cmd.append("conv=notrunc")

  result = utils.RunCmd(cmd)

  if result.failed:
    _Fail("Dump command '%s' exited with error: %s; output: %s", result.cmd,
          result.fail_reason, result.output)


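# Illustrative sketch only: wiping the start of a (hypothetical) device
# with _DumpDevice. Assuming the usual 1 MiB block size, this amounts to
# roughly:
#   dd if=/dev/zero of=<target> seek=0 bs=1M count=128 oflag=direct
def _ExampleWipeDeviceStart(target_path):
  """Example: zero out the first 128 MiB of the given device."""
  _DumpDevice("/dev/zero", target_path, 0, 128, True)

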
def _DownloadAndDumpDevice(source_url, target_path, size):
  """This function images a device using a downloaded image file.

  @type source_url: string
  @param source_url: URL of image to dump to disk

  @type target_path: string
  @param target_path: path of the device to image

  @type size: int
  @param size: maximum size in MiB to write (data source might be smaller)

  @rtype: NoneType
  @return: None
  @raise RPCFail: in case of download or write failures

  """
  class DDParams(object):
    def __init__(self, current_size, total_size):
      self.current_size = current_size
      self.total_size = total_size
      self.image_size_error = False

  def dd_write(ddparams, out):
    if ddparams.current_size < ddparams.total_size:
      ddparams.current_size += len(out)
      target_file.write(out)
    else:
      # returning a value different from the number of bytes received
      # makes libcurl abort the transfer with a write error
      ddparams.image_size_error = True
      return -1

  target_file = open(target_path, "r+")
  ddparams = DDParams(0, 1024 * 1024 * size)

  curl = pycurl.Curl()
  curl.setopt(pycurl.VERBOSE,