#
#

# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014 Google Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


"""Functions used by the node daemon

@var _ALLOWED_UPLOAD_FILES: denotes which files are accepted in
    the L{UploadFile} function
@var _ALLOWED_CLEAN_DIRS: denotes which directories are accepted
    in the L{_CleanDirectory} function

"""

# pylint: disable=E1103,C0302

# E1103: %s %r has no %r member (but some types could not be
# inferred), because the _TryOSFromDisk returns either (True, os_obj)
# or (False, "string") which confuses pylint

# C0302: This module has become too big and should be split up


import base64
import errno
import logging
import os
import os.path
import pycurl
import random
import re
import shutil
import signal
import stat
import tempfile
import time
import zlib
import contextlib
import collections

from ganeti import errors
from ganeti import http
from ganeti import utils
from ganeti import ssh
from ganeti import hypervisor
from ganeti.hypervisor import hv_base
from ganeti import constants
from ganeti.storage import bdev
from ganeti.storage import drbd
from ganeti.storage import extstorage
from ganeti.storage import filestorage
from ganeti import objects
from ganeti import ssconf
from ganeti import serializer
from ganeti import netutils
from ganeti import runtime
from ganeti import compat
from ganeti import pathutils
from ganeti import vcluster
from ganeti import ht
from ganeti.storage.base import BlockDev
from ganeti.storage.drbd import DRBD8
from ganeti import hooksmaster
import ganeti.metad as metad


_BOOT_ID_PATH = "/proc/sys/kernel/random/boot_id"
_ALLOWED_CLEAN_DIRS = compat.UniqueFrozenset([
  pathutils.DATA_DIR,
  pathutils.JOB_QUEUE_ARCHIVE_DIR,
  pathutils.QUEUE_DIR,
  pathutils.CRYPTO_KEYS_DIR,
  ])
_MAX_SSL_CERT_VALIDITY = 7 * 24 * 60 * 60
_X509_KEY_FILE = "key"
_X509_CERT_FILE = "cert"
_IES_STATUS_FILE = "status"
_IES_PID_FILE = "pid"
_IES_CA_FILE = "ca"

#: Valid LVS output line regex
_LVSLINE_REGEX = re.compile(r"^ *([^|]+)\|([^|]+)\|([0-9.]+)\|([^|]{6,})\|?$")

# Actions for the master setup script
_MASTER_START = "start"
_MASTER_STOP = "stop"

#: Maximum file permissions for restricted command directory and executables
_RCMD_MAX_MODE = (stat.S_IRWXU |
                  stat.S_IRGRP | stat.S_IXGRP |
                  stat.S_IROTH | stat.S_IXOTH)

#: Delay before returning an error for restricted commands
_RCMD_INVALID_DELAY = 10

#: How long to wait to acquire lock for restricted commands (shorter than
#: L{_RCMD_INVALID_DELAY}) to reduce blockage of noded forks when many
#: command requests arrive
_RCMD_LOCK_TIMEOUT = _RCMD_INVALID_DELAY * 0.8


class RPCFail(Exception):
  """Class denoting RPC failure.

  Its argument is the error message.

  """


def _GetInstReasonFilename(instance_name):
  """Path of the file containing the reason of the instance status change.

  @type instance_name: string
  @param instance_name: The name of the instance
  @rtype: string
  @return: The path of the file

  """
  return utils.PathJoin(pathutils.INSTANCE_REASON_DIR, instance_name)


def _StoreInstReasonTrail(instance_name, trail):
  """Serialize a reason trail related to an instance change of state to file.

  The exact location of the file depends on the name of the instance and on
  the configuration of the Ganeti cluster defined at deploy time.

  @type instance_name: string
  @param instance_name: The name of the instance

  @type trail: list of reasons
  @param trail: reason trail

  @rtype: None

  """
  json = serializer.DumpJson(trail)
  filename = _GetInstReasonFilename(instance_name)
  utils.WriteFile(filename, data=json)
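
# Illustrative sketch (not part of the upstream module): a reason trail is a
# list of (source, reason, timestamp) entries; the values below are made up.
#
#   _StoreInstReasonTrail("instance1.example.com",
#                         [["gnt:user", "shutdown for maintenance",
#                           1234567890000000000L]])
#
# would serialize the trail as JSON under pathutils.INSTANCE_REASON_DIR.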


def _Fail(msg, *args, **kwargs):
168 """Log an error and the raise an RPCFail exception.

  This exception is then handled specially in the ganeti daemon and
  turned into a 'failed' return type. As such, this function is a
  useful shortcut for logging the error and returning it to the master
  daemon.

  @type msg: string
  @param msg: the text of the exception
  @raise RPCFail

  """
  if args:
    msg = msg % args
  if "log" not in kwargs or kwargs["log"]:  # if we should log this error
    if "exc" in kwargs and kwargs["exc"]:
      logging.exception(msg)
    else:
      logging.error(msg)
  raise RPCFail(msg)
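
# Illustrative sketch (not part of the upstream module) of how _Fail behaves:
# positional arguments are %-interpolated into the message before RPCFail is
# raised, so a caller can write:
#
#   try:
#     _Fail("Can't read %s: %s", "/etc/hosts", "permission denied", log=False)
#   except RPCFail, err:
#     assert str(err) == "Can't read /etc/hosts: permission denied"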


def _GetConfig():
  """Simple wrapper to return a SimpleStore.

  @rtype: L{ssconf.SimpleStore}
  @return: a SimpleStore instance

  """
  return ssconf.SimpleStore()


def _GetSshRunner(cluster_name):
  """Simple wrapper to return an SshRunner.

  @type cluster_name: str
  @param cluster_name: the cluster name, which is needed
      by the SshRunner constructor
  @rtype: L{ssh.SshRunner}
  @return: an SshRunner instance

  """
  return ssh.SshRunner(cluster_name)


def _Decompress(data):
  """Unpacks data compressed by the RPC client.

  @type data: list or tuple
  @param data: Data sent by RPC client
  @rtype: str
  @return: Decompressed data

  """
  assert isinstance(data, (list, tuple))
  assert len(data) == 2
  (encoding, content) = data
  if encoding == constants.RPC_ENCODING_NONE:
    return content
  elif encoding == constants.RPC_ENCODING_ZLIB_BASE64:
    return zlib.decompress(base64.b64decode(content))
  else:
    raise AssertionError("Unknown data encoding")
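
# Illustrative sketch (not part of the upstream module): the wire format is a
# (encoding, content) pair, so a zlib+base64 payload round-trips like this:
#
#   packed = (constants.RPC_ENCODING_ZLIB_BASE64,
#             base64.b64encode(zlib.compress("some payload")))
#   assert _Decompress(packed) == "some payload"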


def _CleanDirectory(path, exclude=None):
  """Removes all regular files in a directory.

  @type path: str
  @param path: the directory to clean
  @type exclude: list
  @param exclude: list of files to be excluded, defaults
      to the empty list

  """
  if path not in _ALLOWED_CLEAN_DIRS:
    _Fail("Path passed to _CleanDirectory not in allowed clean targets: '%s'",
          path)

  if not os.path.isdir(path):
    return
  if exclude is None:
    exclude = []
  else:
    # Normalize excluded paths
    exclude = [os.path.normpath(i) for i in exclude]

  for rel_name in utils.ListVisibleFiles(path):
    full_name = utils.PathJoin(path, rel_name)
    if full_name in exclude:
      continue
    if os.path.isfile(full_name) and not os.path.islink(full_name):
      utils.RemoveFile(full_name)
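
# Illustrative sketch (not part of the upstream module): only whitelisted
# directories may be cleaned, and excluded files survive, e.g.:
#
#   _CleanDirectory(pathutils.QUEUE_DIR,
#                   exclude=[pathutils.JOB_QUEUE_LOCK_FILE])
#
# Any path outside _ALLOWED_CLEAN_DIRS makes the call fail via _Fail.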


def _BuildUploadFileList():
  """Build the list of allowed upload files.

  This is abstracted so that it's built only once at module import time.

  """
  allowed_files = set([
    pathutils.CLUSTER_CONF_FILE,
    pathutils.ETC_HOSTS,
    pathutils.SSH_KNOWN_HOSTS_FILE,
    pathutils.VNC_PASSWORD_FILE,
    pathutils.RAPI_CERT_FILE,
    pathutils.SPICE_CERT_FILE,
    pathutils.SPICE_CACERT_FILE,
    pathutils.RAPI_USERS_FILE,
    pathutils.CONFD_HMAC_KEY,
    pathutils.CLUSTER_DOMAIN_SECRET_FILE,
    ])

  for hv_name in constants.HYPER_TYPES:
    hv_class = hypervisor.GetHypervisorClass(hv_name)
    allowed_files.update(hv_class.GetAncillaryFiles()[0])

  assert pathutils.FILE_STORAGE_PATHS_FILE not in allowed_files, \
    "Allowed file storage paths should never be uploaded via RPC"

  return frozenset(allowed_files)


_ALLOWED_UPLOAD_FILES = _BuildUploadFileList()


def JobQueuePurge():
  """Removes job queue files and archived jobs.

  @rtype: tuple
  @return: True, None

  """
  _CleanDirectory(pathutils.QUEUE_DIR, exclude=[pathutils.JOB_QUEUE_LOCK_FILE])
  _CleanDirectory(pathutils.JOB_QUEUE_ARCHIVE_DIR)


def GetMasterNodeName():
  """Returns the master node name.

  @rtype: string
  @return: name of the master node
  @raise RPCFail: in case of errors

  """
  try:
    return _GetConfig().GetMasterNode()
  except errors.ConfigurationError, err:
    _Fail("Cluster configuration incomplete: %s", err, exc=True)


def RunLocalHooks(hook_opcode, hooks_path, env_builder_fn):
  """Decorator that runs hooks before and after the decorated function.

  @type hook_opcode: string
  @param hook_opcode: opcode of the hook
  @type hooks_path: string
  @param hooks_path: path of the hooks
  @type env_builder_fn: function
  @param env_builder_fn: function that returns a dictionary containing the
      environment variables for the hooks. Will get all the parameters of the
      decorated function.
  @raise RPCFail: in case of pre-hook failure

  """
  def decorator(fn):
    def wrapper(*args, **kwargs):
      _, myself = ssconf.GetMasterAndMyself()
      nodes = ([myself], [myself])  # these hooks run locally

      env_fn = compat.partial(env_builder_fn, *args, **kwargs)

      cfg = _GetConfig()
      hr = HooksRunner()
      hm = hooksmaster.HooksMaster(hook_opcode, hooks_path, nodes,
                                   hr.RunLocalHooks, None, env_fn, None,
                                   logging.warning, cfg.GetClusterName(),
                                   cfg.GetMasterNode())
      hm.RunPhase(constants.HOOKS_PHASE_PRE)
      result = fn(*args, **kwargs)
      hm.RunPhase(constants.HOOKS_PHASE_POST)

      return result
    return wrapper
  return decorator
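
# Usage sketch (not part of the upstream module): the decorator is applied as
# seen on ActivateMasterIp and DeactivateMasterIp below; a hypothetical use
# would look like:
#
#   @RunLocalHooks(constants.FAKE_OP_MASTER_TURNUP, "master-ip-turnup",
#                  _BuildMasterIpEnv)
#   def MyMasterIpAction(master_params, use_external_mip_script):
#     ...
#
# Pre-hooks run before the wrapped function and post-hooks after it.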


def _BuildMasterIpEnv(master_params, use_external_mip_script=None):
  """Builds environment variables for master IP hooks.

  @type master_params: L{objects.MasterNetworkParameters}
  @param master_params: network parameters of the master
  @type use_external_mip_script: boolean
  @param use_external_mip_script: whether to use an external master IP
      address setup script (unused, but necessary per the implementation of
      the L{RunLocalHooks} decorator)

  """
  # pylint: disable=W0613
  ver = netutils.IPAddress.GetVersionFromAddressFamily(master_params.ip_family)
  env = {
    "MASTER_NETDEV": master_params.netdev,
    "MASTER_IP": master_params.ip,
    "MASTER_NETMASK": str(master_params.netmask),
    "CLUSTER_IP_VERSION": str(ver),
  }

  return env


def _RunMasterSetupScript(master_params, action, use_external_mip_script):
  """Execute the master IP address setup script.

  @type master_params: L{objects.MasterNetworkParameters}
  @param master_params: network parameters of the master
  @type action: string
  @param action: action to pass to the script. Must be one of
      L{backend._MASTER_START} or L{backend._MASTER_STOP}
  @type use_external_mip_script: boolean
  @param use_external_mip_script: whether to use an external master IP
      address setup script
  @raise backend.RPCFail: if there are errors during the execution of the
      script

  """
  env = _BuildMasterIpEnv(master_params)

  if use_external_mip_script:
    setup_script = pathutils.EXTERNAL_MASTER_SETUP_SCRIPT
  else:
    setup_script = pathutils.DEFAULT_MASTER_SETUP_SCRIPT

  result = utils.RunCmd([setup_script, action], env=env, reset_env=True)

  if result.failed:
    _Fail("Failed to %s the master IP. Script return value: %s, output: '%s'" %
          (action, result.exit_code, result.output), log=True)


@RunLocalHooks(constants.FAKE_OP_MASTER_TURNUP, "master-ip-turnup",
               _BuildMasterIpEnv)
def ActivateMasterIp(master_params, use_external_mip_script):
  """Activate the IP address of the master daemon.

  @type master_params: L{objects.MasterNetworkParameters}
  @param master_params: network parameters of the master
  @type use_external_mip_script: boolean
  @param use_external_mip_script: whether to use an external master IP
      address setup script
  @raise RPCFail: in case of errors during the IP startup

  """
  _RunMasterSetupScript(master_params, _MASTER_START,
                        use_external_mip_script)


def StartMasterDaemons(no_voting):
  """Activate local node as master node.

  The function will start the master daemons (ganeti-masterd and ganeti-rapi).

  @type no_voting: boolean
  @param no_voting: whether to start ganeti-masterd without a node vote
      but still non-interactively
  @rtype: None

  """

  if no_voting:
    masterd_args = "--no-voting --yes-do-it"
  else:
    masterd_args = ""

  env = {
    "EXTRA_MASTERD_ARGS": masterd_args,
    }

  result = utils.RunCmd([pathutils.DAEMON_UTIL, "start-master"], env=env)
  if result.failed:
    msg = "Can't start Ganeti master: %s" % result.output
    logging.error(msg)
    _Fail(msg)


@RunLocalHooks(constants.FAKE_OP_MASTER_TURNDOWN, "master-ip-turndown",
               _BuildMasterIpEnv)
def DeactivateMasterIp(master_params, use_external_mip_script):
  """Deactivate the master IP on this node.

  @type master_params: L{objects.MasterNetworkParameters}
  @param master_params: network parameters of the master
  @type use_external_mip_script: boolean
  @param use_external_mip_script: whether to use an external master IP
      address setup script
  @raise RPCFail: in case of errors during the IP turndown

  """
  _RunMasterSetupScript(master_params, _MASTER_STOP,
                        use_external_mip_script)


def StopMasterDaemons():
  """Stop the master daemons on this node.

  Stop the master daemons (ganeti-masterd and ganeti-rapi) on this node.

  @rtype: None

  """
  # TODO: log and report back to the caller the error failures; we
  # need to decide in which case we fail the RPC for this

  result = utils.RunCmd([pathutils.DAEMON_UTIL, "stop-master"])
  if result.failed:
    logging.error("Could not stop Ganeti master, command %s had exitcode %s"
                  " and error %s",
                  result.cmd, result.exit_code, result.output)


def ChangeMasterNetmask(old_netmask, netmask, master_ip, master_netdev):
  """Change the netmask of the master IP.

  @param old_netmask: the old value of the netmask
  @param netmask: the new value of the netmask
  @param master_ip: the master IP
  @param master_netdev: the master network device

  """
  if old_netmask == netmask:
    return

  if not netutils.IPAddress.Own(master_ip):
    _Fail("The master IP address is not up, not attempting to change its"
          " netmask")

  result = utils.RunCmd([constants.IP_COMMAND_PATH, "address", "add",
                         "%s/%s" % (master_ip, netmask),
                         "dev", master_netdev, "label",
                         "%s:0" % master_netdev])
  if result.failed:
    _Fail("Could not set the new netmask on the master IP address")

  result = utils.RunCmd([constants.IP_COMMAND_PATH, "address", "del",
                         "%s/%s" % (master_ip, old_netmask),
                         "dev", master_netdev, "label",
                         "%s:0" % master_netdev])
  if result.failed:
    _Fail("Could not bring down the master IP address with the old netmask")


def EtcHostsModify(mode, host, ip):
  """Modify a host entry in /etc/hosts.

  @param mode: The mode to operate in; either add or remove an entry
  @param host: The host to operate on
  @param ip: The ip associated with the entry

  """
  if mode == constants.ETC_HOSTS_ADD:
    if not ip:
      # _Fail logs the error and raises RPCFail; merely instantiating RPCFail
      # here would silently discard the error
      _Fail("Mode 'add' needs 'ip' parameter, but parameter not"
            " present")
    utils.AddHostToEtcHosts(host, ip)
  elif mode == constants.ETC_HOSTS_REMOVE:
    if ip:
      _Fail("Mode 'remove' does not allow 'ip' parameter, but"
            " parameter is present")
    utils.RemoveHostFromEtcHosts(host)
  else:
    _Fail("Mode not supported")


def LeaveCluster(modify_ssh_setup):
  """Cleans up and removes the current node.

  This function cleans up and prepares the current node to be removed
  from the cluster.

  If processing is successful, then it raises an
  L{errors.QuitGanetiException} which is used as a special case to
  shutdown the node daemon.

  @param modify_ssh_setup: boolean

  """
  _CleanDirectory(pathutils.DATA_DIR)
  _CleanDirectory(pathutils.CRYPTO_KEYS_DIR)
  JobQueuePurge()

  if modify_ssh_setup:
    try:
      priv_key, pub_key, auth_keys = ssh.GetUserFiles(constants.SSH_LOGIN_USER)

      ssh.RemoveAuthorizedKey(auth_keys, utils.ReadFile(pub_key))

      utils.RemoveFile(priv_key)
      utils.RemoveFile(pub_key)
    except errors.OpExecError:
      logging.exception("Error while processing ssh files")
    except IOError:
      logging.exception("At least one SSH file was not accessible.")

  try:
    utils.RemoveFile(pathutils.CONFD_HMAC_KEY)
    utils.RemoveFile(pathutils.RAPI_CERT_FILE)
    utils.RemoveFile(pathutils.SPICE_CERT_FILE)
    utils.RemoveFile(pathutils.SPICE_CACERT_FILE)
    utils.RemoveFile(pathutils.NODED_CERT_FILE)
  except:  # pylint: disable=W0702
    logging.exception("Error while removing cluster secrets")

  utils.StopDaemon(constants.CONFD)
  utils.StopDaemon(constants.MOND)
  utils.StopDaemon(constants.KVMD)

  # Raise a custom exception (handled in ganeti-noded)
  raise errors.QuitGanetiException(True, "Shutdown scheduled")


def _CheckStorageParams(params, num_params):
  """Performs sanity checks for storage parameters.

  @type params: list
  @param params: list of storage parameters
  @type num_params: int
  @param num_params: expected number of parameters

  """
  if params is None:
    raise errors.ProgrammerError("No storage parameters for storage"
                                 " reporting were provided.")
  if not isinstance(params, list):
    raise errors.ProgrammerError("The storage parameters are not of type"
                                 " list: '%s'" % params)
  if len(params) != num_params:
    raise errors.ProgrammerError("Did not receive the expected number of"
                                 " storage parameters: expected %s,"
                                 " received '%s'" % (num_params, len(params)))


def _CheckLvmStorageParams(params):
  """Performs sanity check for the 'exclusive storage' flag.

  @see: C{_CheckStorageParams}

  """
  _CheckStorageParams(params, 1)
  excl_stor = params[0]
  if not isinstance(excl_stor, bool):
    raise errors.ProgrammerError("Exclusive storage parameter is not"
                                 " boolean: '%s'." % excl_stor)
  return excl_stor


def _GetLvmVgSpaceInfo(name, params):
  """Wrapper around C{_GetVgInfo} which checks the storage parameters.

  @type name: string
  @param name: name of the volume group
  @type params: list
  @param params: list of storage parameters, which in this case should be
      containing only one for exclusive storage

  """
  excl_stor = _CheckLvmStorageParams(params)
  return _GetVgInfo(name, excl_stor)


def _GetVgInfo(
    name, excl_stor, info_fn=bdev.LogicalVolume.GetVGInfo):
  """Retrieves information about a LVM volume group.

  """
  # TODO: GetVGInfo supports returning information for multiple VGs at once
  vginfo = info_fn([name], excl_stor)
  if vginfo:
    vg_free = int(round(vginfo[0][0], 0))
    vg_size = int(round(vginfo[0][1], 0))
  else:
    vg_free = None
    vg_size = None

  return {
    "type": constants.ST_LVM_VG,
    "name": name,
    "storage_free": vg_free,
    "storage_size": vg_size,
    }
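
# Illustrative sketch (not part of the upstream module): info_fn is injected
# for testability, so a unit test can stub out LVM entirely:
#
#   fake_info = lambda names, excl_stor: [(1024.0, 4096.0, "xenvg")]
#   info = _GetVgInfo("xenvg", False, info_fn=fake_info)
#   assert (info["storage_free"], info["storage_size"]) == (1024, 4096)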


def _GetLvmPvSpaceInfo(name, params):
  """Wrapper around C{_GetVgSpindlesInfo} with sanity checks.

  @see: C{_GetLvmVgSpaceInfo}

  """
  excl_stor = _CheckLvmStorageParams(params)
  return _GetVgSpindlesInfo(name, excl_stor)


def _GetVgSpindlesInfo(
    name, excl_stor, info_fn=bdev.LogicalVolume.GetVgSpindlesInfo):
  """Retrieves information about spindles in an LVM volume group.

  @type name: string
  @param name: VG name
  @type excl_stor: bool
  @param excl_stor: exclusive storage
  @rtype: dict
  @return: dictionary whose keys are "name", "vg_free", "vg_size" for VG name,
      free spindles, total spindles respectively

  """
  if excl_stor:
    (vg_free, vg_size) = info_fn(name)
  else:
    vg_free = 0
    vg_size = 0
  return {
    "type": constants.ST_LVM_PV,
    "name": name,
    "storage_free": vg_free,
    "storage_size": vg_size,
    }


def _GetHvInfo(name, hvparams, get_hv_fn=hypervisor.GetHypervisor):
  """Retrieves node information from a hypervisor.

  The information returned depends on the hypervisor. Common items:

    - vg_size is the size of the configured volume group in MiB
    - vg_free is the free size of the volume group in MiB
    - memory_dom0 is the memory allocated for domain0 in MiB
    - memory_free is the currently available (free) ram in MiB
    - memory_total is the total amount of ram in MiB
    - hv_version: the hypervisor version, if available

  @type hvparams: dict of string
  @param hvparams: the hypervisor's hvparams

  """
  return get_hv_fn(name).GetNodeInfo(hvparams=hvparams)


def _GetHvInfoAll(hv_specs, get_hv_fn=hypervisor.GetHypervisor):
  """Retrieves node information for all hypervisors.

  See C{_GetHvInfo} for information on the output.

  @type hv_specs: list of pairs (string, dict of strings)
  @param hv_specs: list of pairs of a hypervisor's name and its hvparams

  """
  if hv_specs is None:
    return None

  result = []
  for hvname, hvparams in hv_specs:
    result.append(_GetHvInfo(hvname, hvparams, get_hv_fn))
  return result


def _GetNamedNodeInfo(names, fn):
733 """Calls C{fn} for all names in C{names} and returns a dictionary.
734
735 @rtype: None or dict
736
737 """
  if names is None:
    return None
  else:
    return map(fn, names)


def GetNodeInfo(storage_units, hv_specs):
  """Gives back a hash with different information about the node.

  @type storage_units: list of tuples (string, string, list)
  @param storage_units: List of tuples (storage unit, identifier, parameters)
      to ask for disk space information. In case of lvm-vg, the identifier is
      the VG name. The parameters can contain additional, storage-type-specific
      parameters, for example exclusive storage for lvm storage.
  @type hv_specs: list of pairs (string, dict of strings)
  @param hv_specs: list of pairs of a hypervisor's name and its hvparams
  @rtype: tuple; (string, None/dict, None/dict)
  @return: Tuple containing boot ID, volume group information and hypervisor
      information

  """
  bootid = utils.ReadFile(_BOOT_ID_PATH, size=128).rstrip("\n")
  storage_info = _GetNamedNodeInfo(
    storage_units,
    (lambda (storage_type, storage_key, storage_params):
        _ApplyStorageInfoFunction(storage_type, storage_key, storage_params)))
  hv_info = _GetHvInfoAll(hv_specs)
  return (bootid, storage_info, hv_info)
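
# Illustrative sketch (not part of the upstream module): a storage_units
# request for one LVM volume group with exclusive storage disabled, plus one
# hypervisor; names and parameters are made up.
#
#   storage_units = [(constants.ST_LVM_VG, "xenvg", [False])]
#   hv_specs = [(constants.HT_KVM, {})]
#   (bootid, storage_info, hv_info) = GetNodeInfo(storage_units, hv_specs)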


def _GetFileStorageSpaceInfo(path, params):
  """Wrapper around filestorage.GetSpaceInfo.

  The purpose of this wrapper is to call filestorage.GetFileStorageSpaceInfo
  and ignore the *args parameter to not leak it into the filestorage
  module's code.

  @see: C{filestorage.GetFileStorageSpaceInfo} for description of the
      parameters.

  """
  _CheckStorageParams(params, 0)
  return filestorage.GetFileStorageSpaceInfo(path)


# FIXME: implement storage reporting for all missing storage types.
_STORAGE_TYPE_INFO_FN = {
  constants.ST_BLOCK: None,
  constants.ST_DISKLESS: None,
  constants.ST_EXT: None,
  constants.ST_FILE: _GetFileStorageSpaceInfo,
  constants.ST_LVM_PV: _GetLvmPvSpaceInfo,
  constants.ST_LVM_VG: _GetLvmVgSpaceInfo,
  constants.ST_SHARED_FILE: None,
  constants.ST_GLUSTER: None,
  constants.ST_RADOS: None,
}


def _ApplyStorageInfoFunction(storage_type, storage_key, *args):
  """Looks up and applies the correct function to calculate free and total
  storage for the given storage type.

  @type storage_type: string
  @param storage_type: the storage type for which the storage shall be
      reported.
  @type storage_key: string
  @param storage_key: identifier of a storage unit, e.g. the volume group name
      of an LVM storage unit
  @type args: any
  @param args: various parameters that can be used for storage reporting. These
      parameters and their semantics vary from storage type to storage type and
      are just propagated in this function.
  @return: the results of the application of the storage space function (see
      _STORAGE_TYPE_INFO_FN) if storage space reporting is implemented for that
      storage type
  @raises NotImplementedError: for storage types that don't support space
      reporting yet

  """
  fn = _STORAGE_TYPE_INFO_FN[storage_type]
  if fn is not None:
    return fn(storage_key, *args)
  else:
    raise NotImplementedError
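
# Illustrative sketch (not part of the upstream module): the dispatch table
# above maps a storage type to its reporting function, so for example
#
#   _ApplyStorageInfoFunction(constants.ST_LVM_VG, "xenvg", [False])
#
# ends up calling _GetLvmVgSpaceInfo("xenvg", [False]), while types mapped to
# None (e.g. constants.ST_BLOCK) raise NotImplementedError.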


def _CheckExclusivePvs(pvi_list):
  """Check that PVs are not shared among LVs

  @type pvi_list: list of L{objects.LvmPvInfo} objects
  @param pvi_list: information about the PVs

  @rtype: list of tuples (string, list of strings)
  @return: offending volumes, as tuples: (pv_name, [lv1_name, lv2_name...])

  """
  res = []
  for pvi in pvi_list:
    if len(pvi.lv_list) > 1:
      res.append((pvi.name, pvi.lv_list))
  return res


def _VerifyHypervisors(what, vm_capable, result, all_hvparams,
                       get_hv_fn=hypervisor.GetHypervisor):
842 """Verifies the hypervisor. Appends the results to the 'results' list.

  @type what: C{dict}
  @param what: a dictionary of things to check
  @type vm_capable: boolean
  @param vm_capable: whether or not this node is vm capable
  @type result: dict
  @param result: dictionary of verification results; results of the
      verifications in this function will be added here
  @type all_hvparams: dict of dict of string
  @param all_hvparams: dictionary mapping hypervisor names to hvparams
  @type get_hv_fn: function
  @param get_hv_fn: function to retrieve the hypervisor, to improve testability

  """
  if not vm_capable:
    return

  if constants.NV_HYPERVISOR in what:
    result[constants.NV_HYPERVISOR] = {}
    for hv_name in what[constants.NV_HYPERVISOR]:
      hvparams = all_hvparams[hv_name]
      try:
        val = get_hv_fn(hv_name).Verify(hvparams=hvparams)
      except errors.HypervisorError, err:
        val = "Error while checking hypervisor: %s" % str(err)
      result[constants.NV_HYPERVISOR][hv_name] = val


def _VerifyHvparams(what, vm_capable, result,
                    get_hv_fn=hypervisor.GetHypervisor):
873 """Verifies the hvparams. Appends the results to the 'results' list.

  @type what: C{dict}
  @param what: a dictionary of things to check
  @type vm_capable: boolean
  @param vm_capable: whether or not this node is vm capable
  @type result: dict
  @param result: dictionary of verification results; results of the
      verifications in this function will be added here
  @type get_hv_fn: function
  @param get_hv_fn: function to retrieve the hypervisor, to improve testability

  """
  if not vm_capable:
    return

  if constants.NV_HVPARAMS in what:
    result[constants.NV_HVPARAMS] = []
    for source, hv_name, hvparms in what[constants.NV_HVPARAMS]:
      try:
        logging.info("Validating hv %s, %s", hv_name, hvparms)
        get_hv_fn(hv_name).ValidateParameters(hvparms)
      except errors.HypervisorError, err:
        result[constants.NV_HVPARAMS].append((source, hv_name, str(err)))


def _VerifyInstanceList(what, vm_capable, result, all_hvparams):
  """Verifies the instance list.

  @type what: C{dict}
  @param what: a dictionary of things to check
  @type vm_capable: boolean
  @param vm_capable: whether or not this node is vm capable
  @type result: dict
  @param result: dictionary of verification results; results of the
      verifications in this function will be added here
  @type all_hvparams: dict of dict of string
  @param all_hvparams: dictionary mapping hypervisor names to hvparams

  """
  if constants.NV_INSTANCELIST in what and vm_capable:
    # GetInstanceList can fail
    try:
      val = GetInstanceList(what[constants.NV_INSTANCELIST],
                            all_hvparams=all_hvparams)
    except RPCFail, err:
      val = str(err)
    result[constants.NV_INSTANCELIST] = val


def _VerifyNodeInfo(what, vm_capable, result, all_hvparams):
  """Verifies the node info.

  @type what: C{dict}
  @param what: a dictionary of things to check
  @type vm_capable: boolean
  @param vm_capable: whether or not this node is vm capable
  @type result: dict
  @param result: dictionary of verification results; results of the
      verifications in this function will be added here
  @type all_hvparams: dict of dict of string
  @param all_hvparams: dictionary mapping hypervisor names to hvparams

  """
  if constants.NV_HVINFO in what and vm_capable:
    hvname = what[constants.NV_HVINFO]
    hyper = hypervisor.GetHypervisor(hvname)
    hvparams = all_hvparams[hvname]
    result[constants.NV_HVINFO] = hyper.GetNodeInfo(hvparams=hvparams)


def _VerifyClientCertificate(cert_file=pathutils.NODED_CLIENT_CERT_FILE):
945 """Verify the existance and validity of the client SSL certificate.

  Also, verify that the client certificate is not self-signed. Self-
  signed client certificates stem from Ganeti versions 2.12.0 - 2.12.4
  and should be replaced by client certificates signed by the server
  certificate. Hence we output a warning when we encounter a self-signed
  one.

  """
  create_cert_cmd = "gnt-cluster renew-crypto --new-node-certificates"
  if not os.path.exists(cert_file):
    return (constants.CV_ERROR,
            "The client certificate does not exist. Run '%s' to create"
            " client certificates for all nodes." % create_cert_cmd)

  (errcode, msg) = utils.VerifyCertificate(cert_file)
  if errcode is not None:
    return (errcode, msg)

  (errcode, msg) = utils.IsCertificateSelfSigned(cert_file)
  if errcode is not None:
    return (errcode, msg)

  # if everything is fine, we return the digest to be compared to the config
  return (None, utils.GetCertificateDigest(cert_filename=cert_file))


def _VerifySshSetup(node_status_list, my_name,
                    pub_key_file=pathutils.SSH_PUB_KEYS):
  """Verifies the state of the SSH key files.

  @type node_status_list: list of tuples
  @param node_status_list: list of nodes of the cluster associated with a
      couple of flags: (uuid, name, is_master_candidate,
      is_potential_master_candidate, online)
  @type my_name: str
  @param my_name: name of this node
  @type pub_key_file: str
  @param pub_key_file: filename of the public key file

  """
  if node_status_list is None:
987 return ["No node list to check against the pub_key_file received."]

  my_status_list = [(my_uuid, name, mc, pot_mc, online) for
                    (my_uuid, name, mc, pot_mc, online)
                    in node_status_list if name == my_name]
  if len(my_status_list) == 0:
    return ["Cannot find node information for node '%s'." % my_name]
  (my_uuid, _, _, potential_master_candidate, online) = \
    my_status_list[0]

  result = []

  if not os.path.exists(pub_key_file):
    result.append("The public key file '%s' does not exist. Consider running"
                  " 'gnt-cluster renew-crypto --new-ssh-keys"
                  " [--no-ssh-key-check]' to fix this." % pub_key_file)
    return result

  pot_mc_uuids = [uuid for (uuid, _, _, _, _) in node_status_list]
  offline_nodes = [uuid for (uuid, _, _, _, online) in node_status_list
                   if not online]
  pub_keys = ssh.QueryPubKeyFile(None)

  # Initialized up front, as the set is also read below when this node is not
  # a potential master candidate
  missing_uuids = set([])

  if potential_master_candidate:
    # Check that the set of potential master candidates matches the
    # public key file
    pub_uuids_set = set(pub_keys.keys()) - set(offline_nodes)
    pot_mc_uuids_set = set(pot_mc_uuids) - set(offline_nodes)
    if pub_uuids_set != pot_mc_uuids_set:
      unknown_uuids = pub_uuids_set - pot_mc_uuids_set
      if unknown_uuids:
        result.append("The following node UUIDs are listed in the public key"
                      " file on node '%s', but are not potential master"
                      " candidates: %s."
                      % (my_name, ", ".join(list(unknown_uuids))))
      missing_uuids = pot_mc_uuids_set - pub_uuids_set
      if missing_uuids:
        result.append("The following node UUIDs of potential master candidates"
                      " are missing in the public key file on node %s: %s."
                      % (my_name, ", ".join(list(missing_uuids))))

    (_, key_files) = \
      ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False,
                          dircheck=False)
    (_, dsa_pub_key_filename) = key_files[constants.SSHK_DSA]

    my_keys = pub_keys[my_uuid]

    dsa_pub_key = utils.ReadFile(dsa_pub_key_filename)
    if dsa_pub_key.strip() not in my_keys:
      result.append("The dsa key of node %s does not match this node's key"
                    " in the pub key file." % (my_name))
    if len(my_keys) != 1:
      result.append("There is more than one key for node %s in the public key"
                    " file." % my_name)
  else:
    if len(pub_keys.keys()) > 0:
      result.append("The public key file of node '%s' is not empty, although"
                    " the node is not a potential master candidate."
                    % my_name)

  # Check that all master candidate keys are in the authorized_keys file
  (auth_key_file, _) = \
    ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
  for (uuid, name, mc, _, online) in node_status_list:
    if not online:
      continue
    if uuid in missing_uuids:
      continue
    if mc:
      for key in pub_keys[uuid]:
        if not ssh.HasAuthorizedKey(auth_key_file, key):
          result.append("A SSH key of master candidate '%s' (UUID: '%s') is"
                        " not in the 'authorized_keys' file of node '%s'."
                        % (name, uuid, my_name))
    else:
      for key in pub_keys[uuid]:
        if name != my_name and ssh.HasAuthorizedKey(auth_key_file, key):
          result.append("A SSH key of normal node '%s' (UUID: '%s') is in the"
                        " 'authorized_keys' file of node '%s'."
                        % (name, uuid, my_name))
        if name == my_name and not ssh.HasAuthorizedKey(auth_key_file, key):
          result.append("A SSH key of normal node '%s' (UUID: '%s') is not"
                        " in the 'authorized_keys' file of itself."
                        % (my_name, uuid))

  return result


def _VerifySshClutter(node_status_list, my_name):
  """Verifies that the 'authorized_keys' files are not cluttered up.

  @type node_status_list: list of tuples
  @param node_status_list: list of nodes of the cluster associated with a
      couple of flags: (uuid, name, is_master_candidate,
      is_potential_master_candidate, online)
  @type my_name: str
  @param my_name: name of this node

  """
  result = []
  (auth_key_file, _) = \
    ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
  # each status tuple carries five fields, as documented above
  node_names = [name for (_, name, _, _, _) in node_status_list]
  multiple_occurrences = ssh.CheckForMultipleKeys(auth_key_file, node_names)
  if multiple_occurrences:
    msg = "There are hosts which have more than one SSH key stored for the" \
          " same user in the 'authorized_keys' file of node %s. This can be" \
          " due to an unsuccessful operation which cluttered up the" \
          " 'authorized_keys' file. We recommend cleaning this up manually. " \
          % my_name
    for host, occ in multiple_occurrences.items():
      msg += "Entry for '%s' in lines %s. " % (host, utils.CommaJoin(occ))
    result.append(msg)

  return result


def VerifyNode(what, cluster_name, all_hvparams, node_groups, groups_cfg):
  """Verify the status of the local node.

  Based on the input L{what} parameter, various checks are done on the
  local node.

  If the I{filelist} key is present, this list of
  files is checksummed and the file/checksum pairs are returned.

  If the I{nodelist} key is present, we check that we have
  connectivity via ssh with the target nodes (and check the hostname
  report).

  If the I{node-net-test} key is present, we check that we have
  connectivity to the given nodes via both primary IP and, if
  applicable, secondary IPs.

  @type what: C{dict}
  @param what: a dictionary of things to check:
      - filelist: list of files for which to compute checksums
      - nodelist: list of nodes we should check ssh communication with
      - node-net-test: list of nodes we should check node daemon port
        connectivity with
      - hypervisor: list with hypervisors to run the verify for
  @type cluster_name: string
  @param cluster_name: the cluster's name
  @type all_hvparams: dict of dict of strings
  @param all_hvparams: a dictionary mapping hypervisor names to hvparams
  @type node_groups: a dict of strings
  @param node_groups: node _names_ mapped to their group uuids (it's enough to
      have only those nodes that are in `what["nodelist"]`)
  @type groups_cfg: a dict of dict of strings
  @param groups_cfg: a dictionary mapping group uuids to their configuration
  @rtype: dict
  @return: a dictionary with the same keys as the input dict, and
      values representing the result of the checks

  """
  result = {}
  my_name = netutils.Hostname.GetSysName()
  port = netutils.GetDaemonPort(constants.NODED)
  vm_capable = my_name not in what.get(constants.NV_NONVMNODES, [])

  _VerifyHypervisors(what, vm_capable, result, all_hvparams)
  _VerifyHvparams(what, vm_capable, result)

  if constants.NV_FILELIST in what:
    fingerprints = utils.FingerprintFiles(map(vcluster.LocalizeVirtualPath,
                                              what[constants.NV_FILELIST]))
    result[constants.NV_FILELIST] = \
      dict((vcluster.MakeVirtualPath(key), value)
           for (key, value) in fingerprints.items())

  if constants.NV_CLIENT_CERT in what:
    result[constants.NV_CLIENT_CERT] = _VerifyClientCertificate()

  if constants.NV_SSH_SETUP in what:
    result[constants.NV_SSH_SETUP] = \
      _VerifySshSetup(what[constants.NV_SSH_SETUP], my_name)
    if constants.NV_SSH_CLUTTER in what:
      result[constants.NV_SSH_CLUTTER] = \
        _VerifySshClutter(what[constants.NV_SSH_SETUP], my_name)

  if constants.NV_NODELIST in what:
    (nodes, bynode, mcs) = what[constants.NV_NODELIST]

    # Add nodes from other groups (different for each node)
    try:
      nodes.extend(bynode[my_name])
    except KeyError:
      pass

    # Use a random order
    random.shuffle(nodes)

    # Try to contact all nodes
    val = {}
    for node in nodes:
      params = groups_cfg.get(node_groups.get(node))
      ssh_port = params["ndparams"].get(constants.ND_SSH_PORT)
      logging.debug("Ssh port %s (None = default) for node %s",
                    str(ssh_port), node)

      # We only test if master candidates can communicate to other nodes.
      # We cannot test if normal nodes cannot communicate with other nodes,
      # because the administrator might have installed additional SSH keys,
      # over which Ganeti has no power.
      if my_name in mcs:
        success, message = _GetSshRunner(cluster_name). \
                             VerifyNodeHostname(node, ssh_port)
        if not success:
          val[node] = message

    result[constants.NV_NODELIST] = val

  if constants.NV_NODENETTEST in what:
    result[constants.NV_NODENETTEST] = tmp = {}
    my_pip = my_sip = None
    for name, pip, sip in what[constants.NV_NODENETTEST]:
      if name == my_name:
        my_pip = pip
        my_sip = sip
        break
    if not my_pip:
      tmp[my_name] = ("Can't find my own primary/secondary IP"
                      " in the node list")
    else:
      for name, pip, sip in what[constants.NV_NODENETTEST]:
        fail = []
        if not netutils.TcpPing(pip, port, source=my_pip):
          fail.append("primary")
        if sip != pip:
          if not netutils.TcpPing(sip, port, source=my_sip):
            fail.append("secondary")
        if fail:
          tmp[name] = ("failure using the %s interface(s)" %
                       " and ".join(fail))

  if constants.NV_MASTERIP in what:
    # FIXME: add checks on incoming data structures (here and in the
    # rest of the function)
    master_name, master_ip = what[constants.NV_MASTERIP]
    if master_name == my_name:
      source = constants.IP4_ADDRESS_LOCALHOST
    else:
      source = None
    result[constants.NV_MASTERIP] = netutils.TcpPing(master_ip, port,
                                                     source=source)

  if constants.NV_USERSCRIPTS in what:
    result[constants.NV_USERSCRIPTS] = \
      [script for script in what[constants.NV_USERSCRIPTS]
       if not utils.IsExecutable(script)]

  if constants.NV_OOB_PATHS in what:
    result[constants.NV_OOB_PATHS] = tmp = []
    for path in what[constants.NV_OOB_PATHS]:
      try:
        st = os.stat(path)
      except OSError, err:
1245 tmp.append("error stating out of band helper: %s" % err)
      else:
        if stat.S_ISREG(st.st_mode):
          if stat.S_IMODE(st.st_mode) & stat.S_IXUSR:
            tmp.append(None)
          else:
            tmp.append("out of band helper %s is not executable" % path)
        else:
          tmp.append("out of band helper %s is not a file" % path)

  if constants.NV_LVLIST in what and vm_capable:
    try:
      val = GetVolumeList(utils.ListVolumeGroups().keys())
    except RPCFail, err:
      val = str(err)
    result[constants.NV_LVLIST] = val

  _VerifyInstanceList(what, vm_capable, result, all_hvparams)

  if constants.NV_VGLIST in what and vm_capable:
    result[constants.NV_VGLIST] = utils.ListVolumeGroups()

  if constants.NV_PVLIST in what and vm_capable:
    check_exclusive_pvs = constants.NV_EXCLUSIVEPVS in what
    val = bdev.LogicalVolume.GetPVInfo(what[constants.NV_PVLIST],
                                       filter_allocatable=False,
                                       include_lvs=check_exclusive_pvs)
    if check_exclusive_pvs:
      result[constants.NV_EXCLUSIVEPVS] = _CheckExclusivePvs(val)
      for pvi in val:
        # Avoid sending useless data on the wire
        pvi.lv_list = []
    result[constants.NV_PVLIST] = map(objects.LvmPvInfo.ToDict, val)

  if constants.NV_VERSION in what:
    result[constants.NV_VERSION] = (constants.PROTOCOL_VERSION,
                                    constants.RELEASE_VERSION)

  _VerifyNodeInfo(what, vm_capable, result, all_hvparams)

  if constants.NV_DRBDVERSION in what and vm_capable:
    try:
      drbd_version = DRBD8.GetProcInfo().GetVersionString()
    except errors.BlockDeviceError, err:
      logging.warning("Can't get DRBD version", exc_info=True)
      drbd_version = str(err)
    result[constants.NV_DRBDVERSION] = drbd_version

  if constants.NV_DRBDLIST in what and vm_capable:
    try:
      used_minors = drbd.DRBD8.GetUsedDevs()
    except errors.BlockDeviceError, err:
      logging.warning("Can't get used minors list", exc_info=True)
      used_minors = str(err)
    result[constants.NV_DRBDLIST] = used_minors

  if constants.NV_DRBDHELPER in what and vm_capable:
    status = True
    try:
      payload = drbd.DRBD8.GetUsermodeHelper()
    except errors.BlockDeviceError, err:
      logging.error("Can't get DRBD usermode helper: %s", str(err))
      status = False
      payload = str(err)
    result[constants.NV_DRBDHELPER] = (status, payload)

  if constants.NV_NODESETUP in what:
    result[constants.NV_NODESETUP] = tmpr = []
    if not os.path.isdir("/sys/block") or not os.path.isdir("/sys/class/net"):
1314 tmpr.append("The sysfs filesytem doesn't seem to be mounted"
1315 " under /sys, missing required directories /sys/block"
1316 " and /sys/class/net")
    if (not os.path.isdir("/proc/sys") or
        not os.path.isfile("/proc/sysrq-trigger")):
      tmpr.append("The procfs filesystem doesn't seem to be mounted"
                  " under /proc, missing required directory /proc/sys and"
                  " the file /proc/sysrq-trigger")

  if constants.NV_TIME in what:
    result[constants.NV_TIME] = utils.SplitTime(time.time())

  if constants.NV_OSLIST in what and vm_capable:
    result[constants.NV_OSLIST] = DiagnoseOS()

  if constants.NV_BRIDGES in what and vm_capable:
    result[constants.NV_BRIDGES] = [bridge
                                    for bridge in what[constants.NV_BRIDGES]
                                    if not utils.BridgeExists(bridge)]

  if what.get(constants.NV_ACCEPTED_STORAGE_PATHS) == my_name:
    result[constants.NV_ACCEPTED_STORAGE_PATHS] = \
      filestorage.ComputeWrongFileStoragePaths()

  if what.get(constants.NV_FILE_STORAGE_PATH):
    pathresult = filestorage.CheckFileStoragePath(
      what[constants.NV_FILE_STORAGE_PATH])
    if pathresult:
      result[constants.NV_FILE_STORAGE_PATH] = pathresult

  if what.get(constants.NV_SHARED_FILE_STORAGE_PATH):
    pathresult = filestorage.CheckFileStoragePath(
      what[constants.NV_SHARED_FILE_STORAGE_PATH])
    if pathresult:
      result[constants.NV_SHARED_FILE_STORAGE_PATH] = pathresult

  return result


def GetCryptoTokens(token_requests):
  """Perform actions on the node's cryptographic tokens.

  Token types can be 'ssl' or 'ssh'. So far only some actions are implemented
  for 'ssl'. Action 'get' returns the digest of the public client ssl
  certificate. Action 'create' creates a new client certificate and private key
  and also returns the digest of the certificate. The third parameter of a
  token request are optional parameters for the actions, so far only the
  filename is supported.

  @type token_requests: list of tuples of (string, string, dict), where the
    first string is in constants.CRYPTO_TYPES, the second in
    constants.CRYPTO_ACTIONS. The third parameter is a dictionary of string
    to string.
  @param token_requests: list of requests of cryptographic tokens and actions
    to perform on them. The actions come with a dictionary of options.
  @rtype: list of tuples (string, string)
  @return: list of tuples of the token type and the public crypto token

  """
  tokens = []
  for (token_type, action, _) in token_requests:
    if token_type not in constants.CRYPTO_TYPES:
      raise errors.ProgrammerError("Token type '%s' not supported." %
                                   token_type)
    if action not in constants.CRYPTO_ACTIONS:
      raise errors.ProgrammerError("Action '%s' is not supported." %
                                   action)
    if token_type == constants.CRYPTO_TYPE_SSL_DIGEST:
      tokens.append((token_type,
                     utils.GetCertificateDigest()))
  return tokens
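
# Illustrative sketch (not part of the upstream module): requesting the SSL
# certificate digest; the action constant name is an assumption and the
# options dictionary is unused for this request.
#
#   requests = [(constants.CRYPTO_TYPE_SSL_DIGEST,
#                constants.CRYPTO_ACTION_GET, {})]
#   tokens = GetCryptoTokens(requests)
#   # -> [(constants.CRYPTO_TYPE_SSL_DIGEST, "<hex digest>")]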


def EnsureDaemon(daemon_name, run):
  """Ensures the given daemon is running or stopped.

  @type daemon_name: string
  @param daemon_name: name of the daemon (e.g., constants.KVMD)

  @type run: bool
  @param run: whether to start or stop the daemon

  @rtype: bool
  @return: 'True' if daemon successfully started/stopped,
    'False' otherwise

  """
  allowed_daemons = [constants.KVMD]

  if daemon_name not in allowed_daemons:
    fn = lambda _: False
  elif run:
    fn = utils.EnsureDaemon
  else:
    fn = utils.StopDaemon

  return fn(daemon_name)
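
# Illustrative sketch (not part of the upstream module): only whitelisted
# daemons can be toggled; anything else simply reports failure.
#
#   EnsureDaemon(constants.KVMD, True)    # starts the KVM daemon
#   EnsureDaemon(constants.KVMD, False)   # stops it
#   EnsureDaemon(constants.NODED, True)   # returns False (not whitelisted)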


def _InitSshUpdateData(data, noded_cert_file, ssconf_store):
  (_, noded_cert) = \
    utils.ExtractX509Certificate(utils.ReadFile(noded_cert_file))
  data[constants.SSHS_NODE_DAEMON_CERTIFICATE] = noded_cert

  cluster_name = ssconf_store.GetClusterName()
  data[constants.SSHS_CLUSTER_NAME] = cluster_name


def AddNodeSshKey(node_uuid, node_name,
                  potential_master_candidates,
                  to_authorized_keys=False,
                  to_public_keys=False,
                  get_public_keys=False,
                  pub_key_file=pathutils.SSH_PUB_KEYS,
                  ssconf_store=None,
                  noded_cert_file=pathutils.NODED_CERT_FILE,
                  run_cmd_fn=ssh.RunSshCmdWithStdin):
  """Distributes a node's public SSH key across the cluster.

  Note that this function should only be executed on the master node, which
  then will copy the new node's key to all nodes in the cluster via SSH.

  Also note: at least one of the flags C{to_authorized_keys},
  C{to_public_keys}, and C{get_public_keys} has to be set to C{True} for
  the function to actually perform any actions.

  @type node_uuid: str
  @param node_uuid: the UUID of the node whose key is added
  @type node_name: str
  @param node_name: the name of the node whose key is added
  @type potential_master_candidates: list of str
  @param potential_master_candidates: list of node names of potential master
    candidates; this should match the list of uuids in the public key file
  @type to_authorized_keys: boolean
  @param to_authorized_keys: whether the key should be added to the
    C{authorized_keys} file of all nodes
  @type to_public_keys: boolean
  @param to_public_keys: whether the keys should be added to the public key
    file
  @type get_public_keys: boolean
  @param get_public_keys: whether the node should add the cluster's public keys
    to its C{ganeti_pub_keys} file

  """
  node_list = [SshAddNodeInfo(name=node_name, uuid=node_uuid,
                              to_authorized_keys=to_authorized_keys,
                              to_public_keys=to_public_keys,
                              get_public_keys=get_public_keys)]
  return AddNodeSshKeyBulk(node_list,
                           potential_master_candidates,
                           pub_key_file=pub_key_file,
                           ssconf_store=ssconf_store,
                           noded_cert_file=noded_cert_file,
                           run_cmd_fn=run_cmd_fn)


# Node info named tuple specifically for the use with AddNodeSshKeyBulk
SshAddNodeInfo = collections.namedtuple(
  "SshAddNodeInfo",
  ["uuid",
   "name",
   "to_authorized_keys",
   "to_public_keys",
   "get_public_keys"])


def AddNodeSshKeyBulk(node_list,
                      potential_master_candidates,
                      pub_key_file=pathutils.SSH_PUB_KEYS,
                      ssconf_store=None,
                      noded_cert_file=pathutils.NODED_CERT_FILE,
                      run_cmd_fn=ssh.RunSshCmdWithStdin):
  """Distributes a node's public SSH key across the cluster.

  Note that this function should only be executed on the master node, which
  then will copy the new node's key to all nodes in the cluster via SSH.

  Also note: at least one of the flags C{to_authorized_keys},
  C{to_public_keys}, and C{get_public_keys} has to be set to C{True} for
  the function to actually perform any actions.

  @type node_list: list of SshAddNodeInfo tuples
  @param node_list: list of tuples containing the necessary node information
    for adding their keys
  @type potential_master_candidates: list of str
  @param potential_master_candidates: list of node names of potential master
    candidates; this should match the list of uuids in the public key file

  """
  # whether there are any keys to be added or retrieved at all
  to_authorized_keys = any([node_info.to_authorized_keys for node_info in
                            node_list])
  to_public_keys = any([node_info.to_public_keys for node_info in
                        node_list])

  if not ssconf_store:
    ssconf_store = ssconf.SimpleStore()

  for node_info in node_list:
    # replacement not necessary for keys that are not supposed to be in the
    # list of public keys
    if not node_info.to_public_keys:
      continue
    # Check and fix sanity of key file
    keys_by_name = ssh.QueryPubKeyFile([node_info.name], key_file=pub_key_file)
    keys_by_uuid = ssh.QueryPubKeyFile([node_info.uuid], key_file=pub_key_file)

    if (not keys_by_name or node_info.name not in keys_by_name) \
        and (not keys_by_uuid or node_info.uuid not in keys_by_uuid):
      raise errors.SshUpdateError(
        "No keys found for the new node '%s' (UUID %s) in the list of public"
        " SSH keys, neither for the name or the UUID" %
        (node_info.name, node_info.uuid))
    else:
      if node_info.name in keys_by_name:
        # Replace the name by UUID in the file as the name should only be used
        # temporarily
        ssh.ReplaceNameByUuid(node_info.uuid, node_info.name,
                              error_fn=errors.SshUpdateError,
                              key_file=pub_key_file)

  # Retrieve updated map of UUIDs to keys
  keys_by_uuid = ssh.QueryPubKeyFile(
    [node_info.uuid for node_info in node_list], key_file=pub_key_file)

  # Update the master node's key files
  (auth_key_file, _) = \
    ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
  for node_info in node_list:
    if node_info.to_authorized_keys:
      ssh.AddAuthorizedKeys(auth_key_file, keys_by_uuid[node_info.uuid])

  base_data = {}
  _InitSshUpdateData(base_data, noded_cert_file, ssconf_store)
  cluster_name = base_data[constants.SSHS_CLUSTER_NAME]

  ssh_port_map = ssconf_store.GetSshPortMap()

  # Update the target nodes themselves
  for node_info in node_list:
    logging.debug("Updating SSH key files of target node '%s'.",
                  node_info.name)
    if node_info.get_public_keys:
      node_data = {}
      _InitSshUpdateData(node_data, noded_cert_file, ssconf_store)
      all_keys = ssh.QueryPubKeyFile(None, key_file=pub_key_file)
      node_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
        (constants.SSHS_OVERRIDE, all_keys)

      try:
        utils.RetryByNumberOfTimes(
            constants.SSHS_MAX_RETRIES,
            errors.SshUpdateError,
            run_cmd_fn, cluster_name, node_info.name, pathutils.SSH_UPDATE,
            ssh_port_map.get(node_info.name), node_data,
            debug=False, verbose=False, use_cluster_key=False,
            ask_key=False, strict_host_check=False)
      except errors.SshUpdateError as e:
        # Clean up the master's public key file if adding key fails
        if node_info.to_public_keys:
          ssh.RemovePublicKey(node_info.uuid)
        raise e

  # Update all nodes except master and the target nodes
  keys_by_uuid_auth = ssh.QueryPubKeyFile(
    [node_info.uuid for node_info in node_list
     if node_info.to_authorized_keys],
    key_file=pub_key_file)
  if to_authorized_keys:
    base_data[constants.SSHS_SSH_AUTHORIZED_KEYS] = \
      (constants.SSHS_ADD, keys_by_uuid_auth)

  pot_mc_data = base_data.copy()
  keys_by_uuid_pub = ssh.QueryPubKeyFile(
    [node_info.uuid for node_info in node_list
     if node_info.to_public_keys],
    key_file=pub_key_file)
  if to_public_keys:
    pot_mc_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
      (constants.SSHS_REPLACE_OR_ADD, keys_by_uuid_pub)

  all_nodes = ssconf_store.GetNodeList()
  master_node = ssconf_store.GetMasterNode()
  online_nodes = ssconf_store.GetOnlineNodeList()

  node_errors = []
  for node in all_nodes:
    if node == master_node:
      logging.debug("Skipping master node '%s'.", master_node)
      continue
    if node not in online_nodes:
      logging.debug("Skipping offline node '%s'.", node)
      continue
    if node in potential_master_candidates:
      logging.debug("Updating SSH key files of node '%s'.", node)
      try:
        utils.RetryByNumberOfTimes(
            constants.SSHS_MAX_RETRIES,
            errors.SshUpdateError,
            run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
            ssh_port_map.get(node), pot_mc_data,
            debug=False, verbose=False, use_cluster_key=False,
            ask_key=False, strict_host_check=False)
      except errors.SshUpdateError as last_exception:
        # name the added node first and the node being updated second
        error_msg = ("When adding the key of node '%s', updating SSH key"
                     " files of node '%s' failed after %s retries."
                     " Not trying again. Last error was: %s." %
                     (node_info.name, node, constants.SSHS_MAX_RETRIES,
                      last_exception))
        node_errors.append((node, error_msg))
        # We only log the error and don't throw an exception, because
        # one unreachable node shall not abort the entire procedure.
        logging.error(error_msg)

    else:
      if to_authorized_keys:
        run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
                   ssh_port_map.get(node), base_data,
                   debug=False, verbose=False, use_cluster_key=False,
                   ask_key=False, strict_host_check=False)

  return node_errors
1634
1635
1636 def RemoveNodeSshKey(node_uuid, node_name,
1637 master_candidate_uuids,
1638 potential_master_candidates,
1639 master_uuid=None,
1640 keys_to_remove=None,
1641 from_authorized_keys=False,
1642 from_public_keys=False,
1643 clear_authorized_keys=False,
1644 clear_public_keys=False,
1645 pub_key_file=pathutils.SSH_PUB_KEYS,
1646 ssconf_store=None,
1647 noded_cert_file=pathutils.NODED_CERT_FILE,
1648 readd=False,
1649 run_cmd_fn=ssh.RunSshCmdWithStdin):
1650 """Removes the node's SSH keys from the key files and distributes those.
1651
1652 Note that at least one of the flags C{from_authorized_keys},
1653 C{from_public_keys}, C{clear_authorized_keys}, and C{clear_public_keys}
1654 has to be set to C{True} for the function to perform any action at all.
1655 Not doing so will trigger an assertion in the function.
1656
1657 @type node_uuid: str
1658 @param node_uuid: UUID of the node whose key is removed
1659 @type node_name: str
1660   @param node_name: name of the node whose key is removed
1661 @type master_candidate_uuids: list of str
1662 @param master_candidate_uuids: list of UUIDs of the current master candidates
1663 @type potential_master_candidates: list of str
1664 @param potential_master_candidates: list of names of potential master
1665 candidates
1666 @type keys_to_remove: dict of str to list of str
1667 @param keys_to_remove: a dictionary mapping node UUIDS to lists of SSH keys
1668 to be removed. This list is supposed to be used only if the keys are not
1669 in the public keys file. This is for example the case when removing a
1670 master node's key.
1671 @type from_authorized_keys: boolean
1672 @param from_authorized_keys: whether or not the key should be removed
1673 from the C{authorized_keys} file
1674 @type from_public_keys: boolean
1675   @param from_public_keys: whether or not the key should be removed from
1676 the C{ganeti_pub_keys} file
1677 @type clear_authorized_keys: boolean
1678 @param clear_authorized_keys: whether or not the C{authorized_keys} file
1679 should be cleared on the node whose keys are removed
1680 @type clear_public_keys: boolean
1681   @param clear_public_keys: whether to clear the node's C{ganeti_pub_keys} file
1682 @type readd: boolean
1683 @param readd: whether this is called during a readd operation.
1684 @rtype: list of string
1685 @returns: list of feedback messages
1686
1687 """
1688 node_list = [SshRemoveNodeInfo(uuid=node_uuid,
1689 name=node_name,
1690 from_authorized_keys=from_authorized_keys,
1691 from_public_keys=from_public_keys,
1692 clear_authorized_keys=clear_authorized_keys,
1693 clear_public_keys=clear_public_keys)]
1694 return RemoveNodeSshKeyBulk(node_list,
1695 master_candidate_uuids,
1696 potential_master_candidates,
1697 master_uuid=master_uuid,
1698 keys_to_remove=keys_to_remove,
1699 pub_key_file=pub_key_file,
1700 ssconf_store=ssconf_store,
1701 noded_cert_file=noded_cert_file,
1702 readd=readd,
1703 run_cmd_fn=run_cmd_fn)
1704
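# A minimal usage sketch for the wrapper above (all UUIDs and names are
# hypothetical); the returned list contains (node, message) pairs for
# non-fatal problems encountered while distributing the change:
#
#   msgs = RemoveNodeSshKey("0a1b2c3d-uuid", "node3.example.com",
#                           master_candidate_uuids=["9f8e7d6c-uuid"],
#                           potential_master_candidates=["node2.example.com"],
#                           from_authorized_keys=True,
#                           from_public_keys=True)
#   for (node, msg) in msgs:
#     logging.warning("SSH key removal issue on '%s': %s", node, msg)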
1705
1706 # Node info named tuple specifically for the use with RemoveNodeSshKeyBulk
1707 SshRemoveNodeInfo = collections.namedtuple(
1708 "SshRemoveNodeInfo",
1709 ["uuid",
1710 "name",
1711 "from_authorized_keys",
1712 "from_public_keys",
1713 "clear_authorized_keys",
1714 "clear_public_keys"])
1715
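# Illustrative entry (hypothetical values): remove the node's key from both
# the authorized_keys and ganeti_pub_keys files, without clearing either
# file wholesale on the node itself:
#
#   info = SshRemoveNodeInfo(uuid="1234-abcd-uuid",
#                            name="node5.example.com",
#                            from_authorized_keys=True,
#                            from_public_keys=True,
#                            clear_authorized_keys=False,
#                            clear_public_keys=False)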
1716
1717 def RemoveNodeSshKeyBulk(node_list,
1718 master_candidate_uuids,
1719 potential_master_candidates,
1720 master_uuid=None,
1721 keys_to_remove=None,
1722 pub_key_file=pathutils.SSH_PUB_KEYS,
1723 ssconf_store=None,
1724 noded_cert_file=pathutils.NODED_CERT_FILE,
1725 readd=False,
1726 run_cmd_fn=ssh.RunSshCmdWithStdin):
1727 """Removes the node's SSH keys from the key files and distributes those.
1728
1729 Note that at least one of the flags C{from_authorized_keys},
1730 C{from_public_keys}, C{clear_authorized_keys}, and C{clear_public_keys}
1731 of at least one node has to be set to C{True} for the function to perform any
1732 action at all. Not doing so will trigger an assertion in the function.
1733
1734 @type node_list: list of C{SshRemoveNodeInfo}.
1735 @param node_list: list of information about nodes whose keys are being removed
1736 @type master_candidate_uuids: list of str
1737 @param master_candidate_uuids: list of UUIDs of the current master candidates
1738 @type potential_master_candidates: list of str
1739 @param potential_master_candidates: list of names of potential master
1740 candidates
1741 @type keys_to_remove: dict of str to list of str
1742 @param keys_to_remove: a dictionary mapping node UUIDS to lists of SSH keys
1743 to be removed. This list is supposed to be used only if the keys are not
1744 in the public keys file. This is for example the case when removing a
1745 master node's key.
1746 @type readd: boolean
1747 @param readd: whether this is called during a readd operation.
1748 @rtype: list of string
1749 @returns: list of feedback messages
1750
1751 """
1752 # Non-disruptive error messages, list of (node, msg) pairs
1753 result_msgs = []
1754
1755 # whether there are any keys to be added or retrieved at all
1756   from_authorized_keys = any(node_info.from_authorized_keys
1757                              for node_info in node_list)
1758   from_public_keys = any(node_info.from_public_keys
1759                          for node_info in node_list)
1760   clear_authorized_keys = any(node_info.clear_authorized_keys
1761                               for node_info in node_list)
1762   clear_public_keys = any(node_info.clear_public_keys
1763                           for node_info in node_list)
1764
1765 # Make sure at least one of these flags is true.
1766 if not (from_authorized_keys or from_public_keys or clear_authorized_keys
1767 or clear_public_keys):
1768 raise errors.SshUpdateError("No removal from any key file was requested.")
1769
1770 if not ssconf_store:
1771 ssconf_store = ssconf.SimpleStore()
1772
1773 master_node = ssconf_store.GetMasterNode()
1774 ssh_port_map = ssconf_store.GetSshPortMap()
1775
1776 all_keys_to_remove = {}
1777 if from_authorized_keys or from_public_keys:
1778 for node_info in node_list:
1779 # Skip nodes that don't actually need any keys to be removed.
1780 if not (node_info.from_authorized_keys or node_info.from_public_keys):
1781 continue
1782 if node_info.name == master_node and not keys_to_remove:
1783 raise errors.SshUpdateError("Cannot remove the master node's keys.")
1784 if keys_to_remove:
1785 keys = keys_to_remove
1786 else:
1787 keys = ssh.QueryPubKeyFile([node_info.uuid], key_file=pub_key_file)
1788 if (not keys or node_info.uuid not in keys) and not readd:
1789 raise errors.SshUpdateError("Node '%s' not found in the list of"
1790 " public SSH keys. It seems someone"
1791                                     " is trying to remove a key from outside"
1792 " the cluster!" % node_info.uuid)
1793 # During an upgrade all nodes have the master key. In this case we
1794 # should not remove it to avoid accidentally shutting down cluster
1795 # SSH communication
1796       if master_uuid:
1797         master_keys = ssh.QueryPubKeyFile([master_uuid],
1798                                           key_file=pub_key_file)
1799         # QueryPubKeyFile returns a dict of UUIDs to key lists
1800         for master_key in master_keys.get(master_uuid, []):
1801           if master_key in keys[node_info.uuid]:
1802             keys[node_info.uuid].remove(master_key)
1803
1804 all_keys_to_remove.update(keys)
1805
1806 if all_keys_to_remove:
1807 base_data = {}
1808 _InitSshUpdateData(base_data, noded_cert_file, ssconf_store)
1809 cluster_name = base_data[constants.SSHS_CLUSTER_NAME]
1810
1811 if from_authorized_keys:
1812 # UUIDs of nodes that are supposed to be removed from the
1813 # authorized_keys files.
1814 nodes_remove_from_authorized_keys = [
1815 node_info.uuid for node_info in node_list
1816 if node_info.from_authorized_keys]
1817 keys_to_remove_from_authorized_keys = dict([
1818 (uuid, keys) for (uuid, keys) in all_keys_to_remove.items()
1819 if uuid in nodes_remove_from_authorized_keys])
1820 base_data[constants.SSHS_SSH_AUTHORIZED_KEYS] = \
1821 (constants.SSHS_REMOVE, keys_to_remove_from_authorized_keys)
1822 (auth_key_file, _) = \
1823 ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False,
1824 dircheck=False)
1825
1826 for uuid in nodes_remove_from_authorized_keys:
1827 ssh.RemoveAuthorizedKeys(auth_key_file,
1828 keys_to_remove_from_authorized_keys[uuid])
1829
1830 pot_mc_data = base_data.copy()
1831
1832 if from_public_keys:
1833 nodes_remove_from_public_keys = [
1834 node_info.uuid for node_info in node_list
1835 if node_info.from_public_keys]
1836 keys_to_remove_from_public_keys = dict([
1837 (uuid, keys) for (uuid, keys) in all_keys_to_remove.items()
1838 if uuid in nodes_remove_from_public_keys])
1839 pot_mc_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1840 (constants.SSHS_REMOVE, keys_to_remove_from_public_keys)
1841
1842 all_nodes = ssconf_store.GetNodeList()
1843 online_nodes = ssconf_store.GetOnlineNodeList()
1844 all_nodes_to_remove = [node_info.name for node_info in node_list]
1845 logging.debug("Removing keys of nodes '%s' from all nodes but itself and"
1846 " master.", ", ".join(all_nodes_to_remove))
1847 for node in all_nodes:
1848 if node == master_node:
1849 logging.debug("Skipping master node '%s'.", master_node)
1850 continue
1851 if node not in online_nodes:
1852 logging.debug("Skipping offline node '%s'.", node)
1853 continue
1854 if node in all_nodes_to_remove:
1855       logging.debug("Skipping node '%s' whose own key is being removed.", node)
1856 continue
1857 ssh_port = ssh_port_map.get(node)
1858 if not ssh_port:
1859 raise errors.OpExecError("No SSH port information available for"
1860 " node '%s', map: %s." %
1861 (node, ssh_port_map))
1862 error_msg_final = ("When removing the key of node '%s', updating the"
1863 " SSH key files of node '%s' failed. Last error"
1864 " was: %s.")
1865 if node in potential_master_candidates:
1866 logging.debug("Updating key setup of potential master candidate node"
1867 " %s.", node)
1868 try:
1869 utils.RetryByNumberOfTimes(
1870 constants.SSHS_MAX_RETRIES,
1871 errors.SshUpdateError,
1872 run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
1873 ssh_port, pot_mc_data,
1874 debug=False, verbose=False, use_cluster_key=False,
1875 ask_key=False, strict_host_check=False)
1876 except errors.SshUpdateError as last_exception:
1877 error_msg = error_msg_final % (
1878 node_info.name, node, last_exception)
1879 result_msgs.append((node, error_msg))
1880 logging.error(error_msg)
1881
1882 else:
1883 if from_authorized_keys:
1884 logging.debug("Updating key setup of normal node %s.", node)
1885 try:
1886 utils.RetryByNumberOfTimes(
1887 constants.SSHS_MAX_RETRIES,
1888 errors.SshUpdateError,
1889 run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
1890 ssh_port, base_data,
1891 debug=False, verbose=False, use_cluster_key=False,
1892 ask_key=False, strict_host_check=False)
1893 except errors.SshUpdateError as last_exception:
1894 error_msg = error_msg_final % (
1895 node_info.name, node, last_exception)
1896 result_msgs.append((node, error_msg))
1897 logging.error(error_msg)
1898
1899 for node_info in node_list:
1900 if node_info.clear_authorized_keys or node_info.from_public_keys or \
1901 node_info.clear_public_keys:
1902 data = {}
1903 _InitSshUpdateData(data, noded_cert_file, ssconf_store)
1904 cluster_name = data[constants.SSHS_CLUSTER_NAME]
1905 ssh_port = ssh_port_map.get(node_info.name)
1906 if not ssh_port:
1907       raise errors.OpExecError("No SSH port information available for"
1908                                " departing node '%s'." % node_info.name)
1909
1910 if node_info.clear_authorized_keys:
1911 # The 'authorized_keys' file is not solely managed by Ganeti. Therefore,
1912 # we have to specify exactly which keys to clear to leave keys untouched
1913 # that were not added by Ganeti.
1914 other_master_candidate_uuids = [uuid for uuid in master_candidate_uuids
1915 if uuid != node_info.uuid]
1916 candidate_keys = ssh.QueryPubKeyFile(other_master_candidate_uuids,
1917 key_file=pub_key_file)
1918 data[constants.SSHS_SSH_AUTHORIZED_KEYS] = \
1919 (constants.SSHS_REMOVE, candidate_keys)
1920
1921 if node_info.clear_public_keys:
1922 data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1923 (constants.SSHS_CLEAR, {})
1924 elif node_info.from_public_keys:
1925       # Clearing the public keys file subsumes removing individual keys,
1926       # so single keys are only removed when clear_public_keys is not set.
1927
1928 if all_keys_to_remove:
1929 data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1930 (constants.SSHS_REMOVE, all_keys_to_remove)
1931
1932     # If there are no changes to any key file for this node, skip it
1933     if not (constants.SSHS_SSH_PUBLIC_KEYS in data or
1934             constants.SSHS_SSH_AUTHORIZED_KEYS in data):
1935       continue
1936
1937 logging.debug("Updating SSH key setup of target node '%s'.",
1938 node_info.name)
1939 try:
1940 utils.RetryByNumberOfTimes(
1941 constants.SSHS_MAX_RETRIES,
1942 errors.SshUpdateError,
1943 run_cmd_fn, cluster_name, node_info.name, pathutils.SSH_UPDATE,
1944 ssh_port, data,
1945 debug=False, verbose=False, use_cluster_key=False,
1946 ask_key=False, strict_host_check=False)
1947 except errors.SshUpdateError as last_exception:
1948 result_msgs.append(
1949 (node_info.name,
1950 ("Removing SSH keys from node '%s' failed."
1951 " This can happen when the node is already unreachable."
1952 " Error: %s" % (node_info.name, last_exception))))
1953
1954 if all_keys_to_remove and from_public_keys:
1955 for node_uuid in nodes_remove_from_public_keys:
1956 ssh.RemovePublicKey(node_uuid, key_file=pub_key_file)
1957
1958 return result_msgs
1959
1960
1961 def _GenerateNodeSshKey(node_uuid, node_name, ssh_port_map,
1962 pub_key_file=pathutils.SSH_PUB_KEYS,
1963 ssconf_store=None,
1964 noded_cert_file=pathutils.NODED_CERT_FILE,
1965 run_cmd_fn=ssh.RunSshCmdWithStdin,
1966 suffix=""):
1967 """Generates the root SSH key pair on the node.
1968
1969 @type node_uuid: str
1970   @param node_uuid: UUID of the node whose key is generated
1971   @type node_name: str
1972   @param node_name: name of the node whose key is generated
1973 @type ssh_port_map: dict of str to int
1974 @param ssh_port_map: mapping of node names to their SSH port
1975
1976 """
1977 if not ssconf_store:
1978 ssconf_store = ssconf.SimpleStore()
1979
1980 keys_by_uuid = ssh.QueryPubKeyFile([node_uuid], key_file=pub_key_file)
1981 if not keys_by_uuid or node_uuid not in keys_by_uuid:
1982 raise errors.SshUpdateError("Node %s (UUID: %s) whose key is requested to"
1983 " be regenerated is not registered in the"
1984 " public keys file." % (node_name, node_uuid))
1985
1986 data = {}
1987 _InitSshUpdateData(data, noded_cert_file, ssconf_store)
1988 cluster_name = data[constants.SSHS_CLUSTER_NAME]
1989 data[constants.SSHS_GENERATE] = {constants.SSHS_SUFFIX: suffix}
1990
1991 run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE,
1992 ssh_port_map.get(node_name), data,
1993 debug=False, verbose=False, use_cluster_key=False,
1994 ask_key=False, strict_host_check=False)
1995
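# For reference, a sketch of the update payload assembled by the function
# above; _InitSshUpdateData fills in the cluster name and related bootstrap
# data, after which the generation request is added (the suffix value shown
# is hypothetical):
#
#   data = {}
#   _InitSshUpdateData(data, pathutils.NODED_CERT_FILE, ssconf.SimpleStore())
#   data[constants.SSHS_GENERATE] = {constants.SSHS_SUFFIX: "_master_tmp"}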
1996
1997 def _GetMasterNodeUUID(node_uuid_name_map, master_node_name):
1998 master_node_uuids = [node_uuid for (node_uuid, node_name)
1999 in node_uuid_name_map
2000 if node_name == master_node_name]
2001 if len(master_node_uuids) != 1:
2002     raise errors.SshUpdateError("No (unique) master UUID found. Master node"
2003                                 " name: '%s', found UUIDs: %s" %
2004                                 (master_node_name, master_node_uuids))
2005 return master_node_uuids[0]
2006
2007
2008 def _GetOldMasterKeys(master_node_uuid, pub_key_file):
2009 old_master_keys_by_uuid = ssh.QueryPubKeyFile([master_node_uuid],
2010 key_file=pub_key_file)
2011 if not old_master_keys_by_uuid:
2012 raise errors.SshUpdateError("No public key of the master node (UUID '%s')"
2013 " found, not generating a new key."
2014 % master_node_uuid)
2015 return old_master_keys_by_uuid
2016
2017
2018 def _GetNewMasterKey(root_keyfiles, master_node_uuid):
2019 new_master_keys = []
2020 for (_, (_, public_key_file)) in root_keyfiles.items():
2021 public_key_dir = os.path.dirname(public_key_file)
2022 public_key_file_tmp_filename = \
2023 os.path.splitext(os.path.basename(public_key_file))[0] \
2024 + constants.SSHS_MASTER_SUFFIX + ".pub"
2025 public_key_path_tmp = os.path.join(public_key_dir,
2026 public_key_file_tmp_filename)
2027 if os.path.exists(public_key_path_tmp):
2028 # for some key types, there might not be any keys
2029 key = utils.ReadFile(public_key_path_tmp)
2030 new_master_keys.append(key)
2031 if not new_master_keys:
2032 raise errors.SshUpdateError("Cannot find any type of temporary SSH key.")
2033 return {master_node_uuid: new_master_keys}
2034
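# Filename derivation used above, illustrated for a hypothetical key file
# "/root/.ssh/id_rsa.pub" and a suffix value of "_master_tmp":
#
#   os.path.splitext(os.path.basename("/root/.ssh/id_rsa.pub"))[0]
#   # -> "id_rsa"; with the suffix and extension re-attached, the temporary
#   # key is looked up at "/root/.ssh/id_rsa_master_tmp.pub"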
2035
2036 def _ReplaceMasterKeyOnMaster(root_keyfiles):
2037 number_of_moves = 0
2038 for (_, (private_key_file, public_key_file)) in root_keyfiles.items():
2039 key_dir = os.path.dirname(public_key_file)
2040 private_key_file_tmp = \
2041 os.path.basename(private_key_file) + constants.SSHS_MASTER_SUFFIX
2042 public_key_file_tmp = private_key_file_tmp + ".pub"
2043 private_key_path_tmp = os.path.join(key_dir,
2044 private_key_file_tmp)
2045 public_key_path_tmp = os.path.join(key_dir,
2046 public_key_file_tmp)
2047 if os.path.exists(public_key_file):
2048 utils.CreateBackup(public_key_file)
2049 utils.RemoveFile(public_key_file)
2050 if os.path.exists(private_key_file):
2051 utils.CreateBackup(private_key_file)
2052 utils.RemoveFile(private_key_file)
2053 if os.path.exists(public_key_path_tmp) and \
2054 os.path.exists(private_key_path_tmp):
2055 # for some key types, there might not be any keys
2056 shutil.move(public_key_path_tmp, public_key_file)
2057 shutil.move(private_key_path_tmp, private_key_file)
2058 number_of_moves += 1
2059 if not number_of_moves:
2060     raise errors.SshUpdateError("Could not move any master SSH key into place.")
2061
2062
2063 def RenewSshKeys(node_uuids, node_names, master_candidate_uuids,
2064 potential_master_candidates,
2065 pub_key_file=pathutils.SSH_PUB_KEYS,
2066 ssconf_store=None,
2067 noded_cert_file=pathutils.NODED_CERT_FILE,
2068 run_cmd_fn=ssh.RunSshCmdWithStdin):
2069 """Renews all SSH keys and updates authorized_keys and ganeti_pub_keys.
2070
2071 @type node_uuids: list of str
2072 @param node_uuids: list of node UUIDs whose keys should be renewed
2073 @type node_names: list of str
2074   @param node_names: list of node names whose keys should be renewed. This list
2075 should match the C{node_uuids} parameter
2076 @type master_candidate_uuids: list of str
2077 @param master_candidate_uuids: list of UUIDs of master candidates or
2078 master node
2079 @type pub_key_file: str
2080   @param pub_key_file: file path of the public key file
2081 @type noded_cert_file: str
2082 @param noded_cert_file: path of the noded SSL certificate file
2083 @type run_cmd_fn: function
2084 @param run_cmd_fn: function to run commands on remote nodes via SSH
2085 @raises ProgrammerError: if node_uuids and node_names don't match;
2086 SshUpdateError if a node's key is missing from the public key file,
2087     if a node's new SSH key could not be fetched from it, or if there
2088     is not exactly one entry in the public key list for the master
2089     node.
2090
2091 """
2092 if not ssconf_store:
2093 ssconf_store = ssconf.SimpleStore()
2094 cluster_name = ssconf_store.GetClusterName()
2095
2096   if len(node_uuids) != len(node_names):
2097     raise errors.ProgrammerError("Lists of node UUIDs and node names"
2098                                  " do not match in length.")
2099
2100 (_, root_keyfiles) = \
2101 ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
2102 (_, dsa_pub_keyfile) = root_keyfiles[constants.SSHK_DSA]
2103 old_master_key = utils.ReadFile(dsa_pub_keyfile)
2104
2105 node_uuid_name_map = zip(node_uuids, node_names)
2106
2107 master_node_name = ssconf_store.GetMasterNode()
2108 master_node_uuid = _GetMasterNodeUUID(node_uuid_name_map, master_node_name)
2109 ssh_port_map = ssconf_store.GetSshPortMap()
2110 # List of all node errors that happened, but which did not abort the
2111 # procedure as a whole. It is important that this is a list to have a
2112 # somewhat chronological history of events.
2113 all_node_errors = []
2114
2115 # process non-master nodes
2116
2117 # keys to add in bulk at the end
2118 node_keys_to_add = []
2119
2120 # list of all nodes
2121 node_list = []
2122
2123 # list of keys to be removed before generating new keys
2124 node_info_to_remove = []
2125
2126 for node_uuid, node_name in node_uuid_name_map:
2127 if node_name == master_node_name:
2128 continue
2129 master_candidate = node_uuid in master_candidate_uuids
2130 potential_master_candidate = node_name in potential_master_candidates
2131 node_list.append((node_uuid, node_name, master_candidate,
2132 potential_master_candidate))
2133
2134 keys_by_uuid = ssh.QueryPubKeyFile([node_uuid], key_file=pub_key_file)
2135 if not keys_by_uuid:
2136 raise errors.SshUpdateError("No public key of node %s (UUID %s) found,"
2137 " not generating a new key."
2138 % (node_name, node_uuid))
2139
2140 if master_candidate:
2141 logging.debug("Fetching old SSH key from node '%s'.", node_name)
2142 old_pub_key = ssh.ReadRemoteSshPubKeys(dsa_pub_keyfile,
2143 node_name, cluster_name,
2144 ssh_port_map[node_name],
2145 False, # ask_key
2146 False) # key_check
2147 if old_pub_key != old_master_key:
2148 # If we are already in a multi-key setup (that is past Ganeti 2.12),
2149 # we can safely remove the old key of the node. Otherwise, we cannot
2150 # remove that node's key, because it is also the master node's key
2151 # and that would terminate all communication from the master to the
2152 # node.
2153 node_info_to_remove.append(SshRemoveNodeInfo(
2154 uuid=node_uuid,
2155 name=node_name,
2156 from_authorized_keys=master_candidate,
2157 from_public_keys=False,
2158 clear_authorized_keys=False,
2159 clear_public_keys=False))
2160 else:
2161 logging.debug("Old key of node '%s' is the same as the current master"
2162 " key. Not deleting that key on the node.", node_name)
2163
2164 logging.debug("Removing old SSH keys of all master candidates.")
2165 if node_info_to_remove:
2166 node_errors = RemoveNodeSshKeyBulk(
2167 node_info_to_remove,
2168 master_candidate_uuids,
2169 potential_master_candidates,
2170 master_uuid=master_node_uuid)
2171 if node_errors:
2172 all_node_errors = all_node_errors + node_errors
2173
2174 for (node_uuid, node_name, master_candidate, potential_master_candidate) \
2175 in node_list:
2176
2177 logging.debug("Generating new SSH key for node '%s'.", node_name)
2178 _GenerateNodeSshKey(node_uuid, node_name, ssh_port_map,
2179 pub_key_file=pub_key_file,
2180 ssconf_store=ssconf_store,
2181 noded_cert_file=noded_cert_file,
2182 run_cmd_fn=run_cmd_fn)
2183
2184 try:
2185 logging.debug("Fetching newly created SSH key from node '%s'.", node_name)
2186 pub_key = ssh.ReadRemoteSshPubKeys(dsa_pub_keyfile,
2187 node_name, cluster_name,
2188 ssh_port_map[node_name],
2189 False, # ask_key
2190 False) # key_check
2191     except Exception:
2192 raise errors.SshUpdateError("Could not fetch key of node %s"
2193 " (UUID %s)" % (node_name, node_uuid))
2194
2195 if potential_master_candidate:
2196 ssh.RemovePublicKey(node_uuid, key_file=pub_key_file)
2197 ssh.AddPublicKey(node_uuid, pub_key, key_file=pub_key_file)
2198
2199     logging.debug("Adding SSH key of node '%s'.", node_name)
2200 node_info = SshAddNodeInfo(name=node_name,
2201 uuid=node_uuid,
2202 to_authorized_keys=master_candidate,
2203 to_public_keys=potential_master_candidate,
2204 get_public_keys=True)
2205 node_keys_to_add.append(node_info)
2206
2207 node_errors = AddNodeSshKeyBulk(
2208 node_keys_to_add, potential_master_candidates,
2209 pub_key_file=pub_key_file, ssconf_store=ssconf_store,
2210 noded_cert_file=noded_cert_file,
2211 run_cmd_fn=run_cmd_fn)
2212 if node_errors:
2213 all_node_errors = all_node_errors + node_errors
2214
2215 # Renewing the master node's key
2216
2217 # Preserve the old keys for now
2218 old_master_keys_by_uuid = _GetOldMasterKeys(master_node_uuid, pub_key_file)
2219
2220 # Generate a new master key with a suffix, don't touch the old one for now
2221   logging.debug("Generating new SSH key of master node.")
2222 _GenerateNodeSshKey(master_node_uuid, master_node_name, ssh_port_map,
2223 pub_key_file=pub_key_file,
2224 ssconf_store=ssconf_store,
2225 noded_cert_file=noded_cert_file,
2226 run_cmd_fn=run_cmd_fn,
2227 suffix=constants.SSHS_MASTER_SUFFIX)
2228 # Read newly created master key
2229 new_master_key_dict = _GetNewMasterKey(root_keyfiles, master_node_uuid)
2230
2231 # Replace master key in the master nodes' public key file
2232 ssh.RemovePublicKey(master_node_uuid, key_file=pub_key_file)
2233 for pub_key in new_master_key_dict[master_node_uuid]:
2234 ssh.AddPublicKey(master_node_uuid, pub_key, key_file=pub_key_file)
2235
2236   # Add the new master key to all nodes' public and authorized keys
2237   logging.debug("Adding new master key to all nodes.")
2238 node_errors = AddNodeSshKey(
2239 master_node_uuid, master_node_name, potential_master_candidates,
2240 to_authorized_keys=True, to_public_keys=True,
2241 get_public_keys=False, pub_key_file=pub_key_file,
2242 ssconf_store=ssconf_store, noded_cert_file=noded_cert_file,
2243 run_cmd_fn=run_cmd_fn)
2244 if node_errors:
2245 all_node_errors = all_node_errors + node_errors
2246
2247 # Remove the old key file and rename the new key to the non-temporary filename
2248 _ReplaceMasterKeyOnMaster(root_keyfiles)
2249
2250 # Remove old key from authorized keys
2251 (auth_key_file, _) = \
2252 ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
2253 ssh.RemoveAuthorizedKeys(auth_key_file,
2254 old_master_keys_by_uuid[master_node_uuid])
2255
2256   # Remove the old key from all nodes' authorized_keys files
2257   logging.debug("Removing the old master key from all nodes.")
2258 node_errors = RemoveNodeSshKey(
2259 master_node_uuid, master_node_name, master_candidate_uuids,
2260 potential_master_candidates,
2261 keys_to_remove=old_master_keys_by_uuid, from_authorized_keys=True,
2262 from_public_keys=False, clear_authorized_keys=False,
2263 clear_public_keys=False)
2264 if node_errors:
2265 all_node_errors = all_node_errors + node_errors
2266
2267 return all_node_errors
2268
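# A minimal invocation sketch (hypothetical UUIDs and names; both lists
# must align element-wise and include the master node):
#
#   errs = RenewSshKeys(["uuid-1", "uuid-2"],
#                       ["master.example.com", "node2.example.com"],
#                       master_candidate_uuids=["uuid-1"],
#                       potential_master_candidates=["master.example.com",
#                                                    "node2.example.com"])
#   # errs collects non-fatal (node, message) pairs from the sub-steps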
2269
2270 def GetBlockDevSizes(devices):
2271 """Return the size of the given block devices
2272
2273 @type devices: list
2274 @param devices: list of block device nodes to query
2275 @rtype: dict
2276 @return:
2277 dictionary of all block devices under /dev (key). The value is their
2278 size in MiB.
2279
2280 {'/dev/disk/by-uuid/123456-12321231-312312-312': 124}
2281
2282 """
2283 DEV_PREFIX = "/dev/"
2284 blockdevs = {}
2285
2286 for devpath in devices:
2287 if not utils.IsBelowDir(DEV_PREFIX, devpath):
2288 continue
2289
2290 try:
2291 st = os.stat(devpath)
2292 except EnvironmentError, err:
2293 logging.warning("Error stat()'ing device %s: %s", devpath, str(err))
2294 continue
2295
2296 if stat.S_ISBLK(st.st_mode):
2297 result = utils.RunCmd(["blockdev", "--getsize64", devpath])
2298 if result.failed:
2299 # We don't want to fail, just do not list this device as available
2300 logging.warning("Cannot get size for block device %s", devpath)
2301 continue
2302
2303 size = int(result.stdout) / (1024 * 1024)
2304 blockdevs[devpath] = size
2305 return blockdevs
2306
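# Illustrative call with hypothetical paths; entries outside /dev and
# paths that are not block devices are skipped silently:
#
#   GetBlockDevSizes(["/dev/sdb", "/tmp/not-a-device"])
#   # -> {"/dev/sdb": 10240}   (size in MiB, assuming a 10 GiB disk)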
2307
2308 def GetVolumeList(vg_names):
2309 """Compute list of logical volumes and their size.
2310
2311 @type vg_names: list
2312 @param vg_names: the volume groups whose LVs we should list, or
2313 empty for all volume groups
2314 @rtype: dict
2315 @return:
2316     dictionary of all partitions (key) with value being a tuple of
2317 their size (in MiB), inactive and online status::
2318
2319 {'xenvg/test1': ('20.06', True, True)}
2320
2321 in case of errors, a string is returned with the error
2322 details.
2323
2324 """
2325 lvs = {}
2326 sep = "|"
2327 if not vg_names:
2328 vg_names = []
2329 result = utils.RunCmd(["lvs", "--noheadings", "--units=m", "--nosuffix",
2330 "--separator=%s" % sep,
2331 "-ovg_name,lv_name,lv_size,lv_attr"] + vg_names)
2332 if result.failed:
2333 _Fail("Failed to list logical volumes, lvs output: %s", result.output)
2334
2335 for line in result.stdout.splitlines():
2336 line = line.strip()
2337 match = _LVSLINE_REGEX.match(line)
2338 if not match:
2339 logging.error("Invalid line returned from lvs output: '%s'", line)
2340 continue
2341 vg_name, name, size, attr = match.groups()
2342 inactive = attr[4] == "-"
2343 online = attr[5] == "o"
2344 virtual = attr[0] == "v"
2345 if virtual:
2346 # we don't want to report such volumes as existing, since they
2347 # don't really hold data
2348 continue
2349 lvs[vg_name + "/" + name] = (size, inactive, online)
2350
2351 return lvs
2352
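# How a single (hypothetical) lvs output line is interpreted by the loop
# above, given the "-ovg_name,lv_name,lv_size,lv_attr" columns:
#
#   "xenvg|test1|20.06|-wi-ao" -> lvs["xenvg/test1"] = ("20.06", False, True)
#
# attr[0] == "v" would mark a virtual volume (skipped entirely), while
# attr[4] == "-" means inactive and attr[5] == "o" means online.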
2353
2354 def ListVolumeGroups():
2355 """List the volume groups and their size.
2356
2357 @rtype: dict
2358 @return: dictionary with keys volume name and values the
2359 size of the volume
2360
2361 """
2362 return utils.ListVolumeGroups()
2363
2364
2365 def NodeVolumes():
2366 """List all volumes on this node.
2367
2368 @rtype: list
2369 @return:
2370 A list of dictionaries, each having four keys:
2371 - name: the logical volume name,
2372 - size: the size of the logical volume
2373 - dev: the physical device on which the LV lives
2374 - vg: the volume group to which it belongs
2375
2376 In case of errors, we return an empty list and log the
2377 error.
2378
2379 Note that since a logical volume can live on multiple physical
2380 volumes, the resulting list might include a logical volume
2381 multiple times.
2382
2383 """
2384 result = utils.RunCmd(["lvs", "--noheadings", "--units=m", "--nosuffix",
2385 "--separator=|",
2386 "--options=lv_name,lv_size,devices,vg_name"])
2387 if result.failed:
2388 _Fail("Failed to list logical volumes, lvs output: %s",
2389 result.output)
2390
2391 def parse_dev(dev):
2392 return dev.split("(")[0]
2393
2394 def handle_dev(dev):
2395 return [parse_dev(x) for x in dev.split(",")]
2396
2397 def map_line(line):
2398 line = [v.strip() for v in line]
2399 return [{"name": line[0], "size": line[1],
2400 "dev": dev, "vg": line[3]} for dev in handle_dev(line[2])]
2401
2402 all_devs = []
2403 for line in result.stdout.splitlines():
2404 if line.count("|") >= 3:
2405 all_devs.extend(map_line(line.split("|")))
2406 else:
2407 logging.warning("Strange line in the output from lvs: '%s'", line)
2408 return all_devs
2409
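# Example of the parsing helpers above on a hypothetical lvs line; an LV
# spanning two physical volumes yields one entry per device (map_line is
# local to NodeVolumes and shown here for illustration only):
#
#   map_line("lv0|1024.00|/dev/sda1(0),/dev/sdb1(128)|vg0".split("|"))
#   # -> [{"name": "lv0", "size": "1024.00", "dev": "/dev/sda1", "vg": "vg0"},
#   #     {"name": "lv0", "size": "1024.00", "dev": "/dev/sdb1", "vg": "vg0"}]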
2410
2411 def BridgesExist(bridges_list):
2412 """Check if a list of bridges exist on the current node.
2413
2414 @rtype: boolean
2415 @return: C{True} if all of them exist, C{False} otherwise
2416
2417 """
2418 missing = []
2419 for bridge in bridges_list:
2420 if not utils.BridgeExists(bridge):
2421 missing.append(bridge)
2422
2423 if missing:
2424 _Fail("Missing bridges %s", utils.CommaJoin(missing))
2425
2426
2427 def GetInstanceListForHypervisor(hname, hvparams=None,
2428 get_hv_fn=hypervisor.GetHypervisor):
2429 """Provides a list of instances of the given hypervisor.
2430
2431 @type hname: string
2432 @param hname: name of the hypervisor
2433 @type hvparams: dict of strings
2434 @param hvparams: hypervisor parameters for the given hypervisor
2435 @type get_hv_fn: function
2436 @param get_hv_fn: function that returns a hypervisor for the given hypervisor
2437 name; optional parameter to increase testability
2438
2439 @rtype: list
2440 @return: a list of all running instances on the current node
2441 - instance1.example.com
2442 - instance2.example.com
2443
2444 """
2445 try:
2446 return get_hv_fn(hname).ListInstances(hvparams=hvparams)
2447 except errors.HypervisorError, err:
2448 _Fail("Error enumerating instances (hypervisor %s): %s",
2449 hname, err, exc=True)
2450
2451
2452 def GetInstanceList(hypervisor_list, all_hvparams=None,
2453 get_hv_fn=hypervisor.GetHypervisor):
2454 """Provides a list of instances.
2455
2456 @type hypervisor_list: list
2457 @param hypervisor_list: the list of hypervisors to query information
2458 @type all_hvparams: dict of dict of strings
2459 @param all_hvparams: a dictionary mapping hypervisor types to respective
2460 cluster-wide hypervisor parameters
2461 @type get_hv_fn: function
2462 @param get_hv_fn: function that returns a hypervisor for the given hypervisor
2463 name; optional parameter to increase testability
2464
2465 @rtype: list
2466 @return: a list of all running instances on the current node
2467 - instance1.example.com
2468 - instance2.example.com
2469
2470 """
2471 results = []
2472 for hname in hypervisor_list:
2473 hvparams = all_hvparams[hname]
2474 results.extend(GetInstanceListForHypervisor(hname, hvparams=hvparams,
2475 get_hv_fn=get_hv_fn))
2476 return results
2477
2478
2479 def GetInstanceInfo(instance, hname, hvparams=None):
2480 """Gives back the information about an instance as a dictionary.
2481
2482 @type instance: string
2483 @param instance: the instance name
2484 @type hname: string
2485 @param hname: the hypervisor type of the instance
2486 @type hvparams: dict of strings
2487 @param hvparams: the instance's hvparams
2488
2489 @rtype: dict
2490 @return: dictionary with the following keys:
2491 - memory: memory size of instance (int)
2492 - state: state of instance (HvInstanceState)
2493 - time: cpu time of instance (float)
2494 - vcpus: the number of vcpus (int)
2495
2496 """
2497 output = {}
2498
2499 iinfo = hypervisor.GetHypervisor(hname).GetInstanceInfo(instance,
2500 hvparams=hvparams)
2501 if iinfo is not None:
2502 output["memory"] = iinfo[2]
2503 output["vcpus"] = iinfo[3]
2504 output["state"] = iinfo[4]
2505 output["time"] = iinfo[5]
2506
2507 return output
2508
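# Example shape of the returned dictionary for a running instance
# (hypothetical values; "state" is an hv_base.HvInstanceState):
#
#   GetInstanceInfo("inst1.example.com", "kvm")
#   # -> {"memory": 1024, "vcpus": 2, "state": <running>, "time": 12.5}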
2509
2510 def GetInstanceMigratable(instance):
2511 """Computes whether an instance can be migrated.
2512
2513 @type instance: L{objects.Instance}
2514 @param instance: object representing the instance to be checked.
2515
2516   @rtype: None
2517   @return: None; the function raises an RPC failure via L{_Fail} if the
2518     instance is not running on this node and merely logs warnings about
2519     missing disk symlinks
2520
2521 """
2522 hyper = hypervisor.GetHypervisor(instance.hypervisor)
2523 iname = instance.name
2524 if iname not in hyper.ListInstances(hvparams=instance.hvparams):
2525 _Fail("Instance %s is not running", iname)
2526
2527   for idx, _ in enumerate(instance.disks_info):
2528 link_name = _GetBlockDevSymlinkPath(iname, idx)
2529 if not os.path.islink(link_name):
2530 logging.warning("Instance %s is missing symlink %s for disk %d",
2531 iname, link_name, idx)
2532
2533
2534 def GetAllInstancesInfo(hypervisor_list, all_hvparams):
2535 """Gather data about all instances.
2536
2537 This is the equivalent of L{GetInstanceInfo}, except that it
2538 computes data for all instances at once, thus being faster if one
2539 needs data about more than one instance.
2540
2541 @type hypervisor_list: list
2542 @param hypervisor_list: list of hypervisors to query for instance data
2543 @type all_hvparams: dict of dict of strings
2544 @param all_hvparams: mapping of hypervisor names to hvparams
2545
2546 @rtype: dict
2547 @return: dictionary of instance: data, with data having the following keys:
2548 - memory: memory size of instance (int)
2549 - state: xen state of instance (string)
2550 - time: cpu time of instance (float)
2551 - vcpus: the number of vcpus
2552
2553 """
2554 output = {}
2555 for hname in hypervisor_list:
2556 hvparams = all_hvparams[hname]
2557 iinfo = hypervisor.GetHypervisor(hname).GetAllInstancesInfo(hvparams)
2558 if iinfo:
2559 for name, _, memory, vcpus, state, times in iinfo:
2560 value = {
2561 "memory": memory,
2562 "vcpus": vcpus,
2563 "state": state,
2564 "time": times,
2565 }
2566 if name in output:
2567 # we only check static parameters, like memory and vcpus,
2568 # and not state and time which can change between the
2569 # invocations of the different hypervisors
2570 for key in "memory", "vcpus":
2571 if value[key] != output[name][key]:
2572 _Fail("Instance %s is running twice"
2573 " with different parameters", name)
2574 output[name] = value
2575
2576 return output
2577
2578
2579 def GetInstanceConsoleInfo(instance_param_dict,
2580 get_hv_fn=hypervisor.GetHypervisor):
2581 """Gather data about the console access of a set of instances of this node.
2582
2583 This function assumes that the caller already knows which instances are on
2584 this node, by calling a function such as L{GetAllInstancesInfo} or
2585 L{GetInstanceList}.
2586
2587 For every instance, a large amount of configuration data needs to be
2588 provided to the hypervisor interface in order to receive the console
2589 information. Whether this could or should be cut down can be discussed.
2590 The information is provided in a dictionary indexed by instance name,
2591 allowing any number of instance queries to be done.
2592
2593 @type instance_param_dict: dict of string to tuple of dictionaries, where the
2594 dictionaries represent: L{objects.Instance}, L{objects.Node},
2595 L{objects.NodeGroup}, HvParams, BeParams
2596 @param instance_param_dict: mapping of instance name to parameters necessary
2597 for console information retrieval
2598
2599 @rtype: dict
2600 @return: dictionary of instance: data, with data having the following keys:
2601 - instance: instance name
2602 - kind: console kind
2603 - message: used with kind == CONS_MESSAGE, indicates console to be
2604 unavailable, supplies error message
2605 - host: host to connect to
2606 - port: port to use
2607 - user: user for login
2608 - command: the command, broken into parts as an array
2609 - display: unknown, potentially unused?
2610
2611 """
2612
2613 output = {}
2614 for inst_name in instance_param_dict:
2615 instance = instance_param_dict[inst_name]["instance"]
2616 pnode = instance_param_dict[inst_name]["node"]
2617 group = instance_param_dict[inst_name]["group"]
2618 hvparams = instance_param_dict[inst_name]["hvParams"]
2619 beparams = instance_param_dict[inst_name]["beParams"]
2620
2621 instance = objects.Instance.FromDict(instance)
2622 pnode = objects.Node.FromDict(pnode)
2623 group = objects.NodeGroup.FromDict(group)
2624
2625 h = get_hv_fn(instance.hypervisor)
2626 output[inst_name] = h.GetInstanceConsole(instance, pnode, group,
2627 hvparams, beparams).ToDict()
2628
2629 return output
2630
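# Sketch of the expected input mapping for the function above, with the
# serialized objects produced via their ToDict() methods (instance names
# and variables are hypothetical):
#
#   GetInstanceConsoleInfo({
#     "inst1.example.com": {"instance": inst.ToDict(),
#                           "node": pnode.ToDict(),
#                           "group": group.ToDict(),
#                           "hvParams": hvparams,
#                           "beParams": beparams},
#     })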
2631
2632 def _InstanceLogName(kind, os_name, instance, component):
2633 """Compute the OS log filename for a given instance and operation.
2634
2635 The instance name and os name are passed in as strings since not all
2636 operations have these as part of an instance object.
2637
2638 @type kind: string
2639 @param kind: the operation type (e.g. add, import, etc.)
2640 @type os_name: string
2641 @param os_name: the os name
2642 @type instance: string
2643 @param instance: the name of the instance being imported/added/etc.
2644 @type component: string or None
2645 @param component: the name of the component of the instance being
2646 transferred
2647
2648 """
2649 # TODO: Use tempfile.mkstemp to create unique filename
2650 if component:
2651 assert "/" not in component
2652 c_msg = "-%s" % component
2653 else:
2654 c_msg = ""
2655 base = ("%s-%s-%s%s-%s.log" %
2656 (kind, os_name, instance, c_msg, utils.TimestampForFilename()))
2657 return utils.PathJoin(pathutils.LOG_OS_DIR, base)
2658
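# Illustrative result (the timestamp comes from utils.TimestampForFilename
# and is shown symbolically; the OS name is hypothetical):
#
#   _InstanceLogName("add", "debootstrap", "inst1.example.com", None)
#   # -> "<LOG_OS_DIR>/add-debootstrap-inst1.example.com-<timestamp>.log"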
2659
2660 def InstanceOsAdd(instance, reinstall, debug):
2661 """Add an OS to an instance.
2662
2663 @type instance: L{objects.Instance}
2664 @param instance: Instance whose OS is to be installed
2665 @type reinstall: boolean
2666 @param reinstall: whether this is an instance reinstall
2667 @type debug: integer
2668 @param debug: debug level, passed to the OS scripts
2669 @rtype: None
2670
2671 """
2672 inst_os = OSFromDisk(instance.os)
2673
2674 create_env = OSEnvironment(instance, inst_os, debug)
2675 if reinstall:
2676 create_env["INSTANCE_REINSTALL"] = "1"
2677
2678 logfile = _InstanceLogName("add", instance.os, instance.name, None)
2679
2680 result = utils.RunCmd([inst_os.create_script], env=create_env,
2681 cwd=inst_os.path, output=logfile, reset_env=True)
2682 if result.failed:
2683 logging.error("os create command '%s' returned error: %s, logfile: %s,"
2684 " output: %s", result.cmd, result.fail_reason, logfile,
2685 result.output)
2686 lines = [utils.SafeEncode(val)
2687 for val in utils.TailFile(logfile, lines=20)]
2688 _Fail("OS create script failed (%s), last lines in the"
2689 " log file:\n%s", result.fail_reason, "\n".join(lines), log=False)
2690
2691
2692 def RunRenameInstance(instance, old_name, debug):
2693 """Run the OS rename script for an instance.
2694
2695 @type instance: L{objects.Instance}
2696 @param instance: Instance whose OS is to be installed
2697 @type old_name: string
2698 @param old_name: previous instance name
2699 @type debug: integer
2700 @param debug: debug level, passed to the OS scripts
2701 @rtype: boolean
2702 @return: the success of the operation
2703
2704 """
2705 inst_os = OSFromDisk(instance.os)
2706
2707 rename_env = OSEnvironment(instance, inst_os, debug)
2708 rename_env["OLD_INSTANCE_NAME"] = old_name
2709
2710 logfile = _InstanceLogName("rename", instance.os,
2711 "%s-%s" % (old_name, instance.name), None)
2712
2713 result = utils.RunCmd([inst_os.rename_script], env=rename_env,
2714 cwd=inst_os.path, output=logfile, reset_env=True)
2715
2716 if result.failed:
2717 logging.error("os create command '%s' returned error: %s output: %s",
2718 result.cmd, result.fail_reason, result.output)
2719 lines = [utils.SafeEncode(val)
2720 for val in utils.TailFile(logfile, lines=20)]
2721 _Fail("OS rename script failed (%s), last lines in the"
2722 " log file:\n%s", result.fail_reason, "\n".join(lines), log=False)
2723
2724
2725 def _GetBlockDevSymlinkPath(instance_name, idx, _dir=None):
2726 """Returns symlink path for block device.
2727
2728 """
2729 if _dir is None:
2730 _dir = pathutils.DISK_LINKS_DIR
2731
2732 return utils.PathJoin(_dir,
2733 ("%s%s%s" %
2734 (instance_name, constants.DISK_SEPARATOR, idx)))
2735
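# Illustrative result, assuming constants.DISK_SEPARATOR is ":":
#
#   _GetBlockDevSymlinkPath("inst1.example.com", 0)
#   # -> "<DISK_LINKS_DIR>/inst1.example.com:0"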
2736
2737 def _SymlinkBlockDev(instance_name, device_path, idx):
2738   """Set up symlinks to an instance's block device.
2739
2740   This is an auxiliary function run when an instance is started (on the
2741   primary node) or when an instance is migrated (on the target node).
2742
2743
2744 @param instance_name: the name of the target instance
2745 @param device_path: path of the physical block device, on the node
2746 @param idx: the disk index
2747   @return: absolute path to the disk's symlink, or None
2748
2749 """
2750 # In case we have only a userspace access URI, device_path is None
2751 if not device_path:
2752 return None
2753
2754 link_name = _GetBlockDevSymlinkPath(instance_name, idx)
2755 try:
2756 os.symlink(device_path, link_name)
2757 except OSError, err:
2758 if err.errno == errno.EEXIST:
2759 if (not os.path.islink(link_name) or
2760 os.readlink(link_name) != device_path):
2761 os.remove(link_name)
2762 os.symlink(device_path, link_name)
2763 else:
2764 raise
2765
2766 return link_name
2767
2768
2769 def _RemoveBlockDevLinks(instance_name, disks):
2770 """Remove the block device symlinks belonging to the given instance.
2771
2772 """
2773 for idx, _ in enumerate(disks):
2774 link_name = _GetBlockDevSymlinkPath(instance_name, idx)
2775 if os.path.islink(link_name):
2776 try:
2777 os.remove(link_name)
2778 except OSError:
2779 logging.exception("Can't remove symlink '%s'", link_name)
2780
2781
2782 def _CalculateDeviceURI(instance, disk, device):
2783 """Get the URI for the device.
2784
2785 @type instance: L{objects.Instance}
2786 @param instance: the instance which disk belongs to
2787 @type disk: L{objects.Disk}
2788 @param disk: the target disk object
2789 @type device: L{bdev.BlockDev}
2790 @param device: the corresponding BlockDevice
2791 @rtype: string
2792 @return: the device uri if any else None
2793
2794 """
2795 access_mode = disk.params.get(constants.LDP_ACCESS,
2796 constants.DISK_KERNELSPACE)
2797 if access_mode == constants.DISK_USERSPACE:
2798 # This can raise errors.BlockDeviceError
2799 return device.GetUserspaceAccessUri(instance.hypervisor)
2800 else:
2801 return None
2802
2803
2804 def _GatherAndLinkBlockDevs(instance):
2805 """Set up an instance's block device(s).
2806
2807 This is run on the primary node at instance startup. The block
2808 devices must be already assembled.
2809
2810 @type instance: L{objects.Instance}
2811 @param instance: the instance whose disks we should assemble
2812 @rtype: list
2813 @return: list of (disk_object, link_name, drive_uri)
2814
2815 """
2816 block_devices = []
2817 for idx, disk in enumerate(instance.disks_info):
2818 device = _RecursiveFindBD(disk)
2819 if device is None:
2820 raise errors.BlockDeviceError("Block device '%s' is not set up." %
2821 str(disk))
2822 device.Open()
2823 try:
2824 link_name = _SymlinkBlockDev(instance.name, device.dev_path, idx)
2825 except OSError, e:
2826 raise errors.BlockDeviceError("Cannot create block device symlink: %s" %
2827 e.strerror)
2828 uri = _CalculateDeviceURI(instance, disk, device)
2829
2830 block_devices.append((disk, link_name, uri))
2831
2832 return block_devices
2833
2834
2835 def _IsInstanceUserDown(instance_info):
2836 return instance_info and \
2837 "state" in instance_info and \
2838 hv_base.HvInstanceState.IsShutdown(instance_info["state"])
2839
2840
2841 def _GetInstanceInfo(instance):
2842 """Helper function L{GetInstanceInfo}"""
2843 return GetInstanceInfo(instance.name, instance.hypervisor,
2844 hvparams=instance.hvparams)
2845
2846
2847 def StartInstance(instance, startup_paused, reason, store_reason=True):
2848 """Start an instance.
2849
2850 @type instance: L{objects.Instance}
2851 @param instance: the instance object
2852 @type startup_paused: bool
2853   @param startup_paused: whether to pause the instance at startup
2854 @type reason: list of reasons
2855 @param reason: the reason trail for this startup
2856 @type store_reason: boolean
2857 @param store_reason: whether to store the shutdown reason trail on file
2858 @rtype: None
2859
2860 """
2861 instance_info = _GetInstanceInfo(instance)
2862
2863 if instance_info and not _IsInstanceUserDown(instance_info):
2864 logging.info("Instance '%s' already running, not starting", instance.name)
2865 return
2866
2867 try:
2868 block_devices = _GatherAndLinkBlockDevs(instance)
2869 hyper = hypervisor.GetHypervisor(instance.hypervisor)
2870 hyper.StartInstance(instance, block_devices, startup_paused)
2871 if store_reason:
2872 _StoreInstReasonTrail(instance.name, reason)
2873 except errors.BlockDeviceError, err:
2874 _Fail("Block device error: %s", err, exc=True)
2875 except errors.HypervisorError, err:
2876 _RemoveBlockDevLinks(instance.name, instance.disks_info)
2877 _Fail("Hypervisor error: %s", err, exc=True)
2878
2879
2880 def InstanceShutdown(instance, timeout, reason, store_reason=True):
2881 """Shut an instance down.
2882
2883   @note: this function uses polling with a hardcoded retry interval.
2884
2885 @type instance: L{objects.Instance}
2886 @param instance: the instance object
2887 @type timeout: integer
2888 @param timeout: maximum timeout for soft shutdown
2889 @type reason: list of reasons
2890 @param reason: the reason trail for this shutdown
2891 @type store_reason: boolean
2892 @param store_reason: whether to store the shutdown reason trail on file
2893 @rtype: None
2894
2895 """
2896 hyper = hypervisor.GetHypervisor(instance.hypervisor)
2897
2898 if not _GetInstanceInfo(instance):
2899 logging.info("Instance '%s' not running, doing nothing", instance.name)
2900 return
2901
2902 class _TryShutdown(object):
2903 def __init__(self):
2904 self.tried_once = False
2905
2906 def __call__(self):
2907 if not _GetInstanceInfo(instance):
2908 return
2909
2910 try:
2911 hyper.StopInstance(instance, retry=self.tried_once, timeout=timeout)
2912 if store_reason:
2913 _StoreInstReasonTrail(instance.name, reason)
2914 except errors.HypervisorError, err:
2915 # if the instance is no longer existing, consider this a
2916 # success and go to cleanup
2917 if not _GetInstanceInfo(instance):
2918 return
2919
2920 _Fail("Failed to stop instance '%s': %s", instance.name, err)
2921
2922 self.tried_once = True
2923
2924 raise utils.RetryAgain()
2925
2926 try:
2927 utils.Retry(_TryShutdown(), 5, timeout)
2928 except utils.RetryTimeout:
2929 # the shutdown did not succeed
2930 logging.error("Shutdown of '%s' unsuccessful, forcing", instance.name)
2931
2932 try:
2933 hyper.StopInstance(instance, force=True)
2934 except errors.HypervisorError, err:
2935 # only raise an error if the instance still exists, otherwise
2936 # the error could simply be "instance ... unknown"!
2937 if _GetInstanceInfo(instance):
2938 _Fail("Failed to force stop instance '%s': %s", instance.name, err)
2939
2940 time.sleep(1)
2941
2942 if _GetInstanceInfo(instance):
2943 _Fail("Could not shutdown instance '%s' even by destroy", instance.name)
2944
2945 try:
2946 hyper.CleanupInstance(instance.name)
2947 except errors.HypervisorError, err:
2948 logging.warning("Failed to execute post-shutdown cleanup step: %s", err)
2949
2950 _RemoveBlockDevLinks(instance.name, instance.disks_info)
2951
2952
2953 def InstanceReboot(instance, reboot_type, shutdown_timeout, reason):
2954 """Reboot an instance.
2955
2956 @type instance: L{objects.Instance}
2957 @param instance: the instance object to reboot
2958 @type reboot_type: str
2959   @param reboot_type: the type of reboot, one of the following
2960 constants:
2961 - L{constants.INSTANCE_REBOOT_SOFT}: only reboot the
2962 instance OS, do not recreate the VM
2963 - L{constants.INSTANCE_REBOOT_HARD}: tear down and
2964 restart the VM (at the hypervisor level)
2965 - the other reboot type (L{constants.INSTANCE_REBOOT_FULL}) is
2966 not accepted here, since that mode is handled differently, in
2967 cmdlib, and translates into full stop and start of the
2968 instance (instead of a call_instance_reboot RPC)
2969 @type shutdown_timeout: integer
2970 @param shutdown_timeout: maximum timeout for soft shutdown
2971 @type reason: list of reasons
2972 @param reason: the reason trail for this reboot
2973 @rtype: None
2974
2975 """
2976 # TODO: this is inconsistent with 'StartInstance' and 'InstanceShutdown'
2977 # because those functions simply 'return' on error whereas this one
2978 # raises an exception with '_Fail'
2979 if not _GetInstanceInfo(instance):
2980 _Fail("Cannot reboot instance '%s' that is not running", instance.name)
2981
2982 hyper = hypervisor.GetHypervisor(instance.hypervisor)
2983 if reboot_type == constants.INSTANCE_REBOOT_SOFT:
2984 try:
2985 hyper.RebootInstance(instance)
2986 except errors.HypervisorError, err:
2987 _Fail("Failed to soft reboot instance '%s': %s", instance.name, err)
2988 elif reboot_type == constants.INSTANCE_REBOOT_HARD:
2989 try:
2990 InstanceShutdown(instance, shutdown_timeout, reason, store_reason=False)
2991 result = StartInstance(instance, False, reason, store_reason=False)
2992 _StoreInstReasonTrail(instance.name, reason)
2993 return result
2994 except errors.HypervisorError, err:
2995 _Fail("Failed to hard reboot instance '%s': %s", instance.name, err)
2996 else:
2997 _Fail("Invalid reboot_type received: '%s'", reboot_type)
2998
2999
3000 def InstanceBalloonMemory(instance, memory):
3001 """Resize an instance's memory.
3002
3003 @type instance: L{objects.Instance}
3004 @param instance: the instance object
3005 @type memory: int
3006 @param memory: new memory amount in MB
3007 @rtype: None
3008
3009 """
3010 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3011 running = hyper.ListInstances(hvparams=instance.hvparams)
3012 if instance.name not in running:
3013 logging.info("Instance %s is not running, cannot balloon", instance.name)
3014 return
3015 try:
3016 hyper.BalloonInstanceMemory(instance, memory)
3017 except errors.HypervisorError, err:
3018 _Fail("Failed to balloon instance memory: %s", err, exc=True)
3019
3020
3021 def MigrationInfo(instance):
3022 """Gather information about an instance to be migrated.
3023
3024 @type instance: L{objects.Instance}
3025 @param instance: the instance definition
3026
3027 """
3028 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3029 try:
3030 info = hyper.MigrationInfo(instance)
3031 except errors.HypervisorError, err:
3032 _Fail("Failed to fetch migration information: %s", err, exc=True)
3033 return info
3034
3035
3036 def AcceptInstance(instance, info, target):
3037 """Prepare the node to accept an instance.
3038
3039 @type instance: L{objects.Instance}
3040 @param instance: the instance definition
3041 @type info: string/data (opaque)
3042 @param info: migration information, from the source node
3043 @type target: string
3044 @param target: target host (usually ip), on this node
3045
3046 """
3047 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3048 try:
3049 hyper.AcceptInstance(instance, info, target)
3050 except errors.HypervisorError, err:
3051 _Fail("Failed to accept instance: %s", err, exc=True)
3052
3053
3054 def FinalizeMigrationDst(instance, info, success):
3055 """Finalize any preparation to accept an instance.
3056
3057 @type instance: L{objects.Instance}
3058 @param instance: the instance definition
3059 @type info: string/data (opaque)
3060 @param info: migration information, from the source node
3061 @type success: boolean
3062 @param success: whether the migration was a success or a failure
3063
3064 """
3065 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3066 try:
3067 hyper.FinalizeMigrationDst(instance, info, success)
3068 except errors.HypervisorError, err:
3069 _Fail("Failed to finalize migration on the target node: %s", err, exc=True)
3070
3071
3072 def MigrateInstance(cluster_name, instance, target, live):
3073 """Migrates an instance to another node.
3074
3075 @type cluster_name: string
3076 @param cluster_name: name of the cluster
3077 @type instance: L{objects.Instance}
3078 @param instance: the instance definition
3079 @type target: string
3080 @param target: the target node name
3081 @type live: boolean
3082 @param live: whether the migration should be done live or not (the
3083 interpretation of this parameter is left to the hypervisor)
3084 @raise RPCFail: if migration fails for some reason
3085
3086 """
3087 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3088
3089 try:
3090 hyper.MigrateInstance(cluster_name, instance, target, live)
3091 except errors.HypervisorError, err:
3092 _Fail("Failed to migrate instance: %s", err, exc=True)
3093
3094
3095 def FinalizeMigrationSource(instance, success, live):
3096 """Finalize the instance migration on the source node.
3097
3098 @type instance: L{objects.Instance}
3099 @param instance: the instance definition of the migrated instance
3100 @type success: bool
3101 @param success: whether the migration succeeded or not
3102 @type live: bool
3103 @param live: whether the user requested a live migration or not
3104 @raise RPCFail: If the execution fails for some reason
3105
3106 """
3107 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3108
3109 try:
3110 hyper.FinalizeMigrationSource(instance, success, live)
3111 except Exception, err: # pylint: disable=W0703
3112 _Fail("Failed to finalize the migration on the source node: %s", err,
3113 exc=True)
3114
3115
3116 def GetMigrationStatus(instance):
3117 """Get the migration status
3118
3119 @type instance: L{objects.Instance}
3120 @param instance: the instance that is being migrated
3121 @rtype: L{objects.MigrationStatus}
3122 @return: the status of the current migration (one of
3123 L{constants.HV_MIGRATION_VALID_STATUSES}), plus any additional
3124 progress info that can be retrieved from the hypervisor
3125 @raise RPCFail: If the migration status cannot be retrieved
3126
3127 """
3128 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3129 try:
3130 return hyper.GetMigrationStatus(instance)
3131 except Exception, err: # pylint: disable=W0703
3132 _Fail("Failed to get migration status: %s", err, exc=True)
3133
3134
3135 def HotplugDevice(instance, action, dev_type, device, extra, seq):
3136 """Hotplug a device
3137
3138   Hotplug is currently supported only for the KVM hypervisor.
3139 @type instance: L{objects.Instance}
3140 @param instance: the instance to which we hotplug a device
3141 @type action: string
3142 @param action: the hotplug action to perform
3143 @type dev_type: string
3144 @param dev_type: the device type to hotplug
3145 @type device: either L{objects.NIC} or L{objects.Disk}
3146 @param device: the device object to hotplug
3147 @type extra: tuple
3148 @param extra: extra info used for disk hotplug (disk link, drive uri)
3149 @type seq: int
3150   @param seq: the index of the device from the master's perspective
3151 @raise RPCFail: in case instance does not have KVM hypervisor
3152
3153 """
3154 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3155 try:
3156 hyper.VerifyHotplugSupport(instance, action, dev_type)
3157 except errors.HotplugError, err:
3158 _Fail("Hotplug is not supported: %s", err)
3159
  if action == constants.HOTPLUG_ACTION_ADD:
    fn = hyper.HotAddDevice
  elif action == constants.HOTPLUG_ACTION_REMOVE:
    fn = hyper.HotDelDevice
  elif action == constants.HOTPLUG_ACTION_MODIFY:
    fn = hyper.HotModDevice
  else:
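    # Unreachable for valid input: every member of HOTPLUG_ALL_ACTIONS is
    # handled above, so reaching this branch means an unknown action was
    # passed and the assertion below fails loudly.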
    assert action in constants.HOTPLUG_ALL_ACTIONS

  return fn(instance, dev_type, device, extra, seq)


def HotplugSupported(instance):
  """Checks if hotplug is generally supported.

  """
  hyper = hypervisor.GetHypervisor(instance.hypervisor)
  try:
    hyper.HotplugSupported(instance)
  except errors.HotplugError, err:
    _Fail("Hotplug is not supported: %s", err)


def ModifyInstanceMetadata(metadata):
  """Sends instance data to the metadata daemon.

  Uses the Luxi transport layer to communicate with the metadata
  daemon configuration server. It starts the metadata daemon if it is
  not running.
  The daemon must have been enabled at configuration time.

  @type metadata: dict
  @param metadata: instance metadata obtained by calling
      L{objects.Instance.ToDict} on an instance object

  """
  if not constants.ENABLE_METAD:
    raise errors.ProgrammerError("The metadata daemon is disabled, yet"
                                 " ModifyInstanceMetadata has been called")

  if not utils.IsDaemonAlive(constants.METAD):
    result = utils.RunCmd([pathutils.DAEMON_UTIL, "start", constants.METAD])
    if result.failed:
      raise errors.HypervisorError("Failed to start metadata daemon")

  with contextlib.closing(metad.Client()) as client:
    client.UpdateConfig(metadata)


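# Illustrative only: a caller pushing fresh metadata for an instance object
# `inst` (hypothetical name) would do something like
#
#   ModifyInstanceMetadata(inst.ToDict())
#
# relying on this function to start the metadata daemon on demand.

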
def BlockdevCreate(disk, size, owner, on_primary, info, excl_stor):
  """Creates a block device for an instance.

  @type disk: L{objects.Disk}
  @param disk: the object describing the disk we should create
  @type size: int
  @param size: the size of the physical underlying device, in MiB
  @type owner: str
  @param owner: the name of the instance for which disk is created,
      used for device cache data
  @type on_primary: boolean
  @param on_primary: indicates if it is the primary node or not
  @type info: string
  @param info: string that will be sent to the physical device
      creation, used for example to set (LVM) tags on LVs
  @type excl_stor: boolean
  @param excl_stor: Whether exclusive_storage is active

  @return: the new unique_id of the device (this can sometimes be
      computed only after creation), or None. On secondary nodes,
      it's not required to return anything.

  """
  # TODO: remove the obsolete "size" argument
  # pylint: disable=W0613
  clist = []
  if disk.children:
    for child in disk.children:
      try:
        crdev = _RecursiveAssembleBD(child, owner, on_primary)
      except errors.BlockDeviceError, err:
        _Fail("Can't assemble device %s: %s", child, err)
      if on_primary or disk.AssembleOnSecondary():
        # we need the children open in case the device itself has to
        # be assembled
        try:
          # pylint: disable=E1103
          crdev.Open()
        except errors.BlockDeviceError, err:
          _Fail("Can't make child '%s' read-write: %s", child, err)
      clist.append(crdev)

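  # At this point all children (if any) are assembled and, where needed,
  # open; the device itself can now be created on top of them.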
  try:
    device = bdev.Create(disk, clist, excl_stor)
  except errors.BlockDeviceError, err:
    _Fail("Can't create block device: %s", err)

  if on_primary or disk.AssembleOnSecondary():
    try:
      device.Assemble()
    except errors.BlockDeviceError, err:
      _Fail("Can't assemble device after creation, unusual event: %s", err)
    if on_primary or disk.OpenOnSecondary():
      try:
        device.Open(force=True)
      except errors.BlockDeviceError, err:
        _Fail("Can't make device r/w after creation, unusual event: %s", err)
    DevCacheManager.UpdateCache(device.dev_path, owner,
                                on_primary, disk.iv_name)

  device.SetInfo(info)

  return device.unique_id


def _DumpDevice(source_path, target_path, offset, size, truncate):
  """This function images/wipes the device using a local file.

  @type source_path: string
  @param source_path: path of the image or data source (e.g., "/dev/zero")

  @type target_path: string
  @param target_path: path of the device to image/wipe

  @type offset: int
  @param offset: offset in MiB in the output file

  @type size: int
  @param size: maximum size in MiB to write (data source might be smaller)

  @type truncate: bool
  @param truncate: whether the file should be truncated

  @return: None
  @raise RPCFail: in case of failure

  """
  # Internal sizes are always in Mebibytes; if the following "dd" command
  # should use a different block size the offset and size given to this
  # function must be adjusted accordingly before being passed to "dd".
  block_size = constants.DD_BLOCK_SIZE

  cmd = [constants.DD_CMD, "if=%s" % source_path, "seek=%d" % offset,
         "bs=%s" % block_size, "oflag=direct", "of=%s" % target_path,
         "count=%d" % size]

  if not truncate:
    cmd.append("conv=notrunc")

  result = utils.RunCmd(cmd)

  if result.failed:
    _Fail("Dump command '%s' exited with error: %s; output: %s", result.cmd,
          result.fail_reason, result.output)


def _DownloadAndDumpDevice(source_url, target_path, size):
  """This function images a device using a downloaded image file.

  @type source_url: string
  @param source_url: URL of image to dump to disk

  @type target_path: string
  @param target_path: path of the device to image

  @type size: int
  @param size: maximum size in MiB to write (data source might be smaller)

  @rtype: NoneType
  @return: None
  @raise RPCFail: in case of download or write failures

  """
  class DDParams(object):
    def __init__(self, current_size, total_size):
      self.current_size = current_size
      self.total_size = total_size
      self.image_size_error = False

  def dd_write(ddparams, out):
    if ddparams.current_size < ddparams.total_size:
      ddparams.current_size += len(out)
      target_file.write(out)
    else:
      ddparams.image_size_error = True
      return -1

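  # dd_write serves as the sink for the downloaded data (in the rest of
  # this function it is presumably wired up as the pycurl write callback;
  # with libcurl, a write callback that does not consume all the data it
  # was given aborts the transfer). Exceeding total_size therefore flags
  # the error and stops the download instead of overrunning the device.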
  target_file = open(target_path, "r+")
  ddparams = DDParams(0, 1024 * 1024 * size)