Merge branch 'stable-2.15' into stable-2.16
[ganeti-github.git] / lib / backend.py
1 #
2 #
3
4 # Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014 Google Inc.
5 # All rights reserved.
6 #
7 # Redistribution and use in source and binary forms, with or without
8 # modification, are permitted provided that the following conditions are
9 # met:
10 #
11 # 1. Redistributions of source code must retain the above copyright notice,
12 # this list of conditions and the following disclaimer.
13 #
14 # 2. Redistributions in binary form must reproduce the above copyright
15 # notice, this list of conditions and the following disclaimer in the
16 # documentation and/or other materials provided with the distribution.
17 #
18 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
19 # IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
20 # TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
21 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
22 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
25 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
26 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
27 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30
31 """Functions used by the node daemon
32
33 @var _ALLOWED_UPLOAD_FILES: denotes which files are accepted in
34 the L{UploadFile} function
35 @var _ALLOWED_CLEAN_DIRS: denotes which directories are accepted
36 in the L{_CleanDirectory} function
37
38 """
39
40 # pylint: disable=E1103,C0302
41
42 # E1103: %s %r has no %r member (but some types could not be
43 # inferred), because the _TryOSFromDisk returns either (True, os_obj)
44 # or (False, "string") which confuses pylint
45
46 # C0302: This module has become too big and should be split up
47
48
49 import base64
50 import errno
51 import logging
52 import os
53 import os.path
54 import pycurl
55 import random
56 import re
57 import shutil
58 import signal
59 import stat
60 import tempfile
61 import time
62 import zlib
63 import contextlib
64 import collections
65
66 from ganeti import errors
67 from ganeti import http
68 from ganeti import utils
69 from ganeti import ssh
70 from ganeti import hypervisor
71 from ganeti.hypervisor import hv_base
72 from ganeti import constants
73 from ganeti.storage import bdev
74 from ganeti.storage import drbd
75 from ganeti.storage import extstorage
76 from ganeti.storage import filestorage
77 from ganeti import objects
78 from ganeti import ssconf
79 from ganeti import serializer
80 from ganeti import netutils
81 from ganeti import runtime
82 from ganeti import compat
83 from ganeti import pathutils
84 from ganeti import vcluster
85 from ganeti import ht
86 from ganeti.storage.base import BlockDev
87 from ganeti.storage.drbd import DRBD8
88 from ganeti import hooksmaster
89 import ganeti.metad as metad
90
91
#: Path of the file that L{GetNodeInfo} reads to obtain the node's boot ID
_BOOT_ID_PATH = "/proc/sys/kernel/random/boot_id"
#: Directories on which L{_CleanDirectory} agrees to operate; any other path
#: passed to it is rejected
_ALLOWED_CLEAN_DIRS = compat.UniqueFrozenset([
  pathutils.DATA_DIR,
  pathutils.JOB_QUEUE_ARCHIVE_DIR,
  pathutils.QUEUE_DIR,
  pathutils.CRYPTO_KEYS_DIR,
  ])
#: 7 * 24 * 60 * 60, i.e. one week if the unit is seconds (the unit is not
#: visible in this part of the file -- TODO confirm against the users)
_MAX_SSL_CERT_VALIDITY = 7 * 24 * 60 * 60
# NOTE(review): the following look like file names relative to per-certificate
# and per-import/export directories -- confirm against their users
_X509_KEY_FILE = "key"
_X509_CERT_FILE = "cert"
_IES_STATUS_FILE = "status"
_IES_PID_FILE = "pid"
_IES_CA_FILE = "ca"

#: Valid LVS output line regex (four "|"-separated fields, the third one
#: numeric, the last one at least six characters long)
_LVSLINE_REGEX = re.compile(r"^ *([^|]+)\|([^|]+)\|([0-9.]+)\|([^|]{6,})\|?$")

# Actions for the master setup script
_MASTER_START = "start"
_MASTER_STOP = "stop"

#: Maximum file permissions for restricted command directory and executables
#: (rwx for the owner, r-x for group and others)
_RCMD_MAX_MODE = (stat.S_IRWXU |
                  stat.S_IRGRP | stat.S_IXGRP |
                  stat.S_IROTH | stat.S_IXOTH)

#: Delay before returning an error for restricted commands
_RCMD_INVALID_DELAY = 10

#: How long to wait to acquire lock for restricted commands (shorter than
#: L{_RCMD_INVALID_DELAY}) to reduce blockage of noded forks when many
#: command requests arrive
_RCMD_LOCK_TIMEOUT = _RCMD_INVALID_DELAY * 0.8
125
126
class RPCFail(Exception):
  """Exception signalling that an RPC call has failed.

  The error message is passed as the constructor argument; see L{_Fail},
  which raises this exception with the (optionally formatted) message.

  """
133
134
def _GetInstReasonFilename(instance_name):
  """Compute the location of an instance's reason-trail file.

  The file lives directly below L{pathutils.INSTANCE_REASON_DIR} and is named
  after the instance.

  @type instance_name: string
  @param instance_name: The name of the instance
  @rtype: string
  @return: The path of the file

  """
  reason_dir = pathutils.INSTANCE_REASON_DIR
  return utils.PathJoin(reason_dir, instance_name)
145
146
def _StoreInstReasonTrail(instance_name, trail):
  """Write the reason trail of an instance state change to its file.

  The trail is serialized to JSON and stored in the file whose path is
  computed by L{_GetInstReasonFilename}.

  @type instance_name: string
  @param instance_name: The name of the instance

  @type trail: list of reasons
  @param trail: reason trail

  @rtype: None

  """
  # Serialize first, so that nothing is written if serialization fails
  serialized = serializer.DumpJson(trail)
  utils.WriteFile(_GetInstReasonFilename(instance_name), data=serialized)
165
166
def _Fail(msg, *args, **kwargs):
  """Log an error and then raise an RPCFail exception.

  This exception is then handled specially in the ganeti daemon and
  turned into a 'failed' return type. As such, this function is a
  useful shortcut for logging the error and returning it to the master
  daemon.

  @type msg: string
  @param msg: the text of the exception; if positional arguments are given,
    it is used as a format string and interpolated with them
  @keyword log: whether to log the message (defaults to true)
  @keyword exc: if true, log via C{logging.exception} (which includes the
    traceback of the exception being handled) instead of C{logging.error}
  @raise RPCFail: always

  """
  text = msg % args if args else msg

  if kwargs.get("log", True): # if we should log this error
    log_fn = logging.exception if kwargs.get("exc") else logging.error
    log_fn(text)

  raise RPCFail(text)
188
189
def _GetConfig():
  """Create a fresh L{ssconf.SimpleStore}.

  @rtype: L{ssconf.SimpleStore}
  @return: a SimpleStore instance

  """
  store = ssconf.SimpleStore()
  return store
198
199
def _GetSshRunner(cluster_name):
  """Create an L{ssh.SshRunner} for the given cluster.

  @type cluster_name: str
  @param cluster_name: the cluster name, which is needed
      by the SshRunner constructor
  @rtype: L{ssh.SshRunner}
  @return: an SshRunner instance

  """
  runner = ssh.SshRunner(cluster_name)
  return runner
211
212
def _Decompress(data):
  """Unpacks data compressed by the RPC client.

  The input is an (encoding, content) pair. Content sent with
  L{constants.RPC_ENCODING_NONE} is returned untouched, content sent with
  L{constants.RPC_ENCODING_ZLIB_BASE64} is base64-decoded and then
  zlib-decompressed.

  @type data: list or tuple
  @param data: Data sent by RPC client
  @rtype: str
  @return: Decompressed data
  @raise AssertionError: if C{data} is not a two-element list/tuple or the
    encoding is unknown

  """
  assert isinstance(data, (list, tuple))
  assert len(data) == 2

  (encoding, content) = data

  if encoding == constants.RPC_ENCODING_NONE:
    return content

  if encoding == constants.RPC_ENCODING_ZLIB_BASE64:
    compressed = base64.b64decode(content)
    return zlib.decompress(compressed)

  raise AssertionError("Unknown data encoding")
231
232
def _CleanDirectory(path, exclude=None):
  """Removes all regular files in a directory.

  Only the visible entries directly inside C{path} are considered;
  symbolic links and anything that is not a regular file are left alone.
  Nothing happens if C{path} is not a directory.

  @type path: str
  @param path: the directory to clean; must be one of
    L{_ALLOWED_CLEAN_DIRS}
  @type exclude: list
  @param exclude: list of files to be excluded, defaults
      to the empty list
  @raise RPCFail: if C{path} is not an allowed clean target

  """
  if path not in _ALLOWED_CLEAN_DIRS:
    _Fail("Path passed to _CleanDirectory not in allowed clean targets: '%s'",
          path)

  if not os.path.isdir(path):
    return

  if exclude is None:
    skipped = []
  else:
    # Normalize excluded paths before comparing them with the joined names
    skipped = [os.path.normpath(entry) for entry in exclude]

  for rel_name in utils.ListVisibleFiles(path):
    candidate = utils.PathJoin(path, rel_name)
    if candidate in skipped:
      continue

    is_plain_file = os.path.isfile(candidate) and not os.path.islink(candidate)
    if is_plain_file:
      utils.RemoveFile(candidate)
261
262
def _BuildUploadFileList():
  """Build the list of allowed upload files.

  The result consists of a fixed list of cluster files plus the first
  element of C{GetAncillaryFiles()} of every hypervisor class listed in
  L{constants.HYPER_TYPES}.

  This is abstracted so that it's built only once at module import time.

  @rtype: frozenset
  @return: the paths which may be uploaded

  """
  base_files = [
    pathutils.CLUSTER_CONF_FILE,
    pathutils.ETC_HOSTS,
    pathutils.SSH_KNOWN_HOSTS_FILE,
    pathutils.VNC_PASSWORD_FILE,
    pathutils.RAPI_CERT_FILE,
    pathutils.SPICE_CERT_FILE,
    pathutils.SPICE_CACERT_FILE,
    pathutils.RAPI_USERS_FILE,
    pathutils.CONFD_HMAC_KEY,
    pathutils.CLUSTER_DOMAIN_SECRET_FILE,
    ]
  upload_ok = set(base_files)

  for hv_name in constants.HYPER_TYPES:
    ancillary = hypervisor.GetHypervisorClass(hv_name).GetAncillaryFiles()
    upload_ok.update(ancillary[0])

  assert pathutils.FILE_STORAGE_PATHS_FILE not in upload_ok, \
    "Allowed file storage paths should never be uploaded via RPC"

  return frozenset(upload_ok)


#: Computed once, at module import time
_ALLOWED_UPLOAD_FILES = _BuildUploadFileList()
293
294
def JobQueuePurge():
  """Removes job queue files and archived jobs.

  Cleans L{pathutils.QUEUE_DIR} (keeping the job queue lock file) and
  L{pathutils.JOB_QUEUE_ARCHIVE_DIR} via L{_CleanDirectory}.

  @rtype: None
  @return: nothing is returned by this function itself

  """
  queue_keep = [pathutils.JOB_QUEUE_LOCK_FILE]
  _CleanDirectory(pathutils.QUEUE_DIR, exclude=queue_keep)
  _CleanDirectory(pathutils.JOB_QUEUE_ARCHIVE_DIR)
304
305
def GetMasterNodeName():
  """Returns the master node name.

  The name is looked up through the simple store returned by
  L{_GetConfig}.

  @rtype: string
  @return: name of the master node
  @raise RPCFail: in case of errors

  """
  try:
    master_name = _GetConfig().GetMasterNode()
  except errors.ConfigurationError as err:
    # _Fail logs the message (with traceback) and raises RPCFail
    _Fail("Cluster configuration incomplete: %s", err, exc=True)
  else:
    return master_name
318
319
def RunLocalHooks(hook_opcode, hooks_path, env_builder_fn):
  """Decorator that runs hooks before and after the decorated function.

  The hooks run on the local node only. The post-phase hooks run only if
  the decorated function returns normally; if it raises, the exception
  propagates and the post phase is skipped.

  The returned wrapper keeps the name and docstring of the decorated
  function.

  @type hook_opcode: string
  @param hook_opcode: opcode of the hook
  @type hooks_path: string
  @param hooks_path: path of the hooks
  @type env_builder_fn: function
  @param env_builder_fn: function that returns a dictionary containing the
    environment variables for the hooks. Will get all the parameters of the
    decorated function.
  @raise RPCFail: in case of pre-hook failure (NOTE(review): taken from the
    original documentation; the conversion to RPCFail is not visible in this
    function -- confirm in the hooks master/runner code)

  """
  # Local import: functools is not among this module's top-level imports
  import functools

  def decorator(fn):
    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
      _, myself = ssconf.GetMasterAndMyself()
      nodes = ([myself], [myself])  # these hooks run locally

      env_fn = compat.partial(env_builder_fn, *args, **kwargs)

      cfg = _GetConfig()
      hr = HooksRunner()
      hm = hooksmaster.HooksMaster(hook_opcode, hooks_path, nodes,
                                   hr.RunLocalHooks, None, env_fn, None,
                                   logging.warning, cfg.GetClusterName(),
                                   cfg.GetMasterNode())
      hm.RunPhase(constants.HOOKS_PHASE_PRE)
      result = fn(*args, **kwargs)
      hm.RunPhase(constants.HOOKS_PHASE_POST)

      return result
    return wrapper
  return decorator
354
355
def _BuildMasterIpEnv(master_params, use_external_mip_script=None):
  """Builds environment variables for master IP hooks.

  @type master_params: L{objects.MasterNetworkParameters}
  @param master_params: network parameters of the master
  @type use_external_mip_script: boolean
  @param use_external_mip_script: whether to use an external master IP
    address setup script (unused, but necessary because the L{RunLocalHooks}
    decorator passes all arguments of the decorated function)
  @rtype: dict
  @return: dictionary with the keys C{MASTER_NETDEV}, C{MASTER_IP},
    C{MASTER_NETMASK} and C{CLUSTER_IP_VERSION}

  """
  # pylint: disable=W0613
  ip_version = netutils.IPAddress.GetVersionFromAddressFamily(
    master_params.ip_family)

  return {
    "MASTER_NETDEV": master_params.netdev,
    "MASTER_IP": master_params.ip,
    "MASTER_NETMASK": str(master_params.netmask),
    "CLUSTER_IP_VERSION": str(ip_version),
    }
377
378
def _RunMasterSetupScript(master_params, action, use_external_mip_script):
  """Execute the master IP address setup script.

  The script is run with a clean environment containing only the variables
  built by L{_BuildMasterIpEnv}.

  @type master_params: L{objects.MasterNetworkParameters}
  @param master_params: network parameters of the master
  @type action: string
  @param action: action to pass to the script. Must be one of
    L{backend._MASTER_START} or L{backend._MASTER_STOP}
  @type use_external_mip_script: boolean
  @param use_external_mip_script: whether to use an external master IP
    address setup script
  @raise backend.RPCFail: if there are errors during the execution of the
    script

  """
  env = _BuildMasterIpEnv(master_params)

  setup_script = (pathutils.EXTERNAL_MASTER_SETUP_SCRIPT
                  if use_external_mip_script
                  else pathutils.DEFAULT_MASTER_SETUP_SCRIPT)

  result = utils.RunCmd([setup_script, action], env=env, reset_env=True)
  if not result.failed:
    return

  # The message is interpolated by _Fail itself
  _Fail("Failed to %s the master IP. Script return value: %s, output: '%s'",
        action, result.exit_code, result.output, log=True)
406
407
@RunLocalHooks(constants.FAKE_OP_MASTER_TURNUP, "master-ip-turnup",
               _BuildMasterIpEnv)
def ActivateMasterIp(master_params, use_external_mip_script):
  """Activate the IP address of the master daemon.

  Runs the master IP setup script with the "start" action; the
  "master-ip-turnup" hooks are run around it by the decorator.

  @type master_params: L{objects.MasterNetworkParameters}
  @param master_params: network parameters of the master
  @type use_external_mip_script: boolean
  @param use_external_mip_script: whether to use an external master IP
    address setup script
  @raise RPCFail: in case of errors during the IP startup

  """
  _RunMasterSetupScript(master_params, action=_MASTER_START,
                        use_external_mip_script=use_external_mip_script)
423
424
def StartMasterDaemons(no_voting):
  """Activate local node as master node.

  The function runs C{daemon-util start-master}, handing any extra daemon
  arguments over through the C{EXTRA_LUXID_ARGS} and C{EXTRA_WCONFD_ARGS}
  environment variables.

  @type no_voting: boolean
  @param no_voting: whether to start the master daemons without a node vote
      but still non-interactively
  @rtype: None
  @raise RPCFail: if the command fails

  """
  daemon_args = "--no-voting --yes-do-it" if no_voting else ""

  env = {
    "EXTRA_LUXID_ARGS": daemon_args,
    "EXTRA_WCONFD_ARGS": daemon_args,
    }

  result = utils.RunCmd([pathutils.DAEMON_UTIL, "start-master"], env=env)
  if result.failed:
    # _Fail already logs the message before raising, so no separate
    # logging.error call is needed (it used to be logged twice)
    _Fail("Can't start Ganeti master: %s", result.output)
452
453
@RunLocalHooks(constants.FAKE_OP_MASTER_TURNDOWN, "master-ip-turndown",
               _BuildMasterIpEnv)
def DeactivateMasterIp(master_params, use_external_mip_script):
  """Deactivate the master IP on this node.

  Runs the master IP setup script with the "stop" action; the
  "master-ip-turndown" hooks are run around it by the decorator.

  @type master_params: L{objects.MasterNetworkParameters}
  @param master_params: network parameters of the master
  @type use_external_mip_script: boolean
  @param use_external_mip_script: whether to use an external master IP
    address setup script
  @raise RPCFail: in case of errors during the IP turndown

  """
  _RunMasterSetupScript(master_params, action=_MASTER_STOP,
                        use_external_mip_script=use_external_mip_script)
469
470
def StopMasterDaemons():
  """Stop the master daemons on this node.

  Runs C{daemon-util stop-master}. A failure of the command is only logged;
  it is not reported back to the caller.

  @rtype: None

  """
  # TODO: log and report back to the caller the error failures; we
  # need to decide in which case we fail the RPC for this

  result = utils.RunCmd([pathutils.DAEMON_UTIL, "stop-master"])
  if not result.failed:
    return

  logging.error("Could not stop Ganeti master, command %s had exitcode %s"
                " and error %s",
                result.cmd, result.exit_code, result.output)
487
488
def ChangeMasterNetmask(old_netmask, netmask, master_ip, master_netdev):
  """Change the netmask of the master IP.

  Nothing is done if both netmasks are equal. Otherwise the address is first
  added with the new netmask and afterwards deleted with the old one.

  @param old_netmask: the old value of the netmask
  @param netmask: the new value of the netmask
  @param master_ip: the master IP
  @param master_netdev: the master network device
  @raise RPCFail: if the master IP is not up on this node or one of the
    "ip address" commands fails

  """
  if old_netmask == netmask:
    return

  if not netutils.IPAddress.Own(master_ip):
    _Fail("The master IP address is not up, not attempting to change its"
          " netmask")

  label = "%s:0" % master_netdev

  def _RunIpAddress(verb, mask):
    """Runs "ip address <verb>" for the master IP with the given netmask.

    """
    return utils.RunCmd([constants.IP_COMMAND_PATH, "address", verb,
                         "%s/%s" % (master_ip, mask),
                         "dev", master_netdev, "label", label])

  if _RunIpAddress("add", netmask).failed:
    _Fail("Could not set the new netmask on the master IP address")

  if _RunIpAddress("del", old_netmask).failed:
    _Fail("Could not bring down the master IP address with the old netmask")
518
519
def EtcHostsModify(mode, host, ip):
  """Modify a host entry in /etc/hosts.

  Invalid requests are rejected before /etc/hosts is touched. (Previously
  the RPCFail exceptions were only instantiated, never raised, so invalid
  requests were silently accepted.)

  @param mode: The mode to operate. Either add or remove entry
    (L{constants.ETC_HOSTS_ADD} or L{constants.ETC_HOSTS_REMOVE})
  @param host: The host to operate on
  @param ip: The ip associated with the entry; required when adding, must
    be empty when removing
  @raise RPCFail: if the mode is unknown or C{ip} does not fit the mode

  """
  if mode == constants.ETC_HOSTS_ADD:
    if not ip:
      _Fail("Mode 'add' needs 'ip' parameter, but parameter not"
            " present")
    utils.AddHostToEtcHosts(host, ip)
  elif mode == constants.ETC_HOSTS_REMOVE:
    if ip:
      _Fail("Mode 'remove' does not allow 'ip' parameter, but"
            " parameter is present")
    utils.RemoveHostFromEtcHosts(host)
  else:
    _Fail("Mode not supported")
540
541
def LeaveCluster(modify_ssh_setup):
  """Cleans up and remove the current node.

  This function cleans up and prepares the current node to be removed
  from the cluster: the data and crypto directories and the job queue are
  cleaned, optionally the SSH keys of the login user are removed, cluster
  secrets are deleted and the confd, mond and kvmd daemons are stopped.
  Problems with the SSH files or the cluster secrets are only logged.

  If processing is successful, then it raises an
  L{errors.QuitGanetiException} which is used as a special case to
  shutdown the node daemon.

  @param modify_ssh_setup: boolean
  @raise errors.QuitGanetiException: at the end of the cleanup

  """
  for directory in (pathutils.DATA_DIR, pathutils.CRYPTO_KEYS_DIR):
    _CleanDirectory(directory)
  JobQueuePurge()

  if modify_ssh_setup:
    try:
      (priv_key, pub_key, auth_keys) = \
        ssh.GetUserFiles(constants.SSH_LOGIN_USER)

      # The public key has to be read before its file is removed below
      ssh.RemoveAuthorizedKey(auth_keys, utils.ReadFile(pub_key))

      for key_file in (priv_key, pub_key):
        utils.RemoveFile(key_file)
    except errors.OpExecError:
      logging.exception("Error while processing ssh files")
    except IOError:
      logging.exception("At least one SSH file was not accessible.")

  try:
    secrets = (
      pathutils.CONFD_HMAC_KEY,
      pathutils.RAPI_CERT_FILE,
      pathutils.SPICE_CERT_FILE,
      pathutils.SPICE_CACERT_FILE,
      pathutils.NODED_CERT_FILE,
      )
    for secret in secrets:
      utils.RemoveFile(secret)
  except: # pylint: disable=W0702
    # Deliberately best-effort: the node is to be shut down in any case
    logging.exception("Error while removing cluster secrets")

  for daemon in (constants.CONFD, constants.MOND, constants.KVMD):
    utils.StopDaemon(daemon)

  # Raise a custom exception (handled in ganeti-noded)
  raise errors.QuitGanetiException(True, "Shutdown scheduled")
587
588
def _CheckStorageParams(params, num_params):
  """Performs sanity checks for storage parameters.

  @type params: list
  @param params: list of storage parameters
  @type num_params: int
  @param num_params: expected number of parameters
  @rtype: None
  @raise errors.ProgrammerError: if C{params} is C{None}, is not a list or
    does not contain exactly C{num_params} elements

  """
  if params is None:
    raise errors.ProgrammerError("No storage parameters for storage"
                                 " reporting is provided.")
  if not isinstance(params, list):
    # Wrap in a one-element tuple, otherwise a tuple passed by mistake would
    # be taken as the list of format arguments
    raise errors.ProgrammerError("The storage parameters are not of type"
                                 " list: '%s'" % (params, ))
  if len(params) != num_params:
    raise errors.ProgrammerError("Did not receive the expected number of"
                                 " storage parameters: expected %s,"
                                 " received '%s'" % (num_params, len(params)))
608
609
def _CheckLvmStorageParams(params):
  """Performs sanity check for the 'exclusive storage' flag.

  The parameters must be a list holding exactly one boolean.

  @see: C{_CheckStorageParams}
  @type params: list
  @param params: list of storage parameters
  @rtype: bool
  @return: the exclusive storage flag
  @raise errors.ProgrammerError: if the parameters are malformed or the flag
    is not a boolean

  """
  _CheckStorageParams(params, 1)
  excl_stor = params[0]
  if not isinstance(excl_stor, bool):
    # One-element tuple: keeps the formatting correct even if the offending
    # value is itself a tuple
    raise errors.ProgrammerError("Exclusive storage parameter is not"
                                 " boolean: '%s'." % (excl_stor, ))
  return excl_stor
622
623
def _GetLvmVgSpaceInfo(name, params):
  """Validates the storage parameters and delegates to C{_GetVgInfo}.

  @type name: string
  @param name: name of the volume group
  @type params: list
  @param params: list of storage parameters, which in this case should be
    containing only one for exclusive storage
  @return: the result of C{_GetVgInfo}

  """
  return _GetVgInfo(name, _CheckLvmStorageParams(params))
636
637
def _GetVgInfo(
    name, excl_stor, info_fn=bdev.LogicalVolume.GetVGInfo):
  """Retrieves information about a LVM volume group.

  @type name: string
  @param name: name of the volume group
  @type excl_stor: bool
  @param excl_stor: exclusive storage flag, passed through to C{info_fn}
  @type info_fn: function
  @param info_fn: function called with a list of VG names and the exclusive
    storage flag; the first two fields of its first result entry are used
    as free and total size
  @rtype: dict
  @return: dictionary with the keys "type", "name", "storage_free" and
    "storage_size"; the last two are integers, or C{None} if C{info_fn}
    returned nothing

  """
  # TODO: GetVGInfo supports returning information for multiple VGs at once
  vginfo = info_fn([name], excl_stor)

  vg_free = vg_size = None
  if vginfo:
    first_vg = vginfo[0]
    vg_free = int(round(first_vg[0], 0))
    vg_size = int(round(first_vg[1], 0))

  return {
    "type": constants.ST_LVM_VG,
    "name": name,
    "storage_free": vg_free,
    "storage_size": vg_size,
    }
658
659
def _GetLvmPvSpaceInfo(name, params):
  """Validates the storage parameters and delegates to C{_GetVgSpindlesInfo}.

  @see: C{_GetLvmVgSpaceInfo}

  """
  return _GetVgSpindlesInfo(name, _CheckLvmStorageParams(params))
668
669
def _GetVgSpindlesInfo(
    name, excl_stor, info_fn=bdev.LogicalVolume.GetVgSpindlesInfo):
  """Retrieves information about spindles in an LVM volume group.

  C{info_fn} is only consulted if exclusive storage is enabled; otherwise
  both values are reported as zero.

  @type name: string
  @param name: VG name
  @type excl_stor: bool
  @param excl_stor: exclusive storage
  @type info_fn: function
  @param info_fn: function returning a (free, total) pair for a VG name
  @rtype: dict
  @return: dictionary whose keys are "type", "name", "storage_free" and
      "storage_size" for the storage type, VG name, free spindles and total
      spindles respectively

  """
  (vg_free, vg_size) = info_fn(name) if excl_stor else (0, 0)

  return {
    "type": constants.ST_LVM_PV,
    "name": name,
    "storage_free": vg_free,
    "storage_size": vg_size,
    }
694
695
def _GetHvInfo(name, hvparams, get_hv_fn=hypervisor.GetHypervisor):
  """Retrieves node information from a hypervisor.

  The information returned is whatever the hypervisor's C{GetNodeInfo}
  method reports and hence depends on the hypervisor. NOTE(review): the
  previous documentation listed vg_size, vg_free, memory_dom0, memory_free,
  memory_total (all in MiB) and hv_version as common items; this cannot be
  verified from this function.

  @type name: string
  @param name: name of the hypervisor
  @type hvparams: dict of string
  @param hvparams: the hypervisor's hvparams
  @type get_hv_fn: function
  @param get_hv_fn: function returning the hypervisor object for a name

  """
  hyper = get_hv_fn(name)
  return hyper.GetNodeInfo(hvparams=hvparams)
713
714
def _GetHvInfoAll(hv_specs, get_hv_fn=hypervisor.GetHypervisor):
  """Retrieves node information for all hypervisors.

  See C{_GetHvInfo} for information on the output.

  @type hv_specs: list of pairs (string, dict of strings)
  @param hv_specs: list of pairs of a hypervisor's name and its hvparams
  @type get_hv_fn: function
  @param get_hv_fn: function returning the hypervisor object for a name
  @rtype: None or list
  @return: C{None} if C{hv_specs} is C{None}, otherwise one C{_GetHvInfo}
    result per entry, in the same order

  """
  if hv_specs is None:
    return None

  return [_GetHvInfo(hvname, hvparams, get_hv_fn)
          for (hvname, hvparams) in hv_specs]
731
732
def _GetNamedNodeInfo(names, fn):
  """Calls C{fn} for all names in C{names} and returns a list.

  @type names: None or iterable
  @param names: the values to call C{fn} with
  @type fn: function
  @param fn: function taking one element of C{names}
  @rtype: None or list
  @return: C{None} if C{names} is C{None}, otherwise the list of results of
    C{fn}, in the order of C{names}

  """
  if names is None:
    return None

  # Explicit list: the result does not depend on what map() returns
  return [fn(name) for name in names]
743
744
def GetNodeInfo(storage_units, hv_specs):
  """Gives back a hash with different information about the node.

  @type storage_units: list of tuples (string, string, list)
  @param storage_units: List of tuples (storage unit, identifier, parameters) to
    ask for disk space information. In case of lvm-vg, the identifier is
    the VG name. The parameters can contain additional, storage-type-specific
    parameters, for example exclusive storage for lvm storage.
  @type hv_specs: list of pairs (string, dict of strings)
  @param hv_specs: list of pairs of a hypervisor's name and its hvparams
  @rtype: tuple; (string, None/list, None/list)
  @return: Tuple containing boot ID, storage information (one entry per
    storage unit) and hypervisor information (one entry per hypervisor)

  """
  def _UnitInfo(unit):
    """Reports the space of a single (type, key, params) storage unit.

    """
    (storage_type, storage_key, storage_params) = unit
    return _ApplyStorageInfoFunction(storage_type, storage_key,
                                     storage_params)

  # Only the first 128 bytes are read; trailing newlines are dropped
  bootid = utils.ReadFile(_BOOT_ID_PATH, size=128).rstrip("\n")
  storage_info = _GetNamedNodeInfo(storage_units, _UnitInfo)
  hv_info = _GetHvInfoAll(hv_specs)

  return (bootid, storage_info, hv_info)
767
768
def _GetFileStorageSpaceInfo(path, params):
  """Wrapper around filestorage.GetFileStorageSpaceInfo.

  The wrapper checks that no storage parameters were passed and then calls
  filestorage.GetFileStorageSpaceInfo with the path only, so that the
  parameters do not leak into the filestorage module's code.

  @see: C{filestorage.GetFileStorageSpaceInfo} for description of the
    parameters.
  @type params: list
  @param params: storage parameters; must be an empty list

  """
  _CheckStorageParams(params, 0)
  space_info = filestorage.GetFileStorageSpaceInfo(path)
  return space_info
782
783
# FIXME: implement storage reporting for all missing storage types.
#: Maps each storage type to the function reporting its free and total
#: space, or to None where no such function exists yet. Consulted by
#: L{_ApplyStorageInfoFunction}, which raises NotImplementedError for the
#: None entries.
_STORAGE_TYPE_INFO_FN = {
  constants.ST_BLOCK: None,
  constants.ST_DISKLESS: None,
  constants.ST_EXT: None,
  constants.ST_FILE: _GetFileStorageSpaceInfo,
  constants.ST_LVM_PV: _GetLvmPvSpaceInfo,
  constants.ST_LVM_VG: _GetLvmVgSpaceInfo,
  constants.ST_SHARED_FILE: None,
  constants.ST_GLUSTER: None,
  constants.ST_RADOS: None,
  }
796
797
def _ApplyStorageInfoFunction(storage_type, storage_key, *args):
  """Looks up and applies the correct function to calculate free and total
  storage for the given storage type.

  @type storage_type: string
  @param storage_type: the storage type for which the storage shall be reported.
  @type storage_key: string
  @param storage_key: identifier of a storage unit, e.g. the volume group name
    of an LVM storage unit
  @type args: any
  @param args: various parameters that can be used for storage reporting. These
    parameters and their semantics vary from storage type to storage type and
    are just propagated in this function.
  @return: the results of the application of the storage space function (see
    _STORAGE_TYPE_INFO_FN) if storage space reporting is implemented for that
    storage type
  @raise NotImplementedError: for storage types who don't support space
    reporting yet
  @raise KeyError: for storage types missing from _STORAGE_TYPE_INFO_FN

  """
  info_fn = _STORAGE_TYPE_INFO_FN[storage_type]
  if info_fn is None:
    raise NotImplementedError

  return info_fn(storage_key, *args)
822
823
def _CheckExclusivePvs(pvi_list):
  """Check that PVs are not shared among LVs

  A PV is reported if more than one LV is listed for it.

  @type pvi_list: list of L{objects.LvmPvInfo} objects
  @param pvi_list: information about the PVs

  @rtype: list of tuples (string, list of strings)
  @return: offending volumes, as tuples: (pv_name, [lv1_name, lv2_name...])

  """
  return [(pvi.name, pvi.lv_list)
          for pvi in pvi_list
          if len(pvi.lv_list) > 1]
839
840
def _VerifyHypervisors(what, vm_capable, result, all_hvparams,
                       get_hv_fn=hypervisor.GetHypervisor):
  """Verifies the hypervisor. Stores the results in the 'result' dictionary.

  For every hypervisor requested under L{constants.NV_HYPERVISOR}, either
  the outcome of its C{Verify} method or an error message is recorded.
  Nothing is done on nodes that are not vm capable.

  @type what: C{dict}
  @param what: a dictionary of things to check
  @type vm_capable: boolean
  @param vm_capable: whether or not this node is vm capable
  @type result: dict
  @param result: dictionary of verification results; results of the
    verifications in this function will be added here
  @type all_hvparams: dict of dict of string
  @param all_hvparams: dictionary mapping hypervisor names to hvparams
  @type get_hv_fn: function
  @param get_hv_fn: function to retrieve the hypervisor, to improve testability

  """
  if not vm_capable or constants.NV_HYPERVISOR not in what:
    return

  hv_results = result[constants.NV_HYPERVISOR] = {}

  for hv_name in what[constants.NV_HYPERVISOR]:
    hvparams = all_hvparams[hv_name]
    try:
      outcome = get_hv_fn(hv_name).Verify(hvparams=hvparams)
    except errors.HypervisorError as err:
      outcome = "Error while checking hypervisor: %s" % str(err)
    hv_results[hv_name] = outcome
870
871
def _VerifyHvparams(what, vm_capable, result,
                    get_hv_fn=hypervisor.GetHypervisor):
  """Verifies the hvparams. Stores the results in the 'result' dictionary.

  Every (source, hypervisor name, hvparams) entry requested under
  L{constants.NV_HVPARAMS} is validated; for each failing entry a
  (source, hypervisor name, error message) tuple is recorded. Nothing is
  done on nodes that are not vm capable.

  @type what: C{dict}
  @param what: a dictionary of things to check
  @type vm_capable: boolean
  @param vm_capable: whether or not this node is vm capable
  @type result: dict
  @param result: dictionary of verification results; results of the
    verifications in this function will be added here
  @type get_hv_fn: function
  @param get_hv_fn: function to retrieve the hypervisor, to improve testability

  """
  if not vm_capable or constants.NV_HVPARAMS not in what:
    return

  failures = result[constants.NV_HVPARAMS] = []

  for (source, hv_name, hvparms) in what[constants.NV_HVPARAMS]:
    try:
      logging.info("Validating hv %s, %s", hv_name, hvparms)
      get_hv_fn(hv_name).ValidateParameters(hvparms)
    except errors.HypervisorError as err:
      failures.append((source, hv_name, str(err)))
898
899
def _VerifyInstanceList(what, vm_capable, result, all_hvparams):
  """Verifies the instance list.

  If requested under L{constants.NV_INSTANCELIST} and the node is vm
  capable, the result of L{GetInstanceList} -- or, should it fail, the error
  message -- is stored in C{result}.

  @type what: C{dict}
  @param what: a dictionary of things to check
  @type vm_capable: boolean
  @param vm_capable: whether or not this node is vm capable
  @type result: dict
  @param result: dictionary of verification results; results of the
    verifications in this function will be added here
  @type all_hvparams: dict of dict of string
  @param all_hvparams: dictionary mapping hypervisor names to hvparams

  """
  if constants.NV_INSTANCELIST not in what or not vm_capable:
    return

  # GetInstanceList can fail
  try:
    instances = GetInstanceList(what[constants.NV_INSTANCELIST],
                                all_hvparams=all_hvparams)
  except RPCFail as err:
    instances = str(err)

  result[constants.NV_INSTANCELIST] = instances
922
923
def _VerifyNodeInfo(what, vm_capable, result, all_hvparams):
  """Verifies the node info.

  If requested under L{constants.NV_HVINFO} and the node is vm capable, the
  node information of the named hypervisor is stored in C{result}.

  @type what: C{dict}
  @param what: a dictionary of things to check
  @type vm_capable: boolean
  @param vm_capable: whether or not this node is vm capable
  @type result: dict
  @param result: dictionary of verification results; results of the
    verifications in this function will be added here
  @type all_hvparams: dict of dict of string
  @param all_hvparams: dictionary mapping hypervisor names to hvparams

  """
  if constants.NV_HVINFO not in what or not vm_capable:
    return

  hvname = what[constants.NV_HVINFO]
  hyper = hypervisor.GetHypervisor(hvname)
  node_info = hyper.GetNodeInfo(hvparams=all_hvparams[hvname])
  result[constants.NV_HVINFO] = node_info
943
944
def _VerifyClientCertificate(cert_file=pathutils.NODED_CLIENT_CERT_FILE):
  """Verify the existence and validity of the client SSL certificate.

  Also, verify that the client certificate is not self-signed. Self-
  signed client certificates stem from Ganeti versions 2.12.0 - 2.12.4
  and should be replaced by client certificates signed by the server
  certificate. Whatever L{utils.IsCertificateSelfSigned} reports for such
  a certificate is passed on to the caller.

  @type cert_file: string
  @param cert_file: path of the client certificate
  @rtype: tuple
  @return: (error code, message) of the first failing check, or
    (None, digest of the certificate) if all checks pass

  """
  if not os.path.exists(cert_file):
    create_cert_cmd = "gnt-cluster renew-crypto --new-node-certificates"
    return (constants.CV_ERROR,
            "The client certificate does not exist. Run '%s' to create"
            " client certificates for all nodes." % create_cert_cmd)

  for check_fn in (utils.VerifyCertificate, utils.IsCertificateSelfSigned):
    (errcode, msg) = check_fn(cert_file)
    if errcode is not None:
      return (errcode, msg)

  # if everything is fine, we return the digest to be compared to the config
  return (None, utils.GetCertificateDigest(cert_filename=cert_file))
971
972
def _VerifySshSetup(node_status_list, my_name, ssh_key_type,
                    ganeti_pub_keys_file=pathutils.SSH_PUB_KEYS):
  """Verifies the state of the SSH key files.

  @type node_status_list: list of tuples
  @param node_status_list: list of nodes of the cluster associated with a
    couple of flags: (uuid, name, is_master_candidate,
    is_potential_master_candidate, online)
  @type my_name: str
  @param my_name: name of this node
  @type ssh_key_type: one of L{constants.SSHK_ALL}
  @param ssh_key_type: type of key used on nodes
  @type ganeti_pub_keys_file: str
  @param ganeti_pub_keys_file: filename of the public keys file
  @rtype: list of str
  @return: one message per detected problem; empty if nothing was found

  """
  if node_status_list is None:
    return ["No node list to check against the pub_key_file received."]

  my_status_list = [status for status in node_status_list
                    if status[1] == my_name]
  if not my_status_list:
    return ["Cannot find node information for node '%s'." % my_name]
  (my_uuid, _, _, potential_master_candidate, _) = my_status_list[0]

  result = []

  if not os.path.exists(ganeti_pub_keys_file):
    result.append("The public key file '%s' does not exist. Consider running"
                  " 'gnt-cluster renew-crypto --new-ssh-keys"
                  " [--no-ssh-key-check]' to fix this." % ganeti_pub_keys_file)
    return result

  pot_mc_uuids = [uuid for (uuid, _, _, _, _) in node_status_list]
  offline_nodes = [uuid for (uuid, _, _, _, online) in node_status_list
                   if not online]
  pub_keys = ssh.QueryPubKeyFile(None, key_file=ganeti_pub_keys_file)

  # Must exist on every path: the authorized_keys check at the end consults
  # it no matter whether this node is a potential master candidate.
  missing_uuids = set()

  if potential_master_candidate:
    # Check that the set of potential master candidates matches the
    # public key file
    pub_uuids_set = set(pub_keys.keys()) - set(offline_nodes)
    pot_mc_uuids_set = set(pot_mc_uuids) - set(offline_nodes)
    if pub_uuids_set != pot_mc_uuids_set:
      unknown_uuids = pub_uuids_set - pot_mc_uuids_set
      if unknown_uuids:
        result.append("The following node UUIDs are listed in the public key"
                      " file on node '%s', but are not potential master"
                      " candidates: %s."
                      % (my_name, ", ".join(unknown_uuids)))
      missing_uuids = pot_mc_uuids_set - pub_uuids_set
      if missing_uuids:
        result.append("The following node UUIDs of potential master candidates"
                      " are missing in the public key file on node %s: %s."
                      % (my_name, ", ".join(missing_uuids)))

    (_, key_files) = \
      ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
    (_, node_pub_key_file) = key_files[ssh_key_type]

    my_keys = pub_keys.get(my_uuid)
    if my_keys is None:
      # This node has no entry at all; indexing would raise KeyError. The
      # situation is normally already reported through missing_uuids above.
      if my_uuid not in missing_uuids:
        result.append("No key of node %s found in the public key file."
                      % my_name)
    else:
      node_pub_key = utils.ReadFile(node_pub_key_file)
      if node_pub_key.strip() not in my_keys:
        result.append("The dsa key of node %s does not match this node's key"
                      " in the pub key file." % my_name)
      if len(my_keys) != 1:
        result.append("There is more than one key for node %s in the public key"
                      " file." % my_name)
  else:
    if len(pub_keys.keys()) > 0:
      result.append("The public key file of node '%s' is not empty, although"
                    " the node is not a potential master candidate."
                    % my_name)

  # Check that all master candidate keys are in the authorized_keys file
  (auth_key_file, _) = \
    ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
  for (uuid, name, mc, _, online) in node_status_list:
    if not online:
      continue
    if uuid in missing_uuids:
      continue
    # A node without an entry in the public key file has no keys that could
    # be looked up in authorized_keys, so treat it as having none.
    node_keys = pub_keys.get(uuid, [])
    if mc:
      for key in node_keys:
        if not ssh.HasAuthorizedKey(auth_key_file, key):
          result.append("A SSH key of master candidate '%s' (UUID: '%s') is"
                        " not in the 'authorized_keys' file of node '%s'."
                        % (name, uuid, my_name))
    else:
      for key in node_keys:
        if name != my_name and ssh.HasAuthorizedKey(auth_key_file, key):
          result.append("A SSH key of normal node '%s' (UUID: '%s') is in the"
                        " 'authorized_keys' file of node '%s'."
                        % (name, uuid, my_name))
        if name == my_name and not ssh.HasAuthorizedKey(auth_key_file, key):
          result.append("A SSH key of normal node '%s' (UUID: '%s') is not"
                        " in the 'authorized_keys' file of itself."
                        % (my_name, uuid))

  return result
1077
1078
def _VerifySshClutter(node_status_list, my_name):
  """Verifies that the 'authorized_keys' files are not cluttered up.

  @type node_status_list: list of tuples
  @param node_status_list: list of nodes of the cluster associated with a
    couple of flags: (uuid, name, is_master_candidate,
    is_potential_master_candidate, online)
  @type my_name: str
  @param my_name: name of this node
  @rtype: list of str
  @return: at most one message, listing every host that occurs more than
    once in the 'authorized_keys' file; empty if there is no clutter

  """
  result = []
  (auth_key_file, _) = \
    ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
  # Take the name by position: the status tuples have five fields, so a
  # fixed-arity unpacking with four targets would raise ValueError.
  node_names = [node_status[1] for node_status in node_status_list]
  multiple_occurrences = ssh.CheckForMultipleKeys(auth_key_file, node_names)
  if multiple_occurrences:
    msg = ("There are hosts which have more than one SSH key stored for the"
           " same user in the 'authorized_keys' file of node %s. This can be"
           " due to an unsuccessful operation which cluttered up the"
           " 'authorized_keys' file. We recommend to clean this up manually. "
           % my_name)
    for host, occ in multiple_occurrences.items():
      msg += "Entry for '%s' in lines %s. " % (host, utils.CommaJoin(occ))
    result.append(msg)

  return result
1106
1107
def VerifyNode(what, cluster_name, all_hvparams):
  """Verify the status of the local node.

  Based on the input L{what} parameter, various checks are done on the
  local node.

  If the I{filelist} key is present, this list of
  files is checksummed and the file/checksum pairs are returned.

  If the I{nodelist} key is present, we check that we have
  connectivity via ssh with the target nodes (and check the hostname
  report).

  If the I{node-net-test} key is present, we check that we have
  connectivity to the given nodes via both primary IP and, if
  applicable, secondary IPs.

  Note that the node list passed under the I{nodelist} key is extended and
  shuffled in place.

  @type what: C{dict}
  @param what: a dictionary of things to check:
      - filelist: list of files for which to compute checksums
      - nodelist: list of nodes we should check ssh communication with
      - node-net-test: list of nodes we should check node daemon port
        connectivity with
      - hypervisor: list with hypervisors to run the verify for
  @type cluster_name: string
  @param cluster_name: the cluster's name
  @type all_hvparams: dict of dict of strings
  @param all_hvparams: a dictionary mapping hypervisor names to hvparams
  @rtype: dict
  @return: a dictionary with the same keys as the input dict, and
      values representing the result of the checks

  """
  result = {}
  my_name = netutils.Hostname.GetSysName()
  port = netutils.GetDaemonPort(constants.NODED)
  vm_capable = my_name not in what.get(constants.NV_NONVMNODES, [])

  _VerifyHypervisors(what, vm_capable, result, all_hvparams)
  _VerifyHvparams(what, vm_capable, result)

  if constants.NV_FILELIST in what:
    fingerprints = utils.FingerprintFiles(
      [vcluster.LocalizeVirtualPath(path)
       for path in what[constants.NV_FILELIST]])
    result[constants.NV_FILELIST] = \
      dict((vcluster.MakeVirtualPath(key), value)
           for (key, value) in fingerprints.items())

  if constants.NV_CLIENT_CERT in what:
    result[constants.NV_CLIENT_CERT] = _VerifyClientCertificate()

  if constants.NV_SSH_SETUP in what:
    node_status_list, key_type = what[constants.NV_SSH_SETUP]
    result[constants.NV_SSH_SETUP] = \
      _VerifySshSetup(node_status_list, my_name, key_type)
    if constants.NV_SSH_CLUTTER in what:
      # The clutter check wants the node status list only, not the whole
      # (node_status_list, key_type) pair stored under NV_SSH_SETUP.
      result[constants.NV_SSH_CLUTTER] = \
        _VerifySshClutter(node_status_list, my_name)

  if constants.NV_NODELIST in what:
    (nodes, bynode, mcs) = what[constants.NV_NODELIST]

    # Add nodes from other groups (different for each node)
    try:
      nodes.extend(bynode[my_name])
    except KeyError:
      pass

    # Use a random order
    random.shuffle(nodes)

    # Try to contact all nodes
    val = {}
    ssh_port_map = ssconf.SimpleStore().GetSshPortMap()
    for node in nodes:
      # We only test if master candidates can communicate to other nodes.
      # We cannot test if normal nodes cannot communicate with other nodes,
      # because the administrator might have installed additional SSH keys,
      # over which Ganeti has no power.
      if my_name in mcs:
        success, message = _GetSshRunner(cluster_name). \
                              VerifyNodeHostname(node, ssh_port_map[node])
        if not success:
          val[node] = message

    result[constants.NV_NODELIST] = val

  if constants.NV_NODENETTEST in what:
    result[constants.NV_NODENETTEST] = tmp = {}
    my_pip = my_sip = None
    for name, pip, sip in what[constants.NV_NODENETTEST]:
      if name == my_name:
        my_pip = pip
        my_sip = sip
        break
    if not my_pip:
      tmp[my_name] = ("Can't find my own primary/secondary IP"
                      " in the node list")
    else:
      for name, pip, sip in what[constants.NV_NODENETTEST]:
        fail = []
        if not netutils.TcpPing(pip, port, source=my_pip):
          fail.append("primary")
        if sip != pip:
          if not netutils.TcpPing(sip, port, source=my_sip):
            fail.append("secondary")
        if fail:
          tmp[name] = ("failure using the %s interface(s)" %
                       " and ".join(fail))

  if constants.NV_MASTERIP in what:
    # FIXME: add checks on incoming data structures (here and in the
    # rest of the function)
    master_name, master_ip = what[constants.NV_MASTERIP]
    if master_name == my_name:
      source = constants.IP4_ADDRESS_LOCALHOST
    else:
      source = None
    result[constants.NV_MASTERIP] = netutils.TcpPing(master_ip, port,
                                                     source=source)

  if constants.NV_USERSCRIPTS in what:
    # Report the scripts that are NOT executable
    result[constants.NV_USERSCRIPTS] = \
      [script for script in what[constants.NV_USERSCRIPTS]
       if not utils.IsExecutable(script)]

  if constants.NV_OOB_PATHS in what:
    # One entry per path, in input order: None if fine, else an error text
    result[constants.NV_OOB_PATHS] = tmp = []
    for path in what[constants.NV_OOB_PATHS]:
      try:
        st = os.stat(path)
      except OSError as err:
        tmp.append("error stating out of band helper: %s" % err)
      else:
        if stat.S_ISREG(st.st_mode):
          if stat.S_IMODE(st.st_mode) & stat.S_IXUSR:
            tmp.append(None)
          else:
            tmp.append("out of band helper %s is not executable" % path)
        else:
          tmp.append("out of band helper %s is not a file" % path)

  if constants.NV_LVLIST in what and vm_capable:
    try:
      val = GetVolumeList(utils.ListVolumeGroups().keys())
    except RPCFail as err:
      val = str(err)
    result[constants.NV_LVLIST] = val

  _VerifyInstanceList(what, vm_capable, result, all_hvparams)

  if constants.NV_VGLIST in what and vm_capable:
    result[constants.NV_VGLIST] = utils.ListVolumeGroups()

  if constants.NV_PVLIST in what and vm_capable:
    check_exclusive_pvs = constants.NV_EXCLUSIVEPVS in what
    val = bdev.LogicalVolume.GetPVInfo(what[constants.NV_PVLIST],
                                       filter_allocatable=False,
                                       include_lvs=check_exclusive_pvs)
    if check_exclusive_pvs:
      result[constants.NV_EXCLUSIVEPVS] = _CheckExclusivePvs(val)
      for pvi in val:
        # Avoid sending useless data on the wire
        pvi.lv_list = []
    result[constants.NV_PVLIST] = [objects.LvmPvInfo.ToDict(pvi)
                                   for pvi in val]

  if constants.NV_VERSION in what:
    result[constants.NV_VERSION] = (constants.PROTOCOL_VERSION,
                                    constants.RELEASE_VERSION)

  _VerifyNodeInfo(what, vm_capable, result, all_hvparams)

  if constants.NV_DRBDVERSION in what and vm_capable:
    try:
      drbd_version = DRBD8.GetProcInfo().GetVersionString()
    except errors.BlockDeviceError as err:
      logging.warning("Can't get DRBD version", exc_info=True)
      # The error text is reported in place of the version
      drbd_version = str(err)
    result[constants.NV_DRBDVERSION] = drbd_version

  if constants.NV_DRBDLIST in what and vm_capable:
    try:
      used_minors = drbd.DRBD8.GetUsedDevs()
    except errors.BlockDeviceError as err:
      logging.warning("Can't get used minors list", exc_info=True)
      used_minors = str(err)
    result[constants.NV_DRBDLIST] = used_minors

  if constants.NV_DRBDHELPER in what and vm_capable:
    status = True
    try:
      payload = drbd.DRBD8.GetUsermodeHelper()
    except errors.BlockDeviceError as err:
      logging.error("Can't get DRBD usermode helper: %s", str(err))
      status = False
      payload = str(err)
    result[constants.NV_DRBDHELPER] = (status, payload)

  if constants.NV_NODESETUP in what:
    result[constants.NV_NODESETUP] = tmpr = []
    if not os.path.isdir("/sys/block") or not os.path.isdir("/sys/class/net"):
      tmpr.append("The sysfs filesytem doesn't seem to be mounted"
                  " under /sys, missing required directories /sys/block"
                  " and /sys/class/net")
    if (not os.path.isdir("/proc/sys") or
        not os.path.isfile("/proc/sysrq-trigger")):
      tmpr.append("The procfs filesystem doesn't seem to be mounted"
                  " under /proc, missing required directory /proc/sys and"
                  " the file /proc/sysrq-trigger")

  if constants.NV_TIME in what:
    result[constants.NV_TIME] = utils.SplitTime(time.time())

  if constants.NV_OSLIST in what and vm_capable:
    result[constants.NV_OSLIST] = DiagnoseOS()

  if constants.NV_BRIDGES in what and vm_capable:
    # Report the bridges that do NOT exist
    result[constants.NV_BRIDGES] = [bridge
                                    for bridge in what[constants.NV_BRIDGES]
                                    if not utils.BridgeExists(bridge)]

  # Only performed by the node whose name was passed under this key
  if what.get(constants.NV_ACCEPTED_STORAGE_PATHS) == my_name:
    result[constants.NV_ACCEPTED_STORAGE_PATHS] = \
        filestorage.ComputeWrongFileStoragePaths()

  if what.get(constants.NV_FILE_STORAGE_PATH):
    pathresult = filestorage.CheckFileStoragePath(
        what[constants.NV_FILE_STORAGE_PATH])
    # The key is only set in the result if the check returned something
    if pathresult:
      result[constants.NV_FILE_STORAGE_PATH] = pathresult

  if what.get(constants.NV_SHARED_FILE_STORAGE_PATH):
    pathresult = filestorage.CheckFileStoragePath(
        what[constants.NV_SHARED_FILE_STORAGE_PATH])
    if pathresult:
      result[constants.NV_SHARED_FILE_STORAGE_PATH] = pathresult

  return result
1346
1347
def GetCryptoTokens(token_requests):
  """Perform actions on the node's cryptographic tokens.

  Every request is validated against the known token types and actions.
  For requests of the SSL digest type, the digest of the client SSL
  certificate is collected; the options dictionary of a request is not
  evaluated here.

  @type token_requests: list of tuples of (string, string, dict), where the
    first string is in constants.CRYPTO_TYPES, the second in
    constants.CRYPTO_ACTIONS. The third parameter is a dictionary of string
    to string.
  @param token_requests: list of requests of cryptographic tokens and actions
    to perform on them. The actions come with a dictionary of options.
  @rtype: list of tuples (string, string)
  @return: list of tuples of the token type and the public crypto token
  @raise errors.ProgrammerError: if a request names an unsupported token
    type or action

  """
  collected = []
  for request in token_requests:
    (kind, operation, _) = request
    if kind not in constants.CRYPTO_TYPES:
      raise errors.ProgrammerError("Token type '%s' not supported." %
                                   kind)
    if operation not in constants.CRYPTO_ACTIONS:
      raise errors.ProgrammerError("Action '%s' is not supported." %
                                   operation)
    if kind != constants.CRYPTO_TYPE_SSL_DIGEST:
      # Valid request, but nothing is produced for this token type
      continue
    collected.append((kind, utils.GetCertificateDigest()))
  return collected
1380
1381
def EnsureDaemon(daemon_name, run):
  """Ensures the given daemon is running or stopped.

  Only daemons on the internal whitelist (currently just
  L{constants.KVMD}) are acted upon; for any other name nothing is done.

  @type daemon_name: string
  @param daemon_name: name of the daemon (e.g., constants.KVMD)

  @type run: bool
  @param run: whether to start or stop the daemon

  @rtype: bool
  @return: 'True' if daemon successfully started/stopped,
    'False' otherwise

  """
  # Refuse anything that is not explicitly whitelisted
  if daemon_name not in [constants.KVMD]:
    return False

  if run:
    return utils.EnsureDaemon(daemon_name)

  return utils.StopDaemon(daemon_name)
1406
1407
def _InitSshUpdateData(data, noded_cert_file, ssconf_store):
  """Fill in the entries common to all SSH update requests.

  The node daemon certificate and the cluster name are stored in C{data},
  which is modified in place.

  @type data: dict
  @param data: dictionary to be filled
  @type noded_cert_file: str
  @param noded_cert_file: path of the file holding the node daemon
    certificate
  @param ssconf_store: object providing C{GetClusterName}

  """
  cert_file_content = utils.ReadFile(noded_cert_file)
  # Only the second element of the extraction result is needed
  (_, certificate) = utils.ExtractX509Certificate(cert_file_content)
  data[constants.SSHS_NODE_DAEMON_CERTIFICATE] = certificate

  data[constants.SSHS_CLUSTER_NAME] = ssconf_store.GetClusterName()
1415
1416
def AddNodeSshKey(node_uuid, node_name,
                  potential_master_candidates,
                  to_authorized_keys=False,
                  to_public_keys=False,
                  get_public_keys=False,
                  pub_key_file=pathutils.SSH_PUB_KEYS,
                  ssconf_store=None,
                  noded_cert_file=pathutils.NODED_CERT_FILE,
                  run_cmd_fn=ssh.RunSshCmdWithStdin):
  """Distributes a single node's public SSH key across the cluster.

  This is a convenience wrapper which packs its arguments into one
  L{SshAddNodeInfo} and delegates to L{AddNodeSshKeyBulk}.

  Note that this function should only be executed on the master node, which
  then will copy the new node's key to all nodes in the cluster via SSH.

  Also note: at least one of the flags C{to_authorized_keys},
  C{to_public_keys}, and C{get_public_keys} has to be set to C{True} for
  the function to actually perform any actions.

  @type node_uuid: str
  @param node_uuid: the UUID of the node whose key is added
  @type node_name: str
  @param node_name: the name of the node whose key is added
  @type potential_master_candidates: list of str
  @param potential_master_candidates: list of node names of potential master
    candidates; this should match the list of uuids in the public key file
  @type to_authorized_keys: boolean
  @param to_authorized_keys: whether the key should be added to the
    C{authorized_keys} file of all nodes
  @type to_public_keys: boolean
  @param to_public_keys: whether the keys should be added to the public key file
  @type get_public_keys: boolean
  @param get_public_keys: whether the node should add the clusters' public keys
    to its {ganeti_pub_keys} file
  @return: whatever L{AddNodeSshKeyBulk} returns for the one-element list

  """
  single_node = SshAddNodeInfo(uuid=node_uuid,
                               name=node_name,
                               to_authorized_keys=to_authorized_keys,
                               to_public_keys=to_public_keys,
                               get_public_keys=get_public_keys)
  return AddNodeSshKeyBulk([single_node],
                           potential_master_candidates,
                           pub_key_file=pub_key_file,
                           ssconf_store=ssconf_store,
                           noded_cert_file=noded_cert_file,
                           run_cmd_fn=run_cmd_fn)
1462
1463
# Per-node request record for AddNodeSshKeyBulk: identifies the node (uuid,
# name) and carries the three flags selecting what is done with its key.
SshAddNodeInfo = collections.namedtuple(
  "SshAddNodeInfo",
  "uuid name to_authorized_keys to_public_keys get_public_keys")
1472
1473
def AddNodeSshKeyBulk(node_list,
                      potential_master_candidates,
                      pub_key_file=pathutils.SSH_PUB_KEYS,
                      ssconf_store=None,
                      noded_cert_file=pathutils.NODED_CERT_FILE,
                      run_cmd_fn=ssh.RunSshCmdWithStdin):
  """Distributes the public SSH keys of several nodes across the cluster.

  Note that this function should only be executed on the master node, which
  then will copy the new node's key to all nodes in the cluster via SSH.

  Also note: at least one of the flags C{to_authorized_keys},
  C{to_public_keys}, and C{get_public_keys} has to be set to C{True} for
  the function to actually perform any actions.

  @type node_list: list of SshAddNodeInfo tuples
  @param node_list: list of tuples containing the necessary node information for
    adding their keys
  @type potential_master_candidates: list of str
  @param potential_master_candidates: list of node names of potential master
    candidates; this should match the list of uuids in the public key file
  @type pub_key_file: str
  @param pub_key_file: filename of the public keys file
  @param ssconf_store: store used to query cluster name, node lists and SSH
    ports; a L{ssconf.SimpleStore} is created if none is given
  @type noded_cert_file: str
  @param noded_cert_file: path of the node daemon certificate file
  @param run_cmd_fn: function used to run the SSH update on a remote node
  @rtype: list of tuples (string, string)
  @return: list of (node name, error message) pairs for potential master
    candidates that could not be updated
  @raise errors.SshUpdateError: if no key is found for a node whose key is
    to be added to the public key file, or if updating one of the target
    nodes themselves fails

  """
  # whether there are any keys to be added or retrieved at all
  to_authorized_keys = any(node_info.to_authorized_keys
                           for node_info in node_list)
  to_public_keys = any(node_info.to_public_keys for node_info in node_list)

  if not ssconf_store:
    ssconf_store = ssconf.SimpleStore()

  for node_info in node_list:
    # replacement not necessary for keys that are not supposed to be in the
    # list of public keys
    if not node_info.to_public_keys:
      continue
    # Check and fix sanity of key file
    keys_by_name = ssh.QueryPubKeyFile([node_info.name], key_file=pub_key_file)
    keys_by_uuid = ssh.QueryPubKeyFile([node_info.uuid], key_file=pub_key_file)

    if (not keys_by_name or node_info.name not in keys_by_name) \
        and (not keys_by_uuid or node_info.uuid not in keys_by_uuid):
      raise errors.SshUpdateError(
        "No keys found for the new node '%s' (UUID %s) in the list of public"
        " SSH keys, neither for the name or the UUID" %
        (node_info.name, node_info.uuid))

    if node_info.name in keys_by_name:
      # Replace the name by UUID in the file as the name should only be used
      # temporarily
      ssh.ReplaceNameByUuid(node_info.uuid, node_info.name,
                            error_fn=errors.SshUpdateError,
                            key_file=pub_key_file)

  # Retrieve updated map of UUIDs to keys
  keys_by_uuid = ssh.QueryPubKeyFile(
      [node_info.uuid for node_info in node_list], key_file=pub_key_file)

  # Update the master node's key files
  (auth_key_file, _) = \
    ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
  for node_info in node_list:
    if node_info.to_authorized_keys:
      ssh.AddAuthorizedKeys(auth_key_file, keys_by_uuid[node_info.uuid])

  base_data = {}
  _InitSshUpdateData(base_data, noded_cert_file, ssconf_store)
  cluster_name = base_data[constants.SSHS_CLUSTER_NAME]

  ssh_port_map = ssconf_store.GetSshPortMap()

  # Update the target nodes themselves
  for node_info in node_list:
    logging.debug("Updating SSH key files of target node '%s'.", node_info.name)
    if node_info.get_public_keys:
      node_data = {}
      _InitSshUpdateData(node_data, noded_cert_file, ssconf_store)
      all_keys = ssh.QueryPubKeyFile(None, key_file=pub_key_file)
      node_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
        (constants.SSHS_OVERRIDE, all_keys)

      try:
        utils.RetryByNumberOfTimes(
            constants.SSHS_MAX_RETRIES,
            errors.SshUpdateError,
            run_cmd_fn, cluster_name, node_info.name, pathutils.SSH_UPDATE,
            ssh_port_map.get(node_info.name), node_data,
            debug=False, verbose=False, use_cluster_key=False,
            ask_key=False, strict_host_check=False)
      except errors.SshUpdateError:
        # Clean up the master's public key file if adding key fails; this
        # has to target the same file the key was looked up in.
        if node_info.to_public_keys:
          ssh.RemovePublicKey(node_info.uuid, key_file=pub_key_file)
        # Bare raise keeps the original traceback
        raise

  # Update all nodes except master and the target nodes
  keys_by_uuid_auth = ssh.QueryPubKeyFile(
      [node_info.uuid for node_info in node_list
       if node_info.to_authorized_keys],
      key_file=pub_key_file)
  if to_authorized_keys:
    base_data[constants.SSHS_SSH_AUTHORIZED_KEYS] = \
      (constants.SSHS_ADD, keys_by_uuid_auth)

  # Potential master candidates additionally receive the public keys
  pot_mc_data = base_data.copy()
  keys_by_uuid_pub = ssh.QueryPubKeyFile(
      [node_info.uuid for node_info in node_list
       if node_info.to_public_keys],
      key_file=pub_key_file)
  if to_public_keys:
    pot_mc_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
      (constants.SSHS_REPLACE_OR_ADD, keys_by_uuid_pub)

  all_nodes = ssconf_store.GetNodeList()
  master_node = ssconf_store.GetMasterNode()
  online_nodes = ssconf_store.GetOnlineNodeList()

  # Names of all nodes whose keys are being added, for error reporting. The
  # loop variable of the loops above must not be used for this: it would
  # only ever name the last node (and be undefined for an empty list).
  added_node_names = ", ".join(node_info.name for node_info in node_list)

  node_errors = []
  for node in all_nodes:
    if node == master_node:
      logging.debug("Skipping master node '%s'.", master_node)
      continue
    if node not in online_nodes:
      logging.debug("Skipping offline node '%s'.", node)
      continue
    if node in potential_master_candidates:
      logging.debug("Updating SSH key files of node '%s'.", node)
      try:
        utils.RetryByNumberOfTimes(
            constants.SSHS_MAX_RETRIES,
            errors.SshUpdateError,
            run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
            ssh_port_map.get(node), pot_mc_data,
            debug=False, verbose=False, use_cluster_key=False,
            ask_key=False, strict_host_check=False)
      except errors.SshUpdateError as last_exception:
        error_msg = ("When adding the key of node '%s', updating SSH key"
                     " files of node '%s' failed after %s retries."
                     " Not trying again. Last error was: %s." %
                     (added_node_names, node, constants.SSHS_MAX_RETRIES,
                      last_exception))
        node_errors.append((node, error_msg))
        # We only log the error and don't throw an exception, because
        # one unreachable node shall not abort the entire procedure.
        logging.error(error_msg)

    else:
      # Normal nodes only ever get the authorized keys, without retries
      if to_authorized_keys:
        run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
                   ssh_port_map.get(node), base_data,
                   debug=False, verbose=False, use_cluster_key=False,
                   ask_key=False, strict_host_check=False)

  return node_errors
1629
1630
def RemoveNodeSshKey(node_uuid, node_name,
                     master_candidate_uuids,
                     potential_master_candidates,
                     master_uuid=None,
                     keys_to_remove=None,
                     from_authorized_keys=False,
                     from_public_keys=False,
                     clear_authorized_keys=False,
                     clear_public_keys=False,
                     pub_key_file=pathutils.SSH_PUB_KEYS,
                     ssconf_store=None,
                     noded_cert_file=pathutils.NODED_CERT_FILE,
                     readd=False,
                     run_cmd_fn=ssh.RunSshCmdWithStdin):
  """Removes a single node's SSH keys from the key files and distributes those.

  This is a convenience wrapper which packs the per-node arguments into one
  L{SshRemoveNodeInfo} and delegates to L{RemoveNodeSshKeyBulk}.

  Note that at least one of the flags C{from_authorized_keys},
  C{from_public_keys}, C{clear_authorized_keys}, and C{clear_public_keys}
  has to be set to C{True} for the function to perform any action at all.
  Not doing so makes L{RemoveNodeSshKeyBulk} raise an
  L{errors.SshUpdateError}.

  @type node_uuid: str
  @param node_uuid: UUID of the node whose key is removed
  @type node_name: str
  @param node_name: name of the node whose key is removed
  @type master_candidate_uuids: list of str
  @param master_candidate_uuids: list of UUIDs of the current master candidates
  @type potential_master_candidates: list of str
  @param potential_master_candidates: list of names of potential master
    candidates
  @type keys_to_remove: dict of str to list of str
  @param keys_to_remove: a dictionary mapping node UUIDS to lists of SSH keys
    to be removed. This list is supposed to be used only if the keys are not
    in the public keys file. This is for example the case when removing a
    master node's key.
  @type from_authorized_keys: boolean
  @param from_authorized_keys: whether or not the key should be removed
    from the C{authorized_keys} file
  @type from_public_keys: boolean
  @param from_public_keys: whether or not the key should be removed from
    the C{ganeti_pub_keys} file
  @type clear_authorized_keys: boolean
  @param clear_authorized_keys: whether or not the C{authorized_keys} file
    should be cleared on the node whose keys are removed
  @type clear_public_keys: boolean
  @param clear_public_keys: whether to clear the node's C{ganeti_pub_key} file
  @type readd: boolean
  @param readd: whether this is called during a readd operation.
  @rtype: list of string
  @returns: list of feedback messages

  """
  single_node = SshRemoveNodeInfo(
    uuid=node_uuid,
    name=node_name,
    from_authorized_keys=from_authorized_keys,
    from_public_keys=from_public_keys,
    clear_authorized_keys=clear_authorized_keys,
    clear_public_keys=clear_public_keys)
  return RemoveNodeSshKeyBulk([single_node],
                              master_candidate_uuids,
                              potential_master_candidates,
                              master_uuid=master_uuid,
                              keys_to_remove=keys_to_remove,
                              pub_key_file=pub_key_file,
                              ssconf_store=ssconf_store,
                              noded_cert_file=noded_cert_file,
                              readd=readd,
                              run_cmd_fn=run_cmd_fn)
1699
1700
# Per-node request record for RemoveNodeSshKeyBulk: identifies the node
# (uuid, name) and carries the four flags selecting which key files are
# affected.
SshRemoveNodeInfo = collections.namedtuple(
  "SshRemoveNodeInfo",
  "uuid name from_authorized_keys from_public_keys"
  " clear_authorized_keys clear_public_keys")
1710
1711
1712 def RemoveNodeSshKeyBulk(node_list,
1713 master_candidate_uuids,
1714 potential_master_candidates,
1715 master_uuid=None,
1716 keys_to_remove=None,
1717 pub_key_file=pathutils.SSH_PUB_KEYS,
1718 ssconf_store=None,
1719 noded_cert_file=pathutils.NODED_CERT_FILE,
1720 readd=False,
1721 run_cmd_fn=ssh.RunSshCmdWithStdin):
1722 """Removes the node's SSH keys from the key files and distributes those.
1723
1724 Note that at least one of the flags C{from_authorized_keys},
1725 C{from_public_keys}, C{clear_authorized_keys}, and C{clear_public_keys}
1726 of at least one node has to be set to C{True} for the function to perform any
1727 action at all. Not doing so will trigger an assertion in the function.
1728
1729 @type node_list: list of C{SshRemoveNodeInfo}.
1730 @param node_list: list of information about nodes whose keys are being removed
1731 @type master_candidate_uuids: list of str
1732 @param master_candidate_uuids: list of UUIDs of the current master candidates
1733 @type potential_master_candidates: list of str
1734 @param potential_master_candidates: list of names of potential master
1735 candidates
1736 @type keys_to_remove: dict of str to list of str
1737 @param keys_to_remove: a dictionary mapping node UUIDS to lists of SSH keys
1738 to be removed. This list is supposed to be used only if the keys are not
1739 in the public keys file. This is for example the case when removing a
1740 master node's key.
1741 @type readd: boolean
1742 @param readd: whether this is called during a readd operation.
1743 @rtype: list of string
1744 @returns: list of feedback messages
1745
1746 """
1747 # Non-disruptive error messages, list of (node, msg) pairs
1748 result_msgs = []
1749
1750 # whether there are any keys to be added or retrieved at all
1751 from_authorized_keys = any([node_info.from_authorized_keys for node_info in
1752 node_list])
1753 from_public_keys = any([node_info.from_public_keys for node_info in
1754 node_list])
1755 clear_authorized_keys = any([node_info.clear_authorized_keys for node_info in
1756 node_list])
1757 clear_public_keys = any([node_info.clear_public_keys for node_info in
1758 node_list])
1759
1760 # Make sure at least one of these flags is true.
1761 if not (from_authorized_keys or from_public_keys or clear_authorized_keys
1762 or clear_public_keys):
1763 raise errors.SshUpdateError("No removal from any key file was requested.")
1764
1765 if not ssconf_store:
1766 ssconf_store = ssconf.SimpleStore()
1767
1768 master_node = ssconf_store.GetMasterNode()
1769 ssh_port_map = ssconf_store.GetSshPortMap()
1770
1771 all_keys_to_remove = {}
1772 if from_authorized_keys or from_public_keys:
1773 for node_info in node_list:
1774 # Skip nodes that don't actually need any keys to be removed.
1775 if not (node_info.from_authorized_keys or node_info.from_public_keys):
1776 continue
1777 if node_info.name == master_node and not keys_to_remove:
1778 raise errors.SshUpdateError("Cannot remove the master node's keys.")
1779 if keys_to_remove:
1780 keys = keys_to_remove
1781 else:
1782 keys = ssh.QueryPubKeyFile([node_info.uuid], key_file=pub_key_file)
1783 if (not keys or node_info.uuid not in keys) and not readd:
1784 raise errors.SshUpdateError("Node '%s' not found in the list of"
1785 " public SSH keys. It seems someone"
1786 " tries to remove a key from outside"
1787 " the cluster!" % node_info.uuid)
1788 # During an upgrade all nodes have the master key. In this case we
1789 # should not remove it to avoid accidentally shutting down cluster
1790 # SSH communication
1791 master_keys = None
1792 if master_uuid:
1793 master_keys = ssh.QueryPubKeyFile([master_uuid],
1794 key_file=pub_key_file)
1795 for master_key in master_keys:
1796 if master_key in keys[node_info.uuid]:
1797 keys[node_info.uuid].remove(master_key)
1798
1799 all_keys_to_remove.update(keys)
1800
1801 if all_keys_to_remove:
1802 base_data = {}
1803 _InitSshUpdateData(base_data, noded_cert_file, ssconf_store)
1804 cluster_name = base_data[constants.SSHS_CLUSTER_NAME]
1805
1806 if from_authorized_keys:
1807 # UUIDs of nodes that are supposed to be removed from the
1808 # authorized_keys files.
1809 nodes_remove_from_authorized_keys = [
1810 node_info.uuid for node_info in node_list
1811 if node_info.from_authorized_keys]
1812 keys_to_remove_from_authorized_keys = dict([
1813 (uuid, keys) for (uuid, keys) in all_keys_to_remove.items()
1814 if uuid in nodes_remove_from_authorized_keys])
1815 base_data[constants.SSHS_SSH_AUTHORIZED_KEYS] = \
1816 (constants.SSHS_REMOVE, keys_to_remove_from_authorized_keys)
1817 (auth_key_file, _) = \
1818 ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False,
1819 dircheck=False)
1820
1821 for uuid in nodes_remove_from_authorized_keys:
1822 ssh.RemoveAuthorizedKeys(auth_key_file,
1823 keys_to_remove_from_authorized_keys[uuid])
1824
1825 pot_mc_data = base_data.copy()
1826
1827 if from_public_keys:
1828 nodes_remove_from_public_keys = [
1829 node_info.uuid for node_info in node_list
1830 if node_info.from_public_keys]
1831 keys_to_remove_from_public_keys = dict([
1832 (uuid, keys) for (uuid, keys) in all_keys_to_remove.items()
1833 if uuid in nodes_remove_from_public_keys])
1834 pot_mc_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1835 (constants.SSHS_REMOVE, keys_to_remove_from_public_keys)
1836
1837 all_nodes = ssconf_store.GetNodeList()
1838 online_nodes = ssconf_store.GetOnlineNodeList()
1839 all_nodes_to_remove = [node_info.name for node_info in node_list]
1840 logging.debug("Removing keys of nodes '%s' from all nodes but itself and"
1841 " master.", ", ".join(all_nodes_to_remove))
1842 for node in all_nodes:
1843 if node == master_node:
1844 logging.debug("Skipping master node '%s'.", master_node)
1845 continue
1846 if node not in online_nodes:
1847 logging.debug("Skipping offline node '%s'.", node)
1848 continue
1849 if node in all_nodes_to_remove:
1850 logging.debug("Skipping node whose key is removed itself '%s'.", node)
1851 continue
1852 ssh_port = ssh_port_map.get(node)
1853 if not ssh_port:
1854 raise errors.OpExecError("No SSH port information available for"
1855 " node '%s', map: %s." %
1856 (node, ssh_port_map))
1857 error_msg_final = ("When removing the key of node '%s', updating the"
1858 " SSH key files of node '%s' failed. Last error"
1859 " was: %s.")
1860 if node in potential_master_candidates:
1861 logging.debug("Updating key setup of potential master candidate node"
1862 " %s.", node)
1863 try:
1864 utils.RetryByNumberOfTimes(
1865 constants.SSHS_MAX_RETRIES,
1866 errors.SshUpdateError,
1867 run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
1868 ssh_port, pot_mc_data,
1869 debug=False, verbose=False, use_cluster_key=False,
1870 ask_key=False, strict_host_check=False)
1871 except errors.SshUpdateError as last_exception:
1872 error_msg = error_msg_final % (
1873 node_info.name, node, last_exception)
1874 result_msgs.append((node, error_msg))
1875 logging.error(error_msg)
1876
1877 else:
1878 if from_authorized_keys:
1879 logging.debug("Updating key setup of normal node %s.", node)
1880 try:
1881 utils.RetryByNumberOfTimes(
1882 constants.SSHS_MAX_RETRIES,
1883 errors.SshUpdateError,
1884 run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
1885 ssh_port, base_data,
1886 debug=False, verbose=False, use_cluster_key=False,
1887 ask_key=False, strict_host_check=False)
1888 except errors.SshUpdateError as last_exception:
1889 error_msg = error_msg_final % (
1890 node_info.name, node, last_exception)
1891 result_msgs.append((node, error_msg))
1892 logging.error(error_msg)
1893
1894 for node_info in node_list:
1895 if node_info.clear_authorized_keys or node_info.from_public_keys or \
1896 node_info.clear_public_keys:
1897 data = {}
1898 _InitSshUpdateData(data, noded_cert_file, ssconf_store)
1899 cluster_name = data[constants.SSHS_CLUSTER_NAME]
1900 ssh_port = ssh_port_map.get(node_info.name)
1901 if not ssh_port:
1902 raise errors.OpExecError("No SSH port information available for"
1903 " node '%s', which is leaving the cluster.")
1904
1905 if node_info.clear_authorized_keys:
1906 # The 'authorized_keys' file is not solely managed by Ganeti. Therefore,
1907 # we have to specify exactly which keys to clear to leave keys untouched
1908 # that were not added by Ganeti.
1909 other_master_candidate_uuids = [uuid for uuid in master_candidate_uuids
1910 if uuid != node_info.uuid]
1911 candidate_keys = ssh.QueryPubKeyFile(other_master_candidate_uuids,
1912 key_file=pub_key_file)
1913 data[constants.SSHS_SSH_AUTHORIZED_KEYS] = \
1914 (constants.SSHS_REMOVE, candidate_keys)
1915
1916 if node_info.clear_public_keys:
1917 data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1918 (constants.SSHS_CLEAR, {})
1919 elif node_info.from_public_keys:
1920 # Since clearing the public keys subsumes removing just a single key,
1921 # we only do it if clear_public_keys is 'False'.
1922
1923 if all_keys_to_remove:
1924 data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1925 (constants.SSHS_REMOVE, all_keys_to_remove)
1926
1927 # If we have no changes to any keyfile, just return
1928 if not (constants.SSHS_SSH_PUBLIC_KEYS in data or
1929 constants.SSHS_SSH_AUTHORIZED_KEYS in data):
1930 return
1931
1932 logging.debug("Updating SSH key setup of target node '%s'.",
1933 node_info.name)
1934 try:
1935 utils.RetryByNumberOfTimes(
1936 constants.SSHS_MAX_RETRIES,
1937 errors.SshUpdateError,
1938 run_cmd_fn, cluster_name, node_info.name, pathutils.SSH_UPDATE,
1939 ssh_port, data,
1940 debug=False, verbose=False, use_cluster_key=False,
1941 ask_key=False, strict_host_check=False)
1942 except errors.SshUpdateError as last_exception:
1943 result_msgs.append(
1944 (node_info.name,
1945 ("Removing SSH keys from node '%s' failed."
1946 " This can happen when the node is already unreachable."
1947 " Error: %s" % (node_info.name, last_exception))))
1948
1949 if all_keys_to_remove and from_public_keys:
1950 for node_uuid in nodes_remove_from_public_keys:
1951 ssh.RemovePublicKey(node_uuid, key_file=pub_key_file)
1952
1953 return result_msgs
1954
1955
def _GenerateNodeSshKey(node_uuid, node_name, ssh_port_map, ssh_key_type,
                        ssh_key_bits, pub_key_file=pathutils.SSH_PUB_KEYS,
                        ssconf_store=None,
                        noded_cert_file=pathutils.NODED_CERT_FILE,
                        run_cmd_fn=ssh.RunSshCmdWithStdin,
                        suffix=""):
  """Requests the generation of a new root SSH key pair on a node.

  The node must already have an entry in the public key file; the request
  itself is sent by running C{pathutils.SSH_UPDATE} on the node through
  C{run_cmd_fn}.

  @type node_uuid: str
  @param node_uuid: UUID of the node whose key is generated
  @type node_name: str
  @param node_name: name of the node whose key is generated
  @type ssh_port_map: dict of str to int
  @param ssh_port_map: mapping of node names to their SSH port
  @type ssh_key_type: One of L{constants.SSHK_ALL}
  @param ssh_key_type: the type of SSH key to be generated
  @type ssh_key_bits: int
  @param ssh_key_bits: the length of the key to be generated
  @type pub_key_file: str
  @param pub_key_file: public key file in which the node is looked up
  @param ssconf_store: ssconf store to use; a L{ssconf.SimpleStore} is
    created when none is given
  @type noded_cert_file: str
  @param noded_cert_file: certificate file handed to L{_InitSshUpdateData}
  @type run_cmd_fn: function
  @param run_cmd_fn: function used to run the update command on the node
  @type suffix: str
  @param suffix: third element of the C{constants.SSHS_GENERATE} request
  @raise errors.SshUpdateError: if the node has no entry in the public
    key file

  """
  if not ssconf_store:
    ssconf_store = ssconf.SimpleStore()

  # Only nodes that are already known to the public key file may get a
  # freshly generated key.
  registered_keys = ssh.QueryPubKeyFile([node_uuid], key_file=pub_key_file)
  if not (registered_keys and node_uuid in registered_keys):
    raise errors.SshUpdateError("Node %s (UUID: %s) whose key is requested to"
                                " be regenerated is not registered in the"
                                " public keys file." % (node_name, node_uuid))

  update_data = {}
  _InitSshUpdateData(update_data, noded_cert_file, ssconf_store)
  cluster_name = update_data[constants.SSHS_CLUSTER_NAME]
  update_data[constants.SSHS_GENERATE] = (ssh_key_type, ssh_key_bits, suffix)

  ssh_port = ssh_port_map.get(node_name)
  run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE,
             ssh_port, update_data,
             debug=False, verbose=False, use_cluster_key=False,
             ask_key=False, strict_host_check=False)
1994
1995
1996 def _GetMasterNodeUUID(node_uuid_name_map, master_node_name):
1997 master_node_uuids = [node_uuid for (node_uuid, node_name)
1998 in node_uuid_name_map
1999 if node_name == master_node_name]
2000 if len(master_node_uuids) != 1:
2001 raise errors.SshUpdateError("No (unique) master UUID found. Master node"
2002 " name: '%s', Master UUID: '%s'" %
2003 (master_node_name, master_node_uuids))
2004 return master_node_uuids[0]
2005
2006
def _GetOldMasterKeys(master_node_uuid, pub_key_file):
  """Looks up the master node's current entries in the public key file.

  @type master_node_uuid: str
  @param master_node_uuid: UUID of the master node
  @type pub_key_file: str
  @param pub_key_file: public key file to query
  @return: the result of L{ssh.QueryPubKeyFile} for the master's UUID
  @raise errors.SshUpdateError: if the query result is empty

  """
  master_keys = ssh.QueryPubKeyFile([master_node_uuid], key_file=pub_key_file)
  if master_keys:
    return master_keys

  raise errors.SshUpdateError("No public key of the master node (UUID '%s')"
                              " found, not generating a new key."
                              % master_node_uuid)
2015
2016
def _GetNewMasterKey(root_keyfiles, master_node_uuid):
  """Reads the temporary (suffixed) public keys of the master node.

  For every public key file in C{root_keyfiles}, the sibling file whose
  name carries C{constants.SSHS_MASTER_SUFFIX} before the C{.pub}
  extension is read if it exists.

  @type root_keyfiles: dict
  @param root_keyfiles: values are (private key file, public key file)
    pairs
  @type master_node_uuid: str
  @param master_node_uuid: UUID used as key of the returned dictionary
  @rtype: dict
  @return: C{{master_node_uuid: [contents of each temporary key file]}}
  @raise errors.SshUpdateError: if no temporary public key file exists

  """
  found_keys = []
  for (_, pub_file) in root_keyfiles.values():
    stem = os.path.splitext(os.path.basename(pub_file))[0]
    tmp_name = stem + constants.SSHS_MASTER_SUFFIX + ".pub"
    tmp_path = os.path.join(os.path.dirname(pub_file), tmp_name)
    # for some key types, there might not be any keys
    if not os.path.exists(tmp_path):
      continue
    found_keys.append(utils.ReadFile(tmp_path))

  if found_keys:
    return {master_node_uuid: found_keys}

  raise errors.SshUpdateError("Cannot find any type of temporary SSH key.")
2033
2034
def _ReplaceMasterKeyOnMaster(root_keyfiles):
  """Moves the temporary (suffixed) key pairs over the regular key files.

  For every key pair in C{root_keyfiles} the existing public and private
  key files are backed up and removed; afterwards the files carrying
  C{constants.SSHS_MASTER_SUFFIX} are moved into their place, provided
  both halves of the temporary pair exist.

  @type root_keyfiles: dict
  @param root_keyfiles: values are (private key file, public key file)
    pairs
  @raise errors.SshUpdateError: if not a single key pair was moved

  """
  moved_pairs = 0
  for (priv_file, pub_file) in root_keyfiles.values():
    key_dir = os.path.dirname(pub_file)
    tmp_priv_name = os.path.basename(priv_file) + constants.SSHS_MASTER_SUFFIX
    tmp_priv_path = os.path.join(key_dir, tmp_priv_name)
    tmp_pub_path = os.path.join(key_dir, tmp_priv_name + ".pub")

    # Back up and drop the current files, public key first.
    for current_file in (pub_file, priv_file):
      if os.path.exists(current_file):
        utils.CreateBackup(current_file)
        utils.RemoveFile(current_file)

    # for some key types, there might not be any keys
    if not (os.path.exists(tmp_pub_path) and os.path.exists(tmp_priv_path)):
      continue

    shutil.move(tmp_pub_path, pub_file)
    shutil.move(tmp_priv_path, priv_file)
    moved_pairs += 1

  if moved_pairs == 0:
    raise errors.SshUpdateError("Could not move at least one master SSH key.")
2060
2061
def RenewSshKeys(node_uuids, node_names, master_candidate_uuids,
                 potential_master_candidates, old_key_type, new_key_type,
                 new_key_bits,
                 ganeti_pub_keys_file=pathutils.SSH_PUB_KEYS,
                 ssconf_store=None,
                 noded_cert_file=pathutils.NODED_CERT_FILE,
                 run_cmd_fn=ssh.RunSshCmdWithStdin):
  """Renews all SSH keys and updates authorized_keys and ganeti_pub_keys.

  The non-master nodes are processed first (old keys removed, new keys
  generated, fetched and distributed); the master node's key is replaced
  last, via a temporary key that is only moved into place after it has
  been distributed.

  @type node_uuids: list of str
  @param node_uuids: list of node UUIDs whose keys should be renewed
  @type node_names: list of str
  @param node_names: list of node names whose keys should be renewed. This
    list should match the C{node_uuids} parameter
  @type master_candidate_uuids: list of str
  @param master_candidate_uuids: list of UUIDs of master candidates or
    master node
  @type potential_master_candidates: list of str
  @param potential_master_candidates: names of the nodes that are potential
    master candidates
  @type old_key_type: One of L{constants.SSHK_ALL}
  @param old_key_type: the type of SSH key already present on nodes
  @type new_key_type: One of L{constants.SSHK_ALL}
  @param new_key_type: the type of SSH key to be generated
  @type new_key_bits: int
  @param new_key_bits: the length of the key to be generated
  @type ganeti_pub_keys_file: str
  @param ganeti_pub_keys_file: file path of the public key file
  @param ssconf_store: ssconf store to use; a L{ssconf.SimpleStore} is
    created when none is given
  @type noded_cert_file: str
  @param noded_cert_file: path of the noded SSL certificate file
  @type run_cmd_fn: function
  @param run_cmd_fn: function to run commands on remote nodes via SSH
  @rtype: list
  @return: the node errors reported by the add/remove helpers that did not
    abort the procedure, in the order in which they were collected
  @raises ProgrammerError: if node_uuids and node_names don't match;
    SshUpdateError if a node's key is missing from the public key file,
    if a node's new SSH key could not be fetched from it, if there is
    none or more than one entry in the public key list for the master
    node.

  """
  if not ssconf_store:
    ssconf_store = ssconf.SimpleStore()
  cluster_name = ssconf_store.GetClusterName()

  if len(node_uuids) != len(node_names):
    raise errors.ProgrammerError("List of nodes UUIDs and node names"
                                 " does not match in length.")

  (_, root_keyfiles) = \
    ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
  (_, old_pub_keyfile) = root_keyfiles[old_key_type]
  (_, new_pub_keyfile) = root_keyfiles[new_key_type]
  old_master_key = utils.ReadFile(old_pub_keyfile)

  # The pairs are iterated twice (master lookup and node loop), so they
  # must be a real list and not a one-shot iterator.
  node_uuid_name_map = list(zip(node_uuids, node_names))

  master_node_name = ssconf_store.GetMasterNode()
  master_node_uuid = _GetMasterNodeUUID(node_uuid_name_map, master_node_name)
  ssh_port_map = ssconf_store.GetSshPortMap()
  # List of all node errors that happened, but which did not abort the
  # procedure as a whole. It is important that this is a list to have a
  # somewhat chronological history of events.
  all_node_errors = []

  # process non-master nodes

  # keys to add in bulk at the end
  node_keys_to_add = []

  # list of all nodes
  node_list = []

  # list of keys to be removed before generating new keys
  node_info_to_remove = []

  for node_uuid, node_name in node_uuid_name_map:
    if node_name == master_node_name:
      continue
    master_candidate = node_uuid in master_candidate_uuids
    potential_master_candidate = node_name in potential_master_candidates
    node_list.append((node_uuid, node_name, master_candidate,
                      potential_master_candidate))

    keys_by_uuid = ssh.QueryPubKeyFile([node_uuid],
                                       key_file=ganeti_pub_keys_file)
    if not keys_by_uuid:
      raise errors.SshUpdateError("No public key of node %s (UUID %s) found,"
                                  " not generating a new key."
                                  % (node_name, node_uuid))

    if master_candidate:
      logging.debug("Fetching old SSH key from node '%s'.", node_name)
      old_pub_key = ssh.ReadRemoteSshPubKeys(old_pub_keyfile,
                                             node_name, cluster_name,
                                             ssh_port_map[node_name],
                                             False, # ask_key
                                             False) # key_check
      if old_pub_key != old_master_key:
        # If we are already in a multi-key setup (that is past Ganeti 2.12),
        # we can safely remove the old key of the node. Otherwise, we cannot
        # remove that node's key, because it is also the master node's key
        # and that would terminate all communication from the master to the
        # node.
        node_info_to_remove.append(SshRemoveNodeInfo(
          uuid=node_uuid,
          name=node_name,
          from_authorized_keys=master_candidate,
          from_public_keys=False,
          clear_authorized_keys=False,
          clear_public_keys=False))
      else:
        logging.debug("Old key of node '%s' is the same as the current master"
                      " key. Not deleting that key on the node.", node_name)

  logging.debug("Removing old SSH keys of all master candidates.")
  if node_info_to_remove:
    # NOTE(review): unlike the generate/add calls below, this call does not
    # receive ganeti_pub_keys_file, ssconf_store, noded_cert_file or
    # run_cmd_fn -- confirm whether relying on the callee's defaults here is
    # intended.
    node_errors = RemoveNodeSshKeyBulk(
        node_info_to_remove,
        master_candidate_uuids,
        potential_master_candidates,
        master_uuid=master_node_uuid)
    if node_errors:
      all_node_errors = all_node_errors + node_errors

  for (node_uuid, node_name, master_candidate, potential_master_candidate) \
      in node_list:

    logging.debug("Generating new SSH key for node '%s'.", node_name)
    _GenerateNodeSshKey(node_uuid, node_name, ssh_port_map, new_key_type,
                        new_key_bits, pub_key_file=ganeti_pub_keys_file,
                        ssconf_store=ssconf_store,
                        noded_cert_file=noded_cert_file,
                        run_cmd_fn=run_cmd_fn)

    logging.debug("Fetching newly created SSH key from node '%s'.", node_name)
    try:
      pub_key = ssh.ReadRemoteSshPubKeys(new_pub_keyfile,
                                         node_name, cluster_name,
                                         ssh_port_map[node_name],
                                         False, # ask_key
                                         False) # key_check
    except Exception as err:
      # Only regular errors are converted; KeyboardInterrupt and SystemExit
      # must not be turned into an SshUpdateError. The original error is
      # kept in the message so the cause is not lost.
      raise errors.SshUpdateError("Could not fetch key of node %s"
                                  " (UUID %s): %s" %
                                  (node_name, node_uuid, err))

    if potential_master_candidate:
      ssh.RemovePublicKey(node_uuid, key_file=ganeti_pub_keys_file)
      ssh.AddPublicKey(node_uuid, pub_key, key_file=ganeti_pub_keys_file)

    node_info = SshAddNodeInfo(name=node_name,
                               uuid=node_uuid,
                               to_authorized_keys=master_candidate,
                               to_public_keys=potential_master_candidate,
                               get_public_keys=True)
    node_keys_to_add.append(node_info)

  node_errors = AddNodeSshKeyBulk(
      node_keys_to_add, potential_master_candidates,
      pub_key_file=ganeti_pub_keys_file, ssconf_store=ssconf_store,
      noded_cert_file=noded_cert_file,
      run_cmd_fn=run_cmd_fn)
  if node_errors:
    all_node_errors = all_node_errors + node_errors

  # Renewing the master node's key

  # Preserve the old keys for now
  old_master_keys_by_uuid = _GetOldMasterKeys(master_node_uuid,
                                              ganeti_pub_keys_file)

  # Generate a new master key with a suffix, don't touch the old one for now
  logging.debug("Generate new ssh key of master.")
  _GenerateNodeSshKey(master_node_uuid, master_node_name, ssh_port_map,
                      new_key_type, new_key_bits,
                      pub_key_file=ganeti_pub_keys_file,
                      ssconf_store=ssconf_store,
                      noded_cert_file=noded_cert_file,
                      run_cmd_fn=run_cmd_fn,
                      suffix=constants.SSHS_MASTER_SUFFIX)
  # Read newly created master key
  new_master_key_dict = _GetNewMasterKey(root_keyfiles, master_node_uuid)

  # Replace master key in the master nodes' public key file
  ssh.RemovePublicKey(master_node_uuid, key_file=ganeti_pub_keys_file)
  for pub_key in new_master_key_dict[master_node_uuid]:
    ssh.AddPublicKey(master_node_uuid, pub_key, key_file=ganeti_pub_keys_file)

  # Add new master key to all node's public and authorized keys
  logging.debug("Add new master key to all nodes.")
  node_errors = AddNodeSshKey(
      master_node_uuid, master_node_name, potential_master_candidates,
      to_authorized_keys=True, to_public_keys=True,
      get_public_keys=False, pub_key_file=ganeti_pub_keys_file,
      ssconf_store=ssconf_store, noded_cert_file=noded_cert_file,
      run_cmd_fn=run_cmd_fn)
  if node_errors:
    all_node_errors = all_node_errors + node_errors

  # Remove the old key file and rename the new key to the non-temporary filename
  _ReplaceMasterKeyOnMaster(root_keyfiles)

  # Remove old key from authorized keys
  (auth_key_file, _) = \
    ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
  ssh.RemoveAuthorizedKeys(auth_key_file,
                           old_master_keys_by_uuid[master_node_uuid])

  # Remove the old key from all node's authorized keys file
  logging.debug("Remove the old master key from all nodes.")
  # NOTE(review): as with the bulk removal above, the key file, ssconf store,
  # certificate file and run_cmd_fn given to this function are not passed
  # on here -- confirm whether that is intended.
  node_errors = RemoveNodeSshKey(
      master_node_uuid, master_node_name, master_candidate_uuids,
      potential_master_candidates,
      keys_to_remove=old_master_keys_by_uuid, from_authorized_keys=True,
      from_public_keys=False, clear_authorized_keys=False,
      clear_public_keys=False)
  if node_errors:
    all_node_errors = all_node_errors + node_errors

  return all_node_errors
2277
2278
def GetBlockDevSizes(devices):
  """Return the size of the given block devices

  Paths that are not below C{/dev/}, that cannot be stat()'ed, that are
  not block devices or whose size cannot be queried are left out of the
  result; the latter cases are logged as warnings.

  @type devices: list
  @param devices: list of block device nodes to query
  @rtype: dict
  @return:
    dictionary mapping each accepted device path (key) to its size in
    MiB (value), rounded down to a whole number.

    {'/dev/disk/by-uuid/123456-12321231-312312-312': 124}

  """
  DEV_PREFIX = "/dev/"
  blockdevs = {}

  for devpath in devices:
    if not utils.IsBelowDir(DEV_PREFIX, devpath):
      continue

    try:
      st = os.stat(devpath)
    except EnvironmentError as err:
      logging.warning("Error stat()'ing device %s: %s", devpath, str(err))
      continue

    if stat.S_ISBLK(st.st_mode):
      result = utils.RunCmd(["blockdev", "--getsize64", devpath])
      if result.failed:
        # We don't want to fail, just do not list this device as available
        logging.warning("Cannot get size for block device %s", devpath)
        continue

      # blockdev reports bytes; floor division keeps the result an integer
      # number of MiB regardless of the interpreter's "/" semantics.
      size = int(result.stdout) // (1024 * 1024)
      blockdevs[devpath] = size
  return blockdevs
2315
2316
def GetVolumeList(vg_names):
  """Compute list of logical volumes and their size.

  The data is taken from the output of C{lvs}; lines that do not match
  C{_LVSLINE_REGEX} are logged and skipped, and volumes whose first
  attribute character is C{v} (virtual) are not reported.

  @type vg_names: list
  @param vg_names: the volume groups whose LVs we should list, or
      empty for all volume groups
  @rtype: dict
  @return:
      dictionary of all partitions (key, in the form C{vg/lv}) with value
      being a tuple of their size (in MiB), inactive and online status::

        {'xenvg/test1': ('20.06', True, True)}

      a failing C{lvs} call is reported through L{_Fail}.

  """
  sep = "|"
  if not vg_names:
    vg_names = []

  lvs_cmd = ["lvs", "--noheadings", "--units=m", "--nosuffix",
             "--separator=%s" % sep,
             "-ovg_name,lv_name,lv_size,lv_attr"] + vg_names
  result = utils.RunCmd(lvs_cmd)
  if result.failed:
    _Fail("Failed to list logical volumes, lvs output: %s", result.output)

  volumes = {}
  for raw_line in result.stdout.splitlines():
    stripped = raw_line.strip()
    parsed = _LVSLINE_REGEX.match(stripped)
    if not parsed:
      logging.error("Invalid line returned from lvs output: '%s'", stripped)
      continue

    (vg_name, lv_name, lv_size, lv_attr) = parsed.groups()
    is_inactive = lv_attr[4] == "-"
    is_online = lv_attr[5] == "o"
    is_virtual = lv_attr[0] == "v"
    if not is_virtual:
      volumes[vg_name + "/" + lv_name] = (lv_size, is_inactive, is_online)
    # else: we don't want to report such volumes as existing, since they
    # don't really hold data

  return volumes
2361
2362
def ListVolumeGroups():
  """List the volume groups and their size.

  This simply hands back whatever L{utils.ListVolumeGroups} returns.

  @rtype: dict
  @return: dictionary with keys volume name and values the
       size of the volume

  """
  volume_groups = utils.ListVolumeGroups()
  return volume_groups
2372
2373
def NodeVolumes():
  """List all volumes on this node.

  The information is parsed from the C{|}-separated output of C{lvs}.
  Lines with fewer than three separators are logged and skipped; a
  failing C{lvs} call is reported through L{_Fail}.

  @rtype: list
  @return:
    A list of dictionaries, each having four keys:
      - name: the logical volume name,
      - size: the size of the logical volume
      - dev: the physical device on which the LV lives
      - vg: the volume group to which it belongs

  Note that since a logical volume can live on multiple physical
  volumes, the resulting list might include a logical volume
  multiple times.

  """
  result = utils.RunCmd(["lvs", "--noheadings", "--units=m", "--nosuffix",
                         "--separator=|",
                         "--options=lv_name,lv_size,devices,vg_name"])
  if result.failed:
    _Fail("Failed to list logical volumes, lvs output: %s",
          result.output)

  volumes = []
  for raw_line in result.stdout.splitlines():
    if raw_line.count("|") >= 3:
      fields = [field.strip() for field in raw_line.split("|")]
      # The devices column is a comma-separated list; everything from the
      # first "(" of an entry onwards is dropped.
      for dev_spec in fields[2].split(","):
        volumes.append({"name": fields[0], "size": fields[1],
                        "dev": dev_spec.split("(")[0], "vg": fields[3]})
    else:
      logging.warning("Strange line in the output from lvs: '%s'", raw_line)
  return volumes
2418
2419
def BridgesExist(bridges_list):
  """Check if a list of bridges exist on the current node.

  Every bridge is checked with L{utils.BridgeExists}. If any of them is
  missing, L{_Fail} is called with the names of the missing bridges; the
  function itself does not return an explicit value.

  @type bridges_list: list
  @param bridges_list: names of the bridges to check

  """
  missing = [bridge for bridge in bridges_list
             if not utils.BridgeExists(bridge)]

  if missing:
    _Fail("Missing bridges %s", utils.CommaJoin(missing))
2434
2435
def GetInstanceListForHypervisor(hname, hvparams=None,
                                 get_hv_fn=hypervisor.GetHypervisor):
  """Provides a list of instances of the given hypervisor.

  A L{errors.HypervisorError} raised while listing is reported through
  L{_Fail}.

  @type hname: string
  @param hname: name of the hypervisor
  @type hvparams: dict of strings
  @param hvparams: hypervisor parameters for the given hypervisor
  @type get_hv_fn: function
  @param get_hv_fn: function that returns a hypervisor for the given hypervisor
    name; optional parameter to increase testability

  @rtype: list
  @return: a list of all running instances on the current node
    - instance1.example.com
    - instance2.example.com

  """
  try:
    return get_hv_fn(hname).ListInstances(hvparams=hvparams)
  except errors.HypervisorError as err:
    _Fail("Error enumerating instances (hypervisor %s): %s",
          hname, err, exc=True)
2459
2460
def GetInstanceList(hypervisor_list, all_hvparams=None,
                    get_hv_fn=hypervisor.GetHypervisor):
  """Provides a list of instances.

  @type hypervisor_list: list
  @param hypervisor_list: the list of hypervisors to query information
  @type all_hvparams: dict of dict of strings
  @param all_hvparams: a dictionary mapping hypervisor types to respective
    cluster-wide hypervisor parameters; if C{None}, every hypervisor is
    queried without hypervisor parameters
  @type get_hv_fn: function
  @param get_hv_fn: function that returns a hypervisor for the given hypervisor
    name; optional parameter to increase testability

  @rtype: list
  @return: a list of all running instances on the current node
    - instance1.example.com
    - instance2.example.com

  """
  results = []
  for hname in hypervisor_list:
    # The default of all_hvparams is None, which cannot be indexed; fall
    # back to "no parameters", which GetInstanceListForHypervisor accepts.
    if all_hvparams is None:
      hvparams = None
    else:
      hvparams = all_hvparams[hname]
    results.extend(GetInstanceListForHypervisor(hname, hvparams=hvparams,
                                                get_hv_fn=get_hv_fn))
  return results
2486
2487
def GetInstanceInfo(instance, hname, hvparams=None):
  """Gives back the information about an instance as a dictionary.

  @type instance: string
  @param instance: the instance name
  @type hname: string
  @param hname: the hypervisor type of the instance
  @type hvparams: dict of strings
  @param hvparams: the instance's hvparams

  @rtype: dict
  @return: an empty dictionary if the hypervisor reports no information
      for the instance, otherwise a dictionary with the following keys:
      - memory: memory size of instance (int)
      - state: state of instance (HvInstanceState)
      - time: cpu time of instance (float)
      - vcpus: the number of vcpus (int)

  """
  hyper = hypervisor.GetHypervisor(hname)
  iinfo = hyper.GetInstanceInfo(instance, hvparams=hvparams)
  if iinfo is None:
    return {}

  # Only elements 2 to 5 of the hypervisor's answer are exported.
  return {
    "memory": iinfo[2],
    "vcpus": iinfo[3],
    "state": iinfo[4],
    "time": iinfo[5],
    }
2517
2518
def GetInstanceMigratable(instance):
  """Computes whether an instance can be migrated.

  L{_Fail} is called if the instance is not in the hypervisor's list of
  instances. A missing block device symlink for one of its disks is only
  logged as a warning. The function does not return an explicit value.

  @type instance: L{objects.Instance}
  @param instance: object representing the instance to be checked.

  """
  iname = instance.name
  hyper = hypervisor.GetHypervisor(instance.hypervisor)
  known_instances = hyper.ListInstances(hvparams=instance.hvparams)
  if iname not in known_instances:
    _Fail("Instance %s is not running", iname)

  for (idx, _) in enumerate(instance.disks_info):
    link_name = _GetBlockDevSymlinkPath(iname, idx)
    if os.path.islink(link_name):
      continue
    logging.warning("Instance %s is missing symlink %s for disk %d",
                    iname, link_name, idx)
2541
2542
def GetAllInstancesInfo(hypervisor_list, all_hvparams):
  """Gather data about all instances.

  This is the equivalent of L{GetInstanceInfo}, except that it
  computes data for all instances at once, thus being faster if one
  needs data about more than one instance.

  If several hypervisors report the same instance name with differing
  memory or vcpus values, L{_Fail} is called.

  @type hypervisor_list: list
  @param hypervisor_list: list of hypervisors to query for instance data
  @type all_hvparams: dict of dict of strings
  @param all_hvparams: mapping of hypervisor names to hvparams

  @rtype: dict
  @return: dictionary of instance: data, with data having the following keys:
      - memory: memory size of instance (int)
      - state: xen state of instance (string)
      - time: cpu time of instance (float)
      - vcpus: the number of vcpus

  """
  collected = {}
  for hname in hypervisor_list:
    hvparams = all_hvparams[hname]
    hyper = hypervisor.GetHypervisor(hname)
    hv_instances = hyper.GetAllInstancesInfo(hvparams)
    if not hv_instances:
      continue

    for (name, _, memory, vcpus, state, times) in hv_instances:
      value = {
        "memory": memory,
        "vcpus": vcpus,
        "state": state,
        "time": times,
        }
      if name in collected:
        # we only check static parameters, like memory and vcpus,
        # and not state and time which can change between the
        # invocations of the different hypervisors
        for key in ("memory", "vcpus"):
          if value[key] != collected[name][key]:
            _Fail("Instance %s is running twice"
                  " with different parameters", name)
      collected[name] = value

  return collected
2586
2587
def GetInstanceConsoleInfo(instance_param_dict,
                           get_hv_fn=hypervisor.GetHypervisor):
  """Gather data about the console access of a set of instances of this node.

  This function assumes that the caller already knows which instances are on
  this node, by calling a function such as L{GetAllInstancesInfo} or
  L{GetInstanceList}.

  For every instance, a large amount of configuration data needs to be
  provided to the hypervisor interface in order to receive the console
  information. Whether this could or should be cut down can be discussed.
  The information is provided in a dictionary indexed by instance name,
  allowing any number of instance queries to be done.

  @type instance_param_dict: dict of string to dict, where the inner
    dictionary holds, under the keys C{instance}, C{node}, C{group},
    C{hvParams} and C{beParams}, the serialized L{objects.Instance},
    L{objects.Node}, L{objects.NodeGroup}, HvParams and BeParams
  @param instance_param_dict: mapping of instance name to parameters necessary
    for console information retrieval
  @type get_hv_fn: function
  @param get_hv_fn: function that returns a hypervisor for the given
    hypervisor name

  @rtype: dict
  @return: dictionary of instance: data, with data having the following keys:
      - instance: instance name
      - kind: console kind
      - message: used with kind == CONS_MESSAGE, indicates console to be
                 unavailable, supplies error message
      - host: host to connect to
      - port: port to use
      - user: user for login
      - command: the command, broken into parts as an array
      - display: unknown, potentially unused?

  """
  consoles = {}
  for inst_name in instance_param_dict:
    params = instance_param_dict[inst_name]
    instance_dict = params["instance"]
    node_dict = params["node"]
    group_dict = params["group"]
    hvparams = params["hvParams"]
    beparams = params["beParams"]

    # Turn the serialized forms back into configuration objects.
    instance = objects.Instance.FromDict(instance_dict)
    pnode = objects.Node.FromDict(node_dict)
    group = objects.NodeGroup.FromDict(group_dict)

    hyper = get_hv_fn(instance.hypervisor)
    console = hyper.GetInstanceConsole(instance, pnode, group,
                                       hvparams, beparams)
    consoles[inst_name] = console.ToDict()

  return consoles
2639
2640
def _InstanceLogName(kind, os_name, instance, component):
  """Compute the OS log filename for a given instance and operation.

  The instance name and os name are passed in as strings since not all
  operations have these as part of an instance object.

  @type kind: string
  @param kind: the operation type (e.g. add, import, etc.)
  @type os_name: string
  @param os_name: the os name
  @type instance: string
  @param instance: the name of the instance being imported/added/etc.
  @type component: string or None
  @param component: the name of the component of the instance being
      transferred
  @return: path of the log file below C{pathutils.LOG_OS_DIR}; the file
      name includes the value of L{utils.TimestampForFilename}

  """
  # TODO: Use tempfile.mkstemp to create unique filename
  component_part = ""
  if component:
    assert "/" not in component
    component_part = "-%s" % component

  timestamp = utils.TimestampForFilename()
  filename = ("%s-%s-%s%s-%s.log" %
              (kind, os_name, instance, component_part, timestamp))
  return utils.PathJoin(pathutils.LOG_OS_DIR, filename)
2667
2668
def InstanceOsAdd(instance, reinstall, debug):
  """Run the OS create script for an instance.

  The script output is redirected to a log file. If the script fails,
  the failure is logged and then reported through L{_Fail} together
  with the last 20 lines of that log file.

  @type instance: L{objects.Instance}
  @param instance: Instance whose OS is to be installed
  @type reinstall: boolean
  @param reinstall: whether this is an instance reinstall
  @type debug: integer
  @param debug: debug level, passed to the OS scripts
  @rtype: None

  """
  os_obj = OSFromDisk(instance.os)

  env = OSEnvironment(instance, os_obj, debug)
  if reinstall:
    env["INSTANCE_REINSTALL"] = "1"

  logfile = _InstanceLogName("add", instance.os, instance.name, None)

  result = utils.RunCmd([os_obj.create_script], env=env, cwd=os_obj.path,
                        output=logfile, reset_env=True)
  if not result.failed:
    return

  logging.error("os create command '%s' returned error: %s, logfile: %s,"
                " output: %s", result.cmd, result.fail_reason, logfile,
                result.output)
  log_tail = "\n".join(utils.SafeEncode(line)
                       for line in utils.TailFile(logfile, lines=20))
  _Fail("OS create script failed (%s), last lines in the"
        " log file:\n%s", result.fail_reason, log_tail, log=False)
2699
2700
def RunRenameInstance(instance, old_name, debug):
  """Run the OS rename script for an instance.

  The script output is redirected to a log file. If the script fails,
  the failure is logged and then reported through L{_Fail} together
  with the last 20 lines of that log file.

  @type instance: L{objects.Instance}
  @param instance: Instance which is being renamed (its C{name} is the
      new name)
  @type old_name: string
  @param old_name: previous instance name
  @type debug: integer
  @param debug: debug level, passed to the OS scripts
  @rtype: None

  """
  inst_os = OSFromDisk(instance.os)

  rename_env = OSEnvironment(instance, inst_os, debug)
  rename_env["OLD_INSTANCE_NAME"] = old_name

  logfile = _InstanceLogName("rename", instance.os,
                             "%s-%s" % (old_name, instance.name), None)

  result = utils.RunCmd([inst_os.rename_script], env=rename_env,
                        cwd=inst_os.path, output=logfile, reset_env=True)

  if result.failed:
    # The message names the rename script (not "create") and includes the
    # log file location, in line with InstanceOsAdd
    logging.error("os rename command '%s' returned error: %s, logfile: %s,"
                  " output: %s", result.cmd, result.fail_reason, logfile,
                  result.output)
    lines = [utils.SafeEncode(val)
             for val in utils.TailFile(logfile, lines=20)]
    _Fail("OS rename script failed (%s), last lines in the"
          " log file:\n%s", result.fail_reason, "\n".join(lines), log=False)
2732
2733
def _GetBlockDevSymlinkPath(instance_name, idx, _dir=None):
  """Compute the symlink path for one block device of an instance.

  @param instance_name: the name of the instance owning the disk
  @param idx: the disk index
  @param _dir: directory holding the symlinks; when None,
      L{pathutils.DISK_LINKS_DIR} is used
  @return: the path of the symlink

  """
  links_dir = pathutils.DISK_LINKS_DIR if _dir is None else _dir
  link_basename = "%s%s%s" % (instance_name, constants.DISK_SEPARATOR, idx)
  return utils.PathJoin(links_dir, link_basename)
2744
2745
2746 def _SymlinkBlockDev(instance_name, device_path, idx):
2747 """Set up symlinks to a instance's block device.
2748
2749 This is an auxiliary function run when an instance is start (on the primary
2750 node) or when an instance is migrated (on the target node).
2751
2752
2753 @param instance_name: the name of the target instance
2754 @param device_path: path of the physical block device, on the node
2755 @param idx: the disk index
2756 @return: absolute path to the disk's symlink
2757
2758 """
2759 # In case we have only a userspace access URI, device_path is None
2760 if not device_path:
2761 return None
2762
2763 link_name = _GetBlockDevSymlinkPath(instance_name, idx)
2764 try:
2765 os.symlink(device_path, link_name)
2766 except OSError, err:
2767 if err.errno == errno.EEXIST:
2768 if (not os.path.islink(link_name) or
2769 os.readlink(link_name) != device_path):
2770 os.remove(link_name)
2771 os.symlink(device_path, link_name)
2772 else:
2773 raise
2774
2775 return link_name
2776
2777
2778 def _RemoveBlockDevLinks(instance_name, disks):
2779 """Remove the block device symlinks belonging to the given instance.
2780
2781 """
2782 for idx, _ in enumerate(disks):
2783 link_name = _GetBlockDevSymlinkPath(instance_name, idx)
2784 if os.path.islink(link_name):
2785 try:
2786 os.remove(link_name)
2787 except OSError:
2788 logging.exception("Can't remove symlink '%s'", link_name)
2789
2790
def _CalculateDeviceURI(instance, disk, device):
  """Return the userspace access URI of a device, if applicable.

  The disk's access mode is read from its parameters, defaulting to
  kernelspace access; only userspace access yields a URI.

  @type instance: L{objects.Instance}
  @param instance: the instance which disk belongs to
  @type disk: L{objects.Disk}
  @param disk: the target disk object
  @type device: L{bdev.BlockDev}
  @param device: the corresponding BlockDevice
  @rtype: string
  @return: the device uri if any else None

  """
  mode = disk.params.get(constants.LDP_ACCESS, constants.DISK_KERNELSPACE)
  if mode != constants.DISK_USERSPACE:
    return None

  # This can raise errors.BlockDeviceError
  return device.GetUserspaceAccessUri(instance.hypervisor)
2811
2812
2813 def _GatherAndLinkBlockDevs(instance):
2814 """Set up an instance's block device(s).
2815
2816 This is run on the primary node at instance startup. The block
2817 devices must be already assembled.
2818
2819 @type instance: L{objects.Instance}
2820 @param instance: the instance whose disks we should assemble
2821 @rtype: list
2822 @return: list of (disk_object, link_name, drive_uri)
2823
2824 """
2825 block_devices = []
2826 for idx, disk in enumerate(instance.disks_info):
2827 device = _RecursiveFindBD(disk)
2828 if device is None:
2829 raise errors.BlockDeviceError("Block device '%s' is not set up." %
2830 str(disk))
2831 device.Open()
2832 try:
2833 link_name = _SymlinkBlockDev(instance.name, device.dev_path, idx)
2834 except OSError, e:
2835 raise errors.BlockDeviceError("Cannot create block device symlink: %s" %
2836 e.strerror)
2837 uri = _CalculateDeviceURI(instance, disk, device)
2838
2839 block_devices.append((disk, link_name, uri))
2840
2841 return block_devices
2842
2843
2844 def _IsInstanceUserDown(instance_info):
2845 return instance_info and \
2846 "state" in instance_info and \
2847 hv_base.HvInstanceState.IsShutdown(instance_info["state"])
2848
2849
def _GetInstanceInfo(instance):
  """Call L{GetInstanceInfo} with the fields of an instance object.

  @param instance: object providing C{name}, C{hypervisor} and
      C{hvparams}
  @return: whatever L{GetInstanceInfo} returns

  """
  inst_name = instance.name
  hv_name = instance.hypervisor
  return GetInstanceInfo(inst_name, hv_name, hvparams=instance.hvparams)
2854
2855
def StartInstance(instance, startup_paused, reason, store_reason=True):
  """Start an instance.

  Nothing is done if instance information is available and does not
  report a shutdown state. Block device and hypervisor errors are
  reported through L{_Fail}; on hypervisor errors the block device
  symlinks are removed first.

  @type instance: L{objects.Instance}
  @param instance: the instance object
  @type startup_paused: bool
  @param startup_paused: pause instance at startup?
  @type reason: list of reasons
  @param reason: the reason trail for this startup
  @type store_reason: boolean
  @param store_reason: whether to store the reason trail on file
  @rtype: None

  """
  info = _GetInstanceInfo(instance)
  already_running = info and not _IsInstanceUserDown(info)

  if already_running:
    logging.info("Instance '%s' already running, not starting", instance.name)
    return

  try:
    disks = _GatherAndLinkBlockDevs(instance)
    hv = hypervisor.GetHypervisor(instance.hypervisor)
    hv.StartInstance(instance, disks, startup_paused)
    if store_reason:
      _StoreInstReasonTrail(instance.name, reason)
  except errors.BlockDeviceError as err:
    _Fail("Block device error: %s", err, exc=True)
  except errors.HypervisorError as err:
    _RemoveBlockDevLinks(instance.name, instance.disks_info)
    _Fail("Hypervisor error: %s", err, exc=True)
2887
2888
def InstanceShutdown(instance, timeout, reason, store_reason=True):
  """Shut an instance down.

  A regular stop is attempted first, repeated through L{utils.Retry}; if
  that ends in L{utils.RetryTimeout}, a forced stop is issued. Afterwards
  the hypervisor's cleanup step is run and the instance's block device
  symlinks are removed. If no instance information is available when the
  function is entered, nothing is done.

  @note: this function uses polling with a hardcoded timeout.

  @type instance: L{objects.Instance}
  @param instance: the instance object
  @type timeout: integer
  @param timeout: maximum timeout for soft shutdown
  @type reason: list of reasons
  @param reason: the reason trail for this shutdown
  @type store_reason: boolean
  @param store_reason: whether to store the shutdown reason trail on file
  @rtype: None

  """
  hyper = hypervisor.GetHypervisor(instance.hypervisor)

  # Without instance information there is nothing to stop (and no cleanup
  # is performed either)
  if not _GetInstanceInfo(instance):
    logging.info("Instance '%s' not running, doing nothing", instance.name)
    return

  class _TryShutdown(object):
    """Callable passed to L{utils.Retry}, performing one stop attempt.

    Returning normally means the instance is gone; raising
    L{utils.RetryAgain} requests another attempt.

    """
    def __init__(self):
      # Becomes True after the first stop attempt; passed to the hypervisor
      # as the "retry" argument of later attempts
      self.tried_once = False

    def __call__(self):
      # The instance disappeared since the last attempt: done
      if not _GetInstanceInfo(instance):
        return

      try:
        hyper.StopInstance(instance, retry=self.tried_once, timeout=timeout)
        if store_reason:
          _StoreInstReasonTrail(instance.name, reason)
      except errors.HypervisorError, err:
        # if the instance is no longer existing, consider this a
        # success and go to cleanup
        if not _GetInstanceInfo(instance):
          return

        _Fail("Failed to stop instance '%s': %s", instance.name, err)

      self.tried_once = True

      # The stop request was issued without error; ask to be called again so
      # that the check at the top can find out whether the instance is gone
      raise utils.RetryAgain()

  try:
    utils.Retry(_TryShutdown(), 5, timeout)
  except utils.RetryTimeout:
    # the shutdown did not succeed
    logging.error("Shutdown of '%s' unsuccessful, forcing", instance.name)

    try:
      hyper.StopInstance(instance, force=True)
    except errors.HypervisorError, err:
      # only raise an error if the instance still exists, otherwise
      # the error could simply be "instance ... unknown"!
      if _GetInstanceInfo(instance):
        _Fail("Failed to force stop instance '%s': %s", instance.name, err)

    # NOTE(review): presumably gives the hypervisor time to tear the
    # instance down before the final check -- confirm
    time.sleep(1)

    if _GetInstanceInfo(instance):
      _Fail("Could not shutdown instance '%s' even by destroy", instance.name)

  # A failing cleanup step is only logged, it does not fail the shutdown
  try:
    hyper.CleanupInstance(instance.name)
  except errors.HypervisorError, err:
    logging.warning("Failed to execute post-shutdown cleanup step: %s", err)

  _RemoveBlockDevLinks(instance.name, instance.disks_info)
2960
2961
def InstanceReboot(instance, reboot_type, shutdown_timeout, reason):
  """Reboot an instance.

  @type instance: L{objects.Instance}
  @param instance: the instance object to reboot
  @type reboot_type: str
  @param reboot_type: the type of reboot, one the following
    constants:
      - L{constants.INSTANCE_REBOOT_SOFT}: only reboot the
        instance OS, do not recreate the VM
      - L{constants.INSTANCE_REBOOT_HARD}: tear down and
        restart the VM (at the hypervisor level)
      - the other reboot type (L{constants.INSTANCE_REBOOT_FULL}) is
        not accepted here, since that mode is handled differently, in
        cmdlib, and translates into full stop and start of the
        instance (instead of a call_instance_reboot RPC)
  @type shutdown_timeout: integer
  @param shutdown_timeout: maximum timeout for soft shutdown
  @type reason: list of reasons
  @param reason: the reason trail for this reboot
  @rtype: None

  """
  # TODO: this is inconsistent with 'StartInstance' and 'InstanceShutdown'
  # because those functions simply 'return' on error whereas this one
  # raises an exception with '_Fail'
  if not _GetInstanceInfo(instance):
    _Fail("Cannot reboot instance '%s' that is not running", instance.name)

  hv = hypervisor.GetHypervisor(instance.hypervisor)

  if reboot_type == constants.INSTANCE_REBOOT_SOFT:
    try:
      hv.RebootInstance(instance)
    except errors.HypervisorError as err:
      _Fail("Failed to soft reboot instance '%s': %s", instance.name, err)
    return

  if reboot_type == constants.INSTANCE_REBOOT_HARD:
    try:
      # The reason trail is stored once, after both steps went through
      InstanceShutdown(instance, shutdown_timeout, reason, store_reason=False)
      start_result = StartInstance(instance, False, reason,
                                   store_reason=False)
      _StoreInstReasonTrail(instance.name, reason)
      return start_result
    except errors.HypervisorError as err:
      _Fail("Failed to hard reboot instance '%s': %s", instance.name, err)
    return

  _Fail("Invalid reboot_type received: '%s'", reboot_type)
3007
3008
def InstanceBalloonMemory(instance, memory):
  """Change the memory of a running instance.

  If the instance is not among those listed by the hypervisor, this is
  only logged; hypervisor errors are reported through L{_Fail}.

  @type instance: L{objects.Instance}
  @param instance: the instance object
  @type memory: int
  @param memory: new memory amount in MB
  @rtype: None

  """
  hv = hypervisor.GetHypervisor(instance.hypervisor)
  running_instances = hv.ListInstances(hvparams=instance.hvparams)

  if instance.name in running_instances:
    try:
      hv.BalloonInstanceMemory(instance, memory)
    except errors.HypervisorError as err:
      _Fail("Failed to balloon instance memory: %s", err, exc=True)
  else:
    logging.info("Instance %s is not running, cannot balloon", instance.name)
3028
3029
def MigrationInfo(instance):
  """Collect the hypervisor's migration information for an instance.

  @type instance: L{objects.Instance}
  @param instance: the instance definition
  @return: the value returned by the hypervisor's C{MigrationInfo}

  """
  hv = hypervisor.GetHypervisor(instance.hypervisor)
  try:
    migration_info = hv.MigrationInfo(instance)
  except errors.HypervisorError as err:
    _Fail("Failed to fetch migration information: %s", err, exc=True)
  return migration_info
3043
3044
def AcceptInstance(instance, info, target):
  """Ask the hypervisor to get ready for an incoming instance.

  Hypervisor errors are reported through L{_Fail}.

  @type instance: L{objects.Instance}
  @param instance: the instance definition
  @type info: string/data (opaque)
  @param info: migration information, from the source node
  @type target: string
  @param target: target host (usually ip), on this node

  """
  hv = hypervisor.GetHypervisor(instance.hypervisor)
  try:
    hv.AcceptInstance(instance, info, target)
  except errors.HypervisorError as err:
    _Fail("Failed to accept instance: %s", err, exc=True)
3061
3062
def FinalizeMigrationDst(instance, info, success):
  """Finish the migration of an instance on the target node.

  Hypervisor errors are reported through L{_Fail}.

  @type instance: L{objects.Instance}
  @param instance: the instance definition
  @type info: string/data (opaque)
  @param info: migration information, from the source node
  @type success: boolean
  @param success: whether the migration was a success or a failure

  """
  hv = hypervisor.GetHypervisor(instance.hypervisor)
  try:
    hv.FinalizeMigrationDst(instance, info, success)
  except errors.HypervisorError as err:
    _Fail("Failed to finalize migration on the target node: %s", err, exc=True)
3079
3080
def MigrateInstance(cluster_name, instance, target, live):
  """Move an instance to another node via the hypervisor.

  @type cluster_name: string
  @param cluster_name: name of the cluster
  @type instance: L{objects.Instance}
  @param instance: the instance definition
  @type target: string
  @param target: the target node name
  @type live: boolean
  @param live: whether the migration should be done live or not (the
      interpretation of this parameter is left to the hypervisor)
  @raise RPCFail: if migration fails for some reason

  """
  hv = hypervisor.GetHypervisor(instance.hypervisor)

  try:
    hv.MigrateInstance(cluster_name, instance, target, live)
  except errors.HypervisorError as err:
    _Fail("Failed to migrate instance: %s", err, exc=True)
3102
3103
def FinalizeMigrationSource(instance, success, live):
  """Finish the migration of an instance on the source node.

  Any exception raised by the hypervisor is reported through L{_Fail}.

  @type instance: L{objects.Instance}
  @param instance: the instance definition of the migrated instance
  @type success: bool
  @param success: whether the migration succeeded or not
  @type live: bool
  @param live: whether the user requested a live migration or not
  @raise RPCFail: If the execution fails for some reason

  """
  hv = hypervisor.GetHypervisor(instance.hypervisor)

  try:
    hv.FinalizeMigrationSource(instance, success, live)
  except Exception as err: # pylint: disable=W0703
    _Fail("Failed to finalize the migration on the source node: %s", err,
          exc=True)
3123
3124
def GetMigrationStatus(instance):
  """Query the hypervisor for the status of a migration.

  @type instance: L{objects.Instance}
  @param instance: the instance that is being migrated
  @rtype: L{objects.MigrationStatus}
  @return: the status of the current migration (one of
           L{constants.HV_MIGRATION_VALID_STATUSES}), plus any additional
           progress info that can be retrieved from the hypervisor
  @raise RPCFail: If the migration status cannot be retrieved

  """
  hv = hypervisor.GetHypervisor(instance.hypervisor)
  try:
    status = hv.GetMigrationStatus(instance)
  except Exception as err: # pylint: disable=W0703
    _Fail("Failed to get migration status: %s", err, exc=True)
  else:
    return status
3142
3143
def HotplugDevice(instance, action, dev_type, device, extra, seq):
  """Hotplug a device

  Hotplug is currently supported only for KVM Hypervisor.

  @type instance: L{objects.Instance}
  @param instance: the instance to which we hotplug a device
  @type action: string
  @param action: the hotplug action to perform
  @type dev_type: string
  @param dev_type: the device type to hotplug
  @type device: either L{objects.NIC} or L{objects.Disk}
  @param device: the device object to hotplug
  @type extra: tuple
  @param extra: extra info used for disk hotplug (disk link, drive uri)
  @type seq: int
  @param seq: the index of the device from master perspective
  @return: the result of the hypervisor method handling the action
  @raise RPCFail: in case instance does not have KVM hypervisor, or if
      the action is not one of the handled hotplug actions

  """
  hyper = hypervisor.GetHypervisor(instance.hypervisor)
  try:
    hyper.VerifyHotplugSupport(instance, action, dev_type)
  except errors.HotplugError as err:
    _Fail("Hotplug is not supported: %s", err)

  if action == constants.HOTPLUG_ACTION_ADD:
    fn = hyper.HotAddDevice
  elif action == constants.HOTPLUG_ACTION_REMOVE:
    fn = hyper.HotDelDevice
  elif action == constants.HOTPLUG_ACTION_MODIFY:
    fn = hyper.HotModDevice
  else:
    # An assertion is not enough here: it is skipped under "python -O" and
    # it passes for valid-but-unhandled actions, which in both cases left
    # "fn" unbound and ended in an UnboundLocalError below
    _Fail("Unsupported hotplug action '%s'", action)

  return fn(instance, dev_type, device, extra, seq)
3179
3180
def HotplugSupported(instance):
  """Check whether hotplug is generally supported for an instance.

  The check is delegated to the instance's hypervisor; a
  L{errors.HotplugError} raised by it is reported through L{_Fail}.

  @type instance: L{objects.Instance}
  @param instance: the instance to check

  """
  hv = hypervisor.GetHypervisor(instance.hypervisor)
  try:
    hv.HotplugSupported(instance)
  except errors.HotplugError as err:
    _Fail("Hotplug is not supported: %s", err)
3190
3191
def ModifyInstanceMetadata(metadata):
  """Send instance data to the metadata daemon.

  Uses the Luxi transport layer to communicate with the metadata
  daemon configuration server. The metadata daemon is started first if
  it is not alive.
  The daemon must have been enabled at configuration time.

  @type metadata: dict
  @param metadata: instance metadata obtained by calling
                   L{objects.Instance.ToDict} on an instance object
  @raise errors.ProgrammerError: if the metadata daemon is disabled
  @raise errors.HypervisorError: if the metadata daemon cannot be started

  """
  if not constants.ENABLE_METAD:
    raise errors.ProgrammerError("The metadata deamon is disabled, yet"
                                 " ModifyInstanceMetadata has been called")

  metad_alive = utils.IsDaemonAlive(constants.METAD)
  if not metad_alive:
    start_cmd = [pathutils.DAEMON_UTIL, "start", constants.METAD]
    if utils.RunCmd(start_cmd).failed:
      raise errors.HypervisorError("Failed to start metadata daemon")

  with contextlib.closing(metad.Client()) as metad_client:
    metad_client.UpdateConfig(metadata)
3216
3217
def BlockdevCreate(disk, size, owner, on_primary, info, excl_stor):
  """Creates a block device for an instance.

  The children of the disk (if any) are assembled first, then the device
  itself is created on top of them. Depending on C{on_primary} and the
  disk's own policy, the new device is also assembled and opened. All
  block device errors are reported through L{_Fail}.

  @type disk: L{objects.Disk}
  @param disk: the object describing the disk we should create
  @type size: int
  @param size: the size of the physical underlying device, in MiB
      (not used by this function)
  @type owner: str
  @param owner: the name of the instance for which disk is created,
      used for device cache data
  @type on_primary: boolean
  @param on_primary: indicates if it is the primary node or not
  @type info: string
  @param info: string that will be sent to the physical device
      creation, used for example to set (LVM) tags on LVs
  @type excl_stor: boolean
  @param excl_stor: Whether exclusive_storage is active

  @return: the new unique_id of the device (this can sometime be
      computed only after creation), or None. On secondary nodes,
      it's not required to return anything.

  """
  # TODO: remove the obsolete "size" argument
  # pylint: disable=W0613
  # Assembled child devices, in the order of disk.children
  clist = []
  if disk.children:
    for child in disk.children:
      try:
        crdev = _RecursiveAssembleBD(child, owner, on_primary)
      except errors.BlockDeviceError, err:
        _Fail("Can't assemble device %s: %s", child, err)
      if on_primary or disk.AssembleOnSecondary():
        # we need the children open in case the device itself has to
        # be assembled
        try:
          # pylint: disable=E1103
          crdev.Open()
        except errors.BlockDeviceError, err:
          _Fail("Can't make child '%s' read-write: %s", child, err)
      clist.append(crdev)

  try:
    device = bdev.Create(disk, clist, excl_stor)
  except errors.BlockDeviceError, err:
    _Fail("Can't create block device: %s", err)

  # Assembling, opening and the device cache update only happen on the
  # primary node or when the disk asks for assembly on secondaries
  if on_primary or disk.AssembleOnSecondary():
    try:
      device.Assemble()
    except errors.BlockDeviceError, err:
      _Fail("Can't assemble device after creation, unusual event: %s", err)
    if on_primary or disk.OpenOnSecondary():
      try:
        device.Open(force=True)
      except errors.BlockDeviceError, err:
        _Fail("Can't make device r/w after creation, unusual event: %s", err)
    DevCacheManager.UpdateCache(device.dev_path, owner,
                                on_primary, disk.iv_name)

  # The info string is set regardless of whether the device was assembled
  device.SetInfo(info)

  return device.unique_id
3281
3282
def _DumpDevice(source_path, target_path, offset, size, truncate):
  """Image or wipe a device from a local file using "dd".

  @type source_path: string
  @param source_path: path of the image or data source (e.g., "/dev/zero")

  @type target_path: string
  @param target_path: path of the device to image/wipe

  @type offset: int
  @param offset: offset in MiB in the output file

  @type size: int
  @param size: maximum size in MiB to write (data source might be smaller)

  @type truncate: bool
  @param truncate: whether the file should be truncated

  @return: None
  @raise RPCFail: in case of failure

  """
  # Internal sizes are always in Mebibytes; if the following "dd" command
  # should use a different block size the offset and size given to this
  # function must be adjusted accordingly before being passed to "dd".
  block_size = constants.DD_BLOCK_SIZE

  dd_args = [
    constants.DD_CMD,
    "if=%s" % source_path,
    "seek=%d" % offset,
    "bs=%s" % block_size,
    "oflag=direct",
    "of=%s" % target_path,
    "count=%d" % size,
    ]
  if not truncate:
    dd_args.append("conv=notrunc")

  dd_result = utils.RunCmd(dd_args)
  if not dd_result.failed:
    return

  _Fail("Dump command '%s' exited with error: %s; output: %s", dd_result.cmd,
        dd_result.fail_reason, dd_result.output)
3322
3323
3324 def _DownloadAndDumpDevice(source_url, target_path, size):
3325 """This function images a device using a downloaded image file.
3326
3327 @type source_url: string
3328 @param source_url: URL of image to dump to disk
3329
3330 @type target_path: string
3331 @param target_path: path of the device to image
3332
3333 @type size: int
3334 @param size: maximum size in MiB to write (data source might be smaller)
3335
3336 @rtype: NoneType
3337 @return: None
3338 @raise RPCFail: in case of download or write failures
3339
3340 """
3341 class DDParams(object):
3342 def __init__(self, current_size, total_size):
3343 self.current_size = current_size
3344 self.total_size = total_size
3345 self.image_size_error = False
3346
3347 def dd_write(ddparams, out):
3348 if ddparams.current_size < ddparams.total_size:
3349 ddparams.current_size += len(out)