#
#

# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014 Google Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


"""Functions used by the node daemon

@var _ALLOWED_UPLOAD_FILES: denotes which files are accepted in
  the L{UploadFile} function
@var _ALLOWED_CLEAN_DIRS: denotes which directories are accepted
  in the L{_CleanDirectory} function

"""

# pylint: disable=E1103,C0302

# E1103: %s %r has no %r member (but some types could not be
# inferred), because the _TryOSFromDisk returns either (True, os_obj)
# or (False, "string") which confuses pylint

# C0302: This module has become too big and should be split up


import base64
import errno
import logging
import os
import os.path
import pycurl
import random
import re
import shutil
import signal
import stat
import tempfile
import time
import zlib
import contextlib
import collections

from ganeti import errors
from ganeti import http
from ganeti import utils
from ganeti import ssh
from ganeti import hypervisor
from ganeti.hypervisor import hv_base
from ganeti import constants
from ganeti.storage import bdev
from ganeti.storage import drbd
from ganeti.storage import extstorage
from ganeti.storage import filestorage
from ganeti import objects
from ganeti import ssconf
from ganeti import serializer
from ganeti import netutils
from ganeti import runtime
from ganeti import compat
from ganeti import pathutils
from ganeti import vcluster
from ganeti import ht
from ganeti.storage.base import BlockDev
from ganeti.storage.drbd import DRBD8
from ganeti import hooksmaster
import ganeti.metad as metad


_BOOT_ID_PATH = "/proc/sys/kernel/random/boot_id"
_ALLOWED_CLEAN_DIRS = compat.UniqueFrozenset([
  pathutils.DATA_DIR,
  pathutils.JOB_QUEUE_ARCHIVE_DIR,
  pathutils.QUEUE_DIR,
  pathutils.CRYPTO_KEYS_DIR,
  ])
_MAX_SSL_CERT_VALIDITY = 7 * 24 * 60 * 60
_X509_KEY_FILE = "key"
_X509_CERT_FILE = "cert"
_IES_STATUS_FILE = "status"
_IES_PID_FILE = "pid"
_IES_CA_FILE = "ca"

#: Valid LVS output line regex
_LVSLINE_REGEX = re.compile(r"^ *([^|]+)\|([^|]+)\|([0-9.]+)\|([^|]{6,})\|?$")
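
# Illustrative example (hypothetical volume): an lvs output line such as
#   "  xenvg|data-lv|10240.00|-wi-ao----"
# matches the regex above, yielding the VG name, LV name, size in MiB and
# the (at least six character long) attribute field.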

# Actions for the master setup script
_MASTER_START = "start"
_MASTER_STOP = "stop"

#: Maximum file permissions for restricted command directory and executables
_RCMD_MAX_MODE = (stat.S_IRWXU |
                  stat.S_IRGRP | stat.S_IXGRP |
                  stat.S_IROTH | stat.S_IXOTH)

#: Delay before returning an error for restricted commands
_RCMD_INVALID_DELAY = 10

#: How long to wait to acquire lock for restricted commands (shorter than
#: L{_RCMD_INVALID_DELAY}) to reduce blockage of noded forks when many
#: command requests arrive
_RCMD_LOCK_TIMEOUT = _RCMD_INVALID_DELAY * 0.8


class RPCFail(Exception):
  """Class denoting RPC failure.

  Its argument is the error message.

  """


def _GetInstReasonFilename(instance_name):
  """Path of the file containing the reason of the instance status change.

  @type instance_name: string
  @param instance_name: The name of the instance
  @rtype: string
  @return: The path of the file

  """
  return utils.PathJoin(pathutils.INSTANCE_REASON_DIR, instance_name)


def _StoreInstReasonTrail(instance_name, trail):
  """Serialize a reason trail related to an instance change of state to file.

  The exact location of the file depends on the name of the instance and on
  the configuration of the Ganeti cluster defined at deploy time.

  @type instance_name: string
  @param instance_name: The name of the instance

  @type trail: list of reasons
  @param trail: reason trail

  @rtype: None

  """
  json = serializer.DumpJson(trail)
  filename = _GetInstReasonFilename(instance_name)
  utils.WriteFile(filename, data=json)
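
# Illustrative call (hypothetical trail contents): a reason trail is a list
# of reason entries, e.g.
#   _StoreInstReasonTrail("instance1.example.com",
#                         [("gnt:command:startup", "user request", 1234567890)])
# serializes the trail to JSON and stores it under
# pathutils.INSTANCE_REASON_DIR.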


def _Fail(msg, *args, **kwargs):
  """Log an error and then raise an RPCFail exception.

  This exception is then handled specially in the ganeti daemon and
  turned into a 'failed' return type. As such, this function is a
  useful shortcut for logging the error and returning it to the master
  daemon.

  @type msg: string
  @param msg: the text of the exception
  @raise RPCFail

  """
  if args:
    msg = msg % args
  if "log" not in kwargs or kwargs["log"]:  # if we should log this error
    if "exc" in kwargs and kwargs["exc"]:
      logging.exception(msg)
    else:
      logging.error(msg)
  raise RPCFail(msg)
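
# Illustrative usage (hypothetical names): arguments are interpolated lazily
# and 'exc' controls traceback logging, e.g.
#   _Fail("Can't read file %s: %s", filename, err, exc=True)
# logs the full traceback and raises RPCFail with the formatted message.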


def _GetConfig():
  """Simple wrapper to return a SimpleStore.

  @rtype: L{ssconf.SimpleStore}
  @return: a SimpleStore instance

  """
  return ssconf.SimpleStore()


def _GetSshRunner(cluster_name):
  """Simple wrapper to return an SshRunner.

  @type cluster_name: str
  @param cluster_name: the cluster name, which is needed
    by the SshRunner constructor
  @rtype: L{ssh.SshRunner}
  @return: an SshRunner instance

  """
  return ssh.SshRunner(cluster_name)


def _Decompress(data):
  """Unpacks data compressed by the RPC client.

  @type data: list or tuple
  @param data: Data sent by RPC client
  @rtype: str
  @return: Decompressed data

  """
  assert isinstance(data, (list, tuple))
  assert len(data) == 2
  (encoding, content) = data
  if encoding == constants.RPC_ENCODING_NONE:
    return content
  elif encoding == constants.RPC_ENCODING_ZLIB_BASE64:
    return zlib.decompress(base64.b64decode(content))
  else:
    raise AssertionError("Unknown data encoding")
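
# Illustrative round-trip (hypothetical payload): the RPC client sends an
# (encoding, content) pair, so
#   _Decompress((constants.RPC_ENCODING_ZLIB_BASE64,
#                base64.b64encode(zlib.compress("payload"))))
# returns "payload", while RPC_ENCODING_NONE content is passed through as-is.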


def _CleanDirectory(path, exclude=None):
  """Removes all regular files in a directory.

  @type path: str
  @param path: the directory to clean
  @type exclude: list
  @param exclude: list of files to be excluded, defaults
    to the empty list

  """
  if path not in _ALLOWED_CLEAN_DIRS:
    _Fail("Path passed to _CleanDirectory not in allowed clean targets: '%s'",
          path)

  if not os.path.isdir(path):
    return
  if exclude is None:
    exclude = []
  else:
    # Normalize excluded paths
    exclude = [os.path.normpath(i) for i in exclude]

  for rel_name in utils.ListVisibleFiles(path):
    full_name = utils.PathJoin(path, rel_name)
    if full_name in exclude:
      continue
    if os.path.isfile(full_name) and not os.path.islink(full_name):
      utils.RemoveFile(full_name)
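
# Example usage (as in JobQueuePurge below): clean the queue directory while
# keeping the lock file, i.e.
#   _CleanDirectory(pathutils.QUEUE_DIR,
#                   exclude=[pathutils.JOB_QUEUE_LOCK_FILE])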


def _BuildUploadFileList():
  """Build the list of allowed upload files.

  This is abstracted so that it's built only once at module import time.

  """
  allowed_files = set([
    pathutils.CLUSTER_CONF_FILE,
    pathutils.ETC_HOSTS,
    pathutils.SSH_KNOWN_HOSTS_FILE,
    pathutils.VNC_PASSWORD_FILE,
    pathutils.RAPI_CERT_FILE,
    pathutils.SPICE_CERT_FILE,
    pathutils.SPICE_CACERT_FILE,
    pathutils.RAPI_USERS_FILE,
    pathutils.CONFD_HMAC_KEY,
    pathutils.CLUSTER_DOMAIN_SECRET_FILE,
    ])

  for hv_name in constants.HYPER_TYPES:
    hv_class = hypervisor.GetHypervisorClass(hv_name)
    allowed_files.update(hv_class.GetAncillaryFiles()[0])

  assert pathutils.FILE_STORAGE_PATHS_FILE not in allowed_files, \
    "Allowed file storage paths should never be uploaded via RPC"

  return frozenset(allowed_files)


_ALLOWED_UPLOAD_FILES = _BuildUploadFileList()


def JobQueuePurge():
  """Removes job queue files and archived jobs.

  @rtype: tuple
  @return: True, None

  """
  _CleanDirectory(pathutils.QUEUE_DIR, exclude=[pathutils.JOB_QUEUE_LOCK_FILE])
  _CleanDirectory(pathutils.JOB_QUEUE_ARCHIVE_DIR)


def GetMasterNodeName():
  """Returns the master node name.

  @rtype: string
  @return: name of the master node
  @raise RPCFail: in case of errors

  """
  try:
    return _GetConfig().GetMasterNode()
  except errors.ConfigurationError, err:
    _Fail("Cluster configuration incomplete: %s", err, exc=True)


def RunLocalHooks(hook_opcode, hooks_path, env_builder_fn):
  """Decorator that runs hooks before and after the decorated function.

  @type hook_opcode: string
  @param hook_opcode: opcode of the hook
  @type hooks_path: string
  @param hooks_path: path of the hooks
  @type env_builder_fn: function
  @param env_builder_fn: function that returns a dictionary containing the
    environment variables for the hooks. Will get all the parameters of the
    decorated function.
  @raise RPCFail: in case of pre-hook failure

  """
  def decorator(fn):
    def wrapper(*args, **kwargs):
      _, myself = ssconf.GetMasterAndMyself()
      nodes = ([myself], [myself])  # these hooks run locally

      env_fn = compat.partial(env_builder_fn, *args, **kwargs)

      cfg = _GetConfig()
      hr = HooksRunner()
      hm = hooksmaster.HooksMaster(hook_opcode, hooks_path, nodes,
                                   hr.RunLocalHooks, None, env_fn, None,
                                   logging.warning, cfg.GetClusterName(),
                                   cfg.GetMasterNode())
      hm.RunPhase(constants.HOOKS_PHASE_PRE)
      result = fn(*args, **kwargs)
      hm.RunPhase(constants.HOOKS_PHASE_POST)

      return result
    return wrapper
  return decorator
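
# Usage sketch: the decorator wraps a backend function so local hooks run
# around it, as done for ActivateMasterIp further below:
#   @RunLocalHooks(constants.FAKE_OP_MASTER_TURNUP, "master-ip-turnup",
#                  _BuildMasterIpEnv)
#   def ActivateMasterIp(master_params, use_external_mip_script):
#     ...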


def _BuildMasterIpEnv(master_params, use_external_mip_script=None):
  """Builds environment variables for master IP hooks.

  @type master_params: L{objects.MasterNetworkParameters}
  @param master_params: network parameters of the master
  @type use_external_mip_script: boolean
  @param use_external_mip_script: whether to use an external master IP
    address setup script (unused, but necessary per the implementation of the
    L{RunLocalHooks} decorator)

  """
  # pylint: disable=W0613
  ver = netutils.IPAddress.GetVersionFromAddressFamily(master_params.ip_family)
  env = {
    "MASTER_NETDEV": master_params.netdev,
    "MASTER_IP": master_params.ip,
    "MASTER_NETMASK": str(master_params.netmask),
    "CLUSTER_IP_VERSION": str(ver),
  }

  return env
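
# Illustrative result (hypothetical addresses): for an IPv4 master on eth0
# this returns something like
#   {"MASTER_NETDEV": "eth0", "MASTER_IP": "192.0.2.1",
#    "MASTER_NETMASK": "24", "CLUSTER_IP_VERSION": "4"}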


def _RunMasterSetupScript(master_params, action, use_external_mip_script):
  """Execute the master IP address setup script.

  @type master_params: L{objects.MasterNetworkParameters}
  @param master_params: network parameters of the master
  @type action: string
  @param action: action to pass to the script. Must be one of
    L{backend._MASTER_START} or L{backend._MASTER_STOP}
  @type use_external_mip_script: boolean
  @param use_external_mip_script: whether to use an external master IP
    address setup script
  @raise backend.RPCFail: if there are errors during the execution of the
    script

  """
  env = _BuildMasterIpEnv(master_params)

  if use_external_mip_script:
    setup_script = pathutils.EXTERNAL_MASTER_SETUP_SCRIPT
  else:
    setup_script = pathutils.DEFAULT_MASTER_SETUP_SCRIPT

  result = utils.RunCmd([setup_script, action], env=env, reset_env=True)

  if result.failed:
    _Fail("Failed to %s the master IP. Script return value: %s, output: '%s'" %
          (action, result.exit_code, result.output), log=True)


@RunLocalHooks(constants.FAKE_OP_MASTER_TURNUP, "master-ip-turnup",
               _BuildMasterIpEnv)
def ActivateMasterIp(master_params, use_external_mip_script):
  """Activate the IP address of the master daemon.

  @type master_params: L{objects.MasterNetworkParameters}
  @param master_params: network parameters of the master
  @type use_external_mip_script: boolean
  @param use_external_mip_script: whether to use an external master IP
    address setup script
  @raise RPCFail: in case of errors during the IP startup

  """
  _RunMasterSetupScript(master_params, _MASTER_START,
                        use_external_mip_script)


def StartMasterDaemons(no_voting):
  """Activate local node as master node.

  The function will start the master daemons (ganeti-masterd and ganeti-rapi).

  @type no_voting: boolean
  @param no_voting: whether to start ganeti-masterd without a node vote
    but still non-interactively
  @rtype: None

  """

  if no_voting:
    daemon_args = "--no-voting --yes-do-it"
  else:
    daemon_args = ""

  env = {
    "EXTRA_LUXID_ARGS": daemon_args,
    "EXTRA_WCONFD_ARGS": daemon_args,
  }

  result = utils.RunCmd([pathutils.DAEMON_UTIL, "start-master"], env=env)
  if result.failed:
    msg = "Can't start Ganeti master: %s" % result.output
    logging.error(msg)
    _Fail(msg)


@RunLocalHooks(constants.FAKE_OP_MASTER_TURNDOWN, "master-ip-turndown",
               _BuildMasterIpEnv)
def DeactivateMasterIp(master_params, use_external_mip_script):
  """Deactivate the master IP on this node.

  @type master_params: L{objects.MasterNetworkParameters}
  @param master_params: network parameters of the master
  @type use_external_mip_script: boolean
  @param use_external_mip_script: whether to use an external master IP
    address setup script
  @raise RPCFail: in case of errors during the IP turndown

  """
  _RunMasterSetupScript(master_params, _MASTER_STOP,
                        use_external_mip_script)


def StopMasterDaemons():
  """Stop the master daemons on this node.

  Stop the master daemons (ganeti-masterd and ganeti-rapi) on this node.

  @rtype: None

  """
  # TODO: log and report back to the caller the error failures; we
  # need to decide in which case we fail the RPC for this

  result = utils.RunCmd([pathutils.DAEMON_UTIL, "stop-master"])
  if result.failed:
    logging.error("Could not stop Ganeti master, command %s had exitcode %s"
                  " and error %s",
                  result.cmd, result.exit_code, result.output)


def ChangeMasterNetmask(old_netmask, netmask, master_ip, master_netdev):
  """Change the netmask of the master IP.

  @param old_netmask: the old value of the netmask
  @param netmask: the new value of the netmask
  @param master_ip: the master IP
  @param master_netdev: the master network device

  """
  if old_netmask == netmask:
    return

  if not netutils.IPAddress.Own(master_ip):
    _Fail("The master IP address is not up, not attempting to change its"
          " netmask")

  result = utils.RunCmd([constants.IP_COMMAND_PATH, "address", "add",
                         "%s/%s" % (master_ip, netmask),
                         "dev", master_netdev, "label",
                         "%s:0" % master_netdev])
  if result.failed:
    _Fail("Could not set the new netmask on the master IP address")

  result = utils.RunCmd([constants.IP_COMMAND_PATH, "address", "del",
                         "%s/%s" % (master_ip, old_netmask),
                         "dev", master_netdev, "label",
                         "%s:0" % master_netdev])
  if result.failed:
    _Fail("Could not bring down the master IP address with the old netmask")


def EtcHostsModify(mode, host, ip):
  """Modify a host entry in /etc/hosts.

  @param mode: The mode to operate. Either add or remove entry
  @param host: The host to operate on
  @param ip: The ip associated with the entry

  """
  if mode == constants.ETC_HOSTS_ADD:
    if not ip:
      _Fail("Mode 'add' needs 'ip' parameter, but parameter not present")
    utils.AddHostToEtcHosts(host, ip)
  elif mode == constants.ETC_HOSTS_REMOVE:
    if ip:
      _Fail("Mode 'remove' does not allow 'ip' parameter, but parameter is"
            " present")
    utils.RemoveHostFromEtcHosts(host)
  else:
    _Fail("Mode not supported")


def LeaveCluster(modify_ssh_setup):
  """Cleans up and removes the current node.

  This function cleans up and prepares the current node to be removed
  from the cluster.

  If processing is successful, then it raises an
  L{errors.QuitGanetiException} which is used as a special case to
  shut down the node daemon.

  @param modify_ssh_setup: boolean

  """
  _CleanDirectory(pathutils.DATA_DIR)
  _CleanDirectory(pathutils.CRYPTO_KEYS_DIR)
  JobQueuePurge()

  if modify_ssh_setup:
    try:
      priv_key, pub_key, auth_keys = ssh.GetUserFiles(constants.SSH_LOGIN_USER)

      ssh.RemoveAuthorizedKey(auth_keys, utils.ReadFile(pub_key))

      utils.RemoveFile(priv_key)
      utils.RemoveFile(pub_key)
    except errors.OpExecError:
      logging.exception("Error while processing ssh files")
    except IOError:
      logging.exception("At least one SSH file was not accessible.")

  try:
    utils.RemoveFile(pathutils.CONFD_HMAC_KEY)
    utils.RemoveFile(pathutils.RAPI_CERT_FILE)
    utils.RemoveFile(pathutils.SPICE_CERT_FILE)
    utils.RemoveFile(pathutils.SPICE_CACERT_FILE)
    utils.RemoveFile(pathutils.NODED_CERT_FILE)
  except:  # pylint: disable=W0702
    logging.exception("Error while removing cluster secrets")

  utils.StopDaemon(constants.CONFD)
  utils.StopDaemon(constants.MOND)
  utils.StopDaemon(constants.KVMD)

  # Raise a custom exception (handled in ganeti-noded)
  raise errors.QuitGanetiException(True, "Shutdown scheduled")


def _CheckStorageParams(params, num_params):
  """Performs sanity checks for storage parameters.

  @type params: list
  @param params: list of storage parameters
  @type num_params: int
  @param num_params: expected number of parameters

  """
  if params is None:
    raise errors.ProgrammerError("No storage parameters for storage"
                                 " reporting were provided.")
  if not isinstance(params, list):
    raise errors.ProgrammerError("The storage parameters are not of type"
                                 " list: '%s'" % params)
  if not len(params) == num_params:
    raise errors.ProgrammerError("Did not receive the expected number of"
                                 " storage parameters: expected %s,"
                                 " received '%s'" % (num_params, len(params)))


def _CheckLvmStorageParams(params):
  """Performs sanity check for the 'exclusive storage' flag.

  @see: C{_CheckStorageParams}

  """
  _CheckStorageParams(params, 1)
  excl_stor = params[0]
  if not isinstance(params[0], bool):
    raise errors.ProgrammerError("Exclusive storage parameter is not"
                                 " boolean: '%s'." % excl_stor)
  return excl_stor
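
# Example (typical LVM call): the storage parameter list for LVM backends
# carries a single boolean, the 'exclusive storage' flag, so
#   _CheckLvmStorageParams([True])
# returns True, while a malformed list raises errors.ProgrammerError.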


def _GetLvmVgSpaceInfo(name, params):
  """Wrapper around C{_GetVgInfo} which checks the storage parameters.

  @type name: string
  @param name: name of the volume group
  @type params: list
  @param params: list of storage parameters, which in this case should be
    containing only one for exclusive storage

  """
  excl_stor = _CheckLvmStorageParams(params)
  return _GetVgInfo(name, excl_stor)


def _GetVgInfo(
    name, excl_stor, info_fn=bdev.LogicalVolume.GetVGInfo):
  """Retrieves information about an LVM volume group.

  """
  # TODO: GetVGInfo supports returning information for multiple VGs at once
  vginfo = info_fn([name], excl_stor)
  if vginfo:
    vg_free = int(round(vginfo[0][0], 0))
    vg_size = int(round(vginfo[0][1], 0))
  else:
    vg_free = None
    vg_size = None

  return {
    "type": constants.ST_LVM_VG,
    "name": name,
    "storage_free": vg_free,
    "storage_size": vg_size,
    }
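
# Illustrative result (hypothetical sizes in MiB):
#   _GetVgInfo("xenvg", False)
# might return
#   {"type": constants.ST_LVM_VG, "name": "xenvg",
#    "storage_free": 2048, "storage_size": 4096}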


def _GetLvmPvSpaceInfo(name, params):
  """Wrapper around C{_GetVgSpindlesInfo} with sanity checks.

  @see: C{_GetLvmVgSpaceInfo}

  """
  excl_stor = _CheckLvmStorageParams(params)
  return _GetVgSpindlesInfo(name, excl_stor)


def _GetVgSpindlesInfo(
    name, excl_stor, info_fn=bdev.LogicalVolume.GetVgSpindlesInfo):
  """Retrieves information about spindles in an LVM volume group.

  @type name: string
  @param name: VG name
  @type excl_stor: bool
  @param excl_stor: exclusive storage
  @rtype: dict
  @return: dictionary whose keys are "type", "name", "storage_free" and
    "storage_size", holding the storage type, the VG name, the free spindles
    and the total spindles respectively

  """
  if excl_stor:
    (vg_free, vg_size) = info_fn(name)
  else:
    vg_free = 0
    vg_size = 0
  return {
    "type": constants.ST_LVM_PV,
    "name": name,
    "storage_free": vg_free,
    "storage_size": vg_size,
    }


def _GetHvInfo(name, hvparams, get_hv_fn=hypervisor.GetHypervisor):
  """Retrieves node information from a hypervisor.

  The information returned depends on the hypervisor. Common items:

    - vg_size is the size of the configured volume group in MiB
    - vg_free is the free size of the volume group in MiB
    - memory_dom0 is the memory allocated for domain0 in MiB
    - memory_free is the currently available (free) RAM in MiB
    - memory_total is the total amount of RAM in MiB
    - hv_version: the hypervisor version, if available

  @type hvparams: dict of string
  @param hvparams: the hypervisor's hvparams

  """
  return get_hv_fn(name).GetNodeInfo(hvparams=hvparams)


def _GetHvInfoAll(hv_specs, get_hv_fn=hypervisor.GetHypervisor):
  """Retrieves node information for all hypervisors.

  See C{_GetHvInfo} for information on the output.

  @type hv_specs: list of pairs (string, dict of strings)
  @param hv_specs: list of pairs of a hypervisor's name and its hvparams

  """
  if hv_specs is None:
    return None

  result = []
  for hvname, hvparams in hv_specs:
    result.append(_GetHvInfo(hvname, hvparams, get_hv_fn))
  return result


def _GetNamedNodeInfo(names, fn):
  """Calls C{fn} for all names in C{names} and returns a list of the results.

  @rtype: None or list

  """
  if names is None:
    return None
  else:
    return map(fn, names)


def GetNodeInfo(storage_units, hv_specs):
  """Gives back a hash with different information about the node.

  @type storage_units: list of tuples (string, string, list)
  @param storage_units: List of tuples (storage unit, identifier, parameters)
    to ask for disk space information. In case of lvm-vg, the identifier is
    the VG name. The parameters can contain additional, storage-type-specific
    parameters, for example exclusive storage for lvm storage.
  @type hv_specs: list of pairs (string, dict of strings)
  @param hv_specs: list of pairs of a hypervisor's name and its hvparams
  @rtype: tuple; (string, None/dict, None/dict)
  @return: Tuple containing boot ID, volume group information and hypervisor
    information

  """
  bootid = utils.ReadFile(_BOOT_ID_PATH, size=128).rstrip("\n")
  storage_info = _GetNamedNodeInfo(
    storage_units,
    (lambda (storage_type, storage_key, storage_params):
        _ApplyStorageInfoFunction(storage_type, storage_key, storage_params)))
  hv_info = _GetHvInfoAll(hv_specs)
  return (bootid, storage_info, hv_info)
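
# Illustrative call (hypothetical units): asking for LVM VG space with
# exclusive storage disabled and for KVM node info could look like
#   GetNodeInfo([(constants.ST_LVM_VG, "xenvg", [False])],
#               [(constants.HT_KVM, {})])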


def _GetFileStorageSpaceInfo(path, params):
  """Wrapper around filestorage.GetSpaceInfo.

  The purpose of this wrapper is to call filestorage.GetFileStorageSpaceInfo
  and ignore the *args parameter to not leak it into the filestorage
  module's code.

  @see: C{filestorage.GetFileStorageSpaceInfo} for description of the
    parameters.

  """
  _CheckStorageParams(params, 0)
  return filestorage.GetFileStorageSpaceInfo(path)


# FIXME: implement storage reporting for all missing storage types.
_STORAGE_TYPE_INFO_FN = {
  constants.ST_BLOCK: None,
  constants.ST_DISKLESS: None,
  constants.ST_EXT: None,
  constants.ST_FILE: _GetFileStorageSpaceInfo,
  constants.ST_LVM_PV: _GetLvmPvSpaceInfo,
  constants.ST_LVM_VG: _GetLvmVgSpaceInfo,
  constants.ST_SHARED_FILE: None,
  constants.ST_GLUSTER: None,
  constants.ST_RADOS: None,
  }


def _ApplyStorageInfoFunction(storage_type, storage_key, *args):
  """Looks up and applies the correct function to calculate free and total
  storage for the given storage type.

  @type storage_type: string
  @param storage_type: the storage type for which the storage shall be
    reported.
  @type storage_key: string
  @param storage_key: identifier of a storage unit, e.g. the volume group name
    of an LVM storage unit
  @type args: any
  @param args: various parameters that can be used for storage reporting. These
    parameters and their semantics vary from storage type to storage type and
    are just propagated in this function.
  @return: the results of the application of the storage space function (see
    _STORAGE_TYPE_INFO_FN) if storage space reporting is implemented for that
    storage type
  @raises NotImplementedError: for storage types that don't support space
    reporting yet

  """
  fn = _STORAGE_TYPE_INFO_FN[storage_type]
  if fn is not None:
    return fn(storage_key, *args)
  else:
    raise NotImplementedError


def _CheckExclusivePvs(pvi_list):
  """Check that PVs are not shared among LVs

  @type pvi_list: list of L{objects.LvmPvInfo} objects
  @param pvi_list: information about the PVs

  @rtype: list of tuples (string, list of strings)
  @return: offending volumes, as tuples: (pv_name, [lv1_name, lv2_name...])

  """
  res = []
  for pvi in pvi_list:
    if len(pvi.lv_list) > 1:
      res.append((pvi.name, pvi.lv_list))
  return res


def _VerifyHypervisors(what, vm_capable, result, all_hvparams,
                       get_hv_fn=hypervisor.GetHypervisor):
  """Verifies the hypervisor. Adds the verification results to 'result'.

  @type what: C{dict}
  @param what: a dictionary of things to check
  @type vm_capable: boolean
  @param vm_capable: whether or not this node is vm capable
  @type result: dict
  @param result: dictionary of verification results; results of the
    verifications in this function will be added here
  @type all_hvparams: dict of dict of string
  @param all_hvparams: dictionary mapping hypervisor names to hvparams
  @type get_hv_fn: function
  @param get_hv_fn: function to retrieve the hypervisor, to improve testability

  """
  if not vm_capable:
    return

  if constants.NV_HYPERVISOR in what:
    result[constants.NV_HYPERVISOR] = {}
    for hv_name in what[constants.NV_HYPERVISOR]:
      hvparams = all_hvparams[hv_name]
      try:
        val = get_hv_fn(hv_name).Verify(hvparams=hvparams)
      except errors.HypervisorError, err:
        val = "Error while checking hypervisor: %s" % str(err)
      result[constants.NV_HYPERVISOR][hv_name] = val


def _VerifyHvparams(what, vm_capable, result,
                    get_hv_fn=hypervisor.GetHypervisor):
  """Verifies the hvparams. Adds the verification results to 'result'.

  @type what: C{dict}
  @param what: a dictionary of things to check
  @type vm_capable: boolean
  @param vm_capable: whether or not this node is vm capable
  @type result: dict
  @param result: dictionary of verification results; results of the
    verifications in this function will be added here
  @type get_hv_fn: function
  @param get_hv_fn: function to retrieve the hypervisor, to improve testability

  """
  if not vm_capable:
    return

  if constants.NV_HVPARAMS in what:
    result[constants.NV_HVPARAMS] = []
    for source, hv_name, hvparms in what[constants.NV_HVPARAMS]:
      try:
        logging.info("Validating hv %s, %s", hv_name, hvparms)
        get_hv_fn(hv_name).ValidateParameters(hvparms)
      except errors.HypervisorError, err:
        result[constants.NV_HVPARAMS].append((source, hv_name, str(err)))


def _VerifyInstanceList(what, vm_capable, result, all_hvparams):
  """Verifies the instance list.

  @type what: C{dict}
  @param what: a dictionary of things to check
  @type vm_capable: boolean
  @param vm_capable: whether or not this node is vm capable
  @type result: dict
  @param result: dictionary of verification results; results of the
    verifications in this function will be added here
  @type all_hvparams: dict of dict of string
  @param all_hvparams: dictionary mapping hypervisor names to hvparams

  """
  if constants.NV_INSTANCELIST in what and vm_capable:
    # GetInstanceList can fail
    try:
      val = GetInstanceList(what[constants.NV_INSTANCELIST],
                            all_hvparams=all_hvparams)
    except RPCFail, err:
      val = str(err)
    result[constants.NV_INSTANCELIST] = val


def _VerifyNodeInfo(what, vm_capable, result, all_hvparams):
  """Verifies the node info.

  @type what: C{dict}
  @param what: a dictionary of things to check
  @type vm_capable: boolean
  @param vm_capable: whether or not this node is vm capable
  @type result: dict
  @param result: dictionary of verification results; results of the
    verifications in this function will be added here
  @type all_hvparams: dict of dict of string
  @param all_hvparams: dictionary mapping hypervisor names to hvparams

  """
  if constants.NV_HVINFO in what and vm_capable:
    hvname = what[constants.NV_HVINFO]
    hyper = hypervisor.GetHypervisor(hvname)
    hvparams = all_hvparams[hvname]
    result[constants.NV_HVINFO] = hyper.GetNodeInfo(hvparams=hvparams)


def _VerifyClientCertificate(cert_file=pathutils.NODED_CLIENT_CERT_FILE):
  """Verify the existence and validity of the client SSL certificate.

  Also, verify that the client certificate is not self-signed. Self-
  signed client certificates stem from Ganeti versions 2.12.0 - 2.12.4
  and should be replaced by client certificates signed by the server
  certificate. Hence we output a warning when we encounter a self-signed
  one.

  """
  create_cert_cmd = "gnt-cluster renew-crypto --new-node-certificates"
  if not os.path.exists(cert_file):
    return (constants.CV_ERROR,
            "The client certificate does not exist. Run '%s' to create"
            " client certificates for all nodes." % create_cert_cmd)

  (errcode, msg) = utils.VerifyCertificate(cert_file)
  if errcode is not None:
    return (errcode, msg)

  (errcode, msg) = utils.IsCertificateSelfSigned(cert_file)
  if errcode is not None:
    return (errcode, msg)

  # if everything is fine, we return the digest to be compared to the config
  return (None, utils.GetCertificateDigest(cert_filename=cert_file))


def _VerifySshSetup(node_status_list, my_name,
                    pub_key_file=pathutils.SSH_PUB_KEYS):
  """Verifies the state of the SSH key files.

  @type node_status_list: list of tuples
  @param node_status_list: list of nodes of the cluster associated with a
    couple of flags: (uuid, name, is_master_candidate,
    is_potential_master_candidate, online)
  @type my_name: str
  @param my_name: name of this node
  @type pub_key_file: str
  @param pub_key_file: filename of the public key file

  """
  if node_status_list is None:
    return ["No node list to check against the pub_key_file received."]

  my_status_list = [(my_uuid, name, mc, pot_mc, online) for
                    (my_uuid, name, mc, pot_mc, online)
                    in node_status_list if name == my_name]
  if len(my_status_list) == 0:
    return ["Cannot find node information for node '%s'." % my_name]
  (my_uuid, _, _, potential_master_candidate, online) = \
    my_status_list[0]

  result = []

  if not os.path.exists(pub_key_file):
    result.append("The public key file '%s' does not exist. Consider running"
                  " 'gnt-cluster renew-crypto --new-ssh-keys"
                  " [--no-ssh-key-check]' to fix this." % pub_key_file)
    return result

  pot_mc_uuids = [uuid for (uuid, _, _, _, _) in node_status_list]
  offline_nodes = [uuid for (uuid, _, _, _, online) in node_status_list
                   if not online]
  pub_keys = ssh.QueryPubKeyFile(None)
  # initialized here so it is defined for the authorized_keys check below
  # even when this node is not a potential master candidate
  missing_uuids = set([])

  if potential_master_candidate:
    # Check that the set of potential master candidates matches the
    # public key file
    pub_uuids_set = set(pub_keys.keys()) - set(offline_nodes)
    pot_mc_uuids_set = set(pot_mc_uuids) - set(offline_nodes)
    if pub_uuids_set != pot_mc_uuids_set:
      unknown_uuids = pub_uuids_set - pot_mc_uuids_set
      if unknown_uuids:
        result.append("The following node UUIDs are listed in the public key"
                      " file on node '%s', but are not potential master"
                      " candidates: %s."
                      % (my_name, ", ".join(list(unknown_uuids))))
      missing_uuids = pot_mc_uuids_set - pub_uuids_set
      if missing_uuids:
        result.append("The following node UUIDs of potential master candidates"
                      " are missing in the public key file on node %s: %s."
                      % (my_name, ", ".join(list(missing_uuids))))

    (_, key_files) = \
      ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False,
                          dircheck=False)
    (_, dsa_pub_key_filename) = key_files[constants.SSHK_DSA]

    my_keys = pub_keys[my_uuid]

    dsa_pub_key = utils.ReadFile(dsa_pub_key_filename)
    if dsa_pub_key.strip() not in my_keys:
      result.append("The dsa key of node %s does not match this node's key"
                    " in the pub key file." % (my_name))
    if len(my_keys) != 1:
      result.append("There is more than one key for node %s in the public key"
                    " file." % my_name)
  else:
    if len(pub_keys.keys()) > 0:
      result.append("The public key file of node '%s' is not empty, although"
                    " the node is not a potential master candidate."
                    % my_name)

  # Check that all master candidate keys are in the authorized_keys file
  (auth_key_file, _) = \
    ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
  for (uuid, name, mc, _, online) in node_status_list:
    if not online:
      continue
    if uuid in missing_uuids:
      continue
    if mc:
      for key in pub_keys[uuid]:
        if not ssh.HasAuthorizedKey(auth_key_file, key):
          result.append("A SSH key of master candidate '%s' (UUID: '%s') is"
                        " not in the 'authorized_keys' file of node '%s'."
                        % (name, uuid, my_name))
    else:
      for key in pub_keys[uuid]:
        if name != my_name and ssh.HasAuthorizedKey(auth_key_file, key):
          result.append("A SSH key of normal node '%s' (UUID: '%s') is in the"
                        " 'authorized_keys' file of node '%s'."
                        % (name, uuid, my_name))
        if name == my_name and not ssh.HasAuthorizedKey(auth_key_file, key):
          result.append("A SSH key of normal node '%s' (UUID: '%s') is not"
                        " in the 'authorized_keys' file of itself."
                        % (my_name, uuid))

  return result


def _VerifySshClutter(node_status_list, my_name):
  """Verifies that the 'authorized_keys' files are not cluttered up.

  @type node_status_list: list of tuples
  @param node_status_list: list of nodes of the cluster associated with a
    couple of flags: (uuid, name, is_master_candidate,
    is_potential_master_candidate, online)
  @type my_name: str
  @param my_name: name of this node

  """
  result = []
  (auth_key_file, _) = \
    ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
  node_names = [name for (_, name, _, _, _) in node_status_list]
  multiple_occurrences = ssh.CheckForMultipleKeys(auth_key_file, node_names)
  if multiple_occurrences:
    msg = "There are hosts which have more than one SSH key stored for the" \
          " same user in the 'authorized_keys' file of node %s. This can be" \
          " due to an unsuccessful operation which cluttered up the" \
          " 'authorized_keys' file. We recommend to clean this up manually. " \
          % my_name
    for host, occ in multiple_occurrences.items():
      msg += "Entry for '%s' in lines %s. " % (host, utils.CommaJoin(occ))
    result.append(msg)

  return result


def VerifyNode(what, cluster_name, all_hvparams, node_groups, groups_cfg):
  """Verify the status of the local node.

  Based on the input L{what} parameter, various checks are done on the
  local node.

  If the I{filelist} key is present, this list of
  files is checksummed and the file/checksum pairs are returned.

  If the I{nodelist} key is present, we check that we have
  connectivity via ssh with the target nodes (and check the hostname
  report).

  If the I{node-net-test} key is present, we check that we have
  connectivity to the given nodes via both primary IP and, if
  applicable, secondary IPs.

  @type what: C{dict}
  @param what: a dictionary of things to check:
    - filelist: list of files for which to compute checksums
    - nodelist: list of nodes we should check ssh communication with
    - node-net-test: list of nodes we should check node daemon port
      connectivity with
    - hypervisor: list with hypervisors to run the verify for
  @type cluster_name: string
  @param cluster_name: the cluster's name
  @type all_hvparams: dict of dict of strings
  @param all_hvparams: a dictionary mapping hypervisor names to hvparams
  @type node_groups: a dict of strings
  @param node_groups: node _names_ mapped to their group uuids (it's enough to
    have only those nodes that are in `what["nodelist"]`)
  @type groups_cfg: a dict of dict of strings
  @param groups_cfg: a dictionary mapping group uuids to their configuration
  @rtype: dict
  @return: a dictionary with the same keys as the input dict, and
    values representing the result of the checks

  """
  result = {}
  my_name = netutils.Hostname.GetSysName()
  port = netutils.GetDaemonPort(constants.NODED)
  vm_capable = my_name not in what.get(constants.NV_NONVMNODES, [])

  _VerifyHypervisors(what, vm_capable, result, all_hvparams)
  _VerifyHvparams(what, vm_capable, result)

  if constants.NV_FILELIST in what:
    fingerprints = utils.FingerprintFiles(map(vcluster.LocalizeVirtualPath,
                                              what[constants.NV_FILELIST]))
    result[constants.NV_FILELIST] = \
      dict((vcluster.MakeVirtualPath(key), value)
           for (key, value) in fingerprints.items())

  if constants.NV_CLIENT_CERT in what:
    result[constants.NV_CLIENT_CERT] = _VerifyClientCertificate()

  if constants.NV_SSH_SETUP in what:
    result[constants.NV_SSH_SETUP] = \
      _VerifySshSetup(what[constants.NV_SSH_SETUP], my_name)
    if constants.NV_SSH_CLUTTER in what:
      result[constants.NV_SSH_CLUTTER] = \
        _VerifySshClutter(what[constants.NV_SSH_SETUP], my_name)

  if constants.NV_NODELIST in what:
    (nodes, bynode, mcs) = what[constants.NV_NODELIST]

    # Add nodes from other groups (different for each node)
    try:
      nodes.extend(bynode[my_name])
    except KeyError:
      pass

    # Use a random order
    random.shuffle(nodes)

    # Try to contact all nodes
    val = {}
    for node in nodes:
      params = groups_cfg.get(node_groups.get(node))
      ssh_port = params["ndparams"].get(constants.ND_SSH_PORT)
      logging.debug("Ssh port %s (None = default) for node %s",
                    str(ssh_port), node)

      # We only test if master candidates can communicate to other nodes.
      # We cannot test if normal nodes cannot communicate with other nodes,
      # because the administrator might have installed additional SSH keys,
      # over which Ganeti has no power.
      if my_name in mcs:
        success, message = _GetSshRunner(cluster_name). \
                              VerifyNodeHostname(node, ssh_port)
        if not success:
          val[node] = message

    result[constants.NV_NODELIST] = val

  if constants.NV_NODENETTEST in what:
    result[constants.NV_NODENETTEST] = tmp = {}
    my_pip = my_sip = None
    for name, pip, sip in what[constants.NV_NODENETTEST]:
      if name == my_name:
        my_pip = pip
        my_sip = sip
        break
    if not my_pip:
      tmp[my_name] = ("Can't find my own primary/secondary IP"
                      " in the node list")
    else:
      for name, pip, sip in what[constants.NV_NODENETTEST]:
        fail = []
        if not netutils.TcpPing(pip, port, source=my_pip):
          fail.append("primary")
        if sip != pip:
          if not netutils.TcpPing(sip, port, source=my_sip):
            fail.append("secondary")
        if fail:
          tmp[name] = ("failure using the %s interface(s)" %
                       " and ".join(fail))

  if constants.NV_MASTERIP in what:
    # FIXME: add checks on incoming data structures (here and in the
    # rest of the function)
    master_name, master_ip = what[constants.NV_MASTERIP]
    if master_name == my_name:
      source = constants.IP4_ADDRESS_LOCALHOST
    else:
      source = None
    result[constants.NV_MASTERIP] = netutils.TcpPing(master_ip, port,
                                                     source=source)

  if constants.NV_USERSCRIPTS in what:
    result[constants.NV_USERSCRIPTS] = \
      [script for script in what[constants.NV_USERSCRIPTS]
       if not utils.IsExecutable(script)]

  if constants.NV_OOB_PATHS in what:
    result[constants.NV_OOB_PATHS] = tmp = []
    for path in what[constants.NV_OOB_PATHS]:
      try:
        st = os.stat(path)
      except OSError, err:
        tmp.append("error stating out of band helper: %s" % err)
      else:
        if stat.S_ISREG(st.st_mode):
          if stat.S_IMODE(st.st_mode) & stat.S_IXUSR:
            tmp.append(None)
          else:
            tmp.append("out of band helper %s is not executable" % path)
        else:
          tmp.append("out of band helper %s is not a file" % path)

  if constants.NV_LVLIST in what and vm_capable:
    try:
      val = GetVolumeList(utils.ListVolumeGroups().keys())
    except RPCFail, err:
      val = str(err)
    result[constants.NV_LVLIST] = val

  _VerifyInstanceList(what, vm_capable, result, all_hvparams)

  if constants.NV_VGLIST in what and vm_capable:
    result[constants.NV_VGLIST] = utils.ListVolumeGroups()

  if constants.NV_PVLIST in what and vm_capable:
    check_exclusive_pvs = constants.NV_EXCLUSIVEPVS in what
    val = bdev.LogicalVolume.GetPVInfo(what[constants.NV_PVLIST],
                                       filter_allocatable=False,
                                       include_lvs=check_exclusive_pvs)
    if check_exclusive_pvs:
      result[constants.NV_EXCLUSIVEPVS] = _CheckExclusivePvs(val)
      for pvi in val:
        # Avoid sending useless data on the wire
        pvi.lv_list = []
    result[constants.NV_PVLIST] = map(objects.LvmPvInfo.ToDict, val)

  if constants.NV_VERSION in what:
    result[constants.NV_VERSION] = (constants.PROTOCOL_VERSION,
                                    constants.RELEASE_VERSION)

  _VerifyNodeInfo(what, vm_capable, result, all_hvparams)

  if constants.NV_DRBDVERSION in what and vm_capable:
    try:
      drbd_version = DRBD8.GetProcInfo().GetVersionString()
    except errors.BlockDeviceError, err:
      logging.warning("Can't get DRBD version", exc_info=True)
      drbd_version = str(err)
    result[constants.NV_DRBDVERSION] = drbd_version

  if constants.NV_DRBDLIST in what and vm_capable:
    try:
      used_minors = drbd.DRBD8.GetUsedDevs()
    except errors.BlockDeviceError, err:
      logging.warning("Can't get used minors list", exc_info=True)
      used_minors = str(err)
    result[constants.NV_DRBDLIST] = used_minors

  if constants.NV_DRBDHELPER in what and vm_capable:
    status = True
    try:
      payload = drbd.DRBD8.GetUsermodeHelper()
    except errors.BlockDeviceError, err:
      logging.error("Can't get DRBD usermode helper: %s", str(err))
      status = False
      payload = str(err)
    result[constants.NV_DRBDHELPER] = (status, payload)

  if constants.NV_NODESETUP in what:
    result[constants.NV_NODESETUP] = tmpr = []
    if not os.path.isdir("/sys/block") or not os.path.isdir("/sys/class/net"):
      tmpr.append("The sysfs filesystem doesn't seem to be mounted"
                  " under /sys, missing required directories /sys/block"
                  " and /sys/class/net")
    if (not os.path.isdir("/proc/sys") or
        not os.path.isfile("/proc/sysrq-trigger")):
      tmpr.append("The procfs filesystem doesn't seem to be mounted"
                  " under /proc, missing required directory /proc/sys and"
                  " the file /proc/sysrq-trigger")

  if constants.NV_TIME in what:
    result[constants.NV_TIME] = utils.SplitTime(time.time())

  if constants.NV_OSLIST in what and vm_capable:
    result[constants.NV_OSLIST] = DiagnoseOS()

  if constants.NV_BRIDGES in what and vm_capable:
    result[constants.NV_BRIDGES] = [bridge
                                    for bridge in what[constants.NV_BRIDGES]
                                    if not utils.BridgeExists(bridge)]

  if what.get(constants.NV_ACCEPTED_STORAGE_PATHS) == my_name:
    result[constants.NV_ACCEPTED_STORAGE_PATHS] = \
      filestorage.ComputeWrongFileStoragePaths()

  if what.get(constants.NV_FILE_STORAGE_PATH):
    pathresult = filestorage.CheckFileStoragePath(
      what[constants.NV_FILE_STORAGE_PATH])
    if pathresult:
      result[constants.NV_FILE_STORAGE_PATH] = pathresult

  if what.get(constants.NV_SHARED_FILE_STORAGE_PATH):
    pathresult = filestorage.CheckFileStoragePath(
      what[constants.NV_SHARED_FILE_STORAGE_PATH])
    if pathresult:
      result[constants.NV_SHARED_FILE_STORAGE_PATH] = pathresult

  return result


def GetCryptoTokens(token_requests):
  """Perform actions on the node's cryptographic tokens.

  Token types can be 'ssl' or 'ssh'. So far only some actions are implemented
  for 'ssl'. Action 'get' returns the digest of the public client ssl
  certificate. Action 'create' creates a new client certificate and private key
  and also returns the digest of the certificate. The third element of a
  token request holds optional parameters for the actions; so far only the
  filename is supported.

  @type token_requests: list of tuples of (string, string, dict), where the
    first string is in constants.CRYPTO_TYPES, the second in
    constants.CRYPTO_ACTIONS. The third parameter is a dictionary of string
    to string.
  @param token_requests: list of requests of cryptographic tokens and actions
    to perform on them. The actions come with a dictionary of options.
  @rtype: list of tuples (string, string)
  @return: list of tuples of the token type and the public crypto token

  """
  tokens = []
  for (token_type, action, _) in token_requests:
    if token_type not in constants.CRYPTO_TYPES:
      raise errors.ProgrammerError("Token type '%s' not supported." %
                                   token_type)
    if action not in constants.CRYPTO_ACTIONS:
      raise errors.ProgrammerError("Action '%s' is not supported." %
                                   action)
    if token_type == constants.CRYPTO_TYPE_SSL_DIGEST:
      tokens.append((token_type,
                     utils.GetCertificateDigest()))
  return tokens
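
# Illustrative request (action name elided on purpose): a request for the SSL
# digest, with an action from constants.CRYPTO_ACTIONS and no extra options,
# is a tuple of the form (constants.CRYPTO_TYPE_SSL_DIGEST, <action>, {}) and
# yields [(constants.CRYPTO_TYPE_SSL_DIGEST, "<digest>")].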


def EnsureDaemon(daemon_name, run):
  """Ensures the given daemon is running or stopped.

  @type daemon_name: string
  @param daemon_name: name of the daemon (e.g., constants.KVMD)

  @type run: bool
  @param run: whether to start or stop the daemon

  @rtype: bool
  @return: 'True' if daemon successfully started/stopped,
    'False' otherwise

  """
  allowed_daemons = [constants.KVMD]

  if daemon_name not in allowed_daemons:
    fn = lambda _: False
  elif run:
    fn = utils.EnsureDaemon
  else:
    fn = utils.StopDaemon

  return fn(daemon_name)
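
# Example: EnsureDaemon(constants.KVMD, True) ensures the KVM daemon is
# running and EnsureDaemon(constants.KVMD, False) stops it; any daemon name
# outside 'allowed_daemons' is rejected and simply returns False.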


def _InitSshUpdateData(data, noded_cert_file, ssconf_store):
  (_, noded_cert) = \
    utils.ExtractX509Certificate(utils.ReadFile(noded_cert_file))
  data[constants.SSHS_NODE_DAEMON_CERTIFICATE] = noded_cert

  cluster_name = ssconf_store.GetClusterName()
  data[constants.SSHS_CLUSTER_NAME] = cluster_name


def AddNodeSshKey(node_uuid, node_name,
                  potential_master_candidates,
                  to_authorized_keys=False,
                  to_public_keys=False,
                  get_public_keys=False,
                  pub_key_file=pathutils.SSH_PUB_KEYS,
                  ssconf_store=None,
                  noded_cert_file=pathutils.NODED_CERT_FILE,
                  run_cmd_fn=ssh.RunSshCmdWithStdin):
  """Distributes a node's public SSH key across the cluster.

  Note that this function should only be executed on the master node, which
  then will copy the new node's key to all nodes in the cluster via SSH.

  Also note: at least one of the flags C{to_authorized_keys},
  C{to_public_keys}, and C{get_public_keys} has to be set to C{True} for
  the function to actually perform any actions.

  @type node_uuid: str
  @param node_uuid: the UUID of the node whose key is added
  @type node_name: str
  @param node_name: the name of the node whose key is added
  @type potential_master_candidates: list of str
  @param potential_master_candidates: list of node names of potential master
    candidates; this should match the list of uuids in the public key file
  @type to_authorized_keys: boolean
  @param to_authorized_keys: whether the key should be added to the
    C{authorized_keys} file of all nodes
  @type to_public_keys: boolean
  @param to_public_keys: whether the keys should be added to the public key
    file
  @type get_public_keys: boolean
  @param get_public_keys: whether the node should add the cluster's public
    keys to its C{ganeti_pub_keys} file

  """
  node_list = [SshAddNodeInfo(name=node_name, uuid=node_uuid,
                              to_authorized_keys=to_authorized_keys,
                              to_public_keys=to_public_keys,
                              get_public_keys=get_public_keys)]
  return AddNodeSshKeyBulk(node_list,
                           potential_master_candidates,
                           pub_key_file=pub_key_file,
                           ssconf_store=ssconf_store,
                           noded_cert_file=noded_cert_file,
                           run_cmd_fn=run_cmd_fn)


# Node info named tuple specifically for the use with AddNodeSshKeyBulk
SshAddNodeInfo = collections.namedtuple(
  "SshAddNodeInfo",
  ["uuid",
   "name",
   "to_authorized_keys",
   "to_public_keys",
   "get_public_keys"])


def AddNodeSshKeyBulk(node_list,
                      potential_master_candidates,
                      pub_key_file=pathutils.SSH_PUB_KEYS,
                      ssconf_store=None,
                      noded_cert_file=pathutils.NODED_CERT_FILE,
                      run_cmd_fn=ssh.RunSshCmdWithStdin):
  """Distributes a node's public SSH key across the cluster.

  Note that this function should only be executed on the master node, which
  then will copy the new node's key to all nodes in the cluster via SSH.

  Also note: at least one of the flags C{to_authorized_keys},
  C{to_public_keys}, and C{get_public_keys} has to be set to C{True} for
  the function to actually perform any actions.

  @type node_list: list of SshAddNodeInfo tuples
  @param node_list: list of tuples containing the necessary node information
    for adding their keys
  @type potential_master_candidates: list of str
  @param potential_master_candidates: list of node names of potential master
    candidates; this should match the list of uuids in the public key file

  """
  # whether there are any keys to be added or retrieved at all
  to_authorized_keys = any([node_info.to_authorized_keys for node_info in
                            node_list])
  to_public_keys = any([node_info.to_public_keys for node_info in
                        node_list])

  if not ssconf_store:
    ssconf_store = ssconf.SimpleStore()

  for node_info in node_list:
    # replacement not necessary for keys that are not supposed to be in the
    # list of public keys
    if not node_info.to_public_keys:
      continue
    # Check and fix sanity of key file
    keys_by_name = ssh.QueryPubKeyFile([node_info.name], key_file=pub_key_file)
    keys_by_uuid = ssh.QueryPubKeyFile([node_info.uuid], key_file=pub_key_file)

    if (not keys_by_name or node_info.name not in keys_by_name) \
        and (not keys_by_uuid or node_info.uuid not in keys_by_uuid):
      raise errors.SshUpdateError(
        "No keys found for the new node '%s' (UUID %s) in the list of public"
        " SSH keys, neither for the name nor the UUID" %
        (node_info.name, node_info.uuid))
    else:
      if node_info.name in keys_by_name:
        # Replace the name by UUID in the file as the name should only be used
        # temporarily
        ssh.ReplaceNameByUuid(node_info.uuid, node_info.name,
                              error_fn=errors.SshUpdateError,
                              key_file=pub_key_file)

  # Retrieve updated map of UUIDs to keys
  keys_by_uuid = ssh.QueryPubKeyFile(
    [node_info.uuid for node_info in node_list], key_file=pub_key_file)

  # Update the master node's key files
  (auth_key_file, _) = \
    ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
  for node_info in node_list:
    if node_info.to_authorized_keys:
      ssh.AddAuthorizedKeys(auth_key_file, keys_by_uuid[node_info.uuid])

  base_data = {}
  _InitSshUpdateData(base_data, noded_cert_file, ssconf_store)
  cluster_name = base_data[constants.SSHS_CLUSTER_NAME]

  ssh_port_map = ssconf_store.GetSshPortMap()

  # Update the target nodes themselves
  for node_info in node_list:
    logging.debug("Updating SSH key files of target node '%s'.",
                  node_info.name)
    if node_info.get_public_keys:
      node_data = {}
      _InitSshUpdateData(node_data, noded_cert_file, ssconf_store)
      all_keys = ssh.QueryPubKeyFile(None, key_file=pub_key_file)
      node_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
        (constants.SSHS_OVERRIDE, all_keys)

      try:
        utils.RetryByNumberOfTimes(
          constants.SSHS_MAX_RETRIES,
          errors.SshUpdateError,
          run_cmd_fn, cluster_name, node_info.name, pathutils.SSH_UPDATE,
          ssh_port_map.get(node_info.name), node_data,
          debug=False, verbose=False, use_cluster_key=False,
          ask_key=False, strict_host_check=False)
      except errors.SshUpdateError as e:
        # Clean up the master's public key file if adding key fails
        if node_info.to_public_keys:
          ssh.RemovePublicKey(node_info.uuid)
        raise e

  # Update all nodes except master and the target nodes
  keys_by_uuid_auth = ssh.QueryPubKeyFile(
    [node_info.uuid for node_info in node_list
     if node_info.to_authorized_keys],
    key_file=pub_key_file)
  if to_authorized_keys:
    base_data[constants.SSHS_SSH_AUTHORIZED_KEYS] = \
      (constants.SSHS_ADD, keys_by_uuid_auth)

  pot_mc_data = base_data.copy()
  keys_by_uuid_pub = ssh.QueryPubKeyFile(
    [node_info.uuid for node_info in node_list
     if node_info.to_public_keys],
    key_file=pub_key_file)
  if to_public_keys:
    pot_mc_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
      (constants.SSHS_REPLACE_OR_ADD, keys_by_uuid_pub)

  all_nodes = ssconf_store.GetNodeList()
  master_node = ssconf_store.GetMasterNode()
  online_nodes = ssconf_store.GetOnlineNodeList()

  node_errors = []
  for node in all_nodes:
    if node == master_node:
      logging.debug("Skipping master node '%s'.", master_node)
      continue
    if node not in online_nodes:
      logging.debug("Skipping offline node '%s'.", node)
      continue
    if node in potential_master_candidates:
      logging.debug("Updating SSH key files of node '%s'.", node)
      try:
        utils.RetryByNumberOfTimes(
          constants.SSHS_MAX_RETRIES,
          errors.SshUpdateError,
          run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
          ssh_port_map.get(node), pot_mc_data,
          debug=False, verbose=False, use_cluster_key=False,
          ask_key=False, strict_host_check=False)
      except errors.SshUpdateError as last_exception:
        error_msg = ("When adding the key of node '%s', updating SSH key"
                     " files of node '%s' failed after %s retries."
                     " Not trying again. Last error was: %s." %
                     (node, node_info.name, constants.SSHS_MAX_RETRIES,
                      last_exception))
        node_errors.append((node, error_msg))
        # We only log the error and don't throw an exception, because
        # one unreachable node shall not abort the entire procedure.
        logging.error(error_msg)

    else:
      if to_authorized_keys:
        run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
                   ssh_port_map.get(node), base_data,
                   debug=False, verbose=False, use_cluster_key=False,
                   ask_key=False, strict_host_check=False)

  return node_errors
1635
1636
1637 def RemoveNodeSshKey(node_uuid, node_name,
1638 master_candidate_uuids,
1639 potential_master_candidates,
1640 master_uuid=None,
1641 keys_to_remove=None,
1642 from_authorized_keys=False,
1643 from_public_keys=False,
1644 clear_authorized_keys=False,
1645 clear_public_keys=False,
1646 pub_key_file=pathutils.SSH_PUB_KEYS,
1647 ssconf_store=None,
1648 noded_cert_file=pathutils.NODED_CERT_FILE,
1649 readd=False,
1650 run_cmd_fn=ssh.RunSshCmdWithStdin):
1651 """Removes the node's SSH keys from the key files and distributes those.
1652
1653 Note that at least one of the flags C{from_authorized_keys},
1654 C{from_public_keys}, C{clear_authorized_keys}, and C{clear_public_keys}
1655 has to be set to C{True} for the function to perform any action at all.
1656 Not doing so will trigger an assertion in the function.
1657
1658 @type node_uuid: str
1659 @param node_uuid: UUID of the node whose key is removed
1660 @type node_name: str
1661 @param node_name: name of the node whose key is removed
1662 @type master_candidate_uuids: list of str
1663 @param master_candidate_uuids: list of UUIDs of the current master candidates
1664 @type potential_master_candidates: list of str
1665 @param potential_master_candidates: list of names of potential master
1666 candidates
1667 @type keys_to_remove: dict of str to list of str
1668 @param keys_to_remove: a dictionary mapping node UUIDS to lists of SSH keys
1669 to be removed. This list is supposed to be used only if the keys are not
1670 in the public keys file. This is for example the case when removing a
1671 master node's key.
1672 @type from_authorized_keys: boolean
1673 @param from_authorized_keys: whether or not the key should be removed
1674 from the C{authorized_keys} file
1675 @type from_public_keys: boolean
1676 @param from_public_keys: whether or not the key should be remove from
1677 the C{ganeti_pub_keys} file
1678 @type clear_authorized_keys: boolean
1679 @param clear_authorized_keys: whether or not the C{authorized_keys} file
1680 should be cleared on the node whose keys are removed
1681 @type clear_public_keys: boolean
1682 @param clear_public_keys: whether to clear the node's C{ganeti_pub_keys} file
1683 @type readd: boolean
1684 @param readd: whether this is called during a readd operation.
1685 @rtype: list of (string, string) tuples
1686 @returns: list of (node name, error message) pairs
1687
1688 """
1689 node_list = [SshRemoveNodeInfo(uuid=node_uuid,
1690 name=node_name,
1691 from_authorized_keys=from_authorized_keys,
1692 from_public_keys=from_public_keys,
1693 clear_authorized_keys=clear_authorized_keys,
1694 clear_public_keys=clear_public_keys)]
1695 return RemoveNodeSshKeyBulk(node_list,
1696 master_candidate_uuids,
1697 potential_master_candidates,
1698 master_uuid=master_uuid,
1699 keys_to_remove=keys_to_remove,
1700 pub_key_file=pub_key_file,
1701 ssconf_store=ssconf_store,
1702 noded_cert_file=noded_cert_file,
1703 readd=readd,
1704 run_cmd_fn=run_cmd_fn)
1705
1706
1707 # Node info named tuple specifically for the use with RemoveNodeSshKeyBulk
1708 SshRemoveNodeInfo = collections.namedtuple(
1709 "SshRemoveNodeInfo",
1710 ["uuid",
1711 "name",
1712 "from_authorized_keys",
1713 "from_public_keys",
1714 "clear_authorized_keys",
1715 "clear_public_keys"])
1716
1717
1718 def RemoveNodeSshKeyBulk(node_list,
1719 master_candidate_uuids,
1720 potential_master_candidates,
1721 master_uuid=None,
1722 keys_to_remove=None,
1723 pub_key_file=pathutils.SSH_PUB_KEYS,
1724 ssconf_store=None,
1725 noded_cert_file=pathutils.NODED_CERT_FILE,
1726 readd=False,
1727 run_cmd_fn=ssh.RunSshCmdWithStdin):
1728 """Removes the node's SSH keys from the key files and distributes those.
1729
1730 Note that at least one of the flags C{from_authorized_keys},
1731 C{from_public_keys}, C{clear_authorized_keys}, and C{clear_public_keys}
1732 of at least one node has to be set to C{True} for the function to perform any
1733 action at all. Not doing so will trigger an assertion in the function.
1734
1735 @type node_list: list of C{SshRemoveNodeInfo}.
1736 @param node_list: list of information about nodes whose keys are being removed
1737 @type master_candidate_uuids: list of str
1738 @param master_candidate_uuids: list of UUIDs of the current master candidates
1739 @type potential_master_candidates: list of str
1740 @param potential_master_candidates: list of names of potential master
1741 candidates
1742 @type keys_to_remove: dict of str to list of str
1743 @param keys_to_remove: a dictionary mapping node UUIDS to lists of SSH keys
1744 to be removed. This list is supposed to be used only if the keys are not
1745 in the public keys file. This is for example the case when removing a
1746 master node's key.
1747 @type readd: boolean
1748 @param readd: whether this is called during a readd operation.
1749 @rtype: list of (string, string) tuples
1750 @returns: list of (node name, error message) pairs
1751
1752 """
1753 # Non-disruptive error messages, list of (node, msg) pairs
1754 result_msgs = []
1755
1756 # whether there are any keys to be added or retrieved at all
1757 from_authorized_keys = any([node_info.from_authorized_keys for node_info in
1758 node_list])
1759 from_public_keys = any([node_info.from_public_keys for node_info in
1760 node_list])
1761 clear_authorized_keys = any([node_info.clear_authorized_keys for node_info in
1762 node_list])
1763 clear_public_keys = any([node_info.clear_public_keys for node_info in
1764 node_list])
1765
1766 # Make sure at least one of these flags is true.
1767 if not (from_authorized_keys or from_public_keys or clear_authorized_keys
1768 or clear_public_keys):
1769 raise errors.SshUpdateError("No removal from any key file was requested.")
1770
1771 if not ssconf_store:
1772 ssconf_store = ssconf.SimpleStore()
1773
1774 master_node = ssconf_store.GetMasterNode()
1775 ssh_port_map = ssconf_store.GetSshPortMap()
1776
1777 all_keys_to_remove = {}
1778 if from_authorized_keys or from_public_keys:
1779 for node_info in node_list:
1780 # Skip nodes that don't actually need any keys to be removed.
1781 if not (node_info.from_authorized_keys or node_info.from_public_keys):
1782 continue
1783 if node_info.name == master_node and not keys_to_remove:
1784 raise errors.SshUpdateError("Cannot remove the master node's keys.")
1785 if keys_to_remove:
1786 keys = keys_to_remove
1787 else:
1788 keys = ssh.QueryPubKeyFile([node_info.uuid], key_file=pub_key_file)
1789 if (not keys or node_info.uuid not in keys) and not readd:
1790 raise errors.SshUpdateError("Node '%s' not found in the list of"
1791 " public SSH keys. It seems someone"
1792 " tries to remove a key from outside"
1793 " the cluster!" % node_info.uuid)
1794 # During an upgrade all nodes have the master key. In this case we
1795 # should not remove it to avoid accidentally shutting down cluster
1796 # SSH communication
1797 master_keys = None
1798 if master_uuid:
1799 master_keys = ssh.QueryPubKeyFile([master_uuid],
1800 key_file=pub_key_file)
1801 for master_key in master_keys.get(master_uuid, []):
1802 if master_key in keys.get(node_info.uuid, []):
1803 keys[node_info.uuid].remove(master_key)
1804
1805 all_keys_to_remove.update(keys)
1806
1807 if all_keys_to_remove:
1808 base_data = {}
1809 _InitSshUpdateData(base_data, noded_cert_file, ssconf_store)
1810 cluster_name = base_data[constants.SSHS_CLUSTER_NAME]
1811
1812 if from_authorized_keys:
1813 # UUIDs of nodes that are supposed to be removed from the
1814 # authorized_keys files.
1815 nodes_remove_from_authorized_keys = [
1816 node_info.uuid for node_info in node_list
1817 if node_info.from_authorized_keys]
1818 keys_to_remove_from_authorized_keys = dict([
1819 (uuid, keys) for (uuid, keys) in all_keys_to_remove.items()
1820 if uuid in nodes_remove_from_authorized_keys])
1821 base_data[constants.SSHS_SSH_AUTHORIZED_KEYS] = \
1822 (constants.SSHS_REMOVE, keys_to_remove_from_authorized_keys)
1823 (auth_key_file, _) = \
1824 ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False,
1825 dircheck=False)
1826
1827 for uuid in nodes_remove_from_authorized_keys:
1828 ssh.RemoveAuthorizedKeys(auth_key_file,
1829 keys_to_remove_from_authorized_keys[uuid])
1830
1831 pot_mc_data = base_data.copy()
1832
1833 if from_public_keys:
1834 nodes_remove_from_public_keys = [
1835 node_info.uuid for node_info in node_list
1836 if node_info.from_public_keys]
1837 keys_to_remove_from_public_keys = dict([
1838 (uuid, keys) for (uuid, keys) in all_keys_to_remove.items()
1839 if uuid in nodes_remove_from_public_keys])
1840 pot_mc_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1841 (constants.SSHS_REMOVE, keys_to_remove_from_public_keys)
1842
1843 all_nodes = ssconf_store.GetNodeList()
1844 online_nodes = ssconf_store.GetOnlineNodeList()
1845 all_nodes_to_remove = [node_info.name for node_info in node_list]
1846 logging.debug("Removing keys of nodes '%s' from all nodes but itself and"
1847 " master.", ", ".join(all_nodes_to_remove))
1848 for node in all_nodes:
1849 if node == master_node:
1850 logging.debug("Skipping master node '%s'.", master_node)
1851 continue
1852 if node not in online_nodes:
1853 logging.debug("Skipping offline node '%s'.", node)
1854 continue
1855 if node in all_nodes_to_remove:
1856 logging.debug("Skipping node whose key is removed itself '%s'.", node)
1857 continue
1858 ssh_port = ssh_port_map.get(node)
1859 if not ssh_port:
1860 raise errors.OpExecError("No SSH port information available for"
1861 " node '%s', map: %s." %
1862 (node, ssh_port_map))
1863 error_msg_final = ("When removing the key of node '%s', updating the"
1864 " SSH key files of node '%s' failed. Last error"
1865 " was: %s.")
1866 if node in potential_master_candidates:
1867 logging.debug("Updating key setup of potential master candidate node"
1868 " %s.", node)
1869 try:
1870 utils.RetryByNumberOfTimes(
1871 constants.SSHS_MAX_RETRIES,
1872 errors.SshUpdateError,
1873 run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
1874 ssh_port, pot_mc_data,
1875 debug=False, verbose=False, use_cluster_key=False,
1876 ask_key=False, strict_host_check=False)
1877 except errors.SshUpdateError as last_exception:
1878 error_msg = error_msg_final % (
1879 ", ".join(all_nodes_to_remove), node, last_exception)
1880 result_msgs.append((node, error_msg))
1881 logging.error(error_msg)
1882
1883 else:
1884 if from_authorized_keys:
1885 logging.debug("Updating key setup of normal node %s.", node)
1886 try:
1887 utils.RetryByNumberOfTimes(
1888 constants.SSHS_MAX_RETRIES,
1889 errors.SshUpdateError,
1890 run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
1891 ssh_port, base_data,
1892 debug=False, verbose=False, use_cluster_key=False,
1893 ask_key=False, strict_host_check=False)
1894 except errors.SshUpdateError as last_exception:
1895 error_msg = error_msg_final % (
1896 ", ".join(all_nodes_to_remove), node, last_exception)
1897 result_msgs.append((node, error_msg))
1898 logging.error(error_msg)
1899
1900 for node_info in node_list:
1901 if node_info.clear_authorized_keys or node_info.from_public_keys or \
1902 node_info.clear_public_keys:
1903 data = {}
1904 _InitSshUpdateData(data, noded_cert_file, ssconf_store)
1905 cluster_name = data[constants.SSHS_CLUSTER_NAME]
1906 ssh_port = ssh_port_map.get(node_info.name)
1907 if not ssh_port:
1908 raise errors.OpExecError("No SSH port information available for"
1909 " node '%s', which is leaving the cluster.")
1910
1911 if node_info.clear_authorized_keys:
1912 # The 'authorized_keys' file is not managed solely by Ganeti. Therefore,
1913 # we have to specify exactly which keys to remove, so that keys that were
1914 # not added by Ganeti are left untouched.
1915 other_master_candidate_uuids = [uuid for uuid in master_candidate_uuids
1916 if uuid != node_info.uuid]
1917 candidate_keys = ssh.QueryPubKeyFile(other_master_candidate_uuids,
1918 key_file=pub_key_file)
1919 data[constants.SSHS_SSH_AUTHORIZED_KEYS] = \
1920 (constants.SSHS_REMOVE, candidate_keys)
1921
1922 if node_info.clear_public_keys:
1923 data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1924 (constants.SSHS_CLEAR, {})
1925 elif node_info.from_public_keys:
1926 # Since clearing the public keys file subsumes removing a single key,
1927 # individual keys are only removed if clear_public_keys is 'False'.
1928
1929 if all_keys_to_remove:
1930 data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1931 (constants.SSHS_REMOVE, all_keys_to_remove)
1932
1933 # If there are no changes to any keyfile for this node, skip it
1934 if not (constants.SSHS_SSH_PUBLIC_KEYS in data or
1935 constants.SSHS_SSH_AUTHORIZED_KEYS in data):
1936 continue
1937
1938 logging.debug("Updating SSH key setup of target node '%s'.",
1939 node_info.name)
1940 try:
1941 utils.RetryByNumberOfTimes(
1942 constants.SSHS_MAX_RETRIES,
1943 errors.SshUpdateError,
1944 run_cmd_fn, cluster_name, node_info.name, pathutils.SSH_UPDATE,
1945 ssh_port, data,
1946 debug=False, verbose=False, use_cluster_key=False,
1947 ask_key=False, strict_host_check=False)
1948 except errors.SshUpdateError as last_exception:
1949 result_msgs.append(
1950 (node_info.name,
1951 ("Removing SSH keys from node '%s' failed."
1952 " This can happen when the node is already unreachable."
1953 " Error: %s" % (node_info.name, last_exception))))
1954
1955 if all_keys_to_remove and from_public_keys:
1956 for node_uuid in nodes_remove_from_public_keys:
1957 ssh.RemovePublicKey(node_uuid, key_file=pub_key_file)
1958
1959 return result_msgs
1960
1961
1962 def _GenerateNodeSshKey(node_uuid, node_name, ssh_port_map,
1963 pub_key_file=pathutils.SSH_PUB_KEYS,
1964 ssconf_store=None,
1965 noded_cert_file=pathutils.NODED_CERT_FILE,
1966 run_cmd_fn=ssh.RunSshCmdWithStdin,
1967 suffix=""):
1968 """Generates the root SSH key pair on the node.
1969
1970 @type node_uuid: str
1971 @param node_uuid: UUID of the node whose key is generated
1972 @type node_name: str
1973 @param node_name: name of the node whose key is generated
1974 @type ssh_port_map: dict of str to int
1975 @param ssh_port_map: mapping of node names to their SSH port
1976
1977 """
1978 if not ssconf_store:
1979 ssconf_store = ssconf.SimpleStore()
1980
1981 keys_by_uuid = ssh.QueryPubKeyFile([node_uuid], key_file=pub_key_file)
1982 if not keys_by_uuid or node_uuid not in keys_by_uuid:
1983 raise errors.SshUpdateError("Node %s (UUID: %s) whose key is requested to"
1984 " be regenerated is not registered in the"
1985 " public keys file." % (node_name, node_uuid))
1986
1987 data = {}
1988 _InitSshUpdateData(data, noded_cert_file, ssconf_store)
1989 cluster_name = data[constants.SSHS_CLUSTER_NAME]
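# The SSHS_GENERATE entry instructs the ssh_update helper on the target
# node to regenerate its SSH key pair; a non-empty suffix makes it write
# the new pair under suffixed filenames first (used for master key renewal).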
1990 data[constants.SSHS_GENERATE] = {constants.SSHS_SUFFIX: suffix}
1991
1992 run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE,
1993 ssh_port_map.get(node_name), data,
1994 debug=False, verbose=False, use_cluster_key=False,
1995 ask_key=False, strict_host_check=False)
1996
1997
1998 def _GetMasterNodeUUID(node_uuid_name_map, master_node_name):
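# A sketch of the expected behaviour (hypothetical values):
#   _GetMasterNodeUUID([("uuid-1", "node1"), ("uuid-2", "master")], "master")
#   returns "uuid-2"; zero or multiple matches raise SshUpdateError below.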
1999 master_node_uuids = [node_uuid for (node_uuid, node_name)
2000 in node_uuid_name_map
2001 if node_name == master_node_name]
2002 if len(master_node_uuids) != 1:
2003 raise errors.SshUpdateError("No (unique) master UUID found. Master node"
2004 " name: '%s', Master UUID: '%s'" %
2005 (master_node_name, master_node_uuids))
2006 return master_node_uuids[0]
2007
2008
2009 def _GetOldMasterKeys(master_node_uuid, pub_key_file):
2010 old_master_keys_by_uuid = ssh.QueryPubKeyFile([master_node_uuid],
2011 key_file=pub_key_file)
2012 if not old_master_keys_by_uuid:
2013 raise errors.SshUpdateError("No public key of the master node (UUID '%s')"
2014 " found, not generating a new key."
2015 % master_node_uuid)
2016 return old_master_keys_by_uuid
2017
2018
2019 def _GetNewMasterKey(root_keyfiles, master_node_uuid):
2020 new_master_keys = []
2021 for (_, (_, public_key_file)) in root_keyfiles.items():
2022 public_key_dir = os.path.dirname(public_key_file)
2023 public_key_file_tmp_filename = \
2024 os.path.splitext(os.path.basename(public_key_file))[0] \
2025 + constants.SSHS_MASTER_SUFFIX + ".pub"
2026 public_key_path_tmp = os.path.join(public_key_dir,
2027 public_key_file_tmp_filename)
2028 if os.path.exists(public_key_path_tmp):
2029 # for some key types, there might not be any keys
2030 key = utils.ReadFile(public_key_path_tmp)
2031 new_master_keys.append(key)
2032 if not new_master_keys:
2033 raise errors.SshUpdateError("Cannot find any type of temporary SSH key.")
2034 return {master_node_uuid: new_master_keys}
2035
2036
2037 def _ReplaceMasterKeyOnMaster(root_keyfiles):
2038 number_of_moves = 0
2039 for (_, (private_key_file, public_key_file)) in root_keyfiles.items():
2040 key_dir = os.path.dirname(public_key_file)
2041 private_key_file_tmp = \
2042 os.path.basename(private_key_file) + constants.SSHS_MASTER_SUFFIX
2043 public_key_file_tmp = private_key_file_tmp + ".pub"
2044 private_key_path_tmp = os.path.join(key_dir,
2045 private_key_file_tmp)
2046 public_key_path_tmp = os.path.join(key_dir,
2047 public_key_file_tmp)
2048 if os.path.exists(public_key_file):
2049 utils.CreateBackup(public_key_file)
2050 utils.RemoveFile(public_key_file)
2051 if os.path.exists(private_key_file):
2052 utils.CreateBackup(private_key_file)
2053 utils.RemoveFile(private_key_file)
2054 if os.path.exists(public_key_path_tmp) and \
2055 os.path.exists(private_key_path_tmp):
2056 # for some key types, there might not be any keys
2057 shutil.move(public_key_path_tmp, public_key_file)
2058 shutil.move(private_key_path_tmp, private_key_file)
2059 number_of_moves += 1
2060 if not number_of_moves:
2061 raise errors.SshUpdateError("Could not move at least one master SSH key.")
2062
2063
2064 def RenewSshKeys(node_uuids, node_names, master_candidate_uuids,
2065 potential_master_candidates,
2066 pub_key_file=pathutils.SSH_PUB_KEYS,
2067 ssconf_store=None,
2068 noded_cert_file=pathutils.NODED_CERT_FILE,
2069 run_cmd_fn=ssh.RunSshCmdWithStdin):
2070 """Renews all SSH keys and updates authorized_keys and ganeti_pub_keys.
2071
2072 @type node_uuids: list of str
2073 @param node_uuids: list of node UUIDs whose keys should be renewed
2074 @type node_names: list of str
2075 @param node_names: list of node names whose keys should be renewed. This list
2076 should match the C{node_uuids} parameter
2077 @type master_candidate_uuids: list of str
2078 @param master_candidate_uuids: list of UUIDs of master candidates or
2079 master node
2080 @type pub_key_file: str
2081 @param pub_key_file: path of the public key file
2082 @type noded_cert_file: str
2083 @param noded_cert_file: path of the noded SSL certificate file
2084 @type run_cmd_fn: function
2085 @param run_cmd_fn: function to run commands on remote nodes via SSH
2086 @raises ProgrammerError: if node_uuids and node_names don't match;
2087 SshUpdateError if a node's key is missing from the public key file,
2088 if a node's new SSH key could not be fetched from it, or if there is
2089 either no entry or more than one entry for the master node in the
2090 public key list.
2091
2092 """
2093 if not ssconf_store:
2094 ssconf_store = ssconf.SimpleStore()
2095 cluster_name = ssconf_store.GetClusterName()
2096
2097 if len(node_uuids) != len(node_names):
2098 raise errors.ProgrammerError("List of node UUIDs and node names"
2099 " does not match in length.")
2100
2101 (_, root_keyfiles) = \
2102 ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
2103 (_, dsa_pub_keyfile) = root_keyfiles[constants.SSHK_DSA]
2104 old_master_key = utils.ReadFile(dsa_pub_keyfile)
2105
2106 node_uuid_name_map = zip(node_uuids, node_names)
2107
2108 master_node_name = ssconf_store.GetMasterNode()
2109 master_node_uuid = _GetMasterNodeUUID(node_uuid_name_map, master_node_name)
2110 ssh_port_map = ssconf_store.GetSshPortMap()
2111 # List of all node errors that happened, but which did not abort the
2112 # procedure as a whole. It is important that this is a list to have a
2113 # somewhat chronological history of events.
2114 all_node_errors = []
2115
2116 # process non-master nodes
2117
2118 # keys to add in bulk at the end
2119 node_keys_to_add = []
2120
2121 # list of all nodes
2122 node_list = []
2123
2124 # list of keys to be removed before generating new keys
2125 node_info_to_remove = []
2126
2127 for node_uuid, node_name in node_uuid_name_map:
2128 if node_name == master_node_name:
2129 continue
2130 master_candidate = node_uuid in master_candidate_uuids
2131 potential_master_candidate = node_name in potential_master_candidates
2132 node_list.append((node_uuid, node_name, master_candidate,
2133 potential_master_candidate))
2134
2135 keys_by_uuid = ssh.QueryPubKeyFile([node_uuid], key_file=pub_key_file)
2136 if not keys_by_uuid:
2137 raise errors.SshUpdateError("No public key of node %s (UUID %s) found,"
2138 " not generating a new key."
2139 % (node_name, node_uuid))
2140
2141 if master_candidate:
2142 logging.debug("Fetching old SSH key from node '%s'.", node_name)
2143 old_pub_key = ssh.ReadRemoteSshPubKeys(dsa_pub_keyfile,
2144 node_name, cluster_name,
2145 ssh_port_map[node_name],
2146 False, # ask_key
2147 False) # key_check
2148 if old_pub_key != old_master_key:
2149 # If we are already in a multi-key setup (that is, past Ganeti 2.12),
2150 # we can safely remove the old key of the node. Otherwise, we cannot
2151 # remove that node's key, because it is also the master node's key
2152 # and that would terminate all communication from the master to the
2153 # node.
2154 node_info_to_remove.append(SshRemoveNodeInfo(
2155 uuid=node_uuid,
2156 name=node_name,
2157 from_authorized_keys=master_candidate,
2158 from_public_keys=False,
2159 clear_authorized_keys=False,
2160 clear_public_keys=False))
2161 else:
2162 logging.debug("Old key of node '%s' is the same as the current master"
2163 " key. Not deleting that key on the node.", node_name)
2164
2165 logging.debug("Removing old SSH keys of all master candidates.")
2166 if node_info_to_remove:
2167 node_errors = RemoveNodeSshKeyBulk(
2168 node_info_to_remove,
2169 master_candidate_uuids,
2170 potential_master_candidates,
2171 master_uuid=master_node_uuid)
2172 if node_errors:
2173 all_node_errors = all_node_errors + node_errors
2174
2175 for (node_uuid, node_name, master_candidate, potential_master_candidate) \
2176 in node_list:
2177
2178 logging.debug("Generating new SSH key for node '%s'.", node_name)
2179 _GenerateNodeSshKey(node_uuid, node_name, ssh_port_map,
2180 pub_key_file=pub_key_file,
2181 ssconf_store=ssconf_store,
2182 noded_cert_file=noded_cert_file,
2183 run_cmd_fn=run_cmd_fn)
2184
2185 try:
2186 logging.debug("Fetching newly created SSH key from node '%s'.", node_name)
2187 pub_key = ssh.ReadRemoteSshPubKeys(dsa_pub_keyfile,
2188 node_name, cluster_name,
2189 ssh_port_map[node_name],
2190 False, # ask_key
2191 False) # key_check
2192 except Exception as err:
2193 raise errors.SshUpdateError("Could not fetch key of node %s"
2194 " (UUID %s): %s" % (node_name, node_uuid, err))
2195
2196 if potential_master_candidate:
2197 ssh.RemovePublicKey(node_uuid, key_file=pub_key_file)
2198 ssh.AddPublicKey(node_uuid, pub_key, key_file=pub_key_file)
2199
2200 logging.debug("Add ssh key of node '%s'.", node_name)
2201 node_info = SshAddNodeInfo(name=node_name,
2202 uuid=node_uuid,
2203 to_authorized_keys=master_candidate,
2204 to_public_keys=potential_master_candidate,
2205 get_public_keys=True)
2206 node_keys_to_add.append(node_info)
2207
2208 node_errors = AddNodeSshKeyBulk(
2209 node_keys_to_add, potential_master_candidates,
2210 pub_key_file=pub_key_file, ssconf_store=ssconf_store,
2211 noded_cert_file=noded_cert_file,
2212 run_cmd_fn=run_cmd_fn)
2213 if node_errors:
2214 all_node_errors = all_node_errors + node_errors
2215
2216 # Renewing the master node's key
2217
2218 # Preserve the old keys for now
2219 old_master_keys_by_uuid = _GetOldMasterKeys(master_node_uuid, pub_key_file)
2220
2221 # Generate a new master key with a suffix, don't touch the old one for now
2222 logging.debug("Generate new ssh key of master.")
2223 _GenerateNodeSshKey(master_node_uuid, master_node_name, ssh_port_map,
2224 pub_key_file=pub_key_file,
2225 ssconf_store=ssconf_store,
2226 noded_cert_file=noded_cert_file,
2227 run_cmd_fn=run_cmd_fn,
2228 suffix=constants.SSHS_MASTER_SUFFIX)
2229 # Read newly created master key
2230 new_master_key_dict = _GetNewMasterKey(root_keyfiles, master_node_uuid)
2231
2232 # Replace master key in the master node's public key file
2233 ssh.RemovePublicKey(master_node_uuid, key_file=pub_key_file)
2234 for pub_key in new_master_key_dict[master_node_uuid]:
2235 ssh.AddPublicKey(master_node_uuid, pub_key, key_file=pub_key_file)
2236
2237 # Add the new master key to all nodes' public and authorized keys
2238 logging.debug("Add new master key to all nodes.")
2239 node_errors = AddNodeSshKey(
2240 master_node_uuid, master_node_name, potential_master_candidates,
2241 to_authorized_keys=True, to_public_keys=True,
2242 get_public_keys=False, pub_key_file=pub_key_file,
2243 ssconf_store=ssconf_store, noded_cert_file=noded_cert_file,
2244 run_cmd_fn=run_cmd_fn)
2245 if node_errors:
2246 all_node_errors = all_node_errors + node_errors
2247
2248 # Remove the old key file and rename the new key to the non-temporary filename
2249 _ReplaceMasterKeyOnMaster(root_keyfiles)
2250
2251 # Remove old key from authorized keys
2252 (auth_key_file, _) = \
2253 ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
2254 ssh.RemoveAuthorizedKeys(auth_key_file,
2255 old_master_keys_by_uuid[master_node_uuid])
2256
2257 # Remove the old key from all nodes' authorized_keys files
2258 logging.debug("Remove the old master key from all nodes.")
2259 node_errors = RemoveNodeSshKey(
2260 master_node_uuid, master_node_name, master_candidate_uuids,
2261 potential_master_candidates,
2262 keys_to_remove=old_master_keys_by_uuid, from_authorized_keys=True,
2263 from_public_keys=False, clear_authorized_keys=False,
2264 clear_public_keys=False)
2265 if node_errors:
2266 all_node_errors = all_node_errors + node_errors
2267
2268 return all_node_errors
2269
2270
2271 def GetBlockDevSizes(devices):
2272 """Return the size of the given block devices
2273
2274 @type devices: list
2275 @param devices: list of block device nodes to query
2276 @rtype: dict
2277 @return:
2278 dictionary of all block devices under /dev (key). The value is their
2279 size in MiB.
2280
2281 {'/dev/disk/by-uuid/123456-12321231-312312-312': 124}
2282
2283 """
2284 DEV_PREFIX = "/dev/"
2285 blockdevs = {}
2286
2287 for devpath in devices:
2288 if not utils.IsBelowDir(DEV_PREFIX, devpath):
2289 continue
2290
2291 try:
2292 st = os.stat(devpath)
2293 except EnvironmentError, err:
2294 logging.warning("Error stat()'ing device %s: %s", devpath, str(err))
2295 continue
2296
2297 if stat.S_ISBLK(st.st_mode):
2298 result = utils.RunCmd(["blockdev", "--getsize64", devpath])
2299 if result.failed:
2300 # We don't want to fail, just do not list this device as available
2301 logging.warning("Cannot get size for block device %s", devpath)
2302 continue
2303
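# blockdev --getsize64 reports the size in bytes; convert it to MiB here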
2304 size = int(result.stdout) / (1024 * 1024)
2305 blockdevs[devpath] = size
2306 return blockdevs
2307
2308
2309 def GetVolumeList(vg_names):
2310 """Compute list of logical volumes and their size.
2311
2312 @type vg_names: list
2313 @param vg_names: the volume groups whose LVs we should list, or
2314 empty for all volume groups
2315 @rtype: dict
2316 @return:
2317 dictionary of all partitions (key) with value being a tuple of
2318 their size (in MiB), inactive and online status::
2319
2320 {'xenvg/test1': ('20.06', True, True)}
2321
2322 in case of errors, a string is returned with the error
2323 details.
2324
2325 """
2326 lvs = {}
2327 sep = "|"
2328 if not vg_names:
2329 vg_names = []
2330 result = utils.RunCmd(["lvs", "--noheadings", "--units=m", "--nosuffix",
2331 "--separator=%s" % sep,
2332 "-ovg_name,lv_name,lv_size,lv_attr"] + vg_names)
2333 if result.failed:
2334 _Fail("Failed to list logical volumes, lvs output: %s", result.output)
2335
2336 for line in result.stdout.splitlines():
2337 line = line.strip()
2338 match = _LVSLINE_REGEX.match(line)
2339 if not match:
2340 logging.error("Invalid line returned from lvs output: '%s'", line)
2341 continue
2342 vg_name, name, size, attr = match.groups()
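# lv_attr is a fixed-width flag string (see lvs(8)): position 0 is the
# volume type ('v' = virtual), position 4 the state and position 5 the
# device open status, which the checks below rely on.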
2343 inactive = attr[4] == "-"
2344 online = attr[5] == "o"
2345 virtual = attr[0] == "v"
2346 if virtual:
2347 # we don't want to report such volumes as existing, since they
2348 # don't really hold data
2349 continue
2350 lvs[vg_name + "/" + name] = (size, inactive, online)
2351
2352 return lvs
2353
2354
2355 def ListVolumeGroups():
2356 """List the volume groups and their size.
2357
2358 @rtype: dict
2359 @return: dictionary with volume group names as keys and their
2360 sizes as values
2361
2362 """
2363 return utils.ListVolumeGroups()
2364
2365
2366 def NodeVolumes():
2367 """List all volumes on this node.
2368
2369 @rtype: list
2370 @return:
2371 A list of dictionaries, each having four keys:
2372 - name: the logical volume name,
2373 - size: the size of the logical volume
2374 - dev: the physical device on which the LV lives
2375 - vg: the volume group to which it belongs
2376
2377 In case of errors, we return an empty list and log the
2378 error.
2379
2380 Note that since a logical volume can live on multiple physical
2381 volumes, the resulting list might include a logical volume
2382 multiple times.
2383
2384 """
2385 result = utils.RunCmd(["lvs", "--noheadings", "--units=m", "--nosuffix",
2386 "--separator=|",
2387 "--options=lv_name,lv_size,devices,vg_name"])
2388 if result.failed:
2389 _Fail("Failed to list logical volumes, lvs output: %s",
2390 result.output)
2391
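# The "devices" column reported by lvs looks like "/dev/sda1(0),/dev/sdb1(0)";
# the helpers below strip the extent offset in parentheses and split the
# comma-separated device list.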
2392 def parse_dev(dev):
2393 return dev.split("(")[0]
2394
2395 def handle_dev(dev):
2396 return [parse_dev(x) for x in dev.split(",")]
2397
2398 def map_line(line):
2399 line = [v.strip() for v in line]
2400 return [{"name": line[0], "size": line[1],
2401 "dev": dev, "vg": line[3]} for dev in handle_dev(line[2])]
2402
2403 all_devs = []
2404 for line in result.stdout.splitlines():
2405 if line.count("|") >= 3:
2406 all_devs.extend(map_line(line.split("|")))
2407 else:
2408 logging.warning("Strange line in the output from lvs: '%s'", line)
2409 return all_devs
2410
2411
2412 def BridgesExist(bridges_list):
2413 """Check if a list of bridges exist on the current node.
2414
2415 @raise RPCFail: if any of the bridges is missing
2417
2418 """
2419 missing = []
2420 for bridge in bridges_list:
2421 if not utils.BridgeExists(bridge):
2422 missing.append(bridge)
2423
2424 if missing:
2425 _Fail("Missing bridges %s", utils.CommaJoin(missing))
2426
2427
2428 def GetInstanceListForHypervisor(hname, hvparams=None,
2429 get_hv_fn=hypervisor.GetHypervisor):
2430 """Provides a list of instances of the given hypervisor.
2431
2432 @type hname: string
2433 @param hname: name of the hypervisor
2434 @type hvparams: dict of strings
2435 @param hvparams: hypervisor parameters for the given hypervisor
2436 @type get_hv_fn: function
2437 @param get_hv_fn: function that returns a hypervisor for the given hypervisor
2438 name; optional parameter to increase testability
2439
2440 @rtype: list
2441 @return: a list of all running instances on the current node
2442 - instance1.example.com
2443 - instance2.example.com
2444
2445 """
2446 try:
2447 return get_hv_fn(hname).ListInstances(hvparams=hvparams)
2448 except errors.HypervisorError, err:
2449 _Fail("Error enumerating instances (hypervisor %s): %s",
2450 hname, err, exc=True)
2451
2452
2453 def GetInstanceList(hypervisor_list, all_hvparams=None,
2454 get_hv_fn=hypervisor.GetHypervisor):
2455 """Provides a list of instances.
2456
2457 @type hypervisor_list: list
2458 @param hypervisor_list: the list of hypervisors to query information
2459 @type all_hvparams: dict of dict of strings
2460 @param all_hvparams: a dictionary mapping hypervisor types to respective
2461 cluster-wide hypervisor parameters
2462 @type get_hv_fn: function
2463 @param get_hv_fn: function that returns a hypervisor for the given hypervisor
2464 name; optional parameter to increase testability
2465
2466 @rtype: list
2467 @return: a list of all running instances on the current node
2468 - instance1.example.com
2469 - instance2.example.com
2470
2471 """
2472 results = []
2473 for hname in hypervisor_list:
2474 hvparams = all_hvparams[hname]
2475 results.extend(GetInstanceListForHypervisor(hname, hvparams=hvparams,
2476 get_hv_fn=get_hv_fn))
2477 return results
2478
2479
2480 def GetInstanceInfo(instance, hname, hvparams=None):
2481 """Gives back the information about an instance as a dictionary.
2482
2483 @type instance: string
2484 @param instance: the instance name
2485 @type hname: string
2486 @param hname: the hypervisor type of the instance
2487 @type hvparams: dict of strings
2488 @param hvparams: the instance's hvparams
2489
2490 @rtype: dict
2491 @return: dictionary with the following keys:
2492 - memory: memory size of instance (int)
2493 - state: state of instance (HvInstanceState)
2494 - time: cpu time of instance (float)
2495 - vcpus: the number of vcpus (int)
2496
2497 """
2498 output = {}
2499
2500 iinfo = hypervisor.GetHypervisor(hname).GetInstanceInfo(instance,
2501 hvparams=hvparams)
2502 if iinfo is not None:
2503 output["memory"] = iinfo[2]
2504 output["vcpus"] = iinfo[3]
2505 output["state"] = iinfo[4]
2506 output["time"] = iinfo[5]
2507
2508 return output
2509
2510
2511 def GetInstanceMigratable(instance):
2512 """Computes whether an instance can be migrated.
2513
2514 @type instance: L{objects.Instance}
2515 @param instance: object representing the instance to be checked.
2516
2517 @rtype: None
2518 @raise RPCFail: if the instance is not running; missing block device
2519 symlinks are only logged as warnings
2521
2522 """
2523 hyper = hypervisor.GetHypervisor(instance.hypervisor)
2524 iname = instance.name
2525 if iname not in hyper.ListInstances(hvparams=instance.hvparams):
2526 _Fail("Instance %s is not running", iname)
2527
2528 for idx in range(len(instance.disks_info)):
2529 link_name = _GetBlockDevSymlinkPath(iname, idx)
2530 if not os.path.islink(link_name):
2531 logging.warning("Instance %s is missing symlink %s for disk %d",
2532 iname, link_name, idx)
2533
2534
2535 def GetAllInstancesInfo(hypervisor_list, all_hvparams):
2536 """Gather data about all instances.
2537
2538 This is the equivalent of L{GetInstanceInfo}, except that it
2539 computes data for all instances at once, thus being faster if one
2540 needs data about more than one instance.
2541
2542 @type hypervisor_list: list
2543 @param hypervisor_list: list of hypervisors to query for instance data
2544 @type all_hvparams: dict of dict of strings
2545 @param all_hvparams: mapping of hypervisor names to hvparams
2546
2547 @rtype: dict
2548 @return: dictionary of instance: data, with data having the following keys:
2549 - memory: memory size of instance (int)
2550 - state: state of instance (HvInstanceState)
2551 - time: cpu time of instance (float)
2552 - vcpus: the number of vcpus
2553
2554 """
2555 output = {}
2556 for hname in hypervisor_list:
2557 hvparams = all_hvparams[hname]
2558 iinfo = hypervisor.GetHypervisor(hname).GetAllInstancesInfo(hvparams)
2559 if iinfo:
2560 for name, _, memory, vcpus, state, times in iinfo:
2561 value = {
2562 "memory": memory,
2563 "vcpus": vcpus,
2564 "state": state,
2565 "time": times,
2566 }
2567 if name in output:
2568 # we only check static parameters, like memory and vcpus,
2569 # and not state and time which can change between the
2570 # invocations of the different hypervisors
2571 for key in "memory", "vcpus":
2572 if value[key] != output[name][key]:
2573 _Fail("Instance %s is running twice"
2574 " with different parameters", name)
2575 output[name] = value
2576
2577 return output
2578
2579
2580 def GetInstanceConsoleInfo(instance_param_dict,
2581 get_hv_fn=hypervisor.GetHypervisor):
2582 """Gather data about the console access of a set of instances of this node.
2583
2584 This function assumes that the caller already knows which instances are on
2585 this node, by calling a function such as L{GetAllInstancesInfo} or
2586 L{GetInstanceList}.
2587
2588 For every instance, a large amount of configuration data needs to be
2589 provided to the hypervisor interface in order to receive the console
2590 information. Whether this could or should be cut down can be discussed.
2591 The information is provided in a dictionary indexed by instance name,
2592 allowing any number of instance queries to be done.
2593
2594 @type instance_param_dict: dict of string to dict of dictionaries, where
2595 the inner dictionaries represent: L{objects.Instance}, L{objects.Node},
2596 L{objects.NodeGroup}, HvParams, BeParams
2597 @param instance_param_dict: mapping of instance name to parameters necessary
2598 for console information retrieval
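Hypothetical shape of one entry (keys as used in the code below):
{"inst1.example.com": {"instance": <Instance dict>, "node": <Node dict>,
"group": <NodeGroup dict>, "hvParams": {...}, "beParams": {...}}}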
2599
2600 @rtype: dict
2601 @return: dictionary of instance: data, with data having the following keys:
2602 - instance: instance name
2603 - kind: console kind
2604 - message: used with kind == CONS_MESSAGE, indicates that the console
2605 is unavailable and supplies the error message
2606 - host: host to connect to
2607 - port: port to use
2608 - user: user for login
2609 - command: the command, broken into parts as an array
2610 - display: unknown, potentially unused?
2611
2612 """
2613
2614 output = {}
2615 for inst_name in instance_param_dict:
2616 instance = instance_param_dict[inst_name]["instance"]
2617 pnode = instance_param_dict[inst_name]["node"]
2618 group = instance_param_dict[inst_name]["group"]
2619 hvparams = instance_param_dict[inst_name]["hvParams"]
2620 beparams = instance_param_dict[inst_name]["beParams"]
2621
2622 instance = objects.Instance.FromDict(instance)
2623 pnode = objects.Node.FromDict(pnode)
2624 group = objects.NodeGroup.FromDict(group)
2625
2626 h = get_hv_fn(instance.hypervisor)
2627 output[inst_name] = h.GetInstanceConsole(instance, pnode, group,
2628 hvparams, beparams).ToDict()
2629
2630 return output
2631
2632
2633 def _InstanceLogName(kind, os_name, instance, component):
2634 """Compute the OS log filename for a given instance and operation.
2635
2636 The instance name and os name are passed in as strings since not all
2637 operations have these as part of an instance object.
2638
2639 @type kind: string
2640 @param kind: the operation type (e.g. add, import, etc.)
2641 @type os_name: string
2642 @param os_name: the os name
2643 @type instance: string
2644 @param instance: the name of the instance being imported/added/etc.
2645 @type component: string or None
2646 @param component: the name of the component of the instance being
2647 transferred
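
For example (hypothetical values): kind "add", OS "debootstrap", instance
"inst1" and component "disk-0" yield a log name of the form
"add-debootstrap-inst1-disk-0-<timestamp>.log" under C{pathutils.LOG_OS_DIR}.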
2648
2649 """
2650 # TODO: Use tempfile.mkstemp to create unique filename
2651 if component:
2652 assert "/" not in component
2653 c_msg = "-%s" % component
2654 else:
2655 c_msg = ""
2656 base = ("%s-%s-%s%s-%s.log" %
2657 (kind, os_name, instance, c_msg, utils.TimestampForFilename()))
2658 return utils.PathJoin(pathutils.LOG_OS_DIR, base)
2659
2660
2661 def InstanceOsAdd(instance, reinstall, debug):
2662 """Add an OS to an instance.
2663
2664 @type instance: L{objects.Instance}
2665 @param instance: Instance whose OS is to be installed
2666 @type reinstall: boolean
2667 @param reinstall: whether this is an instance reinstall
2668 @type debug: integer
2669 @param debug: debug level, passed to the OS scripts
2670 @rtype: None
2671
2672 """
2673 inst_os = OSFromDisk(instance.os)
2674
2675 create_env = OSEnvironment(instance, inst_os, debug)
2676 if reinstall:
2677 create_env["INSTANCE_REINSTALL"] = "1"
2678
2679 logfile = _InstanceLogName("add", instance.os, instance.name, None)
2680
2681 result = utils.RunCmd([inst_os.create_script], env=create_env,
2682 cwd=inst_os.path, output=logfile, reset_env=True)
2683 if result.failed:
2684 logging.error("os create command '%s' returned error: %s, logfile: %s,"
2685 " output: %s", result.cmd, result.fail_reason, logfile,
2686 result.output)
2687 lines = [utils.SafeEncode(val)
2688 for val in utils.TailFile(logfile, lines=20)]
2689 _Fail("OS create script failed (%s), last lines in the"
2690 " log file:\n%s", result.fail_reason, "\n".join(lines), log=False)
2691
2692
2693 def RunRenameInstance(instance, old_name, debug):
2694 """Run the OS rename script for an instance.
2695
2696 @type instance: L{objects.Instance}
2697 @param instance: Instance whose OS is to be installed
2698 @type old_name: string
2699 @param old_name: previous instance name
2700 @type debug: integer
2701 @param debug: debug level, passed to the OS scripts
2702 @rtype: boolean
2703 @return: the success of the operation
2704
2705 """
2706 inst_os = OSFromDisk(instance.os)
2707
2708 rename_env = OSEnvironment(instance, inst_os, debug)
2709 rename_env["OLD_INSTANCE_NAME"] = old_name
2710
2711 logfile = _InstanceLogName("rename", instance.os,
2712 "%s-%s" % (old_name, instance.name), None)
2713
2714 result = utils.RunCmd([inst_os.rename_script], env=rename_env,
2715 cwd=inst_os.path, output=logfile, reset_env=True)
2716
2717 if result.failed:
2718 logging.error("os create command '%s' returned error: %s output: %s",
2719 result.cmd, result.fail_reason, result.output)
2720 lines = [utils.SafeEncode(val)
2721 for val in utils.TailFile(logfile, lines=20)]
2722 _Fail("OS rename script failed (%s), last lines in the"
2723 " log file:\n%s", result.fail_reason, "\n".join(lines), log=False)
2724
2725
2726 def _GetBlockDevSymlinkPath(instance_name, idx, _dir=None):
2727 """Returns symlink path for block device.
2728
2729 """
2730 if _dir is None:
2731 _dir = pathutils.DISK_LINKS_DIR
2732
2733 return utils.PathJoin(_dir,
2734 ("%s%s%s" %
2735 (instance_name, constants.DISK_SEPARATOR, idx)))
2736
2737
2738 def _SymlinkBlockDev(instance_name, device_path, idx):
2739 """Set up symlinks to a instance's block device.
2740
2741 This is an auxiliary function run when an instance is started (on the
2742 primary node) or when an instance is migrated (on the target node).
2743
2745 @param instance_name: the name of the target instance
2746 @param device_path: path of the physical block device, on the node
2747 @param idx: the disk index
2748 @return: absolute path to the disk's symlink
2749
2750 """
2751 # In case we have only a userspace access URI, device_path is None
2752 if not device_path:
2753 return None
2754
2755 link_name = _GetBlockDevSymlinkPath(instance_name, idx)
2756 try:
2757 os.symlink(device_path, link_name)
2758 except OSError, err:
2759 if err.errno == errno.EEXIST:
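# A stale or foreign link may already exist; replace it only if it does
# not already point at the desired device.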
2760 if (not os.path.islink(link_name) or
2761 os.readlink(link_name) != device_path):
2762 os.remove(link_name)
2763 os.symlink(device_path, link_name)
2764 else:
2765 raise
2766
2767 return link_name
2768
2769
2770 def _RemoveBlockDevLinks(instance_name, disks):
2771 """Remove the block device symlinks belonging to the given instance.
2772
2773 """
2774 for idx, _ in enumerate(disks):
2775 link_name = _GetBlockDevSymlinkPath(instance_name, idx)
2776 if os.path.islink(link_name):
2777 try:
2778 os.remove(link_name)
2779 except OSError:
2780 logging.exception("Can't remove symlink '%s'", link_name)
2781
2782
2783 def _CalculateDeviceURI(instance, disk, device):
2784 """Get the URI for the device.
2785
2786 @type instance: L{objects.Instance}
2787 @param instance: the instance which disk belongs to
2788 @type disk: L{objects.Disk}
2789 @param disk: the target disk object
2790 @type device: L{bdev.BlockDev}
2791 @param device: the corresponding BlockDevice
2792 @rtype: string
2793 @return: the device URI, if any, else None
2794
2795 """
2796 access_mode = disk.params.get(constants.LDP_ACCESS,
2797 constants.DISK_KERNELSPACE)
2798 if access_mode == constants.DISK_USERSPACE:
2799 # This can raise errors.BlockDeviceError
2800 return device.GetUserspaceAccessUri(instance.hypervisor)
2801 else:
2802 return None
2803
2804
2805 def _GatherAndLinkBlockDevs(instance):
2806 """Set up an instance's block device(s).
2807
2808 This is run on the primary node at instance startup. The block
2809 devices must be already assembled.
2810
2811 @type instance: L{objects.Instance}
2812 @param instance: the instance whose disks we should assemble
2813 @rtype: list
2814 @return: list of (disk_object, link_name, drive_uri)
2815
2816 """
2817 block_devices = []
2818 for idx, disk in enumerate(instance.disks_info):
2819 device = _RecursiveFindBD(disk)
2820 if device is None:
2821 raise errors.BlockDeviceError("Block device '%s' is not set up." %
2822 str(disk))
2823 device.Open()
2824 try:
2825 link_name = _SymlinkBlockDev(instance.name, device.dev_path, idx)
2826 except OSError, e:
2827 raise errors.BlockDeviceError("Cannot create block device symlink: %s" %
2828 e.strerror)
2829 uri = _CalculateDeviceURI(instance, disk, device)
2830
2831 block_devices.append((disk, link_name, uri))
2832
2833 return block_devices
2834
2835
2836 def _IsInstanceUserDown(instance_info):
2837 return instance_info and \
2838 "state" in instance_info and \
2839 hv_base.HvInstanceState.IsShutdown(instance_info["state"])
2840
2841
2842 def _GetInstanceInfo(instance):
2843 """Helper function L{GetInstanceInfo}"""
2844 return GetInstanceInfo(instance.name, instance.hypervisor,
2845 hvparams=instance.hvparams)
2846
2847
2848 def StartInstance(instance, startup_paused, reason, store_reason=True):
2849 """Start an instance.
2850
2851 @type instance: L{objects.Instance}
2852 @param instance: the instance object
2853 @type startup_paused: bool
2854 @param startup_paused: whether to pause the instance at startup
2855 @type reason: list of reasons
2856 @param reason: the reason trail for this startup
2857 @type store_reason: boolean
2858 @param store_reason: whether to store the startup reason trail on file
2859 @rtype: None
2860
2861 """
2862 instance_info = _GetInstanceInfo(instance)
2863
2864 if instance_info and not _IsInstanceUserDown(instance_info):
2865 logging.info("Instance '%s' already running, not starting", instance.name)
2866 return
2867
2868 try:
2869 block_devices = _GatherAndLinkBlockDevs(instance)
2870 hyper = hypervisor.GetHypervisor(instance.hypervisor)
2871 hyper.StartInstance(instance, block_devices, startup_paused)
2872 if store_reason:
2873 _StoreInstReasonTrail(instance.name, reason)
2874 except errors.BlockDeviceError, err:
2875 _Fail("Block device error: %s", err, exc=True)
2876 except errors.HypervisorError, err:
2877 _RemoveBlockDevLinks(instance.name, instance.disks_info)
2878 _Fail("Hypervisor error: %s", err, exc=True)
2879
2880
2881 def InstanceShutdown(instance, timeout, reason, store_reason=True):
2882 """Shut an instance down.
2883
2884 @note: this function uses polling with a hardcoded timeout.
2885
2886 @type instance: L{objects.Instance}
2887 @param instance: the instance object
2888 @type timeout: integer
2889 @param timeout: maximum timeout for soft shutdown
2890 @type reason: list of reasons
2891 @param reason: the reason trail for this shutdown
2892 @type store_reason: boolean
2893 @param store_reason: whether to store the shutdown reason trail on file
2894 @rtype: None
2895
2896 """
2897 hyper = hypervisor.GetHypervisor(instance.hypervisor)
2898
2899 if not _GetInstanceInfo(instance):
2900 logging.info("Instance '%s' not running, doing nothing", instance.name)
2901 return
2902
2903 class _TryShutdown(object):
2904 def __init__(self):
2905 self.tried_once = False
2906
2907 def __call__(self):
2908 if not _GetInstanceInfo(instance):
2909 return
2910
2911 try:
2912 hyper.StopInstance(instance, retry=self.tried_once, timeout=timeout)
2913 if store_reason:
2914 _StoreInstReasonTrail(instance.name, reason)
2915 except errors.HypervisorError, err:
2916 # if the instance is no longer existing, consider this a
2917 # success and go to cleanup
2918 if not _GetInstanceInfo(instance):
2919 return
2920
2921 _Fail("Failed to stop instance '%s': %s", instance.name, err)
2922
2923 self.tried_once = True
2924
2925 raise utils.RetryAgain()
2926
2927 try:
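# Poll the soft shutdown every 5 seconds until the given timeout expires;
# _TryShutdown raises RetryAgain while the instance is still running.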
2928 utils.Retry(_TryShutdown(), 5, timeout)
2929 except utils.RetryTimeout:
2930 # the shutdown did not succeed
2931 logging.error("Shutdown of '%s' unsuccessful, forcing", instance.name)
2932
2933 try:
2934 hyper.StopInstance(instance, force=True)
2935 except errors.HypervisorError, err:
2936 # only raise an error if the instance still exists, otherwise
2937 # the error could simply be "instance ... unknown"!
2938 if _GetInstanceInfo(instance):
2939 _Fail("Failed to force stop instance '%s': %s", instance.name, err)
2940
2941 time.sleep(1)
2942
2943 if _GetInstanceInfo(instance):
2944 _Fail("Could not shutdown instance '%s' even by destroy", instance.name)
2945
2946 try:
2947 hyper.CleanupInstance(instance.name)
2948 except errors.HypervisorError, err:
2949 logging.warning("Failed to execute post-shutdown cleanup step: %s", err)
2950
2951 _RemoveBlockDevLinks(instance.name, instance.disks_info)
2952
2953
2954 def InstanceReboot(instance, reboot_type, shutdown_timeout, reason):
2955 """Reboot an instance.
2956
2957 @type instance: L{objects.Instance}
2958 @param instance: the instance object to reboot
2959 @type reboot_type: str
2960 @param reboot_type: the type of reboot, one of the following
2961 constants:
2962 - L{constants.INSTANCE_REBOOT_SOFT}: only reboot the
2963 instance OS, do not recreate the VM
2964 - L{constants.INSTANCE_REBOOT_HARD}: tear down and
2965 restart the VM (at the hypervisor level)
2966 - the other reboot type (L{constants.INSTANCE_REBOOT_FULL}) is
2967 not accepted here, since that mode is handled differently, in
2968 cmdlib, and translates into full stop and start of the
2969 instance (instead of a call_instance_reboot RPC)
2970 @type shutdown_timeout: integer
2971 @param shutdown_timeout: maximum timeout for soft shutdown
2972 @type reason: list of reasons
2973 @param reason: the reason trail for this reboot
2974 @rtype: None
2975
2976 """
2977 # TODO: this is inconsistent with 'StartInstance' and 'InstanceShutdown'
2978 # because those functions simply 'return' on error whereas this one
2979 # raises an exception with '_Fail'
2980 if not _GetInstanceInfo(instance):
2981 _Fail("Cannot reboot instance '%s' that is not running", instance.name)
2982
2983 hyper = hypervisor.GetHypervisor(instance.hypervisor)
2984 if reboot_type == constants.INSTANCE_REBOOT_SOFT:
2985 try:
2986 hyper.RebootInstance(instance)
2987 except errors.HypervisorError, err:
2988 _Fail("Failed to soft reboot instance '%s': %s", instance.name, err)
2989 elif reboot_type == constants.INSTANCE_REBOOT_HARD:
2990 try:
2991 InstanceShutdown(instance, shutdown_timeout, reason, store_reason=False)
2992 result = StartInstance(instance, False, reason, store_reason=False)
2993 _StoreInstReasonTrail(instance.name, reason)
2994 return result
2995 except errors.HypervisorError, err:
2996 _Fail("Failed to hard reboot instance '%s': %s", instance.name, err)
2997 else:
2998 _Fail("Invalid reboot_type received: '%s'", reboot_type)
2999
3000
3001 def InstanceBalloonMemory(instance, memory):
3002 """Resize an instance's memory.
3003
3004 @type instance: L{objects.Instance}
3005 @param instance: the instance object
3006 @type memory: int
3007 @param memory: new memory amount in MB
3008 @rtype: None
3009
3010 """
3011 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3012 running = hyper.ListInstances(hvparams=instance.hvparams)
3013 if instance.name not in running:
3014 logging.info("Instance %s is not running, cannot balloon", instance.name)
3015 return
3016 try:
3017 hyper.BalloonInstanceMemory(instance, memory)
3018 except errors.HypervisorError, err:
3019 _Fail("Failed to balloon instance memory: %s", err, exc=True)
3020
3021
3022 def MigrationInfo(instance):
3023 """Gather information about an instance to be migrated.
3024
3025 @type instance: L{objects.Instance}
3026 @param instance: the instance definition
3027
3028 """
3029 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3030 try:
3031 info = hyper.MigrationInfo(instance)
3032 except errors.HypervisorError, err:
3033 _Fail("Failed to fetch migration information: %s", err, exc=True)
3034 return info
3035
3036
3037 def AcceptInstance(instance, info, target):
3038 """Prepare the node to accept an instance.
3039
3040 @type instance: L{objects.Instance}
3041 @param instance: the instance definition
3042 @type info: string/data (opaque)
3043 @param info: migration information, from the source node
3044 @type target: string
3045 @param target: target host (usually ip), on this node
3046
3047 """
3048 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3049 try:
3050 hyper.AcceptInstance(instance, info, target)
3051 except errors.HypervisorError, err:
3052 _Fail("Failed to accept instance: %s", err, exc=True)
3053
3054
3055 def FinalizeMigrationDst(instance, info, success):
3056 """Finalize any preparation to accept an instance.
3057
3058 @type instance: L{objects.Instance}
3059 @param instance: the instance definition
3060 @type info: string/data (opaque)
3061 @param info: migration information, from the source node
3062 @type success: boolean
3063 @param success: whether the migration was a success or a failure
3064
3065 """
3066 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3067 try:
3068 hyper.FinalizeMigrationDst(instance, info, success)
3069 except errors.HypervisorError, err:
3070 _Fail("Failed to finalize migration on the target node: %s", err, exc=True)
3071
3072
3073 def MigrateInstance(cluster_name, instance, target, live):
3074 """Migrates an instance to another node.
3075
3076 @type cluster_name: string
3077 @param cluster_name: name of the cluster
3078 @type instance: L{objects.Instance}
3079 @param instance: the instance definition
3080 @type target: string
3081 @param target: the target node name
3082 @type live: boolean
3083 @param live: whether the migration should be done live or not (the
3084 interpretation of this parameter is left to the hypervisor)
3085 @raise RPCFail: if migration fails for some reason
3086
3087 """
3088 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3089
3090 try:
3091 hyper.MigrateInstance(cluster_name, instance, target, live)
3092 except errors.HypervisorError, err:
3093 _Fail("Failed to migrate instance: %s", err, exc=True)
3094
3095
3096 def FinalizeMigrationSource(instance, success, live):
3097 """Finalize the instance migration on the source node.
3098
3099 @type instance: L{objects.Instance}
3100 @param instance: the instance definition of the migrated instance
3101 @type success: bool
3102 @param success: whether the migration succeeded or not
3103 @type live: bool
3104 @param live: whether the user requested a live migration or not
3105 @raise RPCFail: If the execution fails for some reason
3106
3107 """
3108 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3109
3110 try:
3111 hyper.FinalizeMigrationSource(instance, success, live)
3112 except Exception, err: # pylint: disable=W0703
3113 _Fail("Failed to finalize the migration on the source node: %s", err,
3114 exc=True)
3115
3116
3117 def GetMigrationStatus(instance):
3118 """Get the migration status
3119
3120 @type instance: L{objects.Instance}
3121 @param instance: the instance that is being migrated
3122 @rtype: L{objects.MigrationStatus}
3123 @return: the status of the current migration (one of
3124 L{constants.HV_MIGRATION_VALID_STATUSES}), plus any additional
3125 progress info that can be retrieved from the hypervisor
3126 @raise RPCFail: If the migration status cannot be retrieved
3127
3128 """
3129 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3130 try:
3131 return hyper.GetMigrationStatus(instance)
3132 except Exception, err: # pylint: disable=W0703
3133 _Fail("Failed to get migration status: %s", err, exc=True)
3134
3135
3136 def HotplugDevice(instance, action, dev_type, device, extra, seq):
3137 """Hotplug a device
3138
3139 Hotplug is currently supported only for the KVM hypervisor.

3140 @type instance: L{objects.Instance}
3141 @param instance: the instance to which we hotplug a device
3142 @type action: string
3143 @param action: the hotplug action to perform
3144 @type dev_type: string
3145 @param dev_type: the device type to hotplug
3146 @type device: either L{objects.NIC} or L{objects.Disk}
3147 @param device: the device object to hotplug
3148 @type extra: tuple
3149 @param extra: extra info used for disk hotplug (disk link, drive uri)
3150 @type seq: int
3151 @param seq: the index of the device from the master's perspective
3152 @raise RPCFail: in case the instance does not use the KVM hypervisor
3153
3154 """
3155 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3156 try:
3157 hyper.VerifyHotplugSupport(instance, action, dev_type)
3158 except errors.HotplugError, err:
3159 _Fail("Hotplug is not supported: %s", err)
3160
3161 if action == constants.HOTPLUG_ACTION_ADD:
3162 fn = hyper.HotAddDevice
3163 elif action == constants.HOTPLUG_ACTION_REMOVE:
3164 fn = hyper.HotDelDevice
3165 elif action == constants.HOTPLUG_ACTION_MODIFY:
3166 fn = hyper.HotModDevice
3167 else:
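    # The three actions handled above exhaust HOTPLUG_ALL_ACTIONS, so
    # reaching this branch means the caller passed an unknown action;
    # the assertion reports that directly instead of letting the
    # undefined "fn" below fail with a NameError.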
3168 assert action in constants.HOTPLUG_ALL_ACTIONS
3169
3170 return fn(instance, dev_type, device, extra, seq)
3171
3172
3173 def HotplugSupported(instance):
3174 """Checks if hotplug is generally supported.
3175
3176 """
3177 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3178 try:
3179 hyper.HotplugSupported(instance)
3180 except errors.HotplugError, err:
3181 _Fail("Hotplug is not supported: %s", err)
3182
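
# Illustrative only: a hedged sketch combining the two hotplug entry
# points above. "_DemoHotAddNic" is a hypothetical helper; it assumes
# a KVM instance, an L{objects.NIC} already stored at index "seq" in
# the instance's configuration, and that NIC hotplug needs no extra
# info (hence None for "extra").
def _DemoHotAddNic(instance, nic, seq):
  # Fails early (via _Fail/RPCFail) if this instance cannot hotplug.
  HotplugSupported(instance)
  return HotplugDevice(instance, constants.HOTPLUG_ACTION_ADD,
                       constants.HOTPLUG_TARGET_NIC, nic, None, seq)
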
3183
3184 def ModifyInstanceMetadata(metadata):
3185 """Sends instance data to the metadata daemon.
3186
3187 Uses the Luxi transport layer to communicate with the metadata
3188 daemon configuration server. It starts the metadata daemon if it is
3189 not running.
3190 The daemon must be enabled at configuration time.
3191
3192 @type metadata: dict
3193 @param metadata: instance metadata obtained by calling
3194 L{objects.Instance.ToDict} on an instance object
3195
3196 """
3197 if not constants.ENABLE_METAD:
3198 raise errors.ProgrammerError("The metadata daemon is disabled, yet
3199 " ModifyInstanceMetadata has been called")
3200
3201 if not utils.IsDaemonAlive(constants.METAD):
3202 result = utils.RunCmd([pathutils.DAEMON_UTIL, "start", constants.METAD])
3203 if result.failed:
3204 raise errors.HypervisorError("Failed to start metadata daemon")
3205
3206 with contextlib.closing(metad.Client()) as client:
3207 client.UpdateConfig(metadata)
3208
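
# Illustrative only: a call site would serialize the instance object
# first, as the docstring above prescribes. "_DemoPushMetadata" is a
# hypothetical wrapper and assumes metad was enabled at configure
# time.
def _DemoPushMetadata(instance):
  # The daemon receives the full dict produced by ToDict().
  ModifyInstanceMetadata(instance.ToDict())
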
3209
3210 def BlockdevCreate(disk, size, owner, on_primary, info, excl_stor):
3211 """Creates a block device for an instance.
3212
3213 @type disk: L{objects.Disk}
3214 @param disk: the object describing the disk we should create
3215 @type size: int
3216 @param size: the size of the physical underlying device, in MiB
3217 @type owner: str
3218 @param owner: the name of the instance for which disk is created,
3219 used for device cache data
3220 @type on_primary: boolean
3221 @param on_primary: indicates if it is the primary node or not
3222 @type info: string
3223 @param info: string that will be sent to the physical device
3224 creation, used for example to set (LVM) tags on LVs
3225 @type excl_stor: boolean
3226 @param excl_stor: Whether exclusive_storage is active
3227
3228 @return: the new unique_id of the device (this can sometimes be
3229 computed only after creation), or None. On secondary nodes,
3230 it's not required to return anything.
3231
3232 """
3233 # TODO: remove the obsolete "size" argument
3234 # pylint: disable=W0613
3235 clist = []
3236 if disk.children:
3237 for child in disk.children:
3238 try:
3239 crdev = _RecursiveAssembleBD(child, owner, on_primary)
3240 except errors.BlockDeviceError, err:
3241 _Fail("Can't assemble device %s: %s", child, err)
3242 if on_primary or disk.AssembleOnSecondary():
3243 # we need the children open in case the device itself has to
3244 # be assembled
3245 try:
3246 # pylint: disable=E1103
3247 crdev.Open()
3248 except errors.BlockDeviceError, err:
3249 _Fail("Can't make child '%s' read-write: %s", child, err)
3250 clist.append(crdev)
3251
3252 try:
3253 device = bdev.Create(disk, clist, excl_stor)
3254 except errors.BlockDeviceError, err:
3255 _Fail("Can't create block device: %s", err)
3256
3257 if on_primary or disk.AssembleOnSecondary():
3258 try:
3259 device.Assemble()
3260 except errors.BlockDeviceError, err:
3261 _Fail("Can't assemble device after creation, unusual event: %s", err)
3262 if on_primary or disk.OpenOnSecondary():
3263 try:
3264 device.Open(force=True)
3265 except errors.BlockDeviceError, err:
3266 _Fail("Can't make device r/w after creation, unusual event: %s", err)
3267 DevCacheManager.UpdateCache(device.dev_path, owner,
3268 on_primary, disk.iv_name)
3269
3270 device.SetInfo(info)
3271
3272 return device.unique_id
3273
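
# Illustrative only: for a plain LVM-backed disk the master would end
# up invoking the function above roughly as follows, where "disk" is
# a fully populated L{objects.Disk} (logical_id, size and params are
# decided by the master) and the instance name and info text are
# assumptions for the example:
#
#   uid = BlockdevCreate(disk, disk.size, "instance1.example.com",
#                        True, "originstname+instance1.example.com",
#                        False)
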
3274
3275 def _DumpDevice(source_path, target_path, offset, size, truncate):
3276 """This function images/wipes the device using a local file.
3277
3278 @type source_path: string
3279 @param source_path: path of the image or data source (e.g., "/dev/zero")
3280
3281 @type target_path: string
3282 @param target_path: path of the device to image/wipe
3283
3284 @type offset: int
3285 @param offset: offset in MiB in the output file
3286
3287 @type size: int
3288 @param size: maximum size in MiB to write (data source might be smaller)
3289
3290 @type truncate: bool
3291 @param truncate: whether the file should be truncated
3292
3293 @return: None
3294 @raise RPCFail: in case of failure
3295
3296 """
3297 # Internal sizes are always in Mebibytes; if the following "dd" command
3298 # should use a different block size, the offset and size given to this
3299 # function must be adjusted accordingly before being passed to "dd".
3300 block_size = constants.DD_BLOCK_SIZE
3301
3302 cmd = [constants.DD_CMD, "if=%s" % source_path, "seek=%d" % offset,
3303 "bs=%s" % block_size, "oflag=direct", "of=%s" % target_path,
3304 "count=%d" % size]
3305
3306 if not truncate:
3307 cmd.append("conv=notrunc")
3308
3309 result = utils.RunCmd(cmd)
3310
3311 if result.failed:
3312 _Fail("Dump command '%s' exited with error: %s; output: %s", result.cmd,
3313 result.fail_reason, result.output)
3314
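
# Illustrative only: wiping the first 512 MiB of a volume with the
# helper above; the device path is an assumption for the example.
# Since internal sizes and DD_BLOCK_SIZE are both one MiB, offset and
# size map directly onto dd's "seek" and "count":
#
#   _DumpDevice("/dev/zero", "/dev/xenvg/example-disk0", 0, 512, False)
#
# which executes approximately:
#   dd if=/dev/zero seek=0 bs=<DD_BLOCK_SIZE> oflag=direct \
#      of=/dev/xenvg/example-disk0 count=512 conv=notrunc
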
3315
3316 def _DownloadAndDumpDevice(source_url, target_path, size):
3317 """This function images a device using a downloaded image file.
3318
3319 @type source_url: string
3320 @param source_url: URL of image to dump to disk
3321
3322 @type target_path: string
3323 @param target_path: path of the device to image
3324
3325 @type size: int
3326 @param size: maximum size in MiB to write (data source might be smaller)
3327
3328 @rtype: NoneType
3329 @return: None
3330 @raise RPCFail: in case of download or write failures
3331
3332 """
3333 class DDParams(object):
3334 def __init__(self, current_size, total_size):
3335 self.current_size = current_size
3336 self.total_size = total_size
3337 self.image_size_error = False
3338
3339 def dd_write(ddparams, out):
3340 if ddparams.current_size < ddparams.total_size:
3341 ddparams.current_size += len(out)
3342 target_file.write(out)
3343 else:
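      # dd_write serves as the pycurl write callback: per pycurl's
      # convention, returning a value different from len(out) aborts
      # the transfer, so -1 stops the download once the image turns
      # out to be larger than the device.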
3344 ddparams.image_size_error = True
3345 return -1
3346
3347 target_file = open(target_path, "r+")
3348 ddparams = DDParams(0,