#
#

# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014 Google Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


"""Functions used by the node daemon

@var _ALLOWED_UPLOAD_FILES: denotes which files are accepted in
  the L{UploadFile} function
@var _ALLOWED_CLEAN_DIRS: denotes which directories are accepted
  in the L{_CleanDirectory} function

"""

# pylint: disable=E1103,C0302

# E1103: %s %r has no %r member (but some types could not be
# inferred), because the _TryOSFromDisk returns either (True, os_obj)
# or (False, "string") which confuses pylint

# C0302: This module has become too big and should be split up

import base64
import contextlib
import collections
import errno
import logging
import os
import os.path
import random
import re
import shutil
import signal
import stat
import tempfile
import time
import zlib

import pycurl

from ganeti import errors
from ganeti import http
from ganeti import utils
from ganeti import ssh
from ganeti import hypervisor
from ganeti.hypervisor import hv_base
from ganeti import constants
from ganeti.storage import bdev
from ganeti.storage import drbd
from ganeti.storage import extstorage
from ganeti.storage import filestorage
from ganeti import objects
from ganeti import ssconf
from ganeti import serializer
from ganeti import netutils
from ganeti import runtime
from ganeti import compat
from ganeti import pathutils
from ganeti import vcluster
from ganeti import ht
from ganeti.storage.base import BlockDev
from ganeti.storage.drbd import DRBD8
from ganeti import hooksmaster
import ganeti.metad as metad


_BOOT_ID_PATH = "/proc/sys/kernel/random/boot_id"
_ALLOWED_CLEAN_DIRS = compat.UniqueFrozenset([
  pathutils.DATA_DIR,
  pathutils.JOB_QUEUE_ARCHIVE_DIR,
  pathutils.QUEUE_DIR,
  pathutils.CRYPTO_KEYS_DIR,
  ])
_MAX_SSL_CERT_VALIDITY = 7 * 24 * 60 * 60
_X509_KEY_FILE = "key"
_X509_CERT_FILE = "cert"
_IES_STATUS_FILE = "status"
_IES_PID_FILE = "pid"
_IES_CA_FILE = "ca"

#: Valid LVS output line regex
_LVSLINE_REGEX = re.compile(r"^ *([^|]+)\|([^|]+)\|([0-9.]+)\|([^|]{6,})\|?$")

# Actions for the master setup script
_MASTER_START = "start"
_MASTER_STOP = "stop"

#: Maximum file permissions for restricted command directory and executables
_RCMD_MAX_MODE = (stat.S_IRWXU |
                  stat.S_IRGRP | stat.S_IXGRP |
                  stat.S_IROTH | stat.S_IXOTH)

#: Delay before returning an error for restricted commands
_RCMD_INVALID_DELAY = 10

#: How long to wait to acquire lock for restricted commands (shorter than
#: L{_RCMD_INVALID_DELAY}) to reduce blockage of noded forks when many
#: command requests arrive
_RCMD_LOCK_TIMEOUT = _RCMD_INVALID_DELAY * 0.8


class RPCFail(Exception):
  """Class denoting RPC failure.

  Its argument is the error message.

  """


def _GetInstReasonFilename(instance_name):
  """Path of the file containing the reason of the instance status change.

  @type instance_name: string
  @param instance_name: The name of the instance
  @rtype: string
  @return: The path of the file

  """
  return utils.PathJoin(pathutils.INSTANCE_REASON_DIR, instance_name)


def _StoreInstReasonTrail(instance_name, trail):
  """Serialize a reason trail related to an instance change of state to file.

  The exact location of the file depends on the name of the instance and on
  the configuration of the Ganeti cluster defined at deploy time.

  @type instance_name: string
  @param instance_name: The name of the instance

  @type trail: list of reasons
  @param trail: reason trail

  @rtype: None

  """
  json = serializer.DumpJson(trail)
  filename = _GetInstReasonFilename(instance_name)
  utils.WriteFile(filename, data=json)


def _Fail(msg, *args, **kwargs):
  """Log an error and then raise an RPCFail exception.

  This exception is then handled specially in the ganeti daemon and
  turned into a 'failed' return type. As such, this function is a
  useful shortcut for logging the error and returning it to the master
  daemon.

  @type msg: string
  @param msg: the text of the exception
  @raise RPCFail

  """
  if args:
    msg = msg % args
  if "log" not in kwargs or kwargs["log"]: # if we should log this error
    if "exc" in kwargs and kwargs["exc"]:
      logging.exception(msg)
    else:
      logging.error(msg)
  raise RPCFail(msg)

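# Illustrative usage (not part of the original module): "%s"-style arguments
# are interpolated by _Fail itself, and the "log"/"exc" keyword flags control
# how the error is logged before the exception is raised:
#
#   _Fail("Instance %s not found", instance_name)  # logs, then raises RPCFail
#   _Fail("I/O error while reading config", exc=True)  # also logs a traceback
#   _Fail("Silent failure", log=False)  # raises without logging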

def _GetConfig():
  """Simple wrapper to return a SimpleStore.

  @rtype: L{ssconf.SimpleStore}
  @return: a SimpleStore instance

  """
  return ssconf.SimpleStore()


def _GetSshRunner(cluster_name):
  """Simple wrapper to return an SshRunner.

  @type cluster_name: str
  @param cluster_name: the cluster name, which is needed
      by the SshRunner constructor
  @rtype: L{ssh.SshRunner}
  @return: an SshRunner instance

  """
  return ssh.SshRunner(cluster_name)


def _Decompress(data):
  """Unpacks data compressed by the RPC client.

  @type data: list or tuple
  @param data: Data sent by RPC client
  @rtype: str
  @return: Decompressed data

  """
  assert isinstance(data, (list, tuple))
  assert len(data) == 2
  (encoding, content) = data
  if encoding == constants.RPC_ENCODING_NONE:
    return content
  elif encoding == constants.RPC_ENCODING_ZLIB_BASE64:
    return zlib.decompress(base64.b64decode(content))
  else:
    raise AssertionError("Unknown data encoding")

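# Illustrative round-trip (not part of the original module): the RPC client
# sends payloads either unencoded or zlib-compressed and base64-encoded, so
# for example:
#
#   packed = (constants.RPC_ENCODING_ZLIB_BASE64,
#             base64.b64encode(zlib.compress("payload")))
#   assert _Decompress(packed) == "payload"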

def _CleanDirectory(path, exclude=None):
  """Removes all regular files in a directory.

  @type path: str
  @param path: the directory to clean
  @type exclude: list
  @param exclude: list of files to be excluded, defaults
      to the empty list

  """
  if path not in _ALLOWED_CLEAN_DIRS:
    _Fail("Path passed to _CleanDirectory not in allowed clean targets: '%s'",
          path)

  if not os.path.isdir(path):
    return
  if exclude is None:
    exclude = []
  else:
    # Normalize excluded paths
    exclude = [os.path.normpath(i) for i in exclude]

  for rel_name in utils.ListVisibleFiles(path):
    full_name = utils.PathJoin(path, rel_name)
    if full_name in exclude:
      continue
    if os.path.isfile(full_name) and not os.path.islink(full_name):
      utils.RemoveFile(full_name)


def _BuildUploadFileList():
  """Build the list of allowed upload files.

  This is abstracted so that it's built only once at module import time.

  """
  allowed_files = set([
    pathutils.CLUSTER_CONF_FILE,
    pathutils.ETC_HOSTS,
    pathutils.SSH_KNOWN_HOSTS_FILE,
    pathutils.VNC_PASSWORD_FILE,
    pathutils.RAPI_CERT_FILE,
    pathutils.SPICE_CERT_FILE,
    pathutils.SPICE_CACERT_FILE,
    pathutils.RAPI_USERS_FILE,
    pathutils.CONFD_HMAC_KEY,
    pathutils.CLUSTER_DOMAIN_SECRET_FILE,
    ])

  for hv_name in constants.HYPER_TYPES:
    hv_class = hypervisor.GetHypervisorClass(hv_name)
    allowed_files.update(hv_class.GetAncillaryFiles()[0])

  assert pathutils.FILE_STORAGE_PATHS_FILE not in allowed_files, \
    "Allowed file storage paths should never be uploaded via RPC"

  return frozenset(allowed_files)


_ALLOWED_UPLOAD_FILES = _BuildUploadFileList()


def JobQueuePurge():
  """Removes job queue files and archived jobs.

  @rtype: None

  """
  _CleanDirectory(pathutils.QUEUE_DIR, exclude=[pathutils.JOB_QUEUE_LOCK_FILE])
  _CleanDirectory(pathutils.JOB_QUEUE_ARCHIVE_DIR)


def GetMasterNodeName():
  """Returns the master node name.

  @rtype: string
  @return: name of the master node
  @raise RPCFail: in case of errors

  """
  try:
    return _GetConfig().GetMasterNode()
  except errors.ConfigurationError, err:
    _Fail("Cluster configuration incomplete: %s", err, exc=True)


def RunLocalHooks(hook_opcode, hooks_path, env_builder_fn):
  """Decorator that runs hooks before and after the decorated function.

  @type hook_opcode: string
  @param hook_opcode: opcode of the hook
  @type hooks_path: string
  @param hooks_path: path of the hooks
  @type env_builder_fn: function
  @param env_builder_fn: function that returns a dictionary containing the
    environment variables for the hooks. Will get all the parameters of the
    decorated function.
  @raise RPCFail: in case of pre-hook failure

  """
  def decorator(fn):
    def wrapper(*args, **kwargs):
      _, myself = ssconf.GetMasterAndMyself()
      nodes = ([myself], [myself]) # these hooks run locally

      env_fn = compat.partial(env_builder_fn, *args, **kwargs)

      cfg = _GetConfig()
      hr = HooksRunner()
      hm = hooksmaster.HooksMaster(hook_opcode, hooks_path, nodes,
                                   hr.RunLocalHooks, None, env_fn, None,
                                   logging.warning, cfg.GetClusterName(),
                                   cfg.GetMasterNode())
      hm.RunPhase(constants.HOOKS_PHASE_PRE)
      result = fn(*args, **kwargs)
      hm.RunPhase(constants.HOOKS_PHASE_POST)

      return result
    return wrapper
  return decorator


def _BuildMasterIpEnv(master_params, use_external_mip_script=None):
  """Builds environment variables for master IP hooks.

  @type master_params: L{objects.MasterNetworkParameters}
  @param master_params: network parameters of the master
  @type use_external_mip_script: boolean
  @param use_external_mip_script: whether to use an external master IP
    address setup script (unused, but necessary per the implementation of
    the L{RunLocalHooks} decorator)

  """
  # pylint: disable=W0613
  ver = netutils.IPAddress.GetVersionFromAddressFamily(master_params.ip_family)
  env = {
    "MASTER_NETDEV": master_params.netdev,
    "MASTER_IP": master_params.ip,
    "MASTER_NETMASK": str(master_params.netmask),
    "CLUSTER_IP_VERSION": str(ver),
    }

  return env


def _RunMasterSetupScript(master_params, action, use_external_mip_script):
  """Execute the master IP address setup script.

  @type master_params: L{objects.MasterNetworkParameters}
  @param master_params: network parameters of the master
  @type action: string
  @param action: action to pass to the script. Must be one of
    L{backend._MASTER_START} or L{backend._MASTER_STOP}
  @type use_external_mip_script: boolean
  @param use_external_mip_script: whether to use an external master IP
    address setup script
  @raise backend.RPCFail: if there are errors during the execution of the
    script

  """
  env = _BuildMasterIpEnv(master_params)

  if use_external_mip_script:
    setup_script = pathutils.EXTERNAL_MASTER_SETUP_SCRIPT
  else:
    setup_script = pathutils.DEFAULT_MASTER_SETUP_SCRIPT

  result = utils.RunCmd([setup_script, action], env=env, reset_env=True)

  if result.failed:
    _Fail("Failed to %s the master IP. Script return value: %s, output: '%s'" %
          (action, result.exit_code, result.output), log=True)


@RunLocalHooks(constants.FAKE_OP_MASTER_TURNUP, "master-ip-turnup",
               _BuildMasterIpEnv)
def ActivateMasterIp(master_params, use_external_mip_script):
  """Activate the IP address of the master daemon.

  @type master_params: L{objects.MasterNetworkParameters}
  @param master_params: network parameters of the master
  @type use_external_mip_script: boolean
  @param use_external_mip_script: whether to use an external master IP
    address setup script
  @raise RPCFail: in case of errors during the IP startup

  """
  _RunMasterSetupScript(master_params, _MASTER_START,
                        use_external_mip_script)


def StartMasterDaemons(no_voting):
  """Activate local node as master node.

  The function will start the master daemons (currently ganeti-wconfd,
  ganeti-luxid and ganeti-rapi, via daemon-util's "start-master").

  @type no_voting: boolean
  @param no_voting: whether to start the master daemons without a node vote
    but still non-interactively
  @rtype: None

  """

  if no_voting:
    daemon_args = "--no-voting --yes-do-it"
  else:
    daemon_args = ""

  env = {
    "EXTRA_LUXID_ARGS": daemon_args,
    "EXTRA_WCONFD_ARGS": daemon_args,
    }

  result = utils.RunCmd([pathutils.DAEMON_UTIL, "start-master"], env=env)
  if result.failed:
    msg = "Can't start Ganeti master: %s" % result.output
    logging.error(msg)
    _Fail(msg)


@RunLocalHooks(constants.FAKE_OP_MASTER_TURNDOWN, "master-ip-turndown",
               _BuildMasterIpEnv)
def DeactivateMasterIp(master_params, use_external_mip_script):
  """Deactivate the master IP on this node.

  @type master_params: L{objects.MasterNetworkParameters}
  @param master_params: network parameters of the master
  @type use_external_mip_script: boolean
  @param use_external_mip_script: whether to use an external master IP
    address setup script
  @raise RPCFail: in case of errors during the IP turndown

  """
  _RunMasterSetupScript(master_params, _MASTER_STOP,
                        use_external_mip_script)


def StopMasterDaemons():
  """Stop the master daemons on this node.

  Stop the master daemons (via daemon-util's "stop-master") on this node.

  @rtype: None

  """
  # TODO: log and report back to the caller the error failures; we
  # need to decide in which case we fail the RPC for this

  result = utils.RunCmd([pathutils.DAEMON_UTIL, "stop-master"])
  if result.failed:
    logging.error("Could not stop Ganeti master, command %s had exitcode %s"
                  " and error %s",
                  result.cmd, result.exit_code, result.output)


def ChangeMasterNetmask(old_netmask, netmask, master_ip, master_netdev):
  """Change the netmask of the master IP.

  @param old_netmask: the old value of the netmask
  @param netmask: the new value of the netmask
  @param master_ip: the master IP
  @param master_netdev: the master network device

  """
  if old_netmask == netmask:
    return

  if not netutils.IPAddress.Own(master_ip):
    _Fail("The master IP address is not up, not attempting to change its"
          " netmask")

  result = utils.RunCmd([constants.IP_COMMAND_PATH, "address", "add",
                         "%s/%s" % (master_ip, netmask),
                         "dev", master_netdev, "label",
                         "%s:0" % master_netdev])
  if result.failed:
    _Fail("Could not set the new netmask on the master IP address")

  result = utils.RunCmd([constants.IP_COMMAND_PATH, "address", "del",
                         "%s/%s" % (master_ip, old_netmask),
                         "dev", master_netdev, "label",
                         "%s:0" % master_netdev])
  if result.failed:
    _Fail("Could not bring down the master IP address with the old netmask")

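# For reference (illustrative, with a made-up master IP 192.0.2.1 on eth0):
# the two RunCmd calls above are equivalent to running
#
#   ip address add 192.0.2.1/<new netmask> dev eth0 label eth0:0
#   ip address del 192.0.2.1/<old netmask> dev eth0 label eth0:0
#
# i.e. the address is added with the new netmask first, and only then the
# variant with the old netmask is removed.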

def EtcHostsModify(mode, host, ip):
  """Modify a host entry in /etc/hosts.

  @param mode: The mode to operate. Either add or remove entry
  @param host: The host to operate on
  @param ip: The ip associated with the entry

  """
  if mode == constants.ETC_HOSTS_ADD:
    if not ip:
      _Fail("Mode 'add' needs 'ip' parameter, but parameter not present")
    utils.AddHostToEtcHosts(host, ip)
  elif mode == constants.ETC_HOSTS_REMOVE:
    if ip:
      _Fail("Mode 'remove' does not allow 'ip' parameter, but parameter is"
            " present")
    utils.RemoveHostFromEtcHosts(host)
  else:
    _Fail("Mode not supported")

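# Illustrative calls (not part of the original module; host name and address
# are made up):
#
#   EtcHostsModify(constants.ETC_HOSTS_ADD, "node4.example.com", "192.0.2.4")
#   EtcHostsModify(constants.ETC_HOSTS_REMOVE, "node4.example.com", None)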

def LeaveCluster(modify_ssh_setup):
  """Cleans up and removes the current node.

  This function cleans up and prepares the current node to be removed
  from the cluster.

  If processing is successful, then it raises an
  L{errors.QuitGanetiException} which is used as a special case to
  shutdown the node daemon.

  @param modify_ssh_setup: boolean

  """
  _CleanDirectory(pathutils.DATA_DIR)
  _CleanDirectory(pathutils.CRYPTO_KEYS_DIR)
  JobQueuePurge()

  if modify_ssh_setup:
    try:
      priv_key, pub_key, auth_keys = ssh.GetUserFiles(constants.SSH_LOGIN_USER)

      ssh.RemoveAuthorizedKey(auth_keys, utils.ReadFile(pub_key))

      utils.RemoveFile(priv_key)
      utils.RemoveFile(pub_key)
    except errors.OpExecError:
      logging.exception("Error while processing ssh files")
    except IOError:
      logging.exception("At least one SSH file was not accessible.")

  try:
    utils.RemoveFile(pathutils.CONFD_HMAC_KEY)
    utils.RemoveFile(pathutils.RAPI_CERT_FILE)
    utils.RemoveFile(pathutils.SPICE_CERT_FILE)
    utils.RemoveFile(pathutils.SPICE_CACERT_FILE)
    utils.RemoveFile(pathutils.NODED_CERT_FILE)
  except: # pylint: disable=W0702
    logging.exception("Error while removing cluster secrets")

  utils.StopDaemon(constants.CONFD)
  utils.StopDaemon(constants.MOND)
  utils.StopDaemon(constants.KVMD)

  # Raise a custom exception (handled in ganeti-noded)
  raise errors.QuitGanetiException(True, "Shutdown scheduled")


def _CheckStorageParams(params, num_params):
  """Performs sanity checks for storage parameters.

  @type params: list
  @param params: list of storage parameters
  @type num_params: int
  @param num_params: expected number of parameters

  """
  if params is None:
    raise errors.ProgrammerError("No storage parameters for storage"
                                 " reporting is provided.")
  if not isinstance(params, list):
    raise errors.ProgrammerError("The storage parameters are not of type"
                                 " list: '%s'" % params)
  if not len(params) == num_params:
    raise errors.ProgrammerError("Did not receive the expected number of"
                                 " storage parameters: expected %s,"
                                 " received '%s'" % (num_params, len(params)))

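# Illustrative behaviour (not part of the original module):
#
#   _CheckStorageParams([True], 1)   # passes silently
#   _CheckStorageParams([], 1)       # raises errors.ProgrammerError
#   _CheckStorageParams(None, 0)     # raises errors.ProgrammerError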

def _CheckLvmStorageParams(params):
  """Performs sanity check for the 'exclusive storage' flag.

  @see: C{_CheckStorageParams}

  """
  _CheckStorageParams(params, 1)
  excl_stor = params[0]
  if not isinstance(params[0], bool):
    raise errors.ProgrammerError("Exclusive storage parameter is not"
                                 " boolean: '%s'." % excl_stor)
  return excl_stor


def _GetLvmVgSpaceInfo(name, params):
  """Wrapper around C{_GetVgInfo} which checks the storage parameters.

  @type name: string
  @param name: name of the volume group
  @type params: list
  @param params: list of storage parameters, which in this case should
    contain only one, for exclusive storage

  """
  excl_stor = _CheckLvmStorageParams(params)
  return _GetVgInfo(name, excl_stor)


def _GetVgInfo(
    name, excl_stor, info_fn=bdev.LogicalVolume.GetVGInfo):
  """Retrieves information about an LVM volume group.

  """
  # TODO: GetVGInfo supports returning information for multiple VGs at once
  vginfo = info_fn([name], excl_stor)
  if vginfo:
    vg_free = int(round(vginfo[0][0], 0))
    vg_size = int(round(vginfo[0][1], 0))
  else:
    vg_free = None
    vg_size = None

  return {
    "type": constants.ST_LVM_VG,
    "name": name,
    "storage_free": vg_free,
    "storage_size": vg_size,
    }

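# A testability sketch (illustrative only): the injected C{info_fn} makes the
# function easy to exercise without a real volume group, e.g.
#
#   _GetVgInfo("xenvg", True, info_fn=lambda vgs, excl: [(1024.0, 4096.0)])
#
# returns {"type": constants.ST_LVM_VG, "name": "xenvg",
#          "storage_free": 1024, "storage_size": 4096}.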

def _GetLvmPvSpaceInfo(name, params):
  """Wrapper around C{_GetVgSpindlesInfo} with sanity checks.

  @see: C{_GetLvmVgSpaceInfo}

  """
  excl_stor = _CheckLvmStorageParams(params)
  return _GetVgSpindlesInfo(name, excl_stor)


def _GetVgSpindlesInfo(
    name, excl_stor, info_fn=bdev.LogicalVolume.GetVgSpindlesInfo):
  """Retrieves information about spindles in an LVM volume group.

  @type name: string
  @param name: VG name
  @type excl_stor: bool
  @param excl_stor: exclusive storage
  @rtype: dict
  @return: dictionary whose keys are "name", "vg_free", "vg_size" for VG name,
    free spindles, total spindles respectively

  """
  if excl_stor:
    (vg_free, vg_size) = info_fn(name)
  else:
    vg_free = 0
    vg_size = 0
  return {
    "type": constants.ST_LVM_PV,
    "name": name,
    "storage_free": vg_free,
    "storage_size": vg_size,
    }


def _GetHvInfo(name, hvparams, get_hv_fn=hypervisor.GetHypervisor):
  """Retrieves node information from a hypervisor.

  The information returned depends on the hypervisor. Common items:

    - vg_size is the size of the configured volume group in MiB
    - vg_free is the free size of the volume group in MiB
    - memory_dom0 is the memory allocated for domain0 in MiB
    - memory_free is the currently available (free) ram in MiB
    - memory_total is the total amount of ram in MiB
    - hv_version: the hypervisor version, if available

  @type hvparams: dict of string
  @param hvparams: the hypervisor's hvparams

  """
  return get_hv_fn(name).GetNodeInfo(hvparams=hvparams)


def _GetHvInfoAll(hv_specs, get_hv_fn=hypervisor.GetHypervisor):
  """Retrieves node information for all hypervisors.

  See C{_GetHvInfo} for information on the output.

  @type hv_specs: list of pairs (string, dict of strings)
  @param hv_specs: list of pairs of a hypervisor's name and its hvparams

  """
  if hv_specs is None:
    return None

  result = []
  for hvname, hvparams in hv_specs:
    result.append(_GetHvInfo(hvname, hvparams, get_hv_fn))
  return result


def _GetNamedNodeInfo(names, fn):
  """Calls C{fn} for all names in C{names} and returns a list of the results.

  @rtype: None or list

  """
  if names is None:
    return None
  else:
    return map(fn, names)


def GetNodeInfo(storage_units, hv_specs):
  """Gives back a hash with different information about the node.

  @type storage_units: list of tuples (string, string, list)
  @param storage_units: List of tuples (storage unit, identifier, parameters)
    to ask for disk space information. In case of lvm-vg, the identifier is
    the VG name. The parameters can contain additional, storage-type-specific
    parameters, for example exclusive storage for lvm storage.
  @type hv_specs: list of pairs (string, dict of strings)
  @param hv_specs: list of pairs of a hypervisor's name and its hvparams
  @rtype: tuple; (string, None/list, None/list)
  @return: Tuple containing boot ID, storage unit information and hypervisor
    information

  """
  bootid = utils.ReadFile(_BOOT_ID_PATH, size=128).rstrip("\n")
  storage_info = _GetNamedNodeInfo(
    storage_units,
    (lambda (storage_type, storage_key, storage_params):
     _ApplyStorageInfoFunction(storage_type, storage_key, storage_params)))
  hv_info = _GetHvInfoAll(hv_specs)
  return (bootid, storage_info, hv_info)

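# Illustrative request (not part of the original module; VG name and path are
# made up, and real hvparams would normally be non-empty):
#
#   GetNodeInfo([(constants.ST_LVM_VG, "xenvg", [True]),
#                (constants.ST_FILE, "/srv/ganeti/file-storage", [])],
#               [(constants.HT_KVM, {})])
#
# returns (boot_id, [vg_info_dict, file_info_dict], [kvm_node_info_dict]).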

def _GetFileStorageSpaceInfo(path, params):
  """Wrapper around filestorage.GetSpaceInfo.

  The purpose of this wrapper is to call filestorage.GetFileStorageSpaceInfo
  and ignore the *args parameter to not leak it into the filestorage
  module's code.

  @see: C{filestorage.GetFileStorageSpaceInfo} for description of the
    parameters.

  """
  _CheckStorageParams(params, 0)
  return filestorage.GetFileStorageSpaceInfo(path)


# FIXME: implement storage reporting for all missing storage types.
_STORAGE_TYPE_INFO_FN = {
  constants.ST_BLOCK: None,
  constants.ST_DISKLESS: None,
  constants.ST_EXT: None,
  constants.ST_FILE: _GetFileStorageSpaceInfo,
  constants.ST_LVM_PV: _GetLvmPvSpaceInfo,
  constants.ST_LVM_VG: _GetLvmVgSpaceInfo,
  constants.ST_SHARED_FILE: None,
  constants.ST_GLUSTER: None,
  constants.ST_RADOS: None,
  }


def _ApplyStorageInfoFunction(storage_type, storage_key, *args):
  """Looks up and applies the correct function to calculate free and total
  storage for the given storage type.

  @type storage_type: string
  @param storage_type: the storage type for which the storage shall be
    reported.
  @type storage_key: string
  @param storage_key: identifier of a storage unit, e.g. the volume group name
    of an LVM storage unit
  @type args: any
  @param args: various parameters that can be used for storage reporting. These
    parameters and their semantics vary from storage type to storage type and
    are just propagated in this function.
  @return: the results of the application of the storage space function (see
    _STORAGE_TYPE_INFO_FN) if storage space reporting is implemented for that
    storage type
  @raises NotImplementedError: for storage types that don't support space
    reporting yet

  """
  fn = _STORAGE_TYPE_INFO_FN[storage_type]
  if fn is not None:
    return fn(storage_key, *args)
  else:
    raise NotImplementedError

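# Illustrative dispatch (not part of the original module):
#
#   _ApplyStorageInfoFunction(constants.ST_LVM_VG, "xenvg", [True])
#     -> _GetLvmVgSpaceInfo("xenvg", [True])
#   _ApplyStorageInfoFunction(constants.ST_BLOCK, "b", [])
#     -> raises NotImplementedError (no reporting function registered)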

def _CheckExclusivePvs(pvi_list):
  """Check that PVs are not shared among LVs.

  @type pvi_list: list of L{objects.LvmPvInfo} objects
  @param pvi_list: information about the PVs

  @rtype: list of tuples (string, list of strings)
  @return: offending volumes, as tuples: (pv_name, [lv1_name, lv2_name...])

  """
  res = []
  for pvi in pvi_list:
    if len(pvi.lv_list) > 1:
      res.append((pvi.name, pvi.lv_list))
  return res


def _VerifyHypervisors(what, vm_capable, result, all_hvparams,
                       get_hv_fn=hypervisor.GetHypervisor):
  """Verifies the hypervisor. Appends the results to the 'result' dict.

  @type what: C{dict}
  @param what: a dictionary of things to check
  @type vm_capable: boolean
  @param vm_capable: whether or not this node is vm capable
  @type result: dict
  @param result: dictionary of verification results; results of the
    verifications in this function will be added here
  @type all_hvparams: dict of dict of string
  @param all_hvparams: dictionary mapping hypervisor names to hvparams
  @type get_hv_fn: function
  @param get_hv_fn: function to retrieve the hypervisor, to improve testability

  """
  if not vm_capable:
    return

  if constants.NV_HYPERVISOR in what:
    result[constants.NV_HYPERVISOR] = {}
    for hv_name in what[constants.NV_HYPERVISOR]:
      hvparams = all_hvparams[hv_name]
      try:
        val = get_hv_fn(hv_name).Verify(hvparams=hvparams)
      except errors.HypervisorError, err:
        val = "Error while checking hypervisor: %s" % str(err)
      result[constants.NV_HYPERVISOR][hv_name] = val


def _VerifyHvparams(what, vm_capable, result,
                    get_hv_fn=hypervisor.GetHypervisor):
  """Verifies the hvparams. Appends the results to the 'result' dict.

  @type what: C{dict}
  @param what: a dictionary of things to check
  @type vm_capable: boolean
  @param vm_capable: whether or not this node is vm capable
  @type result: dict
  @param result: dictionary of verification results; results of the
    verifications in this function will be added here
  @type get_hv_fn: function
  @param get_hv_fn: function to retrieve the hypervisor, to improve testability

  """
  if not vm_capable:
    return

  if constants.NV_HVPARAMS in what:
    result[constants.NV_HVPARAMS] = []
    for source, hv_name, hvparms in what[constants.NV_HVPARAMS]:
      try:
        logging.info("Validating hv %s, %s", hv_name, hvparms)
        get_hv_fn(hv_name).ValidateParameters(hvparms)
      except errors.HypervisorError, err:
        result[constants.NV_HVPARAMS].append((source, hv_name, str(err)))


def _VerifyInstanceList(what, vm_capable, result, all_hvparams):
  """Verifies the instance list.

  @type what: C{dict}
  @param what: a dictionary of things to check
  @type vm_capable: boolean
  @param vm_capable: whether or not this node is vm capable
  @type result: dict
  @param result: dictionary of verification results; results of the
    verifications in this function will be added here
  @type all_hvparams: dict of dict of string
  @param all_hvparams: dictionary mapping hypervisor names to hvparams

  """
  if constants.NV_INSTANCELIST in what and vm_capable:
    # GetInstanceList can fail
    try:
      val = GetInstanceList(what[constants.NV_INSTANCELIST],
                            all_hvparams=all_hvparams)
    except RPCFail, err:
      val = str(err)
    result[constants.NV_INSTANCELIST] = val


def _VerifyNodeInfo(what, vm_capable, result, all_hvparams):
  """Verifies the node info.

  @type what: C{dict}
  @param what: a dictionary of things to check
  @type vm_capable: boolean
  @param vm_capable: whether or not this node is vm capable
  @type result: dict
  @param result: dictionary of verification results; results of the
    verifications in this function will be added here
  @type all_hvparams: dict of dict of string
  @param all_hvparams: dictionary mapping hypervisor names to hvparams

  """
  if constants.NV_HVINFO in what and vm_capable:
    hvname = what[constants.NV_HVINFO]
    hyper = hypervisor.GetHypervisor(hvname)
    hvparams = all_hvparams[hvname]
    result[constants.NV_HVINFO] = hyper.GetNodeInfo(hvparams=hvparams)


def _VerifyClientCertificate(cert_file=pathutils.NODED_CLIENT_CERT_FILE):
  """Verify the existence and validity of the client SSL certificate.

  Also, verify that the client certificate is not self-signed. Self-signed
  client certificates stem from Ganeti versions 2.12.0 - 2.12.4 and should
  be replaced by client certificates signed by the server certificate. Hence
  we output a warning when we encounter a self-signed one.

  """
  create_cert_cmd = "gnt-cluster renew-crypto --new-node-certificates"
  if not os.path.exists(cert_file):
    return (constants.CV_ERROR,
            "The client certificate does not exist. Run '%s' to create"
            " client certificates for all nodes." % create_cert_cmd)

  (errcode, msg) = utils.VerifyCertificate(cert_file)
  if errcode is not None:
    return (errcode, msg)

  (errcode, msg) = utils.IsCertificateSelfSigned(cert_file)
  if errcode is not None:
    return (errcode, msg)

  # if everything is fine, we return the digest to be compared to the config
  return (None, utils.GetCertificateDigest(cert_filename=cert_file))

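# Return shape (illustrative): (None, "<certificate digest>") when the
# certificate verifies cleanly, or (<a constants.CV_* error code>, "<message>")
# when it is missing, invalid or self-signed.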

def _VerifySshSetup(node_status_list, my_name, ssh_key_type,
                    ganeti_pub_keys_file=pathutils.SSH_PUB_KEYS):
  """Verifies the state of the SSH key files.

  @type node_status_list: list of tuples
  @param node_status_list: list of nodes of the cluster associated with a
    couple of flags: (uuid, name, is_master_candidate,
    is_potential_master_candidate, online)
  @type my_name: str
  @param my_name: name of this node
  @type ssh_key_type: one of L{constants.SSHK_ALL}
  @param ssh_key_type: type of key used on nodes
  @type ganeti_pub_keys_file: str
  @param ganeti_pub_keys_file: filename of the public keys file

  """
  if node_status_list is None:
    return ["No node list to check against the pub_key_file received."]

  my_status_list = [(my_uuid, name, mc, pot_mc, online) for
                    (my_uuid, name, mc, pot_mc, online)
                    in node_status_list if name == my_name]
  if len(my_status_list) == 0:
    return ["Cannot find node information for node '%s'." % my_name]
  (my_uuid, _, _, potential_master_candidate, online) = \
    my_status_list[0]

  result = []

  if not os.path.exists(ganeti_pub_keys_file):
    result.append("The public key file '%s' does not exist. Consider running"
                  " 'gnt-cluster renew-crypto --new-ssh-keys"
                  " [--no-ssh-key-check]' to fix this." % ganeti_pub_keys_file)
    return result

  pot_mc_uuids = [uuid for (uuid, _, _, _, _) in node_status_list]
  offline_nodes = [uuid for (uuid, _, _, _, online) in node_status_list
                   if not online]
  pub_keys = ssh.QueryPubKeyFile(None, key_file=ganeti_pub_keys_file)

  # initialized here and not in the branch below, because it is also used
  # by the authorized_keys check after that branch
  missing_uuids = set([])

  if potential_master_candidate:
    # Check that the set of potential master candidates matches the
    # public key file
    pub_uuids_set = set(pub_keys.keys()) - set(offline_nodes)
    pot_mc_uuids_set = set(pot_mc_uuids) - set(offline_nodes)
    if pub_uuids_set != pot_mc_uuids_set:
      unknown_uuids = pub_uuids_set - pot_mc_uuids_set
      pub_key_path = "%s:%s" % (my_name, ganeti_pub_keys_file)
      if unknown_uuids:
        result.append("The following node UUIDs are listed in the shared"
                      " public keys file %s, but are not potential master"
                      " candidates: %s." %
                      (pub_key_path, ", ".join(list(unknown_uuids))))
      missing_uuids = pot_mc_uuids_set - pub_uuids_set
      if missing_uuids:
        result.append("The following node UUIDs of potential master candidates"
                      " are missing in the shared public keys file %s: %s." %
                      (pub_key_path, ", ".join(list(missing_uuids))))

    (_, key_files) = \
      ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
    (_, node_pub_key_file) = key_files[ssh_key_type]

    my_keys = pub_keys[my_uuid]

    node_pub_key = utils.ReadFile(node_pub_key_file)
    node_pub_key_path = "%s:%s" % (my_name, node_pub_key_file)
    if node_pub_key.strip() not in my_keys:
      result.append("The key for node %s in the cluster config does not match"
                    " this node's key in the node public key file %s." %
                    (my_name, node_pub_key_path))
    if len(my_keys) != 1:
      result.append("There is more than one key for node %s in the node public"
                    " key file %s." % (my_name, node_pub_key_path))
  else:
    if len(pub_keys.keys()) > 0:
      result.append("The public key file %s is not empty, although"
                    " the node is not a potential master candidate." %
                    ganeti_pub_keys_file)

  # Check that all master candidate keys are in the authorized_keys file
  (auth_key_file, _) = \
    ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
  for (uuid, name, mc, _, online) in node_status_list:
    if not online:
      continue
    if uuid in missing_uuids:
      continue
    if mc:
      for key in pub_keys[uuid]:
        if not ssh.HasAuthorizedKey(auth_key_file, key):
          result.append("A SSH key of master candidate '%s' (UUID: '%s') is"
                        " not in the 'authorized_keys' file of node '%s'." %
                        (name, uuid, my_name))
    else:
      for key in pub_keys[uuid]:
        if name != my_name and ssh.HasAuthorizedKey(auth_key_file, key):
          result.append("A SSH key of normal node '%s' (UUID: '%s') is in the"
                        " 'authorized_keys' file of node '%s'." %
                        (name, uuid, my_name))
        if name == my_name and not ssh.HasAuthorizedKey(auth_key_file, key):
          result.append("A SSH key of normal node '%s' (UUID: '%s') is not"
                        " in the 'authorized_keys' file of itself." %
                        (my_name, uuid))

  return result


def _VerifySshClutter(node_status_list, my_name):
  """Verifies that the 'authorized_keys' files are not cluttered up.

  @type node_status_list: list of tuples
  @param node_status_list: list of nodes of the cluster associated with a
    couple of flags: (uuid, name, is_master_candidate,
    is_potential_master_candidate, online)
  @type my_name: str
  @param my_name: name of this node

  """
  result = []
  (auth_key_file, _) = \
    ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
  node_names = [name for (_, name, _, _, _) in node_status_list]
  multiple_occurrences = ssh.CheckForMultipleKeys(auth_key_file, node_names)
  if multiple_occurrences:
    msg = "There are hosts which have more than one SSH key stored for the" \
          " same user in the 'authorized_keys' file of node %s. This can be" \
          " due to an unsuccessful operation which cluttered up the" \
          " 'authorized_keys' file. We recommend to clean this up manually. " \
          % my_name
    for host, occ in multiple_occurrences.items():
      msg += "Entry for '%s' in lines %s. " % (host, utils.CommaJoin(occ))
    result.append(msg)

  return result


def VerifyNodeNetTest(my_name, test_config):
  """Verify nodes are reachable.

  @type my_name: string
  @param my_name: name of the node this test is running on

  @type test_config: tuple (node_list, master_candidate_list)
  @param test_config: configuration for test as passed from
    LUClusterVerify() in what[constants.NV_NODENETTEST]

  @rtype: dict
  @return: a dictionary with node names as keys and error messages
    as values
  """
  result = {}
  nodes, master_candidates = test_config
  port = netutils.GetDaemonPort(constants.NODED)

  if my_name not in master_candidates:
    return result

  my_pip, my_sip = next(
    ((pip, sip) for name, pip, sip in nodes if name == my_name),
    (None, None)
  )
  if not my_pip:
    result[my_name] = ("Can't find my own primary/secondary IP"
                       " in the node list")
    return result

  for name, pip, sip in nodes:
    fail = []
    if not netutils.TcpPing(pip, port, source=my_pip):
      fail.append("primary")
    if sip != pip:
      if not netutils.TcpPing(sip, port, source=my_sip):
        fail.append("secondary")
    if fail:
      result[name] = ("failure using the %s interface(s)" %
                      " and ".join(fail))
  return result


def VerifyMasterIP(my_name, test_config):
  """Verify master IP is reachable.

  @type my_name: string
  @param my_name: name of the node this test is running on

  @type test_config: tuple (master_name, master_ip, master_candidates)
  @param test_config: configuration for test as passed from
    LUClusterVerify() in what[constants.NV_MASTERIP]

  @rtype: bool or None
  @return: Boolean test result, None if skipped
  """
  master_name, master_ip, master_candidates = test_config
  port = netutils.GetDaemonPort(constants.NODED)
  if my_name not in master_candidates:
    return None

  if master_name == my_name:
    source = constants.IP4_ADDRESS_LOCALHOST
  else:
    source = None
  return netutils.TcpPing(master_ip, port, source=source)


def VerifyNode(what, cluster_name, all_hvparams):
  """Verify the status of the local node.

  Based on the input L{what} parameter, various checks are done on the
  local node.

  If the I{filelist} key is present, this list of
  files is checksummed and the file/checksum pairs are returned.

  If the I{nodelist} key is present, we check that we have
  connectivity via ssh with the target nodes (and check the hostname
  report).

  If the I{node-net-test} key is present, we check that we have
  connectivity to the given nodes via both primary IP and, if
  applicable, secondary IPs.

  @type what: C{dict}
  @param what: a dictionary of things to check:
      - filelist: list of files for which to compute checksums
      - nodelist: list of nodes we should check ssh communication with
      - node-net-test: list of nodes we should check node daemon port
        connectivity with
      - hypervisor: list with hypervisors to run the verify for
  @type cluster_name: string
  @param cluster_name: the cluster's name
  @type all_hvparams: dict of dict of strings
  @param all_hvparams: a dictionary mapping hypervisor names to hvparams
  @rtype: dict
  @return: a dictionary with the same keys as the input dict, and
    values representing the result of the checks

  """
  result = {}
  my_name = netutils.Hostname.GetSysName()
  vm_capable = my_name not in what.get(constants.NV_NONVMNODES, [])

  _VerifyHypervisors(what, vm_capable, result, all_hvparams)
  _VerifyHvparams(what, vm_capable, result)

  if constants.NV_FILELIST in what:
    fingerprints = utils.FingerprintFiles(map(vcluster.LocalizeVirtualPath,
                                              what[constants.NV_FILELIST]))
    result[constants.NV_FILELIST] = \
      dict((vcluster.MakeVirtualPath(key), value)
           for (key, value) in fingerprints.items())

  if constants.NV_CLIENT_CERT in what:
    result[constants.NV_CLIENT_CERT] = _VerifyClientCertificate()

  if constants.NV_SSH_SETUP in what:
    node_status_list, key_type = what[constants.NV_SSH_SETUP]
    result[constants.NV_SSH_SETUP] = \
      _VerifySshSetup(node_status_list, my_name, key_type)
    if constants.NV_SSH_CLUTTER in what:
      result[constants.NV_SSH_CLUTTER] = \
        _VerifySshClutter(node_status_list, my_name)

  if constants.NV_NODELIST in what:
    (nodes, bynode, mcs) = what[constants.NV_NODELIST]

    # Add nodes from other groups (different for each node)
    try:
      nodes.extend(bynode[my_name])
    except KeyError:
      pass

    # Use a random order
    random.shuffle(nodes)

    # Try to contact all nodes
    val = {}
    ssh_port_map = ssconf.SimpleStore().GetSshPortMap()
    for node in nodes:
      # We only test if master candidates can communicate to other nodes.
      # We cannot test if normal nodes cannot communicate with other nodes,
      # because the administrator might have installed additional SSH keys,
      # over which Ganeti has no power.
      if my_name in mcs:
        success, message = _GetSshRunner(cluster_name). \
                              VerifyNodeHostname(node, ssh_port_map[node])
        if not success:
          val[node] = message

    result[constants.NV_NODELIST] = val

  if constants.NV_NODENETTEST in what:
    result[constants.NV_NODENETTEST] = VerifyNodeNetTest(
      my_name, what[constants.NV_NODENETTEST])

  if constants.NV_MASTERIP in what:
    result[constants.NV_MASTERIP] = VerifyMasterIP(
      my_name, what[constants.NV_MASTERIP])

  if constants.NV_USERSCRIPTS in what:
    result[constants.NV_USERSCRIPTS] = \
      [script for script in what[constants.NV_USERSCRIPTS]
       if not utils.IsExecutable(script)]

  if constants.NV_OOB_PATHS in what:
    result[constants.NV_OOB_PATHS] = tmp = []
    for path in what[constants.NV_OOB_PATHS]:
      try:
        st = os.stat(path)
      except OSError, err:
        tmp.append("error stating out of band helper: %s" % err)
      else:
        if stat.S_ISREG(st.st_mode):
          if stat.S_IMODE(st.st_mode) & stat.S_IXUSR:
            tmp.append(None)
          else:
            tmp.append("out of band helper %s is not executable" % path)
        else:
          tmp.append("out of band helper %s is not a file" % path)

  if constants.NV_LVLIST in what and vm_capable:
    try:
      val = GetVolumeList(utils.ListVolumeGroups().keys())
    except RPCFail, err:
      val = str(err)
    result[constants.NV_LVLIST] = val

  _VerifyInstanceList(what, vm_capable, result, all_hvparams)

  if constants.NV_VGLIST in what and vm_capable:
    result[constants.NV_VGLIST] = utils.ListVolumeGroups()

  if constants.NV_PVLIST in what and vm_capable:
    check_exclusive_pvs = constants.NV_EXCLUSIVEPVS in what
    val = bdev.LogicalVolume.GetPVInfo(what[constants.NV_PVLIST],
                                       filter_allocatable=False,
                                       include_lvs=check_exclusive_pvs)
    if check_exclusive_pvs:
      result[constants.NV_EXCLUSIVEPVS] = _CheckExclusivePvs(val)
      for pvi in val:
        # Avoid sending useless data on the wire
        pvi.lv_list = []
    result[constants.NV_PVLIST] = map(objects.LvmPvInfo.ToDict, val)

  if constants.NV_VERSION in what:
    result[constants.NV_VERSION] = (constants.PROTOCOL_VERSION,
                                    constants.RELEASE_VERSION)

  _VerifyNodeInfo(what, vm_capable, result, all_hvparams)

  if constants.NV_DRBDVERSION in what and vm_capable:
    try:
      drbd_version = DRBD8.GetProcInfo().GetVersionString()
    except errors.BlockDeviceError, err:
      logging.warning("Can't get DRBD version", exc_info=True)
      drbd_version = str(err)
    result[constants.NV_DRBDVERSION] = drbd_version

  if constants.NV_DRBDLIST in what and vm_capable:
    try:
      used_minors = drbd.DRBD8.GetUsedDevs()
    except errors.BlockDeviceError, err:
      logging.warning("Can't get used minors list", exc_info=True)
      used_minors = str(err)
    result[constants.NV_DRBDLIST] = used_minors

  if constants.NV_DRBDHELPER in what and vm_capable:
    status = True
    try:
      payload = drbd.DRBD8.GetUsermodeHelper()
    except errors.BlockDeviceError, err:
      logging.error("Can't get DRBD usermode helper: %s", str(err))
      status = False
      payload = str(err)
    result[constants.NV_DRBDHELPER] = (status, payload)

  if constants.NV_NODESETUP in what:
    result[constants.NV_NODESETUP] = tmpr = []
    if not os.path.isdir("/sys/block") or not os.path.isdir("/sys/class/net"):
      tmpr.append("The sysfs filesystem doesn't seem to be mounted"
                  " under /sys, missing required directories /sys/block"
                  " and /sys/class/net")
    if (not os.path.isdir("/proc/sys") or
        not os.path.isfile("/proc/sysrq-trigger")):
      tmpr.append("The procfs filesystem doesn't seem to be mounted"
                  " under /proc, missing required directory /proc/sys and"
                  " the file /proc/sysrq-trigger")

  if constants.NV_TIME in what:
    result[constants.NV_TIME] = utils.SplitTime(time.time())

  if constants.NV_OSLIST in what and vm_capable:
    result[constants.NV_OSLIST] = DiagnoseOS()

  if constants.NV_BRIDGES in what and vm_capable:
    result[constants.NV_BRIDGES] = [bridge
                                    for bridge in what[constants.NV_BRIDGES]
                                    if not utils.BridgeExists(bridge)]

  if what.get(constants.NV_ACCEPTED_STORAGE_PATHS) == my_name:
    result[constants.NV_ACCEPTED_STORAGE_PATHS] = \
      filestorage.ComputeWrongFileStoragePaths()

  if what.get(constants.NV_FILE_STORAGE_PATH):
    pathresult = filestorage.CheckFileStoragePath(
      what[constants.NV_FILE_STORAGE_PATH])
    if pathresult:
      result[constants.NV_FILE_STORAGE_PATH] = pathresult

  if what.get(constants.NV_SHARED_FILE_STORAGE_PATH):
    pathresult = filestorage.CheckFileStoragePath(
      what[constants.NV_SHARED_FILE_STORAGE_PATH])
    if pathresult:
      result[constants.NV_SHARED_FILE_STORAGE_PATH] = pathresult

  return result


def GetCryptoTokens(token_requests):
  """Perform actions on the node's cryptographic tokens.

  Token types can be 'ssl' or 'ssh'. So far only some actions are implemented
  for 'ssl'. Action 'get' returns the digest of the public client ssl
  certificate. Action 'create' creates a new client certificate and private key
  and also returns the digest of the certificate. The third parameter of a
  token request are optional parameters for the actions, so far only the
  filename is supported.

  @type token_requests: list of tuples of (string, string, dict), where the
    first string is in constants.CRYPTO_TYPES, the second in
    constants.CRYPTO_ACTIONS. The third parameter is a dictionary of string
    to string.
  @param token_requests: list of requests of cryptographic tokens and actions
    to perform on them. The actions come with a dictionary of options.
  @rtype: list of tuples (string, string)
  @return: list of tuples of the token type and the public crypto token

  """
  tokens = []
  for (token_type, action, _) in token_requests:
    if token_type not in constants.CRYPTO_TYPES:
      raise errors.ProgrammerError("Token type '%s' not supported." %
                                   token_type)
    if action not in constants.CRYPTO_ACTIONS:
      raise errors.ProgrammerError("Action '%s' is not supported." %
                                   action)
    if token_type == constants.CRYPTO_TYPE_SSL_DIGEST:
      tokens.append((token_type,
                     utils.GetCertificateDigest()))
  return tokens

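# Illustrative request (assuming a constants.CRYPTO_ACTION_GET constant from
# constants.CRYPTO_ACTIONS; not part of the original module):
#
#   GetCryptoTokens([(constants.CRYPTO_TYPE_SSL_DIGEST,
#                     constants.CRYPTO_ACTION_GET, {})])
#
# returns [(constants.CRYPTO_TYPE_SSL_DIGEST, "<digest>")].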

def EnsureDaemon(daemon_name, run):
  """Ensures the given daemon is running or stopped.

  @type daemon_name: string
  @param daemon_name: name of the daemon (e.g., constants.KVMD)

  @type run: bool
  @param run: whether to start or stop the daemon

  @rtype: bool
  @return: 'True' if daemon successfully started/stopped,
    'False' otherwise

  """
  allowed_daemons = [constants.KVMD]

  if daemon_name not in allowed_daemons:
    fn = lambda _: False
  elif run:
    fn = utils.EnsureDaemon
  else:
    fn = utils.StopDaemon

  return fn(daemon_name)

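# Illustrative calls (not part of the original module):
#
#   EnsureDaemon(constants.KVMD, True)    # start (or keep running) kvmd
#   EnsureDaemon(constants.KVMD, False)   # stop kvmd
#   EnsureDaemon(constants.NODED, True)   # -> False, daemon not in allow-list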

def _InitSshUpdateData(data, noded_cert_file, ssconf_store):
  (_, noded_cert) = \
    utils.ExtractX509Certificate(utils.ReadFile(noded_cert_file))
  data[constants.SSHS_NODE_DAEMON_CERTIFICATE] = noded_cert

  cluster_name = ssconf_store.GetClusterName()
  data[constants.SSHS_CLUSTER_NAME] = cluster_name


def AddNodeSshKey(node_uuid, node_name,
                  potential_master_candidates,
                  to_authorized_keys=False,
                  to_public_keys=False,
                  get_public_keys=False,
                  pub_key_file=pathutils.SSH_PUB_KEYS,
                  ssconf_store=None,
                  noded_cert_file=pathutils.NODED_CERT_FILE,
                  run_cmd_fn=ssh.RunSshCmdWithStdin):
  """Distributes a node's public SSH key across the cluster.

  Note that this function should only be executed on the master node, which
  then will copy the new node's key to all nodes in the cluster via SSH.

  Also note: at least one of the flags C{to_authorized_keys},
  C{to_public_keys}, and C{get_public_keys} has to be set to C{True} for
  the function to actually perform any actions.

  @type node_uuid: str
  @param node_uuid: the UUID of the node whose key is added
  @type node_name: str
  @param node_name: the name of the node whose key is added
  @type potential_master_candidates: list of str
  @param potential_master_candidates: list of node names of potential master
    candidates; this should match the list of uuids in the public key file
  @type to_authorized_keys: boolean
  @param to_authorized_keys: whether the key should be added to the
    C{authorized_keys} file of all nodes
  @type to_public_keys: boolean
  @param to_public_keys: whether the keys should be added to the public key
    file
  @type get_public_keys: boolean
  @param get_public_keys: whether the node should add the clusters' public
    keys to its C{ganeti_pub_keys} file

  """
  node_list = [SshAddNodeInfo(name=node_name, uuid=node_uuid,
                              to_authorized_keys=to_authorized_keys,
                              to_public_keys=to_public_keys,
                              get_public_keys=get_public_keys)]
  return AddNodeSshKeyBulk(node_list,
                           potential_master_candidates,
                           pub_key_file=pub_key_file,
                           ssconf_store=ssconf_store,
                           noded_cert_file=noded_cert_file,
                           run_cmd_fn=run_cmd_fn)
1507
1508
1509 # Node info named tuple specifically for the use with AddNodeSshKeyBulk
1510 SshAddNodeInfo = collections.namedtuple(
1511 "SshAddNodeInfo",
1512 ["uuid",
1513 "name",
1514 "to_authorized_keys",
1515 "to_public_keys",
1516 "get_public_keys"])
1517
1518
1519 def AddNodeSshKeyBulk(node_list,
1520 potential_master_candidates,
1521 pub_key_file=pathutils.SSH_PUB_KEYS,
1522 ssconf_store=None,
1523 noded_cert_file=pathutils.NODED_CERT_FILE,
1524 run_cmd_fn=ssh.RunSshCmdWithStdin):
1525 """Distributes a node's public SSH key across the cluster.
1526
1527 Note that this function should only be executed on the master node, which
1528 then will copy the new node's key to all nodes in the cluster via SSH.
1529
1530 Also note: at least one of the flags C{to_authorized_keys},
1531 C{to_public_keys}, and C{get_public_keys} has to be set to C{True} for
1532 the function to actually perform any actions.
1533
1534 @type node_list: list of SshAddNodeInfo tuples
1535 @param node_list: list of tuples containing the necessary node information for
1536 adding their keys
1537 @type potential_master_candidates: list of str
1538 @param potential_master_candidates: list of node names of potential master
1539 candidates; this should match the list of uuids in the public key file
1540
1541 """
1542 # whether there are any keys to be added or retrieved at all
1543 to_authorized_keys = any([node_info.to_authorized_keys for node_info in
1544 node_list])
1545 to_public_keys = any([node_info.to_public_keys for node_info in
1546 node_list])
1547
1548 if not ssconf_store:
1549 ssconf_store = ssconf.SimpleStore()
1550
1551 for node_info in node_list:
1552 # replacement not necessary for keys that are not supposed to be in the
1553 # list of public keys
1554 if not node_info.to_public_keys:
1555 continue
1556 # Check and fix sanity of key file
1557 keys_by_name = ssh.QueryPubKeyFile([node_info.name], key_file=pub_key_file)
1558 keys_by_uuid = ssh.QueryPubKeyFile([node_info.uuid], key_file=pub_key_file)
1559
1560 if (not keys_by_name or node_info.name not in keys_by_name) \
1561 and (not keys_by_uuid or node_info.uuid not in keys_by_uuid):
1562 raise errors.SshUpdateError(
1563 "No keys found for the new node '%s' (UUID %s) in the list of public"
1564 " SSH keys, neither for the name or the UUID" %
1565 (node_info.name, node_info.uuid))
1566 else:
1567 if node_info.name in keys_by_name:
1568 # Replace the name by UUID in the file as the name should only be used
1569 # temporarily
1570 ssh.ReplaceNameByUuid(node_info.uuid, node_info.name,
1571 error_fn=errors.SshUpdateError,
1572 key_file=pub_key_file)
1573
1574 # Retrieve updated map of UUIDs to keys
1575 keys_by_uuid = ssh.QueryPubKeyFile(
1576 [node_info.uuid for node_info in node_list], key_file=pub_key_file)
1577
1578 # Update the master node's key files
1579 (auth_key_file, _) = \
1580 ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
1581 for node_info in node_list:
1582 if node_info.to_authorized_keys:
1583 ssh.AddAuthorizedKeys(auth_key_file, keys_by_uuid[node_info.uuid])
1584
1585 base_data = {}
1586 _InitSshUpdateData(base_data, noded_cert_file, ssconf_store)
1587 cluster_name = base_data[constants.SSHS_CLUSTER_NAME]
1588
1589 ssh_port_map = ssconf_store.GetSshPortMap()
1590
1591 # Update the target nodes themselves
1592 for node_info in node_list:
1593 logging.debug("Updating SSH key files of target node '%s'.", node_info.name)
1594 if node_info.get_public_keys:
1595 node_data = {}
1596 _InitSshUpdateData(node_data, noded_cert_file, ssconf_store)
1597 all_keys = ssh.QueryPubKeyFile(None, key_file=pub_key_file)
1598 node_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1599 (constants.SSHS_OVERRIDE, all_keys)
1600
1601 try:
1602 utils.RetryByNumberOfTimes(
1603 constants.SSHS_MAX_RETRIES,
1604 errors.SshUpdateError,
1605 run_cmd_fn, cluster_name, node_info.name, pathutils.SSH_UPDATE,
1606 ssh_port_map.get(node_info.name), node_data,
1607 debug=False, verbose=False, use_cluster_key=False,
1608 ask_key=False, strict_host_check=False)
1609 except errors.SshUpdateError as e:
1610 # Clean up the master's public key file if adding key fails
1611 if node_info.to_public_keys:
1612 ssh.RemovePublicKey(node_info.uuid)
1613 raise e
1614
1615 # Update all nodes except master and the target nodes
1616 keys_by_uuid_auth = ssh.QueryPubKeyFile(
1617 [node_info.uuid for node_info in node_list
1618 if node_info.to_authorized_keys],
1619 key_file=pub_key_file)
1620 if to_authorized_keys:
1621 base_data[constants.SSHS_SSH_AUTHORIZED_KEYS] = \
1622 (constants.SSHS_ADD, keys_by_uuid_auth)
1623
1624 pot_mc_data = base_data.copy()
1625 keys_by_uuid_pub = ssh.QueryPubKeyFile(
1626 [node_info.uuid for node_info in node_list
1627 if node_info.to_public_keys],
1628 key_file=pub_key_file)
1629 if to_public_keys:
1630 pot_mc_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1631 (constants.SSHS_REPLACE_OR_ADD, keys_by_uuid_pub)
1632
1633 all_nodes = ssconf_store.GetNodeList()
1634 master_node = ssconf_store.GetMasterNode()
1635 online_nodes = ssconf_store.GetOnlineNodeList()
1636
1637 node_errors = []
1638 for node in all_nodes:
1639 if node == master_node:
1640 logging.debug("Skipping master node '%s'.", master_node)
1641 continue
1642 if node not in online_nodes:
1643 logging.debug("Skipping offline node '%s'.", node)
1644 continue
1645 if node in potential_master_candidates:
1646 logging.debug("Updating SSH key files of node '%s'.", node)
1647 try:
1648 utils.RetryByNumberOfTimes(
1649 constants.SSHS_MAX_RETRIES,
1650 errors.SshUpdateError,
1651 run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
1652 ssh_port_map.get(node), pot_mc_data,
1653 debug=False, verbose=False, use_cluster_key=False,
1654 ask_key=False, strict_host_check=False)
1655 except errors.SshUpdateError as last_exception:
1656 error_msg = ("When adding the key of node '%s', updating SSH key"
1657 " files of node '%s' failed after %s retries."
1658 " Not trying again. Last error was: %s." %
1659 (node_info.name, node, constants.SSHS_MAX_RETRIES,
1660 last_exception))
1661 node_errors.append((node, error_msg))
1662 # We only log the error and don't throw an exception, because
1663 # one unreachable node shall not abort the entire procedure.
1664 logging.error(error_msg)
1665
1666 else:
1667 if to_authorized_keys:
1668 run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
1669 ssh_port_map.get(node), base_data,
1670 debug=False, verbose=False, use_cluster_key=False,
1671 ask_key=False, strict_host_check=False)
1672
1673 return node_errors
1674
1675
1676 def RemoveNodeSshKey(node_uuid, node_name,
1677 master_candidate_uuids,
1678 potential_master_candidates,
1679 master_uuid=None,
1680 keys_to_remove=None,
1681 from_authorized_keys=False,
1682 from_public_keys=False,
1683 clear_authorized_keys=False,
1684 clear_public_keys=False,
1685 pub_key_file=pathutils.SSH_PUB_KEYS,
1686 ssconf_store=None,
1687 noded_cert_file=pathutils.NODED_CERT_FILE,
1688 readd=False,
1689 run_cmd_fn=ssh.RunSshCmdWithStdin):
1690 """Removes the node's SSH keys from the key files and distributes those.
1691
1692 Note that at least one of the flags C{from_authorized_keys},
1693 C{from_public_keys}, C{clear_authorized_keys}, and C{clear_public_keys}
1694 has to be set to C{True} for the function to perform any action at all.
1695 Not doing so will trigger an assertion in the function.
1696
1697 @type node_uuid: str
1698 @param node_uuid: UUID of the node whose key is removed
1699 @type node_name: str
1700 @param node_name: name of the node whose key is removed
1701 @type master_candidate_uuids: list of str
1702 @param master_candidate_uuids: list of UUIDs of the current master candidates
1703 @type potential_master_candidates: list of str
1704 @param potential_master_candidates: list of names of potential master
1705 candidates
1706 @type keys_to_remove: dict of str to list of str
1707 @param keys_to_remove: a dictionary mapping node UUIDs to lists of SSH keys
1708 to be removed. This list is supposed to be used only if the keys are not
1709 in the public keys file. This is for example the case when removing a
1710 master node's key.
1711 @type from_authorized_keys: boolean
1712 @param from_authorized_keys: whether or not the key should be removed
1713 from the C{authorized_keys} file
1714 @type from_public_keys: boolean
1715 @param from_public_keys: whether or not the key should be remove from
1716 the C{ganeti_pub_keys} file
1717 @type clear_authorized_keys: boolean
1718 @param clear_authorized_keys: whether or not the C{authorized_keys} file
1719 should be cleared on the node whose keys are removed
1720 @type clear_public_keys: boolean
1721 @param clear_public_keys: whether to clear the node's C{ganeti_pub_keys} file
1722 @type readd: boolean
1723 @param readd: whether this is called during a readd operation.
1724 @rtype: list of string
1725 @returns: list of feedback messages
1726
1727 """
1728 node_list = [SshRemoveNodeInfo(uuid=node_uuid,
1729 name=node_name,
1730 from_authorized_keys=from_authorized_keys,
1731 from_public_keys=from_public_keys,
1732 clear_authorized_keys=clear_authorized_keys,
1733 clear_public_keys=clear_public_keys)]
1734 return RemoveNodeSshKeyBulk(node_list,
1735 master_candidate_uuids,
1736 potential_master_candidates,
1737 master_uuid=master_uuid,
1738 keys_to_remove=keys_to_remove,
1739 pub_key_file=pub_key_file,
1740 ssconf_store=ssconf_store,
1741 noded_cert_file=noded_cert_file,
1742 readd=readd,
1743 run_cmd_fn=run_cmd_fn)
1744
1745
1746 # Node info named tuple specifically for the use with RemoveNodeSshKeyBulk
1747 SshRemoveNodeInfo = collections.namedtuple(
1748 "SshRemoveNodeInfo",
1749 ["uuid",
1750 "name",
1751 "from_authorized_keys",
1752 "from_public_keys",
1753 "clear_authorized_keys",
1754 "clear_public_keys"])
1755
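# Example (illustrative sketch only): removing a former master candidate's
# key from the authorized_keys files, while keeping its entry in the
# ganeti_pub_keys file, would use an entry like
#
#   SshRemoveNodeInfo(uuid="node-uuid", name="node4.example.com",
#                     from_authorized_keys=True, from_public_keys=False,
#                     clear_authorized_keys=False, clear_public_keys=False)
#
# where the UUID and name are hypothetical placeholders.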
1756
1757 def RemoveNodeSshKeyBulk(node_list,
1758 master_candidate_uuids,
1759 potential_master_candidates,
1760 master_uuid=None,
1761 keys_to_remove=None,
1762 pub_key_file=pathutils.SSH_PUB_KEYS,
1763 ssconf_store=None,
1764 noded_cert_file=pathutils.NODED_CERT_FILE,
1765 readd=False,
1766 run_cmd_fn=ssh.RunSshCmdWithStdin):
1767 """Removes the node's SSH keys from the key files and distributes those.
1768
1769 Note that at least one of the flags C{from_authorized_keys},
1770 C{from_public_keys}, C{clear_authorized_keys}, and C{clear_public_keys}
1771 of at least one node has to be set to C{True} for the function to perform any
1772 action at all. Not doing so will trigger an assertion in the function.
1773
1774 @type node_list: list of C{SshRemoveNodeInfo}.
1775 @param node_list: list of information about nodes whose keys are being removed
1776 @type master_candidate_uuids: list of str
1777 @param master_candidate_uuids: list of UUIDs of the current master candidates
1778 @type potential_master_candidates: list of str
1779 @param potential_master_candidates: list of names of potential master
1780 candidates
1781 @type keys_to_remove: dict of str to list of str
1782 @param keys_to_remove: a dictionary mapping node UUIDs to lists of SSH keys
1783 to be removed. This list is supposed to be used only if the keys are not
1784 in the public keys file. This is for example the case when removing a
1785 master node's key.
1786 @type readd: boolean
1787 @param readd: whether this is called during a readd operation.
1788 @rtype: list of string
1789 @returns: list of feedback messages
1790
1791 """
1792 # Non-disruptive error messages, list of (node, msg) pairs
1793 result_msgs = []
1794
1795 # whether there are any keys to be removed or cleared at all
1796 from_authorized_keys = any([node_info.from_authorized_keys for node_info in
1797 node_list])
1798 from_public_keys = any([node_info.from_public_keys for node_info in
1799 node_list])
1800 clear_authorized_keys = any([node_info.clear_authorized_keys for node_info in
1801 node_list])
1802 clear_public_keys = any([node_info.clear_public_keys for node_info in
1803 node_list])
1804
1805 # Make sure at least one of these flags is true.
1806 if not (from_authorized_keys or from_public_keys or clear_authorized_keys
1807 or clear_public_keys):
1808 raise errors.SshUpdateError("No removal from any key file was requested.")
1809
1810 if not ssconf_store:
1811 ssconf_store = ssconf.SimpleStore()
1812
1813 master_node = ssconf_store.GetMasterNode()
1814 ssh_port_map = ssconf_store.GetSshPortMap()
1815
1816 all_keys_to_remove = {}
1817 if from_authorized_keys or from_public_keys:
1818 for node_info in node_list:
1819 # Skip nodes that don't actually need any keys to be removed.
1820 if not (node_info.from_authorized_keys or node_info.from_public_keys):
1821 continue
1822 if node_info.name == master_node and not keys_to_remove:
1823 raise errors.SshUpdateError("Cannot remove the master node's keys.")
1824 if keys_to_remove:
1825 keys = keys_to_remove
1826 else:
1827 keys = ssh.QueryPubKeyFile([node_info.uuid], key_file=pub_key_file)
1828 if (not keys or node_info.uuid not in keys) and not readd:
1829 raise errors.SshUpdateError("Node '%s' not found in the list of"
1830 " public SSH keys. It seems someone"
1831 " tries to remove a key from outside"
1832 " the cluster!" % node_info.uuid)
1833 # During an upgrade all nodes have the master key. In this case we
1834 # should not remove it to avoid accidentally shutting down cluster
1835 # SSH communication
1836 master_keys = None
1837 if master_uuid:
1838 master_keys = ssh.QueryPubKeyFile([master_uuid],
1839 key_file=pub_key_file)
1840
1841 # Remove any master keys from the list of keys to remove from the node
1842 keys[node_info.uuid] = list(
1843 set(keys[node_info.uuid]) - set(master_keys))
1844
1845 all_keys_to_remove.update(keys)
1846
1847 if all_keys_to_remove:
1848 base_data = {}
1849 _InitSshUpdateData(base_data, noded_cert_file, ssconf_store)
1850 cluster_name = base_data[constants.SSHS_CLUSTER_NAME]
1851
1852 if from_authorized_keys:
1853 # UUIDs of nodes that are supposed to be removed from the
1854 # authorized_keys files.
1855 nodes_remove_from_authorized_keys = [
1856 node_info.uuid for node_info in node_list
1857 if node_info.from_authorized_keys]
1858 keys_to_remove_from_authorized_keys = dict([
1859 (uuid, keys) for (uuid, keys) in all_keys_to_remove.items()
1860 if uuid in nodes_remove_from_authorized_keys])
1861 base_data[constants.SSHS_SSH_AUTHORIZED_KEYS] = \
1862 (constants.SSHS_REMOVE, keys_to_remove_from_authorized_keys)
1863 (auth_key_file, _) = \
1864 ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False,
1865 dircheck=False)
1866
1867 for uuid in nodes_remove_from_authorized_keys:
1868 ssh.RemoveAuthorizedKeys(auth_key_file,
1869 keys_to_remove_from_authorized_keys[uuid])
1870
1871 pot_mc_data = base_data.copy()
1872
1873 if from_public_keys:
1874 nodes_remove_from_public_keys = [
1875 node_info.uuid for node_info in node_list
1876 if node_info.from_public_keys]
1877 keys_to_remove_from_public_keys = dict([
1878 (uuid, keys) for (uuid, keys) in all_keys_to_remove.items()
1879 if uuid in nodes_remove_from_public_keys])
1880 pot_mc_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1881 (constants.SSHS_REMOVE, keys_to_remove_from_public_keys)
1882
1883 all_nodes = ssconf_store.GetNodeList()
1884 online_nodes = ssconf_store.GetOnlineNodeList()
1885 all_nodes_to_remove = [node_info.name for node_info in node_list]
1886 logging.debug("Removing keys of nodes '%s' from all nodes but itself and"
1887 " master.", ", ".join(all_nodes_to_remove))
1888 for node in all_nodes:
1889 if node == master_node:
1890 logging.debug("Skipping master node '%s'.", master_node)
1891 continue
1892 if node not in online_nodes:
1893 logging.debug("Skipping offline node '%s'.", node)
1894 continue
1895 if node in all_nodes_to_remove:
1896 logging.debug("Skipping node whose key is removed itself '%s'.", node)
1897 continue
1898 ssh_port = ssh_port_map.get(node)
1899 if not ssh_port:
1900 raise errors.OpExecError("No SSH port information available for"
1901 " node '%s', map: %s." %
1902 (node, ssh_port_map))
1903 error_msg_final = ("When removing the key of node '%s', updating the"
1904 " SSH key files of node '%s' failed. Last error"
1905 " was: %s.")
1906
1907 if node in potential_master_candidates or from_authorized_keys:
1908 if node in potential_master_candidates:
1909 node_desc = "potential master candidate"
1910 else:
1911 node_desc = "normal"
1912 logging.debug("Updating key setup of %s node %s.", node_desc, node)
1913 try:
1914 utils.RetryByNumberOfTimes(
1915 constants.SSHS_MAX_RETRIES,
1916 errors.SshUpdateError,
1917 run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
1918 ssh_port, pot_mc_data,
1919 debug=False, verbose=False, use_cluster_key=False,
1920 ask_key=False, strict_host_check=False)
1921 except errors.SshUpdateError as last_exception:
1922 error_msg = error_msg_final % (
1923 ", ".join(all_nodes_to_remove), node, last_exception)
1924 result_msgs.append((node, error_msg))
1925 logging.error(error_msg)
1926
1927 for node_info in node_list:
1928 if node_info.clear_authorized_keys or node_info.from_public_keys or \
1929 node_info.clear_public_keys:
1930 data = {}
1931 _InitSshUpdateData(data, noded_cert_file, ssconf_store)
1932 cluster_name = data[constants.SSHS_CLUSTER_NAME]
1933 ssh_port = ssh_port_map.get(node_info.name)
1934 if not ssh_port:
1935 raise errors.OpExecError("No SSH port information available for node"
1936 " '%s', which is leaving the cluster." % node_info.name)
1937
1938 if node_info.clear_authorized_keys:
1939 # The 'authorized_keys' file is not solely managed by Ganeti. Therefore,
1940 # we have to specify exactly which keys to clear to leave keys untouched
1941 # that were not added by Ganeti.
1942 other_master_candidate_uuids = [uuid for uuid in master_candidate_uuids
1943 if uuid != node_info.uuid]
1944 candidate_keys = ssh.QueryPubKeyFile(other_master_candidate_uuids,
1945 key_file=pub_key_file)
1946 data[constants.SSHS_SSH_AUTHORIZED_KEYS] = \
1947 (constants.SSHS_REMOVE, candidate_keys)
1948
1949 if node_info.clear_public_keys:
1950 data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1951 (constants.SSHS_CLEAR, {})
1952 elif node_info.from_public_keys:
1953 # Since clearing the public keys subsumes removing just a single key,
1954 # we only do it if clear_public_keys is 'False'.
1955
1956 if all_keys_to_remove:
1957 data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1958 (constants.SSHS_REMOVE, all_keys_to_remove)
1959
1960 # If we have no changes to any keyfile, just return
1961 if not (constants.SSHS_SSH_PUBLIC_KEYS in data or
1962 constants.SSHS_SSH_AUTHORIZED_KEYS in data):
1963 return result_msgs
1964
1965 logging.debug("Updating SSH key setup of target node '%s'.",
1966 node_info.name)
1967 try:
1968 utils.RetryByNumberOfTimes(
1969 constants.SSHS_MAX_RETRIES,
1970 errors.SshUpdateError,
1971 run_cmd_fn, cluster_name, node_info.name, pathutils.SSH_UPDATE,
1972 ssh_port, data,
1973 debug=False, verbose=False, use_cluster_key=False,
1974 ask_key=False, strict_host_check=False)
1975 except errors.SshUpdateError as last_exception:
1976 result_msgs.append(
1977 (node_info.name,
1978 ("Removing SSH keys from node '%s' failed."
1979 " This can happen when the node is already unreachable."
1980 " Error: %s" % (node_info.name, last_exception))))
1981
1982 if all_keys_to_remove and from_public_keys:
1983 for node_uuid in nodes_remove_from_public_keys:
1984 ssh.RemovePublicKey(node_uuid, key_file=pub_key_file)
1985
1986 return result_msgs
1987
1988
1989 def _GenerateNodeSshKey(node_uuid, node_name, ssh_port_map, ssh_key_type,
1990 ssh_key_bits, pub_key_file=pathutils.SSH_PUB_KEYS,
1991 ssconf_store=None,
1992 noded_cert_file=pathutils.NODED_CERT_FILE,
1993 run_cmd_fn=ssh.RunSshCmdWithStdin,
1994 suffix=""):
1995 """Generates the root SSH key pair on the node.
1996
1997 @type node_uuid: str
1998 @param node_uuid: UUID of the node whose key is generated
1999 @type node_name: str
2000 @param node_name: name of the node whose key is generated
2001 @type ssh_port_map: dict of str to int
2002 @param ssh_port_map: mapping of node names to their SSH port
2003 @type ssh_key_type: One of L{constants.SSHK_ALL}
2004 @param ssh_key_type: the type of SSH key to be generated
2005 @type ssh_key_bits: int
2006 @param ssh_key_bits: the length of the key to be generated
2007
2008 """
2009 if not ssconf_store:
2010 ssconf_store = ssconf.SimpleStore()
2011
2012 keys_by_uuid = ssh.QueryPubKeyFile([node_uuid], key_file=pub_key_file)
2013 if not keys_by_uuid or node_uuid not in keys_by_uuid:
2014 raise errors.SshUpdateError("Node %s (UUID: %s) whose key is requested to"
2015 " be regenerated is not registered in the"
2016 " public keys file." % (node_name, node_uuid))
2017
2018 data = {}
2019 _InitSshUpdateData(data, noded_cert_file, ssconf_store)
2020 cluster_name = data[constants.SSHS_CLUSTER_NAME]
2021 data[constants.SSHS_GENERATE] = (ssh_key_type, ssh_key_bits, suffix)
2022
2023 run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE,
2024 ssh_port_map.get(node_name), data,
2025 debug=False, verbose=False, use_cluster_key=False,
2026 ask_key=False, strict_host_check=False)
2027
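# Example (illustrative sketch only): the payload handed to the node's
# ssh-update helper by _GenerateNodeSshKey is roughly
#
#   {constants.SSHS_CLUSTER_NAME: "cluster.example.com",
#    # plus the node daemon certificate entry added by _InitSshUpdateData
#    constants.SSHS_GENERATE: (constants.SSHK_RSA, 2048, "")}
#
# with the cluster name being a hypothetical placeholder; an empty suffix
# means the generated key replaces the node's regular key files directly.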
2028
2029 def _GetMasterNodeUUID(node_uuid_name_map, master_node_name):
2030 master_node_uuids = [node_uuid for (node_uuid, node_name)
2031 in node_uuid_name_map
2032 if node_name == master_node_name]
2033 if len(master_node_uuids) != 1:
2034 raise errors.SshUpdateError("No (unique) master UUID found. Master node"
2035 " name: '%s', Master UUID: '%s'" %
2036 (master_node_name, master_node_uuids))
2037 return master_node_uuids[0]
2038
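# Example (illustrative sketch only): given
#   node_uuid_name_map = [("uuid-1", "nodeA"), ("uuid-2", "nodeB")]
# _GetMasterNodeUUID(node_uuid_name_map, "nodeB") returns "uuid-2", while a
# name matching no entry (or several) raises SshUpdateError, since the
# master must have exactly one UUID.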
2039
2040 def _GetOldMasterKeys(master_node_uuid, pub_key_file):
2041 old_master_keys_by_uuid = ssh.QueryPubKeyFile([master_node_uuid],
2042 key_file=pub_key_file)
2043 if not old_master_keys_by_uuid:
2044 raise errors.SshUpdateError("No public key of the master node (UUID '%s')"
2045 " found, not generating a new key."
2046 % master_node_uuid)
2047 return old_master_keys_by_uuid
2048
2049
2050 def _GetNewMasterKey(root_keyfiles, master_node_uuid):
2051 new_master_keys = []
2052 for (_, (_, public_key_file)) in root_keyfiles.items():
2053 public_key_dir = os.path.dirname(public_key_file)
2054 public_key_file_tmp_filename = \
2055 os.path.splitext(os.path.basename(public_key_file))[0] \
2056 + constants.SSHS_MASTER_SUFFIX + ".pub"
2057 public_key_path_tmp = os.path.join(public_key_dir,
2058 public_key_file_tmp_filename)
2059 if os.path.exists(public_key_path_tmp):
2060 # for some key types, there might not be any keys
2061 key = utils.ReadFile(public_key_path_tmp)
2062 new_master_keys.append(key)
2063 if not new_master_keys:
2064 raise errors.SshUpdateError("Cannot find any type of temporary SSH key.")
2065 return {master_node_uuid: new_master_keys}
2066
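# Example (illustrative sketch only): for the public key file
# "/root/.ssh/id_rsa.pub" and assuming constants.SSHS_MASTER_SUFFIX were
# "_master_tmp", _GetNewMasterKey would look for the temporary key in
# "/root/.ssh/id_rsa_master_tmp.pub"; key types for which no temporary file
# exists are silently skipped.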
2067
2068 def _ReplaceMasterKeyOnMaster(root_keyfiles):
2069 number_of_moves = 0
2070 for (_, (private_key_file, public_key_file)) in root_keyfiles.items():
2071 key_dir = os.path.dirname(public_key_file)
2072 private_key_file_tmp = \
2073 os.path.basename(private_key_file) + constants.SSHS_MASTER_SUFFIX
2074 public_key_file_tmp = private_key_file_tmp + ".pub"
2075 private_key_path_tmp = os.path.join(key_dir,
2076 private_key_file_tmp)
2077 public_key_path_tmp = os.path.join(key_dir,
2078 public_key_file_tmp)
2079 if os.path.exists(public_key_file):
2080 utils.CreateBackup(public_key_file)
2081 utils.RemoveFile(public_key_file)
2082 if os.path.exists(private_key_file):
2083 utils.CreateBackup(private_key_file)
2084 utils.RemoveFile(private_key_file)
2085 if os.path.exists(public_key_path_tmp) and \
2086 os.path.exists(private_key_path_tmp):
2087 # for some key types, there might not be any keys
2088 shutil.move(public_key_path_tmp, public_key_file)
2089 shutil.move(private_key_path_tmp, private_key_file)
2090 number_of_moves += 1
2091 if not number_of_moves:
2092 raise errors.SshUpdateError("Could not move at least one master SSH key.")
2093
2094
2095 def RenewSshKeys(node_uuids, node_names, master_candidate_uuids,
2096 potential_master_candidates, old_key_type, new_key_type,
2097 new_key_bits,
2098 ganeti_pub_keys_file=pathutils.SSH_PUB_KEYS,
2099 ssconf_store=None,
2100 noded_cert_file=pathutils.NODED_CERT_FILE,
2101 run_cmd_fn=ssh.RunSshCmdWithStdin):
2102 """Renews all SSH keys and updates authorized_keys and ganeti_pub_keys.
2103
2104 @type node_uuids: list of str
2105 @param node_uuids: list of node UUIDs whose keys should be renewed
2106 @type node_names: list of str
2107 @param node_names: list of node names whose keys should be renewed. This list
2108 should match the C{node_uuids} parameter
2109 @type master_candidate_uuids: list of str
2110 @param master_candidate_uuids: list of UUIDs of master candidates or
2111 master node
2112 @type old_key_type: One of L{constants.SSHK_ALL}
2113 @param old_key_type: the type of SSH key already present on nodes
2114 @type new_key_type: One of L{constants.SSHK_ALL}
2115 @param new_key_type: the type of SSH key to be generated
2116 @type new_key_bits: int
2117 @param new_key_bits: the length of the key to be generated
2118 @type ganeti_pub_keys_file: str
2119 @param ganeti_pub_keys_file: file path of the public key file
2120 @type noded_cert_file: str
2121 @param noded_cert_file: path of the noded SSL certificate file
2122 @type run_cmd_fn: function
2123 @param run_cmd_fn: function to run commands on remote nodes via SSH
2124 @raise ProgrammerError: if node_uuids and node_names don't match;
2125 SshUpdateError if a node's key is missing from the public key file,
2126 if a node's new SSH key could not be fetched from it, or if there is
2127 no entry or more than one entry in the public key list for the master
2128 node.
2129
2130 """
2131 if not ssconf_store:
2132 ssconf_store = ssconf.SimpleStore()
2133 cluster_name = ssconf_store.GetClusterName()
2134
2135 if len(node_uuids) != len(node_names):
2136 raise errors.ProgrammerError("List of nodes UUIDs and node names"
2137 " does not match in length.")
2138
2139 (_, root_keyfiles) = \
2140 ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
2141 (_, old_pub_keyfile) = root_keyfiles[old_key_type]
2142 (_, new_pub_keyfile) = root_keyfiles[new_key_type]
2143 old_master_key = utils.ReadFile(old_pub_keyfile)
2144
2145 node_uuid_name_map = zip(node_uuids, node_names)
2146
2147 master_node_name = ssconf_store.GetMasterNode()
2148 master_node_uuid = _GetMasterNodeUUID(node_uuid_name_map, master_node_name)
2149 ssh_port_map = ssconf_store.GetSshPortMap()
2150 # List of all node errors that happened, but which did not abort the
2151 # procedure as a whole. It is important that this is a list to have a
2152 # somewhat chronological history of events.
2153 all_node_errors = []
2154
2155 # process non-master nodes
2156
2157 # keys to add in bulk at the end
2158 node_keys_to_add = []
2159
2160 # list of all nodes
2161 node_list = []
2162
2163 # list of keys to be removed before generating new keys
2164 node_info_to_remove = []
2165
2166 for node_uuid, node_name in node_uuid_name_map:
2167 if node_name == master_node_name:
2168 continue
2169 master_candidate = node_uuid in master_candidate_uuids
2170 potential_master_candidate = node_name in potential_master_candidates
2171 node_list.append((node_uuid, node_name, master_candidate,
2172 potential_master_candidate))
2173
2174 keys_by_uuid = ssh.QueryPubKeyFile([node_uuid],
2175 key_file=ganeti_pub_keys_file)
2176 if not keys_by_uuid:
2177 raise errors.SshUpdateError("No public key of node %s (UUID %s) found,"
2178 " not generating a new key."
2179 % (node_name, node_uuid))
2180
2181 if master_candidate:
2182 logging.debug("Fetching old SSH key from node '%s'.", node_name)
2183 old_pub_key = ssh.ReadRemoteSshPubKeys(old_pub_keyfile,
2184 node_name, cluster_name,
2185 ssh_port_map[node_name],
2186 False, # ask_key
2187 False) # key_check
2188 if old_pub_key != old_master_key:
2189 # If we are already in a multi-key setup (that is past Ganeti 2.12),
2190 # we can safely remove the old key of the node. Otherwise, we cannot
2191 # remove that node's key, because it is also the master node's key
2192 # and that would terminate all communication from the master to the
2193 # node.
2194 node_info_to_remove.append(SshRemoveNodeInfo(
2195 uuid=node_uuid,
2196 name=node_name,
2197 from_authorized_keys=master_candidate,
2198 from_public_keys=False,
2199 clear_authorized_keys=False,
2200 clear_public_keys=False))
2201 else:
2202 logging.debug("Old key of node '%s' is the same as the current master"
2203 " key. Not deleting that key on the node.", node_name)
2204
2205 logging.debug("Removing old SSH keys of all master candidates.")
2206 if node_info_to_remove:
2207 node_errors = RemoveNodeSshKeyBulk(
2208 node_info_to_remove,
2209 master_candidate_uuids,
2210 potential_master_candidates,
2211 master_uuid=master_node_uuid)
2212 if node_errors:
2213 all_node_errors = all_node_errors + node_errors
2214
2215 for (node_uuid, node_name, master_candidate, potential_master_candidate) \
2216 in node_list:
2217
2218 logging.debug("Generating new SSH key for node '%s'.", node_name)
2219 _GenerateNodeSshKey(node_uuid, node_name, ssh_port_map, new_key_type,
2220 new_key_bits, pub_key_file=ganeti_pub_keys_file,
2221 ssconf_store=ssconf_store,
2222 noded_cert_file=noded_cert_file,
2223 run_cmd_fn=run_cmd_fn)
2224
2225 try:
2226 logging.debug("Fetching newly created SSH key from node '%s'.", node_name)
2227 pub_key = ssh.ReadRemoteSshPubKeys(new_pub_keyfile,
2228 node_name, cluster_name,
2229 ssh_port_map[node_name],
2230 False, # ask_key
2231 False) # key_check
2232 except Exception:
2233 raise errors.SshUpdateError("Could not fetch key of node %s"
2234 " (UUID %s)" % (node_name, node_uuid))
2235
2236 if potential_master_candidate:
2237 ssh.RemovePublicKey(node_uuid, key_file=ganeti_pub_keys_file)
2238 ssh.AddPublicKey(node_uuid, pub_key, key_file=ganeti_pub_keys_file)
2239
2240 node_info = SshAddNodeInfo(name=node_name,
2241 uuid=node_uuid,
2242 to_authorized_keys=master_candidate,
2243 to_public_keys=potential_master_candidate,
2244 get_public_keys=True)
2245 node_keys_to_add.append(node_info)
2246
2247 node_errors = AddNodeSshKeyBulk(
2248 node_keys_to_add, potential_master_candidates,
2249 pub_key_file=ganeti_pub_keys_file, ssconf_store=ssconf_store,
2250 noded_cert_file=noded_cert_file,
2251 run_cmd_fn=run_cmd_fn)
2252 if node_errors:
2253 all_node_errors = all_node_errors + node_errors
2254
2255 # Renewing the master node's key
2256
2257 # Preserve the old keys for now
2258 old_master_keys_by_uuid = _GetOldMasterKeys(master_node_uuid,
2259 ganeti_pub_keys_file)
2260
2261 # Generate a new master key with a suffix, don't touch the old one for now
2262 logging.debug("Generate new ssh key of master.")
2263 _GenerateNodeSshKey(master_node_uuid, master_node_name, ssh_port_map,
2264 new_key_type, new_key_bits,
2265 pub_key_file=ganeti_pub_keys_file,
2266 ssconf_store=ssconf_store,
2267 noded_cert_file=noded_cert_file,
2268 run_cmd_fn=run_cmd_fn,
2269 suffix=constants.SSHS_MASTER_SUFFIX)
2270 # Read newly created master key
2271 new_master_key_dict = _GetNewMasterKey(root_keyfiles, master_node_uuid)
2272
2273 # Replace master key in the master nodes' public key file
2274 ssh.RemovePublicKey(master_node_uuid, key_file=ganeti_pub_keys_file)
2275 for pub_key in new_master_key_dict[master_node_uuid]:
2276 ssh.AddPublicKey(master_node_uuid, pub_key, key_file=ganeti_pub_keys_file)
2277
2278 # Add new master key to all node's public and authorized keys
2279 logging.debug("Add new master key to all nodes.")
2280 node_errors = AddNodeSshKey(
2281 master_node_uuid, master_node_name, potential_master_candidates,
2282 to_authorized_keys=True, to_public_keys=True,
2283 get_public_keys=False, pub_key_file=ganeti_pub_keys_file,
2284 ssconf_store=ssconf_store, noded_cert_file=noded_cert_file,
2285 run_cmd_fn=run_cmd_fn)
2286 if node_errors:
2287 all_node_errors = all_node_errors + node_errors
2288
2289 # Remove the old key file and rename the new key to the non-temporary filename
2290 _ReplaceMasterKeyOnMaster(root_keyfiles)
2291
2292 # Remove old key from authorized keys
2293 (auth_key_file, _) = \
2294 ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
2295 ssh.RemoveAuthorizedKeys(auth_key_file,
2296 old_master_keys_by_uuid[master_node_uuid])
2297
2298 # Remove the old key from all node's authorized keys file
2299 logging.debug("Remove the old master key from all nodes.")
2300 node_errors = RemoveNodeSshKey(
2301 master_node_uuid, master_node_name, master_candidate_uuids,
2302 potential_master_candidates,
2303 keys_to_remove=old_master_keys_by_uuid, from_authorized_keys=True,
2304 from_public_keys=False, clear_authorized_keys=False,
2305 clear_public_keys=False)
2306 if node_errors:
2307 all_node_errors = all_node_errors + node_errors
2308
2309 return all_node_errors
2310
2311
2312 def GetBlockDevSizes(devices):
2313 """Return the size of the given block devices
2314
2315 @type devices: list
2316 @param devices: list of block device nodes to query
2317 @rtype: dict
2318 @return:
2319 dictionary of all block devices under /dev (key). The value is their
2320 size in MiB.
2321
2322 {'/dev/disk/by-uuid/123456-12321231-312312-312': 124}
2323
2324 """
2325 DEV_PREFIX = "/dev/"
2326 blockdevs = {}
2327
2328 for devpath in devices:
2329 if not utils.IsBelowDir(DEV_PREFIX, devpath):
2330 continue
2331
2332 try:
2333 st = os.stat(devpath)
2334 except EnvironmentError, err:
2335 logging.warning("Error stat()'ing device %s: %s", devpath, str(err))
2336 continue
2337
2338 if stat.S_ISBLK(st.st_mode):
2339 result = utils.RunCmd(["blockdev", "--getsize64", devpath])
2340 if result.failed:
2341 # We don't want to fail, just do not list this device as available
2342 logging.warning("Cannot get size for block device %s", devpath)
2343 continue
2344
2345 size = int(result.stdout) / (1024 * 1024)
2346 blockdevs[devpath] = size
2347 return blockdevs
2348
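# Example (illustrative sketch only): paths outside /dev are silently
# skipped, and sizes are converted from bytes (as reported by
# "blockdev --getsize64") to MiB:
#
#   GetBlockDevSizes(["/dev/sda", "/tmp/not-a-device"])
#   could return {"/dev/sda": 476940} for a disk of about 500 GB.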
2349
2350 def GetVolumeList(vg_names):
2351 """Compute list of logical volumes and their size.
2352
2353 @type vg_names: list
2354 @param vg_names: the volume groups whose LVs we should list, or
2355 empty for all volume groups
2356 @rtype: dict
2357 @return:
2358 dictionary of all partitions (key) with value being a tuple of
2359 their size (in MiB), inactive and online status::
2360
2361 {'xenvg/test1': ('20.06', True, True)}
2362
2363 in case of errors, a string is returned with the error
2364 details.
2365
2366 """
2367 lvs = {}
2368 sep = "|"
2369 if not vg_names:
2370 vg_names = []
2371 result = utils.RunCmd(["lvs", "--noheadings", "--units=m", "--nosuffix",
2372 "--separator=%s" % sep,
2373 "-ovg_name,lv_name,lv_size,lv_attr"] + vg_names)
2374 if result.failed:
2375 _Fail("Failed to list logical volumes, lvs output: %s", result.output)
2376
2377 for line in result.stdout.splitlines():
2378 line = line.strip()
2379 match = _LVSLINE_REGEX.match(line)
2380 if not match:
2381 logging.error("Invalid line returned from lvs output: '%s'", line)
2382 continue
2383 vg_name, name, size, attr = match.groups()
2384 inactive = attr[4] == "-"
2385 online = attr[5] == "o"
2386 virtual = attr[0] == "v"
2387 if virtual:
2388 # we don't want to report such volumes as existing, since they
2389 # don't really hold data
2390 continue
2391 lvs[vg_name + "/" + name] = (size, inactive, online)
2392
2393 return lvs
2394
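# Example (illustrative sketch only): a typical line produced by the "lvs"
# invocation in GetVolumeList looks like
#
#   xenvg|test1|20.06|-wi-ao----
#
# The attribute string is positional: attr[0] == "v" marks a virtual volume
# (which is skipped), attr[4] is the state ("-" meaning inactive) and
# attr[5] the open status ("o" meaning online), yielding the entry
# {"xenvg/test1": ("20.06", False, True)} for the line above.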
2395
2396 def ListVolumeGroups():
2397 """List the volume groups and their size.
2398
2399 @rtype: dict
2400 @return: dictionary with keys volume name and values the
2401 size of the volume
2402
2403 """
2404 return utils.ListVolumeGroups()
2405
2406
2407 def NodeVolumes():
2408 """List all volumes on this node.
2409
2410 @rtype: list
2411 @return:
2412 A list of dictionaries, each having four keys:
2413 - name: the logical volume name,
2414 - size: the size of the logical volume
2415 - dev: the physical device on which the LV lives
2416 - vg: the volume group to which it belongs
2417
2418 In case of errors, we return an empty list and log the
2419 error.
2420
2421 Note that since a logical volume can live on multiple physical
2422 volumes, the resulting list might include a logical volume
2423 multiple times.
2424
2425 """
2426 result = utils.RunCmd(["lvs", "--noheadings", "--units=m", "--nosuffix",
2427 "--separator=|",
2428 "--options=lv_name,lv_size,devices,vg_name"])
2429 if result.failed:
2430 _Fail("Failed to list logical volumes, lvs output: %s",
2431 result.output)
2432
2433 def parse_dev(dev):
2434 return dev.split("(")[0]
2435
2436 def handle_dev(dev):
2437 return [parse_dev(x) for x in dev.split(",")]
2438
2439 def map_line(line):
2440 line = [v.strip() for v in line]
2441 return [{"name": line[0], "size": line[1],
2442 "dev": dev, "vg": line[3]} for dev in handle_dev(line[2])]
2443
2444 all_devs = []
2445 for line in result.stdout.splitlines():
2446 if line.count("|") >= 3:
2447 all_devs.extend(map_line(line.split("|")))
2448 else:
2449 logging.warning("Strange line in the output from lvs: '%s'", line)
2450 return all_devs
2451
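# Example (illustrative sketch only): the "devices" column reported by
# "lvs" looks like "/dev/sda1(0),/dev/sdb1(0)", where the parenthesized
# part is the starting extent that parse_dev() strips. A line such as
#
#   test1|20.06|/dev/sda1(0),/dev/sdb1(0)|xenvg
#
# therefore expands into two dictionaries, one per physical device.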
2452
2453 def BridgesExist(bridges_list):
2454 """Check if a list of bridges exist on the current node.
2455
2456 @rtype: boolean
2457 @return: C{True} if all of them exist, C{False} otherwise
2458
2459 """
2460 missing = []
2461 for bridge in bridges_list:
2462 if not utils.BridgeExists(bridge):
2463 missing.append(bridge)
2464
2465 if missing:
2466 _Fail("Missing bridges %s", utils.CommaJoin(missing))
2467
2468
2469 def GetInstanceListForHypervisor(hname, hvparams=None,
2470 get_hv_fn=hypervisor.GetHypervisor):
2471 """Provides a list of instances of the given hypervisor.
2472
2473 @type hname: string
2474 @param hname: name of the hypervisor
2475 @type hvparams: dict of strings
2476 @param hvparams: hypervisor parameters for the given hypervisor
2477 @type get_hv_fn: function
2478 @param get_hv_fn: function that returns a hypervisor for the given hypervisor
2479 name; optional parameter to increase testability
2480
2481 @rtype: list
2482 @return: a list of all running instances on the current node
2483 - instance1.example.com
2484 - instance2.example.com
2485
2486 """
2487 try:
2488 return get_hv_fn(hname).ListInstances(hvparams=hvparams)
2489 except errors.HypervisorError, err:
2490 _Fail("Error enumerating instances (hypervisor %s): %s",
2491 hname, err, exc=True)
2492
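# Example (illustrative sketch only): assuming kvm_params holds the
# cluster-wide KVM hypervisor parameters,
#
#   GetInstanceListForHypervisor(constants.HT_KVM, hvparams=kvm_params)
#
# returns the names the hypervisor reports as running, e.g.
# ["instance1.example.com", "instance2.example.com"].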
2493
2494 def GetInstanceList(hypervisor_list, all_hvparams=None,
2495 get_hv_fn=hypervisor.GetHypervisor):
2496 """Provides a list of instances.
2497
2498 @type hypervisor_list: list
2499 @param hypervisor_list: the list of hypervisors to query information
2500 @type all_hvparams: dict of dict of strings
2501 @param all_hvparams: a dictionary mapping hypervisor types to respective
2502 cluster-wide hypervisor parameters
2503 @type get_hv_fn: function
2504 @param get_hv_fn: function that returns a hypervisor for the given hypervisor
2505 name; optional parameter to increase testability
2506
2507 @rtype: list
2508 @return: a list of all running instances on the current node
2509 - instance1.example.com
2510 - instance2.example.com
2511
2512 """
2513 results = []
2514 for hname in hypervisor_list:
2515 hvparams = all_hvparams[hname]
2516 results.extend(GetInstanceListForHypervisor(hname, hvparams=hvparams,
2517 get_hv_fn=get_hv_fn))
2518 return results
2519
2520
2521 def GetInstanceInfo(instance, hname, hvparams=None):
2522 """Gives back the information about an instance as a dictionary.
2523
2524 @type instance: string
2525 @param instance: the instance name
2526 @type hname: string
2527 @param hname: the hypervisor type of the instance
2528 @type hvparams: dict of strings
2529 @param hvparams: the instance's hvparams
2530
2531 @rtype: dict
2532 @return: dictionary with the following keys:
2533 - memory: memory size of instance (int)
2534 - state: state of instance (HvInstanceState)
2535 - time: cpu time of instance (float)
2536 - vcpus: the number of vcpus (int)
2537
2538 """
2539 output = {}
2540
2541 iinfo = hypervisor.GetHypervisor(hname).GetInstanceInfo(instance,
2542 hvparams=hvparams)
2543 if iinfo is not None:
2544 output["memory"] = iinfo[2]
2545 output["vcpus"] = iinfo[3]
2546 output["state"] = iinfo[4]
2547 output["time"] = iinfo[5]
2548
2549 return output
2550
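# Example (illustrative sketch only): for a running instance the returned
# dictionary looks like
#
#   {"memory": 1024, "vcpus": 2, "state": <HvInstanceState>, "time": 13.7}
#
# whereas an instance unknown to the hypervisor yields an empty dictionary.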
2551
2552 def GetInstanceMigratable(instance):
2553 """Computes whether an instance can be migrated.
2554
2555 @type instance: L{objects.Instance}
2556 @param instance: object representing the instance to be checked.
2557
2558 Note that missing disk symlinks only produce warnings in the node
2559 daemon log and do not make the instance non-migratable.
2560
2561 @rtype: None
2562 @raise RPCFail: if the instance is not running on this node
2563 """
2564 hyper = hypervisor.GetHypervisor(instance.hypervisor)
2565 iname = instance.name
2566 if iname not in hyper.ListInstances(hvparams=instance.hvparams):
2567 _Fail("Instance %s is not running", iname)
2568
2569 for idx in range(len(instance.disks_info)):
2570 link_name = _GetBlockDevSymlinkPath(iname, idx)
2571 if not os.path.islink(link_name):
2572 logging.warning("Instance %s is missing symlink %s for disk %d",
2573 iname, link_name, idx)
2574
2575
2576 def GetAllInstancesInfo(hypervisor_list, all_hvparams):
2577 """Gather data about all instances.
2578
2579 This is the equivalent of L{GetInstanceInfo}, except that it
2580 computes data for all instances at once, thus being faster if one
2581 needs data about more than one instance.
2582
2583 @type hypervisor_list: list
2584 @param hypervisor_list: list of hypervisors to query for instance data
2585 @type all_hvparams: dict of dict of strings
2586 @param all_hvparams: mapping of hypervisor names to hvparams
2587
2588 @rtype: dict
2589 @return: dictionary of instance: data, with data having the following keys:
2590 - memory: memory size of instance (int)
2591 - state: state of instance (HvInstanceState)
2592 - time: cpu time of instance (float)
2593 - vcpus: the number of vcpus
2594
2595 """
2596 output = {}
2597 for hname in hypervisor_list:
2598 hvparams = all_hvparams[hname]
2599 iinfo = hypervisor.GetHypervisor(hname).GetAllInstancesInfo(hvparams)
2600 if iinfo:
2601 for name, _, memory, vcpus, state, times in iinfo:
2602 value = {
2603 "memory": memory,
2604 "vcpus": vcpus,
2605 "state": state,
2606 "time": times,
2607 }
2608 if name in output:
2609 # we only check static parameters, like memory and vcpus,
2610 # and not state and time which can change between the
2611 # invocations of the different hypervisors
2612 for key in "memory", "vcpus":
2613 if value[key] != output[name][key]:
2614 _Fail("Instance %s is running twice"
2615 " with different parameters", name)
2616 output[name] = value
2617
2618 return output
2619
2620
2621 def GetInstanceConsoleInfo(instance_param_dict,
2622 get_hv_fn=hypervisor.GetHypervisor):
2623 """Gather data about the console access of a set of instances of this node.
2624
2625 This function assumes that the caller already knows which instances are on
2626 this node, by calling a function such as L{GetAllInstancesInfo} or
2627 L{GetInstanceList}.
2628
2629 For every instance, a large amount of configuration data needs to be
2630 provided to the hypervisor interface in order to receive the console
2631 information. Whether this could or should be cut down can be discussed.
2632 The information is provided in a dictionary indexed by instance name,
2633 allowing any number of instance queries to be done.
2634
2635 @type instance_param_dict: dict of string to tuple of dictionaries, where the
2636 dictionaries represent: L{objects.Instance}, L{objects.Node},
2637 L{objects.NodeGroup}, HvParams, BeParams
2638 @param instance_param_dict: mapping of instance name to parameters necessary
2639 for console information retrieval
2640
2641 @rtype: dict
2642 @return: dictionary of instance: data, with data having the following keys:
2643 - instance: instance name
2644 - kind: console kind
2645 - message: used with kind == CONS_MESSAGE; indicates that the console
2646 is unavailable and supplies the error message
2647 - host: host to connect to
2648 - port: port to use
2649 - user: user for login
2650 - command: the command, broken into parts as an array
2651 - display: unknown, potentially unused?
2652
2653 """
2654
2655 output = {}
2656 for inst_name in instance_param_dict:
2657 instance = instance_param_dict[inst_name]["instance"]
2658 pnode = instance_param_dict[inst_name]["node"]
2659 group = instance_param_dict[inst_name]["group"]
2660 hvparams = instance_param_dict[inst_name]["hvParams"]
2661 beparams = instance_param_dict[inst_name]["beParams"]
2662
2663 instance = objects.Instance.FromDict(instance)
2664 pnode = objects.Node.FromDict(pnode)
2665 group = objects.NodeGroup.FromDict(group)
2666
2667 h = get_hv_fn(instance.hypervisor)
2668 output[inst_name] = h.GetInstanceConsole(instance, pnode, group,
2669 hvparams, beparams).ToDict()
2670
2671 return output
2672
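# Example (illustrative sketch only): a single-instance query is shaped
# like
#
#   GetInstanceConsoleInfo({
#     "inst1.example.com": {"instance": inst.ToDict(),
#                           "node": node.ToDict(),
#                           "group": group.ToDict(),
#                           "hvParams": hvparams,
#                           "beParams": beparams}})
#
# where inst, node and group are the corresponding config objects; they are
# serialized with ToDict() to match the FromDict() calls above.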
2673
2674 def _InstanceLogName(kind, os_name, instance, component):
2675 """Compute the OS log filename for a given instance and operation.
2676
2677 The instance name and os name are passed in as strings since not all
2678 operations have these as part of an instance object.
2679
2680 @type kind: string
2681 @param kind: the operation type (e.g. add, import, etc.)
2682 @type os_name: string
2683 @param os_name: the os name
2684 @type instance: string
2685 @param instance: the name of the instance being imported/added/etc.
2686 @type component: string or None
2687 @param component: the name of the component of the instance being
2688 transferred
2689
2690 """
2691 # TODO: Use tempfile.mkstemp to create unique filename
2692 if component:
2693 assert "/" not in component
2694 c_msg = "-%s" % component
2695 else:
2696 c_msg = ""
2697 base = ("%s-%s-%s%s-%s.log" %
2698 (kind, os_name, instance, c_msg, utils.TimestampForFilename()))
2699 return utils.PathJoin(pathutils.LOG_OS_DIR, base)
2700
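# Example (illustrative sketch only): for a hypothetical call
# _InstanceLogName("add", "debootstrap", "inst1.example.com", None) the
# resulting path looks like
#   <LOG_OS_DIR>/add-debootstrap-inst1.example.com-<timestamp>.log
# with the timestamp format defined by utils.TimestampForFilename().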
2701
2702 def InstanceOsAdd(instance, reinstall, debug):
2703 """Add an OS to an instance.
2704
2705 @type instance: L{objects.Instance}
2706 @param instance: Instance whose OS is to be installed
2707 @type reinstall: boolean
2708 @param reinstall: whether this is an instance reinstall
2709 @type debug: integer
2710 @param debug: debug level, passed to the OS scripts
2711 @rtype: None
2712
2713 """
2714 inst_os = OSFromDisk(instance.os)
2715
2716 create_env = OSEnvironment(instance, inst_os, debug)
2717 if reinstall:
2718 create_env["INSTANCE_REINSTALL"] = "1"
2719
2720 logfile = _InstanceLogName("add", instance.os, instance.name, None)
2721
2722 result = utils.RunCmd([inst_os.create_script], env=create_env,
2723 cwd=inst_os.path, output=logfile, reset_env=True)
2724 if result.failed:
2725 logging.error("os create command '%s' returned error: %s, logfile: %s,"
2726 " output: %s", result.cmd, result.fail_reason, logfile,
2727 result.output)
2728 lines = [utils.SafeEncode(val)
2729 for val in utils.TailFile(logfile, lines=20)]
2730 _Fail("OS create script failed (%s), last lines in the"
2731 " log file:\n%s", result.fail_reason, "\n".join(lines), log=False)
2732
2733
2734 def RunRenameInstance(instance, old_name, debug):
2735 """Run the OS rename script for an instance.
2736
2737 @type instance: L{objects.Instance}
2738 @param instance: Instance whose OS is to be installed
2739 @type old_name: string
2740 @param old_name: previous instance name
2741 @type debug: integer
2742 @param debug: debug level, passed to the OS scripts
2743 @rtype: boolean
2744 @return: the success of the operation
2745
2746 """
2747 inst_os = OSFromDisk(instance.os)
2748
2749 rename_env = OSEnvironment(instance, inst_os, debug)
2750 rename_env["OLD_INSTANCE_NAME"] = old_name
2751
2752 logfile = _InstanceLogName("rename", instance.os,
2753 "%s-%s" % (old_name, instance.name), None)
2754
2755 result = utils.RunCmd([inst_os.rename_script], env=rename_env,
2756 cwd=inst_os.path, output=logfile, reset_env=True)
2757
2758 if result.failed:
2759 logging.error("os create command '%s' returned error: %s output: %s",
2760 result.cmd, result.fail_reason, result.output)
2761 lines = [utils.SafeEncode(val)
2762 for val in utils.TailFile(logfile, lines=20)]
2763 _Fail("OS rename script failed (%s), last lines in the"
2764 " log file:\n%s", result.fail_reason, "\n".join(lines), log=False)
2765
2766
2767 def _GetBlockDevSymlinkPath(instance_name, idx, _dir=None):
2768 """Returns symlink path for block device.
2769
2770 """
2771 if _dir is None:
2772 _dir = pathutils.DISK_LINKS_DIR
2773
2774 return utils.PathJoin(_dir,
2775 ("%s%s%s" %
2776 (instance_name, constants.DISK_SEPARATOR, idx)))
2777
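# Example (illustrative sketch only): assuming constants.DISK_SEPARATOR is
# ":", _GetBlockDevSymlinkPath("inst1.example.com", 0) returns a path like
# "<DISK_LINKS_DIR>/inst1.example.com:0" for the instance's first disk.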
2778
2779 def _SymlinkBlockDev(instance_name, device_path, idx):
2780 """Set up symlinks to a instance's block device.
2781
2782 This is an auxiliary function run when an instance is started (on the
2783 primary node) or when an instance is migrated (on the target node).
2784
2785
2786 @param instance_name: the name of the target instance
2787 @param device_path: path of the physical block device, on the node
2788 @param idx: the disk index
2789 @return: absolute path to the disk's symlink
2790
2791 """
2792 # In case we have only a userspace access URI, device_path is None
2793 if not device_path:
2794 return None
2795
2796 link_name = _GetBlockDevSymlinkPath(instance_name, idx)
2797 try:
2798 os.symlink(device_path, link_name)
2799 except OSError, err:
2800 if err.errno == errno.EEXIST:
2801 if (not os.path.islink(link_name) or
2802 os.readlink(link_name) != device_path):
2803 os.remove(link_name)
2804 os.symlink(device_path, link_name)
2805 else:
2806 raise
2807
2808 return link_name
2809
2810
2811 def _RemoveBlockDevLinks(instance_name, disks):
2812 """Remove the block device symlinks belonging to the given instance.
2813
2814 """
2815 for idx, _ in enumerate(disks):
2816 link_name = _GetBlockDevSymlinkPath(instance_name, idx)
2817 if os.path.islink(link_name):
2818 try:
2819 os.remove(link_name)
2820 except OSError:
2821 logging.exception("Can't remove symlink '%s'", link_name)
2822
2823
2824 def _CalculateDeviceURI(instance, disk, device):
2825 """Get the URI for the device.
2826
2827 @type instance: L{objects.Instance}
2828 @param instance: the instance which disk belongs to
2829 @type disk: L{objects.Disk}
2830 @param disk: the target disk object
2831 @type device: L{bdev.BlockDev}
2832 @param device: the corresponding BlockDevice
2833 @rtype: string
2834 @return: the device uri if any else None
2835
2836 """
2837 access_mode = disk.params.get(constants.LDP_ACCESS,
2838 constants.DISK_KERNELSPACE)
2839 if access_mode == constants.DISK_USERSPACE:
2840 # This can raise errors.BlockDeviceError
2841 return device.GetUserspaceAccessUri(instance.hypervisor)
2842 else:
2843 return None
2844
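# Example (illustrative sketch only): for a disk whose
# disk.params[constants.LDP_ACCESS] is constants.DISK_USERSPACE, the URI is
# whatever the block device class reports for the instance's hypervisor;
# for an RBD disk under KVM this might look like "rbd:<pool>/<volume>".
# Kernelspace disks simply yield None.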
2845
2846 def _GatherAndLinkBlockDevs(instance):
2847 """Set up an instance's block device(s).
2848
2849 This is run on the primary node at instance startup. The block
2850 devices must be already assembled.
2851
2852 @type instance: L{objects.Instance}
2853 @param instance: the instance whose disks we should assemble
2854 @rtype: list
2855 @return: list of (disk_object, link_name, drive_uri)
2856
2857 """
2858 block_devices = []
2859 for idx, disk in enumerate(instance.disks_info):
2860 device = _RecursiveFindBD(disk)
2861 if device is None:
2862 raise errors.BlockDeviceError("Block device '%s' is not set up." %
2863 str(disk))
2864 device.Open()
2865 try:
2866 link_name = _SymlinkBlockDev(instance.name, device.dev_path, idx)
2867 except OSError, e:
2868 raise errors.BlockDeviceError("Cannot create block device symlink: %s" %
2869 e.strerror)
2870 uri = _CalculateDeviceURI(instance, disk, device)
2871
2872 block_devices.append((disk, link_name, uri))
2873
2874 return block_devices
2875
2876
2877 def _IsInstanceUserDown(instance_info):
2878 return instance_info and \
2879 "state" in instance_info and \
2880 hv_base.HvInstanceState.IsShutdown(instance_info["state"])
2881
2882
2883 def _GetInstanceInfo(instance):
2884 """Helper function L{GetInstanceInfo}"""
2885 return GetInstanceInfo(instance.name, instance.hypervisor,
2886 hvparams=instance.hvparams)
2887
2888
2889 def StartInstance(instance, startup_paused, reason, store_reason=True):
2890 """Start an instance.
2891
2892 @type instance: L{objects.Instance}
2893 @param instance: the instance object
2894 @type startup_paused: bool
2895 @param startup_paused: whether to pause the instance at startup
2896 @type reason: list of reasons
2897 @param reason: the reason trail for this startup
2898 @type store_reason: boolean
2899 @param store_reason: whether to store the shutdown reason trail on file
2900 @rtype: None
2901
2902 """
2903 try:
2904 instance_info = _GetInstanceInfo(instance)
2905 hyper = hypervisor.GetHypervisor(instance.hypervisor)
2906
2907 if instance_info and not _IsInstanceUserDown(instance_info):
2908 logging.info("Instance '%s' already running, not starting", instance.name)
2909 if hyper.VerifyInstance(instance):
2910 return
2911 logging.info("Instance '%s' hypervisor config out of date. Restoring.",
2912 instance.name)
2913 block_devices = _GatherAndLinkBlockDevs(instance)
2914 hyper.RestoreInstance(instance, block_devices)
2915 return
2916
2917 block_devices = _GatherAndLinkBlockDevs(instance)
2918 hyper.StartInstance(instance, block_devices, startup_paused)
2919 if store_reason:
2920 _StoreInstReasonTrail(instance.name, reason)
2921 except errors.BlockDeviceError, err:
2922 _Fail("Block device error: %s", err, exc=True)
2923 except errors.HypervisorError, err:
2924 _RemoveBlockDevLinks(instance.name, instance.disks_info)
2925 _Fail("Hypervisor error: %s", err, exc=True)
2926
2927
2928 def InstanceShutdown(instance, timeout, reason, store_reason=True):
2929 """Shut an instance down.
2930
2931 @note: this function uses polling with a hardcoded retry interval.
2932
2933 @type instance: L{objects.Instance}
2934 @param instance: the instance object
2935 @type timeout: integer
2936 @param timeout: maximum timeout for soft shutdown
2937 @type reason: list of reasons
2938 @param reason: the reason trail for this shutdown
2939 @type store_reason: boolean
2940 @param store_reason: whether to store the shutdown reason trail on file
2941 @rtype: None
2942
2943 """
2944 hyper = hypervisor.GetHypervisor(instance.hypervisor)
2945
2946 if not _GetInstanceInfo(instance):
2947 logging.info("Instance '%s' not running, doing nothing", instance.name)
2948 return
2949
2950 class _TryShutdown(object):
2951 def __init__(self):
2952 self.tried_once = False
2953
2954 def __call__(self):
2955 try:
2956 hyper.StopInstance(instance, retry=self.tried_once, timeout=timeout)
2957 if store_reason:
2958 _StoreInstReasonTrail(instance.name, reason)
2959 except errors.HypervisorError, err:
2960 # if the instance no longer exists, consider this a success and go
2961 # to cleanup; otherwise fail without retrying
2962 if _GetInstanceInfo(instance):
2963 _Fail("Failed to stop instance '%s': %s", instance.name, err)
2964 return
2965
2966 # TODO: Cleanup hypervisor implementations to prevent them from failing
2967 # silently. We could easily decide if we want to retry or not by using
2968 # HypervisorSoftError()/HypervisorHardError()
2969 self.tried_once = True
2970 if _GetInstanceInfo(instance):
2971 raise utils.RetryAgain()
2972
2973 try:
2974 utils.Retry(_TryShutdown(), 5, timeout)
2975 except utils.RetryTimeout:
2976 # the shutdown did not succeed
2977 logging.error("Shutdown of '%s' unsuccessful, forcing", instance.name)
2978
2979 try:
2980 hyper.StopInstance(instance, force=True)
2981 except errors.HypervisorError, err:
2982 # only raise an error if the instance still exists, otherwise
2983 # the error could simply be "instance ... unknown"!
2984 if _GetInstanceInfo(instance):
2985 _Fail("Failed to force stop instance '%s': %s", instance.name, err)
2986
2987 time.sleep(1)
2988
2989 if _GetInstanceInfo(instance):
2990 _Fail("Could not shutdown instance '%s' even by destroy", instance.name)
2991
2992 try:
2993 hyper.CleanupInstance(instance.name)
2994 except errors.HypervisorError, err:
2995 logging.warning("Failed to execute post-shutdown cleanup step: %s", err)
2996
2997 _RemoveBlockDevLinks(instance.name, instance.disks_info)
2998
2999
3000 def InstanceReboot(instance, reboot_type, shutdown_timeout, reason):
3001 """Reboot an instance.
3002
3003 @type instance: L{objects.Instance}
3004 @param instance: the instance object to reboot
3005 @type reboot_type: str
3006 @param reboot_type: the type of reboot, one of the following
3007 constants:
3008 - L{constants.INSTANCE_REBOOT_SOFT}: only reboot the
3009 instance OS, do not recreate the VM
3010 - L{constants.INSTANCE_REBOOT_HARD}: tear down and
3011 restart the VM (at the hypervisor level)
3012 - the other reboot type (L{constants.INSTANCE_REBOOT_FULL}) is
3013 not accepted here, since that mode is handled differently, in
3014 cmdlib, and translates into full stop and start of the
3015 instance (instead of a call_instance_reboot RPC)
3016 @type shutdown_timeout: integer
3017 @param shutdown_timeout: maximum timeout for soft shutdown
3018 @type reason: list of reasons
3019 @param reason: the reason trail for this reboot
3020 @rtype: None
3021
3022 """
3023 # TODO: this is inconsistent with 'StartInstance' and 'InstanceShutdown'
3024 # because those functions simply 'return' on error whereas this one
3025 # raises an exception with '_Fail'
3026 if not _GetInstanceInfo(instance):
3027 _Fail("Cannot reboot instance '%s' that is not running", instance.name)
3028
3029 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3030 if reboot_type == constants.INSTANCE_REBOOT_SOFT:
3031 try:
3032 hyper.RebootInstance(instance)
3033 except errors.HypervisorError, err:
3034 _Fail("Failed to soft reboot instance '%s': %s", instance.name, err)
3035 elif reboot_type == constants.INSTANCE_REBOOT_HARD:
3036 try:
3037 InstanceShutdown(instance, shutdown_timeout, reason, store_reason=False)
3038 StartInstance(instance, False, reason, store_reason=False)
3039 _StoreInstReasonTrail(instance.name, reason)
3040 except errors.HypervisorError, err:
3041 _Fail("Failed to hard reboot instance '%s': %s", instance.name, err)
3042 else:
3043 _Fail("Invalid reboot_type received: '%s'", reboot_type)
3044
3045
3046 def InstanceBalloonMemory(instance, memory):
3047 """Resize an instance's memory.
3048
3049 @type instance: L{objects.Instance}
3050 @param instance: the instance object
3051 @type memory: int
3052 @param memory: new memory amount in MB
3053 @rtype: None
3054
3055 """
3056 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3057 running = hyper.ListInstances(hvparams=instance.hvparams)
3058 if instance.name not in running:
3059 logging.info("Instance %s is not running, cannot balloon", instance.name)
3060 return
3061 try:
3062 hyper.BalloonInstanceMemory(instance, memory)
3063 except errors.HypervisorError, err:
3064 _Fail("Failed to balloon instance memory: %s", err, exc=True)
3065
3066
3067 def MigrationInfo(instance):
3068 """Gather information about an instance to be migrated.
3069
3070 @type instance: L{objects.Instance}
3071 @param instance: the instance definition
3072
3073 """
3074 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3075 try:
3076 info = hyper.MigrationInfo(instance)
3077 except errors.HypervisorError, err:
3078 _Fail("Failed to fetch migration information: %s", err, exc=True)
3079 return info
3080
3081
3082 def AcceptInstance(instance, info, target):
3083 """Prepare the node to accept an instance.
3084
3085 @type instance: L{objects.Instance}
3086 @param instance: the instance definition
3087 @type info: string/data (opaque)
3088 @param info: migration information, from the source node
3089 @type target: string
3090 @param target: target host (usually ip), on this node
3091
3092 """
3093 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3094 try:
3095 hyper.AcceptInstance(instance, info, target)
3096 except errors.HypervisorError, err:
3097 _Fail("Failed to accept instance: %s", err, exc=True)
3098
3099
3100 def FinalizeMigrationDst(instance, info, success):
3101 """Finalize any preparation to accept an instance.
3102
3103 @type instance: L{objects.Instance}
3104 @param instance: the instance definition
3105 @type info: string/data (opaque)
3106 @param info: migration information, from the source node
3107 @type success: boolean
3108 @param success: whether the migration was a success or a failure
3109
3110 """
3111 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3112 try:
3113 hyper.FinalizeMigrationDst(instance, info, success)
3114 except errors.HypervisorError, err:
3115 _Fail("Failed to finalize migration on the target node: %s", err, exc=True)
3116
3117
3118 def MigrateInstance(cluster_name, instance, target, live):
3119 """Migrates an instance to another node.
3120
3121 @type cluster_name: string
3122 @param cluster_name: name of the cluster
3123 @type instance: L{objects.Instance}
3124 @param instance: the instance definition
3125 @type target: string
3126 @param target: the target node name
3127 @type live: boolean
3128 @param live: whether the migration should be done live or not (the
3129 interpretation of this parameter is left to the hypervisor)
3130 @raise RPCFail: if migration fails for some reason
3131
3132 """
3133 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3134
3135 try:
3136 hyper.MigrateInstance(cluster_name, instance, target, live)
3137 except errors.HypervisorError, err:
3138 _Fail("Failed to migrate instance: %s", err, exc=True)
3139
3140
def FinalizeMigrationSource(instance, success, live):
  """Finalize the instance migration on the source node.

  @type instance: L{objects.Instance}
  @param instance: the instance definition of the migrated instance
  @type success: bool
  @param success: whether the migration succeeded or not
  @type live: bool
  @param live: whether the user requested a live migration or not
  @raise RPCFail: If the execution fails for some reason

  """
  hyper = hypervisor.GetHypervisor(instance.hypervisor)

  try:
    hyper.FinalizeMigrationSource(instance, success, live)
  except Exception, err:  # pylint: disable=W0703
    _Fail("Failed to finalize the migration on the source node: %s", err,
          exc=True)

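# Taken together, MigrationInfo, AcceptInstance, MigrateInstance and the two
# Finalize* functions implement the node-side half of an instance migration.
# A rough sketch of the order in which the master drives them over RPC (error
# handling, status polling and the RPC plumbing itself are omitted; "source"
# and "target" mark the node each call runs on, and the exact ordering of the
# finalization calls is an approximation):
#
#   info = MigrationInfo(instance)                         # source
#   AcceptInstance(instance, info, target_ip)              # target
#   MigrateInstance(cluster_name, instance, target, live)  # source
#   FinalizeMigrationSource(instance, success, live)       # source
#   FinalizeMigrationDst(instance, info, success)          # target
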
def GetMigrationStatus(instance):
  """Get the migration status.

  @type instance: L{objects.Instance}
  @param instance: the instance that is being migrated
  @rtype: L{objects.MigrationStatus}
  @return: the status of the current migration (one of
      L{constants.HV_MIGRATION_VALID_STATUSES}), plus any additional
      progress info that can be retrieved from the hypervisor
  @raise RPCFail: If the migration status cannot be retrieved

  """
  hyper = hypervisor.GetHypervisor(instance.hypervisor)
  try:
    return hyper.GetMigrationStatus(instance)
  except Exception, err:  # pylint: disable=W0703
    _Fail("Failed to get migration status: %s", err, exc=True)

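# A hedged sketch of how a caller might poll GetMigrationStatus() until the
# hypervisor leaves the active state (this assumes the "active" status
# constant from constants.py; the two-second interval is illustrative):
#
#   while True:
#     status = GetMigrationStatus(instance)
#     if status.status != constants.HV_MIGRATION_ACTIVE:
#       break
#     time.sleep(2)
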
def HotplugDevice(instance, action, dev_type, device, extra, seq):
  """Hotplug a device

  Hotplug is currently supported only for the KVM hypervisor.

  @type instance: L{objects.Instance}
  @param instance: the instance to which we hotplug a device
  @type action: string
  @param action: the hotplug action to perform
  @type dev_type: string
  @param dev_type: the device type to hotplug
  @type device: either L{objects.NIC} or L{objects.Disk}
  @param device: the device object to hotplug
  @type extra: tuple
  @param extra: extra info used for disk hotplug (disk link, drive uri)
  @type seq: int
  @param seq: the index of the device from the master's perspective
  @raise RPCFail: in case the instance does not run the KVM hypervisor

  """
  hyper = hypervisor.GetHypervisor(instance.hypervisor)
  try:
    hyper.VerifyHotplugSupport(instance, action, dev_type)
  except errors.HotplugError, err:
    _Fail("Hotplug is not supported: %s", err)

  if action == constants.HOTPLUG_ACTION_ADD:
    fn = hyper.HotAddDevice
  elif action == constants.HOTPLUG_ACTION_REMOVE:
    fn = hyper.HotDelDevice
  elif action == constants.HOTPLUG_ACTION_MODIFY:
    fn = hyper.HotModDevice
  else:
    # An unknown action always fails this assertion, since
    # HOTPLUG_ALL_ACTIONS only contains the actions handled above
    assert action in constants.HOTPLUG_ALL_ACTIONS

  return fn(instance, dev_type, device, extra, seq)

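# A minimal sketch of hot-adding a NIC through the dispatcher above (the NIC
# object and sequence number are illustrative, and passing None for "extra"
# is an assumption based on its docstring, which says it is only used for
# disk hotplug):
#
#   HotplugDevice(instance, constants.HOTPLUG_ACTION_ADD,
#                 constants.HOTPLUG_TARGET_NIC, nic, None, len(instance.nics))
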
def HotplugSupported(instance):
  """Checks if hotplug is generally supported.

  """
  hyper = hypervisor.GetHypervisor(instance.hypervisor)
  try:
    hyper.HotplugSupported(instance)
  except errors.HotplugError, err:
    _Fail("Hotplug is not supported: %s", err)

def ModifyInstanceMetadata(metadata):
  """Sends instance data to the metadata daemon.

  Uses the Luxi transport layer to communicate with the metadata
  daemon configuration server. It starts the metadata daemon if it is
  not running.
  The daemon must be enabled at configuration time.

  @type metadata: dict
  @param metadata: instance metadata obtained by calling
      L{objects.Instance.ToDict} on an instance object

  """
  if not constants.ENABLE_METAD:
    raise errors.ProgrammerError("The metadata daemon is disabled, yet"
                                 " ModifyInstanceMetadata has been called")

  if not utils.IsDaemonAlive(constants.METAD):
    result = utils.RunCmd([pathutils.DAEMON_UTIL, "start", constants.METAD])
    if result.failed:
      raise errors.HypervisorError("Failed to start metadata daemon")

  with contextlib.closing(metad.Client()) as client:
    client.UpdateConfig(metadata)

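# A minimal usage sketch, following the docstring above: serialize the
# instance object and hand it to the metadata daemon (this assumes the
# daemon was enabled at configuration time):
#
#   ModifyInstanceMetadata(instance.ToDict())
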
def BlockdevCreate(disk, size, owner, on_primary, info, excl_stor):
  """Creates a block device for an instance.

  @type disk: L{objects.Disk}
  @param disk: the object describing the disk we should create
  @type size: int
  @param size: the size of the physical underlying device, in MiB
  @type owner: str
  @param owner: the name of the instance for which disk is created,
      used for device cache data
  @type on_primary: boolean
  @param on_primary: indicates if it is the primary node or not
  @type info: string
  @param info: string that will be sent to the physical device
      creation, used for example to set (LVM) tags on LVs
  @type excl_stor: boolean
  @param excl_stor: Whether exclusive_storage is active

  @return: the new unique_id of the device (this can sometimes be
      computed only after creation), or None. On secondary nodes,
      it's not required to return anything.

  """
  # TODO: remove the obsolete "size" argument
  # pylint: disable=W0613
  clist = []
  if disk.children:
    for child in disk.children:
      try:
        crdev = _RecursiveAssembleBD(child, owner, on_primary)
      except errors.BlockDeviceError, err:
        _Fail("Can't assemble device %s: %s", child, err)
      if on_primary or disk.AssembleOnSecondary():
        # we need the children open in case the device itself has to
        # be assembled
        try:
          # pylint: disable=E1103
          crdev.Open()
        except errors.BlockDeviceError, err:
          _Fail("Can't make child '%s' read-write: %s", child, err)
      clist.append(crdev)

  try:
    device = bdev.Create(disk, clist, excl_stor)
  except errors.BlockDeviceError, err:
    _Fail("Can't create block device: %s", err)

  if on_primary or disk.AssembleOnSecondary():
    try:
      device.Assemble()
    except errors.BlockDeviceError, err:
      _Fail("Can't assemble device after creation, unusual event: %s", err)
    if on_primary or disk.OpenOnSecondary():
      try:
        device.Open(force=True)
      except errors.BlockDeviceError, err:
        _Fail("Can't make device r/w after creation, unusual event: %s", err)
    DevCacheManager.UpdateCache(device.dev_path, owner,
                                on_primary, disk.iv_name)

  device.SetInfo(info)

  return device.unique_id

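# A hedged sketch of creating a plain LVM-backed disk on the primary node
# (the disk object is illustrative only; real callers receive fully
# populated objects.Disk instances over RPC, and the volume group name
# "xenvg" is an assumption):
#
#   disk = objects.Disk(dev_type=constants.DT_PLAIN, size=1024,
#                       logical_id=("xenvg", "disk0"), children=[],
#                       iv_name="disk/0")
#   unique_id = BlockdevCreate(disk, disk.size, "inst1.example.com",
#                              True, "tag-for-lv", False)
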
def _DumpDevice(source_path, target_path, offset, size, truncate):
  """This function images/wipes the device using a local file.

  @type source_path: string
  @param source_path: path of the image or data source (e.g., "/dev/zero")

  @type target_path: string
  @param target_path: path of the device to image/wipe

  @type offset: int
  @param offset: offset in MiB in the output file

  @type size: int
  @param size: maximum size in MiB to write (data source might be smaller)

  @type truncate: bool
  @param truncate: whether the file should be truncated

  @return: None
  @raise RPCFail: in case of failure

  """
  # Internal sizes are always in Mebibytes; if the following "dd" command
  # should use a different block size the offset and size given to this
  # function must be adjusted accordingly before being passed to "dd".
  block_size = constants.DD_BLOCK_SIZE

  cmd = [constants.DD_CMD, "if=%s" % source_path, "seek=%d" % offset,
         "bs=%s" % block_size, "oflag=direct", "of=%s" % target_path,
         "count=%d" % size]

  if not truncate:
    cmd.append("conv=notrunc")
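# For illustration, assuming DD_BLOCK_SIZE is one MiB (1048576 bytes), the
# command assembled above for a 1024 MiB wipe of "/dev/xvdb" from "/dev/zero"
# would look roughly like this, with "conv=notrunc" appended only when
# truncate is False:
#
#   dd if=/dev/zero seek=0 bs=1048576 oflag=direct of=/dev/xvdb count=1024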