#
#

# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014 Google Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


"""Functions used by the node daemon

@var _ALLOWED_UPLOAD_FILES: denotes which files are accepted in
    the L{UploadFile} function
@var _ALLOWED_CLEAN_DIRS: denotes which directories are accepted
    in the L{_CleanDirectory} function

"""

# pylint: disable=E1103,C0302

# E1103: %s %r has no %r member (but some types could not be
# inferred), because _TryOSFromDisk returns either (True, os_obj)
# or (False, "string"), which confuses pylint

# C0302: This module has become too big and should be split up


import base64
import contextlib
import collections
import errno
import logging
import os
import os.path
import random
import re
import shutil
import signal
import stat
import tempfile
import time
import zlib

import pycurl

from ganeti import errors
from ganeti import http
from ganeti import utils
from ganeti import ssh
from ganeti import hypervisor
from ganeti.hypervisor import hv_base
from ganeti import constants
from ganeti.storage import bdev
from ganeti.storage import drbd
from ganeti.storage import extstorage
from ganeti.storage import filestorage
from ganeti import objects
from ganeti import ssconf
from ganeti import serializer
from ganeti import netutils
from ganeti import runtime
from ganeti import compat
from ganeti import pathutils
from ganeti import vcluster
from ganeti import ht
from ganeti.storage.base import BlockDev
from ganeti.storage.drbd import DRBD8
from ganeti import hooksmaster
import ganeti.metad as metad


_BOOT_ID_PATH = "/proc/sys/kernel/random/boot_id"
_ALLOWED_CLEAN_DIRS = compat.UniqueFrozenset([
  pathutils.DATA_DIR,
  pathutils.JOB_QUEUE_ARCHIVE_DIR,
  pathutils.QUEUE_DIR,
  pathutils.CRYPTO_KEYS_DIR,
  ])
_MAX_SSL_CERT_VALIDITY = 7 * 24 * 60 * 60
_X509_KEY_FILE = "key"
_X509_CERT_FILE = "cert"
_IES_STATUS_FILE = "status"
_IES_PID_FILE = "pid"
_IES_CA_FILE = "ca"

#: Valid LVS output line regex
_LVSLINE_REGEX = re.compile(r"^ *([^|]+)\|([^|]+)\|([0-9.]+)\|([^|]{6,})\|?$")

# Actions for the master setup script
_MASTER_START = "start"
_MASTER_STOP = "stop"

#: Maximum file permissions for restricted command directory and executables
_RCMD_MAX_MODE = (stat.S_IRWXU |
                  stat.S_IRGRP | stat.S_IXGRP |
                  stat.S_IROTH | stat.S_IXOTH)

#: Delay before returning an error for restricted commands
_RCMD_INVALID_DELAY = 10

#: How long to wait to acquire lock for restricted commands (shorter than
#: L{_RCMD_INVALID_DELAY}) to reduce blockage of noded forks when many
#: command requests arrive
_RCMD_LOCK_TIMEOUT = _RCMD_INVALID_DELAY * 0.8


class RPCFail(Exception):
  """Class denoting RPC failure.

  Its argument is the error message.

  """


def _GetInstReasonFilename(instance_name):
  """Path of the file containing the reason of the instance status change.

  @type instance_name: string
  @param instance_name: The name of the instance
  @rtype: string
  @return: The path of the file

  """
  return utils.PathJoin(pathutils.INSTANCE_REASON_DIR, instance_name)


def _StoreInstReasonTrail(instance_name, trail):
  """Serialize a reason trail related to an instance change of state to file.

  The exact location of the file depends on the name of the instance and on
  the configuration of the Ganeti cluster defined at deploy time.

  @type instance_name: string
  @param instance_name: The name of the instance

  @type trail: list of reasons
  @param trail: reason trail

  @rtype: None

  """
  json = serializer.DumpJson(trail)
  filename = _GetInstReasonFilename(instance_name)
  utils.WriteFile(filename, data=json)


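# A reason trail is a list of (source, reason, timestamp) triples; an
# illustrative sketch with made-up values:
#
#   _StoreInstReasonTrail("instance1.example.com",
#                         [("gnt:client:gnt-instance", "user shutdown",
#                           1234567890)])
#
# The trail ends up JSON-serialized in a file under
# pathutils.INSTANCE_REASON_DIR, named after the instance.

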
def _Fail(msg, *args, **kwargs):
  """Log an error and then raise an RPCFail exception.

  This exception is then handled specially in the ganeti daemon and
  turned into a 'failed' return type. As such, this function is a
  useful shortcut for logging the error and returning it to the master
  daemon.

  @type msg: string
  @param msg: the text of the exception
  @raise RPCFail: in all cases

  """
  if args:
    msg = msg % args
  if "log" not in kwargs or kwargs["log"]: # if we should log this error
    if "exc" in kwargs and kwargs["exc"]:
      logging.exception(msg)
    else:
      logging.error(msg)
  raise RPCFail(msg)


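# Illustrative sketch of how _Fail is used throughout this module (the
# disk variables below are hypothetical, not real identifiers):
#
#   if disk is None:
#     _Fail("Disk %s not found", disk_name)
#   try:
#     disk.Open()
#   except errors.BlockDeviceError, err:
#     _Fail("Can't open disk %s: %s", disk_name, err, exc=True)
#
# Passing exc=True additionally logs the current exception's traceback;
# log=False suppresses logging while still raising RPCFail.

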
def _GetConfig():
  """Simple wrapper to return a SimpleStore.

  @rtype: L{ssconf.SimpleStore}
  @return: a SimpleStore instance

  """
  return ssconf.SimpleStore()


def _GetSshRunner(cluster_name):
  """Simple wrapper to return an SshRunner.

  @type cluster_name: str
  @param cluster_name: the cluster name, which is needed
      by the SshRunner constructor
  @rtype: L{ssh.SshRunner}
  @return: an SshRunner instance

  """
  return ssh.SshRunner(cluster_name)


def _Decompress(data):
  """Unpacks data compressed by the RPC client.

  @type data: list or tuple
  @param data: Data sent by RPC client
  @rtype: str
  @return: Decompressed data

  """
  assert isinstance(data, (list, tuple))
  assert len(data) == 2
  (encoding, content) = data
  if encoding == constants.RPC_ENCODING_NONE:
    return content
  elif encoding == constants.RPC_ENCODING_ZLIB_BASE64:
    return zlib.decompress(base64.b64decode(content))
  else:
    raise AssertionError("Unknown data encoding")


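# Illustrative round trip through _Decompress, mirroring what the RPC
# client does on the sending side (a sketch, not production code):
#
#   packed = (constants.RPC_ENCODING_ZLIB_BASE64,
#             base64.b64encode(zlib.compress("some payload")))
#   assert _Decompress(packed) == "some payload"
#
# With constants.RPC_ENCODING_NONE the content is passed through as-is.

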
def _CleanDirectory(path, exclude=None):
  """Removes all regular files in a directory.

  @type path: str
  @param path: the directory to clean
  @type exclude: list
  @param exclude: list of files to be excluded, defaults
      to the empty list

  """
  if path not in _ALLOWED_CLEAN_DIRS:
    _Fail("Path passed to _CleanDirectory not in allowed clean targets: '%s'",
          path)

  if not os.path.isdir(path):
    return
  if exclude is None:
    exclude = []
  else:
    # Normalize excluded paths
    exclude = [os.path.normpath(i) for i in exclude]

  for rel_name in utils.ListVisibleFiles(path):
    full_name = utils.PathJoin(path, rel_name)
    if full_name in exclude:
      continue
    if os.path.isfile(full_name) and not os.path.islink(full_name):
      utils.RemoveFile(full_name)


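# Example (sketch): wiping the job queue directory while keeping its lock
# file, exactly as JobQueuePurge below does:
#
#   _CleanDirectory(pathutils.QUEUE_DIR,
#                   exclude=[pathutils.JOB_QUEUE_LOCK_FILE])
#
# Directories outside _ALLOWED_CLEAN_DIRS are rejected via _Fail, so this
# helper cannot be pointed at arbitrary paths.

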
def _BuildUploadFileList():
  """Build the list of allowed upload files.

  This is abstracted so that it's built only once at module import time.

  """
  allowed_files = set([
    pathutils.CLUSTER_CONF_FILE,
    pathutils.ETC_HOSTS,
    pathutils.SSH_KNOWN_HOSTS_FILE,
    pathutils.VNC_PASSWORD_FILE,
    pathutils.RAPI_CERT_FILE,
    pathutils.SPICE_CERT_FILE,
    pathutils.SPICE_CACERT_FILE,
    pathutils.RAPI_USERS_FILE,
    pathutils.CONFD_HMAC_KEY,
    pathutils.CLUSTER_DOMAIN_SECRET_FILE,
    ])

  for hv_name in constants.HYPER_TYPES:
    hv_class = hypervisor.GetHypervisorClass(hv_name)
    allowed_files.update(hv_class.GetAncillaryFiles()[0])

  assert pathutils.FILE_STORAGE_PATHS_FILE not in allowed_files, \
    "Allowed file storage paths should never be uploaded via RPC"

  return frozenset(allowed_files)


_ALLOWED_UPLOAD_FILES = _BuildUploadFileList()


def JobQueuePurge():
  """Removes job queue files and archived jobs.

  @rtype: None

  """
  _CleanDirectory(pathutils.QUEUE_DIR, exclude=[pathutils.JOB_QUEUE_LOCK_FILE])
  _CleanDirectory(pathutils.JOB_QUEUE_ARCHIVE_DIR)


def GetMasterNodeName():
  """Returns the master node name.

  @rtype: string
  @return: name of the master node
  @raise RPCFail: in case of errors

  """
  try:
    return _GetConfig().GetMasterNode()
  except errors.ConfigurationError, err:
    _Fail("Cluster configuration incomplete: %s", err, exc=True)


def RunLocalHooks(hook_opcode, hooks_path, env_builder_fn):
  """Decorator that runs hooks before and after the decorated function.

  @type hook_opcode: string
  @param hook_opcode: opcode of the hook
  @type hooks_path: string
  @param hooks_path: path of the hooks
  @type env_builder_fn: function
  @param env_builder_fn: function that returns a dictionary containing the
    environment variables for the hooks. Will get all the parameters of the
    decorated function.
  @raise RPCFail: in case of pre-hook failure

  """
  def decorator(fn):
    def wrapper(*args, **kwargs):
      _, myself = ssconf.GetMasterAndMyself()
      nodes = ([myself], [myself]) # these hooks run locally

      env_fn = compat.partial(env_builder_fn, *args, **kwargs)

      cfg = _GetConfig()
      hr = HooksRunner()
      hm = hooksmaster.HooksMaster(hook_opcode, hooks_path, nodes,
                                   hr.RunLocalHooks, None, env_fn, None,
                                   logging.warning, cfg.GetClusterName(),
                                   cfg.GetMasterNode())
      hm.RunPhase(constants.HOOKS_PHASE_PRE)
      result = fn(*args, **kwargs)
      hm.RunPhase(constants.HOOKS_PHASE_POST)

      return result
    return wrapper
  return decorator


def _BuildMasterIpEnv(master_params, use_external_mip_script=None):
  """Builds environment variables for master IP hooks.

  @type master_params: L{objects.MasterNetworkParameters}
  @param master_params: network parameters of the master
  @type use_external_mip_script: boolean
  @param use_external_mip_script: whether to use an external master IP
    address setup script (unused, but necessary per the implementation of
    the L{RunLocalHooks} decorator)

  """
  # pylint: disable=W0613
  ver = netutils.IPAddress.GetVersionFromAddressFamily(master_params.ip_family)
  env = {
    "MASTER_NETDEV": master_params.netdev,
    "MASTER_IP": master_params.ip,
    "MASTER_NETMASK": str(master_params.netmask),
    "CLUSTER_IP_VERSION": str(ver),
  }

  return env


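# For a hypothetical IPv4 master on eth0, the resulting hook environment
# would look like this (all values are made up for illustration):
#
#   {
#     "MASTER_NETDEV": "eth0",
#     "MASTER_IP": "192.0.2.1",
#     "MASTER_NETMASK": "24",
#     "CLUSTER_IP_VERSION": "4",
#   }

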
def _RunMasterSetupScript(master_params, action, use_external_mip_script):
  """Execute the master IP address setup script.

  @type master_params: L{objects.MasterNetworkParameters}
  @param master_params: network parameters of the master
  @type action: string
  @param action: action to pass to the script. Must be one of
    L{backend._MASTER_START} or L{backend._MASTER_STOP}
  @type use_external_mip_script: boolean
  @param use_external_mip_script: whether to use an external master IP
    address setup script
  @raise backend.RPCFail: if there are errors during the execution of the
    script

  """
  env = _BuildMasterIpEnv(master_params)

  if use_external_mip_script:
    setup_script = pathutils.EXTERNAL_MASTER_SETUP_SCRIPT
  else:
    setup_script = pathutils.DEFAULT_MASTER_SETUP_SCRIPT

  result = utils.RunCmd([setup_script, action], env=env, reset_env=True)

  if result.failed:
    _Fail("Failed to %s the master IP. Script return value: %s, output: '%s'" %
          (action, result.exit_code, result.output), log=True)


@RunLocalHooks(constants.FAKE_OP_MASTER_TURNUP, "master-ip-turnup",
               _BuildMasterIpEnv)
def ActivateMasterIp(master_params, use_external_mip_script):
  """Activate the IP address of the master daemon.

  @type master_params: L{objects.MasterNetworkParameters}
  @param master_params: network parameters of the master
  @type use_external_mip_script: boolean
  @param use_external_mip_script: whether to use an external master IP
    address setup script
  @raise RPCFail: in case of errors during the IP startup

  """
  _RunMasterSetupScript(master_params, _MASTER_START,
                        use_external_mip_script)


def StartMasterDaemons(no_voting):
  """Activate local node as master node.

  The function will start the master daemons (ganeti-luxid, ganeti-wconfd
  and ganeti-rapi).

  @type no_voting: boolean
  @param no_voting: whether to start the master daemons without a node vote
      but still non-interactively
  @rtype: None

  """
  if no_voting:
    daemon_args = "--no-voting --yes-do-it"
  else:
    daemon_args = ""

  env = {
    "EXTRA_LUXID_ARGS": daemon_args,
    "EXTRA_WCONFD_ARGS": daemon_args,
  }

  result = utils.RunCmd([pathutils.DAEMON_UTIL, "start-master"], env=env)
  if result.failed:
    msg = "Can't start Ganeti master: %s" % result.output
    logging.error(msg)
    _Fail(msg)


@RunLocalHooks(constants.FAKE_OP_MASTER_TURNDOWN, "master-ip-turndown",
               _BuildMasterIpEnv)
def DeactivateMasterIp(master_params, use_external_mip_script):
  """Deactivate the master IP on this node.

  @type master_params: L{objects.MasterNetworkParameters}
  @param master_params: network parameters of the master
  @type use_external_mip_script: boolean
  @param use_external_mip_script: whether to use an external master IP
    address setup script
  @raise RPCFail: in case of errors during the IP turndown

  """
  _RunMasterSetupScript(master_params, _MASTER_STOP,
                        use_external_mip_script)


def StopMasterDaemons():
  """Stop the master daemons on this node.

  Stop the master daemons (ganeti-luxid, ganeti-wconfd and ganeti-rapi) on
  this node.

  @rtype: None

  """
  # TODO: log and report back to the caller the error failures; we
  # need to decide in which case we fail the RPC for this

  result = utils.RunCmd([pathutils.DAEMON_UTIL, "stop-master"])
  if result.failed:
    logging.error("Could not stop Ganeti master, command %s had exitcode %s"
                  " and error %s",
                  result.cmd, result.exit_code, result.output)


def ChangeMasterNetmask(old_netmask, netmask, master_ip, master_netdev):
  """Change the netmask of the master IP.

  @param old_netmask: the old value of the netmask
  @param netmask: the new value of the netmask
  @param master_ip: the master IP
  @param master_netdev: the master network device

  """
  if old_netmask == netmask:
    return

  if not netutils.IPAddress.Own(master_ip):
    _Fail("The master IP address is not up, not attempting to change its"
          " netmask")

  result = utils.RunCmd([constants.IP_COMMAND_PATH, "address", "add",
                         "%s/%s" % (master_ip, netmask),
                         "dev", master_netdev, "label",
                         "%s:0" % master_netdev])
  if result.failed:
    _Fail("Could not set the new netmask on the master IP address")

  result = utils.RunCmd([constants.IP_COMMAND_PATH, "address", "del",
                         "%s/%s" % (master_ip, old_netmask),
                         "dev", master_netdev, "label",
                         "%s:0" % master_netdev])
  if result.failed:
    _Fail("Could not bring down the master IP address with the old netmask")


def EtcHostsModify(mode, host, ip):
  """Modify a host entry in /etc/hosts.

  @param mode: The mode to operate. Either add or remove entry
  @param host: The host to operate on
  @param ip: The ip associated with the entry

  """
  if mode == constants.ETC_HOSTS_ADD:
    if not ip:
      _Fail("Mode 'add' needs 'ip' parameter, but parameter not"
            " present")
    utils.AddHostToEtcHosts(host, ip)
  elif mode == constants.ETC_HOSTS_REMOVE:
    if ip:
      _Fail("Mode 'remove' does not allow 'ip' parameter, but"
            " parameter is present")
    utils.RemoveHostFromEtcHosts(host)
  else:
    _Fail("Mode not supported")


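# Example calls (sketch; the host name and address are placeholders):
#
#   EtcHostsModify(constants.ETC_HOSTS_ADD, "node4.example.com", "192.0.2.14")
#   EtcHostsModify(constants.ETC_HOSTS_REMOVE, "node4.example.com", None)
#
# The 'add' mode requires an IP and the 'remove' mode forbids one; both
# failure cases raise RPCFail via _Fail.

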
def LeaveCluster(modify_ssh_setup):
  """Cleans up and removes the current node.

  This function cleans up and prepares the current node to be removed
  from the cluster.

  If processing is successful, then it raises an
  L{errors.QuitGanetiException} which is used as a special case to
  shutdown the node daemon.

  @param modify_ssh_setup: boolean

  """
  _CleanDirectory(pathutils.DATA_DIR)
  _CleanDirectory(pathutils.CRYPTO_KEYS_DIR)
  JobQueuePurge()

  if modify_ssh_setup:
    try:
      priv_key, pub_key, auth_keys = ssh.GetUserFiles(constants.SSH_LOGIN_USER)

      ssh.RemoveAuthorizedKey(auth_keys, utils.ReadFile(pub_key))

      utils.RemoveFile(priv_key)
      utils.RemoveFile(pub_key)
    except errors.OpExecError:
      logging.exception("Error while processing ssh files")
    except IOError:
      logging.exception("At least one SSH file was not accessible.")

  try:
    utils.RemoveFile(pathutils.CONFD_HMAC_KEY)
    utils.RemoveFile(pathutils.RAPI_CERT_FILE)
    utils.RemoveFile(pathutils.SPICE_CERT_FILE)
    utils.RemoveFile(pathutils.SPICE_CACERT_FILE)
    utils.RemoveFile(pathutils.NODED_CERT_FILE)
  except: # pylint: disable=W0702
    logging.exception("Error while removing cluster secrets")

  utils.StopDaemon(constants.CONFD)
  utils.StopDaemon(constants.MOND)
  utils.StopDaemon(constants.KVMD)

  # Raise a custom exception (handled in ganeti-noded)
  raise errors.QuitGanetiException(True, "Shutdown scheduled")


def _CheckStorageParams(params, num_params):
  """Performs sanity checks for storage parameters.

  @type params: list
  @param params: list of storage parameters
  @type num_params: int
  @param num_params: expected number of parameters

  """
  if params is None:
    raise errors.ProgrammerError("No storage parameters for storage"
                                 " reporting is provided.")
  if not isinstance(params, list):
    raise errors.ProgrammerError("The storage parameters are not of type"
                                 " list: '%s'" % params)
  if len(params) != num_params:
    raise errors.ProgrammerError("Did not receive the expected number of"
                                 " storage parameters: expected %s,"
                                 " received '%s'" % (num_params, len(params)))


def _CheckLvmStorageParams(params):
  """Performs sanity check for the 'exclusive storage' flag.

  @see: C{_CheckStorageParams}

  """
  _CheckStorageParams(params, 1)
  excl_stor = params[0]
  if not isinstance(excl_stor, bool):
    raise errors.ProgrammerError("Exclusive storage parameter is not"
                                 " boolean: '%s'." % excl_stor)
  return excl_stor


def _GetLvmVgSpaceInfo(name, params):
  """Wrapper around C{_GetVgInfo} which checks the storage parameters.

  @type name: string
  @param name: name of the volume group
  @type params: list
  @param params: list of storage parameters, which in this case should
    contain only one, for exclusive storage

  """
  excl_stor = _CheckLvmStorageParams(params)
  return _GetVgInfo(name, excl_stor)


def _GetVgInfo(
    name, excl_stor, info_fn=bdev.LogicalVolume.GetVGInfo):
  """Retrieves information about an LVM volume group.

  """
  # TODO: GetVGInfo supports returning information for multiple VGs at once
  vginfo = info_fn([name], excl_stor)
  if vginfo:
    vg_free = int(round(vginfo[0][0], 0))
    vg_size = int(round(vginfo[0][1], 0))
  else:
    vg_free = None
    vg_size = None

  return {
    "type": constants.ST_LVM_VG,
    "name": name,
    "storage_free": vg_free,
    "storage_size": vg_size,
    }


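# The returned dictionary has this shape (the numbers are illustrative MiB
# values for a hypothetical volume group "xenvg"):
#
#   {
#     "type": constants.ST_LVM_VG,   # "lvm-vg"
#     "name": "xenvg",
#     "storage_free": 102400,
#     "storage_size": 204800,
#   }
#
# When the volume group cannot be found, both storage_free and
# storage_size are None.

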
def _GetLvmPvSpaceInfo(name, params):
  """Wrapper around C{_GetVgSpindlesInfo} with sanity checks.

  @see: C{_GetLvmVgSpaceInfo}

  """
  excl_stor = _CheckLvmStorageParams(params)
  return _GetVgSpindlesInfo(name, excl_stor)


def _GetVgSpindlesInfo(
    name, excl_stor, info_fn=bdev.LogicalVolume.GetVgSpindlesInfo):
  """Retrieves information about spindles in an LVM volume group.

  @type name: string
  @param name: VG name
  @type excl_stor: bool
  @param excl_stor: exclusive storage
  @rtype: dict
  @return: dictionary whose keys are "type", "name", "storage_free" and
    "storage_size" for the storage type, VG name, free spindles and total
    spindles respectively

  """
  if excl_stor:
    (vg_free, vg_size) = info_fn(name)
  else:
    vg_free = 0
    vg_size = 0
  return {
    "type": constants.ST_LVM_PV,
    "name": name,
    "storage_free": vg_free,
    "storage_size": vg_size,
    }


def _GetHvInfo(name, hvparams, get_hv_fn=hypervisor.GetHypervisor):
  """Retrieves node information from a hypervisor.

  The information returned depends on the hypervisor. Common items:

    - vg_size is the size of the configured volume group in MiB
    - vg_free is the free size of the volume group in MiB
    - memory_dom0 is the memory allocated for domain0 in MiB
    - memory_free is the currently available (free) RAM in MiB
    - memory_total is the total amount of RAM in MiB
    - hv_version: the hypervisor version, if available

  @type hvparams: dict of string
  @param hvparams: the hypervisor's hvparams

  """
  return get_hv_fn(name).GetNodeInfo(hvparams=hvparams)


def _GetHvInfoAll(hv_specs, get_hv_fn=hypervisor.GetHypervisor):
  """Retrieves node information for all hypervisors.

  See C{_GetHvInfo} for information on the output.

  @type hv_specs: list of pairs (string, dict of strings)
  @param hv_specs: list of pairs of a hypervisor's name and its hvparams

  """
  if hv_specs is None:
    return None

  result = []
  for hvname, hvparams in hv_specs:
    result.append(_GetHvInfo(hvname, hvparams, get_hv_fn))
  return result


def _GetNamedNodeInfo(names, fn):
  """Calls C{fn} for all names in C{names} and returns a list of results.

  @rtype: None or list

  """
  if names is None:
    return None
  else:
    return map(fn, names)


def GetNodeInfo(storage_units, hv_specs):
  """Gives back a hash with different information about the node.

  @type storage_units: list of tuples (string, string, list)
  @param storage_units: List of tuples (storage unit, identifier, parameters)
    to ask for disk space information. In case of lvm-vg, the identifier is
    the VG name. The parameters can contain additional, storage-type-specific
    parameters, for example exclusive storage for lvm storage.
  @type hv_specs: list of pairs (string, dict of strings)
  @param hv_specs: list of pairs of a hypervisor's name and its hvparams
  @rtype: tuple; (string, None/dict, None/dict)
  @return: Tuple containing boot ID, volume group information and hypervisor
    information

  """
  bootid = utils.ReadFile(_BOOT_ID_PATH, size=128).rstrip("\n")
  storage_info = _GetNamedNodeInfo(
    storage_units,
    (lambda (storage_type, storage_key, storage_params):
     _ApplyStorageInfoFunction(storage_type, storage_key, storage_params)))
  hv_info = _GetHvInfoAll(hv_specs)
  return (bootid, storage_info, hv_info)


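# Example arguments (a sketch; the VG name, path and hvparams are
# placeholders):
#
#   storage_units = [
#     (constants.ST_LVM_VG, "xenvg", [True]),   # [exclusive_storage]
#     (constants.ST_FILE, "/srv/ganeti/file-storage", []),
#   ]
#   hv_specs = [(constants.HT_KVM, {})]
#   (bootid, storage_info, hv_info) = GetNodeInfo(storage_units, hv_specs)

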
def _GetFileStorageSpaceInfo(path, params):
  """Wrapper around filestorage.GetSpaceInfo.

  The purpose of this wrapper is to call filestorage.GetFileStorageSpaceInfo
  and ignore the *args parameter to not leak it into the filestorage
  module's code.

  @see: C{filestorage.GetFileStorageSpaceInfo} for description of the
    parameters.

  """
  _CheckStorageParams(params, 0)
  return filestorage.GetFileStorageSpaceInfo(path)


# FIXME: implement storage reporting for all missing storage types.
_STORAGE_TYPE_INFO_FN = {
  constants.ST_BLOCK: None,
  constants.ST_DISKLESS: None,
  constants.ST_EXT: None,
  constants.ST_FILE: _GetFileStorageSpaceInfo,
  constants.ST_LVM_PV: _GetLvmPvSpaceInfo,
  constants.ST_LVM_VG: _GetLvmVgSpaceInfo,
  constants.ST_SHARED_FILE: None,
  constants.ST_GLUSTER: None,
  constants.ST_RADOS: None,
}


def _ApplyStorageInfoFunction(storage_type, storage_key, *args):
  """Looks up and applies the correct function to calculate free and total
  storage for the given storage type.

  @type storage_type: string
  @param storage_type: the storage type for which the storage shall be
    reported
  @type storage_key: string
  @param storage_key: identifier of a storage unit, e.g. the volume group name
    of an LVM storage unit
  @type args: any
  @param args: various parameters that can be used for storage reporting. These
    parameters and their semantics vary from storage type to storage type and
    are just propagated in this function.
  @return: the results of the application of the storage space function (see
    _STORAGE_TYPE_INFO_FN) if storage space reporting is implemented for that
    storage type
  @raises NotImplementedError: for storage types that don't support space
    reporting yet

  """
  fn = _STORAGE_TYPE_INFO_FN[storage_type]
  if fn is not None:
    return fn(storage_key, *args)
  else:
    raise NotImplementedError


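# Dispatch example (sketch; "xenvg" is a placeholder VG name):
#
#   _ApplyStorageInfoFunction(constants.ST_LVM_VG, "xenvg", [True])
#     # -> delegates to _GetLvmVgSpaceInfo("xenvg", [True])
#   _ApplyStorageInfoFunction(constants.ST_BLOCK, "whatever")
#     # -> raises NotImplementedError, as the table maps ST_BLOCK to None

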
def _CheckExclusivePvs(pvi_list):
  """Check that PVs are not shared among LVs.

  @type pvi_list: list of L{objects.LvmPvInfo} objects
  @param pvi_list: information about the PVs

  @rtype: list of tuples (string, list of strings)
  @return: offending volumes, as tuples: (pv_name, [lv1_name, lv2_name...])

  """
  res = []
  for pvi in pvi_list:
    if len(pvi.lv_list) > 1:
      res.append((pvi.name, pvi.lv_list))
  return res


def _VerifyHypervisors(what, vm_capable, result, all_hvparams,
                       get_hv_fn=hypervisor.GetHypervisor):
  """Verifies the hypervisor. Adds the results to the C{result} dict.

  @type what: C{dict}
  @param what: a dictionary of things to check
  @type vm_capable: boolean
  @param vm_capable: whether or not this node is vm capable
  @type result: dict
  @param result: dictionary of verification results; results of the
    verifications in this function will be added here
  @type all_hvparams: dict of dict of string
  @param all_hvparams: dictionary mapping hypervisor names to hvparams
  @type get_hv_fn: function
  @param get_hv_fn: function to retrieve the hypervisor, to improve testability

  """
  if not vm_capable:
    return

  if constants.NV_HYPERVISOR in what:
    result[constants.NV_HYPERVISOR] = {}
    for hv_name in what[constants.NV_HYPERVISOR]:
      hvparams = all_hvparams[hv_name]
      try:
        val = get_hv_fn(hv_name).Verify(hvparams=hvparams)
      except errors.HypervisorError, err:
        val = "Error while checking hypervisor: %s" % str(err)
      result[constants.NV_HYPERVISOR][hv_name] = val


def _VerifyHvparams(what, vm_capable, result,
                    get_hv_fn=hypervisor.GetHypervisor):
  """Verifies the hvparams. Adds the results to the C{result} dict.

  @type what: C{dict}
  @param what: a dictionary of things to check
  @type vm_capable: boolean
  @param vm_capable: whether or not this node is vm capable
  @type result: dict
  @param result: dictionary of verification results; results of the
    verifications in this function will be added here
  @type get_hv_fn: function
  @param get_hv_fn: function to retrieve the hypervisor, to improve testability

  """
  if not vm_capable:
    return

  if constants.NV_HVPARAMS in what:
    result[constants.NV_HVPARAMS] = []
    for source, hv_name, hvparms in what[constants.NV_HVPARAMS]:
      try:
        logging.info("Validating hv %s, %s", hv_name, hvparms)
        get_hv_fn(hv_name).ValidateParameters(hvparms)
      except errors.HypervisorError, err:
        result[constants.NV_HVPARAMS].append((source, hv_name, str(err)))


def _VerifyInstanceList(what, vm_capable, result, all_hvparams):
  """Verifies the instance list.

  @type what: C{dict}
  @param what: a dictionary of things to check
  @type vm_capable: boolean
  @param vm_capable: whether or not this node is vm capable
  @type result: dict
  @param result: dictionary of verification results; results of the
    verifications in this function will be added here
  @type all_hvparams: dict of dict of string
  @param all_hvparams: dictionary mapping hypervisor names to hvparams

  """
  if constants.NV_INSTANCELIST in what and vm_capable:
    # GetInstanceList can fail
    try:
      val = GetInstanceList(what[constants.NV_INSTANCELIST],
                            all_hvparams=all_hvparams)
    except RPCFail, err:
      val = str(err)
    result[constants.NV_INSTANCELIST] = val


def _VerifyNodeInfo(what, vm_capable, result, all_hvparams):
  """Verifies the node info.

  @type what: C{dict}
  @param what: a dictionary of things to check
  @type vm_capable: boolean
  @param vm_capable: whether or not this node is vm capable
  @type result: dict
  @param result: dictionary of verification results; results of the
    verifications in this function will be added here
  @type all_hvparams: dict of dict of string
  @param all_hvparams: dictionary mapping hypervisor names to hvparams

  """
  if constants.NV_HVINFO in what and vm_capable:
    hvname = what[constants.NV_HVINFO]
    hyper = hypervisor.GetHypervisor(hvname)
    hvparams = all_hvparams[hvname]
    result[constants.NV_HVINFO] = hyper.GetNodeInfo(hvparams=hvparams)


def _VerifyClientCertificate(cert_file=pathutils.NODED_CLIENT_CERT_FILE):
  """Verify the existence and validity of the client SSL certificate.

  Also, verify that the client certificate is not self-signed. Self-
  signed client certificates stem from Ganeti versions 2.12.0 - 2.12.4
  and should be replaced by client certificates signed by the server
  certificate. Hence we output a warning when we encounter a self-signed
  one.

  """
  create_cert_cmd = "gnt-cluster renew-crypto --new-node-certificates"
  if not os.path.exists(cert_file):
    return (constants.CV_ERROR,
            "The client certificate does not exist. Run '%s' to create"
            " client certificates for all nodes." % create_cert_cmd)

  (errcode, msg) = utils.VerifyCertificate(cert_file)
  if errcode is not None:
    return (errcode, msg)

  (errcode, msg) = utils.IsCertificateSelfSigned(cert_file)
  if errcode is not None:
    return (errcode, msg)

  # if everything is fine, we return the digest to be compared to the config
  return (None, utils.GetCertificateDigest(cert_filename=cert_file))


def _VerifySshSetup(node_status_list, my_name, ssh_key_type,
                    ganeti_pub_keys_file=pathutils.SSH_PUB_KEYS):
  """Verifies the state of the SSH key files.

  @type node_status_list: list of tuples
  @param node_status_list: list of nodes of the cluster associated with a
    couple of flags: (uuid, name, is_master_candidate,
    is_potential_master_candidate, online)
  @type my_name: str
  @param my_name: name of this node
  @type ssh_key_type: one of L{constants.SSHK_ALL}
  @param ssh_key_type: type of key used on nodes
  @type ganeti_pub_keys_file: str
  @param ganeti_pub_keys_file: filename of the public keys file

  """
  if node_status_list is None:
    return ["No node list to check against the pub_key_file received."]

  my_status_list = [(my_uuid, name, mc, pot_mc, online) for
                    (my_uuid, name, mc, pot_mc, online)
                    in node_status_list if name == my_name]
  if len(my_status_list) == 0:
    return ["Cannot find node information for node '%s'." % my_name]
  (my_uuid, _, _, potential_master_candidate, online) = \
      my_status_list[0]

  result = []

  if not os.path.exists(ganeti_pub_keys_file):
    result.append("The public key file '%s' does not exist. Consider running"
                  " 'gnt-cluster renew-crypto --new-ssh-keys"
                  " [--no-ssh-key-check]' to fix this." % ganeti_pub_keys_file)
    return result

  pot_mc_uuids = [uuid for (uuid, _, _, _, _) in node_status_list]
  offline_nodes = [uuid for (uuid, _, _, _, online) in node_status_list
                   if not online]
  pub_keys = ssh.QueryPubKeyFile(None, key_file=ganeti_pub_keys_file)

  # Initialized here so the master candidate loop below can rely on it being
  # defined even when this node is not a potential master candidate
  missing_uuids = set([])

  if potential_master_candidate:
    # Check that the set of potential master candidates matches the
    # public key file
    pub_uuids_set = set(pub_keys.keys()) - set(offline_nodes)
    pot_mc_uuids_set = set(pot_mc_uuids) - set(offline_nodes)
    if pub_uuids_set != pot_mc_uuids_set:
      unknown_uuids = pub_uuids_set - pot_mc_uuids_set
      pub_key_path = "%s:%s" % (my_name, ganeti_pub_keys_file)
      if unknown_uuids:
        result.append("The following node UUIDs are listed in the shared"
                      " public keys file %s, but are not potential master"
                      " candidates: %s." %
                      (pub_key_path, ", ".join(list(unknown_uuids))))
      missing_uuids = pot_mc_uuids_set - pub_uuids_set
      if missing_uuids:
        result.append("The following node UUIDs of potential master candidates"
                      " are missing in the shared public keys file %s: %s." %
                      (pub_key_path, ", ".join(list(missing_uuids))))

    (_, key_files) = \
      ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False,
                          dircheck=False)
    (_, node_pub_key_file) = key_files[ssh_key_type]

    my_keys = pub_keys[my_uuid]

    node_pub_key = utils.ReadFile(node_pub_key_file)
    node_pub_key_path = "%s:%s" % (my_name, node_pub_key_file)
    if node_pub_key.strip() not in my_keys:
      result.append("The key for node %s in the cluster config does not match"
                    " this node's key in the node public key file %s." %
                    (my_name, node_pub_key_path))
    if len(my_keys) != 1:
      result.append("There is more than one key for node %s in the node public"
                    " key file %s." % (my_name, node_pub_key_path))
  else:
    if len(pub_keys.keys()) > 0:
      result.append("The public key file %s is not empty, although"
                    " the node is not a potential master candidate." %
                    ganeti_pub_keys_file)

  # Check that all master candidate keys are in the authorized_keys file
  (auth_key_file, _) = \
    ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
  for (uuid, name, mc, _, online) in node_status_list:
    if not online:
      continue
    if uuid in missing_uuids:
      continue
    if mc:
      for key in pub_keys[uuid]:
        if not ssh.HasAuthorizedKey(auth_key_file, key):
          result.append("An SSH key of master candidate '%s' (UUID: '%s') is"
                        " not in the 'authorized_keys' file of node '%s'."
                        % (name, uuid, my_name))
    else:
      for key in pub_keys[uuid]:
        if name != my_name and ssh.HasAuthorizedKey(auth_key_file, key):
          result.append("An SSH key of normal node '%s' (UUID: '%s') is in"
                        " the 'authorized_keys' file of node '%s'."
                        % (name, uuid, my_name))
        if name == my_name and not ssh.HasAuthorizedKey(auth_key_file, key):
          result.append("An SSH key of normal node '%s' (UUID: '%s') is not"
                        " in the 'authorized_keys' file of itself."
                        % (my_name, uuid))

  return result


def _VerifySshClutter(node_status_list, my_name):
  """Verifies that the 'authorized_keys' files are not cluttered up.

  @type node_status_list: list of tuples
  @param node_status_list: list of nodes of the cluster associated with a
    couple of flags: (uuid, name, is_master_candidate,
    is_potential_master_candidate, online)
  @type my_name: str
  @param my_name: name of this node

  """
  result = []
  (auth_key_file, _) = \
    ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
  node_names = [name for (_, name, _, _, _) in node_status_list]
  multiple_occurrences = ssh.CheckForMultipleKeys(auth_key_file, node_names)
  if multiple_occurrences:
    msg = "There are hosts which have more than one SSH key stored for the" \
          " same user in the 'authorized_keys' file of node %s. This can be" \
          " due to an unsuccessful operation which cluttered up the" \
          " 'authorized_keys' file. We recommend cleaning this up manually. " \
          % my_name
    for host, occ in multiple_occurrences.items():
      msg += "Entry for '%s' in lines %s. " % (host, utils.CommaJoin(occ))
    result.append(msg)

  return result


def VerifyNodeNetTest(my_name, test_config):
  """Verify nodes are reachable.

  @type my_name: string
  @param my_name: name of the node this test is running on

  @type test_config: tuple (node_list, master_candidate_list)
  @param test_config: configuration for test as passed from
    LUClusterVerify() in what[constants.NV_NODENETTEST]

  @rtype: dict
  @return: a dictionary with node names as keys and error messages
    as values

  """
  result = {}
  nodes, master_candidates = test_config
  port = netutils.GetDaemonPort(constants.NODED)

  if my_name not in master_candidates:
    return result

  my_pip, my_sip = next(
    ((pip, sip) for name, pip, sip in nodes if name == my_name),
    (None, None)
  )
  if not my_pip:
    result[my_name] = ("Can't find my own primary/secondary IP"
                       " in the node list")
    return result

  for name, pip, sip in nodes:
    fail = []
    if not netutils.TcpPing(pip, port, source=my_pip):
      fail.append("primary")
    if sip != pip:
      if not netutils.TcpPing(sip, port, source=my_sip):
        fail.append("secondary")
    if fail:
      result[name] = ("failure using the %s interface(s)" %
                      " and ".join(fail))
  return result


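# The test_config argument is a pair of (node triples, master candidate
# names); a sketch with placeholder names and addresses:
#
#   nodes = [("node1.example.com", "192.0.2.1", "198.51.100.1"),
#            ("node2.example.com", "192.0.2.2", "198.51.100.2")]
#   master_candidates = ["node1.example.com"]
#   VerifyNodeNetTest("node1.example.com", (nodes, master_candidates))
#
# An empty result dictionary means every node answered on the noded port.

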
def VerifyMasterIP(my_name, test_config):
  """Verify master IP is reachable.

  @type my_name: string
  @param my_name: name of the node this test is running on

  @type test_config: tuple (master_name, master_ip, master_candidates)
  @param test_config: configuration for test as passed from
    LUClusterVerify() in what[constants.NV_MASTERIP]

  @rtype: bool or None
  @return: Boolean test result, None if skipped

  """
  master_name, master_ip, master_candidates = test_config
  port = netutils.GetDaemonPort(constants.NODED)
  if my_name not in master_candidates:
    return None

  if master_name == my_name:
    source = constants.IP4_ADDRESS_LOCALHOST
  else:
    source = None
  return netutils.TcpPing(master_ip, port, source=source)


def VerifyNode(what, cluster_name, all_hvparams):
  """Verify the status of the local node.

  Based on the input L{what} parameter, various checks are done on the
  local node.

  If the I{filelist} key is present, this list of
  files is checksummed and the file/checksum pairs are returned.

  If the I{nodelist} key is present, we check that we have
  connectivity via ssh with the target nodes (and check the hostname
  report).

  If the I{node-net-test} key is present, we check that we have
  connectivity to the given nodes via both primary IP and, if
  applicable, secondary IPs.

  @type what: C{dict}
  @param what: a dictionary of things to check:
    - filelist: list of files for which to compute checksums
    - nodelist: list of nodes we should check ssh communication with
    - node-net-test: list of nodes we should check node daemon port
      connectivity with
    - hypervisor: list with hypervisors to run the verify for
  @type cluster_name: string
  @param cluster_name: the cluster's name
  @type all_hvparams: dict of dict of strings
  @param all_hvparams: a dictionary mapping hypervisor names to hvparams
  @rtype: dict
  @return: a dictionary with the same keys as the input dict, and
    values representing the result of the checks

  """
  result = {}
  my_name = netutils.Hostname.GetSysName()
  vm_capable = my_name not in what.get(constants.NV_NONVMNODES, [])

  _VerifyHypervisors(what, vm_capable, result, all_hvparams)
  _VerifyHvparams(what, vm_capable, result)

  if constants.NV_FILELIST in what:
    fingerprints = utils.FingerprintFiles(map(vcluster.LocalizeVirtualPath,
                                              what[constants.NV_FILELIST]))
    result[constants.NV_FILELIST] = \
      dict((vcluster.MakeVirtualPath(key), value)
           for (key, value) in fingerprints.items())

  if constants.NV_CLIENT_CERT in what:
    result[constants.NV_CLIENT_CERT] = _VerifyClientCertificate()

  if constants.NV_SSH_SETUP in what:
    node_status_list, key_type = what[constants.NV_SSH_SETUP]
    result[constants.NV_SSH_SETUP] = \
      _VerifySshSetup(node_status_list, my_name, key_type)
    if constants.NV_SSH_CLUTTER in what:
      result[constants.NV_SSH_CLUTTER] = \
        _VerifySshClutter(node_status_list, my_name)

  if constants.NV_NODELIST in what:
    (nodes, bynode, mcs) = what[constants.NV_NODELIST]

    # Add nodes from other groups (different for each node)
    try:
      nodes.extend(bynode[my_name])
    except KeyError:
      pass

    # Use a random order
    random.shuffle(nodes)

    # Try to contact all nodes
    val = {}
    ssh_port_map = ssconf.SimpleStore().GetSshPortMap()
    for node in nodes:
      # We only test if master candidates can communicate to other nodes.
      # We cannot test if normal nodes cannot communicate with other nodes,
      # because the administrator might have installed additional SSH keys,
      # over which Ganeti has no power.
      if my_name in mcs:
        success, message = _GetSshRunner(cluster_name). \
                              VerifyNodeHostname(node, ssh_port_map[node])
        if not success:
          val[node] = message

    result[constants.NV_NODELIST] = val

  if constants.NV_NODENETTEST in what:
    result[constants.NV_NODENETTEST] = VerifyNodeNetTest(
      my_name, what[constants.NV_NODENETTEST])

  if constants.NV_MASTERIP in what:
    result[constants.NV_MASTERIP] = VerifyMasterIP(
      my_name, what[constants.NV_MASTERIP])

  if constants.NV_USERSCRIPTS in what:
    result[constants.NV_USERSCRIPTS] = \
      [script for script in what[constants.NV_USERSCRIPTS]
       if not utils.IsExecutable(script)]

  if constants.NV_OOB_PATHS in what:
    result[constants.NV_OOB_PATHS] = tmp = []
    for path in what[constants.NV_OOB_PATHS]:
      try:
        st = os.stat(path)
      except OSError, err:
        tmp.append("error stating out of band helper: %s" % err)
      else:
        if stat.S_ISREG(st.st_mode):
          if stat.S_IMODE(st.st_mode) & stat.S_IXUSR:
            tmp.append(None)
          else:
            tmp.append("out of band helper %s is not executable" % path)
        else:
          tmp.append("out of band helper %s is not a file" % path)

  if constants.NV_LVLIST in what and vm_capable:
    try:
      val = GetVolumeList(utils.ListVolumeGroups().keys())
    except RPCFail, err:
      val = str(err)
    result[constants.NV_LVLIST] = val

  _VerifyInstanceList(what, vm_capable, result, all_hvparams)

  if constants.NV_VGLIST in what and vm_capable:
    result[constants.NV_VGLIST] = utils.ListVolumeGroups()

  if constants.NV_PVLIST in what and vm_capable:
    check_exclusive_pvs = constants.NV_EXCLUSIVEPVS in what
    val = bdev.LogicalVolume.GetPVInfo(what[constants.NV_PVLIST],
                                       filter_allocatable=False,
                                       include_lvs=check_exclusive_pvs)
    if check_exclusive_pvs:
      result[constants.NV_EXCLUSIVEPVS] = _CheckExclusivePvs(val)
      for pvi in val:
        # Avoid sending useless data on the wire
        pvi.lv_list = []
    result[constants.NV_PVLIST] = map(objects.LvmPvInfo.ToDict, val)

  if constants.NV_VERSION in what:
    result[constants.NV_VERSION] = (constants.PROTOCOL_VERSION,
                                    constants.RELEASE_VERSION)

  _VerifyNodeInfo(what, vm_capable, result, all_hvparams)

  if constants.NV_DRBDVERSION in what and vm_capable:
    try:
      drbd_version = DRBD8.GetProcInfo().GetVersionString()
    except errors.BlockDeviceError, err:
      logging.warning("Can't get DRBD version", exc_info=True)
      drbd_version = str(err)
    result[constants.NV_DRBDVERSION] = drbd_version

  if constants.NV_DRBDLIST in what and vm_capable:
    try:
      used_minors = drbd.DRBD8.GetUsedDevs()
    except errors.BlockDeviceError, err:
      logging.warning("Can't get used minors list", exc_info=True)
      used_minors = str(err)
    result[constants.NV_DRBDLIST] = used_minors

  if constants.NV_DRBDHELPER in what and vm_capable:
    status = True
    try:
      payload = drbd.DRBD8.GetUsermodeHelper()
    except errors.BlockDeviceError, err:
      logging.error("Can't get DRBD usermode helper: %s", str(err))
      status = False
      payload = str(err)
    result[constants.NV_DRBDHELPER] = (status, payload)

  if constants.NV_NODESETUP in what:
    result[constants.NV_NODESETUP] = tmpr = []
    if not os.path.isdir("/sys/block") or not os.path.isdir("/sys/class/net"):
      tmpr.append("The sysfs filesystem doesn't seem to be mounted"
                  " under /sys, missing required directories /sys/block"
                  " and /sys/class/net")
    if (not os.path.isdir("/proc/sys") or
        not os.path.isfile("/proc/sysrq-trigger")):
      tmpr.append("The procfs filesystem doesn't seem to be mounted"
                  " under /proc, missing required directory /proc/sys and"
                  " the file /proc/sysrq-trigger")

  if constants.NV_TIME in what:
    result[constants.NV_TIME] = utils.SplitTime(time.time())

  if constants.NV_OSLIST in what and vm_capable:
    result[constants.NV_OSLIST] = DiagnoseOS()

  if constants.NV_BRIDGES in what and vm_capable:
    result[constants.NV_BRIDGES] = [bridge
                                    for bridge in what[constants.NV_BRIDGES]
                                    if not utils.BridgeExists(bridge)]

  if what.get(constants.NV_ACCEPTED_STORAGE_PATHS) == my_name:
    result[constants.NV_ACCEPTED_STORAGE_PATHS] = \
      filestorage.ComputeWrongFileStoragePaths()

  if what.get(constants.NV_FILE_STORAGE_PATH):
    pathresult = filestorage.CheckFileStoragePath(
      what[constants.NV_FILE_STORAGE_PATH])
    if pathresult:
      result[constants.NV_FILE_STORAGE_PATH] = pathresult

  if what.get(constants.NV_SHARED_FILE_STORAGE_PATH):
    pathresult = filestorage.CheckFileStoragePath(
      what[constants.NV_SHARED_FILE_STORAGE_PATH])
    if pathresult:
      result[constants.NV_SHARED_FILE_STORAGE_PATH] = pathresult

  return result


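# A minimal 'what' dictionary as LUClusterVerify might send it (a sketch;
# the file list, cluster name and hypervisor are placeholders):
#
#   what = {
#     constants.NV_FILELIST: ["/var/lib/ganeti/config.data"],
#     constants.NV_VERSION: None,
#     constants.NV_HYPERVISOR: [constants.HT_KVM],
#   }
#   result = VerifyNode(what, "cluster.example.com", {constants.HT_KVM: {}})
#
# The result dictionary mirrors the keys of 'what', holding the outcome of
# each requested check.

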
def GetCryptoTokens(token_requests):
  """Perform actions on the node's cryptographic tokens.

  Token types can be 'ssl' or 'ssh'. So far only some actions are implemented
  for 'ssl'. Action 'get' returns the digest of the public client ssl
  certificate. Action 'create' creates a new client certificate and private
  key and also returns the digest of the certificate. The third element of a
  token request is a dictionary of optional parameters for the actions; so
  far only a filename is supported.

  @type token_requests: list of tuples of (string, string, dict), where the
    first string is in constants.CRYPTO_TYPES, the second in
    constants.CRYPTO_ACTIONS. The third parameter is a dictionary of string
    to string.
  @param token_requests: list of requests of cryptographic tokens and actions
    to perform on them. The actions come with a dictionary of options.
  @rtype: list of tuples (string, string)
  @return: list of tuples of the token type and the public crypto token

  """
  tokens = []
  for (token_type, action, _) in token_requests:
    if token_type not in constants.CRYPTO_TYPES:
      raise errors.ProgrammerError("Token type '%s' not supported." %
                                   token_type)
    if action not in constants.CRYPTO_ACTIONS:
      raise errors.ProgrammerError("Action '%s' is not supported." %
                                   action)
    if token_type == constants.CRYPTO_TYPE_SSL_DIGEST:
      tokens.append((token_type,
                     utils.GetCertificateDigest()))
  return tokens


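# Example request (sketch; assumes the 'get' action constant from
# constants.CRYPTO_ACTIONS): fetching the digest of this node's client SSL
# certificate, with an empty options dict:
#
#   GetCryptoTokens([(constants.CRYPTO_TYPE_SSL_DIGEST,
#                     constants.CRYPTO_ACTION_GET, {})])
#     # -> [(constants.CRYPTO_TYPE_SSL_DIGEST, "<hex digest>")]

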
def EnsureDaemon(daemon_name, run):
  """Ensures the given daemon is running or stopped.

  @type daemon_name: string
  @param daemon_name: name of the daemon (e.g., constants.KVMD)

  @type run: bool
  @param run: whether to start or stop the daemon

  @rtype: bool
  @return: 'True' if daemon successfully started/stopped,
    'False' otherwise

  """
  allowed_daemons = [constants.KVMD]

  if daemon_name not in allowed_daemons:
    fn = lambda _: False
  elif run:
    fn = utils.EnsureDaemon
  else:
    fn = utils.StopDaemon

  return fn(daemon_name)


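# Usage sketch: only the KVM daemon is currently whitelisted, so other
# daemon names simply return False:
#
#   EnsureDaemon(constants.KVMD, True)    # start kvmd, True on success
#   EnsureDaemon(constants.KVMD, False)   # stop kvmd
#   EnsureDaemon(constants.NODED, True)   # -> False, not in the whitelist

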
def _InitSshUpdateData(data, noded_cert_file, ssconf_store):
  (_, noded_cert) = \
    utils.ExtractX509Certificate(utils.ReadFile(noded_cert_file))
  data[constants.SSHS_NODE_DAEMON_CERTIFICATE] = noded_cert

  cluster_name = ssconf_store.GetClusterName()
  data[constants.SSHS_CLUSTER_NAME] = cluster_name


def AddNodeSshKey(node_uuid, node_name,
                  potential_master_candidates,
                  to_authorized_keys=False,
                  to_public_keys=False,
                  get_public_keys=False,
                  pub_key_file=pathutils.SSH_PUB_KEYS,
                  ssconf_store=None,
                  noded_cert_file=pathutils.NODED_CERT_FILE,
                  run_cmd_fn=ssh.RunSshCmdWithStdin,
                  ssh_update_debug=False,
                  ssh_update_verbose=False):
  """Distributes a node's public SSH key across the cluster.

  Note that this function should only be executed on the master node, which
  then will copy the new node's key to all nodes in the cluster via SSH.

  Also note: at least one of the flags C{to_authorized_keys},
  C{to_public_keys}, and C{get_public_keys} has to be set to C{True} for
  the function to actually perform any actions.

  @type node_uuid: str
  @param node_uuid: the UUID of the node whose key is added
  @type node_name: str
  @param node_name: the name of the node whose key is added
  @type potential_master_candidates: list of str
  @param potential_master_candidates: list of node names of potential master
    candidates; this should match the list of uuids in the public key file
  @type to_authorized_keys: boolean
  @param to_authorized_keys: whether the key should be added to the
    C{authorized_keys} file of all nodes
  @type to_public_keys: boolean
  @param to_public_keys: whether the keys should be added to the public key
    file
  @type get_public_keys: boolean
  @param get_public_keys: whether the node should add the clusters' public
    keys to its C{ganeti_pub_keys} file

  """
  node_list = [SshAddNodeInfo(name=node_name, uuid=node_uuid,
                              to_authorized_keys=to_authorized_keys,
                              to_public_keys=to_public_keys,
                              get_public_keys=get_public_keys)]
  return AddNodeSshKeyBulk(node_list,
                           potential_master_candidates,
                           pub_key_file=pub_key_file,
                           ssconf_store=ssconf_store,
                           noded_cert_file=noded_cert_file,
                           run_cmd_fn=run_cmd_fn,
                           ssh_update_debug=ssh_update_debug,
                           ssh_update_verbose=ssh_update_verbose)


# Node info named tuple specifically for the use with AddNodeSshKeyBulk
SshAddNodeInfo = collections.namedtuple(
  "SshAddNodeInfo",
  ["uuid",
   "name",
   "to_authorized_keys",
   "to_public_keys",
   "get_public_keys"])


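# Building a bulk request (sketch; the node name and UUID are made up):
#
#   infos = [SshAddNodeInfo(uuid="9f1c...", name="node5.example.com",
#                           to_authorized_keys=True,
#                           to_public_keys=True,
#                           get_public_keys=True)]
#   node_errors = AddNodeSshKeyBulk(infos, potential_master_candidates)
#
# Each flag controls one aspect of key distribution, as documented in
# AddNodeSshKey above.

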
1523 def AddNodeSshKeyBulk(node_list,
1524 potential_master_candidates,
1525 pub_key_file=pathutils.SSH_PUB_KEYS,
1526 ssconf_store=None,
1527 noded_cert_file=pathutils.NODED_CERT_FILE,
1528 run_cmd_fn=ssh.RunSshCmdWithStdin,
1529 ssh_update_debug=False,
1530 ssh_update_verbose=False):
1531 """Distributes a node's public SSH key across the cluster.
1532
1533 Note that this function should only be executed on the master node, which
1534 then will copy the new node's key to all nodes in the cluster via SSH.
1535
1536 Also note: at least one of the flags C{to_authorized_keys},
1537 C{to_public_keys}, and C{get_public_keys} has to be set to C{True} for
1538 the function to actually perform any actions.
1539
1540 @type node_list: list of SshAddNodeInfo tuples
1541 @param node_list: list of tuples containing the necessary node information for
1542 adding their keys
1543 @type potential_master_candidates: list of str
1544 @param potential_master_candidates: list of node names of potential master
1545 candidates; this should match the list of uuids in the public key file
1546
1547 """
1548 # whether there are any keys to be added or retrieved at all
1549 to_authorized_keys = any([node_info.to_authorized_keys for node_info in
1550 node_list])
1551 to_public_keys = any([node_info.to_public_keys for node_info in
1552 node_list])
1553
1554 if not ssconf_store:
1555 ssconf_store = ssconf.SimpleStore()
1556
1557 for node_info in node_list:
1558 # replacement not necessary for keys that are not supposed to be in the
1559 # list of public keys
1560 if not node_info.to_public_keys:
1561 continue
1562 # Check and fix sanity of key file
1563 keys_by_name = ssh.QueryPubKeyFile([node_info.name], key_file=pub_key_file)
1564 keys_by_uuid = ssh.QueryPubKeyFile([node_info.uuid], key_file=pub_key_file)
1565
1566 if (not keys_by_name or node_info.name not in keys_by_name) \
1567 and (not keys_by_uuid or node_info.uuid not in keys_by_uuid):
1568 raise errors.SshUpdateError(
1569 "No keys found for the new node '%s' (UUID %s) in the list of public"
1570 " SSH keys, neither for the name or the UUID" %
1571 (node_info.name, node_info.uuid))
1572 else:
1573 if node_info.name in keys_by_name:
1574 # Replace the name by UUID in the file as the name should only be used
1575 # temporarily
1576 ssh.ReplaceNameByUuid(node_info.uuid, node_info.name,
1577 error_fn=errors.SshUpdateError,
1578 key_file=pub_key_file)
1579
1580 # Retrieve updated map of UUIDs to keys
1581 keys_by_uuid = ssh.QueryPubKeyFile(
1582 [node_info.uuid for node_info in node_list], key_file=pub_key_file)
1583
1584 # Update the master node's key files
1585 (auth_key_file, _) = \
1586 ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
1587 for node_info in node_list:
1588 if node_info.to_authorized_keys:
1589 ssh.AddAuthorizedKeys(auth_key_file, keys_by_uuid[node_info.uuid])
1590
1591 base_data = {}
1592 _InitSshUpdateData(base_data, noded_cert_file, ssconf_store)
1593 cluster_name = base_data[constants.SSHS_CLUSTER_NAME]
1594
1595 ssh_port_map = ssconf_store.GetSshPortMap()
1596
1597 # Update the target nodes themselves
1598 for node_info in node_list:
1599 logging.debug("Updating SSH key files of target node '%s'.", node_info.name)
1600 if node_info.get_public_keys:
1601 node_data = {}
1602 _InitSshUpdateData(node_data, noded_cert_file, ssconf_store)
1603 all_keys = ssh.QueryPubKeyFile(None, key_file=pub_key_file)
1604 node_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1605 (constants.SSHS_OVERRIDE, all_keys)
1606
1607 try:
1608 backoff = 5 # seconds
1609 utils.RetryByNumberOfTimes(
1610 constants.SSHS_MAX_RETRIES, backoff,
1611 errors.SshUpdateError,
1612 run_cmd_fn, cluster_name, node_info.name, pathutils.SSH_UPDATE,
1613 ssh_port_map.get(node_info.name), node_data,
1614 debug=ssh_update_debug, verbose=ssh_update_verbose,
1615 use_cluster_key=False, ask_key=False, strict_host_check=False)
1616 except errors.SshUpdateError as e:
1617 # Clean up the master's public key file if adding key fails
1618 if node_info.to_public_keys:
1619 ssh.RemovePublicKey(node_info.uuid)
1620 raise e
1621
1622 # Update all nodes except master and the target nodes
1623 keys_by_uuid_auth = ssh.QueryPubKeyFile(
1624 [node_info.uuid for node_info in node_list
1625 if node_info.to_authorized_keys],
1626 key_file=pub_key_file)
1627 if to_authorized_keys:
1628 base_data[constants.SSHS_SSH_AUTHORIZED_KEYS] = \
1629 (constants.SSHS_ADD, keys_by_uuid_auth)
1630
1631 pot_mc_data = base_data.copy()
1632 keys_by_uuid_pub = ssh.QueryPubKeyFile(
1633 [node_info.uuid for node_info in node_list
1634 if node_info.to_public_keys],
1635 key_file=pub_key_file)
1636 if to_public_keys:
1637 pot_mc_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1638 (constants.SSHS_REPLACE_OR_ADD, keys_by_uuid_pub)
1639
1640 all_nodes = ssconf_store.GetNodeList()
1641 master_node = ssconf_store.GetMasterNode()
1642 online_nodes = ssconf_store.GetOnlineNodeList()
1643
1644 node_errors = []
1645 for node in all_nodes:
1646 if node == master_node:
1647 logging.debug("Skipping master node '%s'.", master_node)
1648 continue
1649 if node not in online_nodes:
1650 logging.debug("Skipping offline node '%s'.", node)
1651 continue
1652 if node in potential_master_candidates:
1653 logging.debug("Updating SSH key files of node '%s'.", node)
1654 try:
1655 backoff = 5 # seconds
1656 utils.RetryByNumberOfTimes(
1657 constants.SSHS_MAX_RETRIES, backoff, errors.SshUpdateError,
1658 run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
1659 ssh_port_map.get(node), pot_mc_data,
1660 debug=ssh_update_debug, verbose=ssh_update_verbose,
1661 use_cluster_key=False, ask_key=False, strict_host_check=False)
1662 except errors.SshUpdateError as last_exception:
1663 error_msg = ("When adding the key of node '%s', updating SSH key"
1664 " files of node '%s' failed after %s retries."
1665 " Not trying again. Last error was: %s." %
1666                      (node_info.name, node, constants.SSHS_MAX_RETRIES,
1667 last_exception))
1668 node_errors.append((node, error_msg))
1669 # We only log the error and don't throw an exception, because
1670 # one unreachable node shall not abort the entire procedure.
1671 logging.error(error_msg)
1672
1673 else:
1674 if to_authorized_keys:
1675 run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
1676 ssh_port_map.get(node), base_data,
1677 debug=ssh_update_debug, verbose=ssh_update_verbose,
1678 use_cluster_key=False, ask_key=False,
1679 strict_host_check=False)
1680
1681 return node_errors
1682
1683
1684 # TODO: will be fixed with pending patch series.
1685 # pylint: disable=R0913
1686 def RemoveNodeSshKey(node_uuid, node_name,
1687 master_candidate_uuids,
1688 potential_master_candidates,
1689 master_uuid=None,
1690 keys_to_remove=None,
1691 from_authorized_keys=False,
1692 from_public_keys=False,
1693 clear_authorized_keys=False,
1694 clear_public_keys=False,
1695 pub_key_file=pathutils.SSH_PUB_KEYS,
1696 ssconf_store=None,
1697 noded_cert_file=pathutils.NODED_CERT_FILE,
1698 readd=False,
1699 run_cmd_fn=ssh.RunSshCmdWithStdin,
1700 ssh_update_debug=False,
1701 ssh_update_verbose=False):
1702 """Removes the node's SSH keys from the key files and distributes those.
1703
1704 Note that at least one of the flags C{from_authorized_keys},
1705 C{from_public_keys}, C{clear_authorized_keys}, and C{clear_public_keys}
1706 has to be set to C{True} for the function to perform any action at all.
1707 Not doing so will trigger an assertion in the function.
1708
1709 @type node_uuid: str
1710 @param node_uuid: UUID of the node whose key is removed
1711 @type node_name: str
1712 @param node_name: name of the node whose key is removed
1713 @type master_candidate_uuids: list of str
1714 @param master_candidate_uuids: list of UUIDs of the current master candidates
1715 @type potential_master_candidates: list of str
1716 @param potential_master_candidates: list of names of potential master
1717 candidates
1718 @type keys_to_remove: dict of str to list of str
1719 @param keys_to_remove: a dictionary mapping node UUIDs to lists of SSH keys
1720 to be removed. This list is supposed to be used only if the keys are not
1721 in the public keys file. This is for example the case when removing a
1722 master node's key.
1723 @type from_authorized_keys: boolean
1724 @param from_authorized_keys: whether or not the key should be removed
1725 from the C{authorized_keys} file
1726 @type from_public_keys: boolean
1727 @param from_public_keys: whether or not the key should be remove from
1728 the C{ganeti_pub_keys} file
1729 @type clear_authorized_keys: boolean
1730 @param clear_authorized_keys: whether or not the C{authorized_keys} file
1731 should be cleared on the node whose keys are removed
1732 @type clear_public_keys: boolean
1733 @param clear_public_keys: whether to clear the node's C{ganeti_pub_keys} file
1734 @type readd: boolean
1735 @param readd: whether this is called during a readd operation.
1736 @rtype: list of string
1737 @returns: list of feedback messages
1738
1739 """
1740 node_list = [SshRemoveNodeInfo(uuid=node_uuid,
1741 name=node_name,
1742 from_authorized_keys=from_authorized_keys,
1743 from_public_keys=from_public_keys,
1744 clear_authorized_keys=clear_authorized_keys,
1745 clear_public_keys=clear_public_keys)]
1746 return RemoveNodeSshKeyBulk(node_list,
1747 master_candidate_uuids,
1748 potential_master_candidates,
1749 master_uuid=master_uuid,
1750 keys_to_remove=keys_to_remove,
1751 pub_key_file=pub_key_file,
1752 ssconf_store=ssconf_store,
1753 noded_cert_file=noded_cert_file,
1754 readd=readd,
1755 run_cmd_fn=run_cmd_fn,
1756 ssh_update_debug=ssh_update_debug,
1757 ssh_update_verbose=ssh_update_verbose)
1758
1759
1760 # Node info named tuple specifically for the use with RemoveNodeSshKeyBulk
1761 SshRemoveNodeInfo = collections.namedtuple(
1762 "SshRemoveNodeInfo",
1763 ["uuid",
1764 "name",
1765 "from_authorized_keys",
1766 "from_public_keys",
1767 "clear_authorized_keys",
1768 "clear_public_keys"])
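
# Example (illustrative values only): a request to remove node3's key from
# the 'authorized_keys' files of all nodes, while keeping its entry in the
# 'ganeti_pub_keys' file and leaving node3's own key files untouched:
#
#   SshRemoveNodeInfo(uuid="9b977740-29ed-41a8-8512-8ceb64961e25",
#                     name="node3.example.com",
#                     from_authorized_keys=True,
#                     from_public_keys=False,
#                     clear_authorized_keys=False,
#                     clear_public_keys=False)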
1769
1770
1771 def RemoveNodeSshKeyBulk(node_list,
1772 master_candidate_uuids,
1773 potential_master_candidates,
1774 master_uuid=None,
1775 keys_to_remove=None,
1776 pub_key_file=pathutils.SSH_PUB_KEYS,
1777 ssconf_store=None,
1778 noded_cert_file=pathutils.NODED_CERT_FILE,
1779 readd=False,
1780 run_cmd_fn=ssh.RunSshCmdWithStdin,
1781 ssh_update_debug=False,
1782 ssh_update_verbose=False):
1783 """Removes the node's SSH keys from the key files and distributes those.
1784
1785 Note that at least one of the flags C{from_authorized_keys},
1786 C{from_public_keys}, C{clear_authorized_keys}, and C{clear_public_keys}
1787 of at least one node has to be set to C{True} for the function to perform any
1788 action at all. Not doing so will trigger an assertion in the function.
1789
1790 @type node_list: list of C{SshRemoveNodeInfo}.
1791 @param node_list: list of information about nodes whose keys are being removed
1792 @type master_candidate_uuids: list of str
1793 @param master_candidate_uuids: list of UUIDs of the current master candidates
1794 @type potential_master_candidates: list of str
1795 @param potential_master_candidates: list of names of potential master
1796 candidates
1797 @type keys_to_remove: dict of str to list of str
1798 @param keys_to_remove: a dictionary mapping node UUIDs to lists of SSH keys
1799 to be removed. This list is supposed to be used only if the keys are not
1800 in the public keys file. This is for example the case when removing a
1801 master node's key.
1802 @type readd: boolean
1803 @param readd: whether this is called during a readd operation.
1804 @rtype: list of string
1805 @returns: list of feedback messages
1806
1807 """
1808 # Non-disruptive error messages, list of (node, msg) pairs
1809 result_msgs = []
1810
1811   # whether there are any keys to be removed or cleared at all
1812 from_authorized_keys = any([node_info.from_authorized_keys for node_info in
1813 node_list])
1814 from_public_keys = any([node_info.from_public_keys for node_info in
1815 node_list])
1816 clear_authorized_keys = any([node_info.clear_authorized_keys for node_info in
1817 node_list])
1818 clear_public_keys = any([node_info.clear_public_keys for node_info in
1819 node_list])
1820
1821 # Make sure at least one of these flags is true.
1822 if not (from_authorized_keys or from_public_keys or clear_authorized_keys
1823 or clear_public_keys):
1824 raise errors.SshUpdateError("No removal from any key file was requested.")
1825
1826 if not ssconf_store:
1827 ssconf_store = ssconf.SimpleStore()
1828
1829 master_node = ssconf_store.GetMasterNode()
1830 ssh_port_map = ssconf_store.GetSshPortMap()
1831
1832 all_keys_to_remove = {}
1833 if from_authorized_keys or from_public_keys:
1834 for node_info in node_list:
1835 # Skip nodes that don't actually need any keys to be removed.
1836 if not (node_info.from_authorized_keys or node_info.from_public_keys):
1837 continue
1838 if node_info.name == master_node and not keys_to_remove:
1839 raise errors.SshUpdateError("Cannot remove the master node's keys.")
1840 if keys_to_remove:
1841 keys = keys_to_remove
1842 else:
1843 keys = ssh.QueryPubKeyFile([node_info.uuid], key_file=pub_key_file)
1844 if (not keys or node_info.uuid not in keys) and not readd:
1845 raise errors.SshUpdateError("Node '%s' not found in the list of"
1846                                     " public SSH keys. It seems someone is"
1847                                     " trying to remove a key from outside"
1848 " the cluster!" % node_info.uuid)
1849 # During an upgrade all nodes have the master key. In this case we
1850 # should not remove it to avoid accidentally shutting down cluster
1851 # SSH communication
1852 master_keys = None
1853 if master_uuid:
1854 master_keys = ssh.QueryPubKeyFile([master_uuid],
1855 key_file=pub_key_file)
1856
1857       # Remove any master keys from the list of keys to remove from the node
1858       if master_keys:
1859         keys[node_info.uuid] = list(
               set(keys[node_info.uuid]) - set(master_keys[master_uuid]))
1860
1861 all_keys_to_remove.update(keys)
1862
1863 if all_keys_to_remove:
1864 base_data = {}
1865 _InitSshUpdateData(base_data, noded_cert_file, ssconf_store)
1866 cluster_name = base_data[constants.SSHS_CLUSTER_NAME]
1867
1868 if from_authorized_keys:
1869 # UUIDs of nodes that are supposed to be removed from the
1870 # authorized_keys files.
1871 nodes_remove_from_authorized_keys = [
1872 node_info.uuid for node_info in node_list
1873 if node_info.from_authorized_keys]
1874 keys_to_remove_from_authorized_keys = dict([
1875 (uuid, keys) for (uuid, keys) in all_keys_to_remove.items()
1876 if uuid in nodes_remove_from_authorized_keys])
1877 base_data[constants.SSHS_SSH_AUTHORIZED_KEYS] = \
1878 (constants.SSHS_REMOVE, keys_to_remove_from_authorized_keys)
1879 (auth_key_file, _) = \
1880 ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False,
1881 dircheck=False)
1882
1883 for uuid in nodes_remove_from_authorized_keys:
1884 ssh.RemoveAuthorizedKeys(auth_key_file,
1885 keys_to_remove_from_authorized_keys[uuid])
1886
1887 pot_mc_data = base_data.copy()
1888
1889 if from_public_keys:
1890 nodes_remove_from_public_keys = [
1891 node_info.uuid for node_info in node_list
1892 if node_info.from_public_keys]
1893 keys_to_remove_from_public_keys = dict([
1894 (uuid, keys) for (uuid, keys) in all_keys_to_remove.items()
1895 if uuid in nodes_remove_from_public_keys])
1896 pot_mc_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1897 (constants.SSHS_REMOVE, keys_to_remove_from_public_keys)
1898
1899 all_nodes = ssconf_store.GetNodeList()
1900 online_nodes = ssconf_store.GetOnlineNodeList()
1901 all_nodes_to_remove = [node_info.name for node_info in node_list]
1902   logging.debug("Removing keys of nodes '%s' from all nodes except"
1903                 " themselves and the master.", ", ".join(all_nodes_to_remove))
1904 for node in all_nodes:
1905 if node == master_node:
1906 logging.debug("Skipping master node '%s'.", master_node)
1907 continue
1908 if node not in online_nodes:
1909 logging.debug("Skipping offline node '%s'.", node)
1910 continue
1911 if node in all_nodes_to_remove:
1912       logging.debug("Skipping node '%s' whose own key is being removed.", node)
1913 continue
1914 ssh_port = ssh_port_map.get(node)
1915 if not ssh_port:
1916 raise errors.OpExecError("No SSH port information available for"
1917 " node '%s', map: %s." %
1918 (node, ssh_port_map))
1919 error_msg_final = ("When removing the key of node '%s', updating the"
1920 " SSH key files of node '%s' failed. Last error"
1921 " was: %s.")
1922
1923 if node in potential_master_candidates or from_authorized_keys:
1924 if node in potential_master_candidates:
1925 node_desc = "potential master candidate"
1926 else:
1927 node_desc = "normal"
1928 logging.debug("Updating key setup of %s node %s.", node_desc, node)
1929 try:
1930 backoff = 5 # seconds
1931 utils.RetryByNumberOfTimes(
1932 constants.SSHS_MAX_RETRIES, backoff, errors.SshUpdateError,
1933 run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
1934 ssh_port, pot_mc_data,
1935 debug=ssh_update_debug, verbose=ssh_update_verbose,
1936 use_cluster_key=False, ask_key=False, strict_host_check=False)
1937 except errors.SshUpdateError as last_exception:
1938 error_msg = error_msg_final % (
1939               ", ".join(all_nodes_to_remove), node, last_exception)
1940 result_msgs.append((node, error_msg))
1941 logging.error(error_msg)
1942
1943 for node_info in node_list:
1944 if node_info.clear_authorized_keys or node_info.from_public_keys or \
1945 node_info.clear_public_keys:
1946 data = {}
1947 _InitSshUpdateData(data, noded_cert_file, ssconf_store)
1948 cluster_name = data[constants.SSHS_CLUSTER_NAME]
1949 ssh_port = ssh_port_map.get(node_info.name)
1950 if not ssh_port:
1951 raise errors.OpExecError("No SSH port information available for"
1952                                  " node '%s', which is leaving the cluster." % node_info.name)
1953
1954 if node_info.clear_authorized_keys:
1955 # The 'authorized_keys' file is not solely managed by Ganeti. Therefore,
1956 # we have to specify exactly which keys to clear to leave keys untouched
1957 # that were not added by Ganeti.
1958 other_master_candidate_uuids = [uuid for uuid in master_candidate_uuids
1959 if uuid != node_info.uuid]
1960 candidate_keys = ssh.QueryPubKeyFile(other_master_candidate_uuids,
1961 key_file=pub_key_file)
1962 data[constants.SSHS_SSH_AUTHORIZED_KEYS] = \
1963 (constants.SSHS_REMOVE, candidate_keys)
1964
1965 if node_info.clear_public_keys:
1966 data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1967 (constants.SSHS_CLEAR, {})
1968 elif node_info.from_public_keys:
1969         # Clearing the public keys file subsumes removing a single key,
1970         # so we only remove single keys when clear_public_keys is 'False'.
1971
1972 if all_keys_to_remove:
1973 data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1974 (constants.SSHS_REMOVE, all_keys_to_remove)
1975
1976     # If there are no changes to any keyfile for this node, continue
1977     if not (constants.SSHS_SSH_PUBLIC_KEYS in data or
1978             constants.SSHS_SSH_AUTHORIZED_KEYS in data):
1979       continue
1980
1981 logging.debug("Updating SSH key setup of target node '%s'.",
1982 node_info.name)
1983 try:
1984 backoff = 5 # seconds
1985 utils.RetryByNumberOfTimes(
1986 constants.SSHS_MAX_RETRIES, backoff,
1987 errors.SshUpdateError,
1988 run_cmd_fn, cluster_name, node_info.name, pathutils.SSH_UPDATE,
1989 ssh_port, data,
1990 debug=ssh_update_debug, verbose=ssh_update_verbose,
1991 use_cluster_key=False, ask_key=False, strict_host_check=False)
1992 except errors.SshUpdateError as last_exception:
1993 result_msgs.append(
1994 (node_info.name,
1995 ("Removing SSH keys from node '%s' failed."
1996 " This can happen when the node is already unreachable."
1997 " Error: %s" % (node_info.name, last_exception))))
1998
1999 if all_keys_to_remove and from_public_keys:
2000 for node_uuid in nodes_remove_from_public_keys:
2001 ssh.RemovePublicKey(node_uuid, key_file=pub_key_file)
2002
2003 return result_msgs
2004 # pylint: enable=R0913
2005
2006
2007 def RemoveSshKeyFromPublicKeyFile(node_name,
2008 pub_key_file=pathutils.SSH_PUB_KEYS,
2009 ssconf_store=None):
2010 """Removes a SSH key from the master's public key file.
2011
2012 This is an operation that is only used to clean up after failed operations
2013 (for example failed hooks before adding a node). To avoid abuse of this
2014 function (and the matching RPC call), a safety check ensures that only
2015 stray keys can be removed, i.e. keys belonging to nodes that are not (or
2016 no longer) part of the cluster.
2017
2018 @type node_name: string
2019 @param node_name: the name of the node whose key is removed
2020
2021 """
2022 if not ssconf_store:
2023 ssconf_store = ssconf.SimpleStore()
2024
2025 node_list = ssconf_store.GetNodeList()
2026
2027 if node_name in node_list:
2028 raise errors.SshUpdateError("Cannot remove key of node '%s',"
2029 " because it still belongs to the cluster."
2030 % node_name)
2031
2032 keys_by_name = ssh.QueryPubKeyFile([node_name], key_file=pub_key_file)
2033 if not keys_by_name or node_name not in keys_by_name:
2034 logging.info("The node '%s' whose key is supposed to be removed does not"
2035 " have an entry in the public key file. Hence, there is"
2036                  " nothing left to do.", node_name)
    return
2037 
2038   ssh.RemovePublicKey(node_name, key_file=pub_key_file)
2039
2040
2041 def _GenerateNodeSshKey(node_name, ssh_port_map, ssh_key_type, ssh_key_bits,
2042 ssconf_store=None,
2043 noded_cert_file=pathutils.NODED_CERT_FILE,
2044 run_cmd_fn=ssh.RunSshCmdWithStdin,
2045 suffix="",
2046 ssh_update_debug=False,
2047 ssh_update_verbose=False):
2048 """Generates the root SSH key pair on the node.
2049
2050 @type node_name: str
2051 @param node_name: name of the node whose key pair is generated
2052 @type ssh_port_map: dict of str to int
2053 @param ssh_port_map: mapping of node names to their SSH port
2054 @type ssh_key_type: One of L{constants.SSHK_ALL}
2055 @param ssh_key_type: the type of SSH key to be generated
2056 @type ssh_key_bits: int
2057 @param ssh_key_bits: the length of the key to be generated
2058
2059 """
2060 if not ssconf_store:
2061 ssconf_store = ssconf.SimpleStore()
2062
2063 data = {}
2064 _InitSshUpdateData(data, noded_cert_file, ssconf_store)
2065 cluster_name = data[constants.SSHS_CLUSTER_NAME]
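  # The ssh_update helper invoked on the target node (pathutils.SSH_UPDATE)
  # interprets this payload as a request to (re)generate the node's root SSH
  # key pair with the given type and length; a non-empty 'suffix' makes it
  # store the new pair under a suffixed file name (as used when renewing the
  # master's key)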
2066 data[constants.SSHS_GENERATE] = (ssh_key_type, ssh_key_bits, suffix)
2067
2068 run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE,
2069 ssh_port_map.get(node_name), data,
2070 debug=ssh_update_debug, verbose=ssh_update_verbose,
2071 use_cluster_key=False, ask_key=False, strict_host_check=False)
2072
2073
2074 def _GetMasterNodeUUID(node_uuid_name_map, master_node_name):
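  """Looks up the UUID of the master node in a list of (uuid, name) pairs.

  @raise errors.SshUpdateError: if the master node's name maps to zero or
    more than one UUID

  """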
2075 master_node_uuids = [node_uuid for (node_uuid, node_name)
2076 in node_uuid_name_map
2077 if node_name == master_node_name]
2078 if len(master_node_uuids) != 1:
2079 raise errors.SshUpdateError("No (unique) master UUID found. Master node"
2080 " name: '%s', Master UUID: '%s'" %
2081 (master_node_name, master_node_uuids))
2082 return master_node_uuids[0]
2083
2084
2085 def _GetOldMasterKeys(master_node_uuid, pub_key_file):
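  """Returns the master node's current public key(s) from the public key file.

  @raise errors.SshUpdateError: if the master node has no entry in the file

  """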
2086 old_master_keys_by_uuid = ssh.QueryPubKeyFile([master_node_uuid],
2087 key_file=pub_key_file)
2088 if not old_master_keys_by_uuid:
2089 raise errors.SshUpdateError("No public key of the master node (UUID '%s')"
2090 " found, not generating a new key."
2091 % master_node_uuid)
2092 return old_master_keys_by_uuid
2093
2094
2095 def RenewSshKeys(node_uuids, node_names, master_candidate_uuids,
2096 potential_master_candidates, old_key_type, new_key_type,
2097 new_key_bits,
2098 ganeti_pub_keys_file=pathutils.SSH_PUB_KEYS,
2099 ssconf_store=None,
2100 noded_cert_file=pathutils.NODED_CERT_FILE,
2101 run_cmd_fn=ssh.RunSshCmdWithStdin,
2102 ssh_update_debug=False,
2103 ssh_update_verbose=False):
2104 """Renews all SSH keys and updates authorized_keys and ganeti_pub_keys.
2105
2106 @type node_uuids: list of str
2107 @param node_uuids: list of node UUIDs whose keys should be renewed
2108 @type node_names: list of str
2109 @param node_names: list of node names whose keys should be renewed. This list
2110 should match the C{node_uuids} parameter
2111 @type master_candidate_uuids: list of str
2112 @param master_candidate_uuids: list of UUIDs of master candidates or
2113 master node
2114 @type old_key_type: One of L{constants.SSHK_ALL}
2115 @param old_key_type: the type of SSH key already present on nodes
2116 @type new_key_type: One of L{constants.SSHK_ALL}
2117 @param new_key_type: the type of SSH key to be generated
2118 @type new_key_bits: int
2119 @param new_key_bits: the length of the key to be generated
2120 @type ganeti_pub_keys_file: str
2121 @param ganeti_pub_keys_file: file path of the public key file
2122 @type noded_cert_file: str
2123 @param noded_cert_file: path of the noded SSL certificate file
2124 @type run_cmd_fn: function
2125 @param run_cmd_fn: function to run commands on remote nodes via SSH
2126 @raise ProgrammerError: if node_uuids and node_names don't match;
2127   SshUpdateError if a node's key is missing from the public key file,
2128   if a node's new SSH key could not be fetched from it, or if there is
2129   no entry or more than one entry in the public key list for the master
2130   node.
2131
2132 """
2133 if not ssconf_store:
2134 ssconf_store = ssconf.SimpleStore()
2135 cluster_name = ssconf_store.GetClusterName()
2136
2137   if len(node_uuids) != len(node_names):
2138 raise errors.ProgrammerError("List of nodes UUIDs and node names"
2139 " does not match in length.")
2140
2141 old_pub_keyfile = ssh.GetSshPubKeyFilename(old_key_type)
2142 new_pub_keyfile = ssh.GetSshPubKeyFilename(new_key_type)
2143 old_master_key = ssh.ReadLocalSshPubKeys([old_key_type])
2144
2145 node_uuid_name_map = zip(node_uuids, node_names)
2146
2147 master_node_name = ssconf_store.GetMasterNode()
2148 master_node_uuid = _GetMasterNodeUUID(node_uuid_name_map, master_node_name)
2149 ssh_port_map = ssconf_store.GetSshPortMap()
2150 # List of all node errors that happened, but which did not abort the
2151 # procedure as a whole. It is important that this is a list to have a
2152 # somewhat chronological history of events.
2153 all_node_errors = []
2154
2155 # process non-master nodes
2156
2157 # keys to add in bulk at the end
2158 node_keys_to_add = []
2159
2160 # list of all nodes
2161 node_list = []
2162
2163 # list of keys to be removed before generating new keys
2164 node_info_to_remove = []
2165
2166 for node_uuid, node_name in node_uuid_name_map:
2167 if node_name == master_node_name:
2168 continue
2169 master_candidate = node_uuid in master_candidate_uuids
2170 potential_master_candidate = node_name in potential_master_candidates
2171 node_list.append((node_uuid, node_name, master_candidate,
2172 potential_master_candidate))
2173
2174 if master_candidate:
2175 logging.debug("Fetching old SSH key from node '%s'.", node_name)
2176 old_pub_key = ssh.ReadRemoteSshPubKey(old_pub_keyfile,
2177 node_name, cluster_name,
2178 ssh_port_map[node_name],
2179 False, # ask_key
2180 False) # key_check
2181 if old_pub_key != old_master_key:
2182 # If we are already in a multi-key setup (that is past Ganeti 2.12),
2183 # we can safely remove the old key of the node. Otherwise, we cannot
2184 # remove that node's key, because it is also the master node's key
2185 # and that would terminate all communication from the master to the
2186 # node.
2187 node_info_to_remove.append(SshRemoveNodeInfo(
2188 uuid=node_uuid,
2189 name=node_name,
2190 from_authorized_keys=master_candidate,
2191 from_public_keys=False,
2192 clear_authorized_keys=False,
2193 clear_public_keys=False))
2194 else:
2195 logging.debug("Old key of node '%s' is the same as the current master"
2196 " key. Not deleting that key on the node.", node_name)
2197
2198 logging.debug("Removing old SSH keys of all master candidates.")
2199 if node_info_to_remove:
2200 node_errors = RemoveNodeSshKeyBulk(
2201 node_info_to_remove,
2202 master_candidate_uuids,
2203 potential_master_candidates,
2204 master_uuid=master_node_uuid,
2205 pub_key_file=ganeti_pub_keys_file,
2206 ssconf_store=ssconf_store,
2207 noded_cert_file=noded_cert_file,
2208 run_cmd_fn=run_cmd_fn,
2209 ssh_update_debug=ssh_update_debug,
2210 ssh_update_verbose=ssh_update_verbose)
2211 if node_errors:
2212 all_node_errors = all_node_errors + node_errors
2213
2214 for (node_uuid, node_name, master_candidate, potential_master_candidate) \
2215 in node_list:
2216
2217 logging.debug("Generating new SSH key for node '%s'.", node_name)
2218 _GenerateNodeSshKey(node_name, ssh_port_map, new_key_type, new_key_bits,
2219 ssconf_store=ssconf_store,
2220 noded_cert_file=noded_cert_file,
2221 run_cmd_fn=run_cmd_fn,
2222 ssh_update_verbose=ssh_update_verbose,
2223 ssh_update_debug=ssh_update_debug)
2224
2225 try:
2226 logging.debug("Fetching newly created SSH key from node '%s'.", node_name)
2227 pub_key = ssh.ReadRemoteSshPubKey(new_pub_keyfile,
2228 node_name, cluster_name,
2229 ssh_port_map[node_name],
2230 False, # ask_key
2231 False) # key_check
2232     except Exception:
2233 raise errors.SshUpdateError("Could not fetch key of node %s"
2234 " (UUID %s)" % (node_name, node_uuid))
2235
2236 if potential_master_candidate:
2237 ssh.RemovePublicKey(node_uuid, key_file=ganeti_pub_keys_file)
2238 ssh.AddPublicKey(node_uuid, pub_key, key_file=ganeti_pub_keys_file)
2239
2240 node_info = SshAddNodeInfo(name=node_name,
2241 uuid=node_uuid,
2242 to_authorized_keys=master_candidate,
2243 to_public_keys=potential_master_candidate,
2244 get_public_keys=True)
2245 node_keys_to_add.append(node_info)
2246
2247 node_errors = AddNodeSshKeyBulk(
2248 node_keys_to_add, potential_master_candidates,
2249 pub_key_file=ganeti_pub_keys_file, ssconf_store=ssconf_store,
2250 noded_cert_file=noded_cert_file,
2251 run_cmd_fn=run_cmd_fn,
2252 ssh_update_debug=ssh_update_debug,
2253 ssh_update_verbose=ssh_update_verbose)
2254 if node_errors:
2255 all_node_errors = all_node_errors + node_errors
2256
2257 # Renewing the master node's key
2258
2259 # Preserve the old keys for now
2260 old_master_keys_by_uuid = _GetOldMasterKeys(master_node_uuid,
2261 ganeti_pub_keys_file)
2262
2263 # Generate a new master key with a suffix, don't touch the old one for now
2264   logging.debug("Generating new SSH key for the master node.")
2265 _GenerateNodeSshKey(master_node_name, ssh_port_map,
2266 new_key_type, new_key_bits,
2267 ssconf_store=ssconf_store,
2268 noded_cert_file=noded_cert_file,
2269 run_cmd_fn=run_cmd_fn,
2270 suffix=constants.SSHS_MASTER_SUFFIX,
2271 ssh_update_debug=ssh_update_debug,
2272 ssh_update_verbose=ssh_update_verbose)
2273 # Read newly created master key
2274 new_master_keys = ssh.ReadLocalSshPubKeys(
2275 [new_key_type], suffix=constants.SSHS_MASTER_SUFFIX)
2276
2277   # Replace master key in the master node's public key file
2278 ssh.RemovePublicKey(master_node_uuid, key_file=ganeti_pub_keys_file)
2279 for pub_key in new_master_keys:
2280 ssh.AddPublicKey(master_node_uuid, pub_key, key_file=ganeti_pub_keys_file)
2281
2282   # Add new master key to all nodes' public and authorized keys
2283 logging.debug("Add new master key to all nodes.")
2284 node_errors = AddNodeSshKey(
2285 master_node_uuid, master_node_name, potential_master_candidates,
2286 to_authorized_keys=True, to_public_keys=True,
2287 get_public_keys=False, pub_key_file=ganeti_pub_keys_file,
2288 ssconf_store=ssconf_store, noded_cert_file=noded_cert_file,
2289 run_cmd_fn=run_cmd_fn,
2290 ssh_update_debug=ssh_update_debug,
2291 ssh_update_verbose=ssh_update_verbose)
2292 if node_errors:
2293 all_node_errors = all_node_errors + node_errors
2294
2295 # Remove the old key file and rename the new key to the non-temporary filename
2296 ssh.ReplaceSshKeys(new_key_type, new_key_type,
2297 src_key_suffix=constants.SSHS_MASTER_SUFFIX)
2298
2299 # Remove old key from authorized keys
2300 (auth_key_file, _) = \
2301 ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
2302 ssh.RemoveAuthorizedKeys(auth_key_file,
2303 old_master_keys_by_uuid[master_node_uuid])
2304
2305   # Remove the old key from all nodes' authorized keys files
2306 logging.debug("Remove the old master key from all nodes.")
2307 node_errors = RemoveNodeSshKey(
2308 master_node_uuid, master_node_name, master_candidate_uuids,
2309 potential_master_candidates,
2310 keys_to_remove=old_master_keys_by_uuid, from_authorized_keys=True,
2311 from_public_keys=False, clear_authorized_keys=False,
2312 clear_public_keys=False,
2313 pub_key_file=ganeti_pub_keys_file,
2314 ssconf_store=ssconf_store,
2315 noded_cert_file=noded_cert_file,
2316 run_cmd_fn=run_cmd_fn,
2317 ssh_update_debug=ssh_update_debug,
2318 ssh_update_verbose=ssh_update_verbose)
2319 if node_errors:
2320 all_node_errors = all_node_errors + node_errors
2321
2322 return all_node_errors
2323
2324
2325 def GetBlockDevSizes(devices):
2326 """Return the size of the given block devices
2327
2328 @type devices: list
2329 @param devices: list of block device nodes to query
2330 @rtype: dict
2331 @return:
2332 dictionary of all block devices under /dev (key). The value is their
2333     size in MiB::
2334
2335 {'/dev/disk/by-uuid/123456-12321231-312312-312': 124}
2336
2337 """
2338 DEV_PREFIX = "/dev/"
2339 blockdevs = {}
2340
2341 for devpath in devices:
2342 if not utils.IsBelowDir(DEV_PREFIX, devpath):
2343 continue
2344
2345 try:
2346 st = os.stat(devpath)
2347 except EnvironmentError, err:
2348 logging.warning("Error stat()'ing device %s: %s", devpath, str(err))
2349 continue
2350
2351 if stat.S_ISBLK(st.st_mode):
2352 result = utils.RunCmd(["blockdev", "--getsize64", devpath])
2353 if result.failed:
2354 # We don't want to fail, just do not list this device as available
2355 logging.warning("Cannot get size for block device %s", devpath)
2356 continue
2357
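      # "blockdev --getsize64" reports the size in bytes; convert to MiB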
2358 size = int(result.stdout) / (1024 * 1024)
2359 blockdevs[devpath] = size
2360 return blockdevs
2361
2362
2363 def GetVolumeList(vg_names):
2364 """Compute list of logical volumes and their size.
2365
2366 @type vg_names: list
2367 @param vg_names: the volume groups whose LVs we should list, or
2368 empty for all volume groups
2369 @rtype: dict
2370 @return:
2371     dictionary of all logical volumes (key) with value being a tuple of
2372 their size (in MiB), inactive and online status::
2373
2374 {'xenvg/test1': ('20.06', True, True)}
2375
2376 in case of errors, a string is returned with the error
2377 details.
2378
2379 """
2380 lvs = {}
2381 sep = "|"
2382 if not vg_names:
2383 vg_names = []
2384 result = utils.RunCmd(["lvs", "--noheadings", "--units=m", "--nosuffix",
2385 "--separator=%s" % sep,
2386 "-ovg_name,lv_name,lv_size,lv_attr"] + vg_names)
2387 if result.failed:
2388 _Fail("Failed to list logical volumes, lvs output: %s", result.output)
2389
2390 for line in result.stdout.splitlines():
2391 line = line.strip()
2392 match = _LVSLINE_REGEX.match(line)
2393 if not match:
2394 logging.error("Invalid line returned from lvs output: '%s'", line)
2395 continue
2396 vg_name, name, size, attr = match.groups()
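    # lv_attr is a positional flag string (see lvs(8)): character 0 encodes
    # the volume type ('v' for virtual), character 4 the state and
    # character 5 the device open status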
2397 inactive = attr[4] == "-"
2398 online = attr[5] == "o"
2399 virtual = attr[0] == "v"
2400 if virtual:
2401 # we don't want to report such volumes as existing, since they
2402 # don't really hold data
2403 continue
2404 lvs[vg_name + "/" + name] = (size, inactive, online)
2405
2406 return lvs
2407
2408
2409 def ListVolumeGroups():
2410 """List the volume groups and their size.
2411
2412 @rtype: dict
2413   @return: dictionary with volume group names as keys and their
2414     size as values
2415
2416 """
2417 return utils.ListVolumeGroups()
2418
2419
2420 def NodeVolumes():
2421 """List all volumes on this node.
2422
2423 @rtype: list
2424 @return:
2425 A list of dictionaries, each having four keys:
2426 - name: the logical volume name,
2427 - size: the size of the logical volume
2428 - dev: the physical device on which the LV lives
2429 - vg: the volume group to which it belongs
2430
2431 In case of errors, we return an empty list and log the
2432 error.
2433
2434 Note that since a logical volume can live on multiple physical
2435 volumes, the resulting list might include a logical volume
2436 multiple times.
2437
2438 """
2439 result = utils.RunCmd(["lvs", "--noheadings", "--units=m", "--nosuffix",
2440 "--separator=|",
2441 "--options=lv_name,lv_size,devices,vg_name"])
2442 if result.failed:
2443 _Fail("Failed to list logical volumes, lvs output: %s",
2444 result.output)
2445
2446 def parse_dev(dev):
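    # lvs reports each device as e.g. "/dev/sda1(0)"; strip the extent info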
2447 return dev.split("(")[0]
2448
2449 def handle_dev(dev):
2450 return [parse_dev(x) for x in dev.split(",")]
2451
2452 def map_line(line):
2453 line = [v.strip() for v in line]
2454 return [{"name": line[0], "size": line[1],
2455 "dev": dev, "vg": line[3]} for dev in handle_dev(line[2])]
2456
2457 all_devs = []
2458 for line in result.stdout.splitlines():
2459 if line.count("|") >= 3:
2460 all_devs.extend(map_line(line.split("|")))
2461 else:
2462 logging.warning("Strange line in the output from lvs: '%s'", line)
2463 return all_devs
2464
2465
2466 def BridgesExist(bridges_list):
2467 """Check if a list of bridges exist on the current node.
2468
2469 @rtype: boolean
2470 @return: C{True} if all of them exist, C{False} otherwise
2471
2472 """
2473 missing = []
2474 for bridge in bridges_list:
2475 if not utils.BridgeExists(bridge):
2476 missing.append(bridge)
2477
2478 if missing:
2479 _Fail("Missing bridges %s", utils.CommaJoin(missing))
2480
2481
2482 def GetInstanceListForHypervisor(hname, hvparams=None,
2483 get_hv_fn=hypervisor.GetHypervisor):
2484 """Provides a list of instances of the given hypervisor.
2485
2486 @type hname: string
2487 @param hname: name of the hypervisor
2488 @type hvparams: dict of strings
2489 @param hvparams: hypervisor parameters for the given hypervisor
2490 @type get_hv_fn: function
2491 @param get_hv_fn: function that returns a hypervisor for the given hypervisor
2492 name; optional parameter to increase testability
2493
2494 @rtype: list
2495 @return: a list of all running instances on the current node
2496 - instance1.example.com
2497 - instance2.example.com
2498
2499 """
2500 try:
2501 return get_hv_fn(hname).ListInstances(hvparams=hvparams)
2502 except errors.HypervisorError, err:
2503 _Fail("Error enumerating instances (hypervisor %s): %s",
2504 hname, err, exc=True)
2505
2506
2507 def GetInstanceList(hypervisor_list, all_hvparams=None,
2508 get_hv_fn=hypervisor.GetHypervisor):
2509 """Provides a list of instances.
2510
2511 @type hypervisor_list: list
2512 @param hypervisor_list: the list of hypervisors to query information
2513 @type all_hvparams: dict of dict of strings
2514 @param all_hvparams: a dictionary mapping hypervisor types to respective
2515 cluster-wide hypervisor parameters
2516 @type get_hv_fn: function
2517 @param get_hv_fn: function that returns a hypervisor for the given hypervisor
2518 name; optional parameter to increase testability
2519
2520 @rtype: list
2521 @return: a list of all running instances on the current node
2522 - instance1.example.com
2523 - instance2.example.com
2524
2525 """
2526 results = []
2527 for hname in hypervisor_list:
2528 hvparams = all_hvparams[hname]
2529 results.extend(GetInstanceListForHypervisor(hname, hvparams=hvparams,
2530 get_hv_fn=get_hv_fn))
2531 return results
2532
2533
2534 def GetInstanceInfo(instance, hname, hvparams=None):
2535 """Gives back the information about an instance as a dictionary.
2536
2537 @type instance: string
2538 @param instance: the instance name
2539 @type hname: string
2540 @param hname: the hypervisor type of the instance
2541 @type hvparams: dict of strings
2542 @param hvparams: the instance's hvparams
2543
2544 @rtype: dict
2545 @return: dictionary with the following keys:
2546 - memory: memory size of instance (int)
2547 - state: state of instance (HvInstanceState)
2548 - time: cpu time of instance (float)
2549 - vcpus: the number of vcpus (int)
2550
2551 """
2552 output = {}
2553
2554 iinfo = hypervisor.GetHypervisor(hname).GetInstanceInfo(instance,
2555 hvparams=hvparams)
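  # iinfo is a tuple of (name, ID, memory, vcpus, state, cpu time); see also
  # the unpacking in L{GetAllInstancesInfo}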
2556 if iinfo is not None:
2557 output["memory"] = iinfo[2]
2558 output["vcpus"] = iinfo[3]
2559 output["state"] = iinfo[4]
2560 output["time"] = iinfo[5]
2561
2562 return output
2563
2564
2565 def GetInstanceMigratable(instance):
2566 """Computes whether an instance can be migrated.
2567
2568 @type instance: L{objects.Instance}
2569 @param instance: object representing the instance to be checked.
2570
2571   @rtype: None
2572   @raise RPCFail: if the instance is not running on this node
2575
2576 """
2577 hyper = hypervisor.GetHypervisor(instance.hypervisor)
2578 iname = instance.name
2579 if iname not in hyper.ListInstances(hvparams=instance.hvparams):
2580 _Fail("Instance %s is not running", iname)
2581
2582 for idx in range(len(instance.disks_info)):
2583 link_name = _GetBlockDevSymlinkPath(iname, idx)
2584 if not os.path.islink(link_name):
2585 logging.warning("Instance %s is missing symlink %s for disk %d",
2586 iname, link_name, idx)
2587
2588
2589 def GetAllInstancesInfo(hypervisor_list, all_hvparams):
2590 """Gather data about all instances.
2591
2592 This is the equivalent of L{GetInstanceInfo}, except that it
2593 computes data for all instances at once, thus being faster if one
2594 needs data about more than one instance.
2595
2596 @type hypervisor_list: list
2597 @param hypervisor_list: list of hypervisors to query for instance data
2598 @type all_hvparams: dict of dict of strings
2599 @param all_hvparams: mapping of hypervisor names to hvparams
2600
2601 @rtype: dict
2602 @return: dictionary of instance: data, with data having the following keys:
2603 - memory: memory size of instance (int)
2604       - state: state of instance (HvInstanceState)
2605 - time: cpu time of instance (float)
2606 - vcpus: the number of vcpus
2607
2608 """
2609 output = {}
2610 for hname in hypervisor_list:
2611 hvparams = all_hvparams[hname]
2612 iinfo = hypervisor.GetHypervisor(hname).GetAllInstancesInfo(hvparams)
2613 if iinfo:
2614 for name, _, memory, vcpus, state, times in iinfo:
2615 value = {
2616 "memory": memory,
2617 "vcpus": vcpus,
2618 "state": state,
2619 "time": times,
2620 }
2621 if name in output:
2622 # we only check static parameters, like memory and vcpus,
2623 # and not state and time which can change between the
2624 # invocations of the different hypervisors
2625 for key in "memory", "vcpus":
2626 if value[key] != output[name][key]:
2627 _Fail("Instance %s is running twice"
2628 " with different parameters", name)
2629 output[name] = value
2630
2631 return output
2632
2633
2634 def GetInstanceConsoleInfo(instance_param_dict,
2635 get_hv_fn=hypervisor.GetHypervisor):
2636 """Gather data about the console access of a set of instances of this node.
2637
2638 This function assumes that the caller already knows which instances are on
2639 this node, by calling a function such as L{GetAllInstancesInfo} or
2640 L{GetInstanceList}.
2641
2642 For every instance, a large amount of configuration data needs to be
2643 provided to the hypervisor interface in order to receive the console
2644 information. Whether this could or should be cut down can be discussed.
2645 The information is provided in a dictionary indexed by instance name,
2646 allowing any number of instance queries to be done.
2647
2648 @type instance_param_dict: dict of string to tuple of dictionaries, where the
2649 dictionaries represent: L{objects.Instance}, L{objects.Node},
2650 L{objects.NodeGroup}, HvParams, BeParams
2651 @param instance_param_dict: mapping of instance name to parameters necessary
2652 for console information retrieval
2653
2654 @rtype: dict
2655 @return: dictionary of instance: data, with data having the following keys:
2656 - instance: instance name
2657 - kind: console kind
2658 - message: used with kind == CONS_MESSAGE, indicates console to be
2659 unavailable, supplies error message
2660 - host: host to connect to
2661 - port: port to use
2662 - user: user for login
2663 - command: the command, broken into parts as an array
2664 - display: unknown, potentially unused?
2665
2666 """
2667
2668 output = {}
2669 for inst_name in instance_param_dict:
2670 instance = instance_param_dict[inst_name]["instance"]
2671 pnode = instance_param_dict[inst_name]["node"]
2672 group = instance_param_dict[inst_name]["group"]
2673 hvparams = instance_param_dict[inst_name]["hvParams"]
2674 beparams = instance_param_dict[inst_name]["beParams"]
2675
2676 instance = objects.Instance.FromDict(instance)
2677 pnode = objects.Node.FromDict(pnode)
2678 group = objects.NodeGroup.FromDict(group)
2679
2680 h = get_hv_fn(instance.hypervisor)
2681 output[inst_name] = h.GetInstanceConsole(instance, pnode, group,
2682 hvparams, beparams).ToDict()
2683
2684 return output
2685
2686
2687 def _InstanceLogName(kind, os_name, instance, component):
2688 """Compute the OS log filename for a given instance and operation.
2689
2690 The instance name and os name are passed in as strings since not all
2691 operations have these as part of an instance object.
2692
2693 @type kind: string
2694 @param kind: the operation type (e.g. add, import, etc.)
2695 @type os_name: string
2696 @param os_name: the os name
2697 @type instance: string
2698 @param instance: the name of the instance being imported/added/etc.
2699 @type component: string or None
2700 @param component: the name of the component of the instance being
2701 transferred
2702
2703 """
2704 # TODO: Use tempfile.mkstemp to create unique filename
2705 if component:
2706 assert "/" not in component
2707 c_msg = "-%s" % component
2708 else:
2709 c_msg = ""
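  # Resulting name (illustrative):
  #   "import-debootstrap-inst1.example.com-disk0-<timestamp>.log"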
2710 base = ("%s-%s-%s%s-%s.log" %
2711 (kind, os_name, instance, c_msg, utils.TimestampForFilename()))
2712 return utils.PathJoin(pathutils.LOG_OS_DIR, base)
2713
2714
2715 def InstanceOsAdd(instance, reinstall, debug):
2716 """Add an OS to an instance.
2717
2718 @type instance: L{objects.Instance}
2719 @param instance: Instance whose OS is to be installed
2720 @type reinstall: boolean
2721 @param reinstall: whether this is an instance reinstall
2722 @type debug: integer
2723 @param debug: debug level, passed to the OS scripts
2724 @rtype: None
2725
2726 """
2727 inst_os = OSFromDisk(instance.os)
2728
2729 create_env = OSEnvironment(instance, inst_os, debug)
2730 if reinstall:
2731 create_env["INSTANCE_REINSTALL"] = "1"
2732
2733 logfile = _InstanceLogName("add", instance.os, instance.name, None)
2734
2735 result = utils.RunCmd([inst_os.create_script], env=create_env,
2736 cwd=inst_os.path, output=logfile, reset_env=True)
2737 if result.failed:
2738 logging.error("os create command '%s' returned error: %s, logfile: %s,"
2739 " output: %s", result.cmd, result.fail_reason, logfile,
2740 result.output)
2741 lines = [utils.SafeEncode(val)
2742 for val in utils.TailFile(logfile, lines=20)]
2743 _Fail("OS create script failed (%s), last lines in the"
2744 " log file:\n%s", result.fail_reason, "\n".join(lines), log=False)
2745
2746
2747 def RunRenameInstance(instance, old_name, debug):
2748 """Run the OS rename script for an instance.
2749
2750 @type instance: L{objects.Instance}
2751 @param instance: Instance being renamed
2752 @type old_name: string
2753 @param old_name: previous instance name
2754 @type debug: integer
2755 @param debug: debug level, passed to the OS scripts
2756 @rtype: boolean
2757 @return: the success of the operation
2758
2759 """
2760 inst_os = OSFromDisk(instance.os)
2761
2762 rename_env = OSEnvironment(instance, inst_os, debug)
2763 rename_env["OLD_INSTANCE_NAME"] = old_name
2764
2765 logfile = _InstanceLogName("rename", instance.os,
2766 "%s-%s" % (old_name, instance.name), None)
2767
2768 result = utils.RunCmd([inst_os.rename_script], env=rename_env,
2769 cwd=inst_os.path, output=logfile, reset_env=True)
2770
2771 if result.failed:
2772     logging.error("os rename command '%s' returned error: %s output: %s",
2773 result.cmd, result.fail_reason, result.output)
2774 lines = [utils.SafeEncode(val)
2775 for val in utils.TailFile(logfile, lines=20)]
2776 _Fail("OS rename script failed (%s), last lines in the"
2777 " log file:\n%s", result.fail_reason, "\n".join(lines), log=False)
2778
2779
2780 def _GetBlockDevSymlinkPath(instance_name, idx, _dir=None):
2781 """Returns symlink path for block device.
2782
2783 """
2784 if _dir is None:
2785 _dir = pathutils.DISK_LINKS_DIR
2786
2787 return utils.PathJoin(_dir,
2788 ("%s%s%s" %
2789 (instance_name, constants.DISK_SEPARATOR, idx)))
2790
2791
2792 def _SymlinkBlockDev(instance_name, device_path, idx):
2793   """Set up symlinks to an instance's block device.
2794 
2795   This is an auxiliary function run when an instance is started (on the
2796   primary node) or when an instance is migrated (on the target node).
2797 
2799 @param instance_name: the name of the target instance
2800 @param device_path: path of the physical block device, on the node
2801 @param idx: the disk index
2802 @return: absolute path to the disk's symlink
2803
2804 """
2805 # In case we have only a userspace access URI, device_path is None
2806 if not device_path:
2807 return None
2808
2809 link_name = _GetBlockDevSymlinkPath(instance_name, idx)
2810 try:
2811 os.symlink(device_path, link_name)
2812 except OSError, err:
2813 if err.errno == errno.EEXIST:
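      # a symlink may be left over from an earlier run; keep it only if it
      # already points at the requested device, otherwise replace it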
2814 if (not os.path.islink(link_name) or
2815 os.readlink(link_name) != device_path):
2816 os.remove(link_name)
2817 os.symlink(device_path, link_name)
2818 else:
2819 raise
2820
2821 return link_name
2822
2823
2824 def _RemoveBlockDevLinks(instance_name, disks):
2825 """Remove the block device symlinks belonging to the given instance.
2826
2827 """
2828 for idx, _ in enumerate(disks):
2829 link_name = _GetBlockDevSymlinkPath(instance_name, idx)
2830 if os.path.islink(link_name):
2831 try:
2832 os.remove(link_name)
2833 except OSError:
2834 logging.exception("Can't remove symlink '%s'", link_name)
2835
2836
2837 def _CalculateDeviceURI(instance, disk, device):
2838 """Get the URI for the device.
2839
2840 @type instance: L{objects.Instance}
2841 @param instance: the instance to which the disk belongs
2842 @type disk: L{objects.Disk}
2843 @param disk: the target disk object
2844 @type device: L{bdev.BlockDev}
2845 @param device: the corresponding BlockDevice
2846 @rtype: string
2847 @return: the device URI, if any, else None
2848
2849 """
2850 access_mode = disk.params.get(constants.LDP_ACCESS,
2851 constants.DISK_KERNELSPACE)
2852 if access_mode == constants.DISK_USERSPACE:
2853 # This can raise errors.BlockDeviceError
2854 return device.GetUserspaceAccessUri(instance.hypervisor)
2855 else:
2856 return None
2857
2858
2859 def _GatherAndLinkBlockDevs(instance):
2860 """Set up an instance's block device(s).
2861
2862 This is run on the primary node at instance startup. The block
2863 devices must be already assembled.
2864
2865 @type instance: L{objects.Instance}
2866 @param instance: the instance whose disks we should assemble
2867 @rtype: list
2868 @return: list of (disk_object, link_name, drive_uri)
2869
2870 """
2871 block_devices = []
2872 for idx, disk in enumerate(instance.disks_info):
2873 device = _RecursiveFindBD(disk)
2874 if device is None:
2875 raise errors.BlockDeviceError("Block device '%s' is not set up." %
2876 str(disk))
2877 device.Open()
2878 try:
2879 link_name = _SymlinkBlockDev(instance.name, device.dev_path, idx)
2880 except OSError, e:
2881 raise errors.BlockDeviceError("Cannot create block device symlink: %s" %
2882 e.strerror)
2883 uri = _CalculateDeviceURI(instance, disk, device)
2884
2885 block_devices.append((disk, link_name, uri))
2886
2887 return block_devices
2888
2889
2890 def _IsInstanceUserDown(instance_info):
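  """Checks whether the hypervisor reports the instance as shut down by the
  user (i.e. the instance is still defined, but in a shutdown state).

  """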
2891 return instance_info and \
2892 "state" in instance_info and \
2893 hv_base.HvInstanceState.IsShutdown(instance_info["state"])
2894
2895
2896 def _GetInstanceInfo(instance):
2897   """Helper function for L{GetInstanceInfo}"""
2898 return GetInstanceInfo(instance.name, instance.hypervisor,
2899 hvparams=instance.hvparams)
2900
2901
2902 def StartInstance(instance, startup_paused, reason, store_reason=True):
2903 """Start an instance.
2904
2905 @type instance: L{objects.Instance}
2906 @param instance: the instance object
2907 @type startup_paused: bool
2908 @param startup_paused: whether to pause the instance at startup
2909 @type reason: list of reasons
2910 @param reason: the reason trail for this startup
2911 @type store_reason: boolean
2912 @param store_reason: whether to store the shutdown reason trail on file
2913 @rtype: None
2914
2915 """
2916 try:
2917 instance_info = _GetInstanceInfo(instance)
2918 hyper = hypervisor.GetHypervisor(instance.hypervisor)
2919
2920 if instance_info and not _IsInstanceUserDown(instance_info):
2921 logging.info("Instance '%s' already running, not starting", instance.name)
2922 if hyper.VerifyInstance(instance):
2923 return
2924 logging.info("Instance '%s' hypervisor config out of date. Restoring.",
2925 instance.name)
2926 block_devices = _GatherAndLinkBlockDevs(instance)
2927 hyper.RestoreInstance(instance, block_devices)
2928 return
2929
2930 block_devices = _GatherAndLinkBlockDevs(instance)
2931 hyper.StartInstance(instance, block_devices, startup_paused)
2932 if store_reason:
2933 _StoreInstReasonTrail(instance.name, reason)
2934 except errors.BlockDeviceError, err:
2935 _Fail("Block device error: %s", err, exc=True)
2936 except errors.HypervisorError, err:
2937 _RemoveBlockDevLinks(instance.name, instance.disks_info)
2938 _Fail("Hypervisor error: %s", err, exc=True)
2939
2940
2941 def InstanceShutdown(instance, timeout, reason, store_reason=True):
2942 """Shut an instance down.
2943
2944 @note: this function uses polling with a hardcoded retry interval.
2945
2946 @type instance: L{objects.Instance}
2947 @param instance: the instance object
2948 @type timeout: integer
2949 @param timeout: maximum timeout for soft shutdown
2950 @type reason: list of reasons
2951 @param reason: the reason trail for this shutdown
2952 @type store_reason: boolean
2953 @param store_reason: whether to store the shutdown reason trail on file
2954 @rtype: None
2955
2956 """
2957 hyper = hypervisor.GetHypervisor(instance.hypervisor)
2958
2959 if not _GetInstanceInfo(instance):
2960 logging.info("Instance '%s' not running, doing nothing", instance.name)
2961 return
2962
2963 class _TryShutdown(object):
2964 def __init__(self):
2965 self.tried_once = False
2966
2967 def __call__(self):
2968 try:
2969 hyper.StopInstance(instance, retry=self.tried_once, timeout=timeout)
2970 if store_reason:
2971 _StoreInstReasonTrail(instance.name, reason)
2972 except errors.HypervisorError, err:
2973         # if the instance no longer exists, consider this a success and go to
2974 # cleanup, otherwise fail without retrying
2975 if _GetInstanceInfo(instance):
2976 _Fail("Failed to stop instance '%s': %s", instance.name, err)
2977 return
2978
2979 # TODO: Cleanup hypervisor implementations to prevent them from failing
2980 # silently. We could easily decide if we want to retry or not by using
2981 # HypervisorSoftError()/HypervisorHardError()
2982 self.tried_once = True
2983 if _GetInstanceInfo(instance):
2984 raise utils.RetryAgain()
2985
2986 try:
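    # retry the soft shutdown in 5-second intervals until 'timeout' expires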
2987 utils.Retry(_TryShutdown(), 5, timeout)
2988 except utils.RetryTimeout:
2989 # the shutdown did not succeed
2990 logging.error("Shutdown of '%s' unsuccessful, forcing", instance.name)
2991
2992 try:
2993 hyper.StopInstance(instance, force=True)
2994 except errors.HypervisorError, err:
2995 # only raise an error if the instance still exists, otherwise
2996 # the error could simply be "instance ... unknown"!
2997 if _GetInstanceInfo(instance):
2998 _Fail("Failed to force stop instance '%s': %s", instance.name, err)
2999
3000 time.sleep(1)
3001
3002 if _GetInstanceInfo(instance):
3003     _Fail("Could not shut down instance '%s' even by destroying it",
          instance.name)
3004
3005 try:
3006 hyper.CleanupInstance(instance.name)
3007 except errors.HypervisorError, err:
3008 logging.warning("Failed to execute post-shutdown cleanup step: %s", err)
3009
3010 _RemoveBlockDevLinks(instance.name, instance.disks_info)
3011
3012
3013 def InstanceReboot(instance, reboot_type, shutdown_timeout, reason):
3014 """Reboot an instance.
3015
3016 @type instance: L{objects.Instance}
3017 @param instance: the instance object to reboot
3018 @type reboot_type: str
3019 @param reboot_type: the type of reboot, one of the following
3020 constants:
3021 - L{constants.INSTANCE_REBOOT_SOFT}: only reboot the
3022 instance OS, do not recreate the VM
3023 - L{constants.INSTANCE_REBOOT_HARD}: tear down and
3024 restart the VM (at the hypervisor level)
3025 - the other reboot type (L{constants.INSTANCE_REBOOT_FULL}) is
3026 not accepted here, since that mode is handled differently, in
3027 cmdlib, and translates into full stop and start of the
3028 instance (instead of a call_instance_reboot RPC)
3029 @type shutdown_timeout: integer
3030 @param shutdown_timeout: maximum timeout for soft shutdown
3031 @type reason: list of reasons
3032 @param reason: the reason trail for this reboot
3033 @rtype: None
3034
3035 """
3036 # TODO: this is inconsistent with 'StartInstance' and 'InstanceShutdown'
3037 # because those functions simply 'return' on error whereas this one
3038 # raises an exception with '_Fail'
3039 if not _GetInstanceInfo(instance):
3040 _Fail("Cannot reboot instance '%s' that is not running", instance.name)
3041
3042 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3043 if reboot_type == constants.INSTANCE_REBOOT_SOFT:
3044 try:
3045 hyper.RebootInstance(instance)
3046 except errors.HypervisorError, err:
3047 _Fail("Failed to soft reboot instance '%s': %s", instance.name, err)
3048 elif reboot_type == constants.INSTANCE_REBOOT_HARD:
3049 try:
3050 InstanceShutdown(instance, shutdown_timeout, reason, store_reason=False)
3051 StartInstance(instance, False, reason, store_reason=False)
3052 _StoreInstReasonTrail(instance.name, reason)
3053 except errors.HypervisorError, err:
3054 _Fail("Failed to hard reboot instance '%s': %s", instance.name, err)
3055 else:
3056 _Fail("Invalid reboot_type received: '%s'", reboot_type)
3057
3058
3059 def InstanceBalloonMemory(instance, memory):
3060 """Resize an instance's memory.
3061
3062 @type instance: L{objects.Instance}
3063 @param instance: the instance object
3064 @type memory: int
3065 @param memory: new memory amount in MB
3066 @rtype: None
3067
3068 """
3069 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3070 running = hyper.ListInstances(hvparams=instance.hvparams)
3071 if instance.name not in running:
3072 logging.info("Instance %s is not running, cannot balloon", instance.name)
3073 return
3074 try:
3075 hyper.BalloonInstanceMemory(instance, memory)
3076 except errors.HypervisorError, err:
3077 _Fail("Failed to balloon instance memory: %s", err, exc=True)
3078
3079
3080 def MigrationInfo(instance):
3081 """Gather information about an instance to be migrated.
3082
3083 @type instance: L{objects.Instance}
3084 @param instance: the instance definition
3085
3086 """
3087 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3088 try:
3089 info = hyper.MigrationInfo(instance)
3090 except errors.HypervisorError, err:
3091 _Fail("Failed to fetch migration information: %s", err, exc=True)
3092 return info
3093
3094
3095 def AcceptInstance(instance, info, target):
3096 """Prepare the node to accept an instance.
3097
3098 @type instance: L{objects.Instance}
3099 @param instance: the instance definition
3100 @type info: string/data (opaque)
3101 @param info: migration information, from the source node
3102 @type target: string
3103 @param target: target host (usually an IP address), on this node
3104
3105 """
3106 hyper = hypervisor.GetHypervisor(instance.hypervisor)
3107 try:
3108 hyper.AcceptInstance(instance, info, target)
3109 except errors.HypervisorError, err:
3110 _Fail("Failed to accept instance: %s", err, exc=True)
3111
3112
def FinalizeMigrationDst(instance, info, success):
  """Finalize any preparation to accept an instance.

  @type instance: L{objects.Instance}
  @param instance: the instance definition
  @type info: string/data (opaque)
  @param info: migration information, from the source node
  @type success: boolean
  @param success: whether the migration was a success or a failure

  """
  hyper = hypervisor.GetHypervisor(instance.hypervisor)
  try:
    hyper.FinalizeMigrationDst(instance, info, success)
  except errors.HypervisorError, err:
    _Fail("Failed to finalize migration on the target node: %s", err, exc=True)


def MigrateInstance(cluster_name, instance, target, live):
  """Migrates an instance to another node.

  @type cluster_name: string
  @param cluster_name: name of the cluster
  @type instance: L{objects.Instance}
  @param instance: the instance definition
  @type target: string
  @param target: the target node name
  @type live: boolean
  @param live: whether the migration should be done live or not (the
      interpretation of this parameter is left to the hypervisor)
  @raise RPCFail: if migration fails for some reason

  """
  hyper = hypervisor.GetHypervisor(instance.hypervisor)

  try:
    hyper.MigrateInstance(cluster_name, instance, target, live)
  except errors.HypervisorError, err:
    _Fail("Failed to migrate instance: %s", err, exc=True)


def FinalizeMigrationSource(instance, success, live):
  """Finalize the instance migration on the source node.

  @type instance: L{objects.Instance}
  @param instance: the instance definition of the migrated instance
  @type success: bool
  @param success: whether the migration succeeded or not
  @type live: bool
  @param live: whether the user requested a live migration or not
  @raise RPCFail: If the execution fails for some reason

  """
  hyper = hypervisor.GetHypervisor(instance.hypervisor)

  try:
    hyper.FinalizeMigrationSource(instance, success, live)
  except Exception, err:  # pylint: disable=W0703
    _Fail("Failed to finalize the migration on the source node: %s", err,
          exc=True)


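# The five functions above implement the node-side half of the migration
# handshake. A straight-line sketch of the order of operations; in a real
# cluster the master invokes each step via RPC on the node named in the
# trailing comment, and "target_ip"/"target" are placeholders:
#
#   info = MigrationInfo(instance)                               # source
#   AcceptInstance(instance, info, target_ip)                    # target
#   MigrateInstance(cluster_name, instance, target, live=True)   # source
#   FinalizeMigrationSource(instance, success=True, live=True)   # source
#   FinalizeMigrationDst(instance, info, success=True)           # target

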
def GetMigrationStatus(instance):
  """Get the migration status.

  @type instance: L{objects.Instance}
  @param instance: the instance that is being migrated
  @rtype: L{objects.MigrationStatus}
  @return: the status of the current migration (one of
      L{constants.HV_MIGRATION_VALID_STATUSES}), plus any additional
      progress info that can be retrieved from the hypervisor
  @raise RPCFail: If the migration status cannot be retrieved

  """
  hyper = hypervisor.GetHypervisor(instance.hypervisor)
  try:
    return hyper.GetMigrationStatus(instance)
  except Exception, err:  # pylint: disable=W0703
    _Fail("Failed to get migration status: %s", err, exc=True)


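# A minimal polling sketch on top of GetMigrationStatus. It assumes that
# constants.HV_MIGRATION_ACTIVE marks a migration that is still in
# progress; the helper name and the two-second interval are invented for
# the example ("time" is already imported at module level):
#
#   def _WaitForMigrationEnd(instance, poll_interval=2):
#     status = GetMigrationStatus(instance)
#     while status.status == constants.HV_MIGRATION_ACTIVE:
#       time.sleep(poll_interval)
#       status = GetMigrationStatus(instance)
#     return status

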
def HotplugDevice(instance, action, dev_type, device, extra, seq):
  """Hotplug a device.

  Hotplug is currently supported only for the KVM hypervisor.

  @type instance: L{objects.Instance}
  @param instance: the instance to which we hotplug a device
  @type action: string
  @param action: the hotplug action to perform
  @type dev_type: string
  @param dev_type: the device type to hotplug
  @type device: either L{objects.NIC} or L{objects.Disk}
  @param device: the device object to hotplug
  @type extra: tuple
  @param extra: extra info used for disk hotplug (disk link, drive uri)
  @type seq: int
  @param seq: the index of the device from the master's perspective
  @raise RPCFail: in case the instance's hypervisor does not support hotplug

  """
  hyper = hypervisor.GetHypervisor(instance.hypervisor)
  try:
    hyper.VerifyHotplugSupport(instance, action, dev_type)
  except errors.HotplugError, err:
    _Fail("Hotplug is not supported: %s", err)

  if action == constants.HOTPLUG_ACTION_ADD:
    fn = hyper.HotAddDevice
  elif action == constants.HOTPLUG_ACTION_REMOVE:
    fn = hyper.HotDelDevice
  elif action == constants.HOTPLUG_ACTION_MODIFY:
    fn = hyper.HotModDevice
  else:
    assert action in constants.HOTPLUG_ALL_ACTIONS

  return fn(instance, dev_type, device, extra, seq)


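# Usage sketch for the dispatcher above: hot-adding the last NIC of an
# instance. constants.HOTPLUG_TARGET_NIC is assumed to be the NIC device
# type; the "extra" argument is only meaningful for disk hotplug, hence
# None here:
#
#   HotplugDevice(instance, constants.HOTPLUG_ACTION_ADD,
#                 constants.HOTPLUG_TARGET_NIC, instance.nics[-1],
#                 None, len(instance.nics) - 1)

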
def HotplugSupported(instance):
  """Checks if hotplug is generally supported.

  """
  hyper = hypervisor.GetHypervisor(instance.hypervisor)
  try:
    hyper.HotplugSupported(instance)
  except errors.HotplugError, err:
    _Fail("Hotplug is not supported: %s", err)


def ModifyInstanceMetadata(metadata):
  """Sends instance data to the metadata daemon.

  Uses the Luxi transport layer to communicate with the metadata
  daemon configuration server. It starts the metadata daemon if it is
  not running.
  The daemon must be enabled at configuration time.

  @type metadata: dict
  @param metadata: instance metadata obtained by calling
      L{objects.Instance.ToDict} on an instance object

  """
  if not constants.ENABLE_METAD:
    raise errors.ProgrammerError("The metadata daemon is disabled, yet"
                                 " ModifyInstanceMetadata has been called")

  if not utils.IsDaemonAlive(constants.METAD):
    result = utils.RunCmd([pathutils.DAEMON_UTIL, "start", constants.METAD])
    if result.failed:
      raise errors.HypervisorError("Failed to start metadata daemon")

  with contextlib.closing(metad.Client()) as client:
    client.UpdateConfig(metadata)


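# Minimal usage sketch, matching the documented contract above:
#
#   ModifyInstanceMetadata(instance.ToDict())

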
def BlockdevCreate(disk, size, owner, on_primary, info, excl_stor):
  """Creates a block device for an instance.

  @type disk: L{objects.Disk}
  @param disk: the object describing the disk we should create
  @type size: int
  @param size: the size of the physical underlying device, in MiB
  @type owner: str
  @param owner: the name of the instance for which disk is created,
      used for device cache data
  @type on_primary: boolean
  @param on_primary: indicates if it is the primary node or not
  @type info: string
  @param info: string that will be sent to the physical device
      creation, used for example to set (LVM) tags on LVs
  @type excl_stor: boolean
  @param excl_stor: Whether exclusive_storage is active

  @return: the new unique_id of the device (this can sometimes be
      computed only after creation), or None. On secondary nodes,
      it's not required to return anything.

  """
  # TODO: remove the obsolete "size" argument
  # pylint: disable=W0613
  clist = []
  if disk.children:
    for child in disk.children:
      try:
        crdev = _RecursiveAssembleBD(child, owner, on_primary)
      except errors.BlockDeviceError, err:
        _Fail("Can't assemble device %s: %s", child, err)
      if on_primary or disk.AssembleOnSecondary():
        # we need the children open in case the device itself has to
        # be assembled
        try:
          # pylint: disable=E1103
          crdev.Open()
        except errors.BlockDeviceError, err:
          _Fail("Can't make child '%s' read-write: %s", child, err)
      clist.append(crdev)

  try:
    device = bdev.Create(disk, clist, excl_stor)
  except errors.BlockDeviceError, err:
    _Fail("Can't create block device: %s", err)

  if on_primary or disk.AssembleOnSecondary():
    try:
      device.Assemble()
    except errors.BlockDeviceError, err:
      _Fail("Can't assemble device after creation, unusual event: %s", err)
    if on_primary or disk.OpenOnSecondary():
      try:
        device.Open(force=True)
      except errors.BlockDeviceError, err:
        _Fail("Can't make device r/w after creation, unusual event: %s", err)
    DevCacheManager.UpdateCache(device.dev_path, owner,
                                on_primary, disk.iv_name)

  device.SetInfo(info)

  return device.unique_id


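# Illustrative call creating a plain (LVM) disk on a primary node. The
# disk object, volume group name and instance name are invented for the
# example, and the obsolete "size" argument just repeats disk.size:
#
#   disk = objects.Disk(dev_type=constants.DT_PLAIN, size=1024,
#                       logical_id=("xenvg", "inst1-disk0"), params={})
#   BlockdevCreate(disk, disk.size, "inst1.example.com", True,
#                  "inst1.example.com", False)

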
def _DumpDevice(source_path, target_path, offset, size, truncate):
  """This function images/wipes the device using a local file.

  @type source_path: string
  @param source_path: path of the image or data source (e.g., "/dev/zero")

  @type target_path: string
  @param target_path: path of the device to image/wipe

  @type offset: int
  @param offset: offset in MiB in the output file

  @type size: int
  @param size: maximum size in MiB to write (data source might be smaller)

  @type truncate: bool
  @param truncate: whether the file should be truncated

  @return: None
  @raise RPCFail: in case of failure

  """
  # Internal sizes are always in Mebibytes; if the following "dd" command
  # should use a different block size, the offset and size given to this
  # function must be adjusted accordingly before being passed to "dd".
  block_size = constants.DD_BLOCK_SIZE

  cmd = [constants.DD_CMD, "if=%s" % source_path, "seek=%d" % offset,
         "bs=%s" % block_size, "oflag=direct", "of=%s" % target_path,