Use ssconf for SSH ports in NodeVerify
[ganeti-github.git] / lib / backend.py
#
#

# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014 Google Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


"""Functions used by the node daemon

@var _ALLOWED_UPLOAD_FILES: denotes which files are accepted in
  the L{UploadFile} function
@var _ALLOWED_CLEAN_DIRS: denotes which directories are accepted
  in the L{_CleanDirectory} function

"""

# pylint: disable=E1103,C0302

# E1103: %s %r has no %r member (but some types could not be
# inferred), because the _TryOSFromDisk returns either (True, os_obj)
# or (False, "string") which confuses pylint

# C0302: This module has become too big and should be split up


import base64
import errno
import logging
import os
import os.path
import pycurl
import random
import re
import shutil
import signal
import stat
import tempfile
import time
import zlib
import copy
import contextlib

from ganeti import errors
from ganeti import http
from ganeti import utils
from ganeti import ssh
from ganeti import hypervisor
from ganeti.hypervisor import hv_base
from ganeti import constants
from ganeti.storage import bdev
from ganeti.storage import drbd
from ganeti.storage import extstorage
from ganeti.storage import filestorage
from ganeti import objects
from ganeti import ssconf
from ganeti import serializer
from ganeti import netutils
from ganeti import runtime
from ganeti import compat
from ganeti import pathutils
from ganeti import vcluster
from ganeti import ht
from ganeti.storage.base import BlockDev
from ganeti.storage.drbd import DRBD8
from ganeti import hooksmaster
import ganeti.metad as metad


_BOOT_ID_PATH = "/proc/sys/kernel/random/boot_id"
_ALLOWED_CLEAN_DIRS = compat.UniqueFrozenset([
  pathutils.DATA_DIR,
  pathutils.JOB_QUEUE_ARCHIVE_DIR,
  pathutils.QUEUE_DIR,
  pathutils.CRYPTO_KEYS_DIR,
  ])
_MAX_SSL_CERT_VALIDITY = 7 * 24 * 60 * 60
_X509_KEY_FILE = "key"
_X509_CERT_FILE = "cert"
_IES_STATUS_FILE = "status"
_IES_PID_FILE = "pid"
_IES_CA_FILE = "ca"

#: Valid LVS output line regex
_LVSLINE_REGEX = re.compile(r"^ *([^|]+)\|([^|]+)\|([0-9.]+)\|([^|]{6,})\|?$")

# Actions for the master setup script
_MASTER_START = "start"
_MASTER_STOP = "stop"

#: Maximum file permissions for restricted command directory and executables
_RCMD_MAX_MODE = (stat.S_IRWXU |
                  stat.S_IRGRP | stat.S_IXGRP |
                  stat.S_IROTH | stat.S_IXOTH)

#: Delay before returning an error for restricted commands
_RCMD_INVALID_DELAY = 10

#: How long to wait to acquire lock for restricted commands (shorter than
#: L{_RCMD_INVALID_DELAY}) to reduce blockage of noded forks when many
#: command requests arrive
_RCMD_LOCK_TIMEOUT = _RCMD_INVALID_DELAY * 0.8


class RPCFail(Exception):
  """Class denoting RPC failure.

  Its argument is the error message.

  """


def _GetInstReasonFilename(instance_name):
  """Path of the file containing the reason of the instance status change.

  @type instance_name: string
  @param instance_name: The name of the instance
  @rtype: string
  @return: The path of the file

  """
  return utils.PathJoin(pathutils.INSTANCE_REASON_DIR, instance_name)


def _StoreInstReasonTrail(instance_name, trail):
  """Serialize a reason trail related to an instance change of state to file.

  The exact location of the file depends on the name of the instance and on
  the configuration of the Ganeti cluster defined at deploy time.

  @type instance_name: string
  @param instance_name: The name of the instance

  @type trail: list of reasons
  @param trail: reason trail

  @rtype: None

  """
  json = serializer.DumpJson(trail)
  filename = _GetInstReasonFilename(instance_name)
  utils.WriteFile(filename, data=json)
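
# Illustrative sketch (not from upstream): a reason trail is a list of reason
# entries, e.g. [["gnt:client:gnt-instance", "user shutdown", 1234567890]];
# a call like
#   _StoreInstReasonTrail("instance1.example.com", trail)
# writes the JSON-serialized trail below pathutils.INSTANCE_REASON_DIR
# (the instance name and trail above are made-up values).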


def _Fail(msg, *args, **kwargs):
  """Log an error and then raise an RPCFail exception.

  This exception is then handled specially in the ganeti daemon and
  turned into a 'failed' return type. As such, this function is a
  useful shortcut for logging the error and returning it to the master
  daemon.

  @type msg: string
  @param msg: the text of the exception
  @raise RPCFail

  """
  if args:
    msg = msg % args
  if "log" not in kwargs or kwargs["log"]:  # if we should log this error
    if "exc" in kwargs and kwargs["exc"]:
      logging.exception(msg)
    else:
      logging.error(msg)
  raise RPCFail(msg)
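
# Hedged usage sketch: callers pass a format string plus arguments and,
# optionally, exc=True to include the current traceback in the log, e.g.
#   _Fail("Can't read file %s: %s", path, err, exc=True)
# ('path' and 'err' are placeholder names); this logs the interpolated
# message and raises RPCFail with it.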


def _GetConfig():
  """Simple wrapper to return a SimpleStore.

  @rtype: L{ssconf.SimpleStore}
  @return: a SimpleStore instance

  """
  return ssconf.SimpleStore()


def _GetSshRunner(cluster_name):
  """Simple wrapper to return an SshRunner.

  @type cluster_name: str
  @param cluster_name: the cluster name, which is needed
    by the SshRunner constructor
  @rtype: L{ssh.SshRunner}
  @return: an SshRunner instance

  """
  return ssh.SshRunner(cluster_name)


def _Decompress(data):
  """Unpacks data compressed by the RPC client.

  @type data: list or tuple
  @param data: Data sent by RPC client
  @rtype: str
  @return: Decompressed data

  """
  assert isinstance(data, (list, tuple))
  assert len(data) == 2
  (encoding, content) = data
  if encoding == constants.RPC_ENCODING_NONE:
    return content
  elif encoding == constants.RPC_ENCODING_ZLIB_BASE64:
    return zlib.decompress(base64.b64decode(content))
  else:
    raise AssertionError("Unknown data encoding")
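
# Sketch of the accepted wire format (illustrative only):
#   _Decompress((constants.RPC_ENCODING_NONE, "payload")) == "payload"
#   blob = (constants.RPC_ENCODING_ZLIB_BASE64,
#           base64.b64encode(zlib.compress("payload")))
#   _Decompress(blob) == "payload"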


def _CleanDirectory(path, exclude=None):
  """Removes all regular files in a directory.

  @type path: str
  @param path: the directory to clean
  @type exclude: list
  @param exclude: list of files to be excluded, defaults
      to the empty list

  """
  if path not in _ALLOWED_CLEAN_DIRS:
    _Fail("Path passed to _CleanDirectory not in allowed clean targets: '%s'",
          path)

  if not os.path.isdir(path):
    return
  if exclude is None:
    exclude = []
  else:
    # Normalize excluded paths
    exclude = [os.path.normpath(i) for i in exclude]

  for rel_name in utils.ListVisibleFiles(path):
    full_name = utils.PathJoin(path, rel_name)
    if full_name in exclude:
      continue
    if os.path.isfile(full_name) and not os.path.islink(full_name):
      utils.RemoveFile(full_name)


def _BuildUploadFileList():
  """Build the list of allowed upload files.

  This is abstracted so that it's built only once at module import time.

  """
  allowed_files = set([
    pathutils.CLUSTER_CONF_FILE,
    pathutils.ETC_HOSTS,
    pathutils.SSH_KNOWN_HOSTS_FILE,
    pathutils.VNC_PASSWORD_FILE,
    pathutils.RAPI_CERT_FILE,
    pathutils.SPICE_CERT_FILE,
    pathutils.SPICE_CACERT_FILE,
    pathutils.RAPI_USERS_FILE,
    pathutils.CONFD_HMAC_KEY,
    pathutils.CLUSTER_DOMAIN_SECRET_FILE,
    ])

  for hv_name in constants.HYPER_TYPES:
    hv_class = hypervisor.GetHypervisorClass(hv_name)
    allowed_files.update(hv_class.GetAncillaryFiles()[0])

  assert pathutils.FILE_STORAGE_PATHS_FILE not in allowed_files, \
    "Allowed file storage paths should never be uploaded via RPC"

  return frozenset(allowed_files)


_ALLOWED_UPLOAD_FILES = _BuildUploadFileList()


def JobQueuePurge():
  """Removes job queue files and archived jobs.

  @rtype: tuple
  @return: True, None

  """
  _CleanDirectory(pathutils.QUEUE_DIR, exclude=[pathutils.JOB_QUEUE_LOCK_FILE])
  _CleanDirectory(pathutils.JOB_QUEUE_ARCHIVE_DIR)


def GetMasterNodeName():
  """Returns the master node name.

  @rtype: string
  @return: name of the master node
  @raise RPCFail: in case of errors

  """
  try:
    return _GetConfig().GetMasterNode()
  except errors.ConfigurationError, err:
    _Fail("Cluster configuration incomplete: %s", err, exc=True)


def RunLocalHooks(hook_opcode, hooks_path, env_builder_fn):
  """Decorator that runs hooks before and after the decorated function.

  @type hook_opcode: string
  @param hook_opcode: opcode of the hook
  @type hooks_path: string
  @param hooks_path: path of the hooks
  @type env_builder_fn: function
  @param env_builder_fn: function that returns a dictionary containing the
    environment variables for the hooks. Will get all the parameters of the
    decorated function.
  @raise RPCFail: in case of pre-hook failure

  """
  def decorator(fn):
    def wrapper(*args, **kwargs):
      _, myself = ssconf.GetMasterAndMyself()
      nodes = ([myself], [myself])  # these hooks run locally

      env_fn = compat.partial(env_builder_fn, *args, **kwargs)

      cfg = _GetConfig()
      hr = HooksRunner()
      hm = hooksmaster.HooksMaster(hook_opcode, hooks_path, nodes,
                                   hr.RunLocalHooks, None, env_fn, None,
                                   logging.warning, cfg.GetClusterName(),
                                   cfg.GetMasterNode())
      hm.RunPhase(constants.HOOKS_PHASE_PRE)
      result = fn(*args, **kwargs)
      hm.RunPhase(constants.HOOKS_PHASE_POST)

      return result
    return wrapper
  return decorator


def _BuildMasterIpEnv(master_params, use_external_mip_script=None):
  """Builds environment variables for master IP hooks.

  @type master_params: L{objects.MasterNetworkParameters}
  @param master_params: network parameters of the master
  @type use_external_mip_script: boolean
  @param use_external_mip_script: whether to use an external master IP
    address setup script (unused, but necessary per the implementation of the
    _RunLocalHooks decorator)

  """
  # pylint: disable=W0613
  ver = netutils.IPAddress.GetVersionFromAddressFamily(master_params.ip_family)
  env = {
    "MASTER_NETDEV": master_params.netdev,
    "MASTER_IP": master_params.ip,
    "MASTER_NETMASK": str(master_params.netmask),
    "CLUSTER_IP_VERSION": str(ver),
    }

  return env


def _RunMasterSetupScript(master_params, action, use_external_mip_script):
  """Execute the master IP address setup script.

  @type master_params: L{objects.MasterNetworkParameters}
  @param master_params: network parameters of the master
  @type action: string
  @param action: action to pass to the script. Must be one of
    L{backend._MASTER_START} or L{backend._MASTER_STOP}
  @type use_external_mip_script: boolean
  @param use_external_mip_script: whether to use an external master IP
    address setup script
  @raise backend.RPCFail: if there are errors during the execution of the
    script

  """
  env = _BuildMasterIpEnv(master_params)

  if use_external_mip_script:
    setup_script = pathutils.EXTERNAL_MASTER_SETUP_SCRIPT
  else:
    setup_script = pathutils.DEFAULT_MASTER_SETUP_SCRIPT

  result = utils.RunCmd([setup_script, action], env=env, reset_env=True)

  if result.failed:
    _Fail("Failed to %s the master IP. Script return value: %s, output: '%s'" %
          (action, result.exit_code, result.output), log=True)


@RunLocalHooks(constants.FAKE_OP_MASTER_TURNUP, "master-ip-turnup",
               _BuildMasterIpEnv)
def ActivateMasterIp(master_params, use_external_mip_script):
  """Activate the IP address of the master daemon.

  @type master_params: L{objects.MasterNetworkParameters}
  @param master_params: network parameters of the master
  @type use_external_mip_script: boolean
  @param use_external_mip_script: whether to use an external master IP
    address setup script
  @raise RPCFail: in case of errors during the IP startup

  """
  _RunMasterSetupScript(master_params, _MASTER_START,
                        use_external_mip_script)


def StartMasterDaemons(no_voting):
  """Activate local node as master node.

  The function will start the master daemons (ganeti-masterd and ganeti-rapi).

  @type no_voting: boolean
  @param no_voting: whether to start ganeti-masterd without a node vote
    but still non-interactively
  @rtype: None

  """

  if no_voting:
    masterd_args = "--no-voting --yes-do-it"
  else:
    masterd_args = ""

  env = {
    "EXTRA_MASTERD_ARGS": masterd_args,
    }

  result = utils.RunCmd([pathutils.DAEMON_UTIL, "start-master"], env=env)
  if result.failed:
    msg = "Can't start Ganeti master: %s" % result.output
    logging.error(msg)
    _Fail(msg)


@RunLocalHooks(constants.FAKE_OP_MASTER_TURNDOWN, "master-ip-turndown",
               _BuildMasterIpEnv)
def DeactivateMasterIp(master_params, use_external_mip_script):
  """Deactivate the master IP on this node.

  @type master_params: L{objects.MasterNetworkParameters}
  @param master_params: network parameters of the master
  @type use_external_mip_script: boolean
  @param use_external_mip_script: whether to use an external master IP
    address setup script
  @raise RPCFail: in case of errors during the IP turndown

  """
  _RunMasterSetupScript(master_params, _MASTER_STOP,
                        use_external_mip_script)


def StopMasterDaemons():
  """Stop the master daemons on this node.

  Stop the master daemons (ganeti-masterd and ganeti-rapi) on this node.

  @rtype: None

  """
  # TODO: log and report back to the caller the error failures; we
  # need to decide in which case we fail the RPC for this

  result = utils.RunCmd([pathutils.DAEMON_UTIL, "stop-master"])
  if result.failed:
    logging.error("Could not stop Ganeti master, command %s had exitcode %s"
                  " and error %s",
                  result.cmd, result.exit_code, result.output)


def ChangeMasterNetmask(old_netmask, netmask, master_ip, master_netdev):
  """Change the netmask of the master IP.

  @param old_netmask: the old value of the netmask
  @param netmask: the new value of the netmask
  @param master_ip: the master IP
  @param master_netdev: the master network device

  """
  if old_netmask == netmask:
    return

  if not netutils.IPAddress.Own(master_ip):
    _Fail("The master IP address is not up, not attempting to change its"
          " netmask")

  result = utils.RunCmd([constants.IP_COMMAND_PATH, "address", "add",
                         "%s/%s" % (master_ip, netmask),
                         "dev", master_netdev, "label",
                         "%s:0" % master_netdev])
  if result.failed:
    _Fail("Could not set the new netmask on the master IP address")

  result = utils.RunCmd([constants.IP_COMMAND_PATH, "address", "del",
                         "%s/%s" % (master_ip, old_netmask),
                         "dev", master_netdev, "label",
                         "%s:0" % master_netdev])
  if result.failed:
    _Fail("Could not bring down the master IP address with the old netmask")
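
# For illustration (192.0.2.1, /24 -> /25 and eth0 are made-up values), the
# two RunCmd invocations above boil down to:
#   ip address add 192.0.2.1/25 dev eth0 label eth0:0
#   ip address del 192.0.2.1/24 dev eth0 label eth0:0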


def EtcHostsModify(mode, host, ip):
  """Modify a host entry in /etc/hosts.

  @param mode: The mode to operate. Either add or remove entry
  @param host: The host to operate on
  @param ip: The ip associated with the entry

  """
  if mode == constants.ETC_HOSTS_ADD:
    if not ip:
      _Fail("Mode 'add' needs 'ip' parameter, but parameter not present")
    utils.AddHostToEtcHosts(host, ip)
  elif mode == constants.ETC_HOSTS_REMOVE:
    if ip:
      _Fail("Mode 'remove' does not allow 'ip' parameter, but parameter is"
            " present")
    utils.RemoveHostFromEtcHosts(host)
  else:
    _Fail("Mode not supported")


def LeaveCluster(modify_ssh_setup):
  """Cleans up and removes the current node.

  This function cleans up and prepares the current node to be removed
  from the cluster.

  If processing is successful, then it raises an
  L{errors.QuitGanetiException} which is used as a special case to
  shutdown the node daemon.

  @param modify_ssh_setup: boolean

  """
  _CleanDirectory(pathutils.DATA_DIR)
  _CleanDirectory(pathutils.CRYPTO_KEYS_DIR)
  JobQueuePurge()

  if modify_ssh_setup:
    try:
      priv_key, pub_key, auth_keys = ssh.GetUserFiles(constants.SSH_LOGIN_USER)

      ssh.RemoveAuthorizedKey(auth_keys, utils.ReadFile(pub_key))

      utils.RemoveFile(priv_key)
      utils.RemoveFile(pub_key)
    except errors.OpExecError:
      logging.exception("Error while processing ssh files")

  try:
    utils.RemoveFile(pathutils.CONFD_HMAC_KEY)
    utils.RemoveFile(pathutils.RAPI_CERT_FILE)
    utils.RemoveFile(pathutils.SPICE_CERT_FILE)
    utils.RemoveFile(pathutils.SPICE_CACERT_FILE)
    utils.RemoveFile(pathutils.NODED_CERT_FILE)
  except: # pylint: disable=W0702
    logging.exception("Error while removing cluster secrets")

  utils.StopDaemon(constants.CONFD)
  utils.StopDaemon(constants.MOND)
  utils.StopDaemon(constants.KVMD)

  # Raise a custom exception (handled in ganeti-noded)
  raise errors.QuitGanetiException(True, "Shutdown scheduled")


def _CheckStorageParams(params, num_params):
  """Performs sanity checks for storage parameters.

  @type params: list
  @param params: list of storage parameters
  @type num_params: int
  @param num_params: expected number of parameters

  """
  if params is None:
    raise errors.ProgrammerError("No storage parameters for storage"
                                 " reporting are provided.")
  if not isinstance(params, list):
    raise errors.ProgrammerError("The storage parameters are not of type"
                                 " list: '%s'" % params)
  if not len(params) == num_params:
    raise errors.ProgrammerError("Did not receive the expected number of"
                                 " storage parameters: expected %s,"
                                 " received '%s'" % (num_params, len(params)))


def _CheckLvmStorageParams(params):
  """Performs sanity check for the 'exclusive storage' flag.

  @see: C{_CheckStorageParams}

  """
  _CheckStorageParams(params, 1)
  excl_stor = params[0]
  if not isinstance(params[0], bool):
    raise errors.ProgrammerError("Exclusive storage parameter is not"
                                 " boolean: '%s'." % excl_stor)
  return excl_stor


def _GetLvmVgSpaceInfo(name, params):
  """Wrapper around C{_GetVgInfo} which checks the storage parameters.

  @type name: string
  @param name: name of the volume group
  @type params: list
  @param params: list of storage parameters, which in this case should
    contain only one, for exclusive storage

  """
  excl_stor = _CheckLvmStorageParams(params)
  return _GetVgInfo(name, excl_stor)


def _GetVgInfo(
    name, excl_stor, info_fn=bdev.LogicalVolume.GetVGInfo):
  """Retrieves information about an LVM volume group.

  """
  # TODO: GetVGInfo supports returning information for multiple VGs at once
  vginfo = info_fn([name], excl_stor)
  if vginfo:
    vg_free = int(round(vginfo[0][0], 0))
    vg_size = int(round(vginfo[0][1], 0))
  else:
    vg_free = None
    vg_size = None

  return {
    "type": constants.ST_LVM_VG,
    "name": name,
    "storage_free": vg_free,
    "storage_size": vg_size,
    }


def _GetLvmPvSpaceInfo(name, params):
  """Wrapper around C{_GetVgSpindlesInfo} with sanity checks.

  @see: C{_GetLvmVgSpaceInfo}

  """
  excl_stor = _CheckLvmStorageParams(params)
  return _GetVgSpindlesInfo(name, excl_stor)


def _GetVgSpindlesInfo(
    name, excl_stor, info_fn=bdev.LogicalVolume.GetVgSpindlesInfo):
  """Retrieves information about spindles in an LVM volume group.

  @type name: string
  @param name: VG name
  @type excl_stor: bool
  @param excl_stor: exclusive storage
  @rtype: dict
  @return: dictionary whose keys are "name", "vg_free", "vg_size" for VG name,
      free spindles, total spindles respectively

  """
  if excl_stor:
    (vg_free, vg_size) = info_fn(name)
  else:
    vg_free = 0
    vg_size = 0
  return {
    "type": constants.ST_LVM_PV,
    "name": name,
    "storage_free": vg_free,
    "storage_size": vg_size,
    }


def _GetHvInfo(name, hvparams, get_hv_fn=hypervisor.GetHypervisor):
  """Retrieves node information from a hypervisor.

  The information returned depends on the hypervisor. Common items:

    - vg_size is the size of the configured volume group in MiB
    - vg_free is the free size of the volume group in MiB
    - memory_dom0 is the memory allocated for domain0 in MiB
    - memory_free is the currently available (free) ram in MiB
    - memory_total is the total amount of ram in MiB
    - hv_version: the hypervisor version, if available

  @type hvparams: dict of string
  @param hvparams: the hypervisor's hvparams

  """
  return get_hv_fn(name).GetNodeInfo(hvparams=hvparams)


def _GetHvInfoAll(hv_specs, get_hv_fn=hypervisor.GetHypervisor):
  """Retrieves node information for all hypervisors.

  See C{_GetHvInfo} for information on the output.

  @type hv_specs: list of pairs (string, dict of strings)
  @param hv_specs: list of pairs of a hypervisor's name and its hvparams

  """
  if hv_specs is None:
    return None

  result = []
  for hvname, hvparams in hv_specs:
    result.append(_GetHvInfo(hvname, hvparams, get_hv_fn))
  return result


def _GetNamedNodeInfo(names, fn):
  """Calls C{fn} for all names in C{names} and returns a list of results.

  @rtype: None or list

  """
  if names is None:
    return None
  else:
    return map(fn, names)


def GetNodeInfo(storage_units, hv_specs):
  """Gives back a hash with different information about the node.

  @type storage_units: list of tuples (string, string, list)
  @param storage_units: List of tuples (storage unit, identifier, parameters)
    to ask for disk space information. In case of lvm-vg, the identifier is
    the VG name. The parameters can contain additional, storage-type-specific
    parameters, for example exclusive storage for lvm storage.
  @type hv_specs: list of pairs (string, dict of strings)
  @param hv_specs: list of pairs of a hypervisor's name and its hvparams
  @rtype: tuple; (string, None/dict, None/dict)
  @return: Tuple containing boot ID, volume group information and hypervisor
    information

  """
  bootid = utils.ReadFile(_BOOT_ID_PATH, size=128).rstrip("\n")
  storage_info = _GetNamedNodeInfo(
    storage_units,
    (lambda (storage_type, storage_key, storage_params):
        _ApplyStorageInfoFunction(storage_type, storage_key, storage_params)))
  hv_info = _GetHvInfoAll(hv_specs)
  return (bootid, storage_info, hv_info)
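
# Illustrative call (the VG name and hypervisor choice are assumptions):
#   GetNodeInfo([(constants.ST_LVM_VG, "xenvg", [True])],
#               [(constants.HT_XEN_PVM, {})])
# would return the boot ID plus free/total space of volume group "xenvg"
# (with exclusive storage enabled) and node info from the xen-pvm hypervisor.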


def _GetFileStorageSpaceInfo(path, params):
  """Wrapper around filestorage.GetSpaceInfo.

  The purpose of this wrapper is to call filestorage.GetFileStorageSpaceInfo
  and ignore the *args parameter to not leak it into the filestorage
  module's code.

  @see: C{filestorage.GetFileStorageSpaceInfo} for description of the
    parameters.

  """
  _CheckStorageParams(params, 0)
  return filestorage.GetFileStorageSpaceInfo(path)


# FIXME: implement storage reporting for all missing storage types.
_STORAGE_TYPE_INFO_FN = {
  constants.ST_BLOCK: None,
  constants.ST_DISKLESS: None,
  constants.ST_EXT: None,
  constants.ST_FILE: _GetFileStorageSpaceInfo,
  constants.ST_LVM_PV: _GetLvmPvSpaceInfo,
  constants.ST_LVM_VG: _GetLvmVgSpaceInfo,
  constants.ST_SHARED_FILE: None,
  constants.ST_GLUSTER: None,
  constants.ST_RADOS: None,
  }


def _ApplyStorageInfoFunction(storage_type, storage_key, *args):
  """Looks up and applies the correct function to calculate free and total
  storage for the given storage type.

  @type storage_type: string
  @param storage_type: the storage type for which the storage shall be
    reported.
  @type storage_key: string
  @param storage_key: identifier of a storage unit, e.g. the volume group name
    of an LVM storage unit
  @type args: any
  @param args: various parameters that can be used for storage reporting.
    These parameters and their semantics vary from storage type to storage
    type and are just propagated in this function.
  @return: the results of the application of the storage space function (see
    _STORAGE_TYPE_INFO_FN) if storage space reporting is implemented for that
    storage type
  @raises NotImplementedError: for storage types that don't support space
    reporting yet

  """
  fn = _STORAGE_TYPE_INFO_FN[storage_type]
  if fn is not None:
    return fn(storage_key, *args)
  else:
    raise NotImplementedError


def _CheckExclusivePvs(pvi_list):
  """Check that PVs are not shared among LVs

  @type pvi_list: list of L{objects.LvmPvInfo} objects
  @param pvi_list: information about the PVs

  @rtype: list of tuples (string, list of strings)
  @return: offending volumes, as tuples: (pv_name, [lv1_name, lv2_name...])

  """
  res = []
  for pvi in pvi_list:
    if len(pvi.lv_list) > 1:
      res.append((pvi.name, pvi.lv_list))
  return res
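
# Sketch (assuming LvmPvInfo objects with 'name' and 'lv_list' populated):
# a PV hosting two LVs, e.g. name="/dev/sda2" and lv_list=["lv1", "lv2"],
# would be reported as ("/dev/sda2", ["lv1", "lv2"]).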


def _VerifyHypervisors(what, vm_capable, result, all_hvparams,
                       get_hv_fn=hypervisor.GetHypervisor):
  """Verifies the hypervisor. Appends the results to the 'results' list.

  @type what: C{dict}
  @param what: a dictionary of things to check
  @type vm_capable: boolean
  @param vm_capable: whether or not this node is vm capable
  @type result: dict
  @param result: dictionary of verification results; results of the
    verifications in this function will be added here
  @type all_hvparams: dict of dict of string
  @param all_hvparams: dictionary mapping hypervisor names to hvparams
  @type get_hv_fn: function
  @param get_hv_fn: function to retrieve the hypervisor, to improve
    testability

  """
  if not vm_capable:
    return

  if constants.NV_HYPERVISOR in what:
    result[constants.NV_HYPERVISOR] = {}
    for hv_name in what[constants.NV_HYPERVISOR]:
      hvparams = all_hvparams[hv_name]
      try:
        val = get_hv_fn(hv_name).Verify(hvparams=hvparams)
      except errors.HypervisorError, err:
        val = "Error while checking hypervisor: %s" % str(err)
      result[constants.NV_HYPERVISOR][hv_name] = val


def _VerifyHvparams(what, vm_capable, result,
                    get_hv_fn=hypervisor.GetHypervisor):
  """Verifies the hvparams. Appends the results to the 'results' list.

  @type what: C{dict}
  @param what: a dictionary of things to check
  @type vm_capable: boolean
  @param vm_capable: whether or not this node is vm capable
  @type result: dict
  @param result: dictionary of verification results; results of the
    verifications in this function will be added here
  @type get_hv_fn: function
  @param get_hv_fn: function to retrieve the hypervisor, to improve
    testability

  """
  if not vm_capable:
    return

  if constants.NV_HVPARAMS in what:
    result[constants.NV_HVPARAMS] = []
    for source, hv_name, hvparms in what[constants.NV_HVPARAMS]:
      try:
        logging.info("Validating hv %s, %s", hv_name, hvparms)
        get_hv_fn(hv_name).ValidateParameters(hvparms)
      except errors.HypervisorError, err:
        result[constants.NV_HVPARAMS].append((source, hv_name, str(err)))


def _VerifyInstanceList(what, vm_capable, result, all_hvparams):
  """Verifies the instance list.

  @type what: C{dict}
  @param what: a dictionary of things to check
  @type vm_capable: boolean
  @param vm_capable: whether or not this node is vm capable
  @type result: dict
  @param result: dictionary of verification results; results of the
    verifications in this function will be added here
  @type all_hvparams: dict of dict of string
  @param all_hvparams: dictionary mapping hypervisor names to hvparams

  """
  if constants.NV_INSTANCELIST in what and vm_capable:
    # GetInstanceList can fail
    try:
      val = GetInstanceList(what[constants.NV_INSTANCELIST],
                            all_hvparams=all_hvparams)
    except RPCFail, err:
      val = str(err)
    result[constants.NV_INSTANCELIST] = val


def _VerifyNodeInfo(what, vm_capable, result, all_hvparams):
  """Verifies the node info.

  @type what: C{dict}
  @param what: a dictionary of things to check
  @type vm_capable: boolean
  @param vm_capable: whether or not this node is vm capable
  @type result: dict
  @param result: dictionary of verification results; results of the
    verifications in this function will be added here
  @type all_hvparams: dict of dict of string
  @param all_hvparams: dictionary mapping hypervisor names to hvparams

  """
  if constants.NV_HVINFO in what and vm_capable:
    hvname = what[constants.NV_HVINFO]
    hyper = hypervisor.GetHypervisor(hvname)
    hvparams = all_hvparams[hvname]
    result[constants.NV_HVINFO] = hyper.GetNodeInfo(hvparams=hvparams)


def _VerifyClientCertificate(cert_file=pathutils.NODED_CLIENT_CERT_FILE):
  """Verify the existence and validity of the client SSL certificate.

  Also, verify that the client certificate is not self-signed. Self-
  signed client certificates stem from Ganeti versions 2.12.0 - 2.12.4
  and should be replaced by client certificates signed by the server
  certificate. Hence we output a warning when we encounter a self-signed
  one.

  """
  create_cert_cmd = "gnt-cluster renew-crypto --new-node-certificates"
  if not os.path.exists(cert_file):
    return (constants.CV_ERROR,
            "The client certificate does not exist. Run '%s' to create"
            " client certificates for all nodes." % create_cert_cmd)

  (errcode, msg) = utils.VerifyCertificate(cert_file)
  if errcode is not None:
    return (errcode, msg)

  (errcode, msg) = utils.IsCertificateSelfSigned(cert_file)
  if errcode is not None:
    return (errcode, msg)

  # if everything is fine, we return the digest to be compared to the config
  return (None, utils.GetCertificateDigest(cert_filename=cert_file))


def _VerifySshSetup(node_status_list, my_name,
                    pub_key_file=pathutils.SSH_PUB_KEYS):
  """Verifies the state of the SSH key files.

  @type node_status_list: list of tuples
  @param node_status_list: list of nodes of the cluster associated with a
    couple of flags: (uuid, name, is_master_candidate,
    is_potential_master_candidate, online)
  @type my_name: str
  @param my_name: name of this node
  @type pub_key_file: str
  @param pub_key_file: filename of the public key file

  """
  if node_status_list is None:
    return ["No node list to check against the pub_key_file received."]

  my_status_list = [(my_uuid, name, mc, pot_mc, online) for
                    (my_uuid, name, mc, pot_mc, online)
                    in node_status_list if name == my_name]
  if len(my_status_list) == 0:
    return ["Cannot find node information for node '%s'." % my_name]
  (my_uuid, _, _, potential_master_candidate, online) = \
    my_status_list[0]

  result = []

  if not os.path.exists(pub_key_file):
    result.append("The public key file '%s' does not exist. Consider running"
                  " 'gnt-cluster renew-crypto --new-ssh-keys"
                  " [--no-ssh-key-check]' to fix this." % pub_key_file)
    return result

  pot_mc_uuids = [uuid for (uuid, _, _, _, _) in node_status_list]
  offline_nodes = [uuid for (uuid, _, _, _, online) in node_status_list
                   if not online]
  pub_keys = ssh.QueryPubKeyFile(None)

  # initialized here because it is also used in the loop over all nodes below
  missing_uuids = set([])
  if potential_master_candidate:
    # Check that the set of potential master candidates matches the
    # public key file
    pub_uuids_set = set(pub_keys.keys()) - set(offline_nodes)
    pot_mc_uuids_set = set(pot_mc_uuids) - set(offline_nodes)
    if pub_uuids_set != pot_mc_uuids_set:
      unknown_uuids = pub_uuids_set - pot_mc_uuids_set
      if unknown_uuids:
        result.append("The following node UUIDs are listed in the public key"
                      " file on node '%s', but are not potential master"
                      " candidates: %s."
                      % (my_name, ", ".join(list(unknown_uuids))))
      missing_uuids = pot_mc_uuids_set - pub_uuids_set
      if missing_uuids:
        result.append("The following node UUIDs of potential master candidates"
                      " are missing in the public key file on node %s: %s."
                      % (my_name, ", ".join(list(missing_uuids))))

    (_, key_files) = \
      ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False,
                          dircheck=False)
    (_, dsa_pub_key_filename) = key_files[constants.SSHK_DSA]

    my_keys = pub_keys[my_uuid]

    dsa_pub_key = utils.ReadFile(dsa_pub_key_filename)
    if dsa_pub_key.strip() not in my_keys:
      result.append("The dsa key of node %s does not match this node's key"
                    " in the pub key file." % (my_name))
    if len(my_keys) != 1:
      result.append("There is more than one key for node %s in the public key"
                    " file." % my_name)
  else:
    if len(pub_keys.keys()) > 0:
      result.append("The public key file of node '%s' is not empty, although"
                    " the node is not a potential master candidate."
                    % my_name)

  # Check that all master candidate keys are in the authorized_keys file
  (auth_key_file, _) = \
    ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
  for (uuid, name, mc, _, online) in node_status_list:
    if not online:
      continue
    if uuid in missing_uuids:
      continue
    if mc:
      for key in pub_keys[uuid]:
        if not ssh.HasAuthorizedKey(auth_key_file, key):
          result.append("A SSH key of master candidate '%s' (UUID: '%s') is"
                        " not in the 'authorized_keys' file of node '%s'."
                        % (name, uuid, my_name))
    else:
      for key in pub_keys[uuid]:
        if name != my_name and ssh.HasAuthorizedKey(auth_key_file, key):
          result.append("A SSH key of normal node '%s' (UUID: '%s') is in the"
                        " 'authorized_keys' file of node '%s'."
                        % (name, uuid, my_name))
        if name == my_name and not ssh.HasAuthorizedKey(auth_key_file, key):
          result.append("A SSH key of normal node '%s' (UUID: '%s') is not"
                        " in the 'authorized_keys' file of itself."
                        % (my_name, uuid))

  return result


def _VerifySshClutter(node_status_list, my_name):
  """Verifies that the 'authorized_keys' files are not cluttered up.

  @type node_status_list: list of tuples
  @param node_status_list: list of nodes of the cluster associated with a
    couple of flags: (uuid, name, is_master_candidate,
    is_potential_master_candidate, online)
  @type my_name: str
  @param my_name: name of this node

  """
  result = []
  (auth_key_file, _) = \
    ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
  node_names = [name for (_, name, _, _, _) in node_status_list]
  multiple_occurrences = ssh.CheckForMultipleKeys(auth_key_file, node_names)
  if multiple_occurrences:
    msg = "There are hosts which have more than one SSH key stored for the" \
          " same user in the 'authorized_keys' file of node %s. This can be" \
          " due to an unsuccessful operation which cluttered up the" \
          " 'authorized_keys' file. We recommend cleaning this up manually. " \
          % my_name
    for host, occ in multiple_occurrences.items():
      msg += "Entry for '%s' in lines %s. " % (host, utils.CommaJoin(occ))
    result.append(msg)

  return result


def VerifyNode(what, cluster_name, all_hvparams, node_groups):
  """Verify the status of the local node.

  Based on the input L{what} parameter, various checks are done on the
  local node.

  If the I{filelist} key is present, this list of
  files is checksummed and the file/checksum pairs are returned.

  If the I{nodelist} key is present, we check that we have
  connectivity via ssh with the target nodes (and check the hostname
  report).

  If the I{node-net-test} key is present, we check that we have
  connectivity to the given nodes via both primary IP and, if
  applicable, secondary IPs.

  @type what: C{dict}
  @param what: a dictionary of things to check:
    - filelist: list of files for which to compute checksums
    - nodelist: list of nodes we should check ssh communication with
    - node-net-test: list of nodes we should check node daemon port
      connectivity with
    - hypervisor: list with hypervisors to run the verify for
  @type cluster_name: string
  @param cluster_name: the cluster's name
  @type all_hvparams: dict of dict of strings
  @param all_hvparams: a dictionary mapping hypervisor names to hvparams
  @type node_groups: a dict of strings
  @param node_groups: node _names_ mapped to their group uuids (it's enough to
      have only those nodes that are in `what["nodelist"]`)
  @rtype: dict
  @return: a dictionary with the same keys as the input dict, and
      values representing the result of the checks

  """
  result = {}
  my_name = netutils.Hostname.GetSysName()
  port = netutils.GetDaemonPort(constants.NODED)
  vm_capable = my_name not in what.get(constants.NV_NONVMNODES, [])

  _VerifyHypervisors(what, vm_capable, result, all_hvparams)
  _VerifyHvparams(what, vm_capable, result)

  if constants.NV_FILELIST in what:
    fingerprints = utils.FingerprintFiles(map(vcluster.LocalizeVirtualPath,
                                              what[constants.NV_FILELIST]))
    result[constants.NV_FILELIST] = \
      dict((vcluster.MakeVirtualPath(key), value)
           for (key, value) in fingerprints.items())

  if constants.NV_CLIENT_CERT in what:
    result[constants.NV_CLIENT_CERT] = _VerifyClientCertificate()

  if constants.NV_SSH_SETUP in what:
    result[constants.NV_SSH_SETUP] = \
      _VerifySshSetup(what[constants.NV_SSH_SETUP], my_name)
    if constants.NV_SSH_CLUTTER in what:
      result[constants.NV_SSH_CLUTTER] = \
        _VerifySshClutter(what[constants.NV_SSH_SETUP], my_name)

  if constants.NV_NODELIST in what:
    (nodes, bynode, mcs) = what[constants.NV_NODELIST]

    # Add nodes from other groups (different for each node)
    try:
      nodes.extend(bynode[my_name])
    except KeyError:
      pass

    # Use a random order
    random.shuffle(nodes)

    # Try to contact all nodes
    val = {}
    ssh_port_map = ssconf.SimpleStore().GetSshPortMap()
    for node in nodes:
      # We only test if master candidates can communicate to other nodes.
      # We cannot test if normal nodes cannot communicate with other nodes,
      # because the administrator might have installed additional SSH keys,
      # over which Ganeti has no power.
      if my_name in mcs:
        success, message = _GetSshRunner(cluster_name). \
                              VerifyNodeHostname(node, ssh_port_map)
        if not success:
          val[node] = message

    result[constants.NV_NODELIST] = val

  if constants.NV_NODENETTEST in what:
    result[constants.NV_NODENETTEST] = tmp = {}
    my_pip = my_sip = None
    for name, pip, sip in what[constants.NV_NODENETTEST]:
      if name == my_name:
        my_pip = pip
        my_sip = sip
        break
    if not my_pip:
      tmp[my_name] = ("Can't find my own primary/secondary IP"
                      " in the node list")
    else:
      for name, pip, sip in what[constants.NV_NODENETTEST]:
        fail = []
        if not netutils.TcpPing(pip, port, source=my_pip):
          fail.append("primary")
        if sip != pip:
          if not netutils.TcpPing(sip, port, source=my_sip):
            fail.append("secondary")
        if fail:
          tmp[name] = ("failure using the %s interface(s)" %
                       " and ".join(fail))

  if constants.NV_MASTERIP in what:
    # FIXME: add checks on incoming data structures (here and in the
    # rest of the function)
    master_name, master_ip = what[constants.NV_MASTERIP]
    if master_name == my_name:
      source = constants.IP4_ADDRESS_LOCALHOST
    else:
      source = None
    result[constants.NV_MASTERIP] = netutils.TcpPing(master_ip, port,
                                                     source=source)

  if constants.NV_USERSCRIPTS in what:
    result[constants.NV_USERSCRIPTS] = \
      [script for script in what[constants.NV_USERSCRIPTS]
       if not utils.IsExecutable(script)]

  if constants.NV_OOB_PATHS in what:
    result[constants.NV_OOB_PATHS] = tmp = []
    for path in what[constants.NV_OOB_PATHS]:
      try:
        st = os.stat(path)
      except OSError, err:
        tmp.append("error stating out of band helper: %s" % err)
      else:
        if stat.S_ISREG(st.st_mode):
          if stat.S_IMODE(st.st_mode) & stat.S_IXUSR:
            tmp.append(None)
          else:
            tmp.append("out of band helper %s is not executable" % path)
        else:
          tmp.append("out of band helper %s is not a file" % path)

  if constants.NV_LVLIST in what and vm_capable:
    try:
      val = GetVolumeList(utils.ListVolumeGroups().keys())
    except RPCFail, err:
      val = str(err)
    result[constants.NV_LVLIST] = val

  _VerifyInstanceList(what, vm_capable, result, all_hvparams)

  if constants.NV_VGLIST in what and vm_capable:
    result[constants.NV_VGLIST] = utils.ListVolumeGroups()

  if constants.NV_PVLIST in what and vm_capable:
    check_exclusive_pvs = constants.NV_EXCLUSIVEPVS in what
    val = bdev.LogicalVolume.GetPVInfo(what[constants.NV_PVLIST],
                                       filter_allocatable=False,
                                       include_lvs=check_exclusive_pvs)
    if check_exclusive_pvs:
      result[constants.NV_EXCLUSIVEPVS] = _CheckExclusivePvs(val)
      for pvi in val:
        # Avoid sending useless data on the wire
        pvi.lv_list = []
    result[constants.NV_PVLIST] = map(objects.LvmPvInfo.ToDict, val)

  if constants.NV_VERSION in what:
    result[constants.NV_VERSION] = (constants.PROTOCOL_VERSION,
                                    constants.RELEASE_VERSION)

  _VerifyNodeInfo(what, vm_capable, result, all_hvparams)

  if constants.NV_DRBDVERSION in what and vm_capable:
    try:
      drbd_version = DRBD8.GetProcInfo().GetVersionString()
    except errors.BlockDeviceError, err:
      logging.warning("Can't get DRBD version", exc_info=True)
      drbd_version = str(err)
    result[constants.NV_DRBDVERSION] = drbd_version

  if constants.NV_DRBDLIST in what and vm_capable:
    try:
      used_minors = drbd.DRBD8.GetUsedDevs()
    except errors.BlockDeviceError, err:
      logging.warning("Can't get used minors list", exc_info=True)
      used_minors = str(err)
    result[constants.NV_DRBDLIST] = used_minors

  if constants.NV_DRBDHELPER in what and vm_capable:
    status = True
    try:
      payload = drbd.DRBD8.GetUsermodeHelper()
    except errors.BlockDeviceError, err:
      logging.error("Can't get DRBD usermode helper: %s", str(err))
      status = False
      payload = str(err)
    result[constants.NV_DRBDHELPER] = (status, payload)

  if constants.NV_NODESETUP in what:
    result[constants.NV_NODESETUP] = tmpr = []
    if not os.path.isdir("/sys/block") or not os.path.isdir("/sys/class/net"):
      tmpr.append("The sysfs filesystem doesn't seem to be mounted"
                  " under /sys, missing required directories /sys/block"
                  " and /sys/class/net")
    if (not os.path.isdir("/proc/sys") or
        not os.path.isfile("/proc/sysrq-trigger")):
      tmpr.append("The procfs filesystem doesn't seem to be mounted"
                  " under /proc, missing required directory /proc/sys and"
                  " the file /proc/sysrq-trigger")

  if constants.NV_TIME in what:
    result[constants.NV_TIME] = utils.SplitTime(time.time())

  if constants.NV_OSLIST in what and vm_capable:
    result[constants.NV_OSLIST] = DiagnoseOS()

  if constants.NV_BRIDGES in what and vm_capable:
    result[constants.NV_BRIDGES] = [bridge
                                    for bridge in what[constants.NV_BRIDGES]
                                    if not utils.BridgeExists(bridge)]

  if what.get(constants.NV_ACCEPTED_STORAGE_PATHS) == my_name:
    result[constants.NV_ACCEPTED_STORAGE_PATHS] = \
      filestorage.ComputeWrongFileStoragePaths()

  if what.get(constants.NV_FILE_STORAGE_PATH):
    pathresult = filestorage.CheckFileStoragePath(
        what[constants.NV_FILE_STORAGE_PATH])
    if pathresult:
      result[constants.NV_FILE_STORAGE_PATH] = pathresult

  if what.get(constants.NV_SHARED_FILE_STORAGE_PATH):
    pathresult = filestorage.CheckFileStoragePath(
        what[constants.NV_SHARED_FILE_STORAGE_PATH])
    if pathresult:
      result[constants.NV_SHARED_FILE_STORAGE_PATH] = pathresult

  return result
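
# Hedged sketch of the input/output convention: asking only for the version
# check, e.g.
#   VerifyNode({constants.NV_VERSION: None}, "cluster.example.com", {}, {})
# yields {constants.NV_VERSION: (constants.PROTOCOL_VERSION,
#                                constants.RELEASE_VERSION)}
# (the cluster name here is a placeholder).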


def GetCryptoTokens(token_requests):
  """Perform actions on the node's cryptographic tokens.

  Token types can be 'ssl' or 'ssh'. So far only some actions are implemented
  for 'ssl'. Action 'get' returns the digest of the public client ssl
  certificate. Action 'create' creates a new client certificate and private
  key and also returns the digest of the certificate. The third parameter of
  a token request is an optional parameter dictionary for the action, of
  which so far only the filename is supported.

  @type token_requests: list of tuples of (string, string, dict), where the
    first string is in constants.CRYPTO_TYPES, the second in
    constants.CRYPTO_ACTIONS. The third parameter is a dictionary of string
    to string.
  @param token_requests: list of requests of cryptographic tokens and actions
    to perform on them. The actions come with a dictionary of options.
  @rtype: list of tuples (string, string)
  @return: list of tuples of the token type and the public crypto token

  """
  tokens = []
  for (token_type, action, _) in token_requests:
    if token_type not in constants.CRYPTO_TYPES:
      raise errors.ProgrammerError("Token type '%s' not supported." %
                                   token_type)
    if action not in constants.CRYPTO_ACTIONS:
      raise errors.ProgrammerError("Action '%s' is not supported." %
                                   action)
    if token_type == constants.CRYPTO_TYPE_SSL_DIGEST:
      tokens.append((token_type,
                     utils.GetCertificateDigest()))
  return tokens
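
# Hedged example: requesting the SSL certificate digest (the options dict is
# unused for this action) could look like
#   GetCryptoTokens([(constants.CRYPTO_TYPE_SSL_DIGEST,
#                     constants.CRYPTO_ACTION_GET, {})])
# and would return [(constants.CRYPTO_TYPE_SSL_DIGEST, "<digest>")].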


def EnsureDaemon(daemon_name, run):
  """Ensures the given daemon is running or stopped.

  @type daemon_name: string
  @param daemon_name: name of the daemon (e.g., constants.KVMD)

  @type run: bool
  @param run: whether to start or stop the daemon

  @rtype: bool
  @return: 'True' if daemon successfully started/stopped,
    'False' otherwise

  """
  allowed_daemons = [constants.KVMD]

  if daemon_name not in allowed_daemons:
    fn = lambda _: False
  elif run:
    fn = utils.EnsureDaemon
  else:
    fn = utils.StopDaemon

  return fn(daemon_name)
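
# Behaviour sketch, following the allow-list above:
#   EnsureDaemon(constants.KVMD, True)   # tries to start kvmd
#   EnsureDaemon(constants.KVMD, False)  # tries to stop kvmd
#   EnsureDaemon(constants.NODED, True)  # not in the allow-list -> False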


def _InitSshUpdateData(data, noded_cert_file, ssconf_store):
  (_, noded_cert) = \
    utils.ExtractX509Certificate(utils.ReadFile(noded_cert_file))
  data[constants.SSHS_NODE_DAEMON_CERTIFICATE] = noded_cert

  cluster_name = ssconf_store.GetClusterName()
  data[constants.SSHS_CLUSTER_NAME] = cluster_name


def AddNodeSshKey(node_uuid, node_name,
                  potential_master_candidates,
                  to_authorized_keys=False,
                  to_public_keys=False,
                  get_public_keys=False,
                  pub_key_file=pathutils.SSH_PUB_KEYS,
                  ssconf_store=None,
                  noded_cert_file=pathutils.NODED_CERT_FILE,
                  run_cmd_fn=ssh.RunSshCmdWithStdin):
  """Distributes a node's public SSH key across the cluster.

  Note that this function should only be executed on the master node, which
  then will copy the new node's key to all nodes in the cluster via SSH.

  Also note: at least one of the flags C{to_authorized_keys},
  C{to_public_keys}, and C{get_public_keys} has to be set to C{True} for
  the function to actually perform any actions.

  @type node_uuid: str
  @param node_uuid: the UUID of the node whose key is added
  @type node_name: str
  @param node_name: the name of the node whose key is added
  @type potential_master_candidates: list of str
  @param potential_master_candidates: list of node names of potential master
    candidates; this should match the list of uuids in the public key file
  @type to_authorized_keys: boolean
  @param to_authorized_keys: whether the key should be added to the
    C{authorized_keys} file of all nodes
  @type to_public_keys: boolean
  @param to_public_keys: whether the keys should be added to the public key
    file
  @type get_public_keys: boolean
  @param get_public_keys: whether the node should add the cluster's public
    keys to its C{ganeti_pub_keys} file

  """
  # Ensure that at least one of those flags is true, as the function would
  # not do anything otherwise
  assert (to_authorized_keys or to_public_keys or get_public_keys)

  if not ssconf_store:
    ssconf_store = ssconf.SimpleStore()

  # Check and fix sanity of key file
  keys_by_name = ssh.QueryPubKeyFile([node_name], key_file=pub_key_file)
  keys_by_uuid = ssh.QueryPubKeyFile([node_uuid], key_file=pub_key_file)

  if (not keys_by_name or node_name not in keys_by_name) \
      and (not keys_by_uuid or node_uuid not in keys_by_uuid):
    raise errors.SshUpdateError(
      "No keys found for the new node '%s' (UUID %s) in the list of public"
      " SSH keys, neither for the name nor the UUID" % (node_name, node_uuid))
  else:
    if node_name in keys_by_name:
      keys_by_uuid = {}
      # Replace the name by UUID in the file as the name should only be used
      # temporarily
      ssh.ReplaceNameByUuid(node_uuid, node_name,
                            error_fn=errors.SshUpdateError,
                            key_file=pub_key_file)
      keys_by_uuid[node_uuid] = keys_by_name[node_name]

  # Update the master node's key files
  if to_authorized_keys:
    (auth_key_file, _) = \
      ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False,
                          dircheck=False)
    ssh.AddAuthorizedKeys(auth_key_file, keys_by_uuid[node_uuid])

  base_data = {}
  _InitSshUpdateData(base_data, noded_cert_file, ssconf_store)
  cluster_name = base_data[constants.SSHS_CLUSTER_NAME]

  ssh_port_map = ssconf_store.GetSshPortMap()

  # Update the target node itself
  logging.debug("Updating SSH key files of target node '%s'.", node_name)
  if get_public_keys:
    node_data = {}
    _InitSshUpdateData(node_data, noded_cert_file, ssconf_store)
    all_keys = ssh.QueryPubKeyFile(None, key_file=pub_key_file)
    node_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
      (constants.SSHS_OVERRIDE, all_keys)

    try:
      utils.RetryByNumberOfTimes(
          constants.SSHS_MAX_RETRIES,
          errors.SshUpdateError,
          run_cmd_fn, cluster_name, node_name, pathutils.SSH_UPDATE,
          ssh_port_map.get(node_name), node_data,
          debug=False, verbose=False, use_cluster_key=False,
          ask_key=False, strict_host_check=False)
    except errors.SshUpdateError as e:
      # Clean up the master's public key file if adding key fails
      if to_public_keys:
        ssh.RemovePublicKey(node_uuid)
      raise e

  # Update all nodes except master and the target node
  if to_authorized_keys:
    base_data[constants.SSHS_SSH_AUTHORIZED_KEYS] = \
      (constants.SSHS_ADD, keys_by_uuid)

  pot_mc_data = copy.deepcopy(base_data)
  if to_public_keys:
    pot_mc_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
      (constants.SSHS_REPLACE_OR_ADD, keys_by_uuid)

  all_nodes = ssconf_store.GetNodeList()
  master_node = ssconf_store.GetMasterNode()
  online_nodes = ssconf_store.GetOnlineNodeList()

  node_errors = []
  for node in all_nodes:
    if node == master_node:
      logging.debug("Skipping master node '%s'.", master_node)
      continue
    if node not in online_nodes:
      logging.debug("Skipping offline node '%s'.", node)
      continue
    if node in potential_master_candidates:
      logging.debug("Updating SSH key files of node '%s'.", node)
      try:
        utils.RetryByNumberOfTimes(
            constants.SSHS_MAX_RETRIES,
            errors.SshUpdateError,
            run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
            ssh_port_map.get(node), pot_mc_data,
            debug=False, verbose=False, use_cluster_key=False,
            ask_key=False, strict_host_check=False)
      except errors.SshUpdateError as last_exception:
        error_msg = ("When adding the key of node '%s', updating SSH key"
                     " files of node '%s' failed after %s retries."
                     " Not trying again. Last error was: %s." %
                     (node_name, node, constants.SSHS_MAX_RETRIES,
                      last_exception))
        node_errors.append((node, error_msg))
        # We only log the error and don't throw an exception, because
        # one unreachable node shall not abort the entire procedure.
        logging.error(error_msg)

    else:
      if to_authorized_keys:
        run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
                   ssh_port_map.get(node), base_data,
                   debug=False, verbose=False, use_cluster_key=False,
                   ask_key=False, strict_host_check=False)

  return node_errors
1561
1562
1563 def RemoveNodeSshKey(node_uuid, node_name,
1564 master_candidate_uuids,
1565 potential_master_candidates,
1566 master_uuid=None,
1567 keys_to_remove=None,
1568 from_authorized_keys=False,
1569 from_public_keys=False,
1570 clear_authorized_keys=False,
1571 clear_public_keys=False,
1572 pub_key_file=pathutils.SSH_PUB_KEYS,
1573 ssconf_store=None,
1574 noded_cert_file=pathutils.NODED_CERT_FILE,
1575 readd=False,
1576 run_cmd_fn=ssh.RunSshCmdWithStdin):
1577 """Removes the node's SSH keys from the key files and distributes those.
1578
1579 Note that at least one of the flags C{from_authorized_keys},
1580 C{from_public_keys}, C{clear_authorized_keys}, and C{clear_public_keys}
1581 has to be set to C{True} for the function to perform any action at all.
1582 Not doing so will trigger an assertion in the function.
1583
1584 @type node_uuid: str
1585 @param node_uuid: UUID of the node whose key is removed
1586 @type node_name: str
1587 @param node_name: name of the node whose key is remove
1588 @type master_candidate_uuids: list of str
1589 @param master_candidate_uuids: list of UUIDs of the current master candidates
1590 @type potential_master_candidates: list of str
1591 @param potential_master_candidates: list of names of potential master
1592 candidates
1593 @type keys_to_remove: dict of str to list of str
1594 @param keys_to_remove: a dictionary mapping node UUIDS to lists of SSH keys
1595 to be removed. This list is supposed to be used only if the keys are not
1596 in the public keys file. This is for example the case when removing a
1597 master node's key.
1598 @type from_authorized_keys: boolean
1599 @param from_authorized_keys: whether or not the key should be removed
1600 from the C{authorized_keys} file
1601 @type from_public_keys: boolean
1602 @param from_public_keys: whether or not the key should be remove from
1603 the C{ganeti_pub_keys} file
1604 @type clear_authorized_keys: boolean
1605 @param clear_authorized_keys: whether or not the C{authorized_keys} file
1606 should be cleared on the node whose keys are removed
1607 @type clear_public_keys: boolean
1608 @param clear_public_keys: whether to clear the node's C{ganeti_pub_key} file
1609 @type readd: boolean
1610 @param readd: whether this is called during a readd operation.
1611 @rtype: list of string
1612 @returns: list of feedback messages
1613
1614 """
1615 # Non-disruptive error messages, list of (node, msg) pairs
1616 result_msgs = []
1617
1618 # Make sure at least one of these flags is true.
1619 if not (from_authorized_keys or from_public_keys or clear_authorized_keys
1620 or clear_public_keys):
1621 raise errors.SshUpdateError("No removal from any key file was requested.")
1622
1623 if not ssconf_store:
1624 ssconf_store = ssconf.SimpleStore()
1625
1626 master_node = ssconf_store.GetMasterNode()
1627 ssh_port_map = ssconf_store.GetSshPortMap()
1628
1629 if from_authorized_keys or from_public_keys:
1630 if keys_to_remove:
1631 keys = keys_to_remove
1632 else:
1633 keys = ssh.QueryPubKeyFile([node_uuid], key_file=pub_key_file)
1634 if (not keys or node_uuid not in keys) and not readd:
1635 raise errors.SshUpdateError("Node '%s' not found in the list of public"
1636                                       " SSH keys. It seems someone is trying to"
1637 " remove a key from outside the cluster!"
1638 % node_uuid)
1639 # During an upgrade all nodes have the master key. In this case we
1640 # should not remove it to avoid accidentally shutting down cluster
1641 # SSH communication
1642 master_keys = None
1643 if master_uuid:
1644 master_keys = ssh.QueryPubKeyFile([master_uuid], key_file=pub_key_file)
1645 for master_key in master_keys:
1646         if master_key in keys.get(node_uuid, []):
1647           keys[node_uuid].remove(master_key)
1648
1649 if node_name == master_node and not keys_to_remove:
1650 raise errors.SshUpdateError("Cannot remove the master node's keys.")
1651
1652 if node_uuid in keys:
1653 base_data = {}
1654 _InitSshUpdateData(base_data, noded_cert_file, ssconf_store)
1655 cluster_name = base_data[constants.SSHS_CLUSTER_NAME]
1656
1657 if from_authorized_keys:
1658 base_data[constants.SSHS_SSH_AUTHORIZED_KEYS] = \
1659 (constants.SSHS_REMOVE, keys)
1660 (auth_key_file, _) = \
1661 ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False,
1662 dircheck=False)
1663 ssh.RemoveAuthorizedKeys(auth_key_file, keys[node_uuid])
1664
1665 pot_mc_data = copy.deepcopy(base_data)
1666
1667 if from_public_keys:
1668 pot_mc_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1669 (constants.SSHS_REMOVE, keys)
1670 ssh.RemovePublicKey(node_uuid, key_file=pub_key_file)
1671
1672 all_nodes = ssconf_store.GetNodeList()
1673 online_nodes = ssconf_store.GetOnlineNodeList()
1674 logging.debug("Removing key of node '%s' from all nodes but itself and"
1675 " master.", node_name)
1676 for node in all_nodes:
1677 if node == master_node:
1678 logging.debug("Skipping master node '%s'.", master_node)
1679 continue
1680 if node not in online_nodes:
1681 logging.debug("Skipping offline node '%s'.", node)
1682 continue
1683 ssh_port = ssh_port_map.get(node)
1684 if not ssh_port:
1685 raise errors.OpExecError("No SSH port information available for"
1686 " node '%s', map: %s." %
1687 (node, ssh_port_map))
1688 error_msg_final = ("When removing the key of node '%s', updating the"
1689 " SSH key files of node '%s' failed. Last error"
1690 " was: %s.")
1691 if node in potential_master_candidates:
1692 logging.debug("Updating key setup of potential master candidate node"
1693 " %s.", node)
1694 try:
1695 utils.RetryByNumberOfTimes(
1696 constants.SSHS_MAX_RETRIES,
1697 errors.SshUpdateError,
1698 run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
1699 ssh_port, pot_mc_data,
1700 debug=False, verbose=False, use_cluster_key=False,
1701 ask_key=False, strict_host_check=False)
1702 except errors.SshUpdateError as last_exception:
1703 error_msg = error_msg_final % (
1704 node_name, node, last_exception)
1705 result_msgs.append((node, error_msg))
1706 logging.error(error_msg)
1707
1708 else:
1709 if from_authorized_keys:
1710 logging.debug("Updating key setup of normal node %s.", node)
1711 try:
1712 utils.RetryByNumberOfTimes(
1713 constants.SSHS_MAX_RETRIES,
1714 errors.SshUpdateError,
1715 run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
1716 ssh_port, base_data,
1717 debug=False, verbose=False, use_cluster_key=False,
1718 ask_key=False, strict_host_check=False)
1719 except errors.SshUpdateError as last_exception:
1720 error_msg = error_msg_final % (
1721 node_name, node, last_exception)
1722 result_msgs.append((node, error_msg))
1723 logging.error(error_msg)
1724
1725 if clear_authorized_keys or from_public_keys or clear_public_keys:
1726 data = {}
1727 _InitSshUpdateData(data, noded_cert_file, ssconf_store)
1728 cluster_name = data[constants.SSHS_CLUSTER_NAME]
1729 ssh_port = ssh_port_map.get(node_name)
1730 if not ssh_port:
1731 raise errors.OpExecError("No SSH port information available for"
1732                                " node '%s', which is leaving the cluster." % node_name)
1733
1734 if clear_authorized_keys:
1735 # The 'authorized_keys' file is not solely managed by Ganeti. Therefore,
1736 # we have to specify exactly which keys to clear to leave keys untouched
1737 # that were not added by Ganeti.
1738 other_master_candidate_uuids = [uuid for uuid in master_candidate_uuids
1739 if uuid != node_uuid]
1740 candidate_keys = ssh.QueryPubKeyFile(other_master_candidate_uuids,
1741 key_file=pub_key_file)
1742 data[constants.SSHS_SSH_AUTHORIZED_KEYS] = \
1743 (constants.SSHS_REMOVE, candidate_keys)
1744
1745 if clear_public_keys:
1746 data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1747 (constants.SSHS_CLEAR, {})
1748 elif from_public_keys:
1749 # Since clearing the public keys subsumes removing just a single key,
1750       # we only do it if clear_public_keys is 'False'.
1751
1752 if keys[node_uuid]:
1753 data[constants.SSHS_SSH_PUBLIC_KEYS] = \
1754 (constants.SSHS_REMOVE, keys)
1755
1756 # If we have no changes to any keyfile, just return
1757 if not (constants.SSHS_SSH_PUBLIC_KEYS in data or
1758 constants.SSHS_SSH_AUTHORIZED_KEYS in data):
1759       return result_msgs
1760
1761 logging.debug("Updating SSH key setup of target node '%s'.", node_name)
1762 try:
1763 utils.RetryByNumberOfTimes(
1764 constants.SSHS_MAX_RETRIES,
1765 errors.SshUpdateError,
1766 run_cmd_fn, cluster_name, node_name, pathutils.SSH_UPDATE,
1767 ssh_port, data,
1768 debug=False, verbose=False, use_cluster_key=False,
1769 ask_key=False, strict_host_check=False)
1770 except errors.SshUpdateError as last_exception:
1771 result_msgs.append(
1772 (node_name,
1773 ("Removing SSH keys from node '%s' failed."
1774 " This can happen when the node is already unreachable."
1775 " Error: %s" % (node_name, last_exception))))
1776
1777 return result_msgs
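
# Illustrative sketch, never called from the module itself: removing a
# demoted node's key from the key files of the remaining cluster. All
# UUIDs and node names are hypothetical placeholders.
def _ExampleRemoveNodeSshKey():
  return RemoveNodeSshKey("0000-node-uuid", "node3.example.com",
                          ["0000-mc1-uuid", "0000-mc2-uuid"],
                          ["node1.example.com", "node2.example.com"],
                          from_authorized_keys=True,
                          from_public_keys=True)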
1778
1779
1780 def _GenerateNodeSshKey(node_uuid, node_name, ssh_port_map,
1781 pub_key_file=pathutils.SSH_PUB_KEYS,
1782 ssconf_store=None,
1783 noded_cert_file=pathutils.NODED_CERT_FILE,
1784 run_cmd_fn=ssh.RunSshCmdWithStdin,
1785 suffix=""):
1786 """Generates the root SSH key pair on the node.
1787
1788 @type node_uuid: str
1789   @param node_uuid: UUID of the node whose key is generated
1790   @type node_name: str
1791   @param node_name: name of the node whose key is generated
1792 @type ssh_port_map: dict of str to int
1793 @param ssh_port_map: mapping of node names to their SSH port
1794
1795 """
1796 if not ssconf_store:
1797 ssconf_store = ssconf.SimpleStore()
1798
1799 keys_by_uuid = ssh.QueryPubKeyFile([node_uuid], key_file=pub_key_file)
1800 if not keys_by_uuid or node_uuid not in keys_by_uuid:
1801 raise errors.SshUpdateError("Node %s (UUID: %s) whose key is requested to"
1802 " be regenerated is not registered in the"
1803 " public keys file." % (node_name, node_uuid))
1804
1805 data = {}
1806 _InitSshUpdateData(data, noded_cert_file, ssconf_store)
1807 cluster_name = data[constants.SSHS_CLUSTER_NAME]
1808 data[constants.SSHS_GENERATE] = {constants.SSHS_SUFFIX: suffix}
1809
1810 run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE,
1811 ssh_port_map.get(node_name), data,
1812 debug=False, verbose=False, use_cluster_key=False,
1813 ask_key=False, strict_host_check=False)
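
# For reference, a minimal sketch of the payload _GenerateNodeSshKey sends
# to the ssh_update helper; the real dictionary also carries the noded
# certificate data added by _InitSshUpdateData, and the cluster name here
# is a hypothetical placeholder.
def _ExampleGenerateKeyPayload():
  return {
    constants.SSHS_CLUSTER_NAME: "cluster.example.com",
    # An empty suffix regenerates the key in place; RenewSshKeys uses
    # constants.SSHS_MASTER_SUFFIX to create a temporary master key first.
    constants.SSHS_GENERATE: {constants.SSHS_SUFFIX: ""},
  }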
1814
1815
1816 def _GetMasterNodeUUID(node_uuid_name_map, master_node_name):
1817 master_node_uuids = [node_uuid for (node_uuid, node_name)
1818 in node_uuid_name_map
1819 if node_name == master_node_name]
1820 if len(master_node_uuids) != 1:
1821 raise errors.SshUpdateError("No (unique) master UUID found. Master node"
1822 " name: '%s', Master UUID: '%s'" %
1823 (master_node_name, master_node_uuids))
1824 return master_node_uuids[0]
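
# Example of the lookup above with hypothetical values: given the
# (uuid, name) pairs built by zip(node_uuids, node_names), the master's
# UUID is the one whose name matches the ssconf master node name.
def _ExampleGetMasterNodeUUID():
  uuid_name_map = [("0000-uuid-1", "node1.example.com"),
                   ("0000-uuid-2", "master.example.com")]
  return _GetMasterNodeUUID(uuid_name_map, "master.example.com")  # uuid-2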
1825
1826
1827 def _GetOldMasterKeys(master_node_uuid, pub_key_file):
1828 old_master_keys_by_uuid = ssh.QueryPubKeyFile([master_node_uuid],
1829 key_file=pub_key_file)
1830 if not old_master_keys_by_uuid:
1831 raise errors.SshUpdateError("No public key of the master node (UUID '%s')"
1832 " found, not generating a new key."
1833 % master_node_uuid)
1834 return old_master_keys_by_uuid
1835
1836
1837 def _GetNewMasterKey(root_keyfiles, master_node_uuid):
1838 new_master_keys = []
1839 for (_, (_, public_key_file)) in root_keyfiles.items():
1840 public_key_dir = os.path.dirname(public_key_file)
1841 public_key_file_tmp_filename = \
1842 os.path.splitext(os.path.basename(public_key_file))[0] \
1843 + constants.SSHS_MASTER_SUFFIX + ".pub"
1844 public_key_path_tmp = os.path.join(public_key_dir,
1845 public_key_file_tmp_filename)
1846 if os.path.exists(public_key_path_tmp):
1847 # for some key types, there might not be any keys
1848 key = utils.ReadFile(public_key_path_tmp)
1849 new_master_keys.append(key)
1850 if not new_master_keys:
1851 raise errors.SshUpdateError("Cannot find any type of temporary SSH key.")
1852 return {master_node_uuid: new_master_keys}
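
# Sketch of the filename arithmetic used above: for a (hypothetical)
# public key file "/root/.ssh/id_rsa.pub", the temporary master key is
# looked up at "/root/.ssh/id_rsa<SSHS_MASTER_SUFFIX>.pub".
def _ExampleTmpMasterKeyPath(public_key_file):
  public_key_dir = os.path.dirname(public_key_file)
  tmp_name = (os.path.splitext(os.path.basename(public_key_file))[0]
              + constants.SSHS_MASTER_SUFFIX + ".pub")
  return os.path.join(public_key_dir, tmp_name)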
1853
1854
1855 def _ReplaceMasterKeyOnMaster(root_keyfiles):
1856 number_of_moves = 0
1857 for (_, (private_key_file, public_key_file)) in root_keyfiles.items():
1858 key_dir = os.path.dirname(public_key_file)
1859 private_key_file_tmp = \
1860 os.path.basename(private_key_file) + constants.SSHS_MASTER_SUFFIX
1861 public_key_file_tmp = private_key_file_tmp + ".pub"
1862 private_key_path_tmp = os.path.join(key_dir,
1863 private_key_file_tmp)
1864 public_key_path_tmp = os.path.join(key_dir,
1865 public_key_file_tmp)
1866 if os.path.exists(public_key_file):
1867 utils.CreateBackup(public_key_file)
1868 utils.RemoveFile(public_key_file)
1869 if os.path.exists(private_key_file):
1870 utils.CreateBackup(private_key_file)
1871 utils.RemoveFile(private_key_file)
1872 if os.path.exists(public_key_path_tmp) and \
1873 os.path.exists(private_key_path_tmp):
1874 # for some key types, there might not be any keys
1875 shutil.move(public_key_path_tmp, public_key_file)
1876 shutil.move(private_key_path_tmp, private_key_file)
1877 number_of_moves += 1
1878 if not number_of_moves:
1879 raise errors.SshUpdateError("Could not move at least one master SSH key.")
1880
1881
1882 def RenewSshKeys(node_uuids, node_names, master_candidate_uuids,
1883 potential_master_candidates,
1884 pub_key_file=pathutils.SSH_PUB_KEYS,
1885 ssconf_store=None,
1886 noded_cert_file=pathutils.NODED_CERT_FILE,
1887 run_cmd_fn=ssh.RunSshCmdWithStdin):
1888 """Renews all SSH keys and updates authorized_keys and ganeti_pub_keys.
1889
1890 @type node_uuids: list of str
1891 @param node_uuids: list of node UUIDs whose keys should be renewed
1892 @type node_names: list of str
1893 @param node_names: list of node names whose keys should be removed. This list
1894 should match the C{node_uuids} parameter
1895 @type master_candidate_uuids: list of str
1896 @param master_candidate_uuids: list of UUIDs of master candidates or
1897 master node
1898 @type pub_key_file: str
1899   @param pub_key_file: file path of the public key file
1900 @type noded_cert_file: str
1901 @param noded_cert_file: path of the noded SSL certificate file
1902 @type run_cmd_fn: function
1903 @param run_cmd_fn: function to run commands on remote nodes via SSH
1904 @raises ProgrammerError: if node_uuids and node_names don't match;
1905 SshUpdateError if a node's key is missing from the public key file,
1906     if a node's new SSH key could not be fetched from it, or if there
1907     is no entry or more than one entry in the public key list for the
1908     master node.
1909
1910 """
1911 if not ssconf_store:
1912 ssconf_store = ssconf.SimpleStore()
1913 cluster_name = ssconf_store.GetClusterName()
1914
1915   if len(node_uuids) != len(node_names):
1916 raise errors.ProgrammerError("List of nodes UUIDs and node names"
1917 " does not match in length.")
1918
1919 (_, root_keyfiles) = \
1920 ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
1921 (_, dsa_pub_keyfile) = root_keyfiles[constants.SSHK_DSA]
1922 old_master_key = utils.ReadFile(dsa_pub_keyfile)
1923
1924 node_uuid_name_map = zip(node_uuids, node_names)
1925
1926 master_node_name = ssconf_store.GetMasterNode()
1927 master_node_uuid = _GetMasterNodeUUID(node_uuid_name_map, master_node_name)
1928 ssh_port_map = ssconf_store.GetSshPortMap()
1929 # List of all node errors that happened, but which did not abort the
1930 # procedure as a whole. It is important that this is a list to have a
1931 # somewhat chronological history of events.
1932 all_node_errors = []
1933
1934 # process non-master nodes
1935 for node_uuid, node_name in node_uuid_name_map:
1936 if node_name == master_node_name:
1937 continue
1938 master_candidate = node_uuid in master_candidate_uuids
1939 potential_master_candidate = node_name in potential_master_candidates
1940
1941 keys_by_uuid = ssh.QueryPubKeyFile([node_uuid], key_file=pub_key_file)
1942 if not keys_by_uuid:
1943 raise errors.SshUpdateError("No public key of node %s (UUID %s) found,"
1944 " not generating a new key."
1945 % (node_name, node_uuid))
1946
1947 if master_candidate:
1948 logging.debug("Fetching old SSH key from node '%s'.", node_name)
1949 old_pub_key = ssh.ReadRemoteSshPubKeys(dsa_pub_keyfile,
1950 node_name, cluster_name,
1951 ssh_port_map[node_name],
1952 False, # ask_key
1953 False) # key_check
1954 if old_pub_key != old_master_key:
1955 # If we are already in a multi-key setup (that is past Ganeti 2.12),
1956 # we can safely remove the old key of the node. Otherwise, we cannot
1957 # remove that node's key, because it is also the master node's key
1958 # and that would terminate all communication from the master to the
1959 # node.
1960 logging.debug("Removing SSH key of node '%s'.", node_name)
1961 node_errors = RemoveNodeSshKey(
1962 node_uuid, node_name, master_candidate_uuids,
1963 potential_master_candidates,
1964 master_uuid=master_node_uuid, from_authorized_keys=master_candidate,
1965 from_public_keys=False, clear_authorized_keys=False,
1966 clear_public_keys=False)
1967 if node_errors:
1968 all_node_errors = all_node_errors + node_errors
1969 else:
1970 logging.debug("Old key of node '%s' is the same as the current master"
1971 " key. Not deleting that key on the node.", node_name)
1972
1973 logging.debug("Generating new SSH key for node '%s'.", node_name)
1974 _GenerateNodeSshKey(node_uuid, node_name, ssh_port_map,
1975 pub_key_file=pub_key_file,
1976 ssconf_store=ssconf_store,
1977 noded_cert_file=noded_cert_file,
1978 run_cmd_fn=run_cmd_fn)
1979
1980 try:
1981 logging.debug("Fetching newly created SSH key from node '%s'.", node_name)
1982 pub_key = ssh.ReadRemoteSshPubKeys(dsa_pub_keyfile,
1983 node_name, cluster_name,
1984 ssh_port_map[node_name],
1985 False, # ask_key
1986 False) # key_check
1987   except Exception:  # pylint: disable=W0703
1988 raise errors.SshUpdateError("Could not fetch key of node %s"
1989 " (UUID %s)" % (node_name, node_uuid))
1990
1991 if potential_master_candidate:
1992 ssh.RemovePublicKey(node_uuid, key_file=pub_key_file)
1993 ssh.AddPublicKey(node_uuid, pub_key, key_file=pub_key_file)
1994
1995     logging.debug("Adding SSH key of node '%s'.", node_name)
1996 node_errors = AddNodeSshKey(
1997 node_uuid, node_name, potential_master_candidates,
1998 to_authorized_keys=master_candidate,
1999 to_public_keys=potential_master_candidate,
2000 get_public_keys=True,
2001 pub_key_file=pub_key_file, ssconf_store=ssconf_store,
2002 noded_cert_file=noded_cert_file,
2003 run_cmd_fn=run_cmd_fn)
2004 if node_errors:
2005 all_node_errors = all_node_errors + node_errors
2006
2007 # Renewing the master node's key
2008
2009 # Preserve the old keys for now
2010 old_master_keys_by_uuid = _GetOldMasterKeys(master_node_uuid, pub_key_file)
2011
2012 # Generate a new master key with a suffix, don't touch the old one for now
2013   logging.debug("Generating new SSH key of the master node.")
2014 _GenerateNodeSshKey(master_node_uuid, master_node_name, ssh_port_map,
2015 pub_key_file=pub_key_file,
2016 ssconf_store=ssconf_store,
2017 noded_cert_file=noded_cert_file,
2018 run_cmd_fn=run_cmd_fn,
2019 suffix=constants.SSHS_MASTER_SUFFIX)
2020 # Read newly created master key
2021 new_master_key_dict = _GetNewMasterKey(root_keyfiles, master_node_uuid)
2022
2023   # Replace master key in the master node's public key file
2024 ssh.RemovePublicKey(master_node_uuid, key_file=pub_key_file)
2025 for pub_key in new_master_key_dict[master_node_uuid]:
2026 ssh.AddPublicKey(master_node_uuid, pub_key, key_file=pub_key_file)
2027
2028   # Add the new master key to all nodes' public and authorized keys
2029   logging.debug("Adding new master key to all nodes.")
2030 node_errors = AddNodeSshKey(
2031 master_node_uuid, master_node_name, potential_master_candidates,
2032 to_authorized_keys=True, to_public_keys=True,
2033 get_public_keys=False, pub_key_file=pub_key_file,
2034 ssconf_store=ssconf_store, noded_cert_file=noded_cert_file,
2035 run_cmd_fn=run_cmd_fn)
2036 if node_errors:
2037 all_node_errors = all_node_errors + node_errors
2038
2039 # Remove the old key file and rename the new key to the non-temporary filename
2040 _ReplaceMasterKeyOnMaster(root_keyfiles)
2041
2042 # Remove old key from authorized keys
2043 (auth_key_file, _) = \
2044 ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
2045 ssh.RemoveAuthorizedKeys(auth_key_file,
2046 old_master_keys_by_uuid[master_node_uuid])
2047
2048   # Remove the old key from all nodes' authorized keys files
2049   logging.debug("Removing the old master key from all nodes.")
2050 node_errors = RemoveNodeSshKey(
2051 master_node_uuid, master_node_name, master_candidate_uuids,
2052 potential_master_candidates,
2053 keys_to_remove=old_master_keys_by_uuid, from_authorized_keys=True,
2054 from_public_keys=False, clear_authorized_keys=False,
2055 clear_public_keys=False)
2056 if node_errors:
2057 all_node_errors = all_node_errors + node_errors
2058
2059 return all_node_errors
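
# Illustrative sketch, never called from the module itself: a full key
# rotation on a two-node cluster. The UUID and name lists are hypothetical
# but must match pairwise, as enforced above.
def _ExampleRenewSshKeys():
  return RenewSshKeys(["0000-uuid-1", "0000-uuid-2"],
                      ["master.example.com", "node2.example.com"],
                      ["0000-uuid-1"],
                      ["master.example.com", "node2.example.com"])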
2060
2061
2062 def GetBlockDevSizes(devices):
2063 """Return the size of the given block devices
2064
2065 @type devices: list
2066 @param devices: list of block device nodes to query
2067 @rtype: dict
2068 @return:
2069 dictionary of all block devices under /dev (key). The value is their
2070 size in MiB.
2071
2072 {'/dev/disk/by-uuid/123456-12321231-312312-312': 124}
2073
2074 """
2075 DEV_PREFIX = "/dev/"
2076 blockdevs = {}
2077
2078 for devpath in devices:
2079 if not utils.IsBelowDir(DEV_PREFIX, devpath):
2080 continue
2081
2082 try:
2083 st = os.stat(devpath)
2084 except EnvironmentError, err:
2085 logging.warning("Error stat()'ing device %s: %s", devpath, str(err))
2086 continue
2087
2088 if stat.S_ISBLK(st.st_mode):
2089 result = utils.RunCmd(["blockdev", "--getsize64", devpath])
2090 if result.failed:
2091 # We don't want to fail, just do not list this device as available
2092 logging.warning("Cannot get size for block device %s", devpath)
2093 continue
2094
2095 size = int(result.stdout) / (1024 * 1024)
2096 blockdevs[devpath] = size
2097 return blockdevs
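
# Example with hypothetical paths: entries outside /dev are silently
# skipped, so only real block devices end up in the result (sizes in MiB).
def _ExampleGetBlockDevSizes():
  # e.g. {"/dev/sda": 512000} for a 500 GiB disk
  return GetBlockDevSizes(["/dev/sda", "/etc/passwd"])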
2098
2099
2100 def GetVolumeList(vg_names):
2101 """Compute list of logical volumes and their size.
2102
2103 @type vg_names: list
2104 @param vg_names: the volume groups whose LVs we should list, or
2105 empty for all volume groups
2106 @rtype: dict
2107 @return:
2108     dictionary of all partitions (key) with value being a tuple of
2109 their size (in MiB), inactive and online status::
2110
2111 {'xenvg/test1': ('20.06', True, True)}
2112
2113 in case of errors, a string is returned with the error
2114 details.
2115
2116 """
2117 lvs = {}
2118 sep = "|"
2119 if not vg_names:
2120 vg_names = []
2121 result = utils.RunCmd(["lvs", "--noheadings", "--units=m", "--nosuffix",
2122 "--separator=%s" % sep,
2123 "-ovg_name,lv_name,lv_size,lv_attr"] + vg_names)
2124 if result.failed:
2125 _Fail("Failed to list logical volumes, lvs output: %s", result.output)
2126
2127 for line in result.stdout.splitlines():
2128 line = line.strip()
2129 match = _LVSLINE_REGEX.match(line)
2130 if not match:
2131 logging.error("Invalid line returned from lvs output: '%s'", line)
2132 continue
2133 vg_name, name, size, attr = match.groups()
2134 inactive = attr[4] == "-"
2135 online = attr[5] == "o"
2136 virtual = attr[0] == "v"
2137 if virtual:
2138 # we don't want to report such volumes as existing, since they
2139 # don't really hold data
2140 continue
2141 lvs[vg_name + "/" + name] = (size, inactive, online)
2142
2143 return lvs
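
# Sketch of the lv_attr decoding above: for a typical active, open LV the
# attribute string is "-wi-ao", so attr[0] != "v" (not virtual),
# attr[4] != "-" (not inactive) and attr[5] == "o" (online).
def _ExampleDecodeLvAttr(attr="-wi-ao"):
  # returns (inactive, online, virtual) == (False, True, False)
  return (attr[4] == "-", attr[5] == "o", attr[0] == "v")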
2144
2145
2146 def ListVolumeGroups():
2147 """List the volume groups and their size.
2148
2149 @rtype: dict
2150 @return: dictionary with keys volume name and values the
2151 size of the volume
2152
2153 """
2154 return utils.ListVolumeGroups()
2155
2156
2157 def NodeVolumes():
2158 """List all volumes on this node.
2159
2160 @rtype: list
2161 @return:
2162 A list of dictionaries, each having four keys:
2163 - name: the logical volume name,
2164 - size: the size of the logical volume
2165 - dev: the physical device on which the LV lives
2166 - vg: the volume group to which it belongs
2167
2168 In case of errors, we return an empty list and log the
2169 error.
2170
2171 Note that since a logical volume can live on multiple physical
2172 volumes, the resulting list might include a logical volume
2173 multiple times.
2174
2175 """
2176 result = utils.RunCmd(["lvs", "--noheadings", "--units=m", "--nosuffix",
2177 "--separator=|",
2178 "--options=lv_name,lv_size,devices,vg_name"])
2179 if result.failed:
2180 _Fail("Failed to list logical volumes, lvs output: %s",
2181 result.output)
2182
2183 def parse_dev(dev):
2184 return dev.split("(")[0]
2185
2186 def handle_dev(dev):
2187 return [parse_dev(x) for x in dev.split(",")]
2188
2189 def map_line(line):
2190 line = [v.strip() for v in line]
2191 return [{"name": line[0], "size": line[1],
2192 "dev": dev, "vg": line[3]} for dev in handle_dev(line[2])]
2193
2194 all_devs = []
2195 for line in result.stdout.splitlines():
2196 if line.count("|") >= 3:
2197 all_devs.extend(map_line(line.split("|")))
2198 else:
2199 logging.warning("Strange line in the output from lvs: '%s'", line)
2200 return all_devs
2201
2202
2203 def BridgesExist(bridges_list):
2204 """Check if a list of bridges exist on the current node.
2205
2206 @rtype: boolean
2207 @return: C{True} if all of them exist, C{False} otherwise
2208
2209 """
2210 missing = []
2211 for bridge in bridges_list:
2212 if not utils.BridgeExists(bridge):
2213 missing.append(bridge)
2214
2215 if missing:
2216 _Fail("Missing bridges %s", utils.CommaJoin(missing))
2217
2218
2219 def GetInstanceListForHypervisor(hname, hvparams=None,
2220 get_hv_fn=hypervisor.GetHypervisor):
2221 """Provides a list of instances of the given hypervisor.
2222
2223 @type hname: string
2224 @param hname: name of the hypervisor
2225 @type hvparams: dict of strings
2226 @param hvparams: hypervisor parameters for the given hypervisor
2227 @type get_hv_fn: function
2228 @param get_hv_fn: function that returns a hypervisor for the given hypervisor
2229 name; optional parameter to increase testability
2230
2231 @rtype: list
2232 @return: a list of all running instances on the current node
2233 - instance1.example.com
2234 - instance2.example.com
2235
2236 """
2237 try:
2238 return get_hv_fn(hname).ListInstances(hvparams=hvparams)
2239 except errors.HypervisorError, err:
2240 _Fail("Error enumerating instances (hypervisor %s): %s",
2241 hname, err, exc=True)
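
# The get_hv_fn parameter exists mainly for testability; a stub hypervisor
# can be injected as sketched below (the stub class is hypothetical).
def _ExampleStubbedInstanceList():
  class _StubHypervisor(object):
    def ListInstances(self, hvparams=None):
      return ["instance1.example.com"]
  return GetInstanceListForHypervisor(constants.HT_FAKE,
                                      get_hv_fn=lambda _: _StubHypervisor())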
2242
2243
2244 def GetInstanceList(hypervisor_list, all_hvparams=None,
2245 get_hv_fn=hypervisor.GetHypervisor):
2246 """Provides a list of instances.
2247
2248 @type hypervisor_list: list
2249 @param hypervisor_list: the list of hypervisors to query information
2250 @type all_hvparams: dict of dict of strings
2251 @param all_hvparams: a dictionary mapping hypervisor types to respective
2252 cluster-wide hypervisor parameters
2253 @type get_hv_fn: function
2254 @param get_hv_fn: function that returns a hypervisor for the given hypervisor
2255 name; optional parameter to increase testability
2256
2257 @rtype: list
2258 @return: a list of all running instances on the current node
2259 - instance1.example.com
2260 - instance2.example.com
2261
2262 """
2263 results = []
2264 for hname in hypervisor_list:
2265 hvparams = all_hvparams[hname]
2266 results.extend(GetInstanceListForHypervisor(hname, hvparams=hvparams,
2267 get_hv_fn=get_hv_fn))
2268 return results
2269
2270
2271 def GetInstanceInfo(instance, hname, hvparams=None):
2272 """Gives back the information about an instance as a dictionary.
2273
2274 @type instance: string
2275 @param instance: the instance name
2276 @type hname: string
2277 @param hname: the hypervisor type of the instance
2278 @type hvparams: dict of strings
2279 @param hvparams: the instance's hvparams
2280
2281 @rtype: dict
2282 @return: dictionary with the following keys:
2283 - memory: memory size of instance (int)
2284 - state: state of instance (HvInstanceState)
2285 - time: cpu time of instance (float)
2286 - vcpus: the number of vcpus (int)
2287
2288 """
2289 output = {}
2290
2291 iinfo = hypervisor.GetHypervisor(hname).GetInstanceInfo(instance,
2292 hvparams=hvparams)
2293 if iinfo is not None:
2294 output["memory"] = iinfo[2]
2295 output["vcpus"] = iinfo[3]
2296 output["state"] = iinfo[4]
2297 output["time"] = iinfo[5]
2298
2299 return output
2300
2301
2302 def GetInstanceMigratable(instance):
2303 """Computes whether an instance can be migrated.
2304
2305 @type instance: L{objects.Instance}
2306 @param instance: object representing the instance to be checked.
2307
2308   @rtype: None
2309   @raise RPCFail: if the instance is not running on this node; missing
2310       block device symlinks are only logged as warnings and do not
2311       cause a failure
2312
2313 """
2314 hyper = hypervisor.GetHypervisor(instance.hypervisor)
2315 iname = instance.name
2316 if iname not in hyper.ListInstances(hvparams=instance.hvparams):
2317 _Fail("Instance %s is not running", iname)
2318
2319 for idx in range(len(instance.disks_info)):
2320 link_name = _GetBlockDevSymlinkPath(iname, idx)
2321 if not os.path.islink(link_name):
2322 logging.warning("Instance %s is missing symlink %s for disk %d",
2323 iname, link_name, idx)
2324
2325
2326 def GetAllInstancesInfo(hypervisor_list, all_hvparams):
2327 """Gather data about all instances.
2328
2329 This is the equivalent of L{GetInstanceInfo}, except that it
2330 computes data for all instances at once, thus being faster if one
2331 needs data about more than one instance.
2332
2333 @type hypervisor_list: list
2334 @param hypervisor_list: list of hypervisors to query for instance data
2335 @type all_hvparams: dict of dict of strings
2336 @param all_hvparams: mapping of hypervisor names to hvparams
2337
2338 @rtype: dict
2339 @return: dictionary of instance: data, with data having the following keys:
2340 - memory: memory size of instance (int)
2341 - state: xen state of instance (string)
2342 - time: cpu time of instance (float)
2343 - vcpus: the number of vcpus
2344
2345 """
2346 output = {}
2347 for hname in hypervisor_list:
2348 hvparams = all_hvparams[hname]
2349 iinfo = hypervisor.GetHypervisor(hname).GetAllInstancesInfo(hvparams)
2350 if iinfo:
2351 for name, _, memory, vcpus, state, times in iinfo:
2352 value = {
2353 "memory": memory,
2354 "vcpus": vcpus,
2355 "state": state,
2356 "time": times,
2357 }
2358 if name in output:
2359 # we only check static parameters, like memory and vcpus,
2360 # and not state and time which can change between the
2361 # invocations of the different hypervisors
2362 for key in "memory", "vcpus":
2363 if value[key] != output[name][key]:
2364 _Fail("Instance %s is running twice"
2365 " with different parameters", name)
2366 output[name] = value
2367
2368 return output
2369
2370
2371 def GetInstanceConsoleInfo(instance_param_dict,
2372 get_hv_fn=hypervisor.GetHypervisor):
2373 """Gather data about the console access of a set of instances of this node.
2374
2375 This function assumes that the caller already knows which instances are on
2376 this node, by calling a function such as L{GetAllInstancesInfo} or
2377 L{GetInstanceList}.
2378
2379 For every instance, a large amount of configuration data needs to be
2380 provided to the hypervisor interface in order to receive the console
2381 information. Whether this could or should be cut down can be discussed.
2382 The information is provided in a dictionary indexed by instance name,
2383 allowing any number of instance queries to be done.
2384
2385 @type instance_param_dict: dict of string to tuple of dictionaries, where the
2386 dictionaries represent: L{objects.Instance}, L{objects.Node},
2387 L{objects.NodeGroup}, HvParams, BeParams
2388 @param instance_param_dict: mapping of instance name to parameters necessary
2389 for console information retrieval
2390
2391 @rtype: dict
2392 @return: dictionary of instance: data, with data having the following keys:
2393 - instance: instance name
2394 - kind: console kind
2395 - message: used with kind == CONS_MESSAGE, indicates console to be
2396 unavailable, supplies error message
2397 - host: host to connect to
2398 - port: port to use
2399 - user: user for login
2400 - command: the command, broken into parts as an array
2401 - display: unknown, potentially unused?
2402
2403 """
2404
2405 output = {}
2406 for inst_name in instance_param_dict:
2407 instance = instance_param_dict[inst_name]["instance"]
2408 pnode = instance_param_dict[inst_name]["node"]
2409 group = instance_param_dict[inst_name]["group"]
2410 hvparams = instance_param_dict[inst_name]["hvParams"]
2411 beparams = instance_param_dict[inst_name]["beParams"]
2412
2413 instance = objects.Instance.FromDict(instance)
2414 pnode = objects.Node.FromDict(pnode)
2415 group = objects.NodeGroup.FromDict(group)
2416
2417 h = get_hv_fn(instance.hypervisor)
2418 output[inst_name] = h.GetInstanceConsole(instance, pnode, group,
2419 hvparams, beparams).ToDict()
2420
2421 return output
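
# For reference, a sketch of the instance_param_dict shape expected above;
# in practice the values are serialized configuration objects coming from
# the RPC layer.
def _ExampleConsoleParamDict(inst, pnode, group, hvparams, beparams):
  return {
    inst.name: {
      "instance": inst.ToDict(),
      "node": pnode.ToDict(),
      "group": group.ToDict(),
      "hvParams": hvparams,
      "beParams": beparams,
    },
  }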
2422
2423
2424 def _InstanceLogName(kind, os_name, instance, component):
2425 """Compute the OS log filename for a given instance and operation.
2426
2427 The instance name and os name are passed in as strings since not all
2428 operations have these as part of an instance object.
2429
2430 @type kind: string
2431 @param kind: the operation type (e.g. add, import, etc.)
2432 @type os_name: string
2433 @param os_name: the os name
2434 @type instance: string
2435 @param instance: the name of the instance being imported/added/etc.
2436 @type component: string or None
2437 @param component: the name of the component of the instance being
2438 transferred
2439
2440 """
2441 # TODO: Use tempfile.mkstemp to create unique filename
2442 if component:
2443 assert "/" not in component
2444 c_msg = "-%s" % component
2445 else:
2446 c_msg = ""
2447 base = ("%s-%s-%s%s-%s.log" %
2448 (kind, os_name, instance, c_msg, utils.TimestampForFilename()))
2449 return utils.PathJoin(pathutils.LOG_OS_DIR, base)
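
# Sketch, never called from the module itself: importing disk component
# "disk0" of instance "inst1" with the "debootstrap" OS yields a path like
# "<LOG_OS_DIR>/import-debootstrap-inst1-disk0-<timestamp>.log".
def _ExampleInstanceLogName():
  return _InstanceLogName("import", "debootstrap", "inst1", "disk0")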
2450
2451
2452 def InstanceOsAdd(instance, reinstall, debug):
2453 """Add an OS to an instance.
2454
2455 @type instance: L{objects.Instance}
2456 @param instance: Instance whose OS is to be installed
2457 @type reinstall: boolean
2458 @param reinstall: whether this is an instance reinstall
2459 @type debug: integer
2460 @param debug: debug level, passed to the OS scripts
2461 @rtype: None
2462
2463 """
2464 inst_os = OSFromDisk(instance.os)
2465
2466 create_env = OSEnvironment(instance, inst_os, debug)
2467 if reinstall:
2468 create_env["INSTANCE_REINSTALL"] = "1"
2469
2470 logfile = _InstanceLogName("add", instance.os, instance.name, None)
2471
2472 result = utils.RunCmd([inst_os.create_script], env=create_env,
2473 cwd=inst_os.path, output=logfile, reset_env=True)
2474 if result.failed:
2475 logging.error("os create command '%s' returned error: %s, logfile: %s,"
2476 " output: %s", result.cmd, result.fail_reason, logfile,
2477 result.output)
2478 lines = [utils.SafeEncode(val)
2479 for val in utils.TailFile(logfile, lines=20)]
2480 _Fail("OS create script failed (%s), last lines in the"
2481 " log file:\n%s", result.fail_reason, "\n".join(lines), log=False)
2482
2483
2484 def RunRenameInstance(instance, old_name, debug):
2485 """Run the OS rename script for an instance.
2486
2487 @type instance: L{objects.Instance}
2488 @param instance: Instance whose OS is to be installed
2489 @type old_name: string
2490 @param old_name: previous instance name
2491 @type debug: integer
2492 @param debug: debug level, passed to the OS scripts
2493 @rtype: boolean
2494 @return: the success of the operation
2495
2496 """
2497 inst_os = OSFromDisk(instance.os)
2498
2499 rename_env = OSEnvironment(instance, inst_os, debug)
2500 rename_env["OLD_INSTANCE_NAME"] = old_name
2501
2502 logfile = _InstanceLogName("rename", instance.os,
2503 "%s-%s" % (old_name, instance.name), None)
2504
2505 result = utils.RunCmd([inst_os.rename_script], env=rename_env,
2506 cwd=inst_os.path, output=logfile, reset_env=True)
2507
2508 if result.failed:
2509     logging.error("os rename command '%s' returned error: %s, output: %s",
2510 result.cmd, result.fail_reason, result.output)
2511 lines = [utils.SafeEncode(val)
2512 for val in utils.TailFile(logfile, lines=20)]
2513 _Fail("OS rename script failed (%s), last lines in the"
2514 " log file:\n%s", result.fail_reason, "\n".join(lines), log=False)
2515
2516
2517 def _GetBlockDevSymlinkPath(instance_name, idx, _dir=None):
2518 """Returns symlink path for block device.
2519
2520 """
2521 if _dir is None:
2522 _dir = pathutils.DISK_LINKS_DIR
2523
2524 return utils.PathJoin(_dir,
2525 ("%s%s%s" %
2526 (instance_name, constants.DISK_SEPARATOR, idx)))
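
# Sketch: for instance "inst1.example.com" and disk index 0 this resolves
# to pathutils.DISK_LINKS_DIR plus "inst1.example.com<DISK_SEPARATOR>0";
# the separator character is defined in ganeti.constants.
def _ExampleBlockDevSymlinkPath():
  return _GetBlockDevSymlinkPath("inst1.example.com", 0)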
2527
2528
2529 def _SymlinkBlockDev(instance_name, device_path, idx):
2530   """Set up symlinks to an instance's block device.
2531
2532   This is an auxiliary function run when an instance is started (on the
2533   primary node) or when an instance is migrated (on the target node).
2534
2535
2536 @param instance_name: the name of the target instance
2537 @param device_path: path of the physical block device, on the node
2538 @param idx: the disk index
2539 @return: absolute path to the disk's symlink
2540
2541 """
2542 # In case we have only a userspace access URI, device_path is None
2543 if not device_path:
2544 return None
2545
2546 link_name = _GetBlockDevSymlinkPath(instance_name, idx)
2547 try:
2548 os.symlink(device_path, link_name)
2549 except OSError, err:
2550 if err.errno == errno.EEXIST:
2551 if (not os.path.islink(link_name) or
2552 os.readlink(link_name) != device_path):
2553 os.remove(link_name)
2554 os.symlink(device_path, link_name)
2555 else:
2556 raise
2557
2558 return link_name
2559
2560
2561 def _RemoveBlockDevLinks(instance_name, disks):
2562 """Remove the block device symlinks belonging to the given instance.
2563
2564 """
2565 for idx, _ in enumerate(disks):
2566 link_name = _GetBlockDevSymlinkPath(instance_name, idx)
2567 if os.path.islink(link_name):
2568 try:
2569 os.remove(link_name)
2570 except OSError:
2571 logging.exception("Can't remove symlink '%s'", link_name)
2572
2573
2574 def _CalculateDeviceURI(instance, disk, device):
2575 """Get the URI for the device.
2576
2577 @type instance: L{objects.Instance}
2578 @param instance: the instance which disk belongs to
2579 @type disk: L{objects.Disk}
2580 @param disk: the target disk object
2581 @type device: L{bdev.BlockDev}
2582 @param device: the corresponding BlockDevice
2583 @rtype: string
2584   @return: the device URI if any, else None
2585
2586 """
2587 access_mode = disk.params.get(constants.LDP_ACCESS,
2588 constants.DISK_KERNELSPACE)
2589 if access_mode == constants.DISK_USERSPACE:
2590 # This can raise errors.BlockDeviceError
2591 return device.GetUserspaceAccessUri(instance.hypervisor)
2592 else:
2593 return None
2594
2595
2596 def _GatherAndLinkBlockDevs(instance):
2597 """Set up an instance's block device(s).
2598
2599 This is run on the primary node at instance startup. The block
2600 devices must be already assembled.
2601
2602 @type instance: L{objects.Instance}
2603 @param instance: the instance whose disks we should assemble
2604 @rtype: list
2605 @return: list of (disk_object, link_name, drive_uri)
2606
2607 """
2608 block_devices = []
2609 for idx, disk in enumerate(instance.disks_info):
2610 device = _RecursiveFindBD(disk)
2611 if device is None:
2612 raise errors.BlockDeviceError("Block device '%s' is not set up." %
2613 str(disk))
2614 device.Open()
2615 try:
2616 link_name = _SymlinkBlockDev(instance.name, device.dev_path, idx)
2617 except OSError, e:
2618 raise errors.BlockDeviceError("Cannot create block device symlink: %s" %
2619 e.strerror)
2620 uri = _CalculateDeviceURI(instance, disk, device)
2621
2622 block_devices.append((disk, link_name, uri))
2623
2624 return block_devices
2625
2626
2627 def _IsInstanceUserDown(instance_info):
2628 return instance_info and \
2629 "state" in instance_info and \
2630 hv_base.HvInstanceState.IsShutdown(instance_info["state"])
2631
2632
2633 def _GetInstanceInfo(instance):
2634 """Helper function L{GetInstanceInfo}"""
2635 return GetInstanceInfo(instance.name, instance.hypervisor,
2636 hvparams=instance.hvparams)
2637
2638
2639 def StartInstance(instance, startup_paused, reason, store_reason=True):
2640 """Start an instance.
2641
2642 @type instance: L{objects.Instance}
2643 @param instance: the instance object
2644 @type startup_paused: bool
2645   @param startup_paused: whether to pause the instance at startup
2646 @type reason: list of reasons
2647 @param reason: the reason trail for this startup
2648 @type store_reason: boolean
2649 @param store_reason: whether to store the shutdown reason trail on file
2650 @rtype: None
2651
2652 """
2653 instance_info = _GetInstanceInfo(instance)
2654
2655 if instance_info and not _IsInstanceUserDown(instance_info):
2656 logging.info("Instance '%s' already running, not starting", instance.name)
2657 return
2658
2659 try:
2660 block_devices = _GatherAndLinkBlockDevs(instance)
2661 hyper = hypervisor.GetHypervisor(instance.hypervisor)
2662 hyper.StartInstance(instance, block_devices, startup_paused)
2663 if store_reason:
2664 _StoreInstReasonTrail(instance.name, reason)
2665 except errors.BlockDeviceError, err:
2666 _Fail("Block device error: %s", err, exc=True)
2667 except errors.HypervisorError, err:
2668 _RemoveBlockDevLinks(instance.name, instance.disks_info)
2669 _Fail("Hypervisor error: %s", err, exc=True)
2670
2671
2672 def InstanceShutdown(instance, timeout, reason, store_reason=True):
2673 """Shut an instance down.
2674
2675   @note: this function uses polling with a hardcoded timeout.
2676
2677 @type instance: L{objects.Instance}
2678 @param instance: the instance object
2679 @type timeout: integer
2680 @param timeout: maximum timeout for soft shutdown
2681 @type reason: list of reasons
2682 @param reason: the reason trail for this shutdown
2683 @type store_reason: boolean
2684 @param store_reason: whether to store the shutdown reason trail on file
2685 @rtype: None
2686
2687 """
2688 hyper = hypervisor.GetHypervisor(instance.hypervisor)
2689
2690 if not _GetInstanceInfo(instance):
2691 logging.info("Instance '%s' not running, doing nothing", instance.name)
2692 return
2693
2694 class _TryShutdown(object):
2695 def __init__(self):
2696 self.tried_once = False
2697
2698 def __call__(self):
2699 if not _GetInstanceInfo(instance):
2700 return
2701
2702 try:
2703 hyper.StopInstance(instance, retry=self.tried_once, timeout=timeout)
2704 if store_reason:
2705 _StoreInstReasonTrail(instance.name, reason)
2706 except errors.HypervisorError, err:
2707 # if the instance is no longer existing, consider this a
2708 # success and go to cleanup
2709 if not _GetInstanceInfo(instance):
2710 return
2711
2712 _Fail("Failed to stop instance '%s': %s", instance.name, err)
2713
2714 self.tried_once = True
2715
2716 raise utils.RetryAgain()
2717
2718 try:
2719 utils.Retry(_TryShutdown(), 5, timeout)
2720 except utils.RetryTimeout:
2721 # the shutdown did not succeed
2722 logging.error("Shutdown of '%s' unsuccessful, forcing", instance.name)
2723
2724 try:
2725 hyper.StopInstance(instance, force=True)
2726 except errors.HypervisorError, err:
2727 # only raise an error if the instance still exists, otherwise
2728 # the error could simply be "instance ... unknown"!
2729 if _GetInstanceInfo(instance):
2730 _Fail("Failed to force stop instance '%s': %s", instance.name, err)
2731
2732 time.sleep(1)
2733
2734 if _GetInstanceInfo(instance):
2735 _Fail("Could not shutdown instance '%s' even by destroy", instance.name)
2736
2737 try:
2738 hyper.CleanupInstance(instance.name)
2739 except errors.HypervisorError, err:
2740 logging.warning("Failed to execute post-shutdown cleanup step: %s", err)
2741
2742 _RemoveBlockDevLinks(instance.name, instance.disks_info)
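
# For reference, a sketch of the utils.Retry contract used above: the
# callable returns normally on success, raises utils.RetryAgain to be
# polled again after the delay, and utils.Retry gives up with
# utils.RetryTimeout once the overall timeout is exceeded.
def _ExampleWaitUntilInstanceGone(instance, timeout):
  def _check():
    if _GetInstanceInfo(instance):
      raise utils.RetryAgain()
  try:
    utils.Retry(_check, 5, timeout)
    return True
  except utils.RetryTimeout:
    return False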
2743
2744
2745 def InstanceReboot(instance, reboot_type, shutdown_timeout, reason):
2746 """Reboot an instance.
2747
2748 @type instance: L{objects.Instance}
2749 @param instance: the instance object to reboot
2750 @type reboot_type: str
2751   @param reboot_type: the type of reboot, one of the following
2752 constants:
2753 - L{constants.INSTANCE_REBOOT_SOFT}: only reboot the
2754 instance OS, do not recreate the VM
2755 - L{constants.INSTANCE_REBOOT_HARD}: tear down and
2756 restart the VM (at the hypervisor level)
2757 - the other reboot type (L{constants.INSTANCE_REBOOT_FULL}) is
2758 not accepted here, since that mode is handled differently, in
2759 cmdlib, and translates into full stop and start of the
2760 instance (instead of a call_instance_reboot RPC)
2761 @type shutdown_timeout: integer
2762 @param shutdown_timeout: maximum timeout for soft shutdown
2763 @type reason: list of reasons
2764 @param reason: the reason trail for this reboot
2765 @rtype: None
2766
2767 """
2768 # TODO: this is inconsistent with 'StartInstance' and 'InstanceShutdown'
2769 # because those functions simply 'return' on error whereas this one
2770 # raises an exception with '_Fail'
2771 if not _GetInstanceInfo(instance):
2772 _Fail("Cannot reboot instance '%s' that is not running", instance.name)
2773
2774 hyper = hypervisor.GetHypervisor(instance.hypervisor)
2775 if reboot_type == constants.INSTANCE_REBOOT_SOFT:
2776 try:
2777 hyper.RebootInstance(instance)
2778 except errors.HypervisorError, err:
2779 _Fail("Failed to soft reboot instance '%s': %s", instance.name, err)
2780 elif reboot_type == constants.INSTANCE_REBOOT_HARD:
2781 try:
2782 InstanceShutdown(instance, shutdown_timeout, reason, store_reason=False)
2783 result = StartInstance(instance, False, reason, store_reason=False)
2784 _StoreInstReasonTrail(instance.name, reason)
2785 return result
2786 except errors.HypervisorError, err:
2787 _Fail("Failed to hard reboot instance '%s': %s", instance.name, err)
2788 else:
2789 _Fail("Invalid reboot_type received: '%s'", reboot_type)
2790
2791
2792 def InstanceBalloonMemory(instance, memory):
2793 """Resize an instance's memory.
2794
2795 @type instance: L{objects.Instance}
2796 @param instance: the instance object
2797 @type memory: int
2798 @param memory: new memory amount in MB
2799 @rtype: None
2800
2801 """
2802 hyper = hypervisor.GetHypervisor(instance.hypervisor)
2803 running = hyper.ListInstances(hvparams=instance.hvparams)
2804 if instance.name not in running:
2805 logging.info("Instance %s is not running, cannot balloon", instance.name)
2806 return
2807 try:
2808 hyper.BalloonInstanceMemory(instance, memory)
2809 except errors.HypervisorError, err:
2810 _Fail("Failed to balloon instance memory: %s", err, exc=True)
2811
2812
2813 def MigrationInfo(instance):
2814 """Gather information about an instance to be migrated.
2815
2816 @type instance: L{objects.Instance}
2817 @param instance: the instance definition
2818
2819 """
2820 hyper = hypervisor.GetHypervisor(instance.hypervisor)
2821 try:
2822 info = hyper.MigrationInfo(instance)
2823 except errors.HypervisorError, err:
2824 _Fail("Failed to fetch migration information: %s", err, exc=True)
2825 return info
2826
2827
2828 def AcceptInstance(instance, info, target):
2829 """Prepare the node to accept an instance.
2830
2831 @type instance: L{objects.Instance}
2832 @param instance: the instance definition
2833 @type info: string/data (opaque)
2834 @param info: migration information, from the source node
2835 @type target: string
2836 @param target: target host (usually ip), on this node
2837
2838 """
2839 hyper = hypervisor.GetHypervisor(instance.hypervisor)
2840 try:
2841 hyper.AcceptInstance(instance, info, target)
2842 except errors.HypervisorError, err:
2843 _Fail("Failed to accept instance: %s", err, exc=True)
2844
2845
2846 def FinalizeMigrationDst(instance, info, success):
2847 """Finalize any preparation to accept an instance.
2848
2849 @type instance: L{objects.Instance}
2850 @param instance: the instance definition
2851 @type info: string/data (opaque)
2852 @param info: migration information, from the source node
2853 @type success: boolean
2854 @param success: whether the migration was a success or a failure
2855
2856 """
2857 hyper = hypervisor.GetHypervisor(instance.hypervisor)
2858 try:
2859 hyper.FinalizeMigrationDst(instance, info, success)
2860 except errors.HypervisorError, err:
2861 _Fail("Failed to finalize migration on the target node: %s", err, exc=True)
2862
2863
2864 def MigrateInstance(cluster_name, instance, target, live):
2865 """Migrates an instance to another node.
2866
2867 @type cluster_name: string
2868 @param cluster_name: name of the cluster
2869 @type instance: L{objects.Instance}
2870 @param instance: the instance definition
2871 @type target: string
2872 @param target: the target node name
2873 @type live: boolean
2874 @param live: whether the migration should be done live or not (the
2875 interpretation of this parameter is left to the hypervisor)
2876 @raise RPCFail: if migration fails for some reason
2877
2878 """
2879 hyper = hypervisor.GetHypervisor(instance.hypervisor)
2880
2881 try:
2882 hyper.MigrateInstance(cluster_name, instance, target, live)
2883 except errors.HypervisorError, err:
2884 _Fail("Failed to migrate instance: %s", err, exc=True)
2885
2886
2887 def FinalizeMigrationSource(instance, success, live):
2888 """Finalize the instance migration on the source node.
2889
2890 @type instance: L{objects.Instance}
2891 @param instance: the instance definition of the migrated instance
2892 @type success: bool
2893 @param success: whether the migration succeeded or not
2894 @type live: bool
2895 @param live: whether the user requested a live migration or not
2896 @raise RPCFail: If the execution fails for some reason
2897
2898 """
2899 hyper = hypervisor.GetHypervisor(instance.hypervisor)
2900
2901 try:
2902 hyper.FinalizeMigrationSource(instance, success, live)
2903 except Exception, err: # pylint: disable=W0703
2904 _Fail("Failed to finalize the migration on the source node: %s", err,
2905 exc=True)
2906
2907
2908 def GetMigrationStatus(instance):
2909 """Get the migration status
2910
2911 @type instance: L{objects.Instance}
2912 @param instance: the instance that is being migrated
2913 @rtype: L{objects.MigrationStatus}
2914 @return: the status of the current migration (one of
2915 L{constants.HV_MIGRATION_VALID_STATUSES}), plus any additional
2916 progress info that can be retrieved from the hypervisor
2917 @raise RPCFail: If the migration status cannot be retrieved
2918
2919 """
2920 hyper = hypervisor.GetHypervisor(instance.hypervisor)
2921 try:
2922 return hyper.GetMigrationStatus(instance)
2923 except Exception, err: # pylint: disable=W0703
2924 _Fail("Failed to get migration status: %s", err, exc=True)
2925
2926
2927 def HotplugDevice(instance, action, dev_type, device, extra, seq):
2928 """Hotplug a device
2929
2930 Hotplug is currently supported only for KVM Hypervisor.
2931   Hotplug is currently supported only for the KVM hypervisor.

2932 @param instance: the instance to which we hotplug a device
2933 @type action: string
2934 @param action: the hotplug action to perform
2935 @type dev_type: string
2936 @param dev_type: the device type to hotplug
2937 @type device: either L{objects.NIC} or L{objects.Disk}
2938 @param device: the device object to hotplug
2939 @type extra: tuple
2940 @param extra: extra info used for disk hotplug (disk link, drive uri)
2941 @type seq: int
2942 @param seq: the index of the device from master perspective
2943   @raise RPCFail: in case the instance does not use the KVM hypervisor
2944
2945 """
2946 hyper = hypervisor.GetHypervisor(instance.hypervisor)
2947 try:
2948 hyper.VerifyHotplugSupport(instance, action, dev_type)
2949 except errors.HotplugError, err:
2950 _Fail("Hotplug is not supported: %s", err)
2951
2952 if action == constants.HOTPLUG_ACTION_ADD:
2953 fn = hyper.HotAddDevice
2954 elif action == constants.HOTPLUG_ACTION_REMOVE:
2955 fn = hyper.HotDelDevice
2956 elif action == constants.HOTPLUG_ACTION_MODIFY:
2957 fn = hyper.HotModDevice
2958 else:
2959 assert action in constants.HOTPLUG_ALL_ACTIONS
2960
2961 return fn(instance, dev_type, device, extra, seq)
2962
2963
2964 def HotplugSupported(instance):
2965 """Checks if hotplug is generally supported.
2966
2967 """
2968 hyper = hypervisor.GetHypervisor(instance.hypervisor)
2969 try:
2970 hyper.HotplugSupported(instance)
2971 except errors.HotplugError, err:
2972 _Fail("Hotplug is not supported: %s", err)
2973
2974
2975 def ModifyInstanceMetadata(metadata):
2976 """Sends instance data to the metadata daemon.
2977
2978 Uses the Luxi transport layer to communicate with the metadata
2979 daemon configuration server. It starts the metadata daemon if it is
2980 not running.
2981   The daemon must be enabled at configuration time.
2982
2983 @type metadata: dict
2984 @param metadata: instance metadata obtained by calling
2985 L{objects.Instance.ToDict} on an instance object
2986
2987 """
2988 if not constants.ENABLE_METAD:
2989     raise errors.ProgrammerError("The metadata daemon is disabled, yet"
2990 " ModifyInstanceMetadata has been called")
2991
2992 if not utils.IsDaemonAlive(constants.METAD):
2993 result = utils.RunCmd([pathutils.DAEMON_UTIL, "start", constants.METAD])
2994 if result.failed:
2995 raise errors.HypervisorError("Failed to start metadata daemon")
2996
2997 with contextlib.closing(metad.Client()) as client:
2998 client.UpdateConfig(metadata)
2999
3000
3001 def BlockdevCreate(disk, size, owner, on_primary, info, excl_stor):
3002 """Creates a block device for an instance.
3003
3004 @type disk: L{objects.Disk}
3005 @param disk: the object describing the disk we should create
3006 @type size: int
3007 @param size: the size of the physical underlying device, in MiB
3008 @type owner: str
3009 @param owner: the name of the instance for which disk is created,
3010 used for device cache data
3011 @type on_primary: boolean
3012 @param on_primary: indicates if it is the primary node or not
3013 @type info: string
3014 @param info: string that will be sent to the physical device
3015 creation, used for example to set (LVM) tags on LVs
3016 @type excl_stor: boolean
3017 @param excl_stor: Whether exclusive_storage is active
3018
3019   @return: the new unique_id of the device (this can sometimes be
3020 computed only after creation), or None. On secondary nodes,
3021 it's not required to return anything.
3022
3023 """
3024 # TODO: remove the obsolete "size" argument
3025 # pylint: disable=W0613
3026 clist = []
3027 if disk.children:
3028 for child in disk.children:
3029 try:
3030 crdev = _RecursiveAssembleBD(child, owner, on_primary)
3031 except errors.BlockDeviceError, err:
3032 _Fail("Can't assemble device %s: %s", child, err)
3033 if on_primary or disk.AssembleOnSecondary():
3034 # we need the children open in case the device itself has to
3035 # be assembled
3036 try:
3037 # pylint: disable=E1103
3038 crdev.Open()
3039 except errors.BlockDeviceError, err:
3040 _Fail("Can't make child '%s' read-write: %s", child, err)
3041 clist.append(crdev)
3042
3043 try:
3044 device = bdev.Create(disk, clist, excl_stor)
3045 except errors.BlockDeviceError, err:
3046 _Fail("Can't create block device: %s", err)
3047
3048 if on_primary or disk.AssembleOnSecondary():
3049 try:
3050 device.Assemble()
3051 except errors.BlockDeviceError, err:
3052 _Fail("Can't assemble device after creation, unusual event: %s", err)
3053 if on_primary or disk.OpenOnSecondary():
3054 try:
3055 device.Open(force=True)
3056 except errors.BlockDeviceError, err:
3057 _Fail("Can't make device r/w after creation, unusual event: %s", err)
3058 DevCacheManager.UpdateCache(device.dev_path, owner,
3059 on_primary, disk.iv_name)
3060
3061 device.SetInfo(info)
3062
3063 return device.unique_id
3064
3065
3066 def _DumpDevice(source_path, target_path, offset, size, truncate):
3067 """This function images/wipes the device using a local file.
3068
3069 @type source_path: string
3070 @param source_path: path of the image or data source (e.g., "/dev/zero")
3071
3072 @type target_path: string
3073 @param target_path: path of the device to image/wipe
3074
3075 @type offset: int
3076 @param offset: offset in MiB in the output file
3077
3078 @type size: int
3079 @param size: maximum size in MiB to write (data source might be smaller)
3080
3081 @type truncate: bool
3082 @param truncate: whether the file should be truncated
3083
3084 @return: None
3085 @raise RPCFail: in case of failure
3086
3087 """
3088 # Internal sizes are always in Mebibytes; if the following "dd" command
3089 # should use a different block size the offset and size given to this
3090 # function must be adjusted accordingly before being passed to "dd".
3091 block_size = constants.DD_BLOCK_SIZE
3092
3093 cmd = [constants.DD_CMD, "if=%s" % source_path, "seek=%d" % offset,
3094 "bs=%s" % block_size, "oflag=direct", "of=%s" % target_path,
3095 "count=%d" % size]
3096
3097 if not truncate:
3098 cmd.append("conv=notrunc")
3099
3100 result = utils.RunCmd(cmd)
3101
3102 if result.failed:
3103 _Fail("Dump command '%s' exited with error: %s; output: %s", result.cmd,
3104 result.fail_reason, result.output)
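
# Sketch, never called from the module itself: wiping the first GiB of a
# hypothetical device. Assuming constants.DD_BLOCK_SIZE denotes 1 MiB,
# the resulting command is roughly
#   dd if=/dev/zero seek=0 bs=1M oflag=direct of=/dev/xvdb count=1024
def _ExampleWipeFirstGiB():
  _DumpDevice("/dev/zero", "/dev/xvdb", 0, 1024, True)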
3105
3106
3107 def _DownloadAndDumpDevice(source_url, target_path, size):
3108 """This function images a device using a downloaded image file.
3109
3110 @type source_url: string
3111 @param source_url: URL of image to dump to disk
3112
3113 @type target_path: string
3114 @param target_path: path of the device to image
3115
3116 @type size: int
3117 @param size: maximum size in MiB to write (data source might be smaller)
3118
3119 @rtype: NoneType
3120 @return: None
3121 @raise RPCFail: in case of download or write failures
3122
3123 """
3124 class DDParams(object):
3125 def __init__(self, current_size, total_size):
3126 self.current_size = current_size
3127 self.total_size = total_size
3128 self.image_size_error = False
3129
3130 def dd_write(ddparams, out):
3131 if ddparams.current_size < ddparams.total_size:
3132 ddparams.current_size += len(out)
3133 target_file.write(out)
3134 else:
3135 ddparams.image_size_error = True
3136 return -1
3137
3138 target_file = open(target_path, "r+")
3139 ddparams = DDParams(0, 1024 * 1024 * size)
3140
3141 curl = pycurl.Curl()
3142 curl.setopt(pycurl.VERBOSE, True)
3143 curl.setopt(pycurl.NOSIGNAL, True)
3144 curl.setopt(pycurl.USERAGENT, http.HTTP_GANETI_VERSION)
3145 curl.setopt(pycurl.URL, source_url)
3146 curl.setopt(pycurl.WRITEFUNCTION, lambda out: dd_write(ddparams, out))
3147
3148 try:
3149 curl.perform()
3150 except pycurl.error:
3151 if ddparams.image_size_error:
3152 _Fail("Disk image larger than the disk")
3153 else:
3154 raise
3155
3156 target_file.close()
3157
3158
3159 def BlockdevConvert(src_disk, target_disk):
3160 """Copies data from source block device to target.
3161
3162 This function gets the export and import commands from the source and
3163 target devices respectively, and then concatenates them to a single
3164 command using a pipe ("|"). Finally, executes the unified command that
3165 will transfer the data between the devices during the disk template
3166 conversion operation.
3167
3168 @type src_disk: L{objects.Disk}
3169 @param src_disk: the disk object we want to copy from
3170 @type target_disk: L{objects.Disk}
3171 @param target_disk: the disk object we want to copy to
3172
3173 @rtype: NoneType
3174 @return: None
3175 @raise RPCFail: in case of failure
3176
3177 """
3178 src_dev = _RecursiveFindBD(src_disk)
3179 if src_dev is None:
3180 _Fail("Cannot copy from device '%s': device not found", src_disk.uuid)
3181
3182 dest_dev = _RecursiveFindBD(target_disk)
3183 if dest_dev is None:
3184 _Fail("Cannot copy to device '%s': device not found", target_disk.uuid)
3185
3186 src_cmd = src_dev.Export()
3187 dest_cmd = dest_dev.Import()
3188 command = "%s | %s" % (utils.ShellQuoteArgs(src_cmd),
3189 utils.ShellQuoteArgs(dest_cmd))

  result = utils.RunCmd(command)
  if result.failed:
    _Fail("Disk conversion command '%s' exited with error: %s; output: %s",
          result.cmd, result.fail_reason, result.output)


def BlockdevWipe(disk, offset, size):
  """Wipes a block device.

  @type disk: L{objects.Disk}
  @param disk: the disk object we want to wipe
  @type offset: int
  @param offset: The offset in MiB in the file
  @type size: int
  @param size: The size in MiB to write

  """
  try:
    rdev = _RecursiveFindBD(disk)
  except errors.BlockDeviceError:
    rdev = None

  if not rdev:
    _Fail("Cannot wipe device %s: device not found", disk.iv_name)
  if offset < 0:
    _Fail("Negative offset")
  if size < 0:
    _Fail("Negative size")
  if offset > rdev.size:
    _Fail("Wipe offset is bigger than device size")
  if (offset + size) > rdev.size:
    _Fail("Wipe offset and size are bigger than device size")

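  # Zeroing is implemented as a plain "dd" from /dev/zero; passing
  # truncate=True just means "conv=notrunc" is omitted, which is harmless
  # here since block devices cannot be truncated.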
  _DumpDevice("/dev/zero", rdev.dev_path, offset, size, True)


def BlockdevImage(disk, image, size):
  """Images a block device either by dumping a local file or by
  downloading one from a URL.

  @type disk: L{objects.Disk}
  @param disk: the disk object we want to image

  @type image: string
  @param image: file path to or URL of the disk image to be dumped

  @type size: int
  @param size: The size in MiB to write

  @rtype: NoneType
  @return: None
  @raise RPCFail: in case of failure

  """
  if not (utils.IsUrl(image) or os.path.exists(image)):
    _Fail("Image '%s' not found", image)

  try:
    rdev = _RecursiveFindBD(disk)
  except errors.BlockDeviceError:
    rdev = None

  if not rdev:
    _Fail("Cannot image device %s: device not found", disk.iv_name)
  if size < 0:
    _Fail("Negative size")
  if size > rdev.size:
    _Fail("Image size is bigger than device size")

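  # URLs are streamed straight onto the device; local files are copied with
  # "dd" starting at offset 0, with truncate=False so "conv=notrunc" is
  # passed and the target device is never truncated.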
  if utils.IsUrl(image):
    _DownloadAndDumpDevice(image, rdev.dev_path, size)
  else:
    _DumpDevice(image, rdev.dev_path, 0, size, False)


def BlockdevPauseResumeSync(disks, pause):
  """Pause or resume the sync of the block device.

  @type disks: list of L{objects.Disk}
  @param disks: the disk objects whose sync we want to pause/resume
  @type pause: bool
  @param pause: Whether to pause or resume
  @rtype: list
  @return: a list of (success, message) tuples, one per disk

  """
  success = []
  for disk in disks:
    try:
      rdev = _RecursiveFindBD(disk)
    except errors.BlockDeviceError:
      rdev = None

    if not rdev:
      success.append((False, ("Cannot change sync for device %s:"
                              " device not found" % disk.iv_name)))
      continue

    result = rdev.PauseResumeSync(pause)

    if result:
      success.append((result, None))
    else:
      if pause:
        msg = "Pause"
      else:
        msg = "Resume"
      success.append((result, "%s for device %s failed" % (msg, disk.iv_name)))

  return success
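
# Illustrative sketch (not upstream code) of how a caller might consume the
# per-disk results:
#   for ok, msg in BlockdevPauseResumeSync(disks, True):
#     if not ok:
#       logging.warning("Pausing sync failed: %s", msg)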


def BlockdevRemove(disk):
  """Remove a block device.

  @note: This is intended to be called recursively.

  @type disk: L{objects.Disk}
  @param disk: the disk object we should remove
  @rtype: NoneType
  @return: None
  @raise RPCFail: in case of errors

  """
  msgs = []
  try:
    rdev = _RecursiveFindBD(disk)
  except errors.BlockDeviceError, err:
    # probably can't attach
    logging.info("Can't attach to device %s in remove: %s", disk, err)
    rdev = None
  if rdev is not None:
    r_path = rdev.dev_path

    def _TryRemove():
      try:
        rdev.Remove()
        return []
      except errors.BlockDeviceError, err:
        return [str(err)]

    msgs.extend(utils.SimpleRetry([], _TryRemove,
                                  constants.DISK_REMOVE_RETRY_INTERVAL,
                                  constants.DISK_REMOVE_RETRY_TIMEOUT))
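
    # utils.SimpleRetry keeps calling _TryRemove at
    # DISK_REMOVE_RETRY_INTERVAL until it returns the expected value []
    # (success) or DISK_REMOVE_RETRY_TIMEOUT expires, then hands back the
    # last result, so any remaining error strings accumulate in "msgs".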

    if not msgs:
      DevCacheManager.RemoveCache(r_path)

  if disk.children:
    for child in disk.children:
      try:
        BlockdevRemove(child)
      except RPCFail, err:
        msgs.append(str(err))

  if msgs:
    _Fail("; ".join(msgs))


def _RecursiveAssembleBD(disk, owner, as_primary):
  """Activate a block device for an instance.

  This is run on the primary and secondary nodes for an instance.

  @note: this function is called recursively.

  @type disk: L{objects.Disk}
  @param disk: the disk we try to assemble
  @type owner: str
  @param owner: the name of the instance which owns the disk